赵占旭的博客

[转]Cisco VPP BiHash分析

注:本文是转载,但不是100%的转载,可能稍微有些出入,原文地址点击这里

基本概念


VPP里的Bihash全名为Bounded-index extensible hash。它的最大特点是,在查找时是无锁并且线程安全的。修改操作之间会有互斥,但是修改操作时仍然可以进行查找操作。
vpp里的Bihash优化成了两种,bihash_kv_8_8bihash_kv_24_8,区别在于hash key是8字节还是24字 节。最大限度的利用SSE4.2指令集中的_mm_crc32_u64来进行hash计算。
核心函数在bihash_template.c中。根据包含的头文件是bihash_8_8.h还是bihash_24_8.h,BV宏和BTV 宏将把名字做出对应扩展。例如:BV (clib_bihash_init)扩展为clib_bihash_init_8_8()或者clib_bihash_init_24_8()BVT (clib_bihash)扩展为clib_bihash_8_8_t或者为clib_bihash_24_8_t

clib_bihash_bucket_t

hash桶

1
2
3
4
5
6
7
8
9
10
11
12
typedef struct {
union {
struct {
//这个桶中记录得kv对,在heap中的起始位置
u32 offset;
u8 pad[3];
//这个桶中记录的kv对,一共占用了1 << log2_pages个page
u8 log2_pages;
};
u64 as_u64;
};
} clib_bihash_bucket_t;

page数据结构,中间包含了kv对。

1
2
3
4
5
6
typedef struct BV (clib_bihash_value) {
union {
BVT (clib_bihash_kv) kvp[BIHASH_KVP_PER_PAGE];
struct BV (clib_bihash_value) * next_free;
};
} BVT (clib_bihash_value);

avatar

核心函数


clib_bihash_init_8_8()clib_bihash_init_24_8()初始化bihash,并分配一个独占的内存heap给它。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
void BV (clib_bihash_init)(BVT (clib_bihash) * h, char *name,
u32 nbuckets, uword memory_size)
{
void *oldheap;

//方便之后hash值映射到bucket,用&替代昂贵的%操作。
nbuckets = 1 << (max_log2 (nbuckets));

h->name = (u8 *) name;
h->nbuckets = nbuckets;
h->log2_nbuckets = max_log2 (nbuckets);

h->mheap = mheap_alloc (0 /* use VM */ , memory_size);

//常用操作,这样才能使用内存操作函数,
//注意这里单核心上只能有一个线程独占此操作,不同核心可以并发
oldheap = clib_mem_set_heap (h->mheap);
vec_validate_aligned (h->buckets, nbuckets - 1, CLIB_CACHE_LINE_BYTES);
h->writer_lock = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES,
CLIB_CACHE_LINE_BYTES);

clib_mem_set_heap (oldheap);
}

clib_bihash_search_8_8()clib_bihash_search_24_8()给定key,查找value。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
int BV (clib_bihash_search)(const BVT (clib_bihash) * h,
BVT (clib_bihash_kv) * search_key, BVT (clib_bihash_kv) * valuep)
{
//hash计算,者利用了SSE4.2指令集特性。值得记住
hash = BV (clib_bihash_hash) (search_key);

//hash值的低log2_nbuckets bit用来索引桶号
bucket_index = hash & (h->nbuckets - 1);
b = &h->buckets[bucket_index];

if (b->offset == 0)
return -1;

hash >>= h->log2_nbuckets;

//在heap中的offset字节开始,属于桶。
v = BV (clib_bihash_get_value) (h, b->offset);
//hash值的中间log2_pages bit用来索引桶中的page号
value_index = hash & ((1 << b->log2_pages) - 1);
v += value_index;

//遍历page,查找key值。
for (i = 0; i < BIHASH_KVP_PER_PAGE; i++)
{
if (BV (clib_bihash_key_compare) (v->kvp[i].key, search_key->key))
{
*valuep = v->kvp[i];
return 0;
}
}
return -1;
}

clib_bihash_add_del_8_8()clib_bihash_add_del_24_8()添加删除kv对。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
int BV (clib_bihash_add_del)(BVT (clib_bihash) * h,
BVT (clib_bihash_kv) * add_v, int is_add)
{
u32 cpu_number = os_get_cpu_number ();

hash = BV (clib_bihash_hash) (add_v);

//hash值的低log2_nbuckets bit用来索引桶号
bucket_index = hash & (h->nbuckets - 1);
b = &h->buckets[bucket_index];

hash >>= h->log2_nbuckets;

while (__sync_lock_test_and_set (h->writer_lock, 1))
;

/* First elt in the bucket? */
if (b->offset == 0)
{
if (is_add == 0)
{
rv = -1;
goto unlock;
}
//桶中最初没有kv对,现在分配一个page
v = BV (value_alloc) (h, 0);
*v->kvp = *add_v;
tmp_b.as_u64 = 0;
tmp_b.offset = BV (clib_bihash_get_offset) (h, v);

b->as_u64 = tmp_b.as_u64;
goto unlock;
}
//把桶b中的kv对拷贝到缓存中,缓存加入到b中,b中原来的kv对留给下文修改。
BV (make_working_copy) (h, b);

//b中原来的kv对page
v = BV (clib_bihash_get_value) (h, h->saved_bucket.offset);
//hash值的中间log2_pages bit用来索引桶中的page号
value_index = hash & ((1 << h->saved_bucket.log2_pages) - 1);
v += value_index;

if (is_add)
{
/*
* For obvious (in hindsight) reasons, see if we're supposed to
* replace an existing key, then look for an empty slot.
*/
for (i = 0; i < BIHASH_KVP_PER_PAGE; i++)
{
if (!memcmp (&(v->kvp[i]), &add_v->key, sizeof (add_v->key)))
{
//有重复的key值,把value覆盖旧的
clib_memcpy (&(v->kvp[i]), add_v, sizeof (*add_v));
CLIB_MEMORY_BARRIER ();
/* Restore the previous (k,v) pairs */
//修改完的page重新保存回b中
b->as_u64 = h->saved_bucket.as_u64;
goto unlock;
}
}
for (i = 0; i < BIHASH_KVP_PER_PAGE; i++)
{
if (BV (clib_bihash_is_free) (&(v->kvp[i])))
{
//要保存的kv内容拷贝到第一个空闲的空间中
clib_memcpy (&(v->kvp[i]), add_v, sizeof (*add_v));
CLIB_MEMORY_BARRIER ();
b->as_u64 = h->saved_bucket.as_u64;
goto unlock;
}
}
/* no room at the inn... split case... */
}
else
{
for (i = 0; i < BIHASH_KVP_PER_PAGE; i++)
{
if (!memcmp (&(v->kvp[i]), &add_v->key, sizeof (add_v->key)))
{
memset (&(v->kvp[i]), 0xff, sizeof (*(add_v)));
CLIB_MEMORY_BARRIER ();
b->as_u64 = h->saved_bucket.as_u64;
goto unlock;
}
}
rv = -3;
b->as_u64 = h->saved_bucket.as_u64;
goto unlock;
}

//添加kv发现空间不够了,该桶的page数量增加一倍
new_log2_pages = h->saved_bucket.log2_pages + 1;

expand_again:
working_copy = h->working_copies[cpu_number];
//扩充page,其中的kv对需要重新排列下,因为hash值中需要用new_log2_pages个bit来确定page位置
new_v = BV (split_and_rehash) (h, working_copy, new_log2_pages);
if (new_v == 0)
{
new_log2_pages++;
goto expand_again;
}

/* Try to add the new entry */
save_new_v = new_v;
new_hash = BV (clib_bihash_hash) (add_v);
new_hash >>= h->log2_nbuckets;
new_hash &= (1 << min_log2 (vec_len (new_v))) - 1;
new_v += new_hash;

for (i = 0; i < BIHASH_KVP_PER_PAGE; i++)
{
if (BV (clib_bihash_is_free) (&(new_v->kvp[i])))
{
clib_memcpy (&(new_v->kvp[i]), add_v, sizeof (*add_v));
goto expand_ok;
}
}
/* Crap. Try again */
new_log2_pages++;
BV (value_free) (h, save_new_v);
goto expand_again;

expand_ok:
tmp_b.log2_pages = min_log2 (vec_len (save_new_v));
tmp_b.offset = BV (clib_bihash_get_offset) (h, save_new_v);
CLIB_MEMORY_BARRIER ();
b->as_u64 = tmp_b.as_u64;
v = BV (clib_bihash_get_value) (h, h->saved_bucket.offset);
BV (value_free) (h, v);

unlock:
CLIB_MEMORY_BARRIER ();
h->writer_lock[0] = 0;
return rv;
}

make_working_copy_8_8()make_working_copy_24_8()用来生成桶内pages的副本,供添加删除修改使用。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
static inline void BV (make_working_copy) (BVT (clib_bihash) * h,
clib_bihash_bucket_t * b)
{
BVT (clib_bihash_value) * v;
clib_bihash_bucket_t working_bucket __attribute__ ((aligned (8)));
void *oldheap;
BVT (clib_bihash_value) * working_copy;
u32 cpu_number = os_get_cpu_number ();

//working_copies是per-cpu的
if (cpu_number >= vec_len (h->working_copies))
{
oldheap = clib_mem_set_heap (h->mheap);
vec_validate (h->working_copies, cpu_number);
clib_mem_set_heap (oldheap);
}

/*
* working_copies are per-cpu so that near-simultaneous
* updates from multiple threads will not result in sporadic, spurious
* lookup failures.
*/
working_copy = h->working_copies[cpu_number];

//博主觉得saved_bucket又不是per-cpu的,那么working_copies就没必要做成per-cpu了
h->saved_bucket.as_u64 = b->as_u64;
oldheap = clib_mem_set_heap (h->mheap);

if ((1 << b->log2_pages) > vec_len (working_copy))
{
vec_validate_aligned (working_copy, (1 << b->log2_pages) - 1,
sizeof (u64));
h->working_copies[cpu_number] = working_copy;
}

_vec_len (working_copy) = 1 << b->log2_pages;
clib_mem_set_heap (oldheap);

v = BV (clib_bihash_get_value) (h, b->offset);

//b中原有的kv内容拷贝到working_copy中,然后把b的page指向working_copy中的。这样b中的kv其实是副本。
clib_memcpy (working_copy, v, sizeof (*v) * (1 << b->log2_pages));
working_bucket.as_u64 = b->as_u64;
working_bucket.offset = BV (clib_bihash_get_offset) (h, working_copy);
CLIB_MEMORY_BARRIER ();
b->as_u64 = working_bucket.as_u64;
h->working_copies[cpu_number] = working_copy;
}

split_and_rehash_8_8()split_and_rehash_24_8()桶中的page数量扩张后,原有的kv需要重新插入一边。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46

static BVT (clib_bihash_value) *BV (split_and_rehash)(BVT (clib_bihash) * h,
BVT (clib_bihash_value) * old_values, u32 new_log2_pages)
{
BVT (clib_bihash_value) * new_values, *v, *new_v;
int i, j, k;

new_values = BV (value_alloc) (h, new_log2_pages);

//v会遍历原有的每个page
v = old_values;
for (i = 0; i < vec_len (old_values); i++)
{
u64 new_hash;

//遍历原有桶中特定page中的kv
for (j = 0; j < BIHASH_KVP_PER_PAGE; j++)
{
if (BV (clib_bihash_is_free) (&(v->kvp[j])) == 0)
{
new_hash = BV (clib_bihash_hash) (&(v->kvp[j]));
new_hash >>= h->log2_nbuckets;
new_hash &= (1 << new_log2_pages) - 1;

new_v = &new_values[new_hash];

for (k = 0; k < BIHASH_KVP_PER_PAGE; k++)
{
if (BV (clib_bihash_is_free) (&(new_v->kvp[k])))
{
clib_memcpy (&(new_v->kvp[k]), &(v->kvp[j]),
sizeof (new_v->kvp[k]));
goto doublebreak;
}
}
/* Crap. Tell caller to try again */
BV (value_free) (h, new_values);
return 0;
}
doublebreak:
;
}
v++;
}
return new_values;
}

clib_bihash_value_8_8()clib_bihash_value_24_8()分配page用,page用来保存kv对。内存分配以page为单位,分配1 << log2_pages个page,并且对回收的page做了缓存。但是没有用伙伴算法进行碎片内存合并。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
static BVT (clib_bihash_value) *BV (value_alloc) (BVT (clib_bihash) * h, u32 log2_pages)
{
BVT (clib_bihash_value) * rv = 0;
void *oldheap;

ASSERT (h->writer_lock[0]);
//h->freelists用log2_pages值来索引空闲page
if (log2_pages >= vec_len (h->freelists) || h->freelists[log2_pages] == 0)
{
oldheap = clib_mem_set_heap (h->mheap);
vec_validate (h->freelists, log2_pages);
//分配1 << log2_pages 个page,方便从hash值中计算出page编号。
vec_validate_aligned (rv, (1 << log2_pages) - 1, CLIB_CACHE_LINE_BYTES);
clib_mem_set_heap (oldheap);
goto initialize;
}
rv = h->freelists[log2_pages];
h->freelists[log2_pages] = rv->next_free;

initialize:
ASSERT (rv);
ASSERT (vec_len (rv) == (1 << log2_pages));
/*
* Latest gcc complains that the length arg is zero
* if we replace (1<<log2_pages) with vec_len(rv).
* No clue.
*/
memset (rv, 0xff, sizeof (*rv) * (1 << log2_pages));
return rv;
}

注意:所有文章非特别说明皆为原创。为保证信息与源同步,转载时请务必注明文章出处!谢谢合作 :-)

原始链接:http://zhaozhanxu.com/2017/02/14/VPP/2017-02-14-VPP-BiHash/

许可协议: "署名-非商用-相同方式共享 4.0" 转载请保留原文链接及作者。