Zhao Zhanxu's Blog

How the OVN Controller Works

In the previous post we covered the rough correspondence between OpenFlow tables and logical flow tables; today we look at how it is implemented in the code.

int
main(int argc, char *argv[])
{
    // Initialize group ids for load balancing.
    struct group_table group_table;
    group_table.group_ids = bitmap_allocate(MAX_OVN_GROUPS);
    bitmap_set1(group_table.group_ids, 0); /* Group id 0 is invalid. */
    hmap_init(&group_table.desired_groups);
    hmap_init(&group_table.existing_groups);

    // Connect to the local OVS ovsdb instance. We do not monitor all
    // tables, so each module has to register the parts it is interested in.
    struct ovsdb_idl_loop ovs_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
        ovsdb_idl_create(ovs_remote, &ovsrec_idl_class, false, true));
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_open_vswitch);
    ovsdb_idl_add_column(ovs_idl_loop.idl,
                         &ovsrec_open_vswitch_col_external_ids);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_open_vswitch_col_bridges);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_interface);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_type);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_options);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_ofport);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_port);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_interfaces);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_external_ids);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_bridge);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_ports);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_fail_mode);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_other_config);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_external_ids);
    // chassis_register_ovs_idl:
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_open_vswitch);
    ovsdb_idl_add_column(ovs_idl_loop.idl,
                         &ovsrec_open_vswitch_col_external_ids);
    ovsdb_idl_add_column(ovs_idl_loop.idl,
                         &ovsrec_open_vswitch_col_iface_types);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_bridge);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_datapath_type);
    // encaps_register_ovs_idl:
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_bridge);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_ports);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_port);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_interfaces);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_external_ids);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_interface);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_type);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_options);
    // binding_register_ovs_idl:
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_open_vswitch);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_open_vswitch_col_bridges);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_bridge);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_ports);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_port);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_interfaces);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_qos);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_interface);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_external_ids);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_status);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_qos);
    // physical_register_ovs_idl:
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_bridge);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_ports);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_port);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_interfaces);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_external_ids);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_interface);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_ofport);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_external_ids);
    // Make sure we are connected to the OVS ovsdb and have fetched the
    // contents registered above.
    ovsdb_idl_get_initial_snapshot(ovs_idl_loop.idl);

    // Connect to the southbound database on the central node and monitor
    // everything in its tables except nb_cfg, which is ignored below.
    // Block until the connection is up and the contents are fetched.
    char *ovnsb_remote = get_ovnsb_remote(ovs_idl_loop.idl);
    struct ovsdb_idl_loop ovnsb_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
        ovsdb_idl_create(ovnsb_remote, &sbrec_idl_class, true, true));
    ovsdb_idl_omit_alert(ovnsb_idl_loop.idl, &sbrec_chassis_col_nb_cfg);
    update_sb_monitors(ovnsb_idl_loop.idl, NULL, NULL, NULL);
    ovsdb_idl_get_initial_snapshot(ovnsb_idl_loop.idl);

    // Initialize the conntrack zones.
    struct simap ct_zones = SIMAP_INITIALIZER(&ct_zones);
    struct shash pending_ct_zones = SHASH_INITIALIZER(&pending_ct_zones);
    unsigned long ct_zone_bitmap[BITMAP_N_LONGS(MAX_CT_ZONES)];
    memset(ct_zone_bitmap, 0, sizeof ct_zone_bitmap);
    // Zone 0 is the default.
    bitmap_set1(ct_zone_bitmap, 0);
    restore_ct_zones(ovs_idl_loop.idl, &ct_zones, ct_zone_bitmap);
    unixctl_command_register("ct-zone-list", "", 0, 0,
                             ct_zone_list, &ct_zones);

    struct pending_pkt pending_pkt = { .conn = NULL };
    unixctl_command_register("inject-pkt", "MICROFLOW", 1, 1, inject_pkt,
                             &pending_pkt);

    while (!exiting) {
        // Check whether the southbound database remote has changed.
        char *new_ovnsb_remote = get_ovnsb_remote(ovs_idl_loop.idl);
        if (strcmp(ovnsb_remote, new_ovnsb_remote)) {
            free(ovnsb_remote);
            ovnsb_remote = new_ovnsb_remote;
            ovsdb_idl_set_remote(ovnsb_idl_loop.idl, ovnsb_remote, true);
        } else {
            free(new_ovnsb_remote);
        }

        struct controller_ctx ctx = {
            .ovs_idl = ovs_idl_loop.idl,
            .ovs_idl_txn = ovsdb_idl_loop_run(&ovs_idl_loop),
            .ovnsb_idl = ovnsb_idl_loop.idl,
            .ovnsb_idl_txn = ovsdb_idl_loop_run(&ovnsb_idl_loop),
        };

        // Probe the southbound database, every 5 seconds by default.
        update_probe_interval(&ctx);

        // Holds "struct local_datapath" nodes.
        struct hmap local_datapaths = HMAP_INITIALIZER(&local_datapaths);

        // Contains the names of all logical ports that reside locally:
        // VM VIFs, L2 gateway ports assigned to this chassis via
        // l2gateway-chassis, and localnet ports.
        struct sset local_lports = SSET_INITIALIZER(&local_lports);

        // Get br-int, creating it if it does not exist; the name is not
        // necessarily br-int.
        const struct ovsrec_bridge *br_int = get_br_int(&ctx);
        // Get the local OVS system-id.
        const char *chassis_id = get_chassis_id(ctx.ovs_idl);

        struct ldatapath_index ldatapaths;
        struct lport_index lports;
        struct mcgroup_index mcgroups;
        ldatapath_index_init(&ldatapaths, ctx.ovnsb_idl);
        lport_index_init(&lports, ctx.ovnsb_idl);
        mcgroup_index_init(&mcgroups, ctx.ovnsb_idl);

        const struct sbrec_chassis *chassis = NULL;
        if (chassis_id) {
            // Using the local system-id, collect the local encap type and
            // IP, hostname, datapath type and interface types, and write
            // them into the OVN Chassis record; inspect OVN's view with
            // "ovn-sbctl list Chassis".
            chassis = chassis_run(&ctx, chassis_id, br_int);
            // From the OVN configuration, work out which tunnels to other
            // chassis are needed and create them; "creating" a tunnel
            // really just means inserting a port record into the OVS
            // database.
            encaps_run(&ctx, br_int, chassis_id);
            // First read the port information: local ports carry iface-id
            // in external-ids, egress (tunnel) ports carry remote_ip in
            // options; inspect with "ovs-vsctl list interface".
            // The iface-id values collected above are matched against
            // logical_port in the southbound Port_Binding table.
            // Also set QoS on egress ports: OVN currently applies a VM's
            // egress QoS through queues configured on the output port.
            binding_run(&ctx, br_int, chassis, &ldatapaths, &lports,
                        &local_datapaths, &local_lports);
        }

        if (br_int && chassis) {
            struct shash addr_sets = SHASH_INITIALIZER(&addr_sets);
            addr_sets_init(&ctx, &addr_sets);

            // I could not find details about this part; as far as I know
            // OVS 2.6 still used patch ports, while from OVS 2.7 on there
            // are no patch ports and everything is resubmitted through the
            // flow table, restarting the match at table 16.
            patch_run(&ctx, br_int, chassis, &local_datapaths);

            enum mf_field_id mff_ovn_geneve = ofctrl_run(br_int,
                                                         &pending_ct_zones);

            pinctrl_run(&ctx, &lports, br_int, chassis, &local_datapaths);
            update_ct_zones(&local_lports, &local_datapaths, &ct_zones,
                            ct_zone_bitmap, &pending_ct_zones);
            if (ctx.ovs_idl_txn) {

                // Commit the ct zones; inspect them with
                // "ovs-vsctl list Bridge".
                commit_ct_zones(br_int, &pending_ct_zones);

                struct hmap flow_table = HMAP_INITIALIZER(&flow_table);
                // This is where logical flows are translated; details
                // below.
                lflow_run(&ctx, chassis, &lports, &mcgroups,
                          &local_datapaths, &group_table, &ct_zones,
                          &addr_sets, &flow_table);

                // Flows tied to physical devices rather than logical
                // flows; also examined in detail below.
                physical_run(&ctx, mff_ovn_geneve,
                             br_int, chassis, &ct_zones, &lports,
                             &flow_table, &local_datapaths);

            }
        }
    }
}
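
All the ovsdb_idl_add_table()/ovsdb_idl_add_column() calls above follow one pattern: each module registers the tables and columns it wants replicated before the first snapshot is taken, and the main loop then alternates ovsdb_idl_loop_run() with a blocking wait. Below is a minimal sketch of that pattern, assuming the usual OVS lib/ovsdb-idl.h and lib/poll-loop.h APIs; the function name and the registered column are just examples, and error handling is omitted:

static void
idl_loop_sketch(const char *remote)
{
    // false: monitor nothing by default; true: retry the connection.
    struct ovsdb_idl_loop loop = OVSDB_IDL_LOOP_INITIALIZER(
        ovsdb_idl_create(remote, &ovsrec_idl_class, false, true));

    // Register interest: only these tables/columns get replicated.
    ovsdb_idl_add_table(loop.idl, &ovsrec_table_bridge);
    ovsdb_idl_add_column(loop.idl, &ovsrec_bridge_col_name);

    // Block until a first full copy of the registered data arrives.
    ovsdb_idl_get_initial_snapshot(loop.idl);

    for (;;) {
        // Returns a transaction handle that writes can be staged on.
        struct ovsdb_idl_txn *txn = ovsdb_idl_loop_run(&loop);
        // ... read replicated rows, stage changes on txn ...
        (void) txn;
        ovsdb_idl_loop_commit_and_wait(&loop);
        poll_block();   // sleep until the database changes
    }
}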

lflow_run does two main jobs: translating logical flows (add_logical_flows --> consider_logical_flow) and adding the flows for the neighbor subsystem (add_neighbor_flows --> consider_neighbor_flow).

  • Logical flow translation maps ingress tables 0-15 to OpenFlow tables 16-31 and egress tables 0-15 to OpenFlow tables 48-63 (see the sketch after this list).
  • Neighbor-subsystem flows map to OpenFlow table 66.
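
For concreteness, here is a tiny self-contained sketch of that table arithmetic. The constant values are the ones defined in ovn/controller/lflow.h of the OVS 2.7-era sources this post is based on; the helper function is mine, for illustration only:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Table bases as defined in ovn/controller/lflow.h (OVS 2.7 era). */
#define OFTABLE_LOG_INGRESS_PIPELINE 16  /* logical ingress tables 0-15 */
#define OFTABLE_LOG_EGRESS_PIPELINE  48  /* logical egress tables 0-15 */
#define OFTABLE_MAC_BINDING          66  /* neighbor (MAC_Binding) flows */

/* Illustrative helper: logical table id -> OpenFlow table id. */
static uint8_t
logical_to_openflow(bool ingress, uint8_t logical_table_id)
{
    uint8_t first = ingress ? OFTABLE_LOG_INGRESS_PIPELINE
                            : OFTABLE_LOG_EGRESS_PIPELINE;
    return first + logical_table_id;
}

int
main(void)
{
    printf("ingress 0  -> table %d\n", logical_to_openflow(true, 0));   /* 16 */
    printf("ingress 15 -> table %d\n", logical_to_openflow(true, 15));  /* 31 */
    printf("egress 15  -> table %d\n", logical_to_openflow(false, 15)); /* 63 */
    return 0;
}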
static void
consider_logical_flow(const struct lport_index *lports,
                      const struct mcgroup_index *mcgroups,
                      const struct sbrec_logical_flow *lflow,
                      const struct hmap *local_datapaths,
                      struct group_table *group_table,
                      const struct simap *ct_zones,
                      const struct sbrec_chassis *chassis,
                      struct hmap *dhcp_opts,
                      struct hmap *dhcpv6_opts,
                      uint32_t *conj_id_ofs,
                      const struct shash *addr_sets,
                      struct hmap *flow_table)
{
    // Determine whether the logical flow is ingress or egress.
    bool ingress = !strcmp(lflow->pipeline, "ingress");

    const struct sbrec_datapath_binding *ldp = lflow->logical_datapath;
    if (!get_local_datapath(local_datapaths, ldp->tunnel_key)) {
        return;
    }

    // Mapping from logical table id to OpenFlow table id: ingress starts
    // at table 16, egress at table 48.
    uint8_t first_ptable = (ingress
                            ? OFTABLE_LOG_INGRESS_PIPELINE
                            : OFTABLE_LOG_EGRESS_PIPELINE);
    uint8_t ptable = first_ptable + lflow->table_id;
    // These appear to be the output tables: ingress output goes to
    // table 32, egress to table 64.
    uint8_t output_ptable = (ingress
                             ? OFTABLE_REMOTE_OUTPUT
                             : OFTABLE_SAVE_INPORT);

    // From here on, parse the logical flow's actions into OpenFlow actions.
    uint64_t ovnacts_stub[1024 / 8];
    struct ofpbuf ovnacts = OFPBUF_STUB_INITIALIZER(ovnacts_stub);
    struct ovnact_parse_params pp = {
        .symtab = &symtab,
        .dhcp_opts = dhcp_opts,
        .dhcpv6_opts = dhcpv6_opts,

        .pipeline = ingress ? OVNACT_P_INGRESS : OVNACT_P_EGRESS,
        .n_tables = LOG_PIPELINE_LEN,
        .cur_ltable = lflow->table_id,
    };
    struct expr *prereqs;
    char *error;

    // Newly added; not entirely clear to me, but it reportedly replaces
    // placeholders in lflow->actions with local info.
    const char *new_actions = (const char *)replace_local_info(lflow->actions, chassis);

    // The detailed parsing of each action is skipped here; see
    // parse_action().
    error = ovnacts_parse_string(new_actions, &pp, &ovnacts, &prereqs);
    if (error) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
        VLOG_WARN_RL(&rl, "error parsing actions \"%s\": %s",
                     lflow->actions, error);
        free(error);
        ovnacts_free(ovnacts.data, ovnacts.size);
        ofpbuf_uninit(&ovnacts);
        return;
    }

    // Convert the previously parsed actions into OpenFlow actions.
    uint64_t ofpacts_stub[1024 / 8];
    struct ofpbuf ofpacts = OFPBUF_STUB_INITIALIZER(ofpacts_stub);
    struct lookup_port_aux aux = {
        .lports = lports,
        .mcgroups = mcgroups,
        .dp = lflow->logical_datapath
    };
    struct ovnact_encode_params ep = {
        .lookup_port = lookup_port_cb,
        .aux = &aux,
        .is_switch = is_switch(ldp),
        .is_gateway_router = is_gateway_router(ldp, local_datapaths),
        .ct_zones = ct_zones,
        .group_table = group_table,

        .pipeline = ingress ? OVNACT_P_INGRESS : OVNACT_P_EGRESS,
        .ingress_ptable = OFTABLE_LOG_INGRESS_PIPELINE,
        .egress_ptable = OFTABLE_LOG_EGRESS_PIPELINE,
        .output_ptable = output_ptable,
        .mac_bind_ptable = OFTABLE_MAC_BINDING,
    };
    // Every action has a corresponding pair of functions doing the
    // conversion; not examined in detail here.
    ovnacts_encode(ovnacts.data, ovnacts.size, &ep, &ofpacts);
    ovnacts_free(ovnacts.data, ovnacts.size);
    ofpbuf_uninit(&ovnacts);

    // Translate the OVN match into an OpenFlow match.
    struct hmap matches;
    struct expr *expr;

    // Same as above; unclear to me what this does.
    const char *new_match = (const char *)replace_local_info(lflow->match, chassis);

    // Parse the match string into tokens, e.g. "&&" becomes LEX_T_LOG_AND.
    expr = expr_parse_string(new_match, &symtab, addr_sets, &error);
    if (!error) {
        if (prereqs) {
            expr = expr_combine(EXPR_T_AND, expr, prereqs);
            prereqs = NULL;
        }
        expr = expr_annotate(expr, &symtab, &error);
    }
    if (error) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
        VLOG_WARN_RL(&rl, "error parsing match \"%s\": %s",
                     lflow->match, error);
        expr_destroy(prereqs);
        ofpbuf_uninit(&ofpacts);
        free(error);
        return;
    }

    struct condition_aux cond_aux = { lports, chassis };
    expr = expr_simplify(expr, is_chassis_resident_cb, &cond_aux);
    expr = expr_normalize(expr);
    // Convert the expression into matches.
    uint32_t n_conjs = expr_to_matches(expr, lookup_port_cb, &aux,
                                       &matches);
    expr_destroy(expr);

    struct expr_match *m;
    HMAP_FOR_EACH (m, hmap_node, &matches) {
        match_set_metadata(&m->match,
                           htonll(lflow->logical_datapath->tunnel_key));
        if (m->match.wc.masks.conj_id) {
            m->match.flow.conj_id += *conj_id_ofs;
        }
        if (!m->n) {
            // Emit the OpenFlow flow.
            ofctrl_add_flow(flow_table, ptable, lflow->priority,
                            lflow->header_.uuid.parts[0], &m->match, &ofpacts);
        } else {
            uint64_t conj_stubs[64 / 8];
            struct ofpbuf conj;

            ofpbuf_use_stub(&conj, conj_stubs, sizeof conj_stubs);
            for (int i = 0; i < m->n; i++) {
                const struct cls_conjunction *src = &m->conjunctions[i];
                struct ofpact_conjunction *dst;

                dst = ofpact_put_CONJUNCTION(&conj);
                dst->id = src->id + *conj_id_ofs;
                dst->clause = src->clause;
                dst->n_clauses = src->n_clauses;
            }
            ofctrl_add_flow(flow_table, ptable, lflow->priority, 0, &m->match,
                            &conj);
            ofpbuf_uninit(&conj);
        }
    }
}
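
The m->n branch at the end handles conjunctive matches. A logical match that crosses two dimensions, say ip4.src == {10.0.0.1, 10.0.0.2} && tcp.dst == {80, 443}, cannot be expressed as a single OpenFlow match, so expr_to_matches() emits OVS "conjunction" flows; *conj_id_ofs keeps the conjunction ids unique across logical flows. Purely for illustration (table and priority arbitrary), the result looks roughly like this in ovs-ofctl syntax:

    table=21, priority=90, ip,nw_src=10.0.0.1 actions=conjunction(1,1/2)
    table=21, priority=90, ip,nw_src=10.0.0.2 actions=conjunction(1,1/2)
    table=21, priority=90, tcp,tp_dst=80      actions=conjunction(1,2/2)
    table=21, priority=90, tcp,tp_dst=443     actions=conjunction(1,2/2)
    table=21, priority=90, conj_id=1          actions=<the real actions>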
static void
consider_neighbor_flow(const struct lport_index *lports,
                       const struct sbrec_mac_binding *b,
                       struct hmap *flow_table)
{
    // Look up the Port_Binding from the MAC_Binding.
    const struct sbrec_port_binding *pb
        = lport_lookup_by_name(lports, b->logical_port);

    struct eth_addr mac;
    // Parse the MAC address.
    eth_addr_from_string(b->mac, &mac);

    struct match match = MATCH_CATCHALL_INITIALIZER;
    if (strchr(b->ip, '.')) {
        ovs_be32 ip;
        // Parse the IP address.
        ip_parse(b->ip, &ip);
        // Match the IP in reg0.
        match_set_reg(&match, 0, ntohl(ip));
    }

    uint64_t stub[1024 / 8];
    struct ofpbuf ofpacts = OFPBUF_STUB_INITIALIZER(stub);
    // The action rewrites the destination MAC.
    put_load(mac.ea, sizeof mac.ea, MFF_ETH_DST, 0, 48, &ofpacts);
    // Add the flow.
    ofctrl_add_flow(flow_table, OFTABLE_MAC_BINDING, 100, 0, &match, &ofpacts);
    ofpbuf_uninit(&ofpacts);
}
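
In other words, each MAC_Binding row becomes one flow in table 66. Purely as an illustration (addresses hypothetical, and matching only what the quoted code sets), a binding of IP 192.168.1.5 to MAC aa:bb:cc:dd:ee:ff would show up in ovs-ofctl dump-flows output roughly as:

    table=66, priority=100, reg0=0xc0a80105 actions=set_field:aa:bb:cc:dd:ee:ff->eth_dst

so a later resubmit to table 66 rewrites the destination MAC for a resolved next hop.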

physical_run

void
physical_run(struct controller_ctx *ctx, enum mf_field_id mff_ovn_geneve,
             const struct ovsrec_bridge *br_int,
             const struct sbrec_chassis *chassis,
             const struct simap *ct_zones, struct lport_index *lports,
             struct hmap *flow_table, struct hmap *local_datapaths)
{

    bool physical_map_changed = false;

    struct simap new_localvif_to_ofport =
        SIMAP_INITIALIZER(&new_localvif_to_ofport);
    struct simap new_tunnel_to_ofport =
        SIMAP_INITIALIZER(&new_tunnel_to_ofport);
    for (int i = 0; i < br_int->n_ports; i++) {
        const struct ovsrec_port *port_rec = br_int->ports[i];
        // Inspect the corresponding data with "ovs-vsctl list port".
        const char *chassis_id = smap_get(&port_rec->external_ids,
                                          "ovn-chassis-id");
        const char *localnet = smap_get(&port_rec->external_ids,
                                        "ovn-localnet-port");
        const char *l2gateway = smap_get(&port_rec->external_ids,
                                         "ovn-l2gateway-port");

        for (int j = 0; j < port_rec->n_interfaces; j++) {
            const struct ovsrec_interface *iface_rec = port_rec->interfaces[j];
            // Inspect with "ovs-vsctl list interface".
            int64_t ofport = iface_rec->ofport[0];

            /* Record as patch to local net, logical patch port, chassis, or
             * local logical port. */
            bool is_patch = !strcmp(iface_rec->type, "patch");
            // Both localnet patch ports and L2 gateway patch ports are
            // treated as VIFs.
            if (is_patch && localnet) {
                simap_put(&new_localvif_to_ofport, localnet, ofport);
                break;
            } else if (is_patch && l2gateway) {
                simap_put(&new_localvif_to_ofport, l2gateway, ofport);
                break;
            // A chassis_id means this is a tunnel port to some hypervisor.
            } else if (chassis_id) {
                enum chassis_tunnel_type tunnel_type;
                if (!strcmp(iface_rec->type, "geneve")) {
                    tunnel_type = GENEVE;
                    if (!mff_ovn_geneve) {
                        continue;
                    }
                } else if (!strcmp(iface_rec->type, "stt")) {
                    tunnel_type = STT;
                } else if (!strcmp(iface_rec->type, "vxlan")) {
                    tunnel_type = VXLAN;
                } else {
                    continue;
                }

                // Find the tunnel port for this chassis_id; update it if
                // it exists, create it otherwise.
                simap_put(&new_tunnel_to_ofport, chassis_id, ofport);
                struct chassis_tunnel *tun = chassis_tunnel_find(chassis_id);
                if (tun) {
                    if (tun->ofport != u16_to_ofp(ofport) ||
                        tun->type != tunnel_type) {
                        tun->ofport = u16_to_ofp(ofport);
                        tun->type = tunnel_type;
                        physical_map_changed = true;
                    }
                } else {
                    tun = xmalloc(sizeof *tun);
                    hmap_insert(&tunnels, &tun->hmap_node,
                                hash_string(chassis_id, 0));
                    tun->chassis_id = chassis_id;
                    tun->ofport = u16_to_ofp(ofport);
                    tun->type = tunnel_type;
                    physical_map_changed = true;
                }
                break;
            } else {
                const char *iface_id = smap_get(&iface_rec->external_ids,
                                                "iface-id");
                if (iface_id) {
                    simap_put(&new_localvif_to_ofport, iface_id, ofport);
                }
            }
        }
    }

    // Clean up tunnel ports that no longer exist.
    struct chassis_tunnel *tun, *tun_next;
    HMAP_FOR_EACH_SAFE (tun, tun_next, hmap_node, &tunnels) {
        if (!simap_find(&new_tunnel_to_ofport, tun->chassis_id)) {
            hmap_remove(&tunnels, &tun->hmap_node);
            physical_map_changed = true;
            free(tun);
        }
    }

    // Record OpenFlow ports that changed or went away.
    struct simap_node *vif_name, *vif_name_next;
    SIMAP_FOR_EACH_SAFE (vif_name, vif_name_next, &localvif_to_ofport) {
        int newport;
        if ((newport = simap_get(&new_localvif_to_ofport, vif_name->name))) {
            if (newport != simap_get(&localvif_to_ofport, vif_name->name)) {
                simap_put(&localvif_to_ofport, vif_name->name, newport);
                physical_map_changed = true;
            }
        } else {
            simap_find_and_delete(&localvif_to_ofport, vif_name->name);
            physical_map_changed = true;
        }
    }
    SIMAP_FOR_EACH (vif_name, &new_localvif_to_ofport) {
        if (!simap_get(&localvif_to_ofport, vif_name->name)) {
            simap_put(&localvif_to_ofport, vif_name->name,
                      simap_get(&new_localvif_to_ofport, vif_name->name));
            physical_map_changed = true;
        }
    }
    if (physical_map_changed) {
        // Wake up immediately to trigger reprocessing of the logical flow
        // table.
        poll_immediate_wake();
    }

    struct ofpbuf ofpacts;
    ofpbuf_init(&ofpacts, 0);

    // Set up table 0, the physical-to-logical mapping;
    // table 32, traffic tunneled to remote hypervisors;
    // and table 65, the flows that hand off from one logical element to
    // the next.
    const struct sbrec_port_binding *binding;
    SBREC_PORT_BINDING_FOR_EACH (binding, ctx->ovnsb_idl) {
        consider_port_binding(mff_ovn_geneve, ct_zones, lports,
                              local_datapaths, binding, chassis,
                              &ofpacts, flow_table);
    }

    // Tables 32 and 33 handle packets to multicast groups.
    const struct sbrec_multicast_group *mc;
    struct ofpbuf remote_ofpacts;
    ofpbuf_init(&remote_ofpacts, 0);
    SBREC_MULTICAST_GROUP_FOR_EACH (mc, ctx->ovnsb_idl) {
        consider_mc_group(mff_ovn_geneve, ct_zones, local_datapaths, chassis,
                          mc, &ofpacts, &remote_ofpacts, flow_table);
    }

    ofpbuf_uninit(&remote_ofpacts);

    // Table 0, priority 100: handle packets received on a tunnel port.
    // Geneve and STT encapsulation carry both the ingress and the egress
    // logical port, so we extract MFF_LOG_DATAPATH, MFF_LOG_INPORT and
    // MFF_LOG_OUTPORT from the encapsulation and resubmit to table 33 to
    // deliver the packet to local VMs.
    // VXLAN packets are not handled here.
    HMAP_FOR_EACH (tun, hmap_node, &tunnels) {
        struct match match = MATCH_CATCHALL_INITIALIZER;
        match_set_in_port(&match, tun->ofport);

        ofpbuf_clear(&ofpacts);
        if (tun->type == GENEVE) {
            put_move(MFF_TUN_ID, 0, MFF_LOG_DATAPATH, 0, 24, &ofpacts);
            put_move(mff_ovn_geneve, 16, MFF_LOG_INPORT, 0, 15,
                     &ofpacts);
            put_move(mff_ovn_geneve, 0, MFF_LOG_OUTPORT, 0, 16,
                     &ofpacts);
        } else if (tun->type == STT) {
            put_move(MFF_TUN_ID, 40, MFF_LOG_INPORT, 0, 15, &ofpacts);
            put_move(MFF_TUN_ID, 24, MFF_LOG_OUTPORT, 0, 16, &ofpacts);
            put_move(MFF_TUN_ID, 0, MFF_LOG_DATAPATH, 0, 24, &ofpacts);
        } else if (tun->type == VXLAN) {
            continue;
        } else {
            OVS_NOT_REACHED();
        }

        put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts);

        ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match,
                        &ofpacts);
    }

    // Only VXLAN is currently supported for connecting to gateways; the
    // VNI only identifies the logical datapath, so MFF_LOG_INPORT is
    // loaded from the port binding's tunnel_key, and the packet is
    // resubmitted to table 16 to determine the output port.
    // In my tests, cross-subnet east-west traffic does not work yet.
    HMAP_FOR_EACH (tun, hmap_node, &tunnels) {
        SBREC_PORT_BINDING_FOR_EACH (binding, ctx->ovnsb_idl) {
            struct match match = MATCH_CATCHALL_INITIALIZER;

            match_set_in_port(&match, tun->ofport);
            match_set_tun_id(&match, htonll(binding->datapath->tunnel_key));

            ofpbuf_clear(&ofpacts);
            put_move(MFF_TUN_ID, 0, MFF_LOG_DATAPATH, 0, 24, &ofpacts);
            put_load(binding->tunnel_key, MFF_LOG_INPORT, 0, 15, &ofpacts);
            put_load(1, MFF_LOG_FLAGS, MLF_RCV_FROM_VXLAN_BIT, 1, &ofpacts);
            put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, &ofpacts);

            ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match,
                            &ofpacts);
        }
    }

    // Table 32, priority 150: packets received from a VXLAN tunnel lack
    // the metadata to be sent back out a tunnel, so resubmit them to
    // table 33.
    struct match match;
    match_init_catchall(&match);
    ofpbuf_clear(&ofpacts);
    match_set_reg_masked(&match, MFF_LOG_FLAGS - MFF_REG0,
                         MLF_RCV_FROM_VXLAN, MLF_RCV_FROM_VXLAN);

    put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts);
    ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 150, 0,
                    &match, &ofpacts);

    // Table 32, priority 0: packets that are neither multicast nor bound
    // for a tunnel go to a local output port; resubmit to table 33.
    match_init_catchall(&match);
    ofpbuf_clear(&ofpacts);
    put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts);
    ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 0, 0, &match, &ofpacts);

    // Table 34, priority 0: packets that are not looping back out the
    // ingress port but entering the egress pipeline get their registers
    // cleared.
    match_init_catchall(&match);
    ofpbuf_clear(&ofpacts);
    for (int i = 0; i < MFF_N_LOG_REGS; i++) {
        put_load(0, MFF_REG0 + i, 0, 32, &ofpacts);
    }
    put_resubmit(OFTABLE_LOG_EGRESS_PIPELINE, &ofpacts);
    ofctrl_add_flow(flow_table, OFTABLE_CHECK_LOOPBACK, 0, 0, &match,
                    &ofpacts);

    // Table 64, priority 0: packets without the MLF_ALLOW_LOOPBACK flag
    // are resubmitted to table 65 for the logical-to-physical translation.
    match_init_catchall(&match);
    ofpbuf_clear(&ofpacts);
    put_resubmit(OFTABLE_LOG_TO_PHY, &ofpacts);
    ofctrl_add_flow(flow_table, OFTABLE_SAVE_INPORT, 0, 0, &match, &ofpacts);
}
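
Reading the put_move() calls above back out, the per-encapsulation metadata layout works out to the following (bit offsets within the tunnel key, or within the Geneve option for mff_ovn_geneve):

    Geneve: tun_id bits 0-23  -> MFF_LOG_DATAPATH (VNI = datapath)
            option bits 16-30 -> MFF_LOG_INPORT
            option bits 0-15  -> MFF_LOG_OUTPORT
    STT:    tun_id bits 40-54 -> MFF_LOG_INPORT
            tun_id bits 24-39 -> MFF_LOG_OUTPORT
            tun_id bits 0-23  -> MFF_LOG_DATAPATH
    VXLAN:  the VNI identifies only the datapath, with no room for port
            metadata, hence the MLF_RCV_FROM_VXLAN flag and the
            priority-150 shortcut in table 32.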
static void
consider_port_binding(enum mf_field_id mff_ovn_geneve,
                      const struct simap *ct_zones,
                      const struct lport_index *lports,
                      struct hmap *local_datapaths,
                      const struct sbrec_port_binding *binding,
                      const struct sbrec_chassis *chassis,
                      struct ofpbuf *ofpacts_p,
                      struct hmap *flow_table)
{
    uint32_t dp_key = binding->datapath->tunnel_key;
    uint32_t port_key = binding->tunnel_key;
    if (!get_local_datapath(local_datapaths, dp_key)) {
        return;
    }

    struct match match;
    if (!strcmp(binding->type, "patch")
        || (!strcmp(binding->type, "l3gateway")
            && binding->chassis == chassis)) {
        const char *peer_name = smap_get(&binding->options, "peer");
        const struct sbrec_port_binding *peer = lport_lookup_by_name(
            lports, peer_name);
        const char *peer_peer_name = smap_get(&peer->options, "peer");
        struct zone_ids binding_zones = get_zone_ids(binding, ct_zones);
        // Table 33, priority 100: one flow per port, resubmit to table 34.
        // Table 34, priority 100: drop packets whose logical ingress and
        // egress port are the same.
        // Table 64, priority 100: zero the in_port and resubmit to
        // table 65.
        put_local_common_flows(dp_key, port_key, false, &binding_zones,
                               ofpacts_p, flow_table);

        // Table 65, priority 100: clear the registers and resubmit to
        // table 16; this marks leaving the current logical element and
        // entering the next one.
        match_init_catchall(&match);
        ofpbuf_clear(ofpacts_p);
        match_set_metadata(&match, htonll(dp_key));
        match_set_reg(&match, MFF_LOG_OUTPORT - MFF_REG0, port_key);

        size_t clone_ofs = ofpacts_p->size;
        struct ofpact_nest *clone = ofpact_put_CLONE(ofpacts_p);
        ofpact_put_CT_CLEAR(ofpacts_p);
        put_load(0, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p);
        put_load(0, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p);
        put_load(0, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p);
        struct zone_ids peer_zones = get_zone_ids(peer, ct_zones);
        load_logical_ingress_metadata(peer, &peer_zones, ofpacts_p);
        put_load(0, MFF_LOG_FLAGS, 0, 32, ofpacts_p);
        put_load(0, MFF_LOG_OUTPORT, 0, 32, ofpacts_p);
        for (int i = 0; i < MFF_N_LOG_REGS; i++) {
            put_load(0, MFF_LOG_REG0 + i, 0, 32, ofpacts_p);
        }
        put_load(0, MFF_IN_PORT, 0, 16, ofpacts_p);
        put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, ofpacts_p);
        clone = ofpbuf_at_assert(ofpacts_p, clone_ofs, sizeof *clone);
        ofpacts_p->header = clone;
        ofpact_finish_CLONE(ofpacts_p, &clone);

        ofctrl_add_flow(flow_table, OFTABLE_LOG_TO_PHY, 100, 0,
                        &match, ofpacts_p);
        return;
    }

    if (!strcmp(binding->type, "chassisredirect")
        && binding->chassis == chassis) {

        // Table 33, priority 100: each flow matches one logical output
        // port (reg15) and resubmits to table 34; for a chassisredirect
        // port the logical output port is rewritten to the distributed
        // port.
        match_init_catchall(&match);
        ofpbuf_clear(ofpacts_p);
        match_set_metadata(&match, htonll(dp_key));
        match_set_reg(&match, MFF_LOG_OUTPORT - MFF_REG0, port_key);

        const char *distributed_port = smap_get(&binding->options,
                                                "distributed-port");
        const struct sbrec_port_binding *distributed_binding
            = lport_lookup_by_name(lports, distributed_port);

        if (!distributed_binding) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
            VLOG_WARN_RL(&rl, "No port binding record for distributed "
                         "port %s referred by chassisredirect port %s",
                         distributed_port,
                         binding->logical_port);
        } else if (binding->datapath !=
                   distributed_binding->datapath) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
            VLOG_WARN_RL(&rl,
                         "chassisredirect port %s refers to "
                         "distributed port %s in wrong datapath",
                         binding->logical_port,
                         distributed_port);
        } else {
            put_load(distributed_binding->tunnel_key,
                     MFF_LOG_OUTPORT, 0, 32, ofpacts_p);

            struct zone_ids zone_ids = get_zone_ids(distributed_binding,
                                                    ct_zones);
            if (zone_ids.ct) {
                put_load(zone_ids.ct, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p);
            }
            if (zone_ids.dnat) {
                put_load(zone_ids.dnat, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p);
            }
            if (zone_ids.snat) {
                put_load(zone_ids.snat, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p);
            }

            put_resubmit(OFTABLE_CHECK_LOOPBACK, ofpacts_p);
        }

        ofctrl_add_flow(flow_table, OFTABLE_LOCAL_OUTPUT, 100, 0,
                        &match, ofpacts_p);
        return;
    }

    // Look up the ofport for the logical port:
    // for a local VIF, ofport is the VIF's OpenFlow port and tun is NULL;
    // for a port on a remote chassis, ofport is the tunnel port and tun
    // carries the remote chassis's tunnel info.
    int tag = 0;
    bool nested_container = false;
    ofp_port_t ofport;
    bool is_remote = false;
    if (binding->parent_port && *binding->parent_port) {
        ofport = u16_to_ofp(simap_get(&localvif_to_ofport,
                                      binding->parent_port));
        if (ofport) {
            tag = *binding->tag;
            nested_container = true;
        }
    } else {
        ofport = u16_to_ofp(simap_get(&localvif_to_ofport,
                                      binding->logical_port));
        if ((!strcmp(binding->type, "localnet")
             || !strcmp(binding->type, "l2gateway"))
            && ofport && binding->tag) {
            tag = *binding->tag;
        }
    }

    const struct chassis_tunnel *tun = NULL;
    const struct sbrec_port_binding *localnet_port =
        get_localnet_port(local_datapaths, dp_key);
    if (!ofport) {
        is_remote = true;
        if (!binding->chassis) {
            return;
        }
        if (localnet_port) {
            ofport = u16_to_ofp(simap_get(&localvif_to_ofport,
                                          localnet_port->logical_port));
            if (!ofport) {
                return;
            }
        } else {
            tun = chassis_tunnel_find(binding->chassis->name);
            if (!tun) {
                return;
            }
            ofport = tun->ofport;
        }
    }

    if (!is_remote) {
        struct zone_ids zone_ids = get_zone_ids(binding, ct_zones);
        put_local_common_flows(dp_key, port_key, nested_container, &zone_ids,
                               ofpacts_p, flow_table);

        // Table 0, priorities 100 and 150.
        // Priority 150 handles tagged traffic (containers nested in a VM,
        // or a local network attached via VLAN): it matches the tag and
        // strips it.
        // Priority 100 handles VM traffic and untagged local networks.
        // Both set the logical input port and the logical datapath and
        // resubmit to table 16.
        ofpbuf_clear(ofpacts_p);
        match_init_catchall(&match);
        match_set_in_port(&match, ofport);

        if (tag || !strcmp(binding->type, "localnet")
            || !strcmp(binding->type, "l2gateway")) {
            match_set_dl_vlan(&match, htons(tag));
            if (nested_container) {
                put_load(MLF_ALLOW_LOOPBACK, MFF_LOG_FLAGS, 0, 1, ofpacts_p);
            }
            ofpact_put_STRIP_VLAN(ofpacts_p);
        }

        uint32_t ofpacts_orig_size = ofpacts_p->size;

        load_logical_ingress_metadata(binding, &zone_ids, ofpacts_p);

        put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, ofpacts_p);
        ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG,
                        tag ? 150 : 100, 0, &match, ofpacts_p);

        if (!tag && (!strcmp(binding->type, "localnet")
                     || !strcmp(binding->type, "l2gateway"))) {

            ofpbuf_pull(ofpacts_p, ofpacts_orig_size);
            match_set_dl_tci_masked(&match, 0, htons(VLAN_CFI));
            ofctrl_add_flow(flow_table, 0, 100, 0, &match, ofpacts_p);
        }

        // Table 65, priority 100: deliver the packet to the local VIF.
        match_init_catchall(&match);
        ofpbuf_clear(ofpacts_p);
        match_set_metadata(&match, htonll(dp_key));
        match_set_reg(&match, MFF_LOG_OUTPORT - MFF_REG0, port_key);
        if (tag) {
            struct ofpact_vlan_vid *vlan_vid;
            vlan_vid = ofpact_put_SET_VLAN_VID(ofpacts_p);
            vlan_vid->vlan_vid = tag;
            vlan_vid->push_vlan_if_needed = true;
        }
        ofpact_put_OUTPUT(ofpacts_p)->port = ofport;
        if (tag) {
            ofpact_put_STRIP_VLAN(ofpacts_p);
        }
        ofctrl_add_flow(flow_table, OFTABLE_LOG_TO_PHY, 100, 0,
                        &match, ofpacts_p);
    } else if (!tun) {
        // Table 33, priority 100: hand the packet over to the localnet
        // port; each flow matches one logical output port, rewrites the
        // output port to the localnet port, and resubmits to the same
        // table 33.
        match_init_catchall(&match);
        ofpbuf_clear(ofpacts_p);

        match_set_metadata(&match, htonll(dp_key));
        match_set_reg(&match, MFF_LOG_OUTPORT - MFF_REG0, port_key);

        put_load(localnet_port->tunnel_key, MFF_LOG_OUTPORT, 0, 32, ofpacts_p);

        put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_p);
        ofctrl_add_flow(flow_table, OFTABLE_LOCAL_OUTPUT, 100, 0,
                        &match, ofpacts_p);
    } else {
        // Table 32, priority 100: send traffic to a remote chassis; each
        // flow matches one output port and encapsulates the packet toward
        // the remote end.
        match_init_catchall(&match);
        ofpbuf_clear(ofpacts_p);

        match_set_metadata(&match, htonll(dp_key));
        match_set_reg(&match, MFF_LOG_OUTPORT - MFF_REG0, port_key);

        put_encapsulation(mff_ovn_geneve, tun, binding->datapath,
                          port_key, ofpacts_p);

        ofpact_put_OUTPUT(ofpacts_p)->port = ofport;
        ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, 0,
                        &match, ofpacts_p);
    }
}

Note: unless stated otherwise, all posts here are original. To keep the information in sync with the source, please cite the original post when republishing. Thanks!

Original link: http://zhaozhanxu.com/2017/03/11/SDN/OVN/2017-03-11-ovn-controller/

License: "Attribution-NonCommercial-ShareAlike 4.0". Please keep the original link and author attribution when republishing.