赵占旭的博客

OVN Northd服务工作

前面我们介绍到,ovn-northd服务主要是将Northbound DB存储的逻辑拓扑翻译为chassis和Logical Flow,今天我们从代码的角度看一下它是怎么实现的。

最近发现了一篇不错的文章,点击这里查看:part 1、part 2

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
/*
 * Abridged main() of ovn-northd as quoted in the article.
 *
 * Creates OVSDB IDL loops for the northbound (NB) and southbound (SB)
 * databases, registers the SB tables and columns that should be replicated,
 * and then loops: run both IDLs, call ovnnb_db_run() and ovnsb_db_run(),
 * service the unixctl server, commit both IDL loops and block until the
 * next wakeup.
 *
 * NOTE: the declarations of ovnnb_db, ovnsb_db, unixctl, exiting and res
 * are not part of this excerpt.
 */
int main(int argc, char *argv[])
{
// Create the connection to the NB DB; the data it maintains is described by
// ovn-nb.ovsschema.
// Note the third argument of ovsdb_idl_create(): per the article, true means
// every update is monitored by default.
struct ovsdb_idl_loop ovnnb_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
ovsdb_idl_create(ovnnb_db, &nbrec_idl_class, true, true));
// NOTE(review): the article's author is unsure here; presumably these suppress
// change alerts for the NB_Global sb_cfg and hv_cfg columns -- confirm.
ovsdb_idl_omit_alert(ovnnb_idl_loop.idl, &nbrec_nb_global_col_sb_cfg);
ovsdb_idl_omit_alert(ovnnb_idl_loop.idl, &nbrec_nb_global_col_hv_cfg);

// Create the connection to the SB DB; the data it maintains is described by
// ovn-sb.ovsschema.
// Here the third argument of ovsdb_idl_create() is false: per the article,
// nothing is monitored by default.
struct ovsdb_idl_loop ovnsb_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
ovsdb_idl_create(ovnsb_db, &sbrec_idl_class, false, true));

// Because the third argument above is false, each table of interest has to be
// registered explicitly. This one makes sure the SB_Global table is replicated.
ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_sb_global);
// Make sure the SB_Global nb_cfg column is replicated, without alerting.
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_sb_global_col_nb_cfg);

// Replicate the Logical_Flow table and its logical_datapath, pipeline,
// table_id, match and actions columns, without alerting.
// Everything below follows the same pattern; the tables and columns are the
// ones described earlier in the article series.
ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_logical_flow);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_logical_flow_col_logical_datapath);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_pipeline);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_table_id);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_match);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_actions);

ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_multicast_group);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_multicast_group_col_datapath);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_multicast_group_col_tunnel_key);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_multicast_group_col_name);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_multicast_group_col_ports);

ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_datapath_binding);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_datapath_binding_col_tunnel_key);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_datapath_binding_col_external_ids);

// Port_Binding: every column is added without alerts except chassis, which is
// added with ovsdb_idl_add_column() instead of add_column_noalert().
ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_port_binding);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_datapath);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_port_binding_col_logical_port);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_port_binding_col_tunnel_key);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_port_binding_col_parent_port);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_tag);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_type);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_options);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_mac);
ovsdb_idl_add_column(ovnsb_idl_loop.idl, &sbrec_port_binding_col_chassis);

ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_mac_binding);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_mac_binding_col_datapath);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_mac_binding_col_ip);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_mac_binding_col_mac);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_mac_binding_col_logical_port);

ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_dhcp_options);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dhcp_options_col_code);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dhcp_options_col_type);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dhcp_options_col_name);

ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_dhcpv6_options);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dhcpv6_options_col_code);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dhcpv6_options_col_type);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dhcpv6_options_col_name);

ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_address_set);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_address_set_col_name);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_address_set_col_addresses);

// Chassis: the nb_cfg column is added with ovsdb_idl_add_column(), like
// Port_Binding.chassis above.
ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_chassis);
ovsdb_idl_add_column(ovnsb_idl_loop.idl, &sbrec_chassis_col_nb_cfg);

/* Main loop. */
exiting = false;
while (!exiting) {
struct northd_context ctx = {
.ovnnb_idl = ovnnb_idl_loop.idl,
// Run the NB IDL loop (per the article: loads the NB DB data); the result
// is kept as the NB transaction for this iteration.
.ovnnb_txn = ovsdb_idl_loop_run(&ovnnb_idl_loop),
.ovnsb_idl = ovnsb_idl_loop.idl,
// Same for the SB DB.
.ovnsb_txn = ovsdb_idl_loop_run(&ovnsb_idl_loop),
};

// Translate the NB contents into the SB transaction.
ovnnb_db_run(&ctx, &ovnsb_idl_loop);
// Per the article: mainly updates port information.
ovnsb_db_run(&ctx, &ovnsb_idl_loop);
// Only when an SB transaction is available: check and add the supported
// DHCPv4 / DHCPv6 options to the SB DB.
if (ctx.ovnsb_txn) {
check_and_add_supported_dhcp_opts_to_sb_db(&ctx);
check_and_add_supported_dhcpv6_opts_to_sb_db(&ctx);
}

// NOTE(review): the article describes this as the listening service that
// handles ovn-controller connections over JSON-RPC. The calls are named
// unixctl_server_*, which suggests a local control-socket server instead --
// confirm against the unixctl library documentation.
unixctl_server_run(unixctl);
unixctl_server_wait(unixctl);
// If an exit was requested during this iteration, do not sleep in
// poll_block() below.
if (exiting) {
poll_immediate_wake();
}
// Commit this iteration's changes to the NB DB and the SB DB.
ovsdb_idl_loop_commit_and_wait(&ovnnb_idl_loop);
ovsdb_idl_loop_commit_and_wait(&ovnsb_idl_loop);

poll_block();
if (should_service_stop()) {
exiting = true;
}
}

// Tear down in order: unixctl server, both IDL loops, then the service.
unixctl_server_destroy(unixctl);
ovsdb_idl_loop_destroy(&ovnnb_idl_loop);
ovsdb_idl_loop_destroy(&ovnsb_idl_loop);
service_stop();

exit(res);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * Abridged ovnnb_db_run() as quoted in the article: translates the NB
 * database contents into SB records by calling the build_* / sync_* helpers
 * in a fixed order.
 *
 * NOTE: 'datapaths' and 'ports' are used here but their declarations are not
 * part of this excerpt, and 'sb_loop' is not used in the lines shown.
 */
static void
ovnnb_db_run(struct northd_context *ctx, struct ovsdb_idl_loop *sb_loop)
{
// Per the article: syncs the NB logical switches and routers to the SB
// Datapath_Binding table and records them as struct ovn_datapath.
build_datapaths(ctx, &datapaths);
// Per the article: syncs the NB logical switch ports to the SB Port_Binding
// table and records them as struct ovn_port.
build_ports(ctx, &datapaths, &ports);
// Per the article: builds the bookkeeping used to manage IP and MAC
// addresses, e.g. when a port is configured with an IP, "dynamic" or a MAC.
build_ipam(&datapaths, &ports);
// Generate Logical_Flow and Multicast_Group rows from the NB contents.
// The individual flow tables were introduced in the previous article.
build_lflows(ctx, &datapaths, &ports);

// Update the Address_Set table.
sync_address_sets(ctx);
}
1
2
3
4
5
6
7
8
9
10
11
/*
 * Southbound-side pass of one main-loop iteration.
 *
 * Does nothing unless a northbound transaction is available and the
 * southbound IDL has connected at least once. Otherwise it refreshes the
 * logical port status and then the northbound configuration, in that order.
 */
static void
ovnsb_db_run(struct northd_context *ctx, struct ovsdb_idl_loop *sb_loop)
{
    /* The NB transaction is tested first, so the connection query is only
     * made when a transaction exists. */
    if (ctx->ovnnb_txn && ovsdb_idl_has_ever_connected(ctx->ovnsb_idl)) {
        /* Per the article: driven by the Port_Binding chassis column; a
         * non-empty chassis means the port is marked UP in the NB DB. */
        update_logical_port_status(ctx);
        update_northbound_cfg(ctx, sb_loop);
    }
}

Datapath_Binding、Port_Binding、MAC_Binding的创建都比较简单,这里就不展开了,主要看一下逻辑流表的生成。首先会创建逻辑交换网元的逻辑流表,然后创建逻辑路由网元的逻辑流表,最后查看当前已有的逻辑流表并对比变化,将有更改的逻辑流表写入数据库中。下面我们重点关注逻辑交换和逻辑路由这两个网元的逻辑流表。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
/*
 * Abridged build_lswitch_flows() as quoted in the article: adds the logical
 * flows of every logical switch datapath to 'lflows' and the flood/unknown
 * multicast groups to 'mcgroups'.
 *
 * datapaths: hmap of struct ovn_datapath, iterated through key_node.
 * ports:     hmap of struct ovn_port, iterated through key_node.
 * lflows:    receives flows through ovn_lflow_add().
 * mcgroups:  receives group members through ovn_multicast_add().
 *
 * Table numbers in the comments below are the ones given by the article; the
 * code itself only names stages (S_SWITCH_*).
 *
 * NOTE: 'match', 'actions', 'options_action', 'response_action' and
 * 'ipv4_addr_match' are used as struct ds buffers, but their declarations are
 * not part of this excerpt.
 */
static void
build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
struct hmap *lflows, struct hmap *mcgroups)
{
// Build the ingress and egress pre-ACL and ACL stages (per the article:
// ingress tables 3-9 and egress tables 0-6).
// Each helper follows the same pattern as the code below, so they are not
// examined one by one.
struct ovn_datapath *od;
HMAP_FOR_EACH (od, key_node, datapaths) {
build_pre_acls(od, lflows);
build_pre_lb(od, lflows);
build_pre_stateful(od, lflows);
build_acls(od, lflows);
build_flows(od, lflows);
build_qos(od, lflows);
build_lb(od, lflows);
build_stateful(od, lflows);
}

// Ingress table 0, priority 100: admission control.
HMAP_FOR_EACH (od, key_node, datapaths) {
// Drop packets that carry a VLAN tag.
ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_L2, 100, "vlan.present",
"drop;");
// Drop packets whose source MAC is multicast or broadcast.
ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_L2, 100, "eth.src[40]",
"drop;");
}

// Ingress table 0, priority 50: L2 port security.
// NOTE(review): unlike the later loops, no ds_clear(&match) or
// ds_clear(&actions) appears before the appends in this excerpt -- confirm
// against the full source.
struct ovn_port *op;
HMAP_FOR_EACH (op, key_node, ports) {
ds_put_format(&match, "inport == %s", op->json_key);
build_port_security_l2("eth.src", op->ps_addrs, op->n_ps_addrs,
&match);

// Optionally prepend a set_queue() action when the SB port options carry
// "qdisc_queue_id".
const char *queue_id = smap_get(&op->sb->options, "qdisc_queue_id");
if (queue_id) {
ds_put_format(&actions, "set_queue(%s); ", queue_id);
}
ds_put_cstr(&actions, "next;");
// Packets that satisfy the match are passed on.
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_PORT_SEC_L2, 50,
ds_cstr(&match), ds_cstr(&actions));

if (op->nbsp->n_port_security) {
// With port security configured, add the IP and ND port-security flows
// (per the article: ingress tables 1 and 2, priorities 80 and 90).
build_port_security_ip(P_IN, op, lflows);
build_port_security_nd(op, lflows);
}
}

// Ingress tables 1-2: the priority-0 port-security default lets packets
// through.
HMAP_FOR_EACH (od, key_node, datapaths) {
ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_ND, 0, "1", "next;");
ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_IP, 0, "1", "next;");
}

// Ingress table 10, priority 100: packets arriving on a localnet or vtep
// port skip the ARP responder and are passed on.
HMAP_FOR_EACH (op, key_node, ports) {
if ((!strcmp(op->nbsp->type, "localnet")) ||
(!strcmp(op->nbsp->type, "vtep"))) {
ds_clear(&match);
ds_put_format(&match, "inport == %s", op->json_key);
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_ARP_ND_RSP, 100,
ds_cstr(&match), "next;");
}
}

// Ingress table 10, priority 50: answer ARP requests for known IPs on
// behalf of the owner.
HMAP_FOR_EACH (op, key_node, ports) {
for (size_t i = 0; i < op->n_lsp_addrs; i++) {
for (size_t j = 0; j < op->lsp_addrs[i].n_ipv4_addrs; j++) {
ds_clear(&match);
ds_put_format(&match, "arp.tpa == %s && arp.op == 1",
op->lsp_addrs[i].ipv4_addrs[j].addr_s);
ds_clear(&actions);
ds_put_format(&actions,
"eth.dst = eth.src; "
"eth.src = %s; "
"arp.op = 2; /* ARP reply */ "
"arp.tha = arp.sha; "
"arp.sha = %s; "
"arp.tpa = arp.spa; "
"arp.spa = %s; "
"outport = inport; "
"flags.loopback = 1; "
"output;",
op->lsp_addrs[i].ea_s, op->lsp_addrs[i].ea_s,
op->lsp_addrs[i].ipv4_addrs[j].addr_s);
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_ARP_ND_RSP, 50,
ds_cstr(&match), ds_cstr(&actions));

// A DHCP client that probes for address conflicts sends an ARP request for
// the IP it already owns. Such a request must not be answered, so the same
// match restricted to the owning port is added at priority 100 with "next;".
ds_put_format(&match, " && inport == %s", op->json_key);
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_ARP_ND_RSP, 100,
ds_cstr(&match), "next;");
}
}
}

// Ingress table 10, priority 0: by default ARP requests pass through.
HMAP_FOR_EACH (od, key_node, datapaths) {
ovn_lflow_add(lflows, od, S_SWITCH_IN_ARP_ND_RSP, 0, "1", "next;");
}

// Ingress tables 11-12, priority 100: DHCP options and DHCP response.
// Only the first IPv4 address per address entry for which
// build_dhcpv4_action() succeeds is used (note the break below).
// NOTE(review): 'match' is appended to without a preceding ds_clear() and is
// passed to ds_destroy() at the end of the branch, yet it is used again later
// in this function -- the elided declarations may explain this; confirm.
HMAP_FOR_EACH (op, key_node, ports) {
for (size_t i = 0; i < op->n_lsp_addrs; i++) {
for (size_t j = 0; j < op->lsp_addrs[i].n_ipv4_addrs; j++) {
if (build_dhcpv4_action(
op, op->lsp_addrs[i].ipv4_addrs[j].addr,
&options_action, &response_action, &ipv4_addr_match)) {
ds_put_format(
&match, "inport == %s && eth.src == %s && "
"ip4.src == 0.0.0.0 && ip4.dst == 255.255.255.255 && "
"udp.src == 68 && udp.dst == 67", op->json_key,
op->lsp_addrs[i].ea_s);

// Rewrite the option part of the DHCP packet: supply the assigned IP address
// and the reply information, then move on to the next stage (the response).
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_DHCP_OPTIONS,
100, ds_cstr(&match),
ds_cstr(&options_action));
ds_clear(&match);
ds_put_format(
&match, "inport == %s && eth.src == %s && "
"%s && udp.src == 68 && udp.dst == 67", op->json_key,
op->lsp_addrs[i].ea_s, ds_cstr(&ipv4_addr_match));

// When the client renews its lease, the match has to change to
// ip4.src = OFFER_IP and ip4.dst = {SERVER_IP, 255.255.255.255}.
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_DHCP_OPTIONS,
100, ds_cstr(&match),
ds_cstr(&options_action));
ds_clear(&match);

ds_put_format(
&match, "inport == %s && eth.src == %s && "
"ip4 && udp.src == 68 && udp.dst == 67"
" && "REGBIT_DHCP_OPTS_RESULT, op->json_key,
op->lsp_addrs[i].ea_s);
// If the options were filled in, rewrite the packet's source and destination
// so that it becomes the reply.
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_DHCP_RESPONSE,
100, ds_cstr(&match),
ds_cstr(&response_action));
ds_destroy(&match);
ds_destroy(&options_action);
ds_destroy(&response_action);
ds_destroy(&ipv4_addr_match);
break;
}
}
}
}

// Ingress tables 11-12, priority 0: pass by default.
HMAP_FOR_EACH (od, key_node, datapaths) {
ovn_lflow_add(lflows, od, S_SWITCH_IN_DHCP_OPTIONS, 0, "1", "next;");
ovn_lflow_add(lflows, od, S_SWITCH_IN_DHCP_RESPONSE, 0, "1", "next;");
}

// Ingress table 13, priority 100: flood multicast and broadcast.
// Every enabled port joins the flood group.
HMAP_FOR_EACH (op, key_node, ports) {
if (lsp_is_enabled(op->nbsp)) {
ovn_multicast_add(mcgroups, &mc_flood, op);
}
}
HMAP_FOR_EACH (od, key_node, datapaths) {
ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 100, "eth.mcast",
"outport = \""MC_FLOOD"\"; output;");
}

// Ingress table 13, priority 50: pick the output port by destination MAC.
// Per the article: a logical switch port has no MAC of its own; the addresses
// recorded on it are those of the attached peer.
HMAP_FOR_EACH (op, key_node, ports) {
for (size_t i = 0; i < op->nbsp->n_addresses; i++) {
struct eth_addr mac;
// Parse the MAC from the port's address string and write it into the match.
if (ovs_scan(op->nbsp->addresses[i],
ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
ds_clear(&match);
ds_put_format(&match, "eth.dst == "ETH_ADDR_FMT,
ETH_ADDR_ARGS(mac));

ds_clear(&actions);
ds_put_format(&actions, "outport = %s; output;", op->json_key);
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_L2_LKUP, 50,
ds_cstr(&match), ds_cstr(&actions));
// The literal address "unknown": an enabled port joins the unknown group and
// its datapath is flagged with has_unknown.
} else if (!strcmp(op->nbsp->addresses[i], "unknown")) {
if (lsp_is_enabled(op->nbsp)) {
ovn_multicast_add(mcgroups, &mc_unknown, op);
op->od->has_unknown = true;
}
// Dynamic address: the MAC has to be parsed from dynamic_addresses instead.
} else if (is_dynamic_lsp_address(op->nbsp->addresses[i])) {
if (!op->nbsp->dynamic_addresses
|| !ovs_scan(op->nbsp->dynamic_addresses,
ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
continue;
}
ds_clear(&match);
ds_put_format(&match, "eth.dst == "ETH_ADDR_FMT,
ETH_ADDR_ARGS(mac));

ds_clear(&actions);
ds_put_format(&actions, "outport = %s; output;", op->json_key);
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_L2_LKUP, 50,
ds_cstr(&match), ds_cstr(&actions));
// The literal address "router": use the MAC of the peer router port.
} else if (!strcmp(op->nbsp->addresses[i], "router")) {
if (!op->peer || !op->peer->nbrp
|| !ovs_scan(op->peer->nbrp->mac,
ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
continue;
}
ds_clear(&match);
ds_put_format(&match, "eth.dst == "ETH_ADDR_FMT,
ETH_ADDR_ARGS(mac));
if (op->peer->od->l3dgw_port
&& op->peer == op->peer->od->l3dgw_port
&& op->peer->od->l3redirect_port) {
// NOTE(review): the article's author guesses this is related to port
// redirection and marks the guess as unverified -- confirm.
ds_put_format(&match, " && is_chassis_resident(%s)",
op->peer->od->l3redirect_port->json_key);
}

ds_clear(&actions);
ds_put_format(&actions, "outport = %s; output;", op->json_key);
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_L2_LKUP, 50,
ds_cstr(&match), ds_cstr(&actions));

// NAT rules of a distributed logical router: one extra flow per
// dnat_and_snat rule that has both a logical_port and a parsable
// external_mac.
// NOTE: the loop variable 'i' below shadows the outer address index 'i'.
if (op->peer->od->l3dgw_port
&& op->peer == op->peer->od->l3dgw_port) {
for (int i = 0; i < op->peer->od->nbr->n_nat; i++) {
const struct nbrec_nat *nat
= op->peer->od->nbr->nat[i];
if (!strcmp(nat->type, "dnat_and_snat")
&& nat->logical_port && nat->external_mac
&& eth_addr_from_string(nat->external_mac, &mac)) {

ds_clear(&match);
ds_put_format(&match, "eth.dst == "ETH_ADDR_FMT
" && is_chassis_resident(\"%s\")",
ETH_ADDR_ARGS(mac),
nat->logical_port);

ds_clear(&actions);
ds_put_format(&actions, "outport = %s; output;",
op->json_key);
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_L2_LKUP,
50, ds_cstr(&match),
ds_cstr(&actions));
}
}
}
} else {
// Anything else is reported (rate-limited) as invalid syntax.
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);

VLOG_INFO_RL(&rl,
"%s: invalid syntax '%s' in addresses column",
op->nbsp->name, op->nbsp->addresses[i]);
}
}
}

// Ingress table 13, priority 0: on datapaths that have an "unknown" port,
// packets not matched above are output to the MC_UNKNOWN group.
HMAP_FOR_EACH (od, key_node, datapaths) {
if (od->has_unknown) {
ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 0, "1",
"outport = \""MC_UNKNOWN"\"; output;");
}
}

// Egress table 7, priority 0: IP port security passes by default.
// Egress table 8, priority 100: multicast and broadcast are output directly.
HMAP_FOR_EACH (od, key_node, datapaths) {
ovn_lflow_add(lflows, od, S_SWITCH_OUT_PORT_SEC_IP, 0, "1", "next;");
ovn_lflow_add(lflows, od, S_SWITCH_OUT_PORT_SEC_L2, 100, "eth.mcast",
"output;");
}

// Egress table 8, priority 50: enabled ports output traffic that satisfies
// the configured L2 port security.
// Priority 150: traffic towards a disabled logical port is dropped.
HMAP_FOR_EACH (op, key_node, ports) {
ds_clear(&match);
ds_put_format(&match, "outport == %s", op->json_key);
if (lsp_is_enabled(op->nbsp)) {
build_port_security_l2("eth.dst", op->ps_addrs, op->n_ps_addrs,
&match);
ovn_lflow_add(lflows, op->od, S_SWITCH_OUT_PORT_SEC_L2, 50,
ds_cstr(&match), "output;");
} else {
ovn_lflow_add(lflows, op->od, S_SWITCH_OUT_PORT_SEC_L2, 150,
ds_cstr(&match), "drop;");
}

// Egress table 7, priorities 80 and 90: IP port-security restrictions,
// added only when port security is configured.
if (op->nbsp->n_port_security) {
build_port_security_ip(P_OUT, op, lflows);
}
}
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
static void
build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
struct hmap *lflows)
{
//ingress 表0 100优先级的入口控制
struct ovn_datapath *od;
HMAP_FOR_EACH (od, key_node, datapaths) {
//带VLAN标签的报文丢弃
//源地址多播或者广播的报文丢弃
ovn_lflow_add(lflows, od, S_ROUTER_IN_ADMISSION, 100,
"vlan.present || eth.src[40]", "drop;");
}

//ingress 表0 50优先级的端口安全控制
//允许端口配置的MAC和广播MAC的报文传输
struct ovn_port *op;
HMAP_FOR_EACH (op, key_node, ports) {
ds_put_format(&match, "eth.mcast && inport == %s", op->json_key);
ovn_lflow_add(lflows, op->od, S_ROUTER_IN_ADMISSION, 50,
ds_cstr(&match), "next;");

ds_clear(&match);
ds_put_format(&match, "eth.dst == %s && inport == %s",
op->lrp_networks.ea_s, op->json_key);
if (op->od->l3dgw_port && op == op->od->l3dgw_port
&& op->od->l3redirect_port) {
//是不是端口转发???
ds_put_format(&match, " && is_chassis_resident(%s)",
op->od->l3redirect_port->json_key);
}
ovn_lflow_add(lflows, op->od, S_ROUTER_IN_ADMISSION, 50,
ds_cstr(&match), "next;");
}

//ingress 表1 ip input的一些固定设置
HMAP_FOR_EACH (od, key_node, datapaths) {
//优先级100,不合规的报文不准通过
ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 100,
"ip4.mcast || "
"ip4.src == 255.255.255.255 || "
"ip4.src == 127.0.0.0/8 || "
"ip4.dst == 127.0.0.0/8 || "
"ip4.src == 0.0.0.0/8 || "
"ip4.dst == 0.0.0.0/8",
"drop;");

//90优先级,ARP reply报文,存储在逻辑交换的ARP表,即MAC_Binding
ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 90, "arp.op == 2",
"put_arp(inport, arp.spa, arp.sha);");

//50优先级,逻辑路由不能发送广播包,所以丢弃
ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 50,
"eth.bcast", "drop;");

//30优先级,ttl消亡的报文丢弃
ds_clear(&match);
ds_put_cstr(&match, "ip4 && ip.ttl == {0, 1}");
ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 30,
ds_cstr(&match), "drop;");

//0优先级,放过
ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 0, "1", "next;");
}

//ingress 表1 ip input的设置,根据具体的IP情况
HMAP_FOR_EACH (op, key_node, ports) {
if (op->lrp_networks.n_ipv4_addrs) {
//100优先级,源IP是逻辑路由端口IP或者广播IP的丢掉
ds_clear(&match);
ds_put_cstr(&match, "ip4.src == ");
op_put_v4_networks(&match, op, true);
ds_put_cstr(&match, " && "REGBIT_EGRESS_LOOPBACK" == 0");
ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 100,
ds_cstr(&match), "drop;");

//90优先级,对逻辑路由内IP的ICMP echo requests报文进行回复
ds_clear(&match);
ds_put_cstr(&match, "ip4.dst == ");
op_put_v4_networks(&match, op, false);
ds_put_cstr(&match, " && icmp4.type == 8 && icmp4.code == 0");

ds_clear(&actions);
ds_put_format(&actions,
"ip4.dst <-> ip4.src; "
"ip.ttl = 255; "
"icmp4.type = 0; "
"flags.loopback = 1; "
"next; ");
ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 90,
ds_cstr(&match), ds_cstr(&actions));
}

//90优先级,针对逻辑路由内IP的ARP requests进行回复
for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
ds_clear(&match);
ds_put_format(&match,
"inport == %s && arp.tpa == %s && arp.op == 1",
op->json_key, op->lrp_networks.ipv4_addrs[i].addr_s);
if (op->od->l3dgw_port && op == op->od->l3dgw_port
&& op->od->l3redirect_port) {
ds_put_format(&match, " && is_chassis_resident(%s)",
op->od->l3redirect_port->json_key);
}

ds_clear(&actions);
ds_put_format(&actions,
"eth.dst = eth.src; "
"eth.src = %s; "
"arp.op = 2; /* ARP reply */ "
"arp.tha = arp.sha; "
"arp.sha = %s; "
"arp.tpa = arp.spa; "
"arp.spa = %s; "
"outport = %s; "
"flags.loopback = 1; "
"output;",
op->lrp_networks.ea_s,
op->lrp_networks.ea_s,
op->lrp_networks.ipv4_addrs[i].addr_s,
op->json_key);
ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 90,
ds_cstr(&match), ds_cstr(&actions));
}

//LB的VIP也需要ARP responses的报文进行回复,同上的操作
struct sset all_ips = SSET_INITIALIZER(&all_ips);

for (int i = 0; i < op->od->nbr->n_load_balancer; i++) {
struct nbrec_load_balancer *lb = op->od->nbr->load_balancer[i];
struct smap *vips = &lb->vips;
struct smap_node *node;

SMAP_FOR_EACH (node, vips) {
char *ip_address = NULL;
uint16_t port;

ip_address_and_port_from_lb_key(node->key, &ip_address, &port);
if (!sset_contains(&all_ips, ip_address)) {
sset_add(&all_ips, ip_address);
}

free(ip_address);
}
}

const char *ip_address;
SSET_FOR_EACH(ip_address, &all_ips) {
ds_clear(&match);
ds_put_format(&match,
"inport == %s && arp.tpa == "IP_FMT" && arp.op == 1",
op->json_key, IP_ARGS(ip));

ds_clear(&actions);
ds_put_format(&actions,
"eth.dst = eth.src; "
"eth.src = %s; "
"arp.op = 2; /* ARP reply */ "
"arp.tha = arp.sha; "
"arp.sha = %s; "
"arp.tpa = arp.spa; "
"arp.spa = "IP_FMT"; "
"outport = %s; "
"flags.loopback = 1; "
"output;",
op->lrp_networks.ea_s,
op->lrp_networks.ea_s,
IP_ARGS(ip),
op->json_key);
ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 90,
ds_cstr(&match), ds_cstr(&actions));
}


/* A gateway router can have 2 SNAT IP addresses to force DNATed and
* LBed traffic respectively to be SNATed. In addition, there can be
* a number of SNAT rules in the NAT table. */
ovs_be32 *snat_ips = xmalloc(sizeof *snat_ips *
(op->od->nbr->n_nat + 2));
size_t n_snat_ips = 0;

ovs_be32 snat_ip;
const char *dnat_force_snat_ip = get_force_snat_ip(op->od, "dnat",
&snat_ip);
if (dnat_force_snat_ip) {
snat_ips[n_snat_ips++] = snat_ip;
}

const char *lb_force_snat_ip = get_force_snat_ip(op->od, "lb",
&snat_ip);
if (lb_force_snat_ip) {
snat_ips[n_snat_ips++] = snat_ip;
}

for (int i = 0; i < op->od->nbr->n_nat; i++) {
const struct nbrec_nat *nat;

nat = op->od->nbr->nat[i];

ovs_be32 ip;
if (!ip_parse(nat->external_ip, &ip) || !ip) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
VLOG_WARN_RL(&rl, "bad ip address %s in nat configuration "
"for router %s", nat->external_ip, op->key);
continue;
}

if (!strcmp(nat->type, "snat")) {
snat_ips[n_snat_ips++] = ip;
continue;
}

//EIP(DNAT)需要的arp处理
ds_clear(&match);
ds_put_format(&match,
"inport == %s && arp.tpa == "IP_FMT" && arp.op == 1",
op->json_key, IP_ARGS(ip));

ds_clear(&actions);
ds_put_format(&actions,
"eth.dst = eth.src; "
"arp.op = 2; /* ARP reply */ "
"arp.tha = arp.sha; ");

//说实话,这块没懂
if (op->od->l3dgw_port && op == op->od->l3dgw_port) {
struct eth_addr mac;
if (nat->external_mac &&
eth_addr_from_string(nat->external_mac, &mac)
&& nat->logical_port) {
//分布式NAT的场景,回复的源MAC是nat->external_mac
ds_put_format(&actions,
"eth.src = "ETH_ADDR_FMT"; "
"arp.sha = "ETH_ADDR_FMT"; ",
ETH_ADDR_ARGS(mac),
ETH_ADDR_ARGS(mac));
ds_put_format(&match, " && is_chassis_resident(\"%s\")",
nat->logical_port);
} else {
//网络节点的NAT场景,回复的源MAC是网关的MAC
ds_put_format(&actions,
"eth.src = %s; "
"arp.sha = %s; ",
op->lrp_networks.ea_s,
op->lrp_networks.ea_s);
if (op->od->l3redirect_port) {
ds_put_format(&match, " && is_chassis_resident(%s)",
op->od->l3redirect_port->json_key);
}
}
} else {
ds_put_format(&actions,
"eth.src = %s; "
"arp.sha = %s; ",
op->lrp_networks.ea_s,
op->lrp_networks.ea_s);
}
ds_put_format(&actions,
"arp.tpa = arp.spa; "
"arp.spa = "IP_FMT"; "
"outport = %s; "
"flags.loopback = 1; "
"output;",
IP_ARGS(ip),
op->json_key);
ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 90,
ds_cstr(&match), ds_cstr(&actions));
}

ds_clear(&match);
ds_put_cstr(&match, "ip4.dst == {");
bool has_drop_ips = false;
for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
bool snat_ip_is_router_ip = false;
for (int j = 0; j < n_snat_ips; j++) {
//到SNAT IP的报文丢弃,因为这是一个虚拟IP,不处理任何报文
if (op->lrp_networks.ipv4_addrs[i].addr == snat_ips[j]) {
snat_ip_is_router_ip = true;
break;
}
}
if (snat_ip_is_router_ip) {
continue;
}
ds_put_format(&match, "%s, ",
op->lrp_networks.ipv4_addrs[i].addr_s);
has_drop_ips = true;
}
ds_chomp(&match, ' ');
ds_chomp(&match, ',');
ds_put_cstr(&match, "}");

if (has_drop_ips) {
//到网关router的报文也不处理,之前的ARP和ICMP已经代答
ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 60,
ds_cstr(&match), "drop;");
}

free(snat_ips);
}

//处理NAT、分片和LB的操作
HMAP_FOR_EACH (od, key_node, datapaths) {
//ingress的表2-4,egress的表0-3的默认流表0优先级是通过
ovn_lflow_add(lflows, od, S_ROUTER_IN_DEFRAG, 0, "1", "next;");
ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 0, "1", "next;");
ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 0, "1", "next;");
ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 0, "1", "next;");
ovn_lflow_add(lflows, od, S_ROUTER_OUT_UNDNAT, 0, "1", "next;");
ovn_lflow_add(lflows, od, S_ROUTER_OUT_EGR_LOOP, 0, "1", "next;");

if (!smap_get(&od->nbr->options, "chassis") && !od->l3dgw_port
&& !smap_get(&od->nbr->options, "underlay-gateway")) {
continue;
}

ovs_be32 snat_ip;
const char *dnat_force_snat_ip = get_force_snat_ip(od, "dnat",
&snat_ip);
const char *lb_force_snat_ip = get_force_snat_ip(od, "lb",
&snat_ip);

for (int i = 0; i < od->nbr->n_nat; i++) {
const struct nbrec_nat *nat;

nat = od->nbr->nat[i];

ovs_be32 ip, mask;

char *error = ip_parse_masked(nat->external_ip, &ip, &mask);
if (error || mask != OVS_BE32_MAX) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
VLOG_WARN_RL(&rl, "bad external ip %s for nat",
nat->external_ip);
free(error);
continue;
}

//检测NAT的logical_ip是否合理,SNAT的时候,logical_ip可以是一个子网
error = ip_parse_masked(nat->logical_ip, &ip, &mask);
if (!strcmp(nat->type, "snat")) {
if (error) {
static struct vlog_rate_limit rl =
VLOG_RATE_LIMIT_INIT(5, 1);
VLOG_WARN_RL(&rl, "bad ip network or ip %s for snat "
"in router "UUID_FMT"",
nat->logical_ip, UUID_ARGS(&od->key));
free(error);
continue;
}
} else {
if (error || mask != OVS_BE32_MAX) {
static struct vlog_rate_limit rl =
VLOG_RATE_LIMIT_INIT(5, 1);
VLOG_WARN_RL(&rl, "bad ip %s for dnat in router "
""UUID_FMT"", nat->logical_ip, UUID_ARGS(&od->key));
free(error);
continue;
}
}

//分布式路由NAT,确定NAT规则是否满足分布式NAT的操作
bool distributed = false;
struct eth_addr mac;
if (od->l3dgw_port && !strcmp(nat->type, "dnat_and_snat") &&
nat->logical_port && nat->external_mac) {
if (eth_addr_from_string(nat->external_mac, &mac)) {
distributed = true;
} else {
static struct vlog_rate_limit rl =
VLOG_RATE_LIMIT_INIT(5, 1);
VLOG_WARN_RL(&rl, "bad mac %s for dnat in router "
""UUID_FMT"", nat->external_mac, UUID_ARGS(&od->key));
continue;
}
}

//ingress 表3 ,必须是egress的SNAT已经创建了连接以及反向连接。
if (!strcmp(nat->type, "snat")
|| !strcmp(nat->type, "dnat_and_snat")) {
if (!od->l3dgw_port) {
//网关路由
ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s",
nat->external_ip);
ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 90,
ds_cstr(&match), "ct_snat; next;");
} else {
//分布式路由,网关进来的报文进行unnat
ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s"
" && inport == %s",
nat->external_ip,
od->l3dgw_port->json_key);
if (!distributed && od->l3redirect_port) {
ds_put_format(&match, " && is_chassis_resident(%s)",
od->l3redirect_port->json_key);
}
ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 100,
ds_cstr(&match), "ct_snat;");

//貌似是没有匹配上面的流表,表示不是网关端口来的
//是从其他路由端口来的,需要重定向到l3dgw_port操作NAT
ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s",
nat->external_ip);
ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 50,
ds_cstr(&match),
REGBIT_NAT_REDIRECT" = 1; next;");
}
}

//ingress 表4 DNAT
int dnat = 0, pro = 100;
if (!strcmp(nat->type, "dnat")
|| !strcmp(nat->type, "dnat_and_snat")) {
if (!od->l3dgw_port) {
//网关路由
ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s",
nat->external_ip);
if(!strcmp(nat->type, "dnat")){
if(nat->eport && nat->protocol){
if(!strcmp(nat->protocol, "tcp")) {
ds_put_format(&match, " && tcp && tcp.dst == %d", (uint16_t)nat->eport);
dnat = 1;
} else if (!strcmp(nat->protocol, "udp")) {
ds_put_format(&match, " && udp && udp.dst == %d", (uint16_t)nat->eport);
dnat = 1;
} else { continue; }
} else { ;}
}
ds_clear(&actions);
if (dnat_force_snat_ip) {
//该标志表示egress的SNAT需要执行
ds_put_format(&actions,
"flags.force_snat_for_dnat = 1; ");
}
if(dnat) {
pro = 90;
ds_put_format(&actions, "flags.loopback = 1; ct_dnat(%s:%d);", nat->logical_ip, (int)(nat->lport?:nat->eport));
} else {
ds_put_format(&actions, "flags.loopback = 1; ct_dnat(%s);", nat->logical_ip);
}
ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, pro,
ds_cstr(&match), ds_cstr(&actions));
} else {
//分布式路由,入端口是l3dgw_port
ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s"
" && inport == %s",
nat->external_ip,
od->l3dgw_port->json_key);
if (!distributed && od->l3redirect_port) {
ds_put_format(&match, " && is_chassis_resident(%s)",
od->l3redirect_port->json_key);
}
ds_clear(&actions);
ds_put_format(&actions, "ct_dnat(%s);",
nat->logical_ip);
ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 100,
ds_cstr(&match), ds_cstr(&actions));

ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s",
nat->external_ip);
ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 50,
ds_cstr(&match),
REGBIT_NAT_REDIRECT" = 1; next;");
}
}

//egress 表0 UNDNAT,必须是ingress的DNAT连接已经创建的情况下
if (od->l3dgw_port && (!strcmp(nat->type, "dnat")
|| !strcmp(nat->type, "dnat_and_snat"))) {
ds_clear(&match);
ds_put_format(&match, "ip && ip4.src == %s"
" && outport == %s",
nat->logical_ip,
od->l3dgw_port->json_key);
if (!distributed && od->l3redirect_port) {
ds_put_format(&match, " && is_chassis_resident(%s)",
od->l3redirect_port->json_key);
}
ds_clear(&actions);
if (distributed) {
ds_put_format(&actions, "eth.src = "ETH_ADDR_FMT"; ",
ETH_ADDR_ARGS(mac));
}
ds_put_format(&actions, "ct_dnat;");
ovn_lflow_add(lflows, od, S_ROUTER_OUT_UNDNAT, 100,
ds_cstr(&match), ds_cstr(&actions));
}

//egress 表1 SNAT
if (!strcmp(nat->type, "snat")
|| !strcmp(nat->type, "dnat_and_snat")) {
if (!od->l3dgw_port) {
//网关路由
ds_clear(&match);
ds_put_format(&match, "ip && ip4.src == %s",
nat->logical_ip);
ds_clear(&actions);
ds_put_format(&actions, "ct_snat(%s);", nat->external_ip);

//掩码越长,优先级越高
ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT,
count_1bits(ntohl(mask)) + 1,
ds_cstr(&match), ds_cstr(&actions));
} else {
//分布式路由
ds_clear(&match);
ds_put_format(&match, "ip && ip4.src == %s"
" && outport == %s",
nat->logical_ip,
od->l3dgw_port->json_key);
if (!distributed && od->l3redirect_port) {
ds_put_format(&match, " && is_chassis_resident(%s)",
od->l3redirect_port->json_key);
}
ds_clear(&actions);
if (distributed) {
ds_put_format(&actions, "eth.src = "ETH_ADDR_FMT"; ",
ETH_ADDR_ARGS(mac));
}
ds_put_format(&actions, "ct_snat(%s);", nat->external_ip);

//同样是掩码越长,优先级越高
ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT,
count_1bits(ntohl(mask)) + 1,
ds_cstr(&match), ds_cstr(&actions));
}
}

//ingress 表0,50优先级允许支持NAT的报文通过
//入口是l3dgw_port,并且目的MAC是nat->external_mac的时候通过
if (distributed) {
ds_clear(&match);
ds_put_format(&match,
"eth.dst == "ETH_ADDR_FMT" && inport == %s"
" && is_chassis_resident(\"%s\")",
ETH_ADDR_ARGS(mac),
od->l3dgw_port->json_key,
nat->logical_port);
ovn_lflow_add(lflows, od, S_ROUTER_IN_ADMISSION, 50,
ds_cstr(&match), "next;");
}

//ingress 表7,100优先级允许NAT报文通过
if (distributed) {
ds_clear(&match);
ds_put_format(&match, "ip4.src == %s && outport == %s",
nat->logical_ip,
od->l3dgw_port->json_key);
ovn_lflow_add(lflows, od, S_ROUTER_IN_GW_REDIRECT, 100,
ds_cstr(&match), "next;");
}

//egress 表2,100优先级:如果目的IP和EIP相等,需要将报文clone后回注到ingress的表0
if (od->l3dgw_port) {
//分布式路由
ds_clear(&match);
ds_put_format(&match, "ip4.dst == %s && outport == %s",
nat->external_ip,
od->l3dgw_port->json_key);
ds_clear(&actions);
ds_put_format(&actions,
"clone { ct_clear; "
"inport = outport; outport = \"\"; "
"flags = 0; flags.loopback = 1; ");
for (int i = 0; i < MFF_N_LOG_REGS; i++) {
ds_put_format(&actions, "reg%d = 0; ", i);
}
ds_put_format(&actions, REGBIT_EGRESS_LOOPBACK" = 1; "
"next(pipeline=ingress, table=0); };");
ovn_lflow_add(lflows, od, S_ROUTER_OUT_EGR_LOOP, 100,
ds_cstr(&match), ds_cstr(&actions));
}
}

//ingress 表3,110优先级强制SNAT之前DNAT过的报文,网关路由
if (dnat_force_snat_ip && !od->l3dgw_port) {
//如果报文的目的IP是网关路由的IP,那么进行UNSNAT
ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s", dnat_force_snat_ip);
ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 110,
ds_cstr(&match), "ct_snat; next;");

//已经DNAT过的报文走这一条,强制进行SNAT
ds_clear(&match);
ds_put_format(&match, "flags.force_snat_for_dnat == 1 && ip");
ds_clear(&actions);
ds_put_format(&actions, "ct_snat(%s);", dnat_force_snat_ip);
ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 100,
ds_cstr(&match), ds_cstr(&actions));
}
if (lb_force_snat_ip && !od->l3dgw_port) {
//同上,报文的目的IP是网关路由IP,进行UNSNAT
ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s", lb_force_snat_ip);
ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 100,
ds_cstr(&match), "ct_snat; next;");

//有force_snat_for_lb标志的进行SNAT
ds_clear(&match);
ds_put_format(&match, "flags.force_snat_for_lb == 1 && ip");
ds_clear(&actions);
ds_put_format(&actions, "ct_snat(%s);", lb_force_snat_ip);
ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 100,
ds_cstr(&match), ds_cstr(&actions));
}

if (!od->l3dgw_port) {
//网关路由:让每个报文都重新循环经过DNAT区域,这样做有两个好处:
//1. 反方向需要UNDNAT的报文都会在这里完成UNDNAT。理想情况下这可以在egress完成,
//但是由于网关路由器没有任何特性依赖“源IP地址是EIP”来进行IP路由,
//所以可以在此处进行操作,从而省去以后的一次重新循环
//2. 任何在前面的表中经过SNAT区域的报文都会自动重新循环,
//以取回openflow pipeline路由所需要的新目的IP
ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 50,
"ip4", "flags.loopback = 1; ct_dnat;");
} else {
//分布式路由的NAT,向ingress中没有NAT规则的IP路由表、ARP处理表、网关重定向添加流表
//ingress的表5 300优先级添加流表
ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 300,
REGBIT_NAT_REDIRECT" == 1", "ip.ttl--; next;");

//ingress的表6 200优先级添加流表,修改目的MAC为分布式网关端口的地址
ds_clear(&actions);
ds_put_format(&actions, "eth.dst = %s; next;",
od->l3dgw_port->lrp_networks.ea_s);
ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_RESOLVE, 200,
REGBIT_NAT_REDIRECT" == 1", ds_cstr(&actions));

//ingress的表7 200优先级添加流表
ds_clear(&actions);
ds_put_format(&actions, "outport = %s; next;",
od->l3redirect_port->json_key);
ovn_lflow_add(lflows, od, S_ROUTER_IN_GW_REDIRECT, 200,
REGBIT_NAT_REDIRECT" == 1", ds_cstr(&actions));
}

//用一个集合保存所有需要重组和conntrack的IP
struct sset all_ips = SSET_INITIALIZER(&all_ips);

for (int i = 0; i < od->nbr->n_load_balancer; i++) {
struct nbrec_load_balancer *lb = od->nbr->load_balancer[i];
struct smap *vips = &lb->vips;
struct smap_node *node;

SMAP_FOR_EACH (node, vips) {
uint16_t port = 0;

char *ip_address = NULL;
ip_address_and_port_from_lb_key(node->key, &ip_address, &port);
if (!ip_address) {
continue;
}

if (!sset_contains(&all_ips, ip_address)) {
sset_add(&all_ips, ip_address);
}

//ingress 表4 DNAT中为LB添加高优先级的规则,120和110优先级
//每个匹配项,我们通过add_router_lb_flow()添加两条流表
//一条流表为ct.new添加操作ct_lb($targets)
//另一条流表为ct.est添加操作ct_dnat;
ds_clear(&actions);
ds_put_format(&actions, "ct_lb(%s);", node->value);

ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s",
ip_address);
free(ip_address);

if (port) {
if (lb->protocol && !strcmp(lb->protocol, "udp")) {
ds_put_format(&match, " && udp && udp.dst == %d",
port);
} else {
ds_put_format(&match, " && tcp && tcp.dst == %d",
port);
}
add_router_lb_flow(lflows, od, &match, &actions, 120,
lb_force_snat_ip);
} else {
add_router_lb_flow(lflows, od, &match, &actions, 110,
lb_force_snat_ip);
}
}
}

//如果有LB规则,我们应该将报文发送到conntrack中进行重组和连接跟踪
//有了conntrack,我们可以只让新连接从组里选取DNAT IP
//如果LB规则中有L4的端口,我们需要先重组才能匹配L4端口
const char *ip_address;
SSET_FOR_EACH(ip_address, &all_ips) {
ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s", ip_address);
ovn_lflow_add(lflows, od, S_ROUTER_IN_DEFRAG,
100, ds_cstr(&match), "ct_next;");
}

sset_destroy(&all_ips);
}

//ingress 表5 该表设置出口为正确的出口,并且源MAC修改为出口的MAC
//下一跳的IP保存在reg0中,并且由下一张表处理ARP resolution
HMAP_FOR_EACH (op, key_node, ports) {
for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
add_route(lflows, op, op->lrp_networks.ipv4_addrs[i].addr_s,
op->lrp_networks.ipv4_addrs[i].network_s,
op->lrp_networks.ipv4_addrs[i].plen, NULL, NULL);
}

for (int i = 0; i < op->lrp_networks.n_ipv6_addrs; i++) {
add_route(lflows, op, op->lrp_networks.ipv6_addrs[i].addr_s,
op->lrp_networks.ipv6_addrs[i].network_s,
op->lrp_networks.ipv6_addrs[i].plen, NULL, NULL);
}
}

//将静态路由转换为流表
HMAP_FOR_EACH (od, key_node, datapaths) {
for (int i = 0; i < od->nbr->n_static_routes; i++) {
const struct nbrec_logical_router_static_route *route;

route = od->nbr->static_routes[i];
build_static_route_flow(lflows, od, ports, route);
}
}

//ingress 表6
//下一跳的IP地址存储在reg0,该表处理reg0中的IP设置到出口并且将MAC设置到报文目的MAC
HMAP_FOR_EACH (op, key_node, ports) {
if (op->nbrp) {
//当前状况是逻辑路由端口,当下一跳IP(存于reg0中)匹配该路由端口时,
//流表出口设置为这个逻辑端口,并且将该口的MAC设置为报文的目的MAC
//报文依然在对端的逻辑pipeline中,所以应当匹配对端的出口
if (op->peer && op->nbrp->peer) {
if (op->lrp_networks.n_ipv4_addrs) {
ds_clear(&match);
ds_put_format(&match, "outport == %s && reg0 == ",
op->peer->json_key);
op_put_v4_networks(&match, op, false);

ds_clear(&actions);
ds_put_format(&actions, "eth.dst = %s; next;",
op->lrp_networks.ea_s);
ovn_lflow_add(lflows, op->peer->od, S_ROUTER_IN_ARP_RESOLVE,
100, ds_cstr(&match), ds_cstr(&actions));
}
}
} else if (op->od->n_router_ports && strcmp(op->nbsp->type, "router")) {
//当前状况是逻辑交换的端口,连接了一个VM或者容器
//解析里面的地址,为每个地址遍历连接到逻辑交换的所有路由端口
//如果地址从路由端口可达,则添加ARP表项到该路由的pipeline

for (size_t i = 0; i < op->n_lsp_addrs; i++) {
const char *ea_s = op->lsp_addrs[i].ea_s;
for (size_t j = 0; j < op->lsp_addrs[i].n_ipv4_addrs; j++) {
const char *ip_s = op->lsp_addrs[i].ipv4_addrs[j].addr_s;
for (size_t k = 0; k < op->od->n_router_ports; k++) {
const char *peer_name = smap_get(
&op->od->router_ports[k]->nbsp->options,
"router-port");
ds_clear(&match);
ds_put_format(&match, "outport == %s && reg0 == %s",
peer->json_key, ip_s);

ds_clear(&actions);
ds_put_format(&actions, "eth.dst = %s; next;", ea_s);
ovn_lflow_add(lflows, peer->od,
S_ROUTER_IN_ARP_RESOLVE, 100,
ds_cstr(&match), ds_cstr(&actions));
}
}

for (size_t j = 0; j < op->lsp_addrs[i].n_ipv6_addrs; j++) {
const char *ip_s = op->lsp_addrs[i].ipv6_addrs[j].addr_s;
for (size_t k = 0; k < op->od->n_router_ports; k++) {
const char *peer_name = smap_get(
&op->od->router_ports[k]->nbsp->options,
"router-port");
ds_clear(&match);
ds_put_format(&match, "outport == %s && xxreg0 == %s",
peer->json_key, ip_s);

ds_clear(&actions);
ds_put_format(&actions, "eth.dst = %s; next;", ea_s);
ovn_lflow_add(lflows, peer->od,
S_ROUTER_IN_ARP_RESOLVE, 100,
ds_cstr(&match), ds_cstr(&actions));
}
}
}
} else if (!strcmp(op->nbsp->type, "router")) {
//当前是连接路由的逻辑交换端口
//该交换端口的对端是路由端口
//我们需要添加逻辑流表,能够对所有连接到该交换机的路由端口添加ARP信息
const char *peer_name = smap_get(&op->nbsp->options,
"router-port");
struct ovn_port *peer = ovn_port_find(ports, peer_name);
for (size_t i = 0; i < op->od->n_router_ports; i++) {
const char *router_port_name = smap_get(
&op->od->router_ports[i]->nbsp->options,
"router-port");
struct ovn_port *router_port = ovn_port_find(ports,
router_port_name);
if (router_port->lrp_networks.n_ipv4_addrs) {
ds_clear(&match);
ds_put_format(&match, "outport == %s && reg0 == ",
peer->json_key);
op_put_v4_networks(&match, router_port, false);

ds_clear(&actions);
ds_put_format(&actions, "eth.dst = %s; next;",
router_port->lrp_networks.ea_s);
ovn_lflow_add(lflows, peer->od, S_ROUTER_IN_ARP_RESOLVE,
100, ds_cstr(&match), ds_cstr(&actions));
}

if (router_port->lrp_networks.n_ipv6_addrs) {
ds_clear(&match);
ds_put_format(&match, "outport == %s && xxreg0 == ",
peer->json_key);
op_put_v6_networks(&match, router_port);

ds_clear(&actions);
ds_put_format(&actions, "eth.dst = %s; next;",
router_port->lrp_networks.ea_s);
ovn_lflow_add(lflows, peer->od, S_ROUTER_IN_ARP_RESOLVE,
100, ds_cstr(&match), ds_cstr(&actions));
}
}
}
}

HMAP_FOR_EACH (od, key_node, datapaths) {
ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_RESOLVE, 0, "ip4",
"get_arp(outport, reg0); next;");
}

//ingress 表7 分布式路由中,出口和l3dgw_port相等时
//这个表会将一个子网的流量转发到l3redirect_port
HMAP_FOR_EACH (od, key_node, datapaths) {
if (od->l3dgw_port && od->l3redirect_port) {
//当流量的出口等于l3dgw_port,如果报文不匹配更高级的重定向规则
//那么流量将重定向到l3dgw_port的中心实例
ds_clear(&match);
ds_put_format(&match, "outport == %s",
od->l3dgw_port->json_key);
ds_clear(&actions);
ds_put_format(&actions, "outport = %s; next;",
od->l3redirect_port->json_key);
ovn_lflow_add(lflows, od, S_ROUTER_IN_GW_REDIRECT, 50,
ds_cstr(&match), ds_cstr(&actions));

//如果目的MAC尚未解析(仍为全0),则重定向到l3dgw_port的中心实例
//这些流量在重定向到中心实例之前,会在ingress的ARP request表中被替换为ARP request
ds_put_format(&match, " && eth.dst == 00:00:00:00:00:00");
ovn_lflow_add(lflows, od, S_ROUTER_IN_GW_REDIRECT, 150,
ds_cstr(&match), ds_cstr(&actions));
}

//0优先级默认是转发
ovn_lflow_add(lflows, od, S_ROUTER_IN_GW_REDIRECT, 0, "1", "next;");
}

//ingress 表8 100优先级用来发送ARP Request
//主要是目的MAC修改为广播、源IP从reg1取,目标IP从reg0取、操作为request,最后发送
//0优先级表示目的MAC地址可以处理,直接发送
HMAP_FOR_EACH (od, key_node, datapaths) {
ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_REQUEST, 100,
"eth.dst == 00:00:00:00:00:00",
"arp { "
"eth.dst = ff:ff:ff:ff:ff:ff; "
"arp.spa = reg1; "
"arp.tpa = reg0; "
"arp.op = 1; " /* ARP request */
"output; "
"};");
ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_REQUEST, 0, "1", "output;");
}

//egress 表3 100优先级直接匹配逻辑出口,执行发送
HMAP_FOR_EACH (op, key_node, ports) {
ds_clear(&match);
ds_put_format(&match, "outport == %s", op->json_key);
ovn_lflow_add(lflows, op->od, S_ROUTER_OUT_DELIVERY, 100,
ds_cstr(&match), "output;");
}
}

注意:所有文章非特别说明皆为原创。为保证信息与源同步,转载时请务必注明文章出处!谢谢合作 :-)

原始链接:http://zhaozhanxu.com/2017/03/04/SDN/OVN/2017-03-04-ovn-northd/

许可协议: "署名-非商用-相同方式共享 4.0" 转载请保留原文链接及作者。