前面我们介绍过,ovn-northd服务主要是将Northbound DB中存储的逻辑拓扑翻译为Southbound DB中的Datapath_Binding、Port_Binding和Logical Flow等内容,今天我们从代码的角度看一下它是怎么实现的。
最近发现了一篇不错的文章,点击这里查看part 1和part 2。
/*
 * Abridged entry point of ovn-northd.
 *
 * Visible flow: create IDL connections to the northbound (NB) and southbound
 * (SB) databases, register which SB tables/columns are replicated, then loop:
 * run both IDLs, translate NB state into the SB database, sync status back,
 * service the unixctl server, commit both transactions and block until there
 * is more work.
 *
 * NOTE(review): ovnnb_db, ovnsb_db, unixctl, exiting and res are used but not
 * declared in this excerpt; presumably they are set up in elided code.
 */
int main(int argc, char *argv[])
{
// Create the connection to the NB database; the data it maintains is
// described by ovn-nb.ovsschema.
// Note the third argument of ovsdb_idl_create (true here): monitor every
// table and column by default.
struct ovsdb_idl_loop ovnnb_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
ovsdb_idl_create(ovnnb_db, &nbrec_idl_class, true, true));
// Original author's tentative reading: suppress change alerts for the
// sb_cfg and hv_cfg columns of NB_Global respectively.
ovsdb_idl_omit_alert(ovnnb_idl_loop.idl, &nbrec_nb_global_col_sb_cfg);
ovsdb_idl_omit_alert(ovnnb_idl_loop.idl, &nbrec_nb_global_col_hv_cfg);
// Create the connection to the SB database; the data it maintains is
// described by ovn-sb.ovsschema.
// Here the third argument of ovsdb_idl_create is false: nothing is
// monitored by default, so tables/columns are registered one by one below.
struct ovsdb_idl_loop ovnsb_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
ovsdb_idl_create(ovnsb_db, &sbrec_idl_class, false, true));
// Because monitoring is off by default, each wanted table must be added
// explicitly; this makes sure the SB_Global table is replicated.
ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_sb_global);
// Make sure the nb_cfg column is replicated, without change alerts
// (add_column_noalert is a helper defined outside this excerpt).
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_sb_global_col_nb_cfg);
// Replicate the Logical_Flow table and its logical_datapath, pipeline,
// table_id, match and actions columns, without alerts.
// All registrations below follow the same pattern for other tables/columns.
ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_logical_flow);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_logical_flow_col_logical_datapath);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_pipeline);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_table_id);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_match);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_actions);
// Multicast_Group table.
ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_multicast_group);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_multicast_group_col_datapath);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_multicast_group_col_tunnel_key);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_multicast_group_col_name);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_multicast_group_col_ports);
// Datapath_Binding table.
ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_datapath_binding);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_datapath_binding_col_tunnel_key);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_datapath_binding_col_external_ids);
// Port_Binding table.
ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_port_binding);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_datapath);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_port_binding_col_logical_port);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_port_binding_col_tunnel_key);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_port_binding_col_parent_port);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_tag);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_type);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_options);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_mac);
// The chassis column is registered with ovsdb_idl_add_column rather than
// add_column_noalert, i.e. alerts are not suppressed for it.
ovsdb_idl_add_column(ovnsb_idl_loop.idl, &sbrec_port_binding_col_chassis);
// MAC_Binding table.
ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_mac_binding);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_mac_binding_col_datapath);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_mac_binding_col_ip);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_mac_binding_col_mac);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_mac_binding_col_logical_port);
// DHCP_Options table.
ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_dhcp_options);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dhcp_options_col_code);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dhcp_options_col_type);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dhcp_options_col_name);
// DHCPv6_Options table.
ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_dhcpv6_options);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dhcpv6_options_col_code);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dhcpv6_options_col_type);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dhcpv6_options_col_name);
// Address_Set table.
ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_address_set);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_address_set_col_name);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_address_set_col_addresses);
// Chassis table; nb_cfg is registered with alerts left enabled.
ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_chassis);
ovsdb_idl_add_column(ovnsb_idl_loop.idl, &sbrec_chassis_col_nb_cfg);
/* Main loop. */
exiting = false;
while (!exiting) {
struct northd_context ctx = {
.ovnnb_idl = ovnnb_idl_loop.idl,
// Run the NB IDL loop (loads NB data); its result is kept as the NB
// transaction for this iteration.
.ovnnb_txn = ovsdb_idl_loop_run(&ovnnb_idl_loop),
.ovnsb_idl = ovnsb_idl_loop.idl,
// Run the SB IDL loop (loads SB data); its result is kept as the SB
// transaction for this iteration.
.ovnsb_txn = ovsdb_idl_loop_run(&ovnsb_idl_loop),
};
// Translate the NB contents and stage the results into the SB transaction.
ovnnb_db_run(&ctx, &ovnsb_idl_loop);
// Mainly syncs port status (and the northbound cfg) back from SB state.
ovnsb_db_run(&ctx, &ovnsb_idl_loop);
// Only when an SB transaction is available: make sure the supported
// DHCPv4/DHCPv6 options are present in the SB database.
if (ctx.ovnsb_txn) {
check_and_add_supported_dhcp_opts_to_sb_db(&ctx);
check_and_add_supported_dhcpv6_opts_to_sb_db(&ctx);
}
// Service the unixctl server.
// NOTE(review): the original annotation described this as handling the
// ovn-controller connections over JSON-RPC; unixctl is presumably the
// daemon's local control socket (ovs-appctl style commands) instead —
// ovn-controller talks to the SB database, not to this server. Confirm.
unixctl_server_run(unixctl);
unixctl_server_wait(unixctl);
// If an exit was requested during this iteration, make poll_block()
// below return immediately.
if (exiting) {
poll_immediate_wake();
}
// Commit the pending NB and SB transactions and register the wait
// conditions for the next wake-up.
ovsdb_idl_loop_commit_and_wait(&ovnnb_idl_loop);
ovsdb_idl_loop_commit_and_wait(&ovnsb_idl_loop);
poll_block();
if (should_service_stop()) {
exiting = true;
}
}
// Tear down in the reverse role order: control server, then both IDL loops.
unixctl_server_destroy(unixctl);
ovsdb_idl_loop_destroy(&ovnnb_idl_loop);
ovsdb_idl_loop_destroy(&ovnsb_idl_loop);
service_stop();
exit(res);
}
/*
 * Northbound -> southbound pass: builds datapaths, ports, IPAM state, logical
 * flows and address sets from the NB database, in that order (later steps
 * consume the datapaths/ports collections filled by the earlier ones).
 *
 * ctx:     NB/SB IDL handles and the transactions of the current iteration.
 * sb_loop: SB IDL loop; not referenced by the statements visible here.
 *
 * NOTE(review): `datapaths` and `ports` are not declared in this excerpt;
 * presumably they are local hmaps whose setup/teardown was elided.
 */
static void
ovnnb_db_run(struct northd_context *ctx, struct ovsdb_idl_loop *sb_loop)
{
// Sync the NB logical switches and routers into the SB Datapath_Binding
// table, keeping the mapping in struct ovn_datapath entries.
build_datapaths(ctx, &datapaths);
// Sync the NB logical switch ports into the SB Port_Binding table,
// keeping them in struct ovn_port entries.
build_ports(ctx, &datapaths, &ports);
// Build the IP/MAC address management tables, e.g. for ports configured
// with an IP, with "dynamic" addressing, or with a MAC.
build_ipam(&datapaths, &ports);
// Generate Logical_Flow and Multicast_Group rows from the NB contents
// (the per-table flows were described in the previous article).
build_lflows(ctx, &datapaths, &ports);
// Update the Address_Set table.
sync_address_sets(ctx);
}
/*
 * Southbound -> northbound pass.
 *
 * Does nothing unless an NB transaction is available for this iteration and
 * the SB IDL has connected at least once.  Otherwise it refreshes logical
 * port status and then the northbound cfg.
 *
 * ctx:     NB/SB IDL handles and the transactions of the current iteration.
 * sb_loop: SB IDL loop, forwarded to update_northbound_cfg().
 */
static void
ovnsb_db_run(struct northd_context *ctx, struct ovsdb_idl_loop *sb_loop)
{
    if (ctx->ovnnb_txn && ovsdb_idl_has_ever_connected(ctx->ovnsb_idl)) {
        /* Per the original annotation: driven by the chassis column of
         * Port_Binding; a non-empty chassis means the port is reported as
         * up in the NB database (helper not shown here). */
        update_logical_port_status(ctx);
        update_northbound_cfg(ctx, sb_loop);
    }
}
Datapath_Binding、Port_Binding、MAC_Binding的创建都比较简单,这里就不展开了,主要看一下逻辑流表的生成。首先会创建逻辑交换网元的逻辑流表,然后创建逻辑路由网元的逻辑流表,最后读取目前已有的逻辑流表并对比变化,将有更改的逻辑流表写入数据库中。下面我们重点关注逻辑交换和逻辑路由这两个网元的逻辑流表。
/*
 * Builds the logical flows (and multicast group membership) for logical
 * switches.
 *
 * datapaths: hmap of struct ovn_datapath, iterated via key_node.
 * ports:     hmap of struct ovn_port, iterated via key_node.
 * lflows:    output collection that ovn_lflow_add() appends to.
 * mcgroups:  output collection that ovn_multicast_add() appends to.
 *
 * Table numbers quoted in the comments below are carried over from the
 * original annotations; they depend on the pipeline stage definitions
 * (S_SWITCH_*), which are not visible in this excerpt.
 *
 * NOTE(review): `match`, `actions`, `options_action`, `response_action`,
 * `ipv4_addr_match`, `mc_flood` and `mc_unknown` are used but not declared
 * in this excerpt; presumably the declarations were elided.
 */
static void
build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
struct hmap *lflows, struct hmap *mcgroups)
{
// Build the ingress and egress pre-ACL / ACL related tables (original
// note: ingress tables 3-9 and egress tables 0-6).
// Each helper follows the same pattern as the code below, so they are not
// examined one by one.
struct ovn_datapath *od;
HMAP_FOR_EACH (od, key_node, datapaths) {
build_pre_acls(od, lflows);
build_pre_lb(od, lflows);
build_pre_stateful(od, lflows);
build_acls(od, lflows);
build_flows(od, lflows);
build_qos(od, lflows);
build_lb(od, lflows);
build_stateful(od, lflows);
}
// Ingress table 0 (S_SWITCH_IN_PORT_SEC_L2), priority 100: admission
// control.
HMAP_FOR_EACH (od, key_node, datapaths) {
// Drop packets that carry a VLAN tag.
ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_L2, 100, "vlan.present",
"drop;");
// Drop packets whose Ethernet source is multicast or broadcast.
ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_L2, 100, "eth.src[40]",
"drop;");
}
// Ingress table 0, priority 50: per-port L2 port security.
struct ovn_port *op;
HMAP_FOR_EACH (op, key_node, ports) {
// NOTE(review): no ds_clear(&match)/ds_clear(&actions) is visible before
// these appends in this excerpt; confirm the buffers are reset on every
// iteration, otherwise text accumulates across ports.
ds_put_format(&match, "inport == %s", op->json_key);
build_port_security_l2("eth.src", op->ps_addrs, op->n_ps_addrs,
&match);
// Optional queue selection taken from the SB port binding options.
const char *queue_id = smap_get(&op->sb->options, "qdisc_queue_id");
if (queue_id) {
ds_put_format(&actions, "set_queue(%s); ", queue_id);
}
ds_put_cstr(&actions, "next;");
// Packets that satisfy the match continue to the next table.
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_PORT_SEC_L2, 50,
ds_cstr(&match), ds_cstr(&actions));
if (op->nbsp->n_port_security) {
// When port security is configured, add the IP and ND port-security
// flows (original note: ingress tables 1 and 2, priorities 80 and 90).
build_port_security_ip(P_IN, op, lflows);
build_port_security_nd(op, lflows);
}
}
// Ingress ND/IP port-security tables (original note: tables 1-2):
// the priority-0 default lets everything through.
HMAP_FOR_EACH (od, key_node, datapaths) {
ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_ND, 0, "1", "next;");
ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_IP, 0, "1", "next;");
}
// ARP/ND responder table (original note: ingress table 10), priority 100:
// traffic entering from localnet or vtep ports skips the responder.
HMAP_FOR_EACH (op, key_node, ports) {
if ((!strcmp(op->nbsp->type, "localnet")) ||
(!strcmp(op->nbsp->type, "vtep"))) {
ds_clear(&match);
ds_put_format(&match, "inport == %s", op->json_key);
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_ARP_ND_RSP, 100,
ds_cstr(&match), "next;");
}
}
// ARP/ND responder table, priority 50: answer ARP requests for known
// IPv4 addresses on behalf of the owning port.
HMAP_FOR_EACH (op, key_node, ports) {
for (size_t i = 0; i < op->n_lsp_addrs; i++) {
for (size_t j = 0; j < op->lsp_addrs[i].n_ipv4_addrs; j++) {
ds_clear(&match);
ds_put_format(&match, "arp.tpa == %s && arp.op == 1",
op->lsp_addrs[i].ipv4_addrs[j].addr_s);
ds_clear(&actions);
// Turn the request into a reply in place and send it back out of the
// port it arrived on.
ds_put_format(&actions,
"eth.dst = eth.src; "
"eth.src = %s; "
"arp.op = 2; /* ARP reply */ "
"arp.tha = arp.sha; "
"arp.sha = %s; "
"arp.tpa = arp.spa; "
"arp.spa = %s; "
"outport = inport; "
"flags.loopback = 1; "
"output;",
op->lsp_addrs[i].ea_s, op->lsp_addrs[i].ea_s,
op->lsp_addrs[i].ipv4_addrs[j].addr_s);
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_ARP_ND_RSP, 50,
ds_cstr(&match), ds_cstr(&actions));
// A host may ARP for its own address (e.g. a DHCP client checking for
// an address conflict). Such requests must not be answered on its
// behalf, so the same match restricted to the owning inport is passed
// through at the higher priority 100.
ds_put_format(&match, " && inport == %s", op->json_key);
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_ARP_ND_RSP, 100,
ds_cstr(&match), "next;");
}
}
}
// ARP/ND responder table, priority 0: everything else passes by default.
HMAP_FOR_EACH (od, key_node, datapaths) {
ovn_lflow_add(lflows, od, S_SWITCH_IN_ARP_ND_RSP, 0, "1", "next;");
}
// DHCP options and DHCP response tables (original note: ingress tables
// 11-12), priority 100.
HMAP_FOR_EACH (op, key_node, ports) {
for (size_t i = 0; i < op->n_lsp_addrs; i++) {
for (size_t j = 0; j < op->lsp_addrs[i].n_ipv4_addrs; j++) {
// build_dhcpv4_action() (not shown) fills the three buffers and returns
// true when DHCP flows should be added for this address.
if (build_dhcpv4_action(
op, op->lsp_addrs[i].ipv4_addrs[j].addr,
&options_action, &response_action, &ipv4_addr_match)) {
// NOTE(review): no reset/initialization of `match` is visible before
// this append, and the buffers are destroyed at the end of this branch;
// confirm how they are scoped in the full source.
ds_put_format(
&match, "inport == %s && eth.src == %s && "
"ip4.src == 0.0.0.0 && ip4.dst == 255.255.255.255 && "
"udp.src == 68 && udp.dst == 67", op->json_key,
op->lsp_addrs[i].ea_s);
// Broadcast DHCP request from this port: apply options_action (original
// note: rewrites the DHCP options with the offered address and reply
// data), then continue to the response table.
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_DHCP_OPTIONS,
100, ds_cstr(&match),
ds_cstr(&options_action));
ds_clear(&match);
ds_put_format(
&match, "inport == %s && eth.src == %s && "
"%s && udp.src == 68 && udp.dst == 67", op->json_key,
op->lsp_addrs[i].ea_s, ds_cstr(&ipv4_addr_match));
// Lease renewal case: the address part of the match comes from
// ipv4_addr_match (original note: ip4.src = OFFER_IP and
// ip4.dst = {SERVER_IP, 255.255.255.255}).
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_DHCP_OPTIONS,
100, ds_cstr(&match),
ds_cstr(&options_action));
ds_clear(&match);
ds_put_format(
&match, "inport == %s && eth.src == %s && "
"ip4 && udp.src == 68 && udp.dst == 67"
" && "REGBIT_DHCP_OPTS_RESULT, op->json_key,
op->lsp_addrs[i].ea_s);
// If the options stage flagged the packet (REGBIT_DHCP_OPTS_RESULT),
// apply response_action (original note: rewrites source/destination so
// the packet becomes the reply).
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_DHCP_RESPONSE,
100, ds_cstr(&match),
ds_cstr(&response_action));
ds_destroy(&match);
ds_destroy(&options_action);
ds_destroy(&response_action);
ds_destroy(&ipv4_addr_match);
// Only the first address of this group that yields DHCP actions is used;
// this leaves the inner (j) loop only.
break;
}
}
}
}
// DHCP options / response tables, priority 0: pass by default.
HMAP_FOR_EACH (od, key_node, datapaths) {
ovn_lflow_add(lflows, od, S_SWITCH_IN_DHCP_OPTIONS, 0, "1", "next;");
ovn_lflow_add(lflows, od, S_SWITCH_IN_DHCP_RESPONSE, 0, "1", "next;");
}
// L2 lookup table (original note: ingress table 13), priority 100:
// multicast/broadcast is flooded. First collect every enabled port into
// the flood multicast group.
HMAP_FOR_EACH (op, key_node, ports) {
if (lsp_is_enabled(op->nbsp)) {
ovn_multicast_add(mcgroups, &mc_flood, op);
}
}
HMAP_FOR_EACH (od, key_node, datapaths) {
ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 100, "eth.mcast",
"outport = \""MC_FLOOD"\"; output;");
}
// L2 lookup table, priority 50: choose the output port from the
// destination MAC.
// Original note: a logical switch port has no MAC of its own; the addresses
// recorded on it are those of the attached peer.
HMAP_FOR_EACH (op, key_node, ports) {
for (size_t i = 0; i < op->nbsp->n_addresses; i++) {
struct eth_addr mac;
// Address entry that starts with a parsable MAC: match on it as eth.dst.
if (ovs_scan(op->nbsp->addresses[i],
ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
ds_clear(&match);
ds_put_format(&match, "eth.dst == "ETH_ADDR_FMT,
ETH_ADDR_ARGS(mac));
ds_clear(&actions);
ds_put_format(&actions, "outport = %s; output;", op->json_key);
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_L2_LKUP, 50,
ds_cstr(&match), ds_cstr(&actions));
// "unknown" entry: an enabled port joins the unknown multicast group and
// the datapath is flagged so the priority-0 flow below is installed.
} else if (!strcmp(op->nbsp->addresses[i], "unknown")) {
if (lsp_is_enabled(op->nbsp)) {
ovn_multicast_add(mcgroups, &mc_unknown, op);
op->od->has_unknown = true;
}
// Dynamic address entry: the MAC is parsed from dynamic_addresses; the
// entry is skipped when that is missing or unparsable.
} else if (is_dynamic_lsp_address(op->nbsp->addresses[i])) {
if (!op->nbsp->dynamic_addresses
|| !ovs_scan(op->nbsp->dynamic_addresses,
ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
continue;
}
ds_clear(&match);
ds_put_format(&match, "eth.dst == "ETH_ADDR_FMT,
ETH_ADDR_ARGS(mac));
ds_clear(&actions);
ds_put_format(&actions, "outport = %s; output;", op->json_key);
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_L2_LKUP, 50,
ds_cstr(&match), ds_cstr(&actions));
// "router" entry: use the MAC of the peer logical router port; skipped
// when there is no peer router port or its MAC does not parse.
} else if (!strcmp(op->nbsp->addresses[i], "router")) {
if (!op->peer || !op->peer->nbrp
|| !ovs_scan(op->peer->nbrp->mac,
ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
continue;
}
ds_clear(&match);
ds_put_format(&match, "eth.dst == "ETH_ADDR_FMT,
ETH_ADDR_ARGS(mac));
if (op->peer->od->l3dgw_port
&& op->peer == op->peer->od->l3dgw_port
&& op->peer->od->l3redirect_port) {
// When the peer is the router's l3dgw_port and a redirect port exists,
// the match is narrowed with is_chassis_resident() on that redirect
// port. The original author was unsure of the purpose (guessed "port
// forwarding"); presumably it limits the flow to the chassis hosting
// the redirect port — confirm.
ds_put_format(&match, " && is_chassis_resident(%s)",
op->peer->od->l3redirect_port->json_key);
}
ds_clear(&actions);
ds_put_format(&actions, "outport = %s; output;", op->json_key);
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_L2_LKUP, 50,
ds_cstr(&match), ds_cstr(&actions));
// NAT rules of a distributed logical router: for each dnat_and_snat rule
// with a logical_port and a valid external_mac, also steer that external
// MAC to this port on the chassis where the logical port resides.
if (op->peer->od->l3dgw_port
&& op->peer == op->peer->od->l3dgw_port) {
// Note: this `i` shadows the outer address index for the inner loop only.
for (int i = 0; i < op->peer->od->nbr->n_nat; i++) {
const struct nbrec_nat *nat
= op->peer->od->nbr->nat[i];
if (!strcmp(nat->type, "dnat_and_snat")
&& nat->logical_port && nat->external_mac
&& eth_addr_from_string(nat->external_mac, &mac)) {
ds_clear(&match);
ds_put_format(&match, "eth.dst == "ETH_ADDR_FMT
" && is_chassis_resident(\"%s\")",
ETH_ADDR_ARGS(mac),
nat->logical_port);
ds_clear(&actions);
ds_put_format(&actions, "outport = %s; output;",
op->json_key);
ovn_lflow_add(lflows, op->od, S_SWITCH_IN_L2_LKUP,
50, ds_cstr(&match),
ds_cstr(&actions));
}
}
}
} else {
// Anything else is reported (rate-limited) as invalid syntax.
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
VLOG_INFO_RL(&rl,
"%s: invalid syntax '%s' in addresses column",
op->nbsp->name, op->nbsp->addresses[i]);
}
}
}
// L2 lookup table, priority 0: on datapaths that have ports with
// "unknown" addresses, packets that matched nothing above are sent to the
// MC_UNKNOWN multicast group.
HMAP_FOR_EACH (od, key_node, datapaths) {
if (od->has_unknown) {
ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 0, "1",
"outport = \""MC_UNKNOWN"\"; output;");
}
}
// Egress IP port-security table (original note: egress table 7),
// priority 0: allow by default.
// Egress L2 port-security table (original note: egress table 8),
// priority 100: multicast/broadcast is output directly.
HMAP_FOR_EACH (od, key_node, datapaths) {
ovn_lflow_add(lflows, od, S_SWITCH_OUT_PORT_SEC_IP, 0, "1", "next;");
ovn_lflow_add(lflows, od, S_SWITCH_OUT_PORT_SEC_L2, 100, "eth.mcast",
"output;");
}
// Egress L2 port-security table, priority 50: enabled ports output only
// traffic that satisfies the eth.dst port-security match.
// Priority 150: traffic destined to a disabled logical port is dropped.
HMAP_FOR_EACH (op, key_node, ports) {
ds_clear(&match);
ds_put_format(&match, "outport == %s", op->json_key);
if (lsp_is_enabled(op->nbsp)) {
build_port_security_l2("eth.dst", op->ps_addrs, op->n_ps_addrs,
&match);
ovn_lflow_add(lflows, op->od, S_SWITCH_OUT_PORT_SEC_L2, 50,
ds_cstr(&match), "output;");
} else {
ovn_lflow_add(lflows, op->od, S_SWITCH_OUT_PORT_SEC_L2, 150,
ds_cstr(&match), "drop;");
}
// Egress IP port-security flows when port security is configured
// (original note: egress table 7, priorities 80 and 90).
if (op->nbsp->n_port_security) {
build_port_security_ip(P_OUT, op, lflows);
}
}
}
static void
build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
struct hmap *lflows)
{
//ingress 表0 100优先级的入口控制
struct ovn_datapath *od;
HMAP_FOR_EACH (od, key_node, datapaths) {
//带VLAN标签的报文丢弃
//源地址多播或者广播的报文丢弃
ovn_lflow_add(lflows, od, S_ROUTER_IN_ADMISSION, 100,
"vlan.present || eth.src[40]", "drop;");
}
//ingress 表0 50优先级的端口安全控制
//允许端口配置的MAC和广播MAC的报文传输
struct ovn_port *op;
HMAP_FOR_EACH (op, key_node, ports) {
ds_put_format(&match, "eth.mcast && inport == %s", op->json_key);
ovn_lflow_add(lflows, op->od, S_ROUTER_IN_ADMISSION, 50,
ds_cstr(&match), "next;");
ds_clear(&match);
ds_put_format(&match, "eth.dst == %s && inport == %s",
op->lrp_networks.ea_s, op->json_key);
if (op->od->l3dgw_port && op == op->od->l3dgw_port
&& op->od->l3redirect_port) {
//是不是端口转发???
ds_put_format(&match, " && is_chassis_resident(%s)",
op->od->l3redirect_port->json_key);
}
ovn_lflow_add(lflows, op->od, S_ROUTER_IN_ADMISSION, 50,
ds_cstr(&match), "next;");
}
//ingress 表1 ip input的一些固定设置
HMAP_FOR_EACH (od, key_node, datapaths) {
//优先级100,不合规的报文不准通过
ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 100,
"ip4.mcast || "
"ip4.src == 255.255.255.255 || "
"ip4.src == 127.0.0.0/8 || "
"ip4.dst == 127.0.0.0/8 || "
"ip4.src == 0.0.0.0/8 || "
"ip4.dst == 0.0.0.0/8",
"drop;");
//90优先级,ARP reply报文,存储在逻辑交换的ARP表,即MAC_Binding
ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 90, "arp.op == 2",
"put_arp(inport, arp.spa, arp.sha);");
//50优先级,逻辑路由不能发送广播包,所以丢弃
ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 50,
"eth.bcast", "drop;");
//30优先级,ttl消亡的报文丢弃
ds_clear(&match);
ds_put_cstr(&match, "ip4 && ip.ttl == {0, 1}");
ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 30,
ds_cstr(&match), "drop;");
//0优先级,放过
ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 0, "1", "next;");
}
//ingress 表1 ip input的设置,根据具体的IP情况
HMAP_FOR_EACH (op, key_node, ports) {
if (op->lrp_networks.n_ipv4_addrs) {
//100优先级,源IP是逻辑路由端口IP或者广播IP的丢掉
ds_clear(&match);
ds_put_cstr(&match, "ip4.src == ");
op_put_v4_networks(&match, op, true);
ds_put_cstr(&match, " && "REGBIT_EGRESS_LOOPBACK" == 0");
ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 100,
ds_cstr(&match), "drop;");
//90优先级,对逻辑路由内IP的ICMP echo requests报文进行回复
ds_clear(&match);
ds_put_cstr(&match, "ip4.dst == ");
op_put_v4_networks(&match, op, false);
ds_put_cstr(&match, " && icmp4.type == 8 && icmp4.code == 0");
ds_clear(&actions);
ds_put_format(&actions,
"ip4.dst <-> ip4.src; "
"ip.ttl = 255; "
"icmp4.type = 0; "
"flags.loopback = 1; "
"next; ");
ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 90,
ds_cstr(&match), ds_cstr(&actions));
}
//90优先级,针对逻辑路由内IP的ARP requests进行回复
for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
ds_clear(&match);
ds_put_format(&match,
"inport == %s && arp.tpa == %s && arp.op == 1",
op->json_key, op->lrp_networks.ipv4_addrs[i].addr_s);
if (op->od->l3dgw_port && op == op->od->l3dgw_port
&& op->od->l3redirect_port) {
ds_put_format(&match, " && is_chassis_resident(%s)",
op->od->l3redirect_port->json_key);
}
ds_clear(&actions);
ds_put_format(&actions,
"eth.dst = eth.src; "
"eth.src = %s; "
"arp.op = 2; /* ARP reply */ "
"arp.tha = arp.sha; "
"arp.sha = %s; "
"arp.tpa = arp.spa; "
"arp.spa = %s; "
"outport = %s; "
"flags.loopback = 1; "
"output;",
op->lrp_networks.ea_s,
op->lrp_networks.ea_s,
op->lrp_networks.ipv4_addrs[i].addr_s,
op->json_key);
ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 90,
ds_cstr(&match), ds_cstr(&actions));
}
//LB的VIP也需要ARP responses的报文进行回复,同上的操作
struct sset all_ips = SSET_INITIALIZER(&all_ips);
for (int i = 0; i < op->od->nbr->n_load_balancer; i++) {
struct nbrec_load_balancer *lb = op->od->nbr->load_balancer[i];
struct smap *vips = &lb->vips;
struct smap_node *node;
SMAP_FOR_EACH (node, vips) {
char *ip_address = NULL;
uint16_t port;
ip_address_and_port_from_lb_key(node->key, &ip_address, &port);
if (!sset_contains(&all_ips, ip_address)) {
sset_add(&all_ips, ip_address);
}
free(ip_address);
}
}
const char *ip_address;
SSET_FOR_EACH(ip_address, &all_ips) {
ds_clear(&match);
ds_put_format(&match,
"inport == %s && arp.tpa == "IP_FMT" && arp.op == 1",
op->json_key, IP_ARGS(ip));
ds_clear(&actions);
ds_put_format(&actions,
"eth.dst = eth.src; "
"eth.src = %s; "
"arp.op = 2; /* ARP reply */ "
"arp.tha = arp.sha; "
"arp.sha = %s; "
"arp.tpa = arp.spa; "
"arp.spa = "IP_FMT"; "
"outport = %s; "
"flags.loopback = 1; "
"output;",
op->lrp_networks.ea_s,
op->lrp_networks.ea_s,
IP_ARGS(ip),
op->json_key);
ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 90,
ds_cstr(&match), ds_cstr(&actions));
}
/* A gateway router can have 2 SNAT IP addresses to force DNATed and
* LBed traffic respectively to be SNATed. In addition, there can be
* a number of SNAT rules in the NAT table. */
ovs_be32 *snat_ips = xmalloc(sizeof *snat_ips *
(op->od->nbr->n_nat + 2));
size_t n_snat_ips = 0;
ovs_be32 snat_ip;
const char *dnat_force_snat_ip = get_force_snat_ip(op->od, "dnat",
&snat_ip);
if (dnat_force_snat_ip) {
snat_ips[n_snat_ips++] = snat_ip;
}
const char *lb_force_snat_ip = get_force_snat_ip(op->od, "lb",
&snat_ip);
if (lb_force_snat_ip) {
snat_ips[n_snat_ips++] = snat_ip;
}
for (int i = 0; i < op->od->nbr->n_nat; i++) {
const struct nbrec_nat *nat;
nat = op->od->nbr->nat[i];
ovs_be32 ip;
if (!ip_parse(nat->external_ip, &ip) || !ip) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
VLOG_WARN_RL(&rl, "bad ip address %s in nat configuration "
"for router %s", nat->external_ip, op->key);
continue;
}
if (!strcmp(nat->type, "snat")) {
snat_ips[n_snat_ips++] = ip;
continue;
}
//EIP(DNAT)需要的arp处理
ds_clear(&match);
ds_put_format(&match,
"inport == %s && arp.tpa == "IP_FMT" && arp.op == 1",
op->json_key, IP_ARGS(ip));
ds_clear(&actions);
ds_put_format(&actions,
"eth.dst = eth.src; "
"arp.op = 2; /* ARP reply */ "
"arp.tha = arp.sha; ");
//说实话,这块没懂
if (op->od->l3dgw_port && op == op->od->l3dgw_port) {
struct eth_addr mac;
if (nat->external_mac &&
eth_addr_from_string(nat->external_mac, &mac)
&& nat->logical_port) {
//分布式NAT的场景,回复的源MAC是nat->external_mac
ds_put_format(&actions,
"eth.src = "ETH_ADDR_FMT"; "
"arp.sha = "ETH_ADDR_FMT"; ",
ETH_ADDR_ARGS(mac),
ETH_ADDR_ARGS(mac));
ds_put_format(&match, " && is_chassis_resident(\"%s\")",
nat->logical_port);
} else {
//网络节点的NAT场景,回复的源MAC是网关的MAC
ds_put_format(&actions,
"eth.src = %s; "
"arp.sha = %s; ",
op->lrp_networks.ea_s,
op->lrp_networks.ea_s);
if (op->od->l3redirect_port) {
ds_put_format(&match, " && is_chassis_resident(%s)",
op->od->l3redirect_port->json_key);
}
}
} else {
ds_put_format(&actions,
"eth.src = %s; "
"arp.sha = %s; ",
op->lrp_networks.ea_s,
op->lrp_networks.ea_s);
}
ds_put_format(&actions,
"arp.tpa = arp.spa; "
"arp.spa = "IP_FMT"; "
"outport = %s; "
"flags.loopback = 1; "
"output;",
IP_ARGS(ip),
op->json_key);
ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 90,
ds_cstr(&match), ds_cstr(&actions));
}
ds_clear(&match);
ds_put_cstr(&match, "ip4.dst == {");
bool has_drop_ips = false;
for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
bool snat_ip_is_router_ip = false;
for (int j = 0; j < n_snat_ips; j++) {
//到SNAT IP的报文丢弃,因为这是一个虚拟IP,不处理任何报文
if (op->lrp_networks.ipv4_addrs[i].addr == snat_ips[j]) {
snat_ip_is_router_ip = true;
break;
}
}
if (snat_ip_is_router_ip) {
continue;
}
ds_put_format(&match, "%s, ",
op->lrp_networks.ipv4_addrs[i].addr_s);
has_drop_ips = true;
}
ds_chomp(&match, ' ');
ds_chomp(&match, ',');
ds_put_cstr(&match, "}");
if (has_drop_ips) {
//到网关router的报文也不处理,之前的ARP和ICMP已经代答
ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 60,
ds_cstr(&match), "drop;");
}
free(snat_ips);
}
//处理NAT、分片和LB的操作
HMAP_FOR_EACH (od, key_node, datapaths) {
//ingress的表2-4,egress的表0-3的默认流表0优先级是通过
ovn_lflow_add(lflows, od, S_ROUTER_IN_DEFRAG, 0, "1", "next;");
ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 0, "1", "next;");
ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 0, "1", "next;");
ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 0, "1", "next;");
ovn_lflow_add(lflows, od, S_ROUTER_OUT_UNDNAT, 0, "1", "next;");
ovn_lflow_add(lflows, od, S_ROUTER_OUT_EGR_LOOP, 0, "1", "next;");
if (!smap_get(&od->nbr->options, "chassis") && !od->l3dgw_port
&& !smap_get(&od->nbr->options, "underlay-gateway")) {
continue;
}
ovs_be32 snat_ip;
const char *dnat_force_snat_ip = get_force_snat_ip(od, "dnat",
&snat_ip);
const char *lb_force_snat_ip = get_force_snat_ip(od, "lb",
&snat_ip);
for (int i = 0; i < od->nbr->n_nat; i++) {
const struct nbrec_nat *nat;
nat = od->nbr->nat[i];
ovs_be32 ip, mask;
char *error = ip_parse_masked(nat->external_ip, &ip, &mask);
if (error || mask != OVS_BE32_MAX) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
VLOG_WARN_RL(&rl, "bad external ip %s for nat",
nat->external_ip);
free(error);
continue;
}
//检测NAT的logical_ip是否合理,SNAT的时候,logical_ip可以是一个子网
error = ip_parse_masked(nat->logical_ip, &ip, &mask);
if (!strcmp(nat->type, "snat")) {
if (error) {
static struct vlog_rate_limit rl =
VLOG_RATE_LIMIT_INIT(5, 1);
VLOG_WARN_RL(&rl, "bad ip network or ip %s for snat "
"in router "UUID_FMT"",
nat->logical_ip, UUID_ARGS(&od->key));
free(error);
continue;
}
} else {
if (error || mask != OVS_BE32_MAX) {
static struct vlog_rate_limit rl =
VLOG_RATE_LIMIT_INIT(5, 1);
VLOG_WARN_RL(&rl, "bad ip %s for dnat in router "
""UUID_FMT"", nat->logical_ip, UUID_ARGS(&od->key));
free(error);
continue;
}
}
//分布式路由NAT,确定NAT规则是否满足分布式NAT的操作
bool distributed = false;
struct eth_addr mac;
if (od->l3dgw_port && !strcmp(nat->type, "dnat_and_snat") &&
nat->logical_port && nat->external_mac) {
if (eth_addr_from_string(nat->external_mac, &mac)) {
distributed = true;
} else {
static struct vlog_rate_limit rl =
VLOG_RATE_LIMIT_INIT(5, 1);
VLOG_WARN_RL(&rl, "bad mac %s for dnat in router "
""UUID_FMT"", nat->external_mac, UUID_ARGS(&od->key));
continue;
}
}
//ingress 表3 ,必须是egress的SNAT已经创建了连接以及反向连接。
if (!strcmp(nat->type, "snat")
|| !strcmp(nat->type, "dnat_and_snat")) {
if (!od->l3dgw_port) {
//网关路由
ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s",
nat->external_ip);
ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 90,
ds_cstr(&match), "ct_snat; next;");
} else {
//分布式路由,网关进来的报文进行unnat
ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s"
" && inport == %s",
nat->external_ip,
od->l3dgw_port->json_key);
if (!distributed && od->l3redirect_port) {
ds_put_format(&match, " && is_chassis_resident(%s)",
od->l3redirect_port->json_key);
}
ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 100,
ds_cstr(&match), "ct_snat;");
//貌似是没有匹配上面的流表,表示不是网关端口来的
//是从其他路由端口来的,需要重定向到l3dgw_port操作NAT
ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s",
nat->external_ip);
ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 50,
ds_cstr(&match),
REGBIT_NAT_REDIRECT" = 1; next;");
}
}
//ingress 表4 DNAT
int dnat = 0, pro = 100;
if (!strcmp(nat->type, "dnat")
|| !strcmp(nat->type, "dnat_and_snat")) {
if (!od->l3dgw_port) {
//网关路由
ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s",
nat->external_ip);
if(!strcmp(nat->type, "dnat")){
if(nat->eport && nat->protocol){
if(!strcmp(nat->protocol, "tcp")) {
ds_put_format(&match, " && tcp && tcp.dst == %d", (uint16_t)nat->eport);
dnat = 1;
} else if (!strcmp(nat->protocol, "udp")) {
ds_put_format(&match, " && udp && udp.dst == %d", (uint16_t)nat->eport);
dnat = 1;
} else { continue; }
} else { ;}
}
ds_clear(&actions);
if (dnat_force_snat_ip) {
//该标志表示egress的SNAT需要执行
ds_put_format(&actions,
"flags.force_snat_for_dnat = 1; ");
}
if(dnat) {
pro = 90;
ds_put_format(&actions, "flags.loopback = 1; ct_dnat(%s:%d);", nat->logical_ip, (int)(nat->lport?:nat->eport));
} else {
ds_put_format(&actions, "flags.loopback = 1; ct_dnat(%s);", nat->logical_ip);
}
ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, pro,
ds_cstr(&match), ds_cstr(&actions));
} else {
//分布式路由,入端口是l3dgw_port
ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s"
" && inport == %s",
nat->external_ip,
od->l3dgw_port->json_key);
if (!distributed && od->l3redirect_port) {
ds_put_format(&match, " && is_chassis_resident(%s)",
od->l3redirect_port->json_key);
}
ds_clear(&actions);
ds_put_format(&actions, "ct_dnat(%s);",
nat->logical_ip);
ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 100,
ds_cstr(&match), ds_cstr(&actions));
ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s",
nat->external_ip);
ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 50,
ds_cstr(&match),
REGBIT_NAT_REDIRECT" = 1; next;");
}
}
//egress 表0 UNDNAT,必须是ingress的DNAT连接已经创建的情况下
if (od->l3dgw_port && (!strcmp(nat->type, "dnat")
|| !strcmp(nat->type, "dnat_and_snat"))) {
ds_clear(&match);
ds_put_format(&match, "ip && ip4.src == %s"
" && outport == %s",
nat->logical_ip,
od->l3dgw_port->json_key);
if (!distributed && od->l3redirect_port) {
ds_put_format(&match, " && is_chassis_resident(%s)",
od->l3redirect_port->json_key);
}
ds_clear(&actions);
if (distributed) {
ds_put_format(&actions, "eth.src = "ETH_ADDR_FMT"; ",
ETH_ADDR_ARGS(mac));
}
ds_put_format(&actions, "ct_dnat;");
ovn_lflow_add(lflows, od, S_ROUTER_OUT_UNDNAT, 100,
ds_cstr(&match), ds_cstr(&actions));
}
//egress 表1 SNAT
if (!strcmp(nat->type, "snat")
|| !strcmp(nat->type, "dnat_and_snat")) {
if (!od->l3dgw_port) {
//网关路由
ds_clear(&match);
ds_put_format(&match, "ip && ip4.src == %s",
nat->logical_ip);
ds_clear(&actions);
ds_put_format(&actions, "ct_snat(%s);", nat->external_ip);
//掩码越长,优先级越高
ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT,
count_1bits(ntohl(mask)) + 1,
ds_cstr(&match), ds_cstr(&actions));
} else {
//分布式路由
ds_clear(&match);
ds_put_format(&match, "ip && ip4.src == %s"
" && outport == %s",
nat->logical_ip,
od->l3dgw_port->json_key);
if (!distributed && od->l3redirect_port) {
ds_put_format(&match, " && is_chassis_resident(%s)",
od->l3redirect_port->json_key);
}
ds_clear(&actions);
if (distributed) {
ds_put_format(&actions, "eth.src = "ETH_ADDR_FMT"; ",
ETH_ADDR_ARGS(mac));
}
ds_put_format(&actions, "ct_snat(%s);", nat->external_ip);
//同样是掩码越长,优先级越高
ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT,
count_1bits(ntohl(mask)) + 1,
ds_cstr(&match), ds_cstr(&actions));
}
}
//ingress 表0,50优先级允许支持NAT的报文通过
//入口是l3dgw_port,并且目的MAC是nat->external_mac的时候通过
if (distributed) {
ds_clear(&match);
ds_put_format(&match,
"eth.dst == "ETH_ADDR_FMT" && inport == %s"
" && is_chassis_resident(\"%s\")",
ETH_ADDR_ARGS(mac),
od->l3dgw_port->json_key,
nat->logical_port);
ovn_lflow_add(lflows, od, S_ROUTER_IN_ADMISSION, 50,
ds_cstr(&match), "next;");
}
//ingress 表7,100优先级允许NAT报文通过
if (distributed) {
ds_clear(&match);
ds_put_format(&match, "ip4.src == %s && outport == %s",
nat->logical_ip,
od->l3dgw_port->json_key);
ovn_lflow_add(lflows, od, S_ROUTER_IN_GW_REDIRECT, 100,
ds_cstr(&match), "next;");
}
//egress 表2,100优先级如目的IP和EIP相等的话,需要报文clone回注到egress的表0
if (od->l3dgw_port) {
//分布式路由
ds_clear(&match);
ds_put_format(&match, "ip4.dst == %s && outport == %s",
nat->external_ip,
od->l3dgw_port->json_key);
ds_clear(&actions);
ds_put_format(&actions,
"clone { ct_clear; "
"inport = outport; outport = \"\"; "
"flags = 0; flags.loopback = 1; ");
for (int i = 0; i < MFF_N_LOG_REGS; i++) {
ds_put_format(&actions, "reg%d = 0; ", i);
}
ds_put_format(&actions, REGBIT_EGRESS_LOOPBACK" = 1; "
"next(pipeline=ingress, table=0); };");
ovn_lflow_add(lflows, od, S_ROUTER_OUT_EGR_LOOP, 100,
ds_cstr(&match), ds_cstr(&actions));
}
}
//ingress 表3,110优先级强制SNAT之前DNAT过的报文,网关路由
if (dnat_force_snat_ip && !od->l3dgw_port) {
//如果报文的目的IP是网关路由的IP,那么进行UNSNAT
ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s", dnat_force_snat_ip);
ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 110,
ds_cstr(&match), "ct_snat; next;");
//已经DNAT过得报文走这一条,但是SNAT
ds_clear(&match);
ds_put_format(&match, "flags.force_snat_for_dnat == 1 && ip");
ds_clear(&actions);
ds_put_format(&actions, "ct_snat(%s);", dnat_force_snat_ip);
ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 100,
ds_cstr(&match), ds_cstr(&actions));
}
if (lb_force_snat_ip && !od->l3dgw_port) {
//同上,报文的目的IP是网关路由IP,进行UNSNAT
ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s", lb_force_snat_ip);
ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 100,
ds_cstr(&match), "ct_snat; next;");
//有force_snat_for_lb标志的进行SNAT
ds_clear(&match);
ds_put_format(&match, "flags.force_snat_for_lb == 1 && ip");
ds_clear(&actions);
ds_put_format(&actions, "ct_snat(%s);", lb_force_snat_ip);
ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 100,
ds_cstr(&match), ds_cstr(&actions));
}
if (!od->l3dgw_port) {
//网关路由,重新循环每个报文到DNAT区域
//需要UNDNAT的报文都会进行UNDNAT,理想情况下,可以在egress完成
//但是由于网关路由器没有任何关于将源IP地址作为IP路由的EIP的特性
//所以我们可以在此处进行操作,从而省去了以后的一次重新循环
//任何通过SNAT区域的报文都会自动重新循环
//来得到openflow pipeline的路由需要的新目的IP
ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 50,
"ip4", "flags.loopback = 1; ct_dnat;");
} else {
//分布式路由的NAT,向ingress中没有NAT规则的IP路由表、ARP处理表、网关重定向添加流表
//ingress的表5 300优先级添加流表
ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 300,
REGBIT_NAT_REDIRECT" == 1", "ip.ttl--; next;");
//ingress的表5 200优先级添加流表,修改目的MAC位分布式网关端口的地址
ds_clear(&actions);
ds_put_format(&actions, "eth.dst = %s; next;",
od->l3dgw_port->lrp_networks.ea_s);
ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_RESOLVE, 200,
REGBIT_NAT_REDIRECT" == 1", ds_cstr(&actions));
//ingress的表7 200优先级添加流表
ds_clear(&actions);
ds_put_format(&actions, "outport = %s; next;",
od->l3redirect_port->json_key);
ovn_lflow_add(lflows, od, S_ROUTER_IN_GW_REDIRECT, 200,
REGBIT_NAT_REDIRECT" == 1", ds_cstr(&actions));
}
//用一个集合(all_ips)保存所有需要重组和conntrack跟踪的IP
struct sset all_ips = SSET_INITIALIZER(&all_ips);
for (int i = 0; i < od->nbr->n_load_balancer; i++) {
struct nbrec_load_balancer *lb = od->nbr->load_balancer[i];
struct smap *vips = &lb->vips;
struct smap_node *node;
SMAP_FOR_EACH (node, vips) {
uint16_t port = 0;
char *ip_address = NULL;
ip_address_and_port_from_lb_key(node->key, &ip_address, &port);
if (!ip_address) {
continue;
}
if (!sset_contains(&all_ips, ip_address)) {
sset_add(&all_ips, ip_address);
}
//在ingress 表4 DNAT中为LB添加更高优先级的规则:带L4端口的用120优先级,不带端口的用110优先级
//每个匹配项,我们通过add_router_lb_flow()添加两条流表
//一条流表为ct.new添加操作ct_lb($targets)
//另一条流表为ct.est添加操作ct_dnat;
ds_clear(&actions);
ds_put_format(&actions, "ct_lb(%s);", node->value);
ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s",
ip_address);
free(ip_address);
if (port) {
if (lb->protocol && !strcmp(lb->protocol, "udp")) {
ds_put_format(&match, " && udp && udp.dst == %d",
port);
} else {
ds_put_format(&match, " && tcp && tcp.dst == %d",
port);
}
add_router_lb_flow(lflows, od, &match, &actions, 120,
lb_force_snat_ip);
} else {
add_router_lb_flow(lflows, od, &match, &actions, 110,
lb_force_snat_ip);
}
}
}
//如果有LB规则,我们应该将报文发送到conntrack中进行重组和跟踪,这样有两个好处:
//1. 有了conntrack跟踪,我们可以只让新连接从组中挑选一个DNAT IP
//2. 如果LB规则中有L4的端口,我们需要先重组报文才能匹配L4端口
const char *ip_address;
SSET_FOR_EACH(ip_address, &all_ips) {
ds_clear(&match);
ds_put_format(&match, "ip && ip4.dst == %s", ip_address);
ovn_lflow_add(lflows, od, S_ROUTER_IN_DEFRAG,
100, ds_cstr(&match), "ct_next;");
}
sset_destroy(&all_ips);
}
//ingress 表5 该表设置出口为正确的出口,并且源MAC修改为出口的MAC
//下一跳的IP存入reg0,并且由下一张表处理ARP resolution
HMAP_FOR_EACH (op, key_node, ports) {
for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
add_route(lflows, op, op->lrp_networks.ipv4_addrs[i].addr_s,
op->lrp_networks.ipv4_addrs[i].network_s,
op->lrp_networks.ipv4_addrs[i].plen, NULL, NULL);
}
for (int i = 0; i < op->lrp_networks.n_ipv6_addrs; i++) {
add_route(lflows, op, op->lrp_networks.ipv6_addrs[i].addr_s,
op->lrp_networks.ipv6_addrs[i].network_s,
op->lrp_networks.ipv6_addrs[i].plen, NULL, NULL);
}
}
//将静态路由转换为流表
HMAP_FOR_EACH (od, key_node, datapaths) {
for (int i = 0; i < od->nbr->n_static_routes; i++) {
const struct nbrec_logical_router_static_route *route;
route = od->nbr->static_routes[i];
build_static_route_flow(lflows, od, ports, route);
}
}
//ingress 表6
//下一跳的IP地址存储在reg0,该表根据出口和reg0中的IP解析出对应的MAC地址,并设置为报文的目的MAC(eth.dst)
HMAP_FOR_EACH (op, key_node, ports) {
if (op->nbrp) {
//当前状况是逻辑路由端口,当下一跳IP(存于reg0中)匹配该路由端口时,
//流表出口设置为这个逻辑端口,并且将该口的MAC设置为报文的目的MAC
//报文依然在对端的逻辑pipeline中,所以应当匹配对端的出口
if (op->peer && op->nbrp->peer) {
if (op->lrp_networks.n_ipv4_addrs) {
ds_clear(&match);
ds_put_format(&match, "outport == %s && reg0 == ",
op->peer->json_key);
op_put_v4_networks(&match, op, false);
ds_clear(&actions);
ds_put_format(&actions, "eth.dst = %s; next;",
op->lrp_networks.ea_s);
ovn_lflow_add(lflows, op->peer->od, S_ROUTER_IN_ARP_RESOLVE,
100, ds_cstr(&match), ds_cstr(&actions));
}
}
} else if (op->od->n_router_ports && strcmp(op->nbsp->type, "router")) {
//当前状况是逻辑交换的端口,连接了一个VM或者容器
//解析里面的地址,为每个地址遍历连接到逻辑交换的所有路由端口
//如果地址从路由端口可达,则向该路由的pipeline添加ARP表项
//注:本摘录中下文使用的peer未见定义,应为按peer_name通过ovn_port_find()查到的路由端口(待与源码确认)
for (size_t i = 0; i < op->n_lsp_addrs; i++) {
const char *ea_s = op->lsp_addrs[i].ea_s;
for (size_t j = 0; j < op->lsp_addrs[i].n_ipv4_addrs; j++) {
const char *ip_s = op->lsp_addrs[i].ipv4_addrs[j].addr_s;
for (size_t k = 0; k < op->od->n_router_ports; k++) {
const char *peer_name = smap_get(
&op->od->router_ports[k]->nbsp->options,
"router-port");
ds_clear(&match);
ds_put_format(&match, "outport == %s && reg0 == %s",
peer->json_key, ip_s);
ds_clear(&actions);
ds_put_format(&actions, "eth.dst = %s; next;", ea_s);
ovn_lflow_add(lflows, peer->od,
S_ROUTER_IN_ARP_RESOLVE, 100,
ds_cstr(&match), ds_cstr(&actions));
}
}
for (size_t j = 0; j < op->lsp_addrs[i].n_ipv6_addrs; j++) {
const char *ip_s = op->lsp_addrs[i].ipv6_addrs[j].addr_s;
for (size_t k = 0; k < op->od->n_router_ports; k++) {
const char *peer_name = smap_get(
&op->od->router_ports[k]->nbsp->options,
"router-port");
ds_clear(&match);
ds_put_format(&match, "outport == %s && xxreg0 == %s",
peer->json_key, ip_s);
ds_clear(&actions);
ds_put_format(&actions, "eth.dst = %s; next;", ea_s);
ovn_lflow_add(lflows, peer->od,
S_ROUTER_IN_ARP_RESOLVE, 100,
ds_cstr(&match), ds_cstr(&actions));
}
}
}
} else if (!strcmp(op->nbsp->type, "router")) {
//当前是连接路由的逻辑交换端口
//该交换端口的对端是路由端口
//我们需要添加逻辑流表,能够对所有连接到该交换机的路由端口添加ARP信息
const char *peer_name = smap_get(&op->nbsp->options,
"router-port");
struct ovn_port *peer = ovn_port_find(ports, peer_name);
for (size_t i = 0; i < op->od->n_router_ports; i++) {
const char *router_port_name = smap_get(
&op->od->router_ports[i]->nbsp->options,
"router-port");
struct ovn_port *router_port = ovn_port_find(ports,
router_port_name);
if (router_port->lrp_networks.n_ipv4_addrs) {
ds_clear(&match);
ds_put_format(&match, "outport == %s && reg0 == ",
peer->json_key);
op_put_v4_networks(&match, router_port, false);
ds_clear(&actions);
ds_put_format(&actions, "eth.dst = %s; next;",
router_port->lrp_networks.ea_s);
ovn_lflow_add(lflows, peer->od, S_ROUTER_IN_ARP_RESOLVE,
100, ds_cstr(&match), ds_cstr(&actions));
}
if (router_port->lrp_networks.n_ipv6_addrs) {
ds_clear(&match);
ds_put_format(&match, "outport == %s && xxreg0 == ",
peer->json_key);
op_put_v6_networks(&match, router_port);
ds_clear(&actions);
ds_put_format(&actions, "eth.dst = %s; next;",
router_port->lrp_networks.ea_s);
ovn_lflow_add(lflows, peer->od, S_ROUTER_IN_ARP_RESOLVE,
100, ds_cstr(&match), ds_cstr(&actions));
}
}
}
}
HMAP_FOR_EACH (od, key_node, datapaths) {
ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_RESOLVE, 0, "ip4",
"get_arp(outport, reg0); next;");
}
//ingress 表7 分布式路由中,出口和l3dgw_port相等时
//这个表会将其中一部分流量重定向到l3redirect_port
HMAP_FOR_EACH (od, key_node, datapaths) {
if (od->l3dgw_port && od->l3redirect_port) {
//当流量的出口等于l3dgw_port,如果报文不匹配更高级的重定向规则
//那么流量将重定向到l3dgw_port的中心实例
ds_clear(&match);
ds_put_format(&match, "outport == %s",
od->l3dgw_port->json_key);
ds_clear(&actions);
ds_put_format(&actions, "outport = %s; next;",
od->l3redirect_port->json_key);
ovn_lflow_add(lflows, od, S_ROUTER_IN_GW_REDIRECT, 50,
ds_cstr(&match), ds_cstr(&actions));
//如果目的MAC尚未解析(eth.dst为全0),则重定向到l3dgw_port的中心实例
//这些流量在重定向到中心实例之前,会在ingress的ARP request表中被替换为ARP request
ds_put_format(&match, " && eth.dst == 00:00:00:00:00:00");
ovn_lflow_add(lflows, od, S_ROUTER_IN_GW_REDIRECT, 150,
ds_cstr(&match), ds_cstr(&actions));
}
//0优先级默认是转发
ovn_lflow_add(lflows, od, S_ROUTER_IN_GW_REDIRECT, 0, "1", "next;");
}
//ingress 表8 100优先级用来发送ARP Request
//主要是目的MAC修改为广播、源IP从reg1取,目标IP从reg0取、操作为request,最后发送
//0优先级表示目的MAC地址已经解析,直接发送
HMAP_FOR_EACH (od, key_node, datapaths) {
ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_REQUEST, 100,
"eth.dst == 00:00:00:00:00:00",
"arp { "
"eth.dst = ff:ff:ff:ff:ff:ff; "
"arp.spa = reg1; "
"arp.tpa = reg0; "
"arp.op = 1; " /* ARP request */
"output; "
"};");
ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_REQUEST, 0, "1", "output;");
}
//egress 表3 100优先级直接匹配逻辑出口,执行发送
HMAP_FOR_EACH (op, key_node, ports) {
ds_clear(&match);
ds_put_format(&match, "outport == %s", op->json_key);
ovn_lflow_add(lflows, op->od, S_ROUTER_OUT_DELIVERY, 100,
ds_cstr(&match), "output;");
}
}