Previously we went over the rough correspondence between the OpenFlow tables and the OVN logical flow tables; today we look at how that mapping is implemented, walking through the (abridged) source of ovn-controller, starting with its main loop.

int
main(int argc, char *argv[])
{
    //Initialize the group ID allocator used by load balancers
    struct group_table group_table;
    group_table.group_ids = bitmap_allocate(MAX_OVN_GROUPS);
    bitmap_set1(group_table.group_ids, 0); /* Group id 0 is invalid. */
    hmap_init(&group_table.desired_groups);
    hmap_init(&group_table.existing_groups);

    //Connect to the local OVS ovsdb instance. We do not monitor all tables,
    //so each module must register the subset it is interested in
    struct ovsdb_idl_loop ovs_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
        ovsdb_idl_create(ovs_remote, &ovsrec_idl_class, false, true));
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_open_vswitch);
    ovsdb_idl_add_column(ovs_idl_loop.idl,
                         &ovsrec_open_vswitch_col_external_ids);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_open_vswitch_col_bridges);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_interface);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_type);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_options);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_ofport);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_port);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_interfaces);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_external_ids);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_bridge);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_ports);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_fail_mode);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_other_config);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_external_ids);
    //chassis_register_ovs_idl
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_open_vswitch);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_open_vswitch_col_external_ids);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_open_vswitch_col_iface_types);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_bridge);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_datapath_type);
    //encaps_register_ovs_idl
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_bridge);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_ports);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_port);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_interfaces);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_external_ids);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_interface);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_type);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_options);
    //binding_register_ovs_idl
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_open_vswitch);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_open_vswitch_col_bridges);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_bridge);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_ports);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_port);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_interfaces);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_qos);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_interface);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_external_ids);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_status);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_qos);
    //physical_register_ovs_idl
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_bridge);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_ports);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_port);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_interfaces);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_external_ids);
    ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_interface);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_name);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_ofport);
    ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_external_ids);
    //Block until we are connected to the OVS ovsdb and have fetched the
    //initial snapshot of the tables/columns registered above
    ovsdb_idl_get_initial_snapshot(ovs_idl_loop.idl);

    //Connect to the OVN southbound DB on the control node; monitor all table
    //contents except nb_cfg, for which change alerts are suppressed below.
    //Block until connected and the initial contents have been fetched.
    char *ovnsb_remote = get_ovnsb_remote(ovs_idl_loop.idl);
    struct ovsdb_idl_loop ovnsb_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
        ovsdb_idl_create(ovnsb_remote, &sbrec_idl_class, true, true));
    ovsdb_idl_omit_alert(ovnsb_idl_loop.idl, &sbrec_chassis_col_nb_cfg);
    update_sb_monitors(ovnsb_idl_loop.idl, NULL, NULL, NULL);
    ovsdb_idl_get_initial_snapshot(ovnsb_idl_loop.idl);

    //Initialize the conntrack zones
    struct simap ct_zones = SIMAP_INITIALIZER(&ct_zones);
    struct shash pending_ct_zones = SHASH_INITIALIZER(&pending_ct_zones);
    unsigned long ct_zone_bitmap[BITMAP_N_LONGS(MAX_CT_ZONES)];
    memset(ct_zone_bitmap, 0, sizeof ct_zone_bitmap);
    //Zone 0 is reserved by default
    bitmap_set1(ct_zone_bitmap, 0);
    restore_ct_zones(ovs_idl_loop.idl, &ct_zones, ct_zone_bitmap);
    unixctl_command_register("ct-zone-list", "", 0, 0,
                             ct_zone_list, &ct_zones);

    struct pending_pkt pending_pkt = { .conn = NULL };
    unixctl_command_register("inject-pkt", "MICROFLOW", 1, 1, inject_pkt,
                             &pending_pkt);
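    //Usage is roughly (from memory, verify on your system):
    //  ovs-appctl -t ovn-controller inject-pkt
    //    'inport=="lp1" && eth.src==00:00:00:00:00:01 && ...'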

    while (!exiting) {
        //Check whether the southbound DB remote has changed
        char *new_ovnsb_remote = get_ovnsb_remote(ovs_idl_loop.idl);
        if (strcmp(ovnsb_remote, new_ovnsb_remote)) {
            free(ovnsb_remote);
            ovnsb_remote = new_ovnsb_remote;
            ovsdb_idl_set_remote(ovnsb_idl_loop.idl, ovnsb_remote, true);
        } else {
            free(new_ovnsb_remote);
        }

        struct controller_ctx ctx = {
            .ovs_idl = ovs_idl_loop.idl,
            .ovs_idl_txn = ovsdb_idl_loop_run(&ovs_idl_loop),
            .ovnsb_idl = ovnsb_idl_loop.idl,
            .ovnsb_idl_txn = ovsdb_idl_loop_run(&ovnsb_idl_loop),
        };

        //Set the southbound DB probe interval (5 s by default)
        update_probe_interval(&ctx);

        //Holds 'struct local_datapath' nodes
        struct hmap local_datapaths = HMAP_INITIALIZER(&local_datapaths);

        //Holds the names of all logical ports resident on this chassis:
        //VM VIFs, L2 gateway ports assigned via l2gateway-chassis,
        //and localnet ports
        struct sset local_lports = SSET_INITIALIZER(&local_lports);

        //Get the integration bridge, creating it if absent; its name is not
        //necessarily br-int
        const struct ovsrec_bridge *br_int = get_br_int(&ctx);
        //Get the local OVS system-id
        const char *chassis_id = get_chassis_id(ctx.ovs_idl);

        struct ldatapath_index ldatapaths;
        struct lport_index lports;
        struct mcgroup_index mcgroups;
        ldatapath_index_init(&ldatapaths, ctx.ovnsb_idl);
        lport_index_init(&lports, ctx.ovnsb_idl);
        mcgroup_index_init(&mcgroups, ctx.ovnsb_idl);

        const struct sbrec_chassis *chassis = NULL;
        if (chassis_id) {
            //Using the local system-id, collect the local encap type and IP,
            //hostname, datapath type and interface types, and publish them to
            //the OVN Chassis record (inspect with ovn-sbctl list Chassis)
            chassis = chassis_run(&ctx, chassis_id, br_int);
            //Work out from the OVN configuration which tunnels to other
            //chassis are needed, and create them. "Creating" a tunnel simply
            //means inserting a port record into the OVS database.
            encaps_run(&ctx, br_int, chassis_id);
            //First gather interface data: local ports carry iface-id in
            //external-ids, egress ports carry remote_ip in options
            //(inspect with ovs-vsctl list interface). Match each iface-id
            //against logical_port in the southbound Port_Binding table, and
            //set up QoS on egress ports: OVN implements VM egress QoS via
            //queues configured on the egress interface.
            binding_run(&ctx, br_int, chassis, &ldatapaths, &lports,
                        &local_datapaths, &local_lports);
        }

        if (br_int && chassis) {
            struct shash addr_sets = SHASH_INITIALIZER(&addr_sets);
            addr_sets_init(&ctx, &addr_sets);

            //I found no documentation for this. As far as I know OVS 2.6
            //still used patch ports; from OVS 2.7 on there are none, and the
            //handoff is done by a flow resubmit that restarts at table 16.
            patch_run(&ctx, br_int, chassis, &local_datapaths);

            enum mf_field_id mff_ovn_geneve = ofctrl_run(br_int,
                                                         &pending_ct_zones);

            pinctrl_run(&ctx, &lports, br_int, chassis, &local_datapaths);
            update_ct_zones(&local_lports, &local_datapaths, &ct_zones,
                            ct_zone_bitmap, &pending_ct_zones);
            if (ctx.ovs_idl_txn) {

                //Commit the ct_zone assignments (inspect with
                //ovs-vsctl list Bridge)
                commit_ct_zones(br_int, &pending_ct_zones);

                struct hmap flow_table = HMAP_INITIALIZER(&flow_table);
                //This is where the logical flows are translated; details below
                lflow_run(&ctx, chassis, &lports, &mcgroups,
                          &local_datapaths, &group_table, &ct_zones,
                          &addr_sets, &flow_table);

                //Flows tied to the physical side rather than to logical
                //flows; also examined in detail below
                physical_run(&ctx, mff_ovn_geneve,
                             br_int, chassis, &ct_zones, &lports,
                             &flow_table, &local_datapaths);
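                //(The full source then hands flow_table to ofctrl_put() to
                //install the computed flows into br-int; elided here.)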

            }
        }
    }
}

lflow_run does two main jobs: translating logical flows, via add_logical_flows-->consider_logical_flow, and installing the neighbor-subsystem flows, via add_neighbor_flows-->consider_neighbor_flow.

  • Logical flow translation: ingress tables 0-15 map to OpenFlow tables 16-31, and egress tables 0-15 map to OpenFlow tables 48-63
  • The neighbor-subsystem flows go to OpenFlow table 66 (see the table constants listed below)
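
For reference, these are the OpenFlow table constants the rest of this post keeps referring to, as defined in this OVN version's ovn/controller/lflow.h (reproduced here for convenience; verify against your own tree if the numbers have moved):

#define OFTABLE_PHY_TO_LOG            0  /* physical-to-logical mapping */
#define OFTABLE_LOG_INGRESS_PIPELINE 16  /* ingress pipeline: tables 16-31 */
#define OFTABLE_REMOTE_OUTPUT        32  /* output via tunnel to remote chassis */
#define OFTABLE_LOCAL_OUTPUT         33  /* output to local ports */
#define OFTABLE_CHECK_LOOPBACK       34  /* drop if inport == outport */
#define OFTABLE_LOG_EGRESS_PIPELINE  48  /* egress pipeline: tables 48-63 */
#define OFTABLE_SAVE_INPORT          64
#define OFTABLE_LOG_TO_PHY           65  /* logical-to-physical mapping */
#define OFTABLE_MAC_BINDING          66  /* neighbor (MAC_Binding) lookups */
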
static void
consider_logical_flow(const struct lport_index *lports,
                      const struct mcgroup_index *mcgroups,
                      const struct sbrec_logical_flow *lflow,
                      const struct hmap *local_datapaths,
                      struct group_table *group_table,
                      const struct simap *ct_zones,
                      const struct sbrec_chassis *chassis,
                      struct hmap *dhcp_opts,
                      struct hmap *dhcpv6_opts,
                      uint32_t *conj_id_ofs,
                      const struct shash *addr_sets,
                      struct hmap *flow_table)
{
    //Determine whether the logical flow is in the ingress or egress pipeline
    bool ingress = !strcmp(lflow->pipeline, "ingress");

    const struct sbrec_datapath_binding *ldp = lflow->logical_datapath;
    if (!get_local_datapath(local_datapaths, ldp->tunnel_key)) {
        return;
    }

    //Map logical table IDs to OpenFlow table IDs: ingress starts at
    //table 16, egress at table 48
    uint8_t first_ptable = (ingress
                            ? OFTABLE_LOG_INGRESS_PIPELINE
                            : OFTABLE_LOG_EGRESS_PIPELINE);
    uint8_t ptable = first_ptable + lflow->table_id;
    //Table used once the packet is output: OFTABLE_REMOTE_OUTPUT (32) for
    //ingress, OFTABLE_SAVE_INPORT (64) for egress
    uint8_t output_ptable = (ingress
                             ? OFTABLE_REMOTE_OUTPUT
                             : OFTABLE_SAVE_INPORT);

    //From here on, parse the logical flow's actions; they are encoded as
    //OpenFlow actions further down
    uint64_t ovnacts_stub[1024 / 8];
    struct ofpbuf ovnacts = OFPBUF_STUB_INITIALIZER(ovnacts_stub);
    struct ovnact_parse_params pp = {
        .symtab = &symtab,
        .dhcp_opts = dhcp_opts,
        .dhcpv6_opts = dhcpv6_opts,

        .pipeline = ingress ? OVNACT_P_INGRESS : OVNACT_P_EGRESS,
        .n_tables = LOG_PIPELINE_LEN,
        .cur_ltable = lflow->table_id,
    };
    struct expr *prereqs;
    char *error;

    //A newer addition I haven't fully worked out; it is said to substitute
    //chassis-local info into lflow->actions
    const char *new_actions = (const char *)replace_local_info(lflow->actions, chassis);

    //The detailed action parsing is skipped here; see the function parse_action
    error = ovnacts_parse_string(new_actions, &pp, &ovnacts, &prereqs);
    if (error) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
        VLOG_WARN_RL(&rl, "error parsing actions \"%s\": %s",
                     lflow->actions, error);
        free(error);
        ovnacts_free(ovnacts.data, ovnacts.size);
        ofpbuf_uninit(&ovnacts);
        return;
    }

    //Encode the parsed actions into OpenFlow actions
    uint64_t ofpacts_stub[1024 / 8];
    struct ofpbuf ofpacts = OFPBUF_STUB_INITIALIZER(ofpacts_stub);
    struct lookup_port_aux aux = {
        .lports = lports,
        .mcgroups = mcgroups,
        .dp = lflow->logical_datapath
    };
    struct ovnact_encode_params ep = {
        .lookup_port = lookup_port_cb,
        .aux = &aux,
        .is_switch = is_switch(ldp),
        .is_gateway_router = is_gateway_router(ldp, local_datapaths),
        .ct_zones = ct_zones,
        .group_table = group_table,

        .pipeline = ingress ? OVNACT_P_INGRESS : OVNACT_P_EGRESS,
        .ingress_ptable = OFTABLE_LOG_INGRESS_PIPELINE,
        .egress_ptable = OFTABLE_LOG_EGRESS_PIPELINE,
        .output_ptable = output_ptable,
        .mac_bind_ptable = OFTABLE_MAC_BINDING,
    };
    //Each action type has its own pair of parse/encode handlers; not
    //detailed here
    ovnacts_encode(ovnacts.data, ovnacts.size, &ep, &ofpacts);
    ovnacts_free(ovnacts.data, ovnacts.size);
    ofpbuf_uninit(&ovnacts);

    //Translate the OVN match into OpenFlow matches
    struct hmap matches;
    struct expr *expr;

    //Same substitution as above, this time applied to the match string
    const char *new_match = (const char *)replace_local_info(lflow->match, chassis);

    //Parse the match string into an expression tree; tokens such as &&
    //become lexer symbols like LEX_T_LOG_AND
    expr = expr_parse_string(new_match, &symtab, addr_sets, &error);
    if (!error) {
        if (prereqs) {
            expr = expr_combine(EXPR_T_AND, expr, prereqs);
            prereqs = NULL;
        }
        expr = expr_annotate(expr, &symtab, &error);
    }
    if (error) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
        VLOG_WARN_RL(&rl, "error parsing match \"%s\": %s",
                     lflow->match, error);
        expr_destroy(prereqs);
        ofpbuf_uninit(&ofpacts);
        free(error);
        return;
    }

    struct condition_aux cond_aux = { lports, chassis };
    expr = expr_simplify(expr, is_chassis_resident_cb, &cond_aux);
    expr = expr_normalize(expr);
    //Expand the expression into the matches hmap
    uint32_t n_conjs = expr_to_matches(expr, lookup_port_cb, &aux,
                                       &matches);
    expr_destroy(expr);

    struct expr_match *m;
    HMAP_FOR_EACH (m, hmap_node, &matches) {
        match_set_metadata(&m->match,
                           htonll(lflow->logical_datapath->tunnel_key));
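        //Offset conjunction IDs so they stay unique across logical flows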
        if (m->match.wc.masks.conj_id) {
            m->match.flow.conj_id += *conj_id_ofs;
        }
        if (!m->n) {
            //Emit the OpenFlow flow
            ofctrl_add_flow(flow_table, ptable, lflow->priority,
                            lflow->header_.uuid.parts[0], &m->match, &ofpacts);
        } else {
            uint64_t conj_stubs[64 / 8];
            struct ofpbuf conj;

            ofpbuf_use_stub(&conj, conj_stubs, sizeof conj_stubs);
            for (int i = 0; i < m->n; i++) {
                const struct cls_conjunction *src = &m->conjunctions[i];
                struct ofpact_conjunction *dst;

                dst = ofpact_put_CONJUNCTION(&conj);
                dst->id = src->id + *conj_id_ofs;
                dst->clause = src->clause;
                dst->n_clauses = src->n_clauses;
            }
            ofctrl_add_flow(flow_table, ptable, lflow->priority, 0, &m->match,
                            &conj);
            ofpbuf_uninit(&conj);
        }
    }
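    //(The full source then destroys the matches, frees ofpacts, and
    //advances *conj_id_ofs by n_conjs; elided in this excerpt.)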
}
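
To make the translation concrete, here is an illustrative pairing (values invented for the example). A southbound logical flow in ingress table 0 such as

  table=0 (ls_in_port_sec_l2), priority=100, match=(eth.src[40]), action=(drop;)

would land in OpenFlow table 16 of br-int roughly as

  table=16, priority=100, metadata=0x1,dl_src=01:00:00:00:00:00/01:00:00:00:00:00 actions=drop

where metadata carries the logical datapath's tunnel_key and the dl_src mask encodes the eth.src[40] multicast-bit test.
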
static void
consider_neighbor_flow(const struct lport_index *lports,
                       const struct sbrec_mac_binding *b,
                       struct hmap *flow_table)
{
    //Look up the Port_Binding named by the MAC_Binding's logical port
    const struct sbrec_port_binding *pb
        = lport_lookup_by_name(lports, b->logical_port);
    if (!pb) {
        return;
    }

    struct eth_addr mac;
    //Parse the MAC address
    eth_addr_from_string(b->mac, &mac);

    struct match match = MATCH_CATCHALL_INITIALIZER;
    if (strchr(b->ip, '.')) {
        ovs_be32 ip;
        //Parse the IPv4 address
        ip_parse(b->ip, &ip);
        //Match it against reg0
        match_set_reg(&match, 0, ntohl(ip));
    }

    uint64_t stub[1024 / 8];
    struct ofpbuf ofpacts = OFPBUF_STUB_INITIALIZER(stub);
    //Action: rewrite the destination MAC to the bound MAC
    put_load(mac.ea, sizeof mac.ea, MFF_ETH_DST, 0, 48, &ofpacts);
    //Install the flow into table 66 (OFTABLE_MAC_BINDING)
    ofctrl_add_flow(flow_table, OFTABLE_MAC_BINDING, 100, 0, &match, &ofpacts);
    ofpbuf_uninit(&ofpacts);
}
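
Again purely illustratively, a MAC_Binding row with ip="192.168.1.10" and mac="fa:16:3e:aa:bb:cc" yields roughly:

  table=66, priority=100, reg0=0xc0a8010a actions=set_field:fa:16:3e:aa:bb:cc->eth_dst

so that a get_arp() lookup that left a next-hop IP in reg0 resolves it to the bound MAC.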

physical_run

void
physical_run(struct controller_ctx *ctx, enum mf_field_id mff_ovn_geneve,
             const struct ovsrec_bridge *br_int,
             const struct sbrec_chassis *chassis,
             const struct simap *ct_zones, struct lport_index *lports,
             struct hmap *flow_table, struct hmap *local_datapaths)
{

    bool physical_map_changed = false;

    struct simap new_localvif_to_ofport =
        SIMAP_INITIALIZER(&new_localvif_to_ofport);
    struct simap new_tunnel_to_ofport =
        SIMAP_INITIALIZER(&new_tunnel_to_ofport);
    for (int i = 0; i < br_int->n_ports; i++) {
        const struct ovsrec_port *port_rec = br_int->ports[i];
        //Inspect the corresponding data with ovs-vsctl list port
        const char *chassis_id = smap_get(&port_rec->external_ids,
                                          "ovn-chassis-id");
        const char *localnet = smap_get(&port_rec->external_ids,
                                        "ovn-localnet-port");
        const char *l2gateway = smap_get(&port_rec->external_ids,
                                        "ovn-l2gateway-port");

        for (int j = 0; j < port_rec->n_interfaces; j++) {
            const struct ovsrec_interface *iface_rec = port_rec->interfaces[j];
            //Inspect with ovs-vsctl list interface (the full source first
            //checks n_ofport and the valid port range; elided here)
            int64_t ofport = iface_rec->ofport[0];

            /* Record as patch to local net, logical patch port, chassis, or
             * local logical port. */
            bool is_patch = !strcmp(iface_rec->type, "patch");
            //Both localnet patch ports and L2 gateway patch ports are
            //treated as VIFs
            if (is_patch && localnet) {
                simap_put(&new_localvif_to_ofport, localnet, ofport);
                break;
            } else if (is_patch && l2gateway) {
                simap_put(&new_localvif_to_ofport, l2gateway, ofport);
                break;
            //A chassis_id means this is a tunnel port to another hypervisor
            } else if (chassis_id) {
                enum chassis_tunnel_type tunnel_type;
                if (!strcmp(iface_rec->type, "geneve")) {
                    tunnel_type = GENEVE;
                    if (!mff_ovn_geneve) {
                        continue;
                    }
                } else if (!strcmp(iface_rec->type, "stt")) {
                    tunnel_type = STT;
                } else if (!strcmp(iface_rec->type, "vxlan")) {
                    tunnel_type = VXLAN;
                } else {
                    continue;
                }

                //Look up the tunnel for this chassis_id: update it if it
                //exists, create it otherwise
                simap_put(&new_tunnel_to_ofport, chassis_id, ofport);
                struct chassis_tunnel *tun = chassis_tunnel_find(chassis_id);
                if (tun) {
                    if (tun->ofport != u16_to_ofp(ofport) ||
                        tun->type != tunnel_type) {
                        tun->ofport = u16_to_ofp(ofport);
                        tun->type = tunnel_type;
                        physical_map_changed = true;
                    }
                } else {
                    tun = xmalloc(sizeof *tun);
                    hmap_insert(&tunnels, &tun->hmap_node,
                                hash_string(chassis_id, 0));
                    tun->chassis_id = chassis_id;
                    tun->ofport = u16_to_ofp(ofport);
                    tun->type = tunnel_type;
                    physical_map_changed = true;
                }
                break;
            } else {
                const char *iface_id = smap_get(&iface_rec->external_ids,
                                                "iface-id");
                if (iface_id) {
                    simap_put(&new_localvif_to_ofport, iface_id, ofport);
                }
            }
        }
    }

    //Clean up tunnels that no longer exist
    struct chassis_tunnel *tun, *tun_next;
    HMAP_FOR_EACH_SAFE (tun, tun_next, hmap_node, &tunnels) {
        if (!simap_find(&new_tunnel_to_ofport, tun->chassis_id)) {
            hmap_remove(&tunnels, &tun->hmap_node);
            physical_map_changed = true;
            free(tun);
        }
    }

    //Record OpenFlow ports that changed or disappeared
    struct simap_node *vif_name, *vif_name_next;
    SIMAP_FOR_EACH_SAFE (vif_name, vif_name_next, &localvif_to_ofport) {
        int newport;
        if ((newport = simap_get(&new_localvif_to_ofport, vif_name->name))) {
            if (newport != simap_get(&localvif_to_ofport, vif_name->name)) {
                simap_put(&localvif_to_ofport, vif_name->name, newport);
                physical_map_changed = true;
            }
        } else {
            simap_find_and_delete(&localvif_to_ofport, vif_name->name);
            physical_map_changed = true;
        }
    }
    SIMAP_FOR_EACH (vif_name, &new_localvif_to_ofport) {
        if (!simap_get(&localvif_to_ofport, vif_name->name)) {
            simap_put(&localvif_to_ofport, vif_name->name,
                      simap_get(&new_localvif_to_ofport, vif_name->name));
            physical_map_changed = true;
        }
    }
    if (physical_map_changed) {
        //Wake the poll loop immediately to trigger reprocessing of the
        //logical flow tables
        poll_immediate_wake();
    }

    struct ofpbuf ofpacts;
    ofpbuf_init(&ofpacts, 0);

    //Build table 0 (physical-to-logical mapping), table 32 (tunnel traffic
    //to remote hypervisors), and table 65 (flows that finish one logical
    //element and hand over to the next)
    const struct sbrec_port_binding *binding;
    SBREC_PORT_BINDING_FOR_EACH (binding, ctx->ovnsb_idl) {
        consider_port_binding(mff_ovn_geneve, ct_zones, lports,
                              local_datapaths, binding, chassis,
                              &ofpacts, flow_table);
    }

    //Tables 32 and 33 handle packets destined to multicast groups
    const struct sbrec_multicast_group *mc;
    struct ofpbuf remote_ofpacts;
    ofpbuf_init(&remote_ofpacts, 0);
    SBREC_MULTICAST_GROUP_FOR_EACH (mc, ctx->ovnsb_idl) {
        consider_mc_group(mff_ovn_geneve, ct_zones, local_datapaths, chassis,
                          mc, &ofpacts, &remote_ofpacts, flow_table);
    }

    ofpbuf_uninit(&remote_ofpacts);

    //Table 0, priority 100: packets received on tunnel ports. Geneve and STT
    //encapsulations carry both the logical ingress and egress ports, so we
    //recover MFF_LOG_DATAPATH, MFF_LOG_INPORT and MFF_LOG_OUTPORT from the
    //encapsulation and resubmit to table 33 to deliver to local VMs.
    //VXLAN packets are not handled here.
    HMAP_FOR_EACH (tun, hmap_node, &tunnels) {
        struct match match = MATCH_CATCHALL_INITIALIZER;
        match_set_in_port(&match, tun->ofport);

        ofpbuf_clear(&ofpacts);
        if (tun->type == GENEVE) {
            put_move(MFF_TUN_ID, 0,  MFF_LOG_DATAPATH, 0, 24, &ofpacts);
            put_move(mff_ovn_geneve, 16, MFF_LOG_INPORT, 0, 15,
                     &ofpacts);
            put_move(mff_ovn_geneve, 0, MFF_LOG_OUTPORT, 0, 16,
                     &ofpacts);
        } else if (tun->type == STT) {
            put_move(MFF_TUN_ID, 40, MFF_LOG_INPORT,   0, 15, &ofpacts);
            put_move(MFF_TUN_ID, 24, MFF_LOG_OUTPORT,  0, 16, &ofpacts);
            put_move(MFF_TUN_ID,  0, MFF_LOG_DATAPATH, 0, 24, &ofpacts);
        } else if (tun->type == VXLAN) {
            continue;
        } else {
            OVS_NOT_REACHED();
        }

        put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts);

        ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match,
                        &ofpacts);
    }
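
    //Key layouts implied by the put_move offsets above:
    //  Geneve: VNI bits 0-23 = logical datapath; 32-bit option TLV:
    //          bits 16-30 = logical inport, bits 0-15 = logical outport
    //  STT: 64-bit key: bits 40-54 = logical inport,
    //       bits 24-39 = logical outport, bits 0-23 = logical datapath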

    //Ingress from VXLAN currently only supports gateway connectivity. VXLAN
    //carries only a VNI, which selects the logical datapath; each binding's
    //tunnel_key is loaded into MFF_LOG_INPORT and the packet is resubmitted
    //to table 16 to determine the output port.
    //(In my testing, cross-subnet east-west traffic over VXLAN does not work.)
    HMAP_FOR_EACH (tun, hmap_node, &tunnels) {
        SBREC_PORT_BINDING_FOR_EACH (binding, ctx->ovnsb_idl) {
            struct match match = MATCH_CATCHALL_INITIALIZER;

            match_set_in_port(&match, tun->ofport);
            match_set_tun_id(&match, htonll(binding->datapath->tunnel_key));

            ofpbuf_clear(&ofpacts);
            put_move(MFF_TUN_ID, 0,  MFF_LOG_DATAPATH, 0, 24, &ofpacts);
            put_load(binding->tunnel_key, MFF_LOG_INPORT, 0, 15, &ofpacts);
            put_load(1, MFF_LOG_FLAGS, MLF_RCV_FROM_VXLAN_BIT, 1, &ofpacts);
            put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, &ofpacts);

            ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match,
                            &ofpacts);
        }
    }

    //Table 32, priority 150: packets received over VXLAN lack full metadata,
    //so never send them back out a tunnel; resubmit to table 33
    struct match match;
    match_init_catchall(&match);
    ofpbuf_clear(&ofpacts);
    match_set_reg_masked(&match, MFF_LOG_FLAGS - MFF_REG0,
                         MLF_RCV_FROM_VXLAN, MLF_RCV_FROM_VXLAN);

    put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts);
    ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 150, 0,
                    &match, &ofpacts);

    //Table 32, priority 0: traffic that is neither multicast nor tunneled to
    //a remote chassis is resubmitted to table 33 for local output
    match_init_catchall(&match);
    ofpbuf_clear(&ofpacts);
    put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts);
    ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 0, 0, &match, &ofpacts);

    //Table 34, priority 0: a packet that is not looping back out its ingress
    //port continues into the egress pipeline; clear the registers first
    match_init_catchall(&match);
    ofpbuf_clear(&ofpacts);
    for (int i = 0; i < MFF_N_LOG_REGS; i++) {
        put_load(0, MFF_REG0 + i, 0, 32, &ofpacts);
    }
    put_resubmit(OFTABLE_LOG_EGRESS_PIPELINE, &ofpacts);
    ofctrl_add_flow(flow_table, OFTABLE_CHECK_LOOPBACK, 0, 0, &match,
                    &ofpacts);

    //Table 64, priority 0: packets without the MLF_ALLOW_LOOPBACK flag are
    //resubmitted to table 65 for the logical-to-physical translation
    match_init_catchall(&match);
    ofpbuf_clear(&ofpacts);
    put_resubmit(OFTABLE_LOG_TO_PHY, &ofpacts);
    ofctrl_add_flow(flow_table, OFTABLE_SAVE_INPORT, 0, 0, &match, &ofpacts);
}
static void
consider_port_binding(enum mf_field_id mff_ovn_geneve,
                      const struct simap *ct_zones,
                      const struct lport_index *lports,
                      struct hmap *local_datapaths,
                      const struct sbrec_port_binding *binding,
                      const struct sbrec_chassis *chassis,
                      struct ofpbuf *ofpacts_p,
                      struct hmap *flow_table)
{
    uint32_t dp_key = binding->datapath->tunnel_key;
    uint32_t port_key = binding->tunnel_key;
    if (!get_local_datapath(local_datapaths, dp_key)) {
        return;
    }

    struct match match;
    if (!strcmp(binding->type, "patch")
        || (!strcmp(binding->type, "l3gateway")
            && binding->chassis == chassis)) {
        const char *peer_name = smap_get(&binding->options, "peer");
        const struct sbrec_port_binding *peer
            = peer_name ? lport_lookup_by_name(lports, peer_name) : NULL;
        if (!peer) {
            return; /* the full source also validates the peer's own "peer" */
        }
        const char *peer_peer_name = smap_get(&peer->options, "peer");
        struct zone_ids binding_zones = get_zone_ids(binding, ct_zones);
        //Table 33, priority 100: one flow per port, resubmit to table 34
        //Table 34, priority 100: drop packets whose logical inport == outport
        //Table 64, priority 100: zero the OpenFlow in_port, resubmit to table 65
        put_local_common_flows(dp_key, port_key, false, &binding_zones,
                               ofpacts_p, flow_table);

        //Table 65, priority 100: clear the registers and resubmit to table 16,
        //i.e. the packet is done with this logical element and enters the next
        match_init_catchall(&match);
        ofpbuf_clear(ofpacts_p);
        match_set_metadata(&match, htonll(dp_key));
        match_set_reg(&match, MFF_LOG_OUTPORT - MFF_REG0, port_key);

        size_t clone_ofs = ofpacts_p->size;
        struct ofpact_nest *clone = ofpact_put_CLONE(ofpacts_p);
        ofpact_put_CT_CLEAR(ofpacts_p);
        put_load(0, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p);
        put_load(0, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p);
        put_load(0, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p);
        struct zone_ids peer_zones = get_zone_ids(peer, ct_zones);
        load_logical_ingress_metadata(peer, &peer_zones, ofpacts_p);
        put_load(0, MFF_LOG_FLAGS, 0, 32, ofpacts_p);
        put_load(0, MFF_LOG_OUTPORT, 0, 32, ofpacts_p);
        for (int i = 0; i < MFF_N_LOG_REGS; i++) {
            put_load(0, MFF_LOG_REG0 + i, 0, 32, ofpacts_p);
        }
        put_load(0, MFF_IN_PORT, 0, 16, ofpacts_p);
        put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, ofpacts_p);
        clone = ofpbuf_at_assert(ofpacts_p, clone_ofs, sizeof *clone);
        ofpacts_p->header = clone;
        ofpact_finish_CLONE(ofpacts_p, &clone);

        ofctrl_add_flow(flow_table, OFTABLE_LOG_TO_PHY, 100, 0,
                        &match, ofpacts_p);
        return;
    }

    if (!strcmp(binding->type, "chassisredirect")
        && binding->chassis == chassis) {

        //Table 33, priority 100: each flow matches one logical output port.
        //For a chassisredirect port, rewrite the logical output port to the
        //distributed port, then resubmit to table 34
        match_init_catchall(&match);
        ofpbuf_clear(ofpacts_p);
        match_set_metadata(&match, htonll(dp_key));
        match_set_reg(&match, MFF_LOG_OUTPORT - MFF_REG0, port_key);

        const char *distributed_port = smap_get(&binding->options,
                                                "distributed-port");
        const struct sbrec_port_binding *distributed_binding
            = lport_lookup_by_name(lports, distributed_port);

        if (!distributed_binding) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
            VLOG_WARN_RL(&rl, "No port binding record for distributed "
                         "port %s referred by chassisredirect port %s",
                         distributed_port,
                         binding->logical_port);
        } else if (binding->datapath !=
                   distributed_binding->datapath) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
            VLOG_WARN_RL(&rl,
                         "chassisredirect port %s refers to "
                         "distributed port %s in wrong datapath",
                         binding->logical_port,
                         distributed_port);
        } else {
            put_load(distributed_binding->tunnel_key,
                     MFF_LOG_OUTPORT, 0, 32, ofpacts_p);

            struct zone_ids zone_ids = get_zone_ids(distributed_binding,
                                                    ct_zones);
            if (zone_ids.ct) {
                put_load(zone_ids.ct, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p);
            }
            if (zone_ids.dnat) {
                put_load(zone_ids.dnat, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p);
            }
            if (zone_ids.snat) {
                put_load(zone_ids.snat, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p);
            }

            put_resubmit(OFTABLE_CHECK_LOOPBACK, ofpacts_p);
        }

        ofctrl_add_flow(flow_table, OFTABLE_LOCAL_OUTPUT, 100, 0,
                        &match, ofpacts_p);
        return;
    }

    //Resolve the logical port to an ofport.
    //For a local VIF, ofport is the VIF's OpenFlow port and tun stays NULL.
    //For a port on a remote chassis, ofport is the tunnel port and tun
    //identifies the tunnel to the peer chassis.
    int tag = 0;
    bool nested_container = false;
    ofp_port_t ofport;
    bool is_remote = false;
    if (binding->parent_port && *binding->parent_port) {
        ofport = u16_to_ofp(simap_get(&localvif_to_ofport,
                                      binding->parent_port));
        if (ofport) {
            tag = *binding->tag;
            nested_container = true;
        }
    } else {
        ofport = u16_to_ofp(simap_get(&localvif_to_ofport,
                                      binding->logical_port));
        if ((!strcmp(binding->type, "localnet")
            || !strcmp(binding->type, "l2gateway"))
            && ofport && binding->tag) {
            tag = *binding->tag;
        }
    }

    const struct chassis_tunnel *tun = NULL;
    const struct sbrec_port_binding *localnet_port =
        get_localnet_port(local_datapaths, dp_key);
    if (!ofport) {
        is_remote = true;
        if (!binding->chassis) {
            return;
        }
        if (localnet_port) {
            ofport = u16_to_ofp(simap_get(&localvif_to_ofport,
                                          localnet_port->logical_port));
            if (!ofport) {
                return;
            }
        } else {
            tun = chassis_tunnel_find(binding->chassis->name);
            if (!tun) {
                return;
            }
            ofport = tun->ofport;
        }
    }

    if (!is_remote) {
        struct zone_ids zone_ids = get_zone_ids(binding, ct_zones);
        put_local_common_flows(dp_key, port_key, nested_container, &zone_ids,
                               ofpacts_p, flow_table);

        //Table 0, priorities 150 and 100.
        //Priority 150 handles tagged traffic (containers nested in a VM, or
        //a VLAN on the local network): match the tag and strip it.
        //Priority 100 handles VM traffic and untagged local networks.
        //Both set the logical input port and logical datapath, then
        //resubmit to table 16.
        ofpbuf_clear(ofpacts_p);
        match_init_catchall(&match);
        match_set_in_port(&match, ofport);

        if (tag || !strcmp(binding->type, "localnet")
            || !strcmp(binding->type, "l2gateway")) {
            match_set_dl_vlan(&match, htons(tag));
            if (nested_container) {
                put_load(MLF_ALLOW_LOOPBACK, MFF_LOG_FLAGS, 0, 1, ofpacts_p);
            }
            ofpact_put_STRIP_VLAN(ofpacts_p);
        }

        uint32_t ofpacts_orig_size = ofpacts_p->size;

        load_logical_ingress_metadata(binding, &zone_ids, ofpacts_p);

        put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, ofpacts_p);
        ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG,
                        tag ? 150 : 100, 0, &match, ofpacts_p);

        if (!tag && (!strcmp(binding->type, "localnet")
                     || !strcmp(binding->type, "l2gateway"))) {

            ofpbuf_pull(ofpacts_p, ofpacts_orig_size);
            match_set_dl_tci_masked(&match, 0, htons(VLAN_CFI));
            ofctrl_add_flow(flow_table, 0, 100, 0, &match, ofpacts_p);
        }

        //Table 65, priority 100: deliver the packet to the local VIF
        match_init_catchall(&match);
        ofpbuf_clear(ofpacts_p);
        match_set_metadata(&match, htonll(dp_key));
        match_set_reg(&match, MFF_LOG_OUTPORT - MFF_REG0, port_key);
        if (tag) {
            struct ofpact_vlan_vid *vlan_vid;
            vlan_vid = ofpact_put_SET_VLAN_VID(ofpacts_p);
            vlan_vid->vlan_vid = tag;
            vlan_vid->push_vlan_if_needed = true;
        }
        ofpact_put_OUTPUT(ofpacts_p)->port = ofport;
        if (tag) {
            ofpact_put_STRIP_VLAN(ofpacts_p);
        }
        ofctrl_add_flow(flow_table, OFTABLE_LOG_TO_PHY, 100, 0,
                        &match, ofpacts_p);
    } else if (!tun) {
        //Table 33, priority 100: deliver via the localnet port. Each flow
        //matches one logical output port, rewrites the output port to the
        //localnet port, and resubmits to the same table 33
        match_init_catchall(&match);
        ofpbuf_clear(ofpacts_p);

        match_set_metadata(&match, htonll(dp_key));
        match_set_reg(&match, MFF_LOG_OUTPORT - MFF_REG0, port_key);

        put_load(localnet_port->tunnel_key, MFF_LOG_OUTPORT, 0, 32, ofpacts_p);

        put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_p);
        ofctrl_add_flow(flow_table, OFTABLE_LOCAL_OUTPUT, 100, 0,
                        &match, ofpacts_p);
    } else {
        //Table 32, priority 100: send to a remote chassis. Each flow matches
        //one logical output port and encapsulates the packet to the peer
        match_init_catchall(&match);
        ofpbuf_clear(ofpacts_p);

        match_set_metadata(&match, htonll(dp_key));
        match_set_reg(&match, MFF_LOG_OUTPORT - MFF_REG0, port_key);

        put_encapsulation(mff_ovn_geneve, tun, binding->datapath,
                          port_key, ofpacts_p);

        ofpact_put_OUTPUT(ofpacts_p)->port = ofport;
        ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, 0,
                        &match, ofpacts_p);
    }
}
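
To verify the per-port flows installed above on a hypervisor, the usual OVS CLI works (illustrative):

  ovs-ofctl dump-flows br-int table=0    # physical-to-logical (VIFs, tunnels)
  ovs-ofctl dump-flows br-int table=33   # local output / localnet / chassisredirect
  ovs-ofctl dump-flows br-int table=65   # logical-to-physical delivery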