Previously we introduced the rough correspondence between the OpenFlow flow tables and the logical flow tables. This time we look at how that mapping is actually implemented, walking through the code of ovn-controller.
int
main(int argc, char *argv[])
{
// Initialize the group IDs used by the load balancer
struct group_table group_table;
group_table.group_ids = bitmap_allocate(MAX_OVN_GROUPS);
bitmap_set1(group_table.group_ids, 0); /* Group id 0 is invalid. */
hmap_init(&group_table.desired_groups);
hmap_init(&group_table.existing_groups);
// Connect to the local OVS ovsdb instance. We do not monitor all tables, so each module must register the subset it is interested in.
struct ovsdb_idl_loop ovs_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
ovsdb_idl_create(ovs_remote, &ovsrec_idl_class, false, true));
ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_open_vswitch);
ovsdb_idl_add_column(ovs_idl_loop.idl,
&ovsrec_open_vswitch_col_external_ids);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_open_vswitch_col_bridges);
ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_interface);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_name);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_type);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_options);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_ofport);
ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_port);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_name);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_interfaces);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_external_ids);
ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_bridge);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_ports);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_name);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_fail_mode);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_other_config);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_external_ids);
//chassis_register_ovs_idl
ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_open_vswitch);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_open_vswitch_col_external_ids);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_open_vswitch_col_iface_types);
ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_bridge);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_datapath_type);
//encaps_register_ovs_idl
ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_bridge);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_ports);
ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_port);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_name);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_interfaces);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_external_ids);
ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_interface);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_name);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_type);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_options);
//binding_register_ovs_idl
ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_open_vswitch);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_open_vswitch_col_bridges);
ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_bridge);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_name);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_ports);
ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_port);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_name);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_interfaces);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_qos);
ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_interface);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_name);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_external_ids);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_status);
ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_qos);
//physical_register_ovs_idl
ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_bridge);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_bridge_col_ports);
ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_port);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_name);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_interfaces);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_port_col_external_ids);
ovsdb_idl_add_table(ovs_idl_loop.idl, &ovsrec_table_interface);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_name);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_ofport);
ovsdb_idl_add_column(ovs_idl_loop.idl, &ovsrec_interface_col_external_ids);
// Block until we are connected to the OVS ovsdb and have retrieved the contents registered above
ovsdb_idl_get_initial_snapshot(ovs_idl_loop.idl);
// Connect to the southbound DB on the control node and monitor all table contents, except nb_cfg, which is omitted below.
// Block until the connection succeeds and the relevant contents have been retrieved.
char *ovnsb_remote = get_ovnsb_remote(ovs_idl_loop.idl);
struct ovsdb_idl_loop ovnsb_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
ovsdb_idl_create(ovnsb_remote, &sbrec_idl_class, true, true));
ovsdb_idl_omit_alert(ovnsb_idl_loop.idl, &sbrec_chassis_col_nb_cfg);
update_sb_monitors(ovnsb_idl_loop.idl, NULL, NULL, NULL);
ovsdb_idl_get_initial_snapshot(ovnsb_idl_loop.idl);
// Initialize the conntrack zones
struct simap ct_zones = SIMAP_INITIALIZER(&ct_zones);
struct shash pending_ct_zones = SHASH_INITIALIZER(&pending_ct_zones);
unsigned long ct_zone_bitmap[BITMAP_N_LONGS(MAX_CT_ZONES)];
memset(ct_zone_bitmap, 0, sizeof ct_zone_bitmap);
// Zone 0 is the default zone; reserve it
bitmap_set1(ct_zone_bitmap, 0);
restore_ct_zones(ovs_idl_loop.idl, &ct_zones, ct_zone_bitmap);
unixctl_command_register("ct-zone-list", "", 0, 0,
ct_zone_list, &ct_zones);
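// ct-zone-list is a unixctl command, so (via the usual unixctl socket) the
// current zone assignments can be dumped at runtime with:
//   ovs-appctl -t ovn-controller ct-zone-list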
struct pending_pkt pending_pkt = { .conn = NULL };
unixctl_command_register("inject-pkt", "MICROFLOW", 1, 1, inject_pkt,
&pending_pkt);
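// Similarly, a test packet can be injected into the logical pipeline with:
//   ovs-appctl -t ovn-controller inject-pkt MICROFLOW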
while (!exiting) {
// Check whether the southbound DB remote has changed
char *new_ovnsb_remote = get_ovnsb_remote(ovs_idl_loop.idl);
if (strcmp(ovnsb_remote, new_ovnsb_remote)) {
free(ovnsb_remote);
ovnsb_remote = new_ovnsb_remote;
ovsdb_idl_set_remote(ovnsb_idl_loop.idl, ovnsb_remote, true);
} else {
free(new_ovnsb_remote);
}
struct controller_ctx ctx = {
.ovs_idl = ovs_idl_loop.idl,
.ovs_idl_txn = ovsdb_idl_loop_run(&ovs_idl_loop),
.ovnsb_idl = ovnsb_idl_loop.idl,
.ovnsb_idl_txn = ovsdb_idl_loop_run(&ovnsb_idl_loop),
};
// Set the southbound DB probe interval (5 seconds by default)
update_probe_interval(&ctx);
// Holds struct local_datapath nodes
struct hmap local_datapaths = HMAP_INITIALIZER(&local_datapaths);
// Holds the names of all logical ports resident on this chassis:
// VM VIFs, L2 gateway ports assigned via l2gateway-chassis,
// and localnet ports.
struct sset local_lports = SSET_INITIALIZER(&local_lports);
// Get the integration bridge, creating it if absent; the name is not necessarily br-int
const struct ovsrec_bridge *br_int = get_br_int(&ctx);
// Get the local OVS system-id
const char *chassis_id = get_chassis_id(ctx.ovs_idl);
struct ldatapath_index ldatapaths;
struct lport_index lports;
struct mcgroup_index mcgroups;
ldatapath_index_init(&ldatapaths, ctx.ovnsb_idl);
lport_index_init(&lports, ctx.ovnsb_idl);
mcgroup_index_init(&mcgroups, ctx.ovnsb_idl);
const struct sbrec_chassis *chassis = NULL;
if (chassis_id) {
// Using the local system-id, collect the local encap type and IP, hostname,
// datapath type, and interface types, and publish them to OVN's Chassis table.
// Inspect OVN's chassis records with: ovn-sbctl list Chassis
chassis = chassis_run(&ctx, chassis_id, br_int);
// Build tunnels according to the OVN configuration: find the tunnel
// parameters needed to reach the other chassis and create the tunnels.
// "Creating" a tunnel really just means inserting a port record into the OVS database.
encaps_run(&ctx, br_int, chassis_id);
// First collect the port information: local ports expose iface-id in
// external-ids, egress ports expose remote_ip in options (inspect with
// ovs-vsctl list interface). The iface-id obtained above is matched against
// logical_port in the southbound Port_Binding table. Also set QoS on egress
// ports, since OVN applies a VM's egress QoS through queues on the egress interface.
binding_run(&ctx, br_int, chassis, &ldatapaths, &lports,
&local_datapaths, &local_lports);
}
if (br_int && chassis) {
struct shash addr_sets = SHASH_INITIALIZER(&addr_sets);
addr_sets_init(&ctx, &addr_sets);
// I could not find much information on this. As far as I know, OVS 2.6 still
// had patch ports; from OVS 2.7 on there are no patch ports, and everything
// is resubmitted through the flow tables, matching again from table 16.
patch_run(&ctx, br_int, chassis, &local_datapaths);
enum mf_field_id mff_ovn_geneve = ofctrl_run(br_int,
&pending_ct_zones);
pinctrl_run(&ctx, &lports, br_int, chassis, &local_datapaths);
update_ct_zones(&local_lports, &local_datapaths, &ct_zones,
ct_zone_bitmap, &pending_ct_zones);
if (ctx.ovs_idl_txn) {
// Commit the ct_zone assignments; inspect with: ovs-vsctl list Bridge
commit_ct_zones(br_int, &pending_ct_zones);
struct hmap flow_table = HMAP_INITIALIZER(&flow_table);
// This is where the logical flows are translated; details below
lflow_run(&ctx, chassis, &lports, &mcgroups,
&local_datapaths, &group_table, &ct_zones,
&addr_sets, &flow_table);
// Flows tied to the physical devices rather than to logical flows; also detailed below
physical_run(&ctx, mff_ovn_geneve,
br_int, chassis, &ct_zones, &lports,
&flow_table, &local_datapaths);
}
}
}
}
lflow_run
lflow_run has two main jobs: translating logical flows, via add_logical_flows --> consider_logical_flow, and installing the neighbor-subsystem flows, via add_neighbor_flows --> consider_neighbor_flow.
- Logical flow translation: ingress tables 0-15 map to OpenFlow tables 16-31, and egress tables 0-15 map to OpenFlow tables 48-63.
- The neighbor-subsystem flows go into OpenFlow table 66 (see the OFTABLE_* cheat sheet below).
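For reference, the OFTABLE_* constants that the code below keeps using encode the whole layout. The values here are copied from ovn/controller/lflow.h of roughly this vintage of the code, so treat them as a cheat sheet rather than something guaranteed for every version:
#define OFTABLE_PHY_TO_LOG            0  /* physical-to-logical mapping */
#define OFTABLE_LOG_INGRESS_PIPELINE 16  /* ingress logical tables 0-15 => 16-31 */
#define OFTABLE_REMOTE_OUTPUT        32  /* output to remote chassis via tunnel */
#define OFTABLE_LOCAL_OUTPUT         33  /* output to local ports */
#define OFTABLE_CHECK_LOOPBACK       34  /* drop when inport == outport */
#define OFTABLE_LOG_EGRESS_PIPELINE  48  /* egress logical tables 0-15 => 48-63 */
#define OFTABLE_SAVE_INPORT          64  /* save in_port before output */
#define OFTABLE_LOG_TO_PHY           65  /* logical-to-physical mapping */
#define OFTABLE_MAC_BINDING          66  /* neighbor (MAC binding) flows */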
static void
consider_logical_flow(const struct lport_index *lports,
const struct mcgroup_index *mcgroups,
const struct sbrec_logical_flow *lflow,
const struct hmap *local_datapaths,
struct group_table *group_table,
const struct simap *ct_zones,
const struct sbrec_chassis *chassis,
struct hmap *dhcp_opts,
struct hmap *dhcpv6_opts,
uint32_t *conj_id_ofs,
const struct shash *addr_sets,
struct hmap *flow_table)
{
// Determine whether the logical flow belongs to the ingress or the egress pipeline
bool ingress = !strcmp(lflow->pipeline, "ingress");
const struct sbrec_datapath_binding *ldp = lflow->logical_datapath;
if (!get_local_datapath(local_datapaths, ldp->tunnel_key)) {
return;
}
// Map the logical table ID to an OpenFlow table ID: ingress starts at table 16, egress at table 48
uint8_t first_ptable = (ingress
? OFTABLE_LOG_INGRESS_PIPELINE
: OFTABLE_LOG_EGRESS_PIPELINE);
uint8_t ptable = first_ptable + lflow->table_id;
// This appears to be the output stage: table 32 for ingress flows, table 64 for egress flows
uint8_t output_ptable = (ingress
? OFTABLE_REMOTE_OUTPUT
: OFTABLE_SAVE_INPORT);
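// Example: an ingress lflow with table_id 11 is installed into OpenFlow
// table 16 + 11 = 27 and its output stage is table 32; the same table_id in
// the egress pipeline would land in table 48 + 11 = 59 with output stage 64.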
// From here, parse the logical flow's actions, to be encoded as OpenFlow actions later
uint64_t ovnacts_stub[1024 / 8];
struct ofpbuf ovnacts = OFPBUF_STUB_INITIALIZER(ovnacts_stub);
struct ovnact_parse_params pp = {
.symtab = &symtab,
.dhcp_opts = dhcp_opts,
.dhcpv6_opts = dhcpv6_opts,
.pipeline = ingress ? OVNACT_P_INGRESS : OVNACT_P_EGRESS,
.n_tables = LOG_PIPELINE_LEN,
.cur_ltable = lflow->table_id,
};
struct expr *prereqs;
char *error;
// Newly added code that I have not figured out; it reportedly replaces parts of lflow->actions with local info.....
const char *new_actions = (const char *)replace_local_info(lflow->actions, chassis);
// The detailed action parsing is omitted here; see parse_action
error = ovnacts_parse_string(new_actions, &pp, &ovnacts, &prereqs);
if (error) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
VLOG_WARN_RL(&rl, "error parsing actions \"%s\": %s",
lflow->actions, error);
free(error);
ovnacts_free(ovnacts.data, ovnacts.size);
ofpbuf_uninit(&ovnacts);
return;
}
// Encode the previously parsed actions into OpenFlow actions
uint64_t ofpacts_stub[1024 / 8];
struct ofpbuf ofpacts = OFPBUF_STUB_INITIALIZER(ofpacts_stub);
struct lookup_port_aux aux = {
.lports = lports,
.mcgroups = mcgroups,
.dp = lflow->logical_datapath
};
struct ovnact_encode_params ep = {
.lookup_port = lookup_port_cb,
.aux = &aux,
.is_switch = is_switch(ldp),
.is_gateway_router = is_gateway_router(ldp, local_datapaths),
.ct_zones = ct_zones,
.group_table = group_table,
.pipeline = ingress ? OVNACT_P_INGRESS : OVNACT_P_EGRESS,
.ingress_ptable = OFTABLE_LOG_INGRESS_PIPELINE,
.egress_ptable = OFTABLE_LOG_EGRESS_PIPELINE,
.output_ptable = output_ptable,
.mac_bind_ptable = OFTABLE_MAC_BINDING,
};
// Each action has a corresponding pair of parse/encode functions; not covered in detail here
ovnacts_encode(ovnacts.data, ovnacts.size, &ep, &ofpacts);
ovnacts_free(ovnacts.data, ovnacts.size);
ofpbuf_uninit(&ovnacts);
// Translate the OVN match into an OpenFlow match
struct hmap matches;
struct expr *expr;
// Same as above; no idea what this does
const char *new_match = (const char *)replace_local_info(lflow->match, chassis);
// Parse the match string into tokens, e.g. "&&" is parsed as LEX_T_LOG_AND
expr = expr_parse_string(new_match, &symtab, addr_sets, &error);
if (!error) {
if (prereqs) {
expr = expr_combine(EXPR_T_AND, expr, prereqs);
prereqs = NULL;
}
expr = expr_annotate(expr, &symtab, &error);
}
if (error) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
VLOG_WARN_RL(&rl, "error parsing match \"%s\": %s",
lflow->match, error);
expr_destroy(prereqs);
ofpbuf_uninit(&ofpacts);
free(error);
return;
}
struct condition_aux cond_aux = { lports, chassis };
expr = expr_simplify(expr, is_chassis_resident_cb, &cond_aux);
expr = expr_normalize(expr);
// Convert the expression into matches
uint32_t n_conjs = expr_to_matches(expr, lookup_port_cb, &aux,
&matches);
expr_destroy(expr);
struct expr_match *m;
HMAP_FOR_EACH (m, hmap_node, &matches) {
match_set_metadata(&m->match,
htonll(lflow->logical_datapath->tunnel_key));
if (m->match.wc.masks.conj_id) {
m->match.flow.conj_id += *conj_id_ofs;
}
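// Conjunction IDs produced by expr_to_matches() are only unique within this
// one logical flow, so each flow shifts them by *conj_id_ofs to keep them
// globally unique in the OpenFlow table; that is also why expr_to_matches()
// reports n_conjs back to the caller.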
if (!m->n) {
// Emit the OpenFlow flow
ofctrl_add_flow(flow_table, ptable, lflow->priority,
lflow->header_.uuid.parts[0], &m->match, &ofpacts);
} else {
uint64_t conj_stubs[64 / 8];
struct ofpbuf conj;
ofpbuf_use_stub(&conj, conj_stubs, sizeof conj_stubs);
for (int i = 0; i < m->n; i++) {
const struct cls_conjunction *src = &m->conjunctions[i];
struct ofpact_conjunction *dst;
dst = ofpact_put_CONJUNCTION(&conj);
dst->id = src->id + *conj_id_ofs;
dst->clause = src->clause;
dst->n_clauses = src->n_clauses;
}
ofctrl_add_flow(flow_table, ptable, lflow->priority, 0, &m->match,
&conj);
ofpbuf_uninit(&conj);
}
}
}
static void
consider_neighbor_flow(const struct lport_index *lports,
const struct sbrec_mac_binding *b,
struct hmap *flow_table)
{
// Look up the Port_Binding from the MAC_Binding's logical port
const struct sbrec_port_binding *pb
= lport_lookup_by_name(lports, b->logical_port);
struct eth_addr mac;
// Parse the MAC address
eth_addr_from_string(b->mac, &mac);
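// Both b->ip and b->mac come from the southbound MAC_Binding table, which
// ovn-controller itself fills in (via pinctrl's handling of the put_arp
// action) as ARP replies are learned.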
struct match match = MATCH_CATCHALL_INITIALIZER;
if (strchr(b->ip, '.')) {
ovs_be32 ip;
// Parse the IP address
ip_parse(b->ip, &ip);
// Then match the IP against reg0
match_set_reg(&match, 0, ntohl(ip));
}
uint64_t stub[1024 / 8];
struct ofpbuf ofpacts = OFPBUF_STUB_INITIALIZER(stub);
// The action rewrites the destination MAC
put_load(mac.ea, sizeof mac.ea, MFF_ETH_DST, 0, 48, &ofpacts);
// Add the flow
ofctrl_add_flow(flow_table, OFTABLE_MAC_BINDING, 100, 0, &match, &ofpacts);
ofpbuf_uninit(&ofpacts);
}
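As far as I can tell, these table-66 flows are the counterpart of the get_arp() logical action: when a router pipeline resolves a next hop, the encoded action resubmits to OFTABLE_MAC_BINDING with the next-hop IP in reg0, and the flow installed above rewrites eth.dst to the learned MAC.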
physical_run
void
physical_run(struct controller_ctx *ctx, enum mf_field_id mff_ovn_geneve,
const struct ovsrec_bridge *br_int,
const struct sbrec_chassis *chassis,
const struct simap *ct_zones, struct lport_index *lports,
struct hmap *flow_table, struct hmap *local_datapaths)
{
bool physical_map_changed = false;
struct simap new_localvif_to_ofport =
SIMAP_INITIALIZER(&new_localvif_to_ofport);
struct simap new_tunnel_to_ofport =
SIMAP_INITIALIZER(&new_tunnel_to_ofport);
for (int i = 0; i < br_int->n_ports; i++) {
const struct ovsrec_port *port_rec = br_int->ports[i];
// Inspect the corresponding data with: ovs-vsctl list port
const char *chassis_id = smap_get(&port_rec->external_ids,
"ovn-chassis-id");
const char *localnet = smap_get(&port_rec->external_ids,
"ovn-localnet-port");
const char *l2gateway = smap_get(&port_rec->external_ids,
"ovn-l2gateway-port");
for (int j = 0; j < port_rec->n_interfaces; j++) {
const struct ovsrec_interface *iface_rec = port_rec->interfaces[j];
// Inspect with: ovs-vsctl list interface
int64_t ofport = iface_rec->ofport[0];
/* Record as patch to local net, logical patch port, chassis, or
* local logical port. */
bool is_patch = !strcmp(iface_rec->type, "patch");
// localnet patch ports and L2 gateway patch ports are both treated as VIFs
if (is_patch && localnet) {
simap_put(&new_localvif_to_ofport, localnet, ofport);
break;
} else if (is_patch && l2gateway) {
simap_put(&new_localvif_to_ofport, l2gateway, ofport);
break;
// A chassis_id means this is a tunnel port to another host
} else if (chassis_id) {
enum chassis_tunnel_type tunnel_type;
if (!strcmp(iface_rec->type, "geneve")) {
tunnel_type = GENEVE;
if (!mff_ovn_geneve) {
continue;
}
} else if (!strcmp(iface_rec->type, "stt")) {
tunnel_type = STT;
} else if (!strcmp(iface_rec->type, "vxlan")) {
tunnel_type = VXLAN;
} else {
continue;
}
// Find the tunnel for this chassis_id: update it if it exists, create it otherwise
simap_put(&new_tunnel_to_ofport, chassis_id, ofport);
struct chassis_tunnel *tun = chassis_tunnel_find(chassis_id);
if (tun) {
if (tun->ofport != u16_to_ofp(ofport) ||
tun->type != tunnel_type) {
tun->ofport = u16_to_ofp(ofport);
tun->type = tunnel_type;
physical_map_changed = true;
}
} else {
tun = xmalloc(sizeof *tun);
hmap_insert(&tunnels, &tun->hmap_node,
hash_string(chassis_id, 0));
tun->chassis_id = chassis_id;
tun->ofport = u16_to_ofp(ofport);
tun->type = tunnel_type;
physical_map_changed = true;
}
break;
} else {
const char *iface_id = smap_get(&iface_rec->external_ids,
"iface-id");
if (iface_id) {
simap_put(&new_localvif_to_ofport, iface_id, ofport);
}
}
}
}
// Clean up tunnels that no longer exist
struct chassis_tunnel *tun, *tun_next;
HMAP_FOR_EACH_SAFE (tun, tun_next, hmap_node, &tunnels) {
if (!simap_find(&new_tunnel_to_ofport, tun->chassis_id)) {
hmap_remove(&tunnels, &tun->hmap_node);
physical_map_changed = true;
free(tun);
}
}
// Record changed or deleted OpenFlow ports
struct simap_node *vif_name, *vif_name_next;
SIMAP_FOR_EACH_SAFE (vif_name, vif_name_next, &localvif_to_ofport) {
int newport;
if ((newport = simap_get(&new_localvif_to_ofport, vif_name->name))) {
if (newport != simap_get(&localvif_to_ofport, vif_name->name)) {
simap_put(&localvif_to_ofport, vif_name->name, newport);
physical_map_changed = true;
}
} else {
simap_find_and_delete(&localvif_to_ofport, vif_name->name);
physical_map_changed = true;
}
}
SIMAP_FOR_EACH (vif_name, &new_localvif_to_ofport) {
if (!simap_get(&localvif_to_ofport, vif_name->name)) {
simap_put(&localvif_to_ofport, vif_name->name,
simap_get(&new_localvif_to_ofport, vif_name->name));
physical_map_changed = true;
}
}
if (physical_map_changed) {
// Wake the main loop so the logical flow table is reprocessed
poll_immediate_wake();
}
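// poll_immediate_wake() makes the next poll_block() return immediately, so
// the main loop starts another iteration and lflow_run()/physical_run()
// recompute the flows against the updated physical map.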
struct ofpbuf ofpacts;
ofpbuf_init(&ofpacts, 0);
// Set up table 0, the physical-to-logical mapping
// Set up table 32, tunneling traffic to remote hypervisors
// Set up table 65, the flows that step from one logical element to the next
const struct sbrec_port_binding *binding;
SBREC_PORT_BINDING_FOR_EACH (binding, ctx->ovnsb_idl) {
consider_port_binding(mff_ovn_geneve, ct_zones, lports,
local_datapaths, binding, chassis,
&ofpacts, flow_table);
}
// Tables 32 and 33 handle delivery of packets to multicast groups
const struct sbrec_multicast_group *mc;
struct ofpbuf remote_ofpacts;
ofpbuf_init(&remote_ofpacts, 0);
SBREC_MULTICAST_GROUP_FOR_EACH (mc, ctx->ovnsb_idl) {
consider_mc_group(mff_ovn_geneve, ct_zones, local_datapaths, chassis,
mc, &ofpacts, &remote_ofpacts, flow_table);
}
ofpbuf_uninit(&remote_ofpacts);
// Table 0, priority 100: handle packets received on tunnel ports.
// Geneve and STT encapsulations carry the logical ingress and egress ports,
// so MFF_LOG_DATAPATH, MFF_LOG_INPORT and MFF_LOG_OUTPORT can be recovered
// from the encapsulation; then resubmit to table 33 to deliver to local VMs.
// VXLAN packets are not handled here.
HMAP_FOR_EACH (tun, hmap_node, &tunnels) {
struct match match = MATCH_CATCHALL_INITIALIZER;
match_set_in_port(&match, tun->ofport);
ofpbuf_clear(&ofpacts);
if (tun->type == GENEVE) {
put_move(MFF_TUN_ID, 0, MFF_LOG_DATAPATH, 0, 24, &ofpacts);
put_move(mff_ovn_geneve, 16, MFF_LOG_INPORT, 0, 15,
&ofpacts);
put_move(mff_ovn_geneve, 0, MFF_LOG_OUTPORT, 0, 16,
&ofpacts);
} else if (tun->type == STT) {
put_move(MFF_TUN_ID, 40, MFF_LOG_INPORT, 0, 15, &ofpacts);
put_move(MFF_TUN_ID, 24, MFF_LOG_OUTPORT, 0, 16, &ofpacts);
put_move(MFF_TUN_ID, 0, MFF_LOG_DATAPATH, 0, 24, &ofpacts);
} else if (tun->type == VXLAN) {
continue;
} else {
OVS_NOT_REACHED();
}
put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts);
ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match,
&ofpacts);
}
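// Reading the moves above: Geneve carries the datapath key in the 24-bit VNI
// and both logical ports in a TLV option (option bits 16..30 = inport,
// bits 0..15 = outport), while STT packs everything into its 64-bit tun_id
// (bits 40..54 = inport, 24..39 = outport, 0..23 = datapath).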
// VXLAN is currently only supported for connecting gateways. VXLAN carries
// only the VNI, which selects the datapath (moved into MFF_LOG_DATAPATH);
// the logical inport is loaded per port binding into MFF_LOG_INPORT, then
// the packet is resubmitted to table 16 to determine the output port.
// In my testing, cross-subnet east-west traffic does not work this way.
HMAP_FOR_EACH (tun, hmap_node, &tunnels) {
SBREC_PORT_BINDING_FOR_EACH (binding, ctx->ovnsb_idl) {
struct match match = MATCH_CATCHALL_INITIALIZER;
match_set_in_port(&match, tun->ofport);
match_set_tun_id(&match, htonll(binding->datapath->tunnel_key));
ofpbuf_clear(&ofpacts);
put_move(MFF_TUN_ID, 0, MFF_LOG_DATAPATH, 0, 24, &ofpacts);
put_load(binding->tunnel_key, MFF_LOG_INPORT, 0, 15, &ofpacts);
put_load(1, MFF_LOG_FLAGS, MLF_RCV_FROM_VXLAN_BIT, 1, &ofpacts);
put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, &ofpacts);
ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match,
&ofpacts);
}
}
// Table 32, priority 150: packets received over VXLAN lack complete
// metadata, so never send them back out a tunnel; resubmit to table 33 instead.
struct match match;
match_init_catchall(&match);
ofpbuf_clear(&ofpacts);
match_set_reg_masked(&match, MFF_LOG_FLAGS - MFF_REG0,
MLF_RCV_FROM_VXLAN, MLF_RCV_FROM_VXLAN);
put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts);
ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 150, 0,
&match, &ofpacts);
// Table 32, priority 0: packets that are neither multicast nor tunneled (i.e. with a local output port) are resubmitted to table 33
match_init_catchall(&match);
ofpbuf_clear(&ofpacts);
put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts);
ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 0, 0, &match, &ofpacts);
// Table 34, priority 0: packets that are not looped back out the ingress port but proceed into the egress pipeline have their registers cleared
match_init_catchall(&match);
ofpbuf_clear(&ofpacts);
for (int i = 0; i < MFF_N_LOG_REGS; i++) {
put_load(0, MFF_REG0 + i, 0, 32, &ofpacts);
}
put_resubmit(OFTABLE_LOG_EGRESS_PIPELINE, &ofpacts);
ofctrl_add_flow(flow_table, OFTABLE_CHECK_LOOPBACK, 0, 0, &match,
&ofpacts);
// Table 64, priority 0: packets without the MLF_ALLOW_LOOPBACK flag are
// resubmitted to table 65 for the logical-to-physical translation.
match_init_catchall(&match);
ofpbuf_clear(&ofpacts);
put_resubmit(OFTABLE_LOG_TO_PHY, &ofpacts);
ofctrl_add_flow(flow_table, OFTABLE_SAVE_INPORT, 0, 0, &match, &ofpacts);
}
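Putting lflow_run and physical_run together, a packet traverses: table 0 (physical-to-logical) -> tables 16-31 (logical ingress) -> table 32 or 33 (remote or local output) -> table 34 (loopback check) -> tables 48-63 (logical egress) -> table 64 (save in_port) -> table 65 (logical-to-physical output).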
static void
consider_port_binding(enum mf_field_id mff_ovn_geneve,
const struct simap *ct_zones,
const struct lport_index *lports,
struct hmap *local_datapaths,
const struct sbrec_port_binding *binding,
const struct sbrec_chassis *chassis,
struct ofpbuf *ofpacts_p,
struct hmap *flow_table)
{
uint32_t dp_key = binding->datapath->tunnel_key;
uint32_t port_key = binding->tunnel_key;
if (!get_local_datapath(local_datapaths, dp_key)) {
return;
}
struct match match;
if (!strcmp(binding->type, "patch")
|| (!strcmp(binding->type, "l3gateway")
&& binding->chassis == chassis)) {
const char *peer_name = smap_get(&binding->options, "peer");
const struct sbrec_port_binding *peer = lport_lookup_by_name(
lports, peer_name);
const char *peer_peer_name = smap_get(&peer->options, "peer");
struct zone_ids binding_zones = get_zone_ids(binding, ct_zones);
// Table 33, priority 100: one flow per port, resubmit to table 34.
// Table 34, priority 100: drop packets whose logical inport equals their logical outport.
// Table 64, priority 100: set the OpenFlow in_port to 0, resubmit to table 65.
put_local_common_flows(dp_key, port_key, false, &binding_zones,
ofpacts_p, flow_table);
// Table 65, priority 100: zero the registers and resubmit to table 16,
// i.e. the packet has finished one logical element and now enters the next.
match_init_catchall(&match);
ofpbuf_clear(ofpacts_p);
match_set_metadata(&match, htonll(dp_key));
match_set_reg(&match, MFF_LOG_OUTPORT - MFF_REG0, port_key);
size_t clone_ofs = ofpacts_p->size;
struct ofpact_nest *clone = ofpact_put_CLONE(ofpacts_p);
ofpact_put_CT_CLEAR(ofpacts_p);
put_load(0, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p);
put_load(0, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p);
put_load(0, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p);
struct zone_ids peer_zones = get_zone_ids(peer, ct_zones);
load_logical_ingress_metadata(peer, &peer_zones, ofpacts_p);
put_load(0, MFF_LOG_FLAGS, 0, 32, ofpacts_p);
put_load(0, MFF_LOG_OUTPORT, 0, 32, ofpacts_p);
for (int i = 0; i < MFF_N_LOG_REGS; i++) {
put_load(0, MFF_LOG_REG0 + i, 0, 32, ofpacts_p);
}
put_load(0, MFF_IN_PORT, 0, 16, ofpacts_p);
put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, ofpacts_p);
clone = ofpbuf_at_assert(ofpacts_p, clone_ofs, sizeof *clone);
ofpacts_p->header = clone;
ofpact_finish_CLONE(ofpacts_p, &clone);
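// Note: the CT_CLEAR and register resets above are wrapped in a clone()
// action, so the peer datapath's ingress pipeline starts from a clean slate
// without clobbering the pipeline state outside the clone.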
ofctrl_add_flow(flow_table, OFTABLE_LOG_TO_PHY, 100, 0,
&match, ofpacts_p);
return;
}
if (!strcmp(binding->type, "chassisredirect")
&& binding->chassis == chassis) {
// Table 33, priority 100: each flow matches one logical output port
// (MFF_LOG_OUTPORT, i.e. reg15) and resubmits to table 34; for a
// chassisredirect port, the logical output port is rewritten first.
match_init_catchall(&match);
ofpbuf_clear(ofpacts_p);
match_set_metadata(&match, htonll(dp_key));
match_set_reg(&match, MFF_LOG_OUTPORT - MFF_REG0, port_key);
const char *distributed_port = smap_get(&binding->options,
"distributed-port");
const struct sbrec_port_binding *distributed_binding
= lport_lookup_by_name(lports, distributed_port);
if (!distributed_binding) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
VLOG_WARN_RL(&rl, "No port binding record for distributed "
"port %s referred by chassisredirect port %s",
distributed_port,
binding->logical_port);
} else if (binding->datapath !=
distributed_binding->datapath) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
VLOG_WARN_RL(&rl,
"chassisredirect port %s refers to "
"distributed port %s in wrong datapath",
binding->logical_port,
distributed_port);
} else {
put_load(distributed_binding->tunnel_key,
MFF_LOG_OUTPORT, 0, 32, ofpacts_p);
struct zone_ids zone_ids = get_zone_ids(distributed_binding,
ct_zones);
if (zone_ids.ct) {
put_load(zone_ids.ct, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p);
}
if (zone_ids.dnat) {
put_load(zone_ids.dnat, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p);
}
if (zone_ids.snat) {
put_load(zone_ids.snat, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p);
}
put_resubmit(OFTABLE_CHECK_LOOPBACK, ofpacts_p);
}
ofctrl_add_flow(flow_table, OFTABLE_LOCAL_OUTPUT, 100, 0,
&match, ofpacts_p);
return;
}
// Look up the ofport for the logical port.
// If the port is a local VIF, ofport is the VIF's ofport and tun stays NULL.
// If the port lives on a remote chassis, ofport is the tunnel port and tun
// carries the peer chassis's tunnel info (its id is the peer's tun_key).
int tag = 0;
bool nested_container = false;
ofp_port_t ofport;
bool is_remote = false;
if (binding->parent_port && *binding->parent_port) {
ofport = u16_to_ofp(simap_get(&localvif_to_ofport,
binding->parent_port));
if (ofport) {
tag = *binding->tag;
nested_container = true;
}
} else {
ofport = u16_to_ofp(simap_get(&localvif_to_ofport,
binding->logical_port));
if ((!strcmp(binding->type, "localnet")
|| !strcmp(binding->type, "l2gateway"))
&& ofport && binding->tag) {
tag = *binding->tag;
}
}
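// A non-empty parent_port means this logical port is a container nested in a
// VM: its traffic shares the parent's ofport and is distinguished by the
// VLAN tag in binding->tag, hence nested_container above.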
const struct chassis_tunnel *tun = NULL;
const struct sbrec_port_binding *localnet_port =
get_localnet_port(local_datapaths, dp_key);
if (!ofport) {
is_remote = true;
if (!binding->chassis) {
return;
}
if (localnet_port) {
ofport = u16_to_ofp(simap_get(&localvif_to_ofport,
localnet_port->logical_port));
if (!ofport) {
return;
}
} else {
tun = chassis_tunnel_find(binding->chassis->name);
if (!tun) {
return;
}
ofport = tun->ofport;
}
}
if (!is_remote) {
struct zone_ids zone_ids = get_zone_ids(binding, ct_zones);
put_local_common_flows(dp_key, port_key, nested_container, &zone_ids,
ofpacts_p, flow_table);
// Table 0, priorities 100 and 150.
// Priority 150 handles tagged traffic, e.g. containers nested inside a VM or
// a VLAN-tagged local network: match the tag and strip it.
// Priority 100 handles VM traffic and untagged local networks.
// Both set the logical inport and logical datapath, then resubmit to table 16.
ofpbuf_clear(ofpacts_p);
match_init_catchall(&match);
match_set_in_port(&match, ofport);
if (tag || !strcmp(binding->type, "localnet")
|| !strcmp(binding->type, "l2gateway")) {
match_set_dl_vlan(&match, htons(tag));
if (nested_container) {
put_load(MLF_ALLOW_LOOPBACK, MFF_LOG_FLAGS, 0, 1, ofpacts_p);
}
ofpact_put_STRIP_VLAN(ofpacts_p);
}
uint32_t ofpacts_orig_size = ofpacts_p->size;
load_logical_ingress_metadata(binding, &zone_ids, ofpacts_p);
put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, ofpacts_p);
ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG,
tag ? 150 : 100, 0, &match, ofpacts_p);
if (!tag && (!strcmp(binding->type, "localnet")
|| !strcmp(binding->type, "l2gateway"))) {
ofpbuf_pull(ofpacts_p, ofpacts_orig_size);
match_set_dl_tci_masked(&match, 0, htons(VLAN_CFI));
ofctrl_add_flow(flow_table, 0, 100, 0, &match, ofpacts_p);
}
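// For an untagged localnet/l2gateway port, the flow above matched VLAN 0;
// this extra priority-100 flow matches packets with no 802.1Q header at all,
// and the ofpbuf_pull() drops the leading STRIP_VLAN action, which such
// packets must not execute.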
// Table 65, priority 100: deliver the packet to the local VIF
match_init_catchall(&match);
ofpbuf_clear(ofpacts_p);
match_set_metadata(&match, htonll(dp_key));
match_set_reg(&match, MFF_LOG_OUTPORT - MFF_REG0, port_key);
if (tag) {
struct ofpact_vlan_vid *vlan_vid;
vlan_vid = ofpact_put_SET_VLAN_VID(ofpacts_p);
vlan_vid->vlan_vid = tag;
vlan_vid->push_vlan_if_needed = true;
}
ofpact_put_OUTPUT(ofpacts_p)->port = ofport;
if (tag) {
ofpact_put_STRIP_VLAN(ofpacts_p);
}
ofctrl_add_flow(flow_table, OFTABLE_LOG_TO_PHY, 100, 0,
&match, ofpacts_p);
} else if (!tun) {
// Table 33, priority 100: complete delivery via the localnet port. Each flow
// matches one logical output port, switches the output to the localnet port,
// and resubmits to the same table, i.e. table 33.
match_init_catchall(&match);
ofpbuf_clear(ofpacts_p);
match_set_metadata(&match, htonll(dp_key));
match_set_reg(&match, MFF_LOG_OUTPORT - MFF_REG0, port_key);
put_load(localnet_port->tunnel_key, MFF_LOG_OUTPORT, 0, 32, ofpacts_p);
put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_p);
ofctrl_add_flow(flow_table, OFTABLE_LOCAL_OUTPUT, 100, 0,
&match, ofpacts_p);
} else {
// Table 32, priority 100: send traffic to the remote chassis. Each flow
// matches one output port, encapsulates the packet, and outputs it on the tunnel.
match_init_catchall(&match);
ofpbuf_clear(ofpacts_p);
match_set_metadata(&match, htonll(dp_key));
match_set_reg(&match, MFF_LOG_OUTPORT - MFF_REG0, port_key);
put_encapsulation(mff_ovn_geneve, tun, binding->datapath,
port_key, ofpacts_p);
ofpact_put_OUTPUT(ofpacts_p)->port = ofport;
ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, 0,
&match, ofpacts_p);
}
}