注:本文是转载,但不是100%的转载,可能稍微有些出入,原文地址点击这里
核心函数
ethernet_input_init
初始化函数,主循环之前会调用。
/*
 * Initialization routine for the ethernet input nodes. According to the
 * surrounding article it is invoked before the main loop starts.
 * The tail of the function is elided ("...") in the article.
 */
static clib_error_t *ethernet_input_init (vlib_main_t * vm)
{
  // VLAN and QinQ are supported
  ethernet_main_t *em = &ethernet_main;
  __attribute__ ((unused)) vlan_table_t *invalid_vlan_table;
  __attribute__ ((unused)) qinq_table_t *invalid_qinq_table;
  // Per the article: this only assigns format_buffer / unformat_buffer.
  // Note that it also initializes the packet generator (pg); most basic
  // protocols ship their own pg implementation.
  ethernet_setup_node (vm, ethernet_input_node.index);
  ethernet_setup_node (vm, ethernet_input_type_node.index);
  ethernet_setup_node (vm, ethernet_input_not_l2_node.index);
  // Initialize the sparse_vec that is used to choose the next node
  // according to the layer-3 protocol.
  next_by_ethertype_init (&em->l3_next);
  ...
}
ethernet_input_inline
完成了该node业务逻辑功能。
/*
 * Shared body of the ethernet input node, parameterized by `variant`.
 *
 * For every buffer index in `from_frame` it parses the layer-2 header,
 * looks up the (sub)interface, rewrites the RX sw_if_index, batches
 * per-subinterface RX counters, picks the next node and enqueues the buffer.
 * Returns from_frame->n_vectors.
 *
 * NOTE(review): this is an abridged excerpt -- the local variable
 * declarations and the dual-packet loop body are elided in the article.
 */
static_always_inline uword ethernet_input_inline (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * from_frame,
ethernet_input_variant_t variant)
{
u32 cpu_index = os_get_cpu_number ();
/*
 * The ETHERNET_INPUT_VARIANT_ETHERNET_TYPE,
 * ETHERNET_INPUT_VARIANT_NOT_L2 and
 * ETHERNET_INPUT_VARIANT_ETHERNET variants all
 * share the error information of ethernet_input_node.
 * The article's author did not see any special meaning in this.
 */
if (variant != ETHERNET_INPUT_VARIANT_ETHERNET)
error_node = vlib_node_get_runtime (vm, ethernet_input_node.index);
else
error_node = node;
// Start address of the packet (buffer index) vector stored at the tail of the frame
from = vlib_frame_vector_args (from_frame);
// Number of packets in the frame
n_left_from = from_frame->n_vectors;
// Speculatively reuse the next index of the previous packets; it can be corrected later
next_index = node->cached_next_index;
stats_sw_if_index = node->runtime_data[0];
while (n_left_from > 0)
{
// Get the frame that will carry buffers to the next node
vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
while (n_left_from >= 4 && n_left_to_next >= 2)
{
// Process two packets while prefetching two more; elided here because
// the single-packet path below works the same way
...
}
while (n_left_from > 0 && n_left_to_next > 0)
{
// Prefetch the next packet
if (n_left_from > 1)
{
vlib_buffer_t *p2;
p2 = vlib_get_buffer (vm, from[1]);
vlib_prefetch_buffer_header (p2, STORE);
CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, LOAD);
}
// Speculatively enqueue the buffer to the current next frame
bi0 = from[0];
to_next[0] = bi0;
from += 1;
to_next += 1;
n_left_from -= 1;
n_left_to_next -= 1;
b0 = vlib_get_buffer (vm, bi0);
error0 = ETHERNET_ERROR_NONE;
// Parse the layer-2 information, also stripping nested encapsulations,
// so that vlib_buffer_t->current_data ends up pointing at the layer-3 header.
parse_header (variant, b0, &type0,
&orig_type0, &outer_id0, &inner_id0, &match_flags0);
old_sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
eth_vlan_table_lookups (em,
vnm, old_sw_if_index0, orig_type0, outer_id0, inner_id0,
&hi0, &main_intf0, &vlan_intf0, &qinq_intf0);
identify_subint (hi0, b0, match_flags0, main_intf0, vlan_intf0,
qinq_intf0, &new_sw_if_index0, &error0, &is_l20);
// Save RX sw_if_index for later nodes
vnet_buffer (b0)->sw_if_index[VLIB_RX] =
error0 != ETHERNET_ERROR_NONE ?
old_sw_if_index0 : new_sw_if_index0;
/*
* Increment subinterface stats
* Note that interface-level counters have already been incremented
* prior to calling this function. Thus only subinterface counters
* are incremented here.
* Interface level counters include packets received on the main
* interface and all subinterfaces. Subinterface level counters
* include only those packets received on that subinterface
* Increment stats if the subint is valid and it is not the main intf
* Article author's note: a lot of VPP code first runs the predicted
* path and corrects it afterwards; this may help the CPU pipeline
* (to be studied further).
*/
if ((new_sw_if_index0 != ~0) && (new_sw_if_index0 != old_sw_if_index0))
{
len0 = vlib_buffer_length_in_chain (vm, b0) + b0->current_data
- vnet_buffer (b0)->ethernet.start_of_ethernet_header;
stats_n_packets += 1;
stats_n_bytes += len0;
// Batch stat increments from the same subinterface so counters
// don't need to be incremented for every packet.
if (PREDICT_FALSE (new_sw_if_index0 != stats_sw_if_index))
{
// Undo the speculative batch update; this packet is counted directly below
stats_n_packets -= 1;
stats_n_bytes -= len0;
if (new_sw_if_index0 != ~0)
vlib_increment_combined_counter
(vnm->interface_main.combined_sw_if_counters
+ VNET_INTERFACE_COUNTER_RX,
cpu_index, new_sw_if_index0, 1, len0);
// Flush whatever was batched for the previous subinterface
if (stats_n_packets > 0)
{
vlib_increment_combined_counter
(vnm->interface_main.combined_sw_if_counters
+ VNET_INTERFACE_COUNTER_RX,
cpu_index, stats_sw_if_index,
stats_n_packets, stats_n_bytes);
stats_n_packets = stats_n_bytes = 0;
}
stats_sw_if_index = new_sw_if_index0;
}
}
if (variant == ETHERNET_INPUT_VARIANT_NOT_L2)
is_l20 = 0;
// Decide the next node; depending on configuration it can be chosen per protocol
determine_next_node (em, variant, is_l20, type0, b0, &error0, &next0);
b0->error = error_node->errors[error0];
// verify speculative enqueue
// Correct the next node of this packet if the speculation was wrong
vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
to_next, n_left_to_next,
bi0, next0);
}
vlib_put_next_frame (vm, node, next_index, n_left_to_next);
}
// Increment any remaining batched stats
if (stats_n_packets > 0)
{
vlib_increment_combined_counter
(vnm->interface_main.combined_sw_if_counters
+ VNET_INTERFACE_COUNTER_RX,
cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
// Remember the batched subinterface in the node's runtime data
node->runtime_data[0] = stats_sw_if_index;
}
return from_frame->n_vectors;
}
以下几个函数用于hook该node,替代determine_next_node中默认的下一跳node挑选机制。
默认有ETHERNET_TYPE_IP4、ETHERNET_TYPE_IP6、ETHERNET_TYPE_MPLS_UNICAST三种协议,用户可以自己添加更多协议,并根据协议选择不同的下一跳。
/*
 * Register `node_index` as the next node for packets of ethertype `type`.
 *
 * The arc is added to all three ethernet input nodes, and the ASSERTs check
 * that each of them hands back the same next index, so a single
 * ti->next_index is valid for every variant.
 */
void ethernet_register_input_type (vlib_main_t * vm,
                                   ethernet_type_t type, u32 node_index)
{
  ethernet_main_t *em = &ethernet_main;
  ethernet_type_info_t *ti;
  u32 i;

  ti = ethernet_get_type_info (em, type);
  ti->node_index = node_index;
  ti->next_index = vlib_node_add_next (vm,
                                       ethernet_input_node.index, node_index);

  // The other two input nodes must yield the same next index
  i = vlib_node_add_next (vm, ethernet_input_type_node.index, node_index);
  ASSERT (i == ti->next_index);
  i = vlib_node_add_next (vm, ethernet_input_not_l2_node.index, node_index);
  ASSERT (i == ti->next_index);

  // Add the L3 node for this ethertype to the next nodes structure
  next_by_ethertype_register (&em->l3_next, type, ti->next_index);

  // Call the registration functions for other nodes that want a mapping
  l2bvi_register_input_type (vm, type, node_index);
}
vlan包下一跳判定机制,可以还原ethernet头部,根据这里注册的值作跳转。
/*
 * Register `node_index` as the L2 next node and store the resulting next
 * index in em->l2_next.
 *
 * The same arc is added to the other two ethernet input nodes, and the
 * ASSERTs check that they return the same next index.
 */
void ethernet_register_l2_input (vlib_main_t * vm, u32 node_index)
{
  ethernet_main_t *em = &ethernet_main;
  u32 i;

  em->l2_next =
    vlib_node_add_next (vm, ethernet_input_node.index, node_index);

  // Even if we never use these arcs, we have to align the next indices...
  i = vlib_node_add_next (vm, ethernet_input_type_node.index, node_index);
  ASSERT (i == em->l2_next);
  i = vlib_node_add_next (vm, ethernet_input_not_l2_node.index, node_index);
  ASSERT (i == em->l2_next);
}
调用该函数后,大多数下一跳基本就由这里注册的值决定了。
/*
 * Register a next node for L3 redirect, and enable L3 redirect.
 *
 * Sets em->redirect_l3, stores the new next index in em->redirect_l3_next
 * and overwrites the cached ip4 / ip6 / mpls next indices with it.
 *
 * The arc is added to all three ethernet input nodes, matching
 * ethernet_register_input_type() and ethernet_register_l2_input(). Without
 * the not-l2 arc that node would end up one arc behind the other two, and
 * the alignment ASSERTs in later registrations would no longer hold.
 */
void ethernet_register_l3_redirect (vlib_main_t * vm, u32 node_index)
{
  ethernet_main_t *em = &ethernet_main;
  u32 i;

  em->redirect_l3 = 1;
  em->redirect_l3_next = vlib_node_add_next (vm,
                                             ethernet_input_node.index,
                                             node_index);

  // Change the cached next nodes to the redirect node
  em->l3_next.input_next_ip4 = em->redirect_l3_next;
  em->l3_next.input_next_ip6 = em->redirect_l3_next;
  em->l3_next.input_next_mpls = em->redirect_l3_next;

  // Even if we never use these arcs, we have to align the next indices...
  i = vlib_node_add_next (vm, ethernet_input_type_node.index, node_index);
  ASSERT (i == em->redirect_l3_next);
  i = vlib_node_add_next (vm, ethernet_input_not_l2_node.index, node_index);
  ASSERT (i == em->redirect_l3_next);
}