注:本文是转载,但不是100%的转载,可能稍微有些出入,原文地址点击这里

核心函数


ethernet_input_init

初始化函数,主循环之前会调用。

/*
 * Init function for the ethernet input nodes.
 *
 * Sets up the three ethernet input node variants (ethernet_input_node,
 * ethernet_input_type_node, ethernet_input_not_l2_node) and initializes the
 * L3 next-by-ethertype table in ethernet_main.
 *
 * NOTE: the body is abridged in this excerpt (see the `...` below), so the
 * remaining initialization steps and the return value are not shown.
 */
static clib_error_t *ethernet_input_init (vlib_main_t * vm)
{
    // VLAN and QinQ are supported; the two table pointers below are declared
    // but marked unused in the part of the function shown here.
    ethernet_main_t *em = &ethernet_main;
    __attribute__ ((unused)) vlan_table_t *invalid_vlan_table;
    __attribute__ ((unused)) qinq_table_t *invalid_qinq_table;

    // Per the original author's note: this only assigns format_buffer /
    // unformat_buffer for each node. It also initializes packet-generator (pg)
    // support; most basic protocols ship their own pg implementation.
    ethernet_setup_node (vm, ethernet_input_node.index);
    ethernet_setup_node (vm, ethernet_input_type_node.index);
    ethernet_setup_node (vm, ethernet_input_not_l2_node.index);

    // Initialize the sparse_vec that is used to select the next node
    // according to the layer-3 protocol.
    next_by_ethertype_init (&em->l3_next);

    ...
}

ethernet_input_inline

完成了该node业务逻辑功能。

/*
 * Per-frame worker shared by the ethernet input node variants.
 *
 * For every buffer in from_frame it: parses the L2 header, looks up the
 * (sub)interface, rewrites the RX sw_if_index, batches subinterface RX
 * counters, chooses the next node, sets the buffer error and enqueues the
 * buffer to the next frame.
 *
 * vm         - vlib main.
 * node       - runtime of the node being dispatched.
 * from_frame - frame of input buffer indices.
 * variant    - which ethernet input variant is running.
 *
 * Returns from_frame->n_vectors.
 *
 * NOTE: this is an abridged excerpt. The declarations of most locals
 * (error_node, from, to_next, em, vnm, stats_* ...) and the body of the
 * dual-packet loop are not shown.
 */
static_always_inline uword ethernet_input_inline (vlib_main_t * vm,
               vlib_node_runtime_t * node,
               vlib_frame_t * from_frame,
               ethernet_input_variant_t variant)
{
    u32 cpu_index = os_get_cpu_number ();

    /*
     * The three variants (ETHERNET_INPUT_VARIANT_ETHERNET_TYPE,
     * ETHERNET_INPUT_VARIANT_NOT_L2 and ETHERNET_INPUT_VARIANT_ETHERNET)
     * all share the error information of ethernet_input_node: any variant
     * other than ETHERNET takes its errors from that node's runtime.
     * (The original blog author notes that they see no special meaning
     * behind this.)
     */
    if (variant != ETHERNET_INPUT_VARIANT_ETHERNET)
        error_node = vlib_node_get_runtime (vm, ethernet_input_node.index);
    else
        error_node = node;

    // Start of the frame's vector of per-packet entries
    // (consumed below as buffer indices).
    from = vlib_frame_vector_args (from_frame);
    // Number of packets in the frame.
    n_left_from = from_frame->n_vectors;

    // Speculatively reuse the next index cached from the previous run;
    // it can still be corrected further down.
    next_index = node->cached_next_index;
    stats_sw_if_index = node->runtime_data[0];

    while (n_left_from > 0)
    {
        // Get the frame slots (to_next / n_left_to_next) used to hand
        // buffers to the next node.
        vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

        while (n_left_from >= 4 && n_left_to_next >= 2)
        {
            // Dual loop: process two packets while prefetching two more.
            // Omitted here because the per-packet work is the same as in
            // the single-packet loop below.
            ...
        }

        while (n_left_from > 0 && n_left_to_next > 0)
        {
            // Prefetch the next packet.
            if (n_left_from > 1)
            {
                vlib_buffer_t *p2;

                p2 = vlib_get_buffer (vm, from[1]);
                vlib_prefetch_buffer_header (p2, STORE);
                CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, LOAD);
            }

            // Speculatively enqueue the buffer to the current next frame.
            bi0 = from[0];
            to_next[0] = bi0;
            from += 1;
            to_next += 1;
            n_left_from -= 1;
            n_left_to_next -= 1;

            b0 = vlib_get_buffer (vm, bi0);

            error0 = ETHERNET_ERROR_NONE;

            // Parse the layer-2 information, unwrapping stacked
            // encapsulations as well. Per the original note, this leaves
            // vlib_buffer_t->current_data pointing at the layer-3 header
            // (parse_header itself is not shown in this excerpt).
            parse_header (variant, b0, &type0,
                    &orig_type0, &outer_id0, &inner_id0, &match_flags0);

            old_sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];

            eth_vlan_table_lookups (em,
                    vnm, old_sw_if_index0, orig_type0, outer_id0, inner_id0,
                    &hi0, &main_intf0, &vlan_intf0, &qinq_intf0);

            identify_subint (hi0, b0, match_flags0, main_intf0, vlan_intf0,
                    qinq_intf0, &new_sw_if_index0, &error0, &is_l20);

            // Save RX sw_if_index for later nodes
            // (the original index is kept when an error was flagged).
            vnet_buffer (b0)->sw_if_index[VLIB_RX] =
                    error0 != ETHERNET_ERROR_NONE ?
                    old_sw_if_index0 : new_sw_if_index0;

            /*
             * Increment subinterface stats
             * Note that interface-level counters have already been incremented
             * prior to calling this function. Thus only subinterface counters
             * are incremented here.
             * Interface level counters include packets received on the main
             * interface and all subinterfaces. Subinterface level counters
             * include only those packets received on that subinterface
             * Increment stats if the subint is valid and it is not the main intf
             *
             * Original author's note: a lot of VPP code runs the predicted
             * path first and corrects afterwards, which may help the
             * instruction pipeline - worth a closer look later.
             */
            if ((new_sw_if_index0 != ~0) && (new_sw_if_index0 != old_sw_if_index0))
            {
                len0 = vlib_buffer_length_in_chain (vm, b0) + b0->current_data
                        - vnet_buffer (b0)->ethernet.start_of_ethernet_header;

                // Speculatively add this packet to the current batch.
                stats_n_packets += 1;
                stats_n_bytes += len0;

                // Batch stat increments from the same subinterface so counters
                // don't need to be incremented for every packet.
                if (PREDICT_FALSE (new_sw_if_index0 != stats_sw_if_index))
                {
                    // Different subinterface than the batch: undo the
                    // speculative add and count this packet on its own.
                    stats_n_packets -= 1;
                    stats_n_bytes -= len0;

                    // Always true here, given the enclosing condition.
                    if (new_sw_if_index0 != ~0)
                        vlib_increment_combined_counter
                                (vnm->interface_main.combined_sw_if_counters
                                + VNET_INTERFACE_COUNTER_RX,
                                cpu_index, new_sw_if_index0, 1, len0);
                    // Flush whatever was batched for the previous subinterface.
                    if (stats_n_packets > 0)
                    {
                        vlib_increment_combined_counter
                                (vnm->interface_main.combined_sw_if_counters
                                + VNET_INTERFACE_COUNTER_RX,
                                cpu_index, stats_sw_if_index,
                                stats_n_packets, stats_n_bytes);
                        stats_n_packets = stats_n_bytes = 0;
                    }
                    stats_sw_if_index = new_sw_if_index0;
                }
            }

            // The NOT_L2 variant never treats the packet as L2.
            if (variant == ETHERNET_INPUT_VARIANT_NOT_L2)
                is_l20 = 0;

            // Choose the next node; depending on configuration it can be
            // selected per protocol.
            determine_next_node (em, variant, is_l20, type0, b0, &error0, &next0);

            b0->error = error_node->errors[error0];

            // verify speculative enqueue
            // Correct the next node for this packet if next0 differs from
            // the speculated next_index.
            vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                    to_next, n_left_to_next,
                    bi0, next0);
        }

        vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

    // Increment any remaining batched stats
    // (the batch's sw_if_index is written back to runtime_data only here).
    if (stats_n_packets > 0)
    {
        vlib_increment_combined_counter
                (vnm->interface_main.combined_sw_if_counters
                + VNET_INTERFACE_COUNTER_RX,
                cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
        node->runtime_data[0] = stats_sw_if_index;
    }

    return from_frame->n_vectors;
}

以下几个函数用于hook该node,替代determine_next_node中默认的下一跳node挑选机制。
默认有 ETHERNET_TYPE_IP4、ETHERNET_TYPE_IP6、ETHERNET_TYPE_MPLS_UNICAST 三种协议,用户可以自己添加更多协议,并根据协议选择不同的下一跳。

/*
 * Bind an ethertype to the graph node given by node_index.
 *
 * vm         - vlib main, passed through to the node/arc helpers.
 * type       - ethertype being registered.
 * node_index - index of the node to add as a next node.
 *
 * The node is added as a next of ethernet_input_node,
 * ethernet_input_type_node and ethernet_input_not_l2_node, in that order.
 * The index returned for ethernet_input_node is stored in the type info and
 * registered in em->l3_next; the other two are asserted to be equal to it.
 */
void ethernet_register_input_type (vlib_main_t * vm,
                  ethernet_type_t type, u32 node_index)
{
    ethernet_main_t *em = &ethernet_main;
    ethernet_type_info_t *ti = ethernet_get_type_info (em, type);
    u32 i;

    // Remember the target node, and the arc to it from ethernet_input_node.
    ti->node_index = node_index;
    ti->next_index =
            vlib_node_add_next (vm, ethernet_input_node.index, node_index);

    // Same arc on the "type" variant; it must land on the same next index.
    i = vlib_node_add_next (vm, ethernet_input_type_node.index, node_index);
    ASSERT (i == ti->next_index);

    // Same arc on the "not L2" variant, with the same requirement.
    i = vlib_node_add_next (vm, ethernet_input_not_l2_node.index, node_index);
    ASSERT (i == ti->next_index);

    // Record the next index for this ethertype in the L3 next-node structure.
    next_by_ethertype_register (&em->l3_next, type, ti->next_index);

    // Give other nodes that want an ethertype mapping the chance to register.
    l2bvi_register_input_type (vm, type, node_index);
}

vlan包下一跳判定机制,可以还原ethernet头部,根据这里注册的值作跳转。

/*
 * Record node_index as the L2 next node (em->l2_next).
 *
 * vm         - vlib main, passed through to vlib_node_add_next.
 * node_index - index of the node to add as a next node.
 *
 * The arc is added to ethernet_input_node first and its index is stored in
 * em->l2_next. The same arc is then added to ethernet_input_type_node and
 * ethernet_input_not_l2_node, and each returned index is asserted to match.
 */
void ethernet_register_l2_input (vlib_main_t * vm, u32 node_index)
{
    ethernet_main_t *em = &ethernet_main;
    u32 i;

    em->l2_next = vlib_node_add_next (vm, ethernet_input_node.index, node_index);

    // These two arcs may never be taken, but the next indices of all three
    // input nodes have to stay aligned.
    i = vlib_node_add_next (vm, ethernet_input_type_node.index, node_index);
    ASSERT (i == em->l2_next);

    i = vlib_node_add_next (vm, ethernet_input_not_l2_node.index, node_index);
    ASSERT (i == em->l2_next);
}

调用该函数后,大多数下一跳基本就由这里注册的值决定了。

/*
 * Register a next node for L3 redirect and switch L3 redirect on.
 *
 * vm         - vlib main, passed through to vlib_node_add_next.
 * node_index - index of the node that L3 traffic is redirected to.
 *
 * Sets em->redirect_l3, stores the arc from ethernet_input_node in
 * em->redirect_l3_next, and points the cached ip4 / ip6 / mpls next indices
 * in em->l3_next at that arc. The same arc is added to
 * ethernet_input_type_node and asserted to get the same index.
 */
void ethernet_register_l3_redirect (vlib_main_t * vm, u32 node_index)
{
    ethernet_main_t *em = &ethernet_main;
    u32 i;

    em->redirect_l3 = 1;
    em->redirect_l3_next =
            vlib_node_add_next (vm, ethernet_input_node.index, node_index);

    // Point every cached L3 next at the redirect node.
    em->l3_next.input_next_mpls = em->redirect_l3_next;
    em->l3_next.input_next_ip6 = em->redirect_l3_next;
    em->l3_next.input_next_ip4 = em->redirect_l3_next;

    // This arc may never be taken, but the next indices have to stay aligned.
    i = vlib_node_add_next (vm, ethernet_input_type_node.index, node_index);
    ASSERT (i == em->redirect_l3_next);
}
最后修改:2021 年 08 月 18 日 09 : 15 PM