搞網絡不知道dpdk。。。不合适。。。
搞dpdk不知道rte_mbuf。。。不合适。。。
是以,搞搞搞。。。
上源碼!!!
//關于dpdk rte_mbuf資料結構的學習
/*
 * Marker types that can be used to refer to set points in the mbuf.
 *
 * Each one is a zero-length array (a GNU C extension, hence the
 * __extension__ prefix). A struct member of one of these types therefore
 * adds no storage of its own; it only gives a name, and the alignment of its
 * element type, to the offset at which it is declared.
 */
__extension__
typedef void *MARKER[0]; /**< generic marker for a point in a structure */
__extension__
typedef uint8_t MARKER8[0]; /**< generic marker with 1B alignment */
__extension__
typedef uint64_t MARKER64[0]; /**< marker that allows us to overwrite 8 bytes
* with a single assignment */
/**
 * The generic rte_mbuf, containing a packet mbuf.
 *
 * The zero-length marker members (cacheline0, rearm_data,
 * rx_descriptor_fields1, cacheline1) occupy no storage; they only name an
 * offset inside the structure. The comments on those markers describe the
 * members that FOLLOW them, so the member order and widths below are part of
 * the contract and must not be rearranged.
 */
struct rte_mbuf {
	MARKER cacheline0; /**< Zero-length marker: start of the first cache line. */

	void *buf_addr; /**< Virtual address of segment buffer. */
	/**
	 * Physical address of segment buffer.
	 * Force alignment to 8-bytes, so as to ensure we have the exact
	 * same mbuf cacheline0 layout for 32-bit and 64-bit. This makes
	 * working on vector drivers easier.
	 */
	RTE_STD_C11
	union {
		rte_iova_t buf_iova;
		rte_iova_t buf_physaddr; /**< deprecated */
	} __rte_aligned(sizeof(rte_iova_t));

	/*
	 * Next 8 bytes are initialised on RX descriptor rearm. MARKER64 lets
	 * them be overwritten with a single 64-bit assignment. They are the
	 * four 16-bit members that follow: data_off, refcnt, nb_segs, port.
	 */
	MARKER64 rearm_data;
	uint16_t data_off; /**< Offset of the data start relative to buf_addr. */

	/**
	 * Reference counter. Its size should at least equal to the size
	 * of port field (16 bits), to support zero-copy broadcast.
	 * It should only be accessed using the following functions:
	 * rte_mbuf_refcnt_update(), rte_mbuf_refcnt_read(), and
	 * rte_mbuf_refcnt_set(). The functionality of these functions (atomic,
	 * or non-atomic) is controlled by the CONFIG_RTE_MBUF_REFCNT_ATOMIC
	 * config option.
	 */
	RTE_STD_C11
	union {
		rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */
		uint16_t refcnt;              /**< Non-atomically accessed refcnt */
	};
	uint16_t nb_segs; /**< Number of segments. */

	/** Input port (16 bits to support more than 256 virtual ports). */
	uint16_t port;

	uint64_t ol_flags; /**< Offload features. */

	/* Remaining bytes are set on RX when pulling packet from descriptor. */
	MARKER rx_descriptor_fields1;

	/*
	 * The packet type, which is the combination of outer/inner L2, L3, L4
	 * and tunnel types. The packet_type is about data really present in the
	 * mbuf. Example: if vlan stripping is enabled, a received vlan packet
	 * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN because the
	 * vlan is stripped from the data.
	 *
	 * The same 32 bits can be read as one word (packet_type) or through
	 * the individual bit-fields of the anonymous struct.
	 */
	RTE_STD_C11
	union {
		uint32_t packet_type; /**< L2/L3/L4 and tunnel information. */
		struct {
			uint32_t l2_type:4;  /**< (Outer) L2 type. */
			uint32_t l3_type:4;  /**< (Outer) L3 type. */
			uint32_t l4_type:4;  /**< (Outer) L4 type. */
			uint32_t tun_type:4; /**< Tunnel type. */
			RTE_STD_C11
			union {
				uint8_t inner_esp_next_proto;
				/**< ESP next protocol type, valid if
				 * RTE_PTYPE_TUNNEL_ESP tunnel type is set
				 * on both Tx and Rx.
				 */
				__extension__
				struct {
					uint8_t inner_l2_type:4;
					/**< Inner L2 type. */
					uint8_t inner_l3_type:4;
					/**< Inner L3 type. */
				};
			};
			uint32_t inner_l4_type:4; /**< Inner L4 type. */
		};
	};

	uint32_t pkt_len;  /**< Total pkt len: sum of all segments. */
	uint16_t data_len; /**< Amount of data in segment buffer. */
	/** VLAN TCI (CPU order), valid if PKT_RX_VLAN_STRIPPED is set. */
	uint16_t vlan_tci;

	/* Several alternative views of the same storage; see each member. */
	union {
		uint32_t rss; /**< RSS hash result if RSS enabled */
		struct {
			RTE_STD_C11
			union {
				struct {
					uint16_t hash;
					uint16_t id;
				};
				uint32_t lo;
				/**< Second 4 flexible bytes */
			};
			uint32_t hi;
			/**< First 4 flexible bytes or FD ID, dependent on
			 * PKT_RX_FDIR_* flag in ol_flags.
			 */
		} fdir; /**< Filter identifier if FDIR enabled */
		struct {
			uint32_t lo;
			uint32_t hi;
		} sched; /**< Hierarchical scheduler */
		uint32_t usr; /**< User defined tags. See rte_distributor_process() */
	} hash; /**< hash information */

	/** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ_STRIPPED is set. */
	uint16_t vlan_tci_outer;

	uint16_t buf_len; /**< Length of segment buffer. */

	/** Valid if PKT_RX_TIMESTAMP is set. The unit and time reference
	 * are not normalized but are always the same for a given port.
	 */
	uint64_t timestamp;

	/* Second cache line - fields only used in slow path or on TX. */
	MARKER cacheline1 __rte_cache_min_aligned;

	RTE_STD_C11
	union {
		void *userdata;   /**< Can be used for external metadata */
		uint64_t udata64; /**< Allow 8-byte userdata on 32-bit */
	};

	struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */
	struct rte_mbuf *next;    /**< Next segment of scattered packet. */

	/* Fields to support TX offloads. */
	RTE_STD_C11
	union {
		uint64_t tx_offload; /**< combined for easy fetch */
		__extension__
		struct {
			uint64_t l2_len:7;
			/**< L2 (MAC) Header Length for non-tunneling pkt.
			 * Outer_L4_len + ... + Inner_L2_len for tunneling pkt.
			 */
			uint64_t l3_len:9;     /**< L3 (IP) Header Length. */
			uint64_t l4_len:8;     /**< L4 (TCP/UDP) Header Length. */
			uint64_t tso_segsz:16; /**< TCP TSO segment size */

			/* fields for TX offloading of tunnels */
			uint64_t outer_l3_len:9; /**< Outer L3 (IP) Hdr Length. */
			uint64_t outer_l2_len:7; /**< Outer L2 (MAC) Hdr Length. */

			/* The bit-fields above use 56 bits; the top 8 bits of
			 * tx_offload are unused.
			 */
		};
	};

	/** Size of the application private data. In case of an indirect
	 * mbuf, it stores the direct mbuf private data size. */
	uint16_t priv_size;

	/** Timesync flags for use with IEEE1588. */
	uint16_t timesync;

	/** Sequence number. See also rte_reorder_insert(). */
	uint32_t seqn;
};
好家夥,果然mbuf,大名鼎鼎。下面分别對每個字段進行學習解釋。
下面按照出現順序對每個字段進行解釋。
MARKER cacheline0;
typedef void *MARKER[0]; /**< generic marker for a point in a structure */
檢視typedef,發現這是一個長度為0的數組(零長數組,屬于GNU C擴充,是以前面要加__extension__;它和C99裡隻能放在結構體末尾的柔性數組`[]`并不是一回事)。長度為0,是以這裡在編譯時是不占用記憶體滴。隻是一個标記喽。MARKER嘛。
void *buf_addr; /**< Virtual address of segment buffer. */
有圖就容易解釋了,一些指針、成員或函數結果的内容在下表中列出,mbuf指針簡寫為m
m | 首部,即mbuf結構體 |
m->buf_addr | headroom起始位址 |
m->data_off | data起始位址相對于buf_addr的偏移 |
m->buf_len | mbuf和priv之後記憶體的長度,包含headroom |
m->pkt_len | 整個mbuf鍊的data總長度 |
m->data_len | 實際data的長度 |
m->buf_addr+m->data_off | 實際data的起始位址 |
rte_pktmbuf_mtod(m) | 同上 |
rte_pktmbuf_data_len(m) | 同m->data_len |
rte_pktmbuf_pkt_len | 同m->pkt_len |
rte_pktmbuf_data_room_size | 同m->buf_len |
rte_pktmbuf_headroom | headroom長度 |
rte_pktmbuf_tailroom | 尾部剩餘空間長度 |
綜合圖檔解釋以及上述表格的備注。這裡buf_addr指向的是rte_mbuf結構體及其後面的priv(應用私有資料區)之後那塊緩沖區的開頭,也就是headroom的起始位址。
/**
* Physical address of segment buffer.
* Force alignment to 8-bytes, so as to ensure we have the exact
* same mbuf cacheline0 layout for 32-bit and 64-bit. This makes
* working on vector drivers easier.
*/
RTE_STD_C11
union {
rte_iova_t buf_iova;
rte_iova_t buf_physaddr; /**< deprecated */
} __rte_aligned(sizeof(rte_iova_t));
段緩沖區的實體位址。 強制8位元組對齊,保證在32位和64位有相同的cacheline0。這塊暫時無需關注。
/* next 8 bytes are initialised on RX descriptor rearm */
MARKER64 rearm_data;
接下來的 8 個位元組在 RX 描述符重裝時初始化 。
uint16_t data_off;
data起始位址相對于buf_addr的偏移。要擷取data的位置,m->buf_addr + m->data_off ,就是對應的data的實際指針。一般中間間隔是一個headroom的大小。
/**
* Reference counter. Its size should at least equal to the size
* of port field (16 bits), to support zero-copy broadcast.
* It should only be accessed using the following functions:
* rte_mbuf_refcnt_update(), rte_mbuf_refcnt_read(), and
* rte_mbuf_refcnt_set(). The functionality of these functions (atomic,
* or non-atomic) is controlled by the CONFIG_RTE_MBUF_REFCNT_ATOMIC
* config option.
*/
RTE_STD_C11
union {
rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */
uint16_t refcnt; /**< Non-atomically accessed refcnt */
};
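引用計數。這裡用union實作了原子通路和非原子通路2種,具體使用哪一種由CONFIG_RTE_MBUF_REFCNT_ATOMIC配置選項決定,并且隻應該通過rte_mbuf_refcnt_update()、rte_mbuf_refcnt_read()、rte_mbuf_refcnt_set()這幾個函數來通路。計數的寬度至少要等于port字段的寬度(16bits),用來支援零拷貝廣播:同一個mbuf不做拷貝而被多個端口同時引用時,每多一個引用,計數就要加1,是以計數器必須能表示得下和端口數一樣多的引用。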
uint16_t nb_segs; /**< Number of segments. */
segment(段)數,即組成這個資料包的mbuf鍊中mbuf的個數。注意這裡說的不是IP分片的個數。
/** Input port (16 bits to support more than 256 virtual ports). */
uint16_t port;
入接口id号。
uint64_t ol_flags; /**< Offload features. */
offload特性标記。
offload特性,主要是指将原本在協定棧中進行的IP分片、TCP分段、重組、checksum校驗等操作,轉移到網卡硬體中進行,降低系統CPU的消耗,提高處理性能。
/* remaining bytes are set on RX when pulling packet from descriptor */
MARKER rx_descriptor_fields1;
其餘的位元組是在收包(RX)時、從描述符中取出資料包的過程中設定的。這裡同樣隻是标記使用,MARKER。。。
/*
* The packet type, which is the combination of outer/inner L2, L3, L4
* and tunnel types. The packet_type is about data really present in the
* mbuf. Example: if vlan stripping is enabled, a received vlan packet
* would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN because the
* vlan is stripped from the data.
*/
/* 資料包類型,它是外部/内部 L2、L3、L4 和隧道類型的組合。
* packet_type 是關于 mbuf 中真正存在的資料。
* 如果啟用了 vlan 剝離,則接收到的 vlan 資料包将具有 RTE_PTYPE_L2_ETHER
* 而不是 RTE_PTYPE_L2_VLAN,因為 vlan 已從資料中剝離。
*/
RTE_STD_C11
union {
uint32_t packet_type; /**< L2/L3/L4 and tunnel information. */
struct {
uint32_t l2_type:4; /**< (Outer) L2 type. */
uint32_t l3_type:4; /**< (Outer) L3 type. */
uint32_t l4_type:4; /**< (Outer) L4 type. */
uint32_t tun_type:4; /**< Tunnel type. */
RTE_STD_C11
union {
uint8_t inner_esp_next_proto;
/**< ESP next protocol type, valid if
* RTE_PTYPE_TUNNEL_ESP tunnel type is set
* on both Tx and Rx.
*/
__extension__
struct {
uint8_t inner_l2_type:4;
/**< Inner L2 type. */
uint8_t inner_l3_type:4;
/**< Inner L3 type. */
};
};
uint32_t inner_l4_type:4; /**< Inner L4 type. */
};
};
此資料結構比較清晰,無需多餘解釋。有一個疑問,這裡的inner && outer具體是什麼呢?答:它們針對的是隧道封包(例如VXLAN、GRE):outer指外層封裝的L2/L3/L4頭,inner指被封裝在隧道裡面的原始封包的L2/L3/L4頭;非隧道封包隻會用到outer這一組字段。
uint32_t pkt_len; /**< Total pkt len: sum of all segments. */
uint16_t data_len; /**< Amount of data in segment buffer. */
pkt_len,包括所有分片的長度。
data_len,目前的資料長度。如果沒有分片,pkt_len與data_len數值應該是相同的。也就是pkt_len >= data_len.
/** VLAN TCI (CPU order), valid if PKT_RX_VLAN_STRIPPED is set. */
uint16_t vlan_tci;
隻有開啟了PKT_RX_VLAN_STRIPPED标記,此字段才是有效的。vlan時使用,學習vlan時,需要關注此字段。
union {
uint32_t rss; /**< RSS hash result if RSS enabled */
struct {
RTE_STD_C11
union {
struct {
uint16_t hash;
uint16_t id;
};
uint32_t lo;
/**< Second 4 flexible bytes */
};
uint32_t hi;
/**< First 4 flexible bytes or FD ID, dependent on
PKT_RX_FDIR_* flag in ol_flags. */
} fdir; /**< Filter identifier if FDIR enabled */
struct {
uint32_t lo;
uint32_t hi;
} sched; /**< Hierarchical scheduler */
uint32_t usr; /**< User defined tags. See rte_distributor_process() */
} hash; /**< hash information */
哈希資料。這裡是一個union。當RSS開啟時,對應rss字段是哈希結果。學習RSS時,關注一下。
/** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ_STRIPPED is set. */
uint16_t vlan_tci_outer;
隻有開啟了QINQ剝離時,此字段有效。外部vlan相關。
uint16_t buf_len; /**< Length of segment buffer. */
mbuf和priv之後記憶體的長度,包含headroom。
/** Valid if PKT_RX_TIMESTAMP is set. The unit and time reference
* are not normalized but are always the same for a given port.
*/
uint64_t timestamp;
時間戳。PKT_RX_TIMESTAMP開啟時,此字段有效。時間的單位和參考基準沒有統一規範,但對于給定端口始終相同。
/* second cache line - fields only used in slow path or on TX */
MARKER cacheline1 __rte_cache_min_aligned;
第二個cacheline,這部分内容僅用在慢速路徑(slow path)或者發包(TX)流程中。
RTE_STD_C11
union {
void *userdata; /**< Can be used for external metadata */
uint64_t udata64; /**< Allow 8-byte userdata on 32-bit */
};
//#define RTE_STD_C11 __extension__
__extension__是GCC的關鍵字(不是結構體的字段),用于消除使用GNU擴充文法(例如這裡的匿名union)時産生的編譯告警。
這裡是一個union,
userdata指針可以用來存放外部的中繼資料(metadata)。
udata64,可以存放8位元組的使用者資料;即使在指針隻有4位元組的32位系統上,也保證有8位元組可用。
struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */
辨別本mbuf是從哪個rte_mempool池子中申請到的。也就是該mbuf是哪個rte_mempool池子的。
struct rte_mbuf *next; /**< Next segment of scattered packet. */
當一個資料包由多個segment組成(scattered packet)時,next指向下一個segment對應的mbuf。
/* fields to support TX offloads */
/* 用于支援發包硬體解除安裝的字段 */
RTE_STD_C11
union {
uint64_t tx_offload; /**< combined for easy fetch */
/* tx_offload 組合起來,友善取用 */
__extension__
struct {
uint64_t l2_len:7;
/**< L2 (MAC) Header Length for non-tunneling pkt.
* Outer_L4_len + ... + Inner_L2_len for tunneling pkt.
*/
uint64_t l3_len:9; /**< L3 (IP) Header Length. */
uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */
uint64_t tso_segsz:16; /**< TCP TSO segment size */
/* TSO(TCP Segment Offload)是一種利用網卡的少量處理能力,
降低CPU發送資料包負載的技術,需要網卡硬體及驅動的支援。 */
/* fields for TX offloading of tunnels */
uint64_t outer_l3_len:9; /**< Outer L3 (IP) Hdr Length. */
uint64_t outer_l2_len:7; /**< Outer L2 (MAC) Hdr Length. */
/* uint64_t unused:8; */
};
};
支援硬體發包解除安裝的字段内容。内部為一個union。其中tx_offload字段是為了容易擷取搞出來的。
/** Size of the application private data. In case of an indirect
* mbuf, it stores the direct mbuf private data size. */
uint16_t priv_size;
應用程式私有資料的大小。
在indirect mbuf 的情況下,它存儲direct mbuf 私有資料大小。 關于direct mbuf與indirect mbuf的差別,參考連結
10. Mbuf Library — Data Plane Development Kit 21.08.0-rc1 documentation (dpdk.org)
/** Timesync flags for use with IEEE1588. */
/* IEEE1588 協定,又稱 PTP( precise time protocol,精确時間協定),
* 可以達到亞微秒級别時間同步精度,于 2002 年釋出 version 1,
* 2008 年釋出 version 2。 */
uint16_t timesync;
時間同步。參考IEEE1588。
IEEE 1588_百度百科 (baidu.com)
/** Sequence number. See also rte_reorder_insert(). */
uint32_t seqn;
序列号。這個是哪裡用到呢?從注釋裡提到的rte_reorder_insert()來看,它是給reorder(重排序)庫使用的:根據序列号把亂序的資料包重新排回原來的順序。