linux網卡驅動源碼分析

轉自http://blog.csdn.net/ustc_dylan/article/details/6329375

網絡驅動是一種典型的pci裝置驅動，無論在嵌入式平台還是在pc領域，網絡相關的項目開發都有著比較廣闊的前景。因此，分析目前linux核心中網絡裝置的驅動，不但能了解網絡相關的基本原理，而且可以借鑒linux核心的先進技術，將其應用到嵌入式或其他領域。本文以linux核心中的rtl8139網絡驅動為例，對網絡驅動的源碼進行了簡單分析，並對其中涉及的相關概念和技術進行了簡單的介紹。

一、pci裝置驅動模型

rtl8139是典型的pci裝置，linux核心的pci核心驅動為pci驅動開發者提供了方便的系統接口，極大地方便了pci裝置驅動的開發。

1 pci裝置驅動相關的資料結構

pci驅動描述結構體

/*
 * struct pci_driver - describes a PCI driver: its name, the set of PCI
 * devices it supports, and the callbacks it provides to the PCI core.
 */
struct pci_driver {

struct list_head node; /* links this driver into the list of PCI drivers */

char *name; /* name of the PCI driver */

const struct pci_device_id *id_table; /* PCI devices supported by this driver; must be non-null for probe to be called */

int (*probe) (struct pci_dev *dev, const struct

pci_device_id *id); /* new device inserted */

void (*remove) (struct pci_dev *dev); /* device removed (null if not a hot-plug capable driver) */

int (*suspend) (struct pci_dev *dev, pm_message_t

state); /* device suspended */

int (*suspend_late) (struct pci_dev *dev, pm_message_t

state);

int (*resume_early) (struct pci_dev *dev);

int (*resume) (struct

pci_dev *dev); /* device woken up */

void (*shutdown) (struct pci_dev *dev);

struct pci_error_handlers *err_handler;

struct device_driver driver; /* embedded generic driver object; this is what pci_register_driver hands to driver_register(&drv->driver) */

struct pci_dynids dynids;

};

pci裝置描述結構體

struct pci_dev{

struct list_head bus_list; /* node in per-bus list */

struct pci_bus *bus; /* bus this device is on */

struct pci_bus *subordinate; /* bus this device bridges to */

void *sysdata; /* hook for sys-specific

extension */

struct proc_dir_entry *procent; /* device entry in /proc/bus/pci */

struct pci_slot *slot; /* physical slot this device is in */

unsigned int devfn; /* encoded device & function index */

unsigned short vendor;

unsigned short device;

unsigned short subsystem_vendor;

unsigned short subsystem_device;

unsigned int class; /* 3 bytes: (base,sub,prog-if) */

u8 revision; /* pci revision, low byte of class word */

u8 hdr_type; /* pci header type (`multi' flag

masked out) */

u8 pcie_cap; /* pci-e capability offset */

u8 pcie_type; /* pci-e device/port type */

u8 rom_base_reg; /* which config register controls the rom */

u8 pin; /* which interrupt pin this device uses */

struct pci_driver *driver; /* which driver has allocated this device */

u64 dma_mask; /* mask of the bits of bus address this

device implements. normally this is

0xffffffff. you only need to change

this if your device has broken dma

or supports 64-bit transfers. */

struct device_dma_parameters dma_parms;

pci_power_t current_state; /* current operating state. in acpi-speak,

this is d0-d3, d0 being fully functional,

and d3 being off. */

int pm_cap; /* pm capability offset in the

configuration space */

unsigned int pme_support:5; /* bitmask of states

from which pme#

can be generated */

unsigned int pme_interrupt:1;

unsigned int d1_support:1; /* low power state d1 is supported */

unsigned int d2_support:1; /* low power state d2 is supported */

unsigned int no_d1d2:1; /* only allow d0 and d3 */

unsigned int wakeup_prepared:1;

unsigned int d3_delay; /* d3->d0

transition time in ms */

#ifdef config_pcieaspm

struct pcie_link_state *link_state; /* aspm link state. */

#endif

pci_channel_state_t error_state; /* current connectivity state */

struct device dev; /* generic device interface */

int cfg_size; /* size of configuration space */

* instead of touching interrupt line and base address registers

* directly, use the values stored here. they might be

unsigned int irq;

struct resource resource[device_count_resource]; /* i/o and memory

regions + expansion roms */

resource_size_t fw_addr[device_count_resource]; /* fw-assigned

addr */

/* these fields are used by common fixups */

unsigned int transparent:1; /* transparent pci bridge */

unsigned int multifunction:1;/* part of multi-function device */

/* keep track of device state */

unsigned int is_added:1;

unsigned int is_busmaster:1; /* device is busmaster */

unsigned int no_msi:1; /* device may not use

msi */

unsigned int block_ucfg_access:1; /* userspace config space access is blocked */

unsigned int broken_parity_status:1; /* device generates false positive

parity */

unsigned int irq_reroute_variant:2; /* device needs

irq rerouting variant */

unsigned int msi_enabled:1;

unsigned int msix_enabled:1;

unsigned int ari_enabled:1; /* ari forwarding */

unsigned int is_managed:1;

unsigned int is_pcie:1; /* obsolete. will

be removed.

use pci_is_pcie() instead */

unsigned int needs_freset:1; /* dev requires fundamental

reset */

unsigned int state_saved:1;

unsigned int is_physfn:1;

unsigned int is_virtfn:1;

unsigned int reset_fn:1;

unsigned int is_hotplug_bridge:1;

unsigned int __aer_firmware_first_valid:1;

unsigned int __aer_firmware_first:1;

pci_dev_flags_t dev_flags;

atomic_t enable_cnt; /* pci_enable_device has been called */

u32 saved_config_space[16]; /* config space saved

at suspend time */

struct hlist_head saved_cap_space;

struct bin_attribute *rom_attr; /* attribute descriptor for sysfs

rom entry */

int rom_attr_enabled; /* has display of the rom attribute been enabled? */

struct bin_attribute *res_attr[device_count_resource]; /* sysfs

file for resources */

struct bin_attribute *res_attr_wc[device_count_resource]; /* sysfs

file for wc mapping of resources */

#ifdef config_pci_msi

struct list_head msi_list;

struct pci_vpd *vpd;

#ifdef config_pci_iov

union {

struct pci_sriov *sriov; /* sr-iov capability

related */

struct pci_dev *physfn; /* the pf this vf is associated

with */

};

struct pci_ats *ats; /* address translation service */

}

驅動開發者要想為某個pci裝置開發驅動，就必須定義一個與該pci裝置相對應的pci_driver資料結構，用來描述將要開發的pci驅動的相關資訊，比如驅動的名稱、該驅動可以支援哪些裝置，以及該驅動支援的一些操作等。類似地，還需要有個結構體來表示pci裝置，描述pci裝置的硬體資訊，如廠商id、裝置id以及各種資源等，詳見注釋。

二、pci核心驅動api

linux核心的pci驅動為pci裝置驅動的開發提供了方便的結構，下面列舉幾個常用的接口：

pci_register_driver(struct pci_driver *drv)

功能：注冊pci驅動，參數為要注冊的pci驅動的結構體。下面來詳細的分析以下這個函數，如此，才能更清楚的了解驅動和裝置的比對過程。 pci_register_driver->driver_register(&drv->driver);->bus_add_driver->driver_attach->bus_for_each_dev(drv->bus, null, drv, __driver_attach); 在這個過程中有涉及到一個更為抽象的結構體struct device_driver，它是pci_driver的更進階的抽象，即下層是pci_driver，其上是device_driver，這符合通常的程式設計邏輯，越往上層抽象級别越高，因為在作業系統看來，它并不需要知道具體是什麼裝置，所有的裝置對作業系統來說都是相同的，即都用struct device_driver來表示。在driver_register中先調用driver_find(drv->name, drv->bus)，首先在相應的總線上查找drv->name的驅動是否已經被注冊過，如果被注冊過則傳回，否則進行注冊過程，即調用bus_add_driver(drv)。 int bus_add_driver(struct device_driver *drv)函數首先判斷目前總線是否支援自動探測，如果執行則執行探測函數driver_attach(drv)。 if (drv->bus->p->drivers_autoprobe) { error = driver_attach(drv); if (error) goto out_unregister; }int driver_attach(struct device_driver *drv) { return bus_for_each_dev(drv->bus, null, drv, __driver_attach); 這個函數對pci總線的上所有已經連接配接的pci裝置與目前的pci驅動程序一次比對的過程，即對每一個pci裝置都調用比對函數__driver_attach。 static int __driver_attach(struct device *dev, void *data) struct device_driver *drv = data; * lock device and try to bind to it. we drop the error * here and always return 0, because we need to keep trying * to bind to devices and some drivers will return an error * simply if it didn't support the device. * * driver_probe_device() will spit a warning if there * is an error. 
if (!driver_match_device(drv, dev)) return 0; if (dev->parent) /* needed for usb */ device_lock(dev->parent); device_lock(dev); if (!dev->driver) driver_probe_device(drv, dev); device_unlock(dev); if (dev->parent) device_unlock(dev->parent); return 0; 該函數首先判斷總線提供的match函數是否為空，如果非空則執行總線提供的match函數，在rtl8139網絡驅動中，match非空，參見代碼： drv->driver.bus = &pci_bus_type;struct bus_type pci_bus_type = { .name = "pci", .match = pci_bus_match, .uevent = pci_uevent, .probe = pci_device_probe, .remove = pci_device_remove, .shutdown = pci_device_shutdown, .dev_attrs = pci_dev_attrs, .bus_attrs = pci_bus_attrs, .pm = pci_pm_ops_ptr, 這裡的match函數即pci_bus_match，最後該函數調用到 static inline const struct pci_device_id * pci_match_one_device(const struct pci_device_id *id, const struct pci_dev *dev) if ((id->vendor == pci_any_id || id->vendor == dev->vendor) && (id->device == pci_any_id || id->device == dev->device) && (id->subvendor == pci_any_id || id->subvendor == dev->subsystem_vendor) && (id->subdevice == pci_any_id || id->subdevice == dev->subsystem_device) && !((id->class ^ dev->class) & id->class_mask)) return id; return null; 在這裡進行了pci_driver和pci_dev的比對，如果比對成功，則傳回pci_device_id。回到__driver_attach：如果比對不成功，函數直接傳回0；如果比對成功，而且當前裝置還沒有綁定驅動（!dev->driver），則調用driver_probe_device(drv,dev)。 int driver_probe_device(struct device_driver *drv, struct device *dev) int ret = 0; if (!device_is_registered(dev)) return -enodev; pr_debug("bus: '%s': %s: matched device %s with driver %s\n", drv->bus->name, __func__, dev_name(dev), drv->name); pm_runtime_get_noresume(dev); pm_runtime_barrier(dev); ret = really_probe(dev, drv); pm_runtime_put_sync(dev); return ret; 執行到這裡，說明pci總線沒有提供match函數或者總線提供的match函數傳回非空。還需要進行更深層次的探測，至少在總線提供的match函數中僅僅是進行了比對，並沒有將驅動和裝置關聯起來，這些操作就是在下面的函數中實作的。重點看really_probe函數： static int really_probe(struct device *dev, struct device_driver *drv) atomic_inc(&probe_count); pr_debug("bus: '%s': %s: probing driver %s with device %s\n", drv->bus->name, __func__, drv->name, dev_name(dev)); warn_on(!list_empty(&dev->devres_head)); dev->driver = drv; if
(driver_sysfs_add(dev)) { printk(kern_err "%s: driver_sysfs_add(%s) failed/n", __func__, dev_name(dev)); goto probe_failed; } if (dev->bus->probe) { ret = dev->bus->probe(dev); if (ret) goto probe_failed; } else if (drv->probe) { ret = drv->probe(dev); driver_bound(dev); ret = 1; pr_debug("bus: '%s': %s: bound device %s to driver %s/n", goto done; probe_failed: devres_release_all(dev); driver_sysfs_remove(dev); dev->driver = null; if (ret != -enodev && ret != -enxio) { /* driver matched but the probe failed */ printk(kern_warning "%s: probe of %s failed with error %d/n", drv->name, dev_name(dev), ret); * ignore errors returned by ->probe so that the next driver can try * its luck. ret = 0; done: atomic_dec(&probe_count); wake_up(&probe_waitqueue); 在此函數中，首先将驅動和裝置關聯起來，即紅色代碼dev->driver = drv; 指明了目前裝置的驅動。按照正常的程式設計思想，驅動和裝置關聯後是否還需要做一些其他工作才能是裝置在相應的驅動下正常工作呢，這就是probe函數實作的功能了，很明顯裝置和驅動擷取都需要做一些工作，是以這裡分别留出裝置和驅動的probe函數。其中裝置的probe即裝置所在總線的probe，這裡暫且不去分析，因為與網絡驅動關系不大，都是pci總線相關的東西，重點來看驅動的probe，在前面提到的pci_driver結構體中，對于rtl8139驅動來說，其pci_driver結構體被初始化為： static struct pci_driver rtl8139_pci_driver = { .name = drv_name, .id_table = rtl8139_pci_tbl, .probe = rtl8139_init_one, .remove = __devexit_p(rtl8139_remove_one), #ifdef config_pm .suspend = rtl8139_suspend, .resume = rtl8139_resume, #endif /* config_pm */ 這裡即調用rtl8139_init_one，經過上面的逐層分析，我們從pci核心驅動一步一步的走到了rtl8139網絡裝置的驅動，豁然開朗了，以後看網絡驅動的時候就不會感到開始的地方有點迷糊了。代碼分析重在代碼之間的過渡，如果銜接不好，很多地方都會産生疑問。上次講到如何從pci核心驅動一步一步的進入了rtl8139網絡驅動，并且調用的第一個函數是驅動的probe函數，即rtl8139_init_one，本文就從這裡入手，簡單的介紹rtl8139網絡驅動的相關原理和源碼分析。 1 rtl8139_init_one 上文講到當實作了驅動和裝置的比對後，需要裝置和驅動做一些相應的工作，如正常使用前的初始化操作等，rtl8139_init_one就實作了一些初始化操作，原則上probe函數應該盡可能的短，盡量避免執行耗時的操作。rtl8139_init_one僅僅實作了兩個結構體struct net_device和struct rtl8139_private的初始化。前一篇文章中也提到了資料結構的抽象層次的問題，在網絡子系統中，所有的網絡裝置都用net_device來表示，但是并不是所有的網絡裝置都有相同的屬性，是以，對應不同的網絡裝置增加一個private資料結構來描述，這裡就是struct rtl8139_private。 rtl8139_init_one主要函數和功能分析（1）dev = rtl8139_init_board (pdev); view plaincopy to clipboardprint? 
/* dev and priv zeroed in alloc_etherdev */ dev = alloc_etherdev (sizeof (*tp)); if (dev == null) { dev_err(&pdev->dev, "unable to alloc new net device/n"); return err_ptr(-enomem); } set_netdev_dev(dev, &pdev->dev); tp = netdev_priv(dev); tp->pci_dev = pdev; /* enable device (incl. pci pm wakeup and hotplug setup) */ rc = pci_enable_device (pdev); if (rc) goto err_out; pio_start = pci_resource_start (pdev, 0); pio_end = pci_resource_end (pdev, 0); pio_flags = pci_resource_flags (pdev, 0); pio_len = pci_resource_len (pdev, 0); mmio_start = pci_resource_start (pdev, 1); mmio_end = pci_resource_end (pdev, 1); mmio_flags = pci_resource_flags (pdev, 1); mmio_len = pci_resource_len (pdev, 1); ... ... rc = pci_request_regions (pdev, drv_name); a). dev = alloc_etherdev (sizeof (*tp)); --> 配置設定struct rtl8139_private資料結構，并進行預初始化，之是以稱之為預初始化是因為隻進行了某些固定資料成員的初始化。 b). 調用pci核心驅動的接口函數：pci_enable_device ()，pci_enable_device 也是一個核心開發出來的接口，代碼在drivers/pci/pci.c中，筆者跟蹤發現這個函數主要就是把pci配置空間的command域的0位和1 位置成了1，進而達到了開啟裝置的目的，因為rtl8139的官方datasheet中，說明了這兩位的作用就是開啟記憶體映射和i/o映射，如果不開的話，那我們以上讨論的把控制寄存器空間映射到記憶體空間的這一功能就被屏蔽了。 pci_resource_[start|end|flags|len]:在硬體加電初始化時，bios固件統一檢查了所有的pci裝置，并統一為他們配置設定了一個和其他互不沖突的位址，讓他們的驅動程式可以向這些位址映射他們的寄存器，這些位址被bios寫進了各個裝置的配置空間，因為這個活動是一個pci的标準的活動，是以自然寫到各個裝置的配置空間裡而不是他們風格各異的控制寄存器空間裡。當然隻有bios可以通路配置空間。當作業系統初始化時，他為每個pci裝置配置設定了pci_dev結構，并且把bios獲得的并寫到了配置空間中的位址讀出來寫到了pci_dev中的resource字段中。這樣以後我們在讀這些位址就不需要在通路配置空間了，直接跟pci_dev要就可以了，我們這裡的四個函數就是直接從pci_dev讀出了相關資料，代碼在include/linux/pci.h中。具體參見pci配置空間相關的介紹。 c). rc = pci_request_regions (pdev, drv_name);通知核心該裝置對應的io端口和記憶體資源已經使用，其他的pci裝置不要再使用這個區域 d). 獲得目前pci裝置對應的io端口和io記憶體的基址。 2. 
rtl8139_open 此函數在網絡裝置端口被打開時調用，例如執行指令ifconfig eth0 up，就會觸發這個函數，此函數是真正的rtl8139網絡裝置的初始化函數。這個函數主要做了三件事。 ① 注冊這個裝置的中斷處理函數。 retval = request_irq (dev->irq, rtl8139_interrupt, irqf_shared, dev->name, dev); 當網卡發送資料完成或者接收到資料時，是用中斷的形式來告知的，比如有資料從網線傳來，中斷也通知了我們，那麼必須要有一個處理這個中斷的函數來完成資料的接收。關于linux的中斷機制不是我們詳細講解的範疇，但是有個非常重要的資源我們必須注意，那就是中斷号的配置設定，和記憶體位址映射一樣，中斷号也是bios在初始化階段配置設定并寫入裝置的配置空間的，然後linux在建立 pci_dev時從配置空間讀出這個中斷号然後寫入pci_dev的irq成員中，是以我們注冊中斷程式需要中斷号就是直接從pci_dev裡取就可以了。 retval = request_irq (dev->irq, rtl8139_interrupt, sa_shirq, dev->name, dev); if (retval) { return retval; } 我們注冊的中斷處理函數是rtl8139_interrupt，也就是說當網卡發生中斷（如資料到達）時，中斷控制器8259a把中斷号發給cpu，cpu 根據這個中斷号找到處理程式，這裡就是rtl8139_interrupt，然後執行。rtl8139_interrupt也是在我們的程式中定義好了的，這是驅動程式的一個重要的義務，也是一個基本的功能。request_irq的代碼在arch/i386/kernel/irq.c中。 ②配置設定發送和接收的緩存空間根據官方文檔，發送一個資料包的過程是這樣的：先從應用程式中把資料包拷貝到一段連續的記憶體中（這段記憶體就是我們這裡要配置設定的緩存），然後把這段記憶體的位址寫進網卡的資料發送位址寄存器(tsad)中,這個寄存器的偏移量是txaddr0 = 0x20。在把這個資料包的長度寫進另一個寄存器（tsd）中，它的偏移量是txstatus0 = 0x10。然後就把這段記憶體的資料發送到網卡内部的發送緩沖中(fifo),最後由這個發送緩沖區把資料發送到網線上。好了現在建立這麼一個發送和接收緩沖記憶體的目的已經很顯然了。 tp->tx_bufs = dma_alloc_coherent(&tp->pci_dev->dev, tx_buf_tot_len, &tp->tx_bufs_dma, gfp_kernel); tp->rx_ring = dma_alloc_coherent(&tp->pci_dev->dev, rx_buf_tot_len, &tp->rx_ring_dma, gfp_kernel); tp 是net_device的priv的指針，tx_bufs是發送緩沖記憶體的首位址，rx_ring是接收緩存記憶體的首位址，他們都是虛拟位址，而最後一個參數tx_bufs_dma和rx_ring_dma均是這一段記憶體的實體位址。為什麼同一個事物，既用虛拟位址來表示它還要用實體位址呢，是這樣的， cpu執行程式用到這個位址時，用虛拟位址，而網卡裝置通過dma操作向這些記憶體中存取資料時用的是實體位址（因為網卡相對cpu屬于頭腦比較簡單型的）。 pci_alloc_consistent的代碼在linux/arch/i386/kernel/pci-dma.c中。 ③發送和接收緩沖區初始化和網卡開始工作的操作 rtl8139有4個發送描述符（包括4個發送緩沖區的基位址寄存器（tsad0-tsad3）和4個發送狀态寄存器(tsd0-tsd3)。也就是說我們配置設定的緩沖區要分成四個等分并把這四個空間的位址都寫到相關寄存器裡去，下面這段代碼完成了這個操作。 /* initialize the rx and tx rings, along with various 'dev' bits. 
*/ static void rtl8139_init_ring (struct net_device *dev) { struct rtl8139_private *tp = netdev_priv(dev); int i; tp->cur_rx = 0; tp->cur_tx = 0; tp->dirty_tx = 0; for (i = 0; i < num_tx_desc; i++) tp->tx_buf[i] = &tp->tx_bufs[i * tx_buf_size]; } 上面這段代碼負責把發送緩沖區虛擬空間進行了分割。 /* init tx buffer dma addresses */ for (i = 0; i < num_tx_desc; i++) rtl_w32_f (txaddr0 + (i * 4), tp->tx_bufs_dma + (tp->tx_buf[i] - tp->tx_bufs)); 上面這段代碼負責把發送緩沖區實體空間進行了分割，並把它寫到了相關寄存器中，這樣在網卡開始工作後就能夠迅速定位和找到這些記憶體並存取它們的資料。 /* init rx ring buffer dma address */ rtl_w32_f (rxbuf, tp->rx_ring_dma); 上面這行代碼是把接收緩沖區的實體位址寫到了相關寄存器中，這樣網卡接收到資料後就能準確的把資料從網卡中搬運到這些記憶體空間中，等待cpu來領走它們。 /* make sure rxtx has started */ tmp = rtl_r8 (chipcmd); if ((!(tmp & cmdrxenb)) || (!(tmp & cmdtxenb))) rtl_w8 (chipcmd, cmdrxenb | cmdtxenb); 重新reset裝置後，我們要激活裝置的發送和接收的功能，上面這行代碼就是向相關寄存器中寫入相應值，激活了裝置的這些功能。 static const unsigned int rtl8139_tx_config = txifg96 | (tx_dma_burst << txdmashift) | (tx_retry << txretryshift); rtl_w32 (txconfig, rtl8139_tx_config); 上面這行代碼是向網卡的txconfig（位移是0x40）寄存器中寫入tx_dma_burst << txdmashift這個值，翻譯過來就是6<<8，就是把第8到第10這三位置成110，查閱官方文檔發現6就是110，代表著一次dma的資料量為1024位元組。 3. 網絡資料包的收發過程 當一個網絡應用程式要向網絡發送資料時，它要利用linux的網絡協定棧來解決一系列問題，找到網卡裝置的代表net_device，由這個結構來找到並控制這個網卡裝置來完成資料包的發送，具體是調用net_device的hard_start_xmit成員函數，這是一個函數指針，在我們的驅動程式裡它指向的是rtl8139_start_xmit，正是由它來完成我們的發送工作的，下面我們就來剖析這個函數。它一共做了四件事。 ①檢查這個要發送的資料包的長度，如果它達不到以太網幀的最小長度，必須採取措施進行填充。 /* calculate the next tx descriptor entry. */ entry = tp->cur_tx % num_tx_desc; /* note: the chip doesn't have auto-pad! 
*/ if (likely(len < tx_buf_size)) { //tx_buf_size = 1536 if (len < eth_zlen) //eth_zlen = 60 memset(tp->tx_buf[entry], 0, eth_zlen); skb_copy_and_csum_dev(skb, tp->tx_buf[entry]); dev_kfree_skb(skb); } else { dev->stats.tx_dropped++; return netdev_tx_ok; ②把包的資料拷貝到我們已經建立好的發送緩存中。主要實作了把skb結構中的資料拷貝到tp->tx_buf[entry]指向的發送緩沖區中。 void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) __wsum csum; long csstart; /*首先計算skb->data的長度*/ if (skb->ip_summed == checksum_partial) csstart = skb->csum_start - skb_headroom(skb); else csstart = skb_headlen(skb); bug_on(csstart > skb_headlen(skb)); skb_copy_from_linear_data(skb, to, csstart); csum = 0; if (csstart != skb->len) csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, skb->len - csstart, 0); if (skb->ip_summed == checksum_partial) { long csstuff = csstart + skb->csum_offset; *((__sum16 *)(to + csstuff)) = csum_fold(csum); 在拷貝函數中需要注意幾個問題： a. 如何計算要拷貝的skb的資料的長度，即這裡的csstart的計算，這裡參考下面的公式： if skb is linear (i.e., skb->data_len == 0), the length of skb->data is skb->len. if skb is not linear (i.e., skb->data_len != 0), the length of skb->data is (skb->len) - (skb->data_len) for the head only. the rest must see struct skb_shared_info->frags[i].size and struct skb_shared_info->frag_list, which contains a linked-list of struct sk_buff because, deducing from [2], skb->data_len = struct skb_shared_info->frags[0...struct skb_shared_info->nr_frags].size + size of data in struct skb_shared_info->frag_list the rest of the data is not stored as a separate skb if the length of the data permits, but as an array of struct skb_frag_struct in struct skb_shared_info ([4]: to allow 64k frame to be packed as single skb without frag_list). struct skb_frag_struct contains struct page * to point to the true data. 
if the length of the data is longer than that that can be contained in the array, struct skb_shared_info->frag_list will be used to contain a linked-list of struct sk_buff (i.e., the data undergo fragmentation because, according to [1], the frag_list is used to maintain a chain of skbs organized for fragmentation purposes, it is not used for maintaining paged data.) as an additional information, skb->truesize = skb->len + sizeof(struct sk_buff). don't forget that skb->len contains the length of the total data space that the skb refers to taking into account skb_data_align() and non-linear condition. skb->len is modified when doing skb_pull(), skb_push() or skb_put(). ③光有了位址和資料還不行，我們要讓網卡知道這個包的長度，才能保證資料不多不少精确的從緩存中截取出來搬運到網卡中去，這是靠寫發送狀态寄存器（tsd）來完成的。 rtl_w32_f (txstatus0 + (entry * sizeof (u32)), tp->tx_flag | max(len, (unsigned int)eth_zlen)); 我們把這個包的長度和一些控制資訊一起寫進了狀态寄存器，使網卡的工作有了依據。 ④判斷發送緩存是否已經滿了，如果滿了在發就覆寫資料了，要停發。 if ((tp->cur_tx - num_tx_desc) == tp->dirty_tx) netif_stop_queue (dev); 談完了發送，我們開始談接收，當有資料從網線上過來時，網卡産生一個中斷，調用的中斷服務程式是rtl8139_interrupt，它主要做了三件事。 ①從網卡的中斷狀态寄存器中讀出狀态值進行分析，status = rtl_r16 (intrstatus); if ((status &(pcierr | pcstimeout | rxunderrun | rxoverflow | rxfifoover | txerr | txok | rxerr | rxok)) == 0) goto out; 上面代碼說明如果上面這9種情況均沒有的表示沒什麼好處理的了，退出。 ② napi接收機制 /* receive packets are processed by poll routine. if not running start it now. 
*/ if (status & rxackbits){ if (napi_schedule_prep(&tp->napi)) { rtl_w16_f (intrmask, rtl8139_norx_intr_mask); __napi_schedule(&tp->napi); } napi_schedule_prep(&tp->napi)判斷以下目前驅動是否支援napi或者napi需要的前提條件是否滿足，如果滿足，設定中斷屏蔽字，屏蔽之後産生的中斷，然後激活一個軟中斷，具體代碼如下：（至于list_add_tail将會稍後分析） static inline void ____napi_schedule(struct softnet_data *sd, struct napi_struct *napi) list_add_tail(&napi->poll_list, &sd->poll_list); __raise_softirq_irqoff(net_rx_softirq); 在軟中斷注冊的輪詢函數中完成網絡資料包的接收操作。 ③發送中斷處理 if (status & (txok | txerr)) { rtl8139_tx_interrupt (dev, tp, ioaddr); if (status & txerr) rtl_w16 (intrstatus, txerr); 如果是傳輸完成的信号，就調用rtl8139_tx_interrupt進行發送善後處理。下面我們先來看看接收中斷處理函數rtl8139_rx，在這個函數中主要做了下面四件事 ①這個函數是一個大循環，循環條件是隻要接收緩存不為空就還可以繼續讀取資料，循環不會停止，讀空了之後就跳出。 int ring_offset = cur_rx % rx_buf_len; rx_status = le32_to_cpu (*(u32 *) (rx_ring + ring_offset)); rx_size = rx_status >> 16; 上面三行代碼是計算出要接收的包的長度。 ②根據這個長度來配置設定包的資料結構 pkt_size = rx_size - 4; skb = netdev_alloc_skb_ip_align(dev, pkt_size); ③如果配置設定成功就把資料從接收緩存中拷貝到這個包中 if (likely(skb)) { #if rx_buf_idx == 3 wrap_copy(skb, rx_ring, ring_offset+4, pkt_size); #else skb_copy_to_linear_data (skb, &rx_ring[ring_offset + 4], pkt_size); skb_put (skb, pkt_size); skb->protocol = eth_type_trans (skb, dev); dev->stats.rx_bytes += pkt_size; dev->stats.rx_packets++; netif_receive_skb (skb); 這裡采用了wrap_copy和skb_copy_to_linear_data兩個拷貝函數，實質還是調用了memcpy（）。 static inline void wrap_copy(struct sk_buff *skb, const unsigned char *ring, u32 offset, unsigned int size) u32 left = rx_buf_len - offset; if (size > left) { skb_copy_to_linear_data(skb, ring + offset, left); skb_copy_to_linear_data_offset(skb, left, ring, size - left); } else skb_copy_to_linear_data(skb, ring + offset, size); static inline void skb_copy_to_linear_data(struct sk_buff *skb, const void *from, const unsigned int len) memcpy(skb->data, from, len); 現在我們已經熟知，&rx_ring[ring_offset + 4]就是接收緩存，也是源位址，而skb->data就是包的資料位址，也是目的位址，一目了然。 ④把這個包送到linux協定棧去進行下一步處理 skb->protocol = eth_type_trans (skb, dev); 
netif_receive_skb (skb); 在netif_receive_skb (skb)函數執行完後，這個包的資料就脫離了網卡驅動範疇，而進入了linux網絡協定棧裡面，協定棧會把這些資料包的以太網幀頭、ip頭、tcp頭都脫下來，最後把資料送給應用程式，不過協定棧不在本文討論範圍內。

linux網卡驅動源碼分析

繼續閱讀

Apache (You don't have permission to access / on this server.）

debian9更新4.9.0核心到4.19.2核心過程

centOS7 配置 vsftpd 虛拟使用者及權限Vsftpd配置虛拟使用者及權限

linux-svn解除安裝與安裝

vsftp虛拟多使用者多權限一鍵部署腳本

Ubuntu14.04 LTS下安裝mongodb

httpd服務的部署、啟動、配置和簡單優化一、部署二、啟動三、配置檔案

配置網頁内容通路

手動安裝Intel network I217-LM網卡的Linux驅動

禁止ubuntu系統彈出報錯界面

Ubuntu Linux下Apache的配置檔案

samba伺服器的功能

【Linux】UDP廣播封包接收速率問題

Linux裝置模型（中）之上層容器

PowerPC平台 Linux移植三