
4. Allocating physical pages (buddy system memory allocation), Linux 4.0

Review questions

  • Briefly describe how, in the ideal case, the Linux kernel's page allocator hands out contiguous physical pages.

Answer: The kernel allocates contiguous physical pages with alloc_pages(), which takes two parameters: the allocation mask (gfp_mask) and the buddy-system order. The allocation mask splits into zone modifiers and action modifiers; the former decide which zone is scanned first, the latter influence the allocation behaviour, including the migrate type of the pages. The zone index, the action flags and the order together determine the allocation path. If no contiguous pages can be found for the requested zone/migrate type/order, the allocator falls back to other zones, other migrate types and other orders.
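
A minimal usage sketch (illustrative only; alloc_pages(), __free_pages() and GFP_KERNEL are real kernel symbols, the wrapper names are made up for this example):

#include <linux/gfp.h>

/* Ask the buddy system for 2^2 = 4 contiguous physical pages. */
static struct page *demo_alloc_four_pages(void)
{
    struct page *page = alloc_pages(GFP_KERNEL, 2);   /* gfp_mask, order */

    /* The allocation can fail, so the caller must check for NULL. */
    return page;
}

static void demo_free_four_pages(struct page *page)
{
    if (page)
        __free_pages(page, 2);   /* the order must match the allocation */
}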

  • In the page allocator, how does the allocation mask (gfp_mask) determine which zones memory may be allocated from?

Answer: The low 4 bits of gfp_mask hold the zone modifiers. The zone fallback order is fixed when the zonelists are built; allocation normally proceeds in the order HIGHMEM -> NORMAL -> DMA32 -> DMA.

  • In which direction does the page allocator scan the zones?

Answer: From high to low, i.e. HIGHMEM -> NORMAL -> DMA32 -> DMA.

  • When allocating physical memory for a user process, should the allocation mask be GFP_KERNEL or GFP_HIGHUSER_MOVABLE?

Answer: GFP_HIGHUSER_MOVABLE, because pages allocated with this mask are movable, whereas GFP_KERNEL allocates unmovable pages.
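
A hedged illustration of the difference (alloc_page(), GFP_KERNEL, GFP_HIGHUSER_MOVABLE and __GFP_ZERO are real kernel symbols; the function names are invented for this sketch):

#include <linux/gfp.h>

/* Kernel-internal data: unmovable, lowmem, directly addressable. */
static struct page *demo_alloc_kernel_page(void)
{
    return alloc_page(GFP_KERNEL);
}

/* Page destined for a user mapping: may come from highmem and stays movable,
 * so it can later be migrated or compacted. */
static struct page *demo_alloc_user_page(void)
{
    return alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_ZERO);
}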

Buddy system memory allocation:

    The interface the kernel commonly uses to allocate physical pages is alloc_pages(), which allocates one or more contiguous physical pages; the number of pages can only be a power of two. Compared with making many separate allocations of scattered pages, allocating contiguous pages in one call helps keep memory fragmentation down, and fragmentation is a notoriously painful problem. alloc_pages() takes two parameters: the allocation mask gfp_mask and the allocation order.

include\linux\gfp.h

#define alloc_pages(gfp_mask, order) \
        alloc_pages_node(numa_node_id(), gfp_mask, order)
           

The allocation mask is a very important parameter; it is likewise defined in the gfp.h header.

/* Plain integer GFP bitmasks. Do not use this directly. */
#define ___GFP_DMA      0x01u
#define ___GFP_HIGHMEM      0x02u
#define ___GFP_DMA32        0x04u
#define ___GFP_MOVABLE      0x08u
#define ___GFP_WAIT     0x10u /* the request may sleep: the scheduler may freely pick another process to run
                                while the request is serviced, or the request may be preempted by a more
                                important event; the allocator may also wait on a queue for an event before
                                returning memory */
#define ___GFP_HIGH     0x20u /* set when the request is very important, i.e. the kernel urgently needs the
                                memory and a failed allocation could have severe consequences (threaten
                                system stability or crash the system) */
#define ___GFP_IO       0x40u /* the kernel may perform I/O while looking for free memory; in practice this
                                means pages chosen for swap-out during the allocation may only be written
                                to disk if this flag is set */
#define ___GFP_FS       0x80u    /* the kernel may call into the VFS; must be cleared in kernel subsystems
                                connected to the VFS layer, since that could cause recursion */
#define ___GFP_COLD     0x100u /* set when a cache-cold page (one not in the CPU cache) is wanted */
#define ___GFP_NOWARN       0x200u /* suppress the kernel warning on allocation failure; rarely useful */
#define ___GFP_REPEAT       0x400u /* retry automatically after a failure, but give up after a few attempts */
#define ___GFP_NOFAIL       0x800u /* keep retrying after a failure until the allocation succeeds */
#define ___GFP_NORETRY      0x1000u
#define ___GFP_MEMALLOC     0x2000u
#define ___GFP_COMP     0x4000u
#define ___GFP_ZERO     0x8000u     /* on success, return a page filled with zero bytes */
#define ___GFP_NOMEMALLOC   0x10000u
#define ___GFP_HARDWALL     0x20000u /* only meaningful on NUMA: restrict the allocation to the nodes
                                associated with the CPUs the current process may run on */
#define ___GFP_THISNODE     0x40000u /* only meaningful on NUMA: if set, do not fall back to other nodes
                                when the allocation fails on this node */
#define ___GFP_RECLAIMABLE  0x80000u
#define ___GFP_NOTRACK      0x200000u
#define ___GFP_NO_KSWAPD    0x400000u
#define ___GFP_OTHER_NODE   0x800000u
#define ___GFP_WRITE        0x1000000u
           

In the kernel the allocation mask flags fall into two groups: zone modifiers and action modifiers. Zone modifiers select which zone to allocate the pages from and occupy the lowest 4 bits of the mask: __GFP_DMA, __GFP_HIGHMEM, __GFP_DMA32 and __GFP_MOVABLE.

/*
 * GFP bitmasks..
 *
 * Zone modifiers (see linux/mmzone.h - low three bits)
 *
 * Do not put any conditional on these. If necessary modify the definitions
 * without the underscores and use them consistently. The definitions here may
 * be used in bit comparisons.
 */
#define __GFP_DMA   ((__force gfp_t)___GFP_DMA)
#define __GFP_HIGHMEM   ((__force gfp_t)___GFP_HIGHMEM)
#define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32)
#define __GFP_MOVABLE   ((__force gfp_t)___GFP_MOVABLE)  /* Page is movable */
#define GFP_ZONEMASK    (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
           

Action modifiers do not restrict which zone the memory comes from, but they change the allocator's behaviour. They are defined as follows:

/*
 * Action modifiers - doesn't change the zoning
 *
 * __GFP_REPEAT: Try hard to allocate the memory, but the allocation attempt
 * _might_ fail.  This depends upon the particular VM implementation.
 *
 * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
 * cannot handle allocation failures.  This modifier is deprecated and no new
 * users should be added.
 *
 * __GFP_NORETRY: The VM implementation must not retry indefinitely.
 *
 * __GFP_MOVABLE: Flag that this page will be movable by the page migration
 * mechanism or reclaimed
 */
#define __GFP_WAIT  ((__force gfp_t)___GFP_WAIT)    /* Can wait and reschedule? */
#define __GFP_HIGH  ((__force gfp_t)___GFP_HIGH)    /* Should access emergency pools? */
#define __GFP_IO    ((__force gfp_t)___GFP_IO)  /* Can start physical IO? */
#define __GFP_FS    ((__force gfp_t)___GFP_FS)  /* Can call down to low-level FS? */
#define __GFP_COLD  ((__force gfp_t)___GFP_COLD)    /* Cache-cold page required */
#define __GFP_NOWARN    ((__force gfp_t)___GFP_NOWARN)  /* Suppress page allocation failure warning */
#define __GFP_REPEAT    ((__force gfp_t)___GFP_REPEAT)  /* See above */
#define __GFP_NOFAIL    ((__force gfp_t)___GFP_NOFAIL)  /* See above */
#define __GFP_NORETRY   ((__force gfp_t)___GFP_NORETRY) /* See above */
#define __GFP_MEMALLOC  ((__force gfp_t)___GFP_MEMALLOC)/* Allow access to emergency reserves */
#define __GFP_COMP  ((__force gfp_t)___GFP_COMP)    /* Add compound page metadata */
#define __GFP_ZERO  ((__force gfp_t)___GFP_ZERO)    /* Return zeroed page on success */
#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves.
                             * This takes precedence over the
                             * __GFP_MEMALLOC flag if both are
                             * set
                             */
#define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */
#define __GFP_THISNODE  ((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */
#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
#define __GFP_NOTRACK   ((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */

#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE)   /* Allocator intends to dirty page */
           

Let us take GFP_KERNEL as an example and see how, in the ideal case, alloc_pages() allocates physical memory.

page = alloc_pages(GFP_KERNEL, order)
           

The GFP_KERNEL allocation mask is defined in gfp.h and is a combination of individual flags. The commonly used combinations are:

/* This equals 0, but use constants in case they ever change */
#define GFP_NOWAIT  (GFP_ATOMIC & ~__GFP_HIGH)
/* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */
#define GFP_ATOMIC  (__GFP_HIGH)
#define GFP_NOIO    (__GFP_WAIT)
#define GFP_NOFS    (__GFP_WAIT | __GFP_IO)
#define GFP_KERNEL  (__GFP_WAIT | __GFP_IO | __GFP_FS)
#define GFP_TEMPORARY   (__GFP_WAIT | __GFP_IO | __GFP_FS | \
             __GFP_RECLAIMABLE)
#define GFP_USER    (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
#define GFP_HIGHUSER    (GFP_USER | __GFP_HIGHMEM)
#define GFP_HIGHUSER_MOVABLE    (GFP_HIGHUSER | __GFP_MOVABLE)
#define GFP_IOFS    (__GFP_IO | __GFP_FS)
#define GFP_TRANSHUGE   (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
             __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
             __GFP_NO_KSWAPD)
           

So the GFP_KERNEL mask contains the three flags __GFP_WAIT, __GFP_IO and __GFP_FS, i.e. 0x10 | 0x40 | 0x80 = 0xd0.

    alloc_pages() eventually calls __alloc_pages_nodemask(), the core function of the buddy system.

alloc_pages()->alloc_pages_node()->__alloc_pages()->__alloc_pages_nodemask()

#define alloc_pages(gfp_mask, order) \
        alloc_pages_node(numa_node_id(), gfp_mask, order)

static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
                        unsigned int order)
{
    /* Unknown node is current node */
    if (nid < 0)
        nid = numa_node_id();

    return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}

static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist)
{
    return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
}

/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
            struct zonelist *zonelist, nodemask_t *nodemask)
{
    struct zoneref *preferred_zoneref;
    struct page *page = NULL;
    unsigned int cpuset_mems_cookie;
    int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
    gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */

    /* struct alloc_context holds the parameters used throughout the buddy allocator.
     * gfp_zone() computes the zone index from the allocation mask and stores it in the
     * high_zoneidx member. The general rule is to allocate from cheap memory before
     * expensive memory; the computation itself is rather opaque.
     */
    struct alloc_context ac = {
        .high_zoneidx = gfp_zone(gfp_mask),
        .nodemask = nodemask,
        .migratetype = gfpflags_to_migratetype(gfp_mask),/* convert gfp_mask into a MIGRATE_TYPES
            value: GFP_KERNEL maps to MIGRATE_UNMOVABLE, while GFP_HIGHUSER_MOVABLE maps to
            MIGRATE_MOVABLE.
        */
    };

    gfp_mask &= gfp_allowed_mask;

    lockdep_trace_alloc(gfp_mask);

    might_sleep_if(gfp_mask & __GFP_WAIT);

    if (should_fail_alloc_page(gfp_mask, order))
        return NULL;

    /*
     * Check the zones suitable for the gfp_mask contain at least one
     * valid zone. It's possible to have an empty zonelist as a result
     * of GFP_THISNODE and a memoryless node
     */
    if (unlikely(!zonelist->_zonerefs->zone))
        return NULL;

    if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
        alloc_flags |= ALLOC_CMA;

retry_cpuset:
    cpuset_mems_cookie = read_mems_allowed_begin();

    /* We set it here, as __alloc_pages_slowpath might have changed it */
    ac.zonelist = zonelist;
    /* The preferred zone is used for statistics later */
    preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
                ac.nodemask ? : &cpuset_current_mems_allowed,
                &ac.preferred_zone);
    if (!ac.preferred_zone)
        goto out;
    ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);

    /* First allocation attempt */
    alloc_mask = gfp_mask|__GFP_HARDWALL;
    /* get_page_from_freelist() tries to allocate the pages; if it fails here,
       __alloc_pages_slowpath() is called, which handles many special cases.
       See the implementation of get_page_from_freelist() below. */
    page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
    if (unlikely(!page)) {
        /*
         * Runtime PM, block IO and its error handling path
         * can deadlock because I/O on the device might not
         * complete.
         */
        alloc_mask = memalloc_noio_flags(gfp_mask);

        page = __alloc_pages_slowpath(alloc_mask, order, &ac);
    }

    if (kmemcheck_enabled && page)
        kmemcheck_pagealloc_alloc(page, order, gfp_mask);

    trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);

out:
    /*
     * When updating a task's mems_allowed, it is possible to race with
     * parallel threads in such a way that an allocation can fail while
     * the mask is being updated. If a page allocation is about to fail,
     * check if the cpuset changed during allocation and if so, retry.
     */
    if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
        goto retry_cpuset;

    return page;
}
           

get_page_from_freelist() is another important helper used by the buddy system. It uses the flag set and the allocation order to decide whether an allocation is possible; if so, it starts the actual allocation.

/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
                        const struct alloc_context *ac)
{
    struct zonelist *zonelist = ac->zonelist;
    struct zoneref *z;
    struct page *page = NULL;
    struct zone *zone;
    nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
    int zlc_active = 0;     /* set if using zonelist_cache */
    int did_zlc_setup = 0;      /* just call zlc_setup() one time */
    bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
                (gfp_mask & __GFP_WRITE);
    int nr_fair_skipped = 0;
    bool zonelist_rescan;

zonelist_scan:
    zonelist_rescan = false;

    /*
     * Scan zonelist, looking for a zone with enough free.
     * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
     * (cpusets bind processes to a set of CPUs/nodes to improve locality.)
     */
    /*
     * First decide which zone to allocate from: the for_each_zone_zonelist_nodemask
     * macro walks the zonelist of the memory node looking for a zone suitable for
     * this allocation, i.e. one with enough free memory.
     */

     /* The ALLOC_* flags are interpreted first (cpuset_zone_allowed() is another helper,
     ** checking whether the given zone belongs to the nodes the process is allowed to run on).
     ** zone_watermark_ok() then checks whether the zone being visited has enough free pages
     ** and tries to find a contiguous block. If either condition fails - not enough free
     ** pages, or no contiguous block satisfying the request - the loop advances to the next
     ** zone in the fallback list and repeats the checks.
     ** If the zone is suitable for the current request, buffered_rmqueue() tries to allocate
     ** the requested number of pages from it.
     */
    for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
                                ac->nodemask) {
        unsigned long mark;

        if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
            !zlc_zone_worth_trying(zonelist, z, allowednodes))
                continue;
        if (cpusets_enabled() &&
            (alloc_flags & ALLOC_CPUSET) &&
            !cpuset_zone_allowed(zone, gfp_mask))
                continue;
        /*
         * Distribute pages in proportion to the individual
         * zone size to ensure fair page aging.  The zone a
         * page was allocated in should have no effect on the
         * time the page has in memory before being reclaimed.
         */
        if (alloc_flags & ALLOC_FAIR) {
            if (!zone_local(ac->preferred_zone, zone))
                break;
            if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
                nr_fair_skipped++;
                continue;
            }
        }
        /*
         * When allocating a page cache page for writing, we
         * want to get it from a zone that is within its dirty
         * limit, such that no single zone holds more than its
         * proportional share of globally allowed dirty pages.
         * The dirty limits take into account the zone's
         * lowmem reserves and high watermark so that kswapd
         * should be able to balance it without having to
         * write pages from its LRU list.
         *
         * This may look like it could increase pressure on
         * lower zones by failing allocations in higher zones
         * before they are full.  But the pages that do spill
         * over are limited as the lower zones are protected
         * by this very same mechanism.  It should not become
         * a practical burden to them.
         *
         * XXX: For now, allow allocations to potentially
         * exceed the per-zone dirty limit in the slowpath
         * (ALLOC_WMARK_LOW unset) before going into reclaim,
         * which is important when on a NUMA setup the allowed
         * zones are together not big enough to reach the
         * global limit.  The proper fix for these situations
         * will require awareness of zones in the
         * dirty-throttling and the flusher threads.
         */
        if (consider_zone_dirty && !zone_dirty_ok(zone))
            continue;

        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
        if (!zone_watermark_ok(zone, order, mark,
                       ac->classzone_idx, alloc_flags)) {
            int ret;

            /* Checked here to keep the fast path fast */
            BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
            if (alloc_flags & ALLOC_NO_WATERMARKS)
                goto try_this_zone;

            if (IS_ENABLED(CONFIG_NUMA) &&
                    !did_zlc_setup && nr_online_nodes > 1) {
                /*
                 * we do zlc_setup if there are multiple nodes
                 * and before considering the first zone allowed
                 * by the cpuset.
                 */
                allowednodes = zlc_setup(zonelist, alloc_flags);
                zlc_active = 1;
                did_zlc_setup = 1;
            }

            if (zone_reclaim_mode == 0 ||
                !zone_allows_reclaim(ac->preferred_zone, zone))
                goto this_zone_full;

            /*
             * As we may have just activated ZLC, check if the first
             * eligible zone has failed zone_reclaim recently.
             */
            if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
                !zlc_zone_worth_trying(zonelist, z, allowednodes))
                continue;

            ret = zone_reclaim(zone, gfp_mask, order);
            switch (ret) {
            case ZONE_RECLAIM_NOSCAN:
                /* did not scan */
                continue;
            case ZONE_RECLAIM_FULL:
                /* scanned but unreclaimable */
                continue;
            default:
                /* did we reclaim enough */
                if (zone_watermark_ok(zone, order, mark,
                        ac->classzone_idx, alloc_flags))
                    goto try_this_zone;

                /*
                 * Failed to reclaim enough to meet watermark.
                 * Only mark the zone full if checking the min
                 * watermark or if we failed to reclaim just
                 * 1<<order pages or else the page allocator
                 * fastpath will prematurely mark zones full
                 * when the watermark is between the low and
                 * min watermarks.
                 */
                if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
                    ret == ZONE_RECLAIM_SOME)
                    goto this_zone_full;

                continue;
            }
        }

try_this_zone:
        page = buffered_rmqueue(ac->preferred_zone, zone, order,
                        gfp_mask, ac->migratetype);
        if (page) {
            if (prep_new_page(page, order, gfp_mask, alloc_flags))
                goto try_this_zone;
            return page;
        }
this_zone_full:
        if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
            zlc_mark_zone_full(zonelist, z);
    }

    /*
     * The first pass makes sure allocations are spread fairly within the
     * local node.  However, the local node might have free pages left
     * after the fairness batches are exhausted, and remote zones haven't
     * even been considered yet.  Try once more without fairness, and
     * include remote zones now, before entering the slowpath and waking
     * kswapd: prefer spilling to a remote zone over swapping locally.
     */
    if (alloc_flags & ALLOC_FAIR) {
        alloc_flags &= ~ALLOC_FAIR;
        if (nr_fair_skipped) {
            zonelist_rescan = true;
            reset_alloc_batches(ac->preferred_zone);
        }
        if (nr_online_nodes > 1)
            zonelist_rescan = true;
    }

    if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
        /* Disable zlc cache for second zonelist scan */
        zlc_active = 0;
        zonelist_rescan = true;
    }

    if (zonelist_rescan)
        goto zonelist_scan;

    return NULL;
}
           
/**
 * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
 * @zone - The current zone in the iterator
 * @z - The current pointer within zonelist->zones being iterated
 * @zlist - The zonelist being iterated
 * @highidx - The zone index of the highest zone to return
 * @nodemask - Nodemask allowed by the allocator
 *
 * This iterator iterates though all zones at or below a given zone index and
 * within a given nodemask
 */
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
    for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \
             zone;                           \
             z = next_zones_zonelist(++z, highidx, nodemask),    \
            zone = zonelist_zone(z))
/* The macro first calls first_zones_zonelist() to start the search at the given zone index
   (highidx), which was computed earlier by gfp_zone() from the allocation mask. */
           

first_zones_zonelist() calls next_zones_zonelist() to compute the zoneref and finally returns the zone data structure.

/**
 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
 * @zonelist - The zonelist to search for a suitable zone
 * @highest_zoneidx - The zone index of the highest zone to return
 * @nodes - An optional nodemask to filter the zonelist with
 * @zone - The first suitable zone found is returned via this parameter
 *
 * This function returns the first zone at or below a given zone index that is
 * within the allowed nodemask. The zoneref returned is a cursor that can be
 * used to iterate the zonelist with next_zones_zonelist by advancing it by
 * one before calling.
 */
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
                    enum zone_type highest_zoneidx,
                    nodemask_t *nodes,
                    struct zone **zone)
{
    struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs,
                            highest_zoneidx, nodes);
    *zone = zonelist_zone(z);
    return z;
}
           
/* Returns the next zone at or below highest_zoneidx in a zonelist */
struct zoneref *next_zones_zonelist(struct zoneref *z,
                    enum zone_type highest_zoneidx,
                    nodemask_t *nodes)
{
    /*
     * Find the next suitable zone to use for the allocation.
     * Only filter based on nodemask if it's set
     */
    if (likely(nodes == NULL))
        while (zonelist_zone_idx(z) > highest_zoneidx)
            z++;
    else
        while (zonelist_zone_idx(z) > highest_zoneidx ||
                (z->zone && !zref_in_nodemask(z, nodes)))
            z++;

    return z;
}


static inline int zonelist_zone_idx(struct zoneref *zoneref)
{
    return zoneref->zone_idx;
}
           

The core of the zone selection is in next_zones_zonelist(): each zoneref's index (read with zonelist_zone_idx()) is compared against highest_zoneidx, the value gfp_zone() computed from the allocation mask. A zonelist contains an array of struct zoneref; each zoneref has a zone pointer to a struct zone and a zone_idx member holding that zone's index. The kernel initialises this array at boot, in build_zonelists_node(). On the ARM Vexpress platform the relationship between the zone types, the zoneref[] array and the zone index is:

ZONE_HIGHMEM _zonerefs[0]->zone_idx = 1
ZONE_NORMAL  _zonerefs[1]->zone_idx = 0
           

_zonerefs[0] represents ZONE_HIGHMEM, whose zone index zone_idx is 1; _zonerefs[1] represents ZONE_NORMAL, whose zone_idx is 0. In other words, the zone-based design means that page allocation prefers ZONE_HIGHMEM, because ZONE_HIGHMEM comes before ZONE_NORMAL in the zonelist.
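
For reference, struct zoneref itself is tiny (from include/linux/mmzone.h; the comments here are mine):

struct zoneref {
    struct zone *zone;  /* pointer to the actual zone */
    int zone_idx;       /* zone_idx(zoneref->zone), the zone's index */
};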

    Back to our example: gfp_zone(GFP_KERNEL) returns 0, so highest_zoneidx is 0. The first zone of this node is ZONE_HIGHMEM with zone_idx 1, so next_zones_zonelist() advances past it (z++) and first_zones_zonelist() ends up returning ZONE_NORMAL. The for_each_zone_zonelist_nodemask() walk therefore only ever visits ZONE_NORMAL, since the Vexpress platform has only HIGHMEM and NORMAL.

    As another example, take the allocation mask GFP_HIGHUSER_MOVABLE, which includes __GFP_HIGHMEM. Which zone will next_zones_zonelist() return?

    GFP_HIGHUSER_MOVABLE has the value 0x200da, and gfp_zone(GFP_HIGHUSER_MOVABLE) evaluates to 2, so highest_zoneidx is 2, while the first zone of the node is ZONE_HIGHMEM with zone_idx 1.

  • In first_zones_zonelist(), the first zoneref's zone_idx (1) is smaller than highest_zoneidx (2), so ZONE_HIGHMEM is returned.
  • In for_each_zone_zonelist_nodemask(), next_zones_zonelist(++z, highidx, nodemask) then returns ZONE_NORMAL.
  • So both ZONE_HIGHMEM and ZONE_NORMAL are visited, ZONE_HIGHMEM first and then ZONE_NORMAL; the sketch after this list mimics that walk.
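
The following self-contained sketch (not kernel code; the demo_ names are made up) mimics the walk next_zones_zonelist() performs over the two-entry Vexpress zonelist described above:

struct demo_zoneref { int zone_idx; const char *name; };

/* _zonerefs[] as described above: ZONE_HIGHMEM first, ZONE_NORMAL second */
static const struct demo_zoneref demo_zonerefs[] = {
    { 1, "ZONE_HIGHMEM" },
    { 0, "ZONE_NORMAL"  },
};

/* Same loop shape as next_zones_zonelist(), without a nodemask. */
static const struct demo_zoneref *demo_first_zone(int highest_zoneidx)
{
    const struct demo_zoneref *z = demo_zonerefs;

    while (z->zone_idx > highest_zoneidx)
        z++;
    return z;
}

/* demo_first_zone(0) -> ZONE_NORMAL   (GFP_KERNEL, highest_zoneidx = 0)
 * demo_first_zone(2) -> ZONE_HIGHMEM  (GFP_HIGHUSER_MOVABLE, highest_zoneidx = 2) */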

To understand the behaviour of the for_each_zone_zonelist_nodemask() macro correctly, two things must be understood.

  • How highest_zoneidx is computed, i.e. how the allocation mask is parsed; that is the job of gfp_zone().
  • Each memory node has a struct pglist_data whose node_zonelists member is a struct zonelist; the zonelist contains the struct zoneref _zonerefs[] array describing the zones. ZONE_HIGHMEM comes first, with _zonerefs[0]->zone_idx = 1, and ZONE_NORMAL comes second, with _zonerefs[1]->zone_idx = 0.

    This design may feel somewhat convoluted, but it is the foundation for correctly understanding zone-based physical page allocation.

    __alloc_pages_nodemask() calls first_zones_zonelist() to compute preferred_zoneref and stores its index in ac.classzone_idx; that value is used again later by the kswapd kernel thread. For a GFP_KERNEL allocation, for example, preferred_zone is ZONE_NORMAL and ac.classzone_idx is 0.

Understanding the functions above requires a few helper functions.

First, a set of flags controls the behaviour when the various watermark thresholds are reached.

enum zone_watermarks {
    WMARK_MIN,
    WMARK_LOW,
    WMARK_HIGH,
    NR_WMARK
};
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN     WMARK_MIN /* use the pages_min watermark */
#define ALLOC_WMARK_LOW     WMARK_LOW /* use the pages_low watermark */
#define ALLOC_WMARK_HIGH    WMARK_HIGH /* use the pages_high watermark */
#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */

/* Mask to get the watermark bits */
#define ALLOC_WMARK_MASK    (ALLOC_NO_WATERMARKS-1)

#define ALLOC_HARDER        0x10 /* try to alloc harder, i.e. relax the limits */
#define ALLOC_HIGH      0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET        0x40 /* check for correct cpuset */
#define ALLOC_CMA       0x80 /* allow allocations from CMA areas */
#define ALLOC_FAIR      0x100 /* fair zone allocation */
           

The first few flags select which watermark is considered when deciding whether pages may be allocated. By default (i.e. without pressure from other factors calling for more memory), pages are only allocated when the zone still contains at least zone->watermark[WMARK_HIGH] free pages; this corresponds to the ALLOC_WMARK_HIGH flag. To use the lower (zone->watermark[WMARK_LOW]) or minimum (zone->watermark[WMARK_MIN]) setting instead, ALLOC_WMARK_LOW or ALLOC_WMARK_MIN must be set accordingly. ALLOC_HARDER relaxes the limits further, and ALLOC_CPUSET tells the kernel that memory may only be allocated from the nodes associated with the CPUs the current process is allowed to run on; this option is only meaningful on NUMA systems.

The flags that were set are checked in zone_watermark_ok(), which decides, based on them, whether memory can be allocated from the given zone.

bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
              int classzone_idx, int alloc_flags)
{
    return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
                    zone_page_state(z, NR_FREE_PAGES));
}
/*
 * Return true if free pages are above 'mark'. This takes into account the order
 * of the allocation.
 */
static bool __zone_watermark_ok(struct zone *z, unsigned int order,
            unsigned long mark, int classzone_idx, int alloc_flags,
            long free_pages)
{
    /* free_pages may go negative - that's OK */
    long min = mark; /* mark is the watermark value being checked against */
    int o;
    long free_cma = 0;

    free_pages -= (1 << order) - 1;
    if (alloc_flags & ALLOC_HIGH)
        min -= min / 2;
    if (alloc_flags & ALLOC_HARDER)
        min -= min / 4;
#ifdef CONFIG_CMA
    /* If allocation can't use CMA areas don't use free CMA pages */
    if (!(alloc_flags & ALLOC_CMA))
        free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif

    if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
        return false;
    for (o = 0; o < order; o++) {
        /* At the next order, this order's pages become unavailable */
        free_pages -= z->free_area[o].nr_free << o;

        /* Require fewer higher order pages to be free */
        min >>= 1;

        if (free_pages <= min)
            return false;
    /* The purpose of this loop:
       walk up the orders and check whether enough large (higher-order) blocks remain free.
       In each iteration the pages of the current order are subtracted from the running
       total, because only larger blocks matter from that point on; and since part of the
       free pages has been crossed off, the threshold is relaxed as well, by shifting min
       right once per order.
       Example: for an order-1 request with 100 free pages, 96 of them order-0 and 4 of them
       in order-1 blocks, and min = 16 after the adjustments: in the first iteration
       free_pages drops to roughly 4 while min (shifted right once) becomes 8, so the
       watermark check fails. Only if the requirement were relaxed much further (min shifted
       down to 1, say) would the remaining free pages pass the check.
    */
    }
    return true;
}
           

zone_page_state(z, NR_FREE_PAGES) returns the number of free pages.

    After the ALLOC_HIGH and ALLOC_HARDER flags have been interpreted (they lower the minimum watermark to 1/2 or 1/4 of its current value, making the allocation try harder or harder still), the function checks whether the number of free pages is below the minimum plus the emergency reserve specified in lowmem_reserve. If it is not, the code iterates over all allocation orders below the requested one, subtracting from free_pages all free pages of the current order (the left shift by o is necessary because nr_free counts blocks of that order, not individual pages). At the same time, the minimum number of required free pages is halved for each step up in order. If, after walking through all the lower orders, the kernel finds that there is not enough memory, the allocation is refused. A small standalone re-computation of this check is sketched below.
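
As a sanity check, the numeric example from the comment in __zone_watermark_ok() can be re-run in a small standalone function (illustrative values only, no kernel data structures involved):

/* order-1 request, 100 free pages in total: 96 order-0 blocks plus 2 order-1
 * blocks (4 pages), min = 16 after the ALLOC_HIGH/ALLOC_HARDER adjustments,
 * lowmem_reserve assumed to be 0. */
static int demo_watermark_ok(void)
{
    long nr_free[2] = { 96, 2 };   /* free blocks per order */
    long free_pages = 100, min = 16;
    int order = 1, o;

    free_pages -= (1 << order) - 1;             /* 99 */
    if (free_pages <= min)                      /* 99 > 16: keep going */
        return 0;

    for (o = 0; o < order; o++) {
        free_pages -= nr_free[o] << o;          /* 99 - 96 = 3 */
        min >>= 1;                              /* 16 -> 8 */
        if (free_pages <= min)                  /* 3 <= 8: watermark fails */
            return 0;
    }
    return 1;
}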

    The system defines three watermarks (WMARK_MIN/WMARK_LOW/WMARK_HIGH). The watermark values are computed in __setup_per_zone_wmarks().

static void __setup_per_zone_wmarks(void)
{
    unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
    unsigned long lowmem_pages = 0;
    struct zone *zone;
    unsigned long flags;

    /* Calculate total number of !ZONE_HIGHMEM pages */
    for_each_zone(zone) {
        if (!is_highmem(zone))
            lowmem_pages += zone->managed_pages;
    }

    for_each_zone(zone) {
        u64 tmp;

        spin_lock_irqsave(&zone->lock, flags);
        tmp = (u64)pages_min * zone->managed_pages;
        do_div(tmp, lowmem_pages);
        if (is_highmem(zone)) {
            /*
             * __GFP_HIGH and PF_MEMALLOC allocations usually don't
             * need highmem pages, so cap pages_min to a small
             * value here.
             *
             * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
             * deltas controls asynch page reclaim, and so should
             * not be capped for highmem.
             */
            unsigned long min_pages;

            min_pages = zone->managed_pages / 1024;
            min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
            zone->watermark[WMARK_MIN] = min_pages;
        } else {
            /*
             * If it's a lowmem zone, reserve a number of pages
             * proportionate to the zone's size.
             */
            zone->watermark[WMARK_MIN] = tmp;
        }

        zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
        zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);

        __mod_zone_page_state(zone, NR_ALLOC_BATCH,
            high_wmark_pages(zone) - low_wmark_pages(zone) -
            atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));

        setup_zone_migrate_reserve(zone);
        spin_unlock_irqrestore(&zone->lock, flags);
    }

    /* update totalreserve_pages */
    calculate_totalreserve_pages();
}
           

Computing the watermarks uses the value min_free_kbytes, which is derived at boot from the amount of free memory in the system; the actual computation is in init_per_zone_wmark_min(). After boot it can also be tuned via the sysctl interface at "/proc/sys/vm/min_free_kbytes". The buddy system and the kswapd kernel thread use these watermarks later.
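
Roughly, the boot-time computation in init_per_zone_wmark_min() looks like the sketch below: min_free_kbytes scales with the square root of the amount of low memory and is clamped to a sane range. (This is a paraphrase from memory of the 4.x source, not a verbatim quote; check mm/page_alloc.c for the exact code.)

/* Approximate sketch of init_per_zone_wmark_min(), not verbatim kernel code. */
int __meminit init_per_zone_wmark_min(void)
{
    unsigned long lowmem_kbytes;

    /* free "low" memory, expressed in KiB */
    lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);

    /* grow with the square root of low memory, clamped to [128 KiB, 64 MiB] */
    min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
    if (min_free_kbytes < 128)
        min_free_kbytes = 128;
    if (min_free_kbytes > 65536)
        min_free_kbytes = 65536;

    setup_per_zone_wmarks();    /* ends up in __setup_per_zone_wmarks() above */
    return 0;
}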

    Assuming zone_watermark_ok() decides that free pages are plentiful, buffered_rmqueue() is called next to allocate physical pages from the buddy system.

If a suitable zone with enough free pages has been found, two things remain to be done. First, the kernel must check whether the pages are contiguous (so far it only knows that many pages are free). Second, the pages must be removed from the free_lists in the buddy-system fashion, which may require breaking up and redistributing memory regions.

    The kernel delegates this work to buffered_rmqueue(). The steps the function performs are as follows:


__alloc_pages_nodemask()->get_page_from_freelist()->buffered_rmqueue()

/*
 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
 */
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
            struct zone *zone, unsigned int order,
            gfp_t gfp_flags, int migratetype)
{
    unsigned long flags;
    struct page *page;
    bool cold = ((gfp_flags & __GFP_COLD) != 0); /* was a cache-cold page requested? */

    if (likely(order == 0)) { /* single-page allocation */
        struct per_cpu_pages *pcp;
        struct list_head *list;

        local_irq_save(flags); /* disable interrupts on this CPU, saving the previous state */
        pcp = &this_cpu_ptr(zone->pageset)->pcp; /* this CPU's page cache */
        list = &pcp->lists[migratetype]; /* pick the per-CPU free list for this migrate type */
        if (list_empty(list)) { /* empty: the cache has no pages of this migrate type, perhaps
                                   because the previously cached pages were of a different type */
            /* see the implementation of rmqueue_bulk() below */
            pcp->count += rmqueue_bulk(zone, 0,
                    pcp->batch, list,
                    migratetype, cold); /* refill the per-CPU cache: take a batch of pages from
                                           the buddy system and put them on this list */
            if (unlikely(list_empty(list)))
                goto failed;
        }

        if (cold)
            page = list_entry(list->prev, struct page, lru);
        else
            page = list_entry(list->next, struct page, lru);

        list_del(&page->lru);
        pcp->count--;
    } else {
        if (unlikely(gfp_flags & __GFP_NOFAIL)) {
            /*
             * __GFP_NOFAIL is not to be used in new code.
             *
             * All __GFP_NOFAIL callers should be fixed so that they
             * properly detect and handle allocation failures.
             *
             * We most definitely don't want callers attempting to
             * allocate greater than order-1 page units with
             * __GFP_NOFAIL.
             */
            WARN_ON_ONCE(order > 1);
        }
        spin_lock_irqsave(&zone->lock, flags);
        page = __rmqueue(zone, order, migratetype);
        spin_unlock(&zone->lock);
        if (!page)
            goto failed;
        __mod_zone_freepage_state(zone, -(1 << order),
                      get_freepage_migratetype(page));
    }

    __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
    if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
        !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
        set_bit(ZONE_FAIR_DEPLETED, &zone->flags);

    __count_zone_vm_events(PGALLOC, zone, 1 << order);
    zone_statistics(preferred_zone, zone, gfp_flags);
    local_irq_restore(flags);

    VM_BUG_ON_PAGE(bad_range(zone, page), page);
    return page;

failed:
    local_irq_restore(flags);
    return NULL;
}
           

If only one page is to be allocated (the order 0 case, 2^0 = 1), the kernel applies an optimisation: the page is not taken from the buddy system directly but from the per-CPU page cache (see Professional Linux Kernel Architecture, p. 184). Recall that this cache provides CPU-local lists of hot and cold pages; zone->pageset is an array implementing the per-CPU hot/cold page-frame lists, and the kernel uses these lists to hold "fresh" pages ready to satisfy requests. Hot and cold frames differ in their CPU-cache state: some frames are likely still present in the CPU caches and can therefore be accessed quickly, hence "hot"; uncached frames, by contrast, are "cold".
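
The per-CPU cache itself is a small structure; a slightly abridged view of struct per_cpu_pages as it appears around Linux 4.0 (see include/linux/mmzone.h; the comments are mine):

struct per_cpu_pages {
    int count;      /* number of pages in the lists below */
    int high;       /* when count exceeds this, pages are returned to the buddy system */
    int batch;      /* chunk size used when refilling or draining the cache */

    /* one list per migrate type kept on the pcp lists */
    struct list_head lists[MIGRATE_PCPTYPES];
};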

    The single-page case:

        If the allocation flags include __GFP_COLD, a cold page is taken from the per-CPU cache, provided one is available.

Implementation of rmqueue_bulk()

[__alloc_pages_nodemask()->get_page_from_freelist()->buffered_rmqueue()->rmqueue_bulk()]

/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency.  Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order,
            unsigned long count, struct list_head *list,
            int migratetype, bool cold)
{
    int i;

    spin_lock(&zone->lock);
    for (i = 0; i < count; ++i) { /* handle one block at a time */
        /* see the implementation of __rmqueue() below */
        struct page *page = __rmqueue(zone, order, migratetype); /* allocate a block of the requested migrate type */
        if (unlikely(page == NULL))
            break;

        /*
         * Split buddy pages returned by expand() are received here
         * in physical page order. The page is added to the callers and
         * list and the list head then moves forward. From the callers
         * perspective, the linked list is ordered by page number in
         * some conditions. This is useful for IO devices that can
         * merge IO requests if the physical pages are ordered
         * properly.
         */
        if (likely(!cold))
            list_add(&page->lru, list); /* hot pages go to the head of the list */
        else
            list_add_tail(&page->lru, list); /* cold pages go to the tail */
        list = &page->lru;
        if (is_migrate_cma(get_freepage_migratetype(page)))
            __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
                          -(1 << order));
    }
    __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); /* update the free-page counter */
    spin_unlock(&zone->lock);
    return i; /* number of pages added to the per-CPU cache list */
}
           

For multi-page allocations, the following function is called; the single-page path ends up calling it as well:

Implementation of __rmqueue()

[__alloc_pages_nodemask()->get_page_from_freelist()->buffered_rmqueue()->rmqueue_bulk()->__rmqueue()]

/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 * Allocates pages of the requested migrate type.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order,
                        int migratetype)
{
    struct page *page;

retry_reserve:
    /* see the implementation of __rmqueue_smallest() below */
    page = __rmqueue_smallest(zone, order, migratetype); /* normal case: allocate a block of the requested
        migrate type from this zone, scanning the free lists (guided by allocation order, zone and
        migrate type) until a suitable contiguous block is found */

    if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { /* nothing found, and this is not already an
                                                                emergency (MIGRATE_RESERVE) allocation */
        page = __rmqueue_fallback(zone, order, migratetype); /* try the other migrate lists as a fallback */

        /*
         * Use MIGRATE_RESERVE rather than fail an allocation. goto
         * is used because __rmqueue_smallest is an inline function
         * and we want just one call site
         */
        if (!page) { /* still nothing: retry with MIGRATE_RESERVE, marking this as an emergency allocation */
            migratetype = MIGRATE_RESERVE;
            goto retry_reserve;
        }
    }

    trace_mm_page_alloc_zone_locked(page, order, migratetype);
    return page;
}
           

Implementation of __rmqueue_smallest()

[__alloc_pages_nodemask()->get_page_from_freelist()->buffered_rmqueue()->rmqueue_bulk()->__rmqueue()->__rmqueue_smallest()]

/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
                        int migratetype)
{
    unsigned int current_order;
    struct free_area *area;
    struct page *page;

    /* Find a page of the appropriate size in the preferred list.
        Smaller blocks are of no use because the allocated pages must be contiguous. Recall
        that for each order the free pages are further split into per-migrate-type lists, and
        the correct one has to be chosen.
        Checking for a block of suitable size is simple: if the list for this order contains
        an element, it can be used, since it holds the required number of contiguous pages;
        otherwise the kernel moves on to the next higher order and searches there.
     */
    for (current_order = order; current_order < MAX_ORDER; ++current_order) {
        area = &(zone->free_area[current_order]);
        if (list_empty(&area->free_list[migratetype]))
            continue;

        page = list_entry(area->free_list[migratetype].next,
                            struct page, lru);
        list_del(&page->lru);
        rmv_page_order(page); /* clear the buddy marker, i.e. set page->_mapcount = -1 */
        area->nr_free--; /* after removing a block with list_del, nr_free in struct free_area must be decremented */
        /* see the implementation of expand() below */
        expand(zone, page, order, current_order, area, migratetype);
        set_freepage_migratetype(page, migratetype);
        return page;
    }

    return NULL;
}
           

    __rmqueue_smallest() starts searching the zone's free lists at the requested order; if the free_area for the current order has no free block on the list of the requested migratetype, it moves up to the next order.

    Why is that? Because at boot time the free pages are placed, as far as possible, on the MAX_ORDER-1 lists; you can see this right after boot with "cat /proc/pagetypeinfo". Once a free block of the requested migratetype is found in some order's free area, it is taken off the list and expand() is called to "cut the cake": the block taken is usually larger than what was requested, and after cutting, the leftover pieces have to be put back into the buddy system.

get_page_from_freelist()->buffered_rmqueue()->__rmqueue()->__rmqueue_smallest()->expand()

/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- nyc
 */
/* Parameters: page, zone and area have their obvious meanings. low is the order that was
   requested, high is the order the block was actually taken from, and migratetype is the
   migrate type. */
static inline void expand(struct zone *zone, struct page *page,
    int low, int high, struct free_area *area,
    int migratetype)
{
    unsigned long size = 1 << high;

    while (high > low) {
        area--;
        high--;
        size >>= 1;
        VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

        if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
            debug_guardpage_enabled() &&
            high < debug_guardpage_minorder()) {
            /*
             * Mark as guard pages (or page), that will allow to
             * merge back to allocator when buddy will be freed.
             * Corresponding page table entries will not be touched,
             * pages will stay not present in virtual address space
             */
            set_page_guard(zone, &page[size], high, migratetype);
            continue;
        }
        list_add(&page[size].lru, &area->free_list[migratetype]);
        area->nr_free++;
        set_page_order(&page[size], high);
    }
}
           

    It is best to step through the code to understand how it works. Assume the following situation: a block of order 3 is to be allocated; no block of that length is available, so the kernel has picked a block of order 5. The function is then called with:

expand(zone, page, low = 3, high = 5, area, migratetype)
           
[Figure: expand() splitting an order-5 block down to the requested order 3; image omitted]


  1. size is initialised to 2^5 = 32. The allocated block has already been removed from the free_area list in __rmqueue(), which the figure marks with a dashed outline.
  2. In the first loop iteration the kernel switches to the free_area list one order lower (order 4) with the same migrate type, and the block length drops to 16 (computed as size >> 1). The second half of the original block is inserted into the order-4 free_area list. The buddy system only needs the first struct page instance of a block for management purposes; the block length can be derived automatically from the list it sits on.
  3. The address of that second half is computed as &page[size], while the page pointer keeps pointing at the start of the originally allocated block and never moves; its position is marked with an arrow in the figure.
  4. The next iteration puts the second half of the remaining 16 pages onto the free_area list for size = 8; page still does not move. Now the remaining block has the requested length and page can be returned as the result. As the figure shows, the first 8 pages of the original block are used; all the other pages go back onto the appropriate free_area lists of the buddy system.

The kernel always uses the free_area lists specific to one migrate type and does not change a page's migrate type during this processing. (The figure contains some errors: in the bottom mem_map row, page should point at 8 pages, so the last two cells should be white and the first cell light grey, itself split into two cells of 8 pages each.)

    If no contiguous block is available on the free list of the requested migrate type, __rmqueue_smallest() returns a NULL pointer. The kernel then tries to satisfy the request from the lists of other migrate types, in a fallback order; that task is delegated to __rmqueue_fallback(). The fallback order of the migrate types is defined in the fallbacks array.

/*
 * This array describes the order lists are fallen back to when
 * the free lists for the desirable migrate type are depleted
 */
static int fallbacks[MIGRATE_TYPES][4] = {
    [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,     MIGRATE_RESERVE },
    [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,     MIGRATE_RESERVE },
#ifdef CONFIG_CMA
    [MIGRATE_MOVABLE]     = { MIGRATE_CMA,         MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
    [MIGRATE_CMA]         = { MIGRATE_RESERVE }, /* Never used */
#else
    [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
#endif
    [MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
#ifdef CONFIG_MEMORY_ISOLATION
    [MIGRATE_ISOLATE]     = { MIGRATE_RESERVE }, /* Never used */
#endif
};
           

First, the function iterates over the lists of the individual allocation orders once more:

/* Remove an element from the buddy allocator from the fallback list */
/* Not only the requested migrate type is considered, but also the other migrate types named in
the fallback list. Note that this function walks the allocation orders from large to small,
the opposite of the usual strategy (MIGRATE_RESERVE aside): if the kernel cannot avoid
allocating from a different migrate type, it takes the largest block it can. Preferring
smaller blocks would introduce fragmentation into the other lists, because blocks of
different migrate types would end up mixed, which is exactly what we want to avoid.
The special list MIGRATE_RESERVE holds memory for emergency allocations and needs special
handling. If the free list of the migrate type currently being considered contains free
blocks, memory is allocated from it. */
static inline struct page *
__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
{
    struct free_area *area;
    unsigned int current_order;
    struct page *page;

    /* Find the largest possible block of pages in the other list,
        i.e. search the free lists of the fallback migrate types for the largest block available
     */
    for (current_order = MAX_ORDER-1;
                current_order >= order && current_order <= MAX_ORDER-1;
                --current_order) {/* unlike the walk within a single migrate type, this starts at the
                                     largest order and works downwards, precisely to limit fragmentation */
        int i;
        for (i = 0;; i++) {
            int migratetype = fallbacks[start_migratetype][i];
            int buddy_type = start_migratetype;

            /* MIGRATE_RESERVE handled later if necessary */
            if (migratetype == MIGRATE_RESERVE)
                break;

            area = &(zone->free_area[current_order]); /* free_area entry for this (high) order */
            if (list_empty(&area->free_list[migratetype])) /* if the free list for this order and this
                    fallback migrate type is empty, move on to the next fallback type */
                continue;

            page = list_entry(area->free_list[migratetype].next,
                    struct page, lru); /* a free block was found: take the first one off the list */
            area->nr_free--; /* and decrement the free-block count on this order */

            if (!is_migrate_cma(migratetype)) { /* not a CMA area */
                try_to_steal_freepages(zone, page,
                            start_migratetype,
                            migratetype);
            } else {
                /*
                 * When borrowing from MIGRATE_CMA, we need to
                 * release the excess buddy pages to CMA
                 * itself, and we do not try to steal extra
                 * free pages.
                 */
                buddy_type = migratetype;
            }

            /* Remove the page from the freelists */
            list_del(&page->lru);
            rmv_page_order(page); /* clear the buddy marker: this page no longer belongs to the buddy system */

            expand(zone, page, order, current_order, area,
                    buddy_type);

            /*
             * The freepage_migratetype may differ from pageblock's
             * migratetype depending on the decisions in
             * try_to_steal_freepages(). This is OK as long as it
             * does not differ for MIGRATE_CMA pageblocks. For CMA
             * we need to make sure unallocated pages flushed from
             * pcp lists are returned to the correct freelist.
             */
            set_freepage_migratetype(page, buddy_type);

            trace_mm_page_alloc_extfrag(page, order, current_order,
                start_migratetype, migratetype);

            return page;
        }
    }

    return NULL;
}

/*
 * When we are falling back to another migratetype during allocation, try to
 * steal extra free pages from the same pageblocks to satisfy further
 * allocations, instead of polluting multiple pageblocks.
 *
 * If we are stealing a relatively large buddy page, it is likely there will
 * be more free pages in the pageblock, so try to steal them all. For
 * reclaimable and unmovable allocations, we steal regardless of page size,
 * as fragmentation caused by those allocations polluting movable pageblocks
 * is worse than movable allocations stealing from unmovable and reclaimable
 * pageblocks.
 *
 * If we claim more than half of the pageblock, change pageblock's migratetype
 * as well.
 */
static void try_to_steal_freepages(struct zone *zone, struct page *page,
                  int start_type, int fallback_type)
{
    int current_order = page_order(page);

    /* Take ownership for orders >= pageblock_order.
       What counts as a "large" block is defined by the global pageblock_order, the order of a
       pageblock. If a free block taken from another migrate list has to be split, the kernel
       must decide what to do with the remainder. If the remainder is itself a fairly large
       block, it makes sense to move the whole block over to the migrate list of the requested
       type, which reduces fragmentation.
       When allocating reclaimable memory, the kernel moves free pages between migrate lists
       more aggressively: such allocations often come in bursts and would otherwise scatter
       many small reclaimable blocks across all the migrate lists. To avoid that, the remaining
       pages are always moved to the reclaimable list when a MIGRATE_RECLAIMABLE block is
       being allocated. */
    if (current_order >= pageblock_order) { /* pageblock_order = (MAX_ORDER-1) */
        change_pageblock_range(page, current_order, start_type);
        return;
    }

    if (current_order >= pageblock_order / 2 || /* a large block: move it entirely to start_migratetype */
        start_type == MIGRATE_RECLAIMABLE || /* for reclaimable pages the conversion is done more aggressively */
        start_type == MIGRATE_UNMOVABLE ||
        page_group_by_mobility_disabled) {
        int pages;

        pages = move_freepages_block(zone, page, start_type); /* move these pages over to the start_migratetype lists */

        /* Claim the whole block if over half of it is free */
        if (pages >= (1 << (pageblock_order-1)) ||
                page_group_by_mobility_disabled)
            set_pageblock_migratetype(page, start_type); /* this sets the migrate type of the whole pageblock,
                whereas move_freepages_block() above set it for the individual pages */
    }
}

/* Move a whole pageblock worth of pages onto the free lists of the given migratetype. */
int move_freepages_block(struct zone *zone, struct page *page,
                int migratetype)
{
    unsigned long start_pfn, end_pfn;
    struct page *start_page, *end_page;

    start_pfn = page_to_pfn(page); /* page frame number */
    /* pageblock_nr_pages is the number of pages in a pageblock:
       pageblock_nr_pages = (1UL << pageblock_order), e.g. 1024 */
    start_pfn = start_pfn & ~(pageblock_nr_pages-1);
    start_page = pfn_to_page(start_pfn);
    /* prepare to move pageblock_nr_pages pages: migrate-type conversions are normally done in
       units of whole pageblocks of contiguous pages, which keeps fragmentation down */
    end_page = start_page + pageblock_nr_pages - 1;
    end_pfn = start_pfn + pageblock_nr_pages - 1;

    /* Do not cross zone boundaries */
    if (!zone_spans_pfn(zone, start_pfn))
        start_page = page;
    if (!zone_spans_pfn(zone, end_pfn)) /* the range being moved must not straddle a zone boundary */
        return 0;

    return move_freepages(zone, start_page, end_page, migratetype); /* hand the address range whose
        migrate type should be converted over to move_freepages() */
}

/*
 * Move the free pages in a range to the free lists of the requested type.
 * Note that start_page and end_pages are not aligned on a pageblock
 * boundary. If alignment is required, use move_freepages_block()
 */
int move_freepages(struct zone *zone,
              struct page *start_page, struct page *end_page,
              int migratetype)
{
    struct page *page;
    unsigned long order;
    int pages_moved = 0;

#ifndef CONFIG_HOLES_IN_ZONE
    /*
     * page_zone is not safe to call in this context when
     * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
     * anyway as we check zone boundaries in move_freepages_block().
     * Remove at a later date when no bug reports exist related to
     * grouping pages by mobility
     */
    VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
#endif

    for (page = start_page; page <= end_page;) {
        /* Make sure we are not inadvertently changing nodes */
        VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);

        if (!pfn_valid_within(page_to_pfn(page))) {
            page++;
            continue;
        }

        if (!PageBuddy(page)) { /* only pages still owned by the buddy system are moved */
            page++;
            continue;
        }

        order = page_order(page); /* order of this free block */
        list_move(&page->lru,
              &zone->free_area[order].free_list[migratetype]); /* move the block onto the free list of
                                                                  the requested migrate type */
        set_freepage_migratetype(page, migratetype); /* record the migrate type of these pages
                                                        (page->index = migratetype) */
        page += 1 << order; /* 2^order pages are converted in one step */
        pages_moved += 1 << order;
    }

    return pages_moved; /* the whole range has been processed; return how many pages were actually moved */
}
           

Once the requested pages have been allocated, __rmqueue() returns the struct page of the first page of the block. Back in buffered_rmqueue(), zone_statistics() is finally used to update some statistics.

Back in get_page_from_freelist(), the page still has to pass a few interesting checks in prep_new_page() before it can leave the factory.

static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
                                int alloc_flags)
{
    int i;

    for (i = 0; i < (1 << order); i++) {
        struct page *p = page + i;
        if (unlikely(check_new_page(p)))
            return 1;
    }

    set_page_private(page, 0);
    set_page_refcounted(page);

    arch_alloc_page(page, order);
    kernel_map_pages(page, 1 << order, 1);
    kasan_alloc_pages(page, order);

    if (gfp_flags & __GFP_ZERO)
        prep_zero_page(page, order, gfp_flags);

    if (order && (gfp_flags & __GFP_COMP))
        prep_compound_page(page, order);

    set_page_owner(page, order, gfp_flags);

    /*
     * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to
     * allocate the page. The expectation is that the caller is taking
     * steps that will free more memory. The caller should avoid the page
     * being used for !PFMEMALLOC purposes.
     */
    page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);

    return 0;
}


/*
 * This page is about to be returned from the page allocator
 */
static inline int check_new_page(struct page *page)
{
    const char *bad_reason = NULL;
    unsigned long bad_flags = 0;

    if (unlikely(page_mapcount(page))) /* a freshly allocated page must have _mapcount == 0 */
        bad_reason = "nonzero mapcount";
    if (unlikely(page->mapping != NULL)) /* page->mapping must still be NULL at this point */
        bad_reason = "non-NULL mapping";
    if (unlikely(atomic_read(&page->_count) != 0)) /* _count must be 0 here. Note that a page returned by
            alloc_pages() ends up with _count == 1, but only because set_page_refcounted()->set_page_count()
            is called afterwards to set it to 1 */
        bad_reason = "nonzero _count";
    if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { /* the PAGE_FLAGS_CHECK_AT_PREP bits were cleared
            when the page was freed; if any of them is set now, something went wrong during allocation */
        bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
        bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
    }
#ifdef CONFIG_MEMCG
    if (unlikely(page->mem_cgroup))
        bad_reason = "page still charged to cgroup";
#endif
    if (unlikely(bad_reason)) {
        bad_page(page, bad_reason, bad_flags);
        return 1;
    }
    return 0;
}