天天看点

14.5 shrink_active_list函数

首先来看当不活跃LRU的页面数量少于活跃LRU的页面数量的情况,shrink_active_list()函数扫描活跃LRU链表,看是否有页面可以迁移到不活跃LRU链表中。

[kswapd()->balance_pgdat()->kswapd_shrink_zone()->shrink_zone()->shrink_lruvec()->shrink_list()->shrink_active_list()]

/*
 * shrink_active_list - scan part of an active LRU list and sort its pages.
 *
 * Isolates a batch of pages from the @lru list of @lruvec onto a private
 * list, classifies every isolated page as "stay active" or "deactivate",
 * and then moves the two resulting lists back with
 * move_active_pages_to_lru().
 *
 * The LRU lists are protected by the zone->lru_lock spinlock.
 * isolate_lru_pages() moves a batch of pages onto a temporary list first,
 * so the per-page work below runs with the lock dropped and the time spent
 * holding the lock is reduced.
 *
 * @nr_to_scan: number of pages to look through on the list
 * @lruvec:     LRU vector whose list is scanned
 * @sc:         scan control; may_unmap, may_writepage and target_mem_cgroup
 *              are read here
 * @lru:        id of the LRU list to scan; deactivated pages are moved to
 *              list "lru - LRU_ACTIVE" (presumably the matching inactive
 *              list -- confirm against enum lru_list)
 */

static void shrink_active_list(unsigned long nr_to_scan,
                   struct lruvec *lruvec,
                   struct scan_control *sc,
                   enum lru_list lru)
{
    unsigned long nr_taken;
    unsigned long nr_scanned;
    unsigned long vm_flags;
    /* Three temporary lists: l_hold, l_active and l_inactive. */
    LIST_HEAD(l_hold);  /* The pages which were snipped off */
    LIST_HEAD(l_active);
    LIST_HEAD(l_inactive);
    struct page *page;
    struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
    unsigned long nr_rotated = 0;
    isolate_mode_t isolate_mode = 0;
    int file = is_file_lru(lru);
    /* Look up the zone that this lruvec belongs to. */
    struct zone *zone = lruvec_zone(lruvec);
    lru_add_drain();

    /*
     * Translate the scan_control restrictions into isolation mode bits:
     * if unmapping is not allowed only unmapped pages are isolated, and
     * if writing pages is not allowed only clean pages are isolated.
     */
    if (!sc->may_unmap)
        isolate_mode |= ISOLATE_UNMAPPED;
    if (!sc->may_writepage)
        isolate_mode |= ISOLATE_CLEAN;
    
    /* Take zone->lru_lock to protect the LRU list manipulation. */
    spin_lock_irq(&zone->lru_lock);

    /*
     * isolate_lru_pages() looks through up to nr_to_scan pages of the LRU
     * list and moves those it can isolate onto the temporary l_hold list.
     * isolate_mode filters out pages that must not be taken. nr_scanned
     * receives the number of pages examined, nr_taken the number moved.
     */
    nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
                     &nr_scanned, sc, isolate_mode, lru);
    if (global_reclaim(sc))
        __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);/* add the scanned pages to the zone's NR_PAGES_SCANNED counter */

    /*
     * Account the isolated pages in recent_scanned[]. According to the
     * original annotation this is used by get_scan_count() when it
     * decides how many anonymous and file pages to scan.
     */
    reclaim_stat->recent_scanned[file] += nr_taken;

    /*
     * Update the zone statistics: count nr_scanned PGREFILL events,
     * subtract the isolated pages from this LRU list's counter
     * (NR_LRU_BASE + lru) and add them to the isolated-pages counter
     * (NR_ISOLATED_ANON + file).
     */
    __count_zone_vm_events(PGREFILL, zone, nr_scanned);
    __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
    __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
    spin_unlock_irq(&zone->lru_lock);

    /*
     * Walk the temporary l_hold list with the lock dropped. Each page
     * ends up on l_active, on l_inactive, or is put back right away.
     */
    while (!list_empty(&l_hold)) {
        cond_resched();
        page = lru_to_page(&l_hold);
        list_del(&page->lru);
        /*
         * A page that is not evictable is handed back through
         * putback_lru_page() and is not classified any further.
         */
        if (unlikely(!page_evictable(page))) {
            putback_lru_page(page);
            continue;
        }

        /*
         * When buffer_heads_over_limit is set, try to release the
         * page's private data. page_has_private() is checked again
         * after trylock_page() succeeds, before releasing.
         */
        if (unlikely(buffer_heads_over_limit)) {
            if (page_has_private(page) && trylock_page(page)) {
                if (page_has_private(page))
                    try_to_release_page(page, 0);
                unlock_page(page);
            }
        }
        /*
         * page_referenced() returns the number of ptes that recently
         * referenced this page; 0 means it was not accessed recently.
         *
         * Why are referenced pages, other than executable page cache,
         * moved to the inactive list instead of staying on the active
         * list? Re-activating every recently referenced page creates a
         * scalability problem. On a system with a lot of memory, by
         * the time free memory runs out every page has been
         * referenced; there is no time to scan the whole active list
         * and reset the referenced bits, and that information would
         * be useless anyway. So since Linux 2.6.28 scanning the active
         * list deactivates the pages. Only the hardware referenced bit
         * has to be cleared here (page_referenced() does that); a page
         * that is referenced again is moved back to the active list
         * when the inactive list is scanned.
         *
         * Mapped executable file pages are kept on the active list.
         * They may be accessed again while the active list is being
         * scanned: the inactive list is scanned before the active
         * list, and faster, so these pages get more time to be touched
         * again by user processes, which improves interactivity.
         * Executable pages are those whose vma carries VM_EXEC,
         * typically executables and the libraries they link against.
         */
        if (page_referenced(page, 0, sc->target_mem_cgroup,
                    &vm_flags)) {
            nr_rotated += hpage_nr_pages(page);
            /*
             * Identify referenced, file-backed active pages and
             * give them one more trip around the active list. So
             * that executable code get better chances to stay in
             * memory under moderate memory pressure.  Anon pages
             * are not likely to be evicted by use-once streaming
             * IO, plus JVM can create lots of anon VM_EXEC pages,
             * so we ignore them here.
             */
            if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
                list_add(&page->lru, &l_active);
                continue;
            }
        }

        /*
         * Every other page is deactivated: pages that were not
         * referenced, and referenced pages that are not executable
         * file cache. They go onto l_inactive.
         */
        ClearPageActive(page);  /* we are de-activating */
        list_add(&page->lru, &l_inactive);
    }

    /*
     * Move pages back to the lru list.
     */
    spin_lock_irq(&zone->lru_lock);
    /*
     * Count referenced pages from currently used mappings as rotated,
     * even though only some of them are actually re-activated.  This
     * helps balance scan pressure between file and anonymous pages in
     * get_scan_count.
     */
    /*
     * The recently referenced pages are accumulated in recent_rotated[]
     * so that the next get_scan_count() can recompute the scan ratio
     * between the anonymous and the file LRU lists.
     */
    reclaim_stat->recent_rotated[file] += nr_rotated;

    /*
     * Move the pages on l_active and l_inactive to their LRU lists:
     * l_active goes back to @lru, l_inactive goes to lru - LRU_ACTIVE.
     */
    move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
    move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
    __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
    spin_unlock_irq(&zone->lru_lock);

    mem_cgroup_uncharge_list(&l_hold);
    /*
     * l_hold was also handed to move_active_pages_to_lru(); whatever
     * is on it now are leftover pages that can be freed.
     */
    free_hot_cold_page_list(&l_hold, true);
}
           

isolate_lru_pages()函数实现:批量地把LRU链表中的部分页面迁移到临时链表

[shrink_active_list()->isolate_lru_pages()]

/*
 * isolate_lru_pages - take a batch of pages off one LRU list.
 *
 * zone->lru_lock is heavily contended, so the functions that shrink the
 * lists pull out a batch of pages and work on them outside the LRU lock.
 * For pagecache intensive workloads this function is the hottest spot in
 * the kernel (apart from the copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan: number of pages to look through on the list
 * @lruvec:     LRU vector (set of LRU lists) to pull pages from
 * @dst:        temporary list that receives the isolated pages
 * @nr_scanned: output, number of pages that were looked at
 * @sc:         scan_control of this reclaim session (only sc->order is
 *              read here, for the trace event)
 * @mode:       LRU isolation mode bits, passed to __isolate_lru_page()
 * @lru:        id of the LRU list to isolate from
 *
 * Returns how many pages were moved onto *@dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
        struct lruvec *lruvec, struct list_head *dst,
        unsigned long *nr_scanned, struct scan_control *sc,
        isolate_mode_t mode, enum lru_list lru)
{
    struct list_head *src = &lruvec->lists[lru];
    unsigned long isolated = 0;
    unsigned long scanned;

    for (scanned = 0; scanned < nr_to_scan && !list_empty(src); scanned++) {
        struct page *page = lru_to_page(src);
        int rc;

        prefetchw_prev_lru_page(page, src, flags);

        VM_BUG_ON_PAGE(!PageLRU(page), page);

        /* 0 means the page was isolated and now belongs on @dst. */
        rc = __isolate_lru_page(page, mode);
        if (rc == 0) {
            int nr_pages = hpage_nr_pages(page);

            mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
            list_move(&page->lru, dst);
            isolated += nr_pages;
        } else if (rc == -EBUSY) {
            /*
             * The page cannot be taken right now (for example it is
             * being freed elsewhere): re-queue it on the source list
             * and carry on. It still counts as scanned.
             */
            list_move(&page->lru, src);
        } else {
            /* No other return value is expected here. */
            BUG();
        }
    }

    *nr_scanned = scanned;
    trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scanned,
                    isolated, mode, is_file_lru(lru));
    return isolated;
}
返回shrink_active_list()函数
           

__isolate_lru_page()函数:

[shrink_active_list()->isolate_lru_pages()->__isolate_lru_page()]

/*
 * __isolate_lru_page - try to detach one page from its LRU list.
 *
 * Only a page that is on an LRU list is taken, and only if it passes the
 * filters requested in @mode. Pages that are being freed elsewhere are
 * left alone.
 *
 * @page: page to consider
 * @mode: LRU isolation mode bits. The ones handled here are:
 *        ISOLATE_CLEAN         - isolate clean pages only
 *        ISOLATE_UNMAPPED      - isolate unmapped pages only
 *        ISOLATE_ASYNC_MIGRATE - isolate only pages that can be migrated
 *                                without blocking
 *        ISOLATE_UNEVICTABLE   - unevictable pages may be isolated too
 *
 * Returns 0 on success. Returns -EINVAL if the page is not on an LRU list,
 * or is unevictable while ISOLATE_UNEVICTABLE is not set. Returns -EBUSY if
 * a mode filter rejects the page or no reference could be taken.
 */
int __isolate_lru_page(struct page *page, isolate_mode_t mode)
{
    /* Only take pages on the LRU. */
    if (!PageLRU(page))
        return -EINVAL;

    /*
     * Compaction should not handle unevictable pages but CMA can do so:
     * an unevictable page is refused unless the caller asked for it.
     */
    if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
        return -EINVAL;

    /*
     * To minimise LRU disruption the caller can ask for only those pages
     * it can operate on without blocking, which mostly means clean pages.
     *
     * ISOLATE_CLEAN: only clean pages; used by reclaim when it cannot
     * write to backing storage.
     *
     * ISOLATE_ASYNC_MIGRATE: only pages that can be migrated without
     * blocking.
     */
    if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
        /* All the caller can do on PageWriteback is block. */
        if (PageWriteback(page))
            return -EBUSY;

        if (PageDirty(page)) {
            struct address_space *mapping;

            /* ISOLATE_CLEAN never accepts a dirty page. */
            if (mode & ISOLATE_CLEAN)
                return -EBUSY;

            /*
             * A dirty page can be migrated without blocking only
             * if it has no mapping or its mapping provides a
             * ->migratepage callback.
             */
            mapping = page_mapping(page);
            if (mapping && !mapping->a_ops->migratepage)
                return -EBUSY;
        }
    }

    /* ISOLATE_UNMAPPED refuses any page that is still mapped. */
    if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
        return -EBUSY;

    /*
     * get_page_unless_zero() is expected to take a reference only when
     * the page's reference count is not already zero, i.e. when the page
     * is not a free page. Be careful not to clear PageLRU until we are
     * sure the page is not being freed elsewhere -- the page release code
     * relies on it.
     */
    if (likely(get_page_unless_zero(page))) {
        ClearPageLRU(page);
        return 0;
    }

    return -EBUSY;
}