首先来看当不活跃LRU的页面数量少于活跃LRU的页面数量的情况,shrink_active_list()函数扫描活跃LRU链表,看是否有页面可以迁移到不活跃LRU链表中。
[kswapd()->balance_pgdat()->kswapd_shrink_zone()->shrink_zone()->shrink_lruvec()->shrink_list()->shrink_active_list()]
/*
注意在操作LRU链表时,有一把保护LRU的spinlock锁zone->lru_lock。
isolate_lru_pages()批量地把LRU链表的部分页面先迁移到临时链表中,从而
减少加锁的时间。
*/
static void shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,
struct scan_control *sc,
enum lru_list lru)
{
unsigned long nr_taken;
unsigned long nr_scanned;
unsigned long vm_flags;
/*定义了三个临时链表:l_hold/l_active/l_inactive */
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
struct page *page;
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
unsigned long nr_rotated = 0;
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
/*从lruvec结构返回zone数据结构*/
struct zone *zone = lruvec_zone(lruvec);
lru_add_drain();
if (!sc->may_unmap)
isolate_mode |= ISOLATE_UNMAPPED;
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;
/*申请zone->lru_lock锁来保护LRU链表操作*/
spin_lock_irq(&zone->lru_lock);
/*isolate_lru_pages()批量地从LRU链表中分离nr_to_scan个页面到l_hold链表中,
这里会根据isolate_mode来考虑一些特殊情况,基本上就是把LRU链表的页面迁移到临时
l_hold链表中。下面查看isolate_lru_pages()函数实现*/
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, isolate_mode, lru);
if (global_reclaim(sc))
__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);/*增加zone中的NR_PAGES_SCANNED计数*/
/*增加recent_scanned[]计数,在get_scan_count()计算匿名页面和文件缓存页面
分别扫描数量时会用到。*/
reclaim_stat->recent_scanned[file] += nr_taken;
/*增加zone中PGREFILL、NR_LRU_BASE和NR_ISOLATED_ANON计数*/
__count_zone_vm_events(PGREFILL, zone, nr_scanned);
__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
spin_unlock_irq(&zone->lru_lock);
/*下面开始扫描临时l_hold链表的页面,有些页面会添加到l_active中,有些会加入到
l_inactive中。*/
while (!list_empty(&l_hold)) {
cond_resched();
page = lru_to_page(&l_hold);
list_del(&page->lru);
/*如果页面是不可回收的,那么就把它返回到不可回收的LRU链表中*/
if (unlikely(!page_evictable(page))) {
putback_lru_page(page);
continue;
}
if (unlikely(buffer_heads_over_limit)) {
if (page_has_private(page) && trylock_page(page)) {
if (page_has_private(page))
try_to_release_page(page, 0);
unlock_page(page);
}
}
/*page_referenced()函数返回该页最近访问引用pte的个数,返回0表示最近没有被访问过。
除了可执行的page cache页面,其他被访问引用的页面(referenced page)为什么都被加入
到不活跃链表里,而不是继续待在活跃LRU链表中呢?
把最近又访问引用的页面全部迁移到活跃LRU链表会产生一个比较大的可扩展性问题(scalability
problem)。在一个内存很大的系统中,当系统用完了这些空闲内存时,每个页面都会被访问引用到,
这种情况下我们不仅没有时间去扫描活跃LRU链表,而且还重新设置访问比特位(referenced bit),
而这些信息没有什么用处。所以从Linux2.6.28开始,扫描活跃链表时会把页面全部都迁移到不活跃
链表中。这里只需要清理硬件的访问比特位(page_referenced()来完成),当有访问引用时,扫描不
活跃LRU链表就迁移回到活跃LRU链表中。
让可执行的page cache页面(mapped executable file pages)继续保存在活跃链表中,在扫描活跃
链表期间它们可能再次被访问到,因为LRU链表的扫描顺序是先扫描不活跃链表,然后再扫描活跃链表
且扫描不活跃链表的速度要快于活跃链表,因此它们可以获得比较多的时间让用户进程再次访问,从而
提高用户进程的交互体验。可执行的页面通常是vma的属性标记着VM_EXEC,这些页面通常包括可执行的
文件和它们连接的库文件等。*/
if (page_referenced(page, 0, sc->target_mem_cgroup,
&vm_flags)) {
nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
* that executable code get better chances to stay in
* memory under moderate memory pressure. Anon pages
* are not likely to be evicted by use-once streaming
* IO, plus JVM can create lots of anon VM_EXEC pages,
* so we ignore them here.
*/
if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
list_add(&page->lru, &l_active);
continue;
}
}
/*如果页面没有被引用,那么加入到l_inactive链表*/
ClearPageActive(page); /* we are de-activating */
list_add(&page->lru, &l_inactive);
}
/*
* Move pages back to the lru list.
*/
spin_lock_irq(&zone->lru_lock);
/*
* Count referenced pages from currently used mappings as rotated,
* even though only some of them are actually re-activated. This
* helps balance scan pressure between file and anonymous pages in
* get_scan_count.
*/
/*这里把最近被引用的页面(referenced pages)统计到recent_rotated中,以便在下一次扫描
时在 get_scan_count()中重新计算匿名页面和文件映射页面LRU链表的扫描比重。*/
reclaim_stat->recent_rotated[file] += nr_rotated;
/*把l_inactive和l_active链表的页迁移到LRU相应的链表中。*/
move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
mem_cgroup_uncharge_list(&l_hold);
/*l_hold链表是剩下的页面,表示可以释放*/
free_hot_cold_page_list(&l_hold, true);
}
isolate_lru_pages()函数实现:批量把LRU部分链表迁移到临时链表
[shrink_active_list()->isolate_lru_pages()]
/*
* zone->lru_lock is heavily contended. Some of the functions that
* shrink the lists perform better by taking out a batch of pages
* and working on them outside the LRU lock.
*
* For pagecache intensive workloads, this function is the hottest
* spot in the kernel (apart from copy_*_user functions).
*
* Appropriate locks must be held before calling this function.
*
* @nr_to_scan: The number of pages to look through on the list.
* @lruvec: The LRU vector to pull pages from.
* @dst: The temp list to put pages on to.
* @nr_scanned: The number of pages that were scanned.
* @sc: The scan_control struct for this reclaim session
* @mode: One of the LRU isolation modes
* @lru: LRU list id for isolating
*
* returns how many pages were moved onto *@dst.
*/
/*参数说明:
@nr_to_scan: 表示在这个链表中扫描页面的个数
@lruvec: LRU链表集合
@dst: 临时存放的链表
@nr_scanned: 已经扫描的页面个数
@sc: 页面回收的控制数据结构struct scan_control
@mode: 分离LRU的模式
@lru:哪类LRU
*/
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct lruvec *lruvec, struct list_head *dst,
unsigned long *nr_scanned, struct scan_control *sc,
isolate_mode_t mode, enum lru_list lru)
{
struct list_head *src = &lruvec->lists[lru];
unsigned long nr_taken = 0;
unsigned long scan;
for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
struct page *page;
int nr_pages;
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
VM_BUG_ON_PAGE(!PageLRU(page), page);
/*调用__isolate_lru_page()来分离页面,返回0,则表示分离成功,并
把页面迁移到dst临时链表中。下面查看__isolate_lru_page实现*/
switch (__isolate_lru_page(page, mode)) {
case 0:
nr_pages = hpage_nr_pages(page);
mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
list_move(&page->lru, dst);
nr_taken += nr_pages;
break;
case -EBUSY:
/* else it is being freed elsewhere */
list_move(&page->lru, src);
continue;
default:
BUG();
}
}
*nr_scanned = scan;
trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
nr_taken, mode, is_file_lru(lru));
return nr_taken;
}
返回shrink_active_list()函数
__isolate_lru_page()函数:
[shrink_active_list()->isolate_lru_pages()->__isolate_lru_page()]
/*
* Attempt to remove the specified page from its LRU. Only take this page
* if it is of the appropriate PageActive status. Pages which are being
* freed elsewhere are also ignored.
*
* page: page to consider
* mode: one of the LRU isolation modes defined above
*
* returns 0 on success, -ve errno on failure.
*/
/*
分离页面有如下4种类型:
(1) ISOLATE_CLEAN: 分离干净的页面。
(2) ISOLATE_UNMAPPED: 分离没有映射的页面。
(3) ISOLATE_ASYNC_MIGRATE: 分离异步合并的页面。
(4) ISOLATE_UNEVICTABLE: 分离不可回收的页面。
*/
int __isolate_lru_page(struct page *page, isolate_mode_t mode)
{
int ret = -EINVAL;
/* Only take pages on the LRU. */
if (!PageLRU(page))
return ret;
/* Compaction should not handle unevictable pages but CMA can do so */
/*如果page是不可回收的且mode不等于ISOLATE_UNEVICTABLE,则返回错误*/
if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
return ret;
ret = -EBUSY;
/*
* To minimise LRU disruption, the caller can indicate that it only
* wants to isolate pages it will be able to operate on without
* blocking - clean pages for the most part.
*
* ISOLATE_CLEAN means that only clean pages should be isolated. This
* is used by reclaim when it is cannot write to backing storage
*
* ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages
* that it is possible to migrate without blocking
*/
/*分离ISOLATE_CLEAN和ISOLATE_ASYNC_MIGRATE情况的页面*/
if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
/* All the caller can do on PageWriteback is block */
if (PageWriteback(page))
return ret;
if (PageDirty(page)) {
struct address_space *mapping;
/* ISOLATE_CLEAN means only clean pages */
if (mode & ISOLATE_CLEAN)
return ret;
/*
* Only pages without mappings or that have a
* ->migratepage callback are possible to migrate
* without blocking
*/
mapping = page_mapping(page);
if (mapping && !mapping->a_ops->migratepage)
return ret;
}
}
/*如果mode是ISOLATE_UNMAPPED,但是page有mapped,那么返回-EBUSY*/
if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
return ret;
/*get_page_unless_zero()是为page->_count引用计数加1,并且判断加1之后是否等于0,
也就是说,这个page不能是空闲页面,否则返回-EBUSY*/
if (likely(get_page_unless_zero(page))) {
/*
* Be careful not to clear PageLRU until after we're
* sure the page is not being freed elsewhere -- the
* page release code relies on it.
*/
ClearPageLRU(page);
ret = 0;
}
return ret;
}