Linux内存管理之页面回收2.。。。

凝望长空 发表于 2012-03-02 11:40

Linux内存管理之页面回收2.。。。

代码如下，对关键部分做了注释：

view plaincopyprint?01.static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
02.{
03. struct anon_vma *anon_vma;
04. struct vm_area_struct *vma;
05. unsigned int mlocked = 0;
06. int ret = SWAP_AGAIN;
07. int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
08.
09. if (MLOCK_PAGES && unlikely(unlock))
10.    ret = SWAP_SUCCESS; /* default for try_to_munlock() */
11. /*如果该页面为匿名映射，返回该页面对应的匿名结构*/
12. anon_vma = page_lock_anon_vma(page);
13. if (!anon_vma)
14.    return ret;
15. /*这里可以看出，vma的anon_vma_node字段链接到
16. anon_vma的head字段*/
17. /*扫描线性区描述符的anon_vma链表*/
18. list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
19.    if (MLOCK_PAGES && unlikely(unlock)) {
20.          if (!((vma->vm_flags & VM_LOCKED) &&
21.                page_mapped_in_vma(page, vma)))
22.             continue;/* must visit all unlocked vmas */
23.          ret = SWAP_MLOCK;/* saw at least one mlocked vma */
24.    } else {
25.          /*对anon_vma链表中的每一个vma线性区描述符
26.          调用该函数*/
27.          ret = try_to_unmap_one(page, vma, flags);
28.          if (ret == SWAP_FAIL || !page_mapped(page))
29.             break;
30.    }
31.    if (ret == SWAP_MLOCK) {
32.          mlocked = try_to_mlock_page(page, vma);
33.          if (mlocked)
34.             break;/* stop if actually mlocked page */
35.    }
36. }
37.
38. page_unlock_anon_vma(anon_vma);
39.
40. if (mlocked)
41.    ret = SWAP_MLOCK; /* actually mlocked the page */
42. else if (ret == SWAP_MLOCK)
43.    ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
44.
45. return ret;
46.}
static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
{
struct anon_vma *anon_vma;
struct vm_area_struct *vma;
unsigned int mlocked = 0;
int ret = SWAP_AGAIN;
int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;

if (MLOCK_PAGES && unlikely(unlock))
ret = SWAP_SUCCESS; /* default for try_to_munlock() */
/*如果该页面为匿名映射，返回该页面对应的匿名结构*/
anon_vma = page_lock_anon_vma(page);
if (!anon_vma)
return ret;
/*这里可以看出，vma的anon_vma_node字段链接到
anon_vma的head字段*/
/*扫描线性区描述符的anon_vma链表*/
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
if (MLOCK_PAGES && unlikely(unlock)) {
if (!((vma->vm_flags & VM_LOCKED) &&
      page_mapped_in_vma(page, vma)))
continue;/* must visit all unlocked vmas */
ret = SWAP_MLOCK;/* saw at least one mlocked vma */
} else {
/*对anon_vma链表中的每一个vma线性区描述符
调用该函数*/
ret = try_to_unmap_one(page, vma, flags);
if (ret == SWAP_FAIL || !page_mapped(page))
break;
}
if (ret == SWAP_MLOCK) {
mlocked = try_to_mlock_page(page, vma);
if (mlocked)
break; /* stop if actually mlocked page */
}
}

page_unlock_anon_vma(anon_vma);

if (mlocked)
ret = SWAP_MLOCK; /* actually mlocked the page */
else if (ret == SWAP_MLOCK)
ret = SWAP_AGAIN; /* saw VM_LOCKED vma */

return ret;
｝ view plaincopyprint?01./*
02. * Subfunctions of try_to_unmap: try_to_unmap_one called
03. * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
04. */
05. /**
06. *page是一个指向目标页描述符的指针；
07. *vma是指向线性区描述符的指针
08. */
09.static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
10.             enum ttu_flags flags)
11.{
12. struct mm_struct *mm = vma->vm_mm;
13. unsigned long address;
14. pte_t *pte;
15. pte_t pteval;
16. spinlock_t *ptl;
17. int ret = SWAP_AGAIN;
18. /*计算出待回收页的线性地址*/
19. address = vma_address(page, vma);
20. if (address == -EFAULT)
21.    goto out;
22. /*获取线性地址对应的页表项地址*/
23. pte = page_check_address(page, mm, address, &ptl, 0);
24. if (!pte)
25.    goto out;
26.
27. /*
28. * If the page is mlock()d, we cannot swap it out.
29. * If it's recently referenced (perhaps page_referenced
30. * skipped over this mm) then we should reactivate it.
31. */
32. /*下面为判断是否可以被回收*/
33. if (!(flags & TTU_IGNORE_MLOCK)) {
34.    if (vma->vm_flags & VM_LOCKED) {
35.          ret = SWAP_MLOCK;
36.          goto out_unmap;
37.    }
38. }
39. if (!(flags & TTU_IGNORE_ACCESS)) {
40.    if (ptep_clear_flush_young_notify(vma, address, pte)) {
41.          ret = SWAP_FAIL;
42.          goto out_unmap;
43.    }
44. }
45.
46. /* Nuke the page table entry. */
47. flush_cache_page(vma, address, page_to_pfn(page));
48. /*更新页表项并冲刷相应的TLB*/
49. pteval = ptep_clear_flush_notify(vma, address, pte);
50.
51. /* Move the dirty bit to the physical page now the pte is gone. */
52. if (pte_dirty(pteval))/*如果是脏页面，置位PG_dirty*/
53.    set_page_dirty(page);
54.
55. /* Update high watermark before we lower rss */
56. /*更新mm的hiwater_rss*/
57. update_hiwater_rss(mm);
58.
59. if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
60.    if (PageAnon(page))
61.          dec_mm_counter(mm, anon_rss);
62.    else
63.          dec_mm_counter(mm, file_rss);
64.    set_pte_at(mm, address, pte,
65.             swp_entry_to_pte(make_hwpoison_entry(page)));
66. } else if (PageAnon(page)) {/*如果是匿名页*/
67.    swp_entry_t entry = { .val = page_private(page) };
68.
69.    if (PageSwapCache(page)) {
70.          /*
71.          * Store the swap location in the pte.
72.          * See handle_pte_fault() ...
73.          */
74.          /*保存换出位置*/
75.          swap_duplicate(entry);
76.          if (list_empty(&mm->mmlist)) {
77.             spin_lock(&mmlist_lock);
78.             if (list_empty(&mm->mmlist))
79.                /*添加到init_mm的相应链表，从这里可以
80.                看出mm->mmlist为交换用的链表*/
81.                list_add(&mm->mmlist, &init_mm.mmlist);
82.             spin_unlock(&mmlist_lock);
83.          }
84.          dec_mm_counter(mm, anon_rss);
85.    } else if (PAGE_MIGRATION) {
86.          /*
87.          * Store the pfn of the page in a special migration
88.          * pte. do_swap_page() will wait until the migration
89.          * pte is removed and then restart fault handling.
90.          */
91.          BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
92.          entry = make_migration_entry(page, pte_write(pteval));
93.    }
94.    set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
95.    BUG_ON(pte_file(*pte));
96. } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
97.    /* Establish migration entry for a file page */
98.    swp_entry_t entry;
99.    entry = make_migration_entry(page, pte_write(pteval));
100.    set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
101. } else
102.    dec_mm_counter(mm, file_rss);
103.
104. /*断开页表项和物理页面的关系*/
105. page_remove_rmap(page);
106. /*释放所分配的缓存*/
107. page_cache_release(page);
108.
109.out_unmap:
110. pte_unmap_unlock(pte, ptl);
111.out:
112. return ret;
113.}
/*
 * Subfunctions of try_to_unmap: try_to_unmap_one called
 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 */
/**
 * try_to_unmap_one - remove one mapping of @page from the page tables of @vma.
 *
 * @page:  pointer to the descriptor of the target page
 * @vma:   pointer to the memory region (VMA) descriptor
 * @flags: ttu_flags; TTU_IGNORE_MLOCK, TTU_IGNORE_ACCESS and
 *         TTU_IGNORE_HWPOISON disable the corresponding checks below, and
 *         TTU_ACTION(flags) selects migration handling
 *
 * Returns SWAP_AGAIN by default (including when no address or pte is found
 * for the page in this vma), SWAP_MLOCK when the vma is VM_LOCKED and
 * TTU_IGNORE_MLOCK is not set, and SWAP_FAIL when
 * ptep_clear_flush_young_notify() reports the pte as referenced and
 * TTU_IGNORE_ACCESS is not set.
 */
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
				enum ttu_flags flags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *pte;
	pte_t pteval;
	spinlock_t *ptl;
	int ret = SWAP_AGAIN;
	/* Compute the linear (virtual) address of the page to be reclaimed. */
	address = vma_address(page, vma);
	if (address == -EFAULT)
		goto out;
	/*
	 * Get the address of the page table entry for that linear address.
	 * ptl is filled in here and handed to pte_unmap_unlock() at out_unmap.
	 */
	pte = page_check_address(page, mm, address, &ptl, 0);
	if (!pte)
		goto out;

	/*
	 * If the page is mlock()d, we cannot swap it out.
	 * If it's recently referenced (perhaps page_referenced
	 * skipped over this mm) then we should reactivate it.
	 */
	/* The checks below decide whether the page may be reclaimed. */
	if (!(flags & TTU_IGNORE_MLOCK)) {
		if (vma->vm_flags & VM_LOCKED) {
			ret = SWAP_MLOCK;
			goto out_unmap;
		}
	}
	if (!(flags & TTU_IGNORE_ACCESS)) {
		if (ptep_clear_flush_young_notify(vma, address, pte)) {
			ret = SWAP_FAIL;
			goto out_unmap;
		}
	}

	/* Nuke the page table entry. */
	flush_cache_page(vma, address, page_to_pfn(page));
	/*
	 * Update the page table entry and flush the corresponding TLB entry;
	 * the previous entry value is kept in pteval.
	 */
	pteval = ptep_clear_flush_notify(vma, address, pte);

	/* Move the dirty bit to the physical page now the pte is gone. */
	if (pte_dirty(pteval))	/* dirty page: set PG_dirty */
		set_page_dirty(page);

	/* Update high watermark before we lower rss */
	/* Update mm's hiwater_rss. */
	update_hiwater_rss(mm);

	if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
		/* Poisoned page: drop the rss count and install a hwpoison entry. */
		if (PageAnon(page))
			dec_mm_counter(mm, anon_rss);
		else
			dec_mm_counter(mm, file_rss);
		set_pte_at(mm, address, pte,
				swp_entry_to_pte(make_hwpoison_entry(page)));
	} else if (PageAnon(page)) {	/* anonymous page */
		swp_entry_t entry = { .val = page_private(page) };

		if (PageSwapCache(page)) {
			/*
			 * Store the swap location in the pte.
			 * See handle_pte_fault() ...
			 */
			/* Record the swap-out location. */
			swap_duplicate(entry);
			/* Unlocked check first, re-checked under mmlist_lock. */
			if (list_empty(&mm->mmlist)) {
				spin_lock(&mmlist_lock);
				if (list_empty(&mm->mmlist))
					/*
					 * Add mm to init_mm's list; this shows that
					 * mm->mmlist is the list used for swapping.
					 */
					list_add(&mm->mmlist, &init_mm.mmlist);
				spin_unlock(&mmlist_lock);
			}
			dec_mm_counter(mm, anon_rss);
		} else if (PAGE_MIGRATION) {
			/*
			 * Store the pfn of the page in a special migration
			 * pte. do_swap_page() will wait until the migration
			 * pte is removed and then restart fault handling.
			 */
			BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
			entry = make_migration_entry(page, pte_write(pteval));
		}
		/* Install the swap or migration entry in place of the old pte. */
		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
		BUG_ON(pte_file(*pte));
	} else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
		/* Establish migration entry for a file page */
		swp_entry_t entry;
		entry = make_migration_entry(page, pte_write(pteval));
		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
	} else
		dec_mm_counter(mm, file_rss);

	/* Break the association between the page table entry and the physical page. */
	page_remove_rmap(page);
	/*
	 * Release via page_cache_release(); presumably this drops the page
	 * reference held for the mapping just removed -- TODO confirm.
	 */
	page_cache_release(page);

out_unmap:
	/* Reached with pte/ptl valid: unmap the pte and release its lock. */
	pte_unmap_unlock(pte, ptl);
out:
	return ret;
} 对于给定的物理页面来说，该函数会根据计算出来的线性地址找到对应的页表项地址，并更新页表项。对于匿名页面来说，换出的位置必须要被保存下来，以便于该页面下次被访问的时候可以被换进来。并非所有的页面都是可以被回收的，比如被 mlock() 函数设置过的内存页，或者最近刚被访问过的页面，等等，都是不可以被回收的。一旦遇上这样的页面，该函数会直接跳出执行并返回错误代码。如果涉及到页缓存中的数据，需要设置页缓存中的数据无效，必要的时候还要置位页面标识符以进行数据回写。该函数还会更新相应的一些页面使用计数器，比如前边提到的 _mapcount 字段，还会相应地更新进程拥有的物理页面数目等。

PFRA具体实现

LRU 链表

   在 Linux 中，操作系统对 LRU 的实现主要是基于一对双向链表：active 链表和 inactive 链表，这两个链表是 Linux 操作系统进行页面回收所依赖的关键数据结构，每个内存区域都存在一对这样的链表。顾名思义，那些经常被访问的处于活跃状态的页面会被放在 active 链表上，而那些虽然可能关联到一个或者多个进程，但是并不经常使用的页面则会被放到 inactive 链表上。页面会在这两个双向链表中移动，操作系统会根据页面的活跃程度来判断应该把页面放到哪个链表上。页面可能会从 active 链表上被转移到 inactive 链表上，也可能从 inactive 链表上被转移到 active 链表上，但是，这种转移并不是每次页面访问都会发生，页面的这种转移发生的间隔有可能比较长。那些最近最少使用的页面会被逐个放到 inactive 链表的尾部。进行页面回收的时候，Linux 操作系统会从 inactive 链表的尾部开始进行回收。

用于描述内存区域的 struct zone 中关于这两个链表以及相关的关键字段的定义如下所示：

view plaincopyprint?01.struct zone {
02. ……
03. spinlock_t          lru_lock;
04. struct list_head active_list;
05. struct list_head inactive_list;
06. unsigned long    nr_active;
07. unsigned long    nr_inactive;
08. ……
09.
10.}
/*
 * Excerpt of struct zone showing only the LRU-related members; all other
 * members are elided.
 */
struct zone {
	/* ... other members elided ... */
	spinlock_t lru_lock;		/* spinlock used by active_list and inactive_list */
	struct list_head active_list;	/* pages of this zone that are in the active state */
	struct list_head inactive_list;	/* pages of this zone that are in the inactive state */
	unsigned long nr_active;	/* number of pages on active_list */
	unsigned long nr_inactive;	/* number of pages on inactive_list */
	/* ... other members elided ... */
};

各字段含义如下所示：

lru_lock：active_list 和 inactive_list 使用的自旋锁。

active_list：管理内存区域中处于活跃状态的页面。

inactive_list：管理内存区域中处于不活跃状态的页面。

nr_active：active_list 链表上的页面数目。

nr_inactive：inactive_list 链表上的页面数目。

如何在两个LRU 链表之间移动页面

   Linux 引入了两个页面标志符 PG_active 和 PG_referenced 用于标识页面的活跃程度，从而决定如何在两个链表之间移动页面。PG_active 用于表示页面当前是否是活跃的，如果该位被置位，则表示该页面是活跃的。PG_referenced 用于表示页面最近是否被访问过，每次页面被访问，该位都会被置位。Linux 必须同时使用这两个标志符来判断页面的活跃程度，假如只是用一个标志符，在页面被访问时，置位该标志符，之后该页面一直处于活跃状态，如果操作系统不清除该标志位，那么即使之后很长一段时间内该页面都没有或很少被访问过，该页面也还是处于活跃状态。为了能够有效清除该标志位，需要有定时器的支持以便于在超时时间之后该标志位可以自动被清除。然而，很多 Linux 支持的体系结构并不能提供这样的硬件支持，所以 Linux 中使用两个标志符来判断页面的活跃程度。

Linux 2.6 中这两个标志符密切合作，其核心思想如下所示：

•如果页面被认为是活跃的，则将该页的 PG_active 置位；否则，不置位。
•当页面被访问时，检查该页的 PG_referenced 位，若未被置位，则置位之；若发现该页的 PG_referenced 已经被置位了，则意味着该页经常被访问，这时，若该页在 inactive 链表上，则置位其 PG_active 位，将其移动到 active 链表上去，并清除其 PG_referenced 位的设置；如果页面的 PG_referenced 位被置位了一段时间后，该页面没有被再次访问，那么 Linux 操作系统会清除该页面的 PG_referenced 位，因为这意味着这个页面最近这段时间都没有被访问。
•PG_referenced 位同样也可以用于页面从 active 链表移动到 inactive 链表。对于某个在 active 链表上的页面来说，其 PG_active 位被置位，如果 PG_referenced 位未被置位，给定一段时间之后，该页面如果还是没有被访问，那么该页面会被清除其 PG_active 位，挪到 inactive 链表上去。
Linux 中实现在 LRU 链表之间移动页面的关键函数如下所示（本文涉及的源代码均是基于 Linux 2.6.18.1 版本的）：

•mark_page_accessed()：当一个页面被访问时，则调用该函数相应地修改 PG_active 和 PG_referenced。
•page_referenced()：当操作系统进行页面回收时，每扫描到一个页面，就会调用该函数设置页面的 PG_referenced 位。如果一个页面的 PG_referenced 位被置位，但是在一定时间内该页面没有被再次访问，那么该页面的 PG_referenced 位会被清除。
•activate_page()：该函数将页面放到 active 链表上去。
•shrink_active_list()：该函数将页面移动到 inactive 链表上去。
LRU 缓存

   前边提到，页面根据其活跃程度会在 active 链表和 inactive 链表之间来回移动，如果要将某个页面插入到这两个链表中去，必须要通过自旋锁以保证对链表的并发访问操作不会出错。为了降低锁的竞争，Linux 提供了一种特殊的缓存：LRU 缓存，用以批量地向 LRU 链表中快速地添加页面。有了 LRU 缓存之后，新页不会被马上添加到相应的链表上去，而是先被放到一个缓冲区中去，当该缓冲区缓存了足够多的页面之后，缓冲区中的页面才会被一次性地全部添加到相应的 LRU 链表中去。Linux 采用这种方法降低了锁的竞争，极大地提升了系统的性能。

LRU 缓存用到了 pagevec 结构，如下所示 :

view plaincopyprint?01.struct pagevec {
02. unsigned long nr;
03. unsigned long cold;
04. struct page *pages[PAGEVEC_SIZE];
05. };
/* Maximum number of page pointers a single pagevec can hold. */
#ifndef PAGEVEC_SIZE
#define PAGEVEC_SIZE 14
#endif

/*
 * pagevec - fixed-size batch of page pointers used by the LRU cache.
 *
 * Pages are collected here first and moved to the LRU list in one go once
 * the vector is full, so the list lock is taken once per batch rather than
 * once per page.
 */
struct pagevec {
	unsigned long nr;	/* presumably the number of slots in use in pages[] -- confirm */
	unsigned long cold;	/* meaning not shown in this excerpt */
	struct page *pages[PAGEVEC_SIZE];	/* array of pointers to page descriptors */
};
pagevec 这个结构就是用来管理 LRU 缓存中的这些页面的。该结构定义了一个数组，这个数组中的项是指向 page 结构的指针。一个 pagevec 结构最多可以存在 14 个这样的项（PAGEVEC_SIZE 的默认值是 14）。当一个 pagevec 的结构满了，那么该 pagevec 中的所有页面会一次性地被移动到相应的 LRU 链表上去。

用来实现 LRU 缓存的两个关键函数是 lru_cache_add() 和 lru_cache_add_active()。前者用于延迟将页面添加到 inactive 链表上去，后者用于延迟将页面添加到 active 链表上去。这两个函数都会将要移动的页面先放到页向量 pagevec 中，当 pagevec 满了（已经装了 14 个页面的描述符指针），pagevec 结构中的所有页面才会被一次性地移动到相应的链表上去。

清风鸟儿 发表于 2012-03-02 11:40

谢谢分享

页: [1]

Chinaunix's Archiver

Linux内存管理之页面回收2.。。。