- 论坛徽章:
- 6
|
本帖最后由 瀚海书香 于 2014-05-14 14:52 编辑
回复 11# humjb_1983
直接看SUSE 内核源代码的patch就可以的
pagecache_limit_ignore_dirty的patch:
- From: Kurt Garloff <garloff@suse.de>
- Subject: Make pagecache limit behavior w.r.t. dirty pages configurable
- References: FATE309111
- Patch-mainline: Never
- The last fixes to this patchset ensured that we don't end up calling
- shrink_page_cache() [from add_to_page_cache()] again and again without
- the ability to actually free something. For this reason we subtracted
- the dirty pages from the list of freeable unmapped pages in the
- calculation.
- With this additional patch, a new sysctl
- /proc/sys/vm/pagecache_limit_ignore_dirty
- is introduced. With the default setting (1), behavior does not change.
- When setting it to 0, we actually consider all of the dirty pages
- freeable -- we then allow for a third pass in shrink_page_cache, where
- we allow writing out pages (if the gfp_mask allows it).
- The value can be set to values above 1 as well; with the value set to 2,
- we consider half of the dirty pages freeable etc.
- Signed-off-by: Kurt Garloff <garloff@suse.de>
- Index: linux-3.0-SLE11-SP2-3.0/include/linux/swap.h
- ===================================================================
- --- linux-3.0-SLE11-SP2-3.0.orig/include/linux/swap.h
- +++ linux-3.0-SLE11-SP2-3.0/include/linux/swap.h
- @@ -266,6 +266,7 @@ extern int vm_swappiness;
- extern unsigned long pagecache_over_limit(void);
- extern void shrink_page_cache(gfp_t mask, struct page *page);
- extern unsigned int vm_pagecache_limit_mb;
- +extern unsigned int vm_pagecache_ignore_dirty;
- extern int remove_mapping(struct address_space *mapping, struct page *page);
- extern long vm_total_pages;
-
- Index: linux-3.0-SLE11-SP2-3.0/kernel/sysctl.c
- ===================================================================
- --- linux-3.0-SLE11-SP2-3.0.orig/kernel/sysctl.c
- +++ linux-3.0-SLE11-SP2-3.0/kernel/sysctl.c
- @@ -1133,6 +1133,13 @@ static struct ctl_table vm_table[] = {
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- + {
- + .procname = "pagecache_limit_ignore_dirty",
- + .data = &vm_pagecache_ignore_dirty,
- + .maxlen = sizeof(vm_pagecache_ignore_dirty),
- + .mode = 0644,
- + .proc_handler = &proc_dointvec,
- + },
- #ifdef CONFIG_HUGETLB_PAGE
- {
- .procname = "nr_hugepages",
- Index: linux-3.0-SLE11-SP2-3.0/mm/vmscan.c
- ===================================================================
- --- linux-3.0-SLE11-SP2-3.0.orig/mm/vmscan.c
- +++ linux-3.0-SLE11-SP2-3.0/mm/vmscan.c
- @@ -150,6 +150,7 @@ struct scan_control {
- */
- int vm_swappiness __read_mostly = 60;
- unsigned int vm_pagecache_limit_mb __read_mostly = 0;
- +unsigned int vm_pagecache_ignore_dirty __read_mostly = 1;
- long vm_total_pages __read_mostly; /* The total number of pages which the VM controls */
-
- static LIST_HEAD(shrinker_list);
- @@ -3012,9 +3013,9 @@ static void __shrink_page_cache(gfp_t ma
- * Shrink the LRU in 2 passes:
- * 0 = Reclaim from inactive_list only (fast)
- * 1 = Reclaim from active list but don't reclaim mapped (not that fast)
- - * 2 = Reclaim from active list but don't reclaim mapped (2nd pass)
- + * 2 = Same as 1, but may_writepage = 1 (only done if we can and need it)
- */
- - for (pass = 0; pass < 2; pass++) {
- + for (pass = 0; pass < 3; pass++) {
- int prio;
-
- for (prio = DEF_PRIORITY; prio >= 0; prio--) {
- @@ -3036,6 +3037,13 @@ static void __shrink_page_cache(gfp_t ma
- goto out;
-
- }
- + if (pass == 1) {
- + if (vm_pagecache_ignore_dirty == 1 ||
- + (mask & (__GFP_IO | __GFP_FS)) != (__GFP_IO | __GFP_FS) )
- + break;
- + else
- + sc.may_writepage = 1;
- + }
- }
-
- out:
- Index: linux-3.0-SLE11-SP2-3.0/mm/page_alloc.c
- ===================================================================
- --- linux-3.0-SLE11-SP2-3.0.orig/mm/page_alloc.c
- +++ linux-3.0-SLE11-SP2-3.0/mm/page_alloc.c
- @@ -5623,13 +5623,15 @@ unsigned long pagecache_over_limit()
- * minus the dirty ones. (FIXME: pages accounted for in NR_WRITEBACK
- * are not on the LRU lists any more, right?) */
- unsigned long pgcache_lru_pages = global_page_state(NR_ACTIVE_FILE)
- - + global_page_state(NR_INACTIVE_FILE)
- - - global_page_state(NR_FILE_DIRTY);
- + + global_page_state(NR_INACTIVE_FILE);
- unsigned long free_pages = global_page_state(NR_FREE_PAGES);
- /* In theory, we'd need to take the swap lock here ... */
- unsigned long swap_pages = total_swap_pages - nr_swap_pages;
- unsigned long limit;
-
- + if (vm_pagecache_ignore_dirty != 0)
- + pgcache_lru_pages -= global_page_state(NR_FILE_DIRTY)
- + /vm_pagecache_ignore_dirty;
- /* Paranoia */
- if (unlikely(pgcache_lru_pages > LONG_MAX))
- return 0;
- Index: linux-3.0-SLE11-SP2-3.0/Documentation/vm/pagecache-limit
- ===================================================================
- --- linux-3.0-SLE11-SP2-3.0.orig/Documentation/vm/pagecache-limit
- +++ linux-3.0-SLE11-SP2-3.0/Documentation/vm/pagecache-limit
- @@ -1,6 +1,6 @@
- Functionality:
- -------------
- -The patch introduces a new tunable in the proc filesystem:
- +The patch introduces two new tunables in the proc filesystem:
-
- /proc/sys/vm/pagecache_limit_mb
-
- @@ -15,6 +15,13 @@ As we only consider pagecache pages that
- NOTE: The real limit depends on the amount of free memory. Every existing free page allows the page cache to grow 8x the amount of free memory above the set baseline. As soon as the free memory is needed, we free up page cache.
-
-
- +/proc/sys/vm/pagecache_limit_ignore_dirty
- +
- +The default for this setting is 1; this means that we don't consider dirty memory to be part of the limited pagecache, as we cannot easily free up dirty memory (we'd need to do writes for this). By setting this to 0, we actually consider dirty (unmapped) memory to be freeable and do a third pass in shrink_page_cache() where we schedule the pages for writeout. Values larger than 1 are also possible and result in a fraction of the dirty pages (1/N of them for a value of N) being considered non-freeable.
- +
- +
- +
- +
- How it works:
- ------------
- The heart of this patch is a new function called shrink_page_cache(). It is called from balance_pgdat (which is the worker for kswapd) if the pagecache is above the limit.
- @@ -27,7 +34,9 @@ shrink_page_cache does several passes:
- This is fast -- but it might not find enough free pages; if that happens,
- the second pass will happen
- - In the second pass, pages from active list will also be considered.
- -- The third pass is just another round of the second pass
- +- The third pass will only happen if pagecache_limit_ignore_dirty is not 1.
- + In that case, the third pass is a repetition of the second pass, but this
- + time we allow pages to be written out.
-
- In all passes, only unmapped pages will be considered.
-
- Index: linux-3.0-SLE11-SP2-3.0/mm/filemap.c
- ===================================================================
- --- linux-3.0-SLE11-SP2-3.0.orig/mm/filemap.c
- +++ linux-3.0-SLE11-SP2-3.0/mm/filemap.c
- @@ -509,6 +509,11 @@ int add_to_page_cache(struct page *page,
-
- if (unlikely(vm_pagecache_limit_mb) && pagecache_over_limit() > 0)
- shrink_page_cache(gfp_mask, page);
- + /* FIXME: If we add dirty pages to pagecache here, and we call
- + * shrink_page_cache(), it might need to write out some pages to
- + * keep us below the set pagecache limit -- in order for that to
- + * be successful, we might need to throttle here and do some
- + * congestion_wait(BLK_RW_ASYNC, HZ/10) here. */
-
- __set_page_locked(page);
- error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
复制代码 |
|