From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
To: BK Commits List:;
Subject: [PATCH] hot-n-cold pages: page allocator core
Date: Wed, 30 Oct 2002 23:35:53 +0000
ChangeSet 1.909, 2002/10/30 15:35:53-08:00, akpm@digeo.com
[PATCH] hot-n-cold pages: page allocator core
Hot/Cold pages and zone->lock amortisation
# This patch includes the following deltas:
# ChangeSet 1.908 -> 1.909
# include/linux/mmzone.h 1.28 -> 1.29
# include/linux/mm.h 1.89 -> 1.90
# mm/page_alloc.c 1.119 -> 1.120
# include/linux/gfp.h 1.8 -> 1.9
# mm/swap.c 1.37 -> 1.38
#
 include/linux/gfp.h    |    7 +-
 include/linux/mm.h     |    1
 include/linux/mmzone.h |   17 +++++
 mm/page_alloc.c        |  160 ++++++++++++++++++++++++++++++++++++++-----------
 mm/swap.c              |    5 -
 5 files changed, 151 insertions(+), 39 deletions(-)
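
In short (my summary, not the original changelog text): each zone gains a small per-CPU stash of free 0-order pages, split into a "hot" list for pages that are probably still in that CPU's cache and a "cold" list for pages that are not. Order-0 allocations and frees work against these lists and only touch the buddy lists, and hence zone->lock, in batches of pcp->batch pages, which is where the lock amortisation in the title comes from. A caller that knows the CPU will not touch the data (DMA targets, readahead and the like) can pass the new __GFP_COLD flag to draw from the cold list, and the new free_cold_page() returns such pages there. A minimal, hypothetical caller might look like this; grab_dma_buffer()/release_dma_buffer() are invented names, not functions from this patch:

#include <linux/gfp.h>
#include <linux/mm.h>

/* Hypothetical example only: a buffer that is filled by DMA and never
 * read by this CPU gains nothing from being cache-hot, so ask for a
 * cold page and hand it back to the cold list when done. */
static struct page *grab_dma_buffer(void)
{
	return alloc_page(GFP_KERNEL | __GFP_COLD);
}

static void release_dma_buffer(struct page *page)
{
	if (!PageReserved(page) && put_page_testzero(page))
		free_cold_page(page);	/* bypass the hot list on the way back */
}
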
diff -Nru a/include/linux/gfp.h b/include/linux/gfp.h
--- a/include/linux/gfp.h Wed Oct 30 16:21:56 2002
+++ b/include/linux/gfp.h Wed Oct 30 16:21:56 2002
@@ -17,6 +17,7 @@
#define __GFP_IO 0x40 /* Can start low memory physical IO? */
#define __GFP_HIGHIO 0x80 /* Can start high mem physical IO? */
#define __GFP_FS 0x100 /* Can call down to low-level FS? */
+#define __GFP_COLD 0x200 /* Cache-cold page required */
#define GFP_NOHIGHIO ( __GFP_WAIT | __GFP_IO)
#define GFP_NOIO ( __GFP_WAIT)
@@ -32,6 +33,7 @@
#define GFP_DMA __GFP_DMA
+
/*
* There is only one page-allocator function, and two main namespaces to
* it. The alloc_page*() variants return 'struct page *' and as such
@@ -77,11 +79,10 @@
#define __get_dma_pages(gfp_mask, order) \
__get_free_pages((gfp_mask) | GFP_DMA,(order))
-/*
- * There is only one 'core' page-freeing function.
- */
extern void FASTCALL(__free_pages(struct page *page, unsigned int order));
extern void FASTCALL(free_pages(unsigned long addr, unsigned int order));
+extern void FASTCALL(free_hot_page(struct page *page));
+extern void FASTCALL(free_cold_page(struct page *page));
#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr),0)
diff -Nru a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h Wed Oct 30 16:21:56 2002
+++ b/include/linux/mm.h Wed Oct 30 16:21:56 2002
@@ -211,7 +211,6 @@
#define set_page_count(p,v) atomic_set(&(p)->count, v)
extern void FASTCALL(__page_cache_release(struct page *));
-void FASTCALL(__free_pages_ok(struct page *page, unsigned int order));
static inline void put_page(struct page *page)
{
diff -Nru a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h Wed Oct 30 16:21:56 2002
+++ b/include/linux/mmzone.h Wed Oct 30 16:21:56 2002
@@ -9,6 +9,7 @@
#include <linux/list.h>
#include <linux/wait.h>
#include <linux/cache.h>
+#include <linux/threads.h>
#include <asm/atomic.h>
#ifdef CONFIG_DISCONTIGMEM
#include <asm/numnodes.h>
@@ -46,6 +47,18 @@
#define ZONE_PADDING(name)
#endif
+struct per_cpu_pages {
+ int count; /* number of pages in the list */
+ int low; /* low watermark, refill needed */
+ int high; /* high watermark, emptying needed */
+ int batch; /* chunk size for buddy add/remove */
+ struct list_head list; /* the list of pages */
+};
+
+struct per_cpu_pageset {
+ struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
+} ____cacheline_aligned_in_smp;
+
/*
* On machines where it is needed (eg PCs) we divide physical memory
* into multiple physical zones. On a PC we have 3 zones:
@@ -106,6 +119,10 @@
wait_queue_head_t * wait_table;
unsigned long wait_table_size;
unsigned long wait_table_bits;
+
+ ZONE_PADDING(_pad3_)
+
+ struct per_cpu_pageset pageset[NR_CPUS];
/*
* Discontig memory support fields.
diff -Nru a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c Wed Oct 30 16:21:56 2002
+++ b/mm/page_alloc.c Wed Oct 30 16:21:56 2002
@@ -10,6 +10,8 @@
* Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
* Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
* Zone balancing, Kanoj Sarcar, SGI, Jan 2000
+ * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
+ * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
*/
#include <linux/config.h>
@@ -151,13 +153,14 @@
* Assumes all pages on list are in same zone, and of same order.
* count is the number of pages to free, or 0 for all on the list.
*/
-static void
+static int
free_pages_bulk(struct zone *zone, int count,
struct list_head *list, unsigned int order)
{
unsigned long mask, flags;
struct free_area *area;
struct page *base, *page = NULL;
+ int ret = 0;
mask = (~0UL) << order;
base = zone->zone_mem_map;
@@ -169,8 +172,10 @@
list_del(&page->list);
__free_pages_bulk(page, base, zone, area, mask, order);
mod_page_state(pgfree, count<<order);
+ ret++;
}
spin_unlock_irqrestore(&zone->lock, flags);
+ return ret;
}
void __free_pages_ok(struct page *page, unsigned int order)
@@ -201,14 +206,13 @@
index += size;
page += size;
}
- BUG_ON(bad_range(zone, page));
return page;
}
/*
* This page is about to be returned from the page allocator
*/
-static inline void prep_new_page(struct page *page)
+static void prep_new_page(struct page *page)
{
if ( page->mapping ||
page_mapped(page) ||
@@ -248,36 +252,17 @@
continue;
page = list_entry(curr, struct page, list);
- BUG_ON(bad_range(zone, page));
list_del(curr);
index = page - zone->zone_mem_map;
if (current_order != MAX_ORDER-1)
MARK_USED(index, current_order, area);
zone->free_pages -= 1UL << order;
- page = expand(zone, page, index, order, current_order, area);
- return page;
+ return expand(zone, page, index, order, current_order, area);
}
return NULL;
}
-/* Obtain a single element from the buddy allocator */
-static struct page *rmqueue(struct zone *zone, unsigned int order)
-{
- unsigned long flags;
- struct page *page;
-
- spin_lock_irqsave(&zone->lock, flags);
- page = __rmqueue(zone, order);
- spin_unlock_irqrestore(&zone->lock, flags);
-
- if (page != NULL) {
- BUG_ON(bad_range(zone, page));
- prep_new_page(page);
- }
- return page;
-}
-
/*
* Obtain a specified number of elements from the buddy allocator, all under
* a single hold of the lock, for efficiency. Add them to the supplied list.
@@ -341,6 +326,72 @@
#endif /* CONFIG_SOFTWARE_SUSPEND */
/*
+ * Free a 0-order page
+ */
+static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
+static void free_hot_cold_page(struct page *page, int cold)
+{
+ struct zone *zone = page_zone(page);
+ struct per_cpu_pages *pcp;
+ unsigned long flags;
+
+ free_pages_check(__FUNCTION__, page);
+ pcp = &zone->pageset[get_cpu()].pcp[cold];
+ local_irq_save(flags);
+ if (pcp->count >= pcp->high)
+ pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+ list_add(&page->list, &pcp->list);
+ pcp->count++;
+ local_irq_restore(flags);
+ put_cpu();
+}
+
+void free_hot_page(struct page *page)
+{
+ free_hot_cold_page(page, 0);
+}
+
+void free_cold_page(struct page *page)
+{
+ free_hot_cold_page(page, 1);
+}
+
+static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
+{
+ unsigned long flags;
+ struct page *page = NULL;
+
+ if (order == 0) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &zone->pageset[get_cpu()].pcp[cold];
+ local_irq_save(flags);
+ if (pcp->count <= pcp->low)
+ pcp->count += rmqueue_bulk(zone, 0,
+ pcp->batch, &pcp->list);
+ if (pcp->count) {
+ page = list_entry(pcp->list.next, struct page, list);
+ list_del(&page->list);
+ pcp->count--;
+ }
+ local_irq_restore(flags);
+ put_cpu();
+ }
+
+ if (page == NULL) {
+ spin_lock_irqsave(&zone->lock, flags);
+ page = __rmqueue(zone, order);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+
+ if (page != NULL) {
+ BUG_ON(bad_range(zone, page));
+ prep_new_page(page);
+ }
+ return page;
+}
+
+/*
* This is the 'heart' of the zoned buddy allocator:
*/
struct page *
@@ -349,13 +400,18 @@
{
unsigned long min;
struct zone **zones, *classzone;
- struct page * page;
+ struct page *page;
int cflags;
int i;
+ int cold;
if (gfp_mask & __GFP_WAIT)
might_sleep();
+ cold = 0;
+ if (gfp_mask & __GFP_COLD)
+ cold = 1;
+
mod_page_state(pgalloc, 1<<order);
zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
@@ -371,7 +427,7 @@
/* the incremental min is allegedly to discourage fallback */
min += z->pages_low;
if (z->free_pages > min || z->free_pages >= z->pages_high) {
- page = rmqueue(z, order);
+ page = buffered_rmqueue(z, order, cold);
if (page)
return page;
}
@@ -396,7 +452,7 @@
local_min >>= 2;
min += local_min;
if (z->free_pages > min || z->free_pages >= z->pages_high) {
- page = rmqueue(z, order);
+ page = buffered_rmqueue(z, order, cold);
if (page)
return page;
}
@@ -410,7 +466,7 @@
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];
- page = rmqueue(z, order);
+ page = buffered_rmqueue(z, order, cold);
if (page)
return page;
}
@@ -440,7 +496,7 @@
min += z->pages_min;
if (z->free_pages > min || z->free_pages >= z->pages_high) {
- page = rmqueue(z, order);
+ page = buffered_rmqueue(z, order, cold);
if (page)
return page;
}
@@ -492,13 +548,17 @@
int i = pagevec_count(pvec);
while (--i >= 0)
- __free_pages_ok(pvec->pages[i], 0);
+ free_hot_page(pvec->pages[i]);
}
void __free_pages(struct page *page, unsigned int order)
{
- if (!PageReserved(page) && put_page_testzero(page))
- __free_pages_ok(page, order);
+ if (!PageReserved(page) && put_page_testzero(page)) {
+ if (order == 0)
+ free_hot_page(page);
+ else
+ __free_pages_ok(page, order);
+ }
}
void free_pages(unsigned long addr, unsigned int order)
@@ -899,7 +959,7 @@
unsigned long i, j;
unsigned long local_offset;
const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
- int nid = pgdat->node_id;
+ int cpu, nid = pgdat->node_id;
struct page *lmem_map = pgdat->node_mem_map;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
@@ -911,13 +971,13 @@
struct zone *zone = pgdat->node_zones + j;
unsigned long mask;
unsigned long size, realsize;
+ unsigned long batch;
zone_table[nid * MAX_NR_ZONES + j] = zone;
realsize = size = zones_size[j];
if (zholes_size)
realsize -= zholes_size[j];
- printk(" %s zone: %lu pages\n", zone_names[j], realsize);
zone->spanned_pages = size;
zone->present_pages = realsize;
zone->name = zone_names[j];
@@ -925,6 +985,40 @@
spin_lock_init(&zone->lru_lock);
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
+
+ /*
+ * The per-cpu-pages pools are set to around 1000th of the
+ * size of the zone. But no more than 1/4 of a meg - there's
+ * no point in going beyond the size of L2 cache.
+ *
+ * OK, so we don't know how big the cache is. So guess.
+ */
+ batch = zone->present_pages / 1024;
+ if (batch * PAGE_SIZE > 256 * 1024)
+ batch = (256 * 1024) / PAGE_SIZE;
+ batch /= 4; /* We effectively *= 4 below */
+ if (batch < 1)
+ batch = 1;
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &zone->pageset[cpu].pcp[0]; /* hot */
+ pcp->count = 0;
+ pcp->low = 2 * batch;
+ pcp->high = 6 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &zone->pageset[cpu].pcp[1]; /* cold */
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 2 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
+ }
+ printk(" %s zone: %lu pages, LIFO batch:%lu\n",
+ zone_names[j], realsize, batch);
INIT_LIST_HEAD(&zone->active_list);
INIT_LIST_HEAD(&zone->inactive_list);
atomic_set(&zone->refill_counter, 0);
diff -Nru a/mm/swap.c b/mm/swap.c
--- a/mm/swap.c Wed Oct 30 16:21:56 2002
+++ b/mm/swap.c Wed Oct 30 16:21:56 2002
@@ -69,7 +69,8 @@
}
/*
- * This path almost never happens - pages are normally freed via pagevecs.
+ * This path almost never happens for VM activity - pages are normally
+ * freed via pagevecs. But it gets used by networking.
*/
void __page_cache_release(struct page *page)
{
@@ -83,7 +84,7 @@
page = NULL;
spin_unlock_irqrestore(&zone->lru_lock, flags);
if (page)
- __free_pages_ok(page, 0);
+ free_hot_page(page);
}
/*
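
To put rough numbers on the per-CPU pool sizing near the end of the page_alloc.c changes (my own arithmetic, for a hypothetical 512MB zone with 4KB pages): present_pages is 131072, and 131072/1024 = 128 pages, which at 512KB is over the 256KB cap, so batch is clamped to 64 and then divided by 4, leaving 16. Each CPU's hot list then refills from the buddy lists when it drops to 32 pages or fewer (2*batch), spills a batch back once it reaches 96 (6*batch), and moves 16 pages per trip under zone->lock; the cold list only refills when it is completely empty (low = 0) and spills past 32. In the batched steady state the zone lock is taken roughly once per 16 order-0 operations rather than once per page.
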
Hope this helps!