[PATCH] node local per-cpu-pages
This patch modifies the way pagesets in struct zone are managed.
Each zone has a per-cpu array of pagesets. So any particular CPU has some
memory in each zone structure which belongs to itself. Even if that CPU is
not local to that zone.
So the patch relocates the pagesets for each cpu to the node that is nearest
to the cpu instead of allocating the pagesets in the (possibly remote) target
zone. This means that the operations to manage pages on remote zone can be
done with information available locally.
We play a macro trick so that non-NUMA pmachines avoid the additional
pointer chase on the page allocator fastpath.
AIM7 benchmark on a 32 CPU SGI Altix
w/o patches:
Tasks jobs/min jti jobs/min/task real cpu
1 484.68 100 484.6769 12.01 1.97 Fri Mar 25 11:01:42 2005
100 27140.46 89 271.4046 21.44 148.71 Fri Mar 25 11:02:04 2005
200 30792.02 82 153.9601 37.80 296.72 Fri Mar 25 11:02:42 2005
300 32209.27 81 107.3642 54.21 451.34 Fri Mar 25 11:03:37 2005
400 34962.83 78 87.4071 66.59 588.97 Fri Mar 25 11:04:44 2005
500 31676.92 75 63.3538 91.87 742.71 Fri Mar 25 11:06:16 2005
600 36032.69 73 60.0545 96.91 885.44 Fri Mar 25 11:07:54 2005
700 35540.43 77 50.7720 114.63 1024.28 Fri Mar 25 11:09:49 2005
800 33906.70 74 42.3834 137.32 1181.65 Fri Mar 25 11:12:06 2005
900 34120.67 73 37.9119 153.51 1325.26 Fri Mar 25 11:14:41 2005
1000 34802.37 74 34.8024 167.23 1465.26 Fri Mar 25 11:17:28 2005
with slab API changes and pageset patch:
Tasks jobs/min jti jobs/min/task real cpu
1 485.00 100 485.0000 12.00 1.96 Fri Mar 25 11:46:18 2005
100 28000.96 89 280.0096 20.79 150.45 Fri Mar 25 11:46:39 2005
200 32285.80 79 161.4290 36.05 293.37 Fri Mar 25 11:47:16 2005
300 40424.15 84 134.7472 43.19 438.42 Fri Mar 25 11:47:59 2005
400 39155.01 79 97.8875 59.46 590.05 Fri Mar 25 11:48:59 2005
500 37881.25 82 75.7625 76.82 730.19 Fri Mar 25 11:50:16 2005
600 39083.14 78 65.1386 89.35 872.79 Fri Mar 25 11:51:46 2005
700 38627.83 77 55.1826 105.47 1022.46 Fri Mar 25 11:53:32 2005
800 39631.94 78 49.5399 117.48 1169.94 Fri Mar 25 11:55:30 2005
900 36903.70 79 41.0041 141.94 1310.78 Fri Mar 25 11:57:53 2005
1000 36201.23 77 36.2012 160.77 1458.31 Fri Mar 25 12:00:34 2005
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Shobhit Dayal <shobhit@calsoftinc.com>
Signed-off-by: Shai Fultheim <Shai@Scalex86.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2019c1b..95cbd30 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -71,6 +71,11 @@
struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
EXPORT_SYMBOL(zone_table);
+#ifdef CONFIG_NUMA
+static struct per_cpu_pageset
+ pageset_table[MAX_NR_ZONES*MAX_NUMNODES*NR_CPUS] __initdata;
+#endif
+
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;
@@ -520,7 +525,7 @@
for_each_zone(zone) {
struct per_cpu_pageset *pset;
- pset = &zone->pageset[cpu];
+ pset = zone_pcp(zone, cpu);
for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
struct per_cpu_pages *pcp;
@@ -583,12 +588,12 @@
local_irq_save(flags);
cpu = smp_processor_id();
- p = &z->pageset[cpu];
+ p = zone_pcp(z,cpu);
if (pg == orig) {
- z->pageset[cpu].numa_hit++;
+ p->numa_hit++;
} else {
p->numa_miss++;
- zonelist->zones[0]->pageset[cpu].numa_foreign++;
+ zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
}
if (pg == NODE_DATA(numa_node_id()))
p->local_node++;
@@ -615,7 +620,7 @@
if (PageAnon(page))
page->mapping = NULL;
free_pages_check(__FUNCTION__, page);
- pcp = &zone->pageset[get_cpu()].pcp[cold];
+ pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
if (pcp->count >= pcp->high)
pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -659,7 +664,7 @@
if (order == 0) {
struct per_cpu_pages *pcp;
- pcp = &zone->pageset[get_cpu()].pcp[cold];
+ pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
@@ -1262,7 +1267,7 @@
if (!cpu_possible(cpu))
continue;
- pageset = zone->pageset + cpu;
+ pageset = zone_pcp(zone, cpu);
for (temperature = 0; temperature < 2; temperature++)
printk("cpu %d %s: low %d, high %d, batch %d\n",
@@ -1645,6 +1650,157 @@
memmap_init_zone((size), (nid), (zone), (start_pfn))
#endif
+static int __devinit zone_batchsize(struct zone *zone)
+{
+ int batch;
+
+ /*
+ * The per-cpu-pages pools are set to around 1000th of the
+ * size of the zone. But no more than 1/4 of a meg - there's
+ * no point in going beyond the size of L2 cache.
+ *
+ * OK, so we don't know how big the cache is. So guess.
+ */
+ batch = zone->present_pages / 1024;
+ if (batch * PAGE_SIZE > 256 * 1024)
+ batch = (256 * 1024) / PAGE_SIZE;
+ batch /= 4; /* We effectively *= 4 below */
+ if (batch < 1)
+ batch = 1;
+
+ /*
+ * Clamp the batch to a 2^n - 1 value. Having a power
+ * of 2 value was found to be more likely to have
+ * suboptimal cache aliasing properties in some cases.
+ *
+ * For example if 2 tasks are alternately allocating
+ * batches of pages, one task can end up with a lot
+ * of pages of one half of the possible page colors
+ * and the other with pages of the other colors.
+ */
+ batch = (1 << fls(batch + batch/2)) - 1;
+ return batch;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Dynamicaly allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __devinit process_zones(int cpu)
+{
+ struct zone *zone, *dzone;
+ int i;
+
+ for_each_zone(zone) {
+ struct per_cpu_pageset *npageset = NULL;
+
+ npageset = kmalloc_node(sizeof(struct per_cpu_pageset),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!npageset) {
+ zone->pageset[cpu] = NULL;
+ goto bad;
+ }
+
+ if (zone->pageset[cpu]) {
+ memcpy(npageset, zone->pageset[cpu],
+ sizeof(struct per_cpu_pageset));
+
+ /* Relocate lists */
+ for (i = 0; i < 2; i++) {
+ INIT_LIST_HEAD(&npageset->pcp[i].list);
+ list_splice(&zone->pageset[cpu]->pcp[i].list,
+ &npageset->pcp[i].list);
+ }
+ } else {
+ struct per_cpu_pages *pcp;
+ unsigned long batch;
+
+ batch = zone_batchsize(zone);
+
+ pcp = &npageset->pcp[0]; /* hot */
+ pcp->count = 0;
+ pcp->low = 2 * batch;
+ pcp->high = 6 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &npageset->pcp[1]; /* cold*/
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 2 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
+ }
+ zone->pageset[cpu] = npageset;
+ }
+
+ return 0;
+bad:
+ for_each_zone(dzone) {
+ if (dzone == zone)
+ break;
+ kfree(dzone->pageset[cpu]);
+ dzone->pageset[cpu] = NULL;
+ }
+ return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+#ifdef CONFIG_NUMA
+ struct zone *zone;
+
+ for_each_zone(zone) {
+ struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+ zone_pcp(zone, cpu) = NULL;
+ kfree(pset);
+ }
+#endif
+}
+
+static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (long)hcpu;
+ int ret = NOTIFY_OK;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (process_zones(cpu))
+ ret = NOTIFY_BAD;
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DEAD:
+ free_zone_pagesets(cpu);
+ break;
+#endif
+ default:
+ break;
+ }
+ return ret;
+}
+
+static struct notifier_block pageset_notifier =
+ { &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset()
+{
+ int err;
+
+ /* Initialize per_cpu_pageset for cpu 0.
+ * A cpuup callback will do this for every cpu
+ * as it comes online
+ */
+ err = process_zones(smp_processor_id());
+ BUG_ON(err);
+ register_cpu_notifier(&pageset_notifier);
+}
+
+#endif
+
/*
* Set up the zone data structures:
* - mark all pages reserved
@@ -1687,43 +1843,28 @@
zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
- /*
- * The per-cpu-pages pools are set to around 1000th of the
- * size of the zone. But no more than 1/4 of a meg - there's
- * no point in going beyond the size of L2 cache.
- *
- * OK, so we don't know how big the cache is. So guess.
- */
- batch = zone->present_pages / 1024;
- if (batch * PAGE_SIZE > 256 * 1024)
- batch = (256 * 1024) / PAGE_SIZE;
- batch /= 4; /* We effectively *= 4 below */
- if (batch < 1)
- batch = 1;
-
- /*
- * Clamp the batch to a 2^n - 1 value. Having a power
- * of 2 value was found to be more likely to have
- * suboptimal cache aliasing properties in some cases.
- *
- * For example if 2 tasks are alternately allocating
- * batches of pages, one task can end up with a lot
- * of pages of one half of the possible page colors
- * and the other with pages of the other colors.
- */
- batch = (1 << fls(batch + batch/2)) - 1;
+ batch = zone_batchsize(zone);
for (cpu = 0; cpu < NR_CPUS; cpu++) {
struct per_cpu_pages *pcp;
+#ifdef CONFIG_NUMA
+ struct per_cpu_pageset *pgset;
+ pgset = &pageset_table[nid*MAX_NR_ZONES*NR_CPUS +
+ (j * NR_CPUS) + cpu];
- pcp = &zone->pageset[cpu].pcp[0]; /* hot */
+ zone->pageset[cpu] = pgset;
+#else
+ struct per_cpu_pageset *pgset = zone_pcp(zone, cpu);
+#endif
+
+ pcp = &pgset->pcp[0]; /* hot */
pcp->count = 0;
pcp->low = 2 * batch;
pcp->high = 6 * batch;
pcp->batch = 1 * batch;
INIT_LIST_HEAD(&pcp->list);
- pcp = &zone->pageset[cpu].pcp[1]; /* cold */
+ pcp = &pgset->pcp[1]; /* cold */
pcp->count = 0;
pcp->low = 0;
pcp->high = 2 * batch;
@@ -1929,7 +2070,7 @@
struct per_cpu_pageset *pageset;
int j;
- pageset = &zone->pageset[i];
+ pageset = zone_pcp(zone, i);
for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
if (pageset->pcp[j].count)
break;