mm/hotplug: correctly setup fallback zonelists when creating new pgdat
When hotadd_new_pgdat() is called to create new pgdat for a new node, a
fallback zonelist should be created for the new node. There's code to try
to achieve that in hotadd_new_pgdat() as below:
/*
* The node we allocated has no zone fallback lists. For avoiding
* to access not-initialized zonelist, build here.
*/
mutex_lock(&zonelists_mutex);
build_all_zonelists(pgdat, NULL);
mutex_unlock(&zonelists_mutex);
But it doesn't work as expected. When hotadd_new_pgdat() is called, the
new node is still in offline state because node_set_online(nid) hasn't
been called yet. And build_all_zonelists() only builds zonelists for
online nodes as:
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
build_zonelists(pgdat);
build_zonelist_cache(pgdat);
}
Though we hope to create zonelist for the new pgdat, but it doesn't. So
add a new parameter "pgdat" the build_all_zonelists() to build pgdat for
the new pgdat too.
Signed-off-by: Jiang Liu <liuj97@gmail.com>
Signed-off-by: Xishi Qiu <qiuxishi@huawei.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Keping Chen <chenkeping@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index f64afa5..98f079b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -721,7 +721,7 @@
#include <linux/memory_hotplug.h>
extern struct mutex zonelists_mutex;
-void build_all_zonelists(void *data);
+void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags);
diff --git a/init/main.c b/init/main.c
index 95316a1..e60679d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -506,7 +506,7 @@
setup_per_cpu_areas();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
- build_all_zonelists(NULL);
+ build_all_zonelists(NULL, NULL);
page_alloc_init();
printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a4eb522..14d3258 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -416,7 +416,7 @@
if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL);
+ build_all_zonelists(NULL, NULL);
mutex_unlock(&zonelists_mutex);
}
#endif
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 427bb29..b873104 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -513,7 +513,7 @@
zone->present_pages += onlined_pages;
zone->zone_pgdat->node_present_pages += onlined_pages;
if (need_zonelists_rebuild)
- build_all_zonelists(zone);
+ build_all_zonelists(NULL, zone);
else
zone_pcp_update(zone);
@@ -562,7 +562,7 @@
* to access not-initialized zonelist, build here.
*/
mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL);
+ build_all_zonelists(pgdat, NULL);
mutex_unlock(&zonelists_mutex);
return pgdat;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6c7e3bd..9ad6866 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3032,7 +3032,7 @@
user_zonelist_order = oldval;
} else if (oldval != user_zonelist_order) {
mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL);
+ build_all_zonelists(NULL, NULL);
mutex_unlock(&zonelists_mutex);
}
}
@@ -3415,10 +3415,17 @@
{
int nid;
int cpu;
+ pg_data_t *self = data;
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
#endif
+
+ if (self && !node_online(self->node_id)) {
+ build_zonelists(self);
+ build_zonelist_cache(self);
+ }
+
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
@@ -3463,7 +3470,7 @@
* Called with zonelists_mutex held always
* unless system_state == SYSTEM_BOOTING.
*/
-void __ref build_all_zonelists(void *data)
+void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
{
set_zonelist_order();
@@ -3475,10 +3482,10 @@
/* we have to stop all cpus to guarantee there is no user
of zonelist */
#ifdef CONFIG_MEMORY_HOTPLUG
- if (data)
- setup_zone_pageset((struct zone *)data);
+ if (zone)
+ setup_zone_pageset(zone);
#endif
- stop_machine(__build_all_zonelists, NULL, NULL);
+ stop_machine(__build_all_zonelists, pgdat, NULL);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();