mem-hotplug: avoid multiple zones sharing same bootstrapping boot_pageset

For each newly populated zone of a hot-added node, we need to update its
pagesets with dynamically allocated per_cpu_pageset structs for all possible
CPUs:

    1) Detach zone->pageset from the shared boot_pageset at the
       end of __build_all_zonelists() (see the sketch after this
       list).

    2) Use a mutex to protect zone->pageset while it is still
       shared in online_pages().
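
Point 1) lives in mm/page_alloc.c and is not part of the hunks below.  A
rough sketch of the idea (simplified and abridged; the real function does
more work per possible CPU) looks like:

	static int __build_all_zonelists(void *data)
	{
		/* zone is non-NULL only when onlining a new zone */
		struct zone *zone = data;
		int nid;

		for_each_online_node(nid)
			build_zonelists(NODE_DATA(nid));

		/*
		 * Detach the newly populated zone from the shared
		 * boot_pageset by giving it real per-CPU pagesets.
		 */
		if (zone)
			setup_zone_pageset(zone);

		return 0;
	}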

Otherwise, multiple zones of different nodes would share the same
bootstrapping boot_pageset for the same CPU, which eventually causes the
kernel panic below (a toy model of the race follows the trace):

  ------------[ cut here ]------------
  kernel BUG at mm/page_alloc.c:1239!
  invalid opcode: 0000 [#1] SMP
  ...
  Call Trace:
   [<ffffffff811300c1>] __alloc_pages_nodemask+0x131/0x7b0
   [<ffffffff81162e67>] alloc_pages_current+0x87/0xd0
   [<ffffffff81128407>] __page_cache_alloc+0x67/0x70
   [<ffffffff811325f0>] __do_page_cache_readahead+0x120/0x260
   [<ffffffff81132751>] ra_submit+0x21/0x30
   [<ffffffff811329c6>] ondemand_readahead+0x166/0x2c0
   [<ffffffff81132ba0>] page_cache_async_readahead+0x80/0xa0
   [<ffffffff8112a0e4>] generic_file_aio_read+0x364/0x670
   [<ffffffff81266cfa>] nfs_file_read+0xca/0x130
   [<ffffffff8117b20a>] do_sync_read+0xfa/0x140
   [<ffffffff8117bf75>] vfs_read+0xb5/0x1a0
   [<ffffffff8117c151>] sys_read+0x51/0x80
   [<ffffffff8103c032>] system_call_fastpath+0x16/0x1b
  RIP  [<ffffffff8112ff13>] get_page_from_freelist+0x883/0x900
   RSP <ffff88000d1e78a8>
  ---[ end trace 4bda28328b9990db ]---
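
To make the failure mode concrete, below is a user-space toy model of the
race (all names are made up for illustration; this is not kernel code).
Two zones whose ->pageset pointers alias the same per-CPU list can hand a
page freed through one zone to an allocation from the other, tripping the
zone/page consistency check that the trace above points at:

	#include <assert.h>

	struct toy_zone;

	struct toy_page {
		struct toy_zone *zone;		/* zone the page belongs to */
		struct toy_page *next;
	};

	struct pcp_list {			/* one CPU's boot_pageset */
		struct toy_page *head;
	};

	struct toy_zone {
		struct pcp_list *pageset;
	};

	static void pcp_free(struct toy_zone *zone, struct toy_page *page)
	{
		page->next = zone->pageset->head;
		zone->pageset->head = page;
	}

	static struct toy_page *pcp_alloc(struct toy_zone *zone)
	{
		struct toy_page *page = zone->pageset->head;

		zone->pageset->head = page->next;
		/* stands in for the BUG at mm/page_alloc.c:1239 above */
		assert(page->zone == zone);
		return page;
	}

	int main(void)
	{
		struct pcp_list boot_pageset = { 0 };
		struct toy_zone zone_a = { &boot_pageset };	/* node 0 */
		struct toy_zone zone_b = { &boot_pageset };	/* hotadded */
		struct toy_page page_a = { &zone_a, 0 };

		pcp_free(&zone_a, &page_a);	/* A frees to shared list */
		(void)pcp_alloc(&zone_b);	/* B pops A's page: asserts */
		return 0;
	}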

[akpm@linux-foundation.org: merge fix]
Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Reviewed-by: Andi Kleen <andi.kleen@intel.com>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 85eb4d3..089cc97 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -389,6 +389,11 @@
 	int nid;
 	int ret;
 	struct memory_notify arg;
+	/*
+	 * mutex to protect zone->pageset when it's still shared
+	 * in online_pages()
+	 */
+	static DEFINE_MUTEX(zone_pageset_mutex);
 
 	arg.start_pfn = pfn;
 	arg.nr_pages = nr_pages;
@@ -415,12 +420,14 @@
 	 * This means the page allocator ignores this zone.
 	 * So, zonelist must be updated after online.
 	 */
+	mutex_lock(&zone_pageset_mutex);
 	if (!populated_zone(zone))
 		need_zonelists_rebuild = 1;
 
 	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
 		online_pages_range);
 	if (ret) {
+		mutex_unlock(&zone_pageset_mutex);
 		printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
 			nr_pages, pfn);
 		memory_notify(MEM_CANCEL_ONLINE, &arg);
@@ -429,8 +436,12 @@
 
 	zone->present_pages += onlined_pages;
 	zone->zone_pgdat->node_present_pages += onlined_pages;
+	if (need_zonelists_rebuild)
+		build_all_zonelists(zone);
+	else
+		zone_pcp_update(zone);
 
-	zone_pcp_update(zone);
+	mutex_unlock(&zone_pageset_mutex);
 	setup_per_zone_wmarks();
 	calculate_zone_inactive_ratio(zone);
 	if (onlined_pages) {
@@ -438,10 +449,7 @@
 		node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
 	}
 
-	if (need_zonelists_rebuild)
-		build_all_zonelists();
-	else
-		vm_total_pages = nr_free_pagecache_pages();
+	vm_total_pages = nr_free_pagecache_pages();
 
 	writeback_set_ratelimit();
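
Putting the hunks together, the serialized region of online_pages() after
this patch reads roughly as follows (condensed sketch only; the notifier
calls, watermark setup and error reporting shown in the diff are elided):

	mutex_lock(&zone_pageset_mutex);
	if (!populated_zone(zone))
		need_zonelists_rebuild = 1;	/* zone was empty: stale zonelists */

	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	if (ret) {
		/* drop the lock before cancelling the online operation */
		mutex_unlock(&zone_pageset_mutex);
		return ret;
	}

	zone->present_pages += onlined_pages;
	zone->zone_pgdat->node_present_pages += onlined_pages;

	if (need_zonelists_rebuild)
		build_all_zonelists(zone);	/* rebuild + detach pagesets */
	else
		zone_pcp_update(zone);		/* zone already owns pagesets */

	mutex_unlock(&zone_pageset_mutex);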