From 1f522509c77a5dea8dc384b735314f03908a6415 Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Mon, 24 May 2010 14:32:51 -0700 Subject: mem-hotplug: avoid multiple zones sharing same boot strapping boot_pageset For each new populated zone of hotadded node, need to update its pagesets with dynamically allocated per_cpu_pageset struct for all possible CPUs: 1) Detach zone->pageset from the shared boot_pageset at end of __build_all_zonelists(). 2) Use mutex to protect zone->pageset when it's still shared in onlined_pages() Otherwises, multiple zones of different nodes would share same boot strapping boot_pageset for same CPU, which will finally cause below kernel panic: ------------[ cut here ]------------ kernel BUG at mm/page_alloc.c:1239! invalid opcode: 0000 [#1] SMP ... Call Trace: [] __alloc_pages_nodemask+0x131/0x7b0 [] alloc_pages_current+0x87/0xd0 [] __page_cache_alloc+0x67/0x70 [] __do_page_cache_readahead+0x120/0x260 [] ra_submit+0x21/0x30 [] ondemand_readahead+0x166/0x2c0 [] page_cache_async_readahead+0x80/0xa0 [] generic_file_aio_read+0x364/0x670 [] nfs_file_read+0xca/0x130 [] do_sync_read+0xfa/0x140 [] vfs_read+0xb5/0x1a0 [] sys_read+0x51/0x80 [] system_call_fastpath+0x16/0x1b RIP [] get_page_from_freelist+0x883/0x900 RSP ---[ end trace 4bda28328b9990db ] [akpm@linux-foundation.org: merge fix] Signed-off-by: Haicheng Li Signed-off-by: Wu Fengguang Reviewed-by: Andi Kleen Reviewed-by: Christoph Lameter Cc: Mel Gorman Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 595d0ac211e..21c52d2d862 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2572,7 +2572,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, NUMA_ZONELIST_ORDER_LEN); user_zonelist_order = oldval; } else if (oldval != user_zonelist_order) - build_all_zonelists(); + build_all_zonelists(NULL); } out: mutex_unlock(&zl_order_mutex); @@ -2922,9 +2922,10 @@ static void build_zonelist_cache(pg_data_t *pgdat) */ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); +static void setup_zone_pageset(struct zone *zone); /* return values int ....just for stop_machine() */ -static int __build_all_zonelists(void *dummy) +static __init_refok int __build_all_zonelists(void *data) { int nid; int cpu; @@ -2939,6 +2940,14 @@ static int __build_all_zonelists(void *dummy) build_zonelist_cache(pgdat); } +#ifdef CONFIG_MEMORY_HOTPLUG + /* Setup real pagesets for the new zone */ + if (data) { + struct zone *zone = data; + setup_zone_pageset(zone); + } +#endif + /* * Initialize the boot_pagesets that are going to be used * for bootstrapping processors. The real pagesets for @@ -2958,7 +2967,7 @@ static int __build_all_zonelists(void *dummy) return 0; } -void build_all_zonelists(void) +void build_all_zonelists(void *data) { set_zonelist_order(); @@ -2969,7 +2978,7 @@ void build_all_zonelists(void) } else { /* we have to stop all cpus to guarantee there is no user of zonelist */ - stop_machine(__build_all_zonelists, NULL, NULL); + stop_machine(__build_all_zonelists, data, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); -- cgit v1.2.3