From b9ada4281c48076bdb252813be24ddb57e17cade Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 20 May 2008 11:01:08 +0100 Subject: x86: reinstate numa remap for SPARSEMEM on x86 NUMA systems Recent kernels have been panic'ing trying to allocate memory early in boot, in __alloc_pages: BUG: unable to handle kernel paging request at 00001568 IP: [] __alloc_pages+0x33/0x2cc *pdpt = 00000000013a5001 *pde = 0000000000000000 Oops: 0000 [#1] SMP Modules linked in: Pid: 1, comm: swapper Not tainted (2.6.25 #78) EIP: 0060:[] EFLAGS: 00010246 CPU: 0 EIP is at __alloc_pages+0x33/0x2cc EAX: 00001564 EBX: 000412d0 ECX: 00001564 EDX: 000005c3 ESI: f78012a0 EDI: 00000001 EBP: 00001564 ESP: f7871e50 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 Process swapper (pid: 1, ti=f7870000 task=f786f670 task.ti=f7870000) Stack: 00000000 f786f670 00000010 00000000 0000b700 000412d0 f78012a0 00000001 00000000 c105b64d 00000000 000412d0 f78012a0 f7803120 00000000 c105c1c5 00000010 f7803144 000412d0 00000001 f7803130 f7803120 f78012a0 00000001 Call Trace: [] kmem_getpages+0x94/0x129 [] cache_grow+0x8f/0x123 [] ____cache_alloc_node+0xb9/0xe4 [] kmem_cache_alloc_node+0x92/0xd2 [] build_sched_domains+0x536/0x70d [] do_flush_tlb_all+0x0/0x3f [] do_flush_tlb_all+0x0/0x3f [] interleave_nodes+0x23/0x5a [] alternate_node_alloc+0x43/0x5b [] arch_init_sched_domains+0x46/0x51 [] kernel_init+0x0/0x82 [] sched_init_smp+0x10/0xbb [] kernel_init+0x43/0x82 [] kernel_thread_helper+0x7/0x10 Debugging this showed that the NODE_DATA() for nodes other than node 0 were all NULL. Tracing this back showed that the NODE_DATA() pointers were being initialised to each nodes remap space. However under SPARSEMEM remap is disabled which leads to the pgdat's being placed incorrectly at kernel virtual address 0. Leading to the panic when attempting to allocate memory from these nodes. Numa remap was disabled in the commit below. This occured while fixing problems triggered when attempting to boot x86_32 NUMA SPARSEMEM kernels on non-numa hardware. x86: make NUMA work on 32-bit commit 1b000a5dbeb2f34bc03d45ebdf3f6d24a60c3aed The real problem is believed to be related to other alignment issues in the regions blocked out from the bootmem allocator for small memory systems, and has been fixed separately. Therefore re-enable remap for SPARSMEM, which fixes pgdat allocation issues. Testing confirms that SPARSMEM NUMA kernels will boot correctly with this part of the change reverted. Signed-off-by: Andy Whitcroft Acked-by: Mel Gorman Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 914ccf98368..026201f143a 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -164,16 +164,13 @@ static void __init allocate_pgdat(int nid) } } -#ifdef CONFIG_DISCONTIGMEM /* - * In the discontig memory model, a portion of the kernel virtual area (KVA) - * is reserved and portions of nodes are mapped using it. This is to allow - * node-local memory to be allocated for structures that would normally require - * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers - * should be prepared to allocate from the bootmem allocator instead. This KVA - * mechanism is incompatible with SPARSEMEM as it makes assumptions about the - * layout of memory that are broken if alloc_remap() succeeds for some of the - * map and fails for others + * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel + * virtual address space (KVA) is reserved and portions of nodes are mapped + * using it. This is to allow node-local memory to be allocated for + * structures that would normally require ZONE_NORMAL. The memory is + * allocated with alloc_remap() and callers should be prepared to allocate + * from the bootmem allocator instead. */ static unsigned long node_remap_start_pfn[MAX_NUMNODES]; static void *node_remap_end_vaddr[MAX_NUMNODES]; @@ -290,25 +287,6 @@ static void init_remap_allocator(int nid) (ulong) pfn_to_kaddr(highstart_pfn + node_remap_offset[nid] + node_remap_size[nid])); } -#else -void *alloc_remap(int nid, unsigned long size) -{ - return NULL; -} - -static unsigned long calculate_numa_remap_pages(void) -{ - return 0; -} - -static void init_remap_allocator(int nid) -{ -} - -void __init remap_numa_kva(void) -{ -} -#endif /* CONFIG_DISCONTIGMEM */ extern void setup_bootmem_allocator(void); unsigned long __init setup_memory(void) -- cgit v1.2.3 From 84d6bd0e272e6eace1e7e4c501f32bddfb665bb2 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 20 May 2008 11:01:19 +0100 Subject: x86: cope with no remap space being allocated for a numa node When allocating the pgdat's for numa nodes on x86_32 we attempt to place them in the numa remap space for that node. However should the node not have any remap space allocated (such as due to having non-ram pages in the remap location in the node) then we will incorrectly place the pgdat at zero. Check we have remap available, falling back to node 0 memory where we do not. Signed-off-by: Andy Whitcroft Acked-by: Mel Gorman Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 026201f143a..435c343c14b 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -156,7 +156,7 @@ static void __init propagate_e820_map_node(int nid) */ static void __init allocate_pgdat(int nid) { - if (nid && node_has_online_mem(nid)) + if (nid && node_has_online_mem(nid) && node_remap_start_vaddr[nid]) NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; else { NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn)); -- cgit v1.2.3 From 163872950dc856fd23849c27f60049feaac49ae6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 29 May 2008 12:57:22 -0700 Subject: x86: extend e820 early_res support 32bit -fix #4 reserve_early pgdata for 32bit numa Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 914ccf98368..47749727907 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -159,8 +159,13 @@ static void __init allocate_pgdat(int nid) if (nid && node_has_online_mem(nid)) NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; else { - NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn)); - min_low_pfn += PFN_UP(sizeof(pg_data_t)); + unsigned long pgdat_phys; + pgdat_phys = find_e820_area(min_low_pfn<>PAGE_SHIFT)); + reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), + "NODE_DATA"); } } -- cgit v1.2.3 From a5481280b29b6a3db912ec100498bd31eaa6d2db Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 29 May 2008 12:58:37 -0700 Subject: x86: extend e820 early_res support 32bit -fix #5 reserve early numa kva, so it will not clash with new RAMDISK Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 47749727907..55fdbab6b01 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -357,6 +357,11 @@ unsigned long __init setup_memory(void) printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n", kva_start_pfn, max_low_pfn); printk("max_pfn = %ld\n", max_pfn); + + /* avoid clash with initrd */ + reserve_early(kva_start_pfn< system_max_low_pfn) @@ -392,13 +397,6 @@ unsigned long __init setup_memory(void) return max_low_pfn; } -void __init numa_kva_reserve(void) -{ - if (kva_pages) - reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages), - BOOTMEM_DEFAULT); -} - void __init zone_sizes_init(void) { int nid; -- cgit v1.2.3 From 2772f54bf37b033263abe4a2314c40a308a1a5cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 3 Jun 2008 10:09:45 +0200 Subject: x86: add acpi_numa_slit_init() dummy implementation on 32-bit allow CONFIG_ACPI_NUMA builds to succeed on 32-bit. --- arch/x86/mm/discontig_32.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 435c343c14b..d28987a2f2b 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -454,3 +454,12 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif + +#ifdef CONFIG_ACPI_NUMA +/* + * Dummy on 32-bit, for now: + */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ +} +#endif -- cgit v1.2.3 From b28852d6703e4b72ce363c5168ea8d3fb28b9c57 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 3 Jun 2008 10:11:07 +0200 Subject: x86: add dummy acpi_numa_processor_affinity_init() implementation on 32-bit allow CONFIG_ACPI_NUMA builds to succeed. --- arch/x86/mm/discontig_32.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index d28987a2f2b..bb8544a7cb9 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -462,4 +462,9 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); void __init acpi_numa_slit_init(struct acpi_table_slit *slit) { } + +void __init +acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) +{ +} #endif -- cgit v1.2.3 From cf3d0cb8a10a4fd19439bb34995d9ad28854739a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 3 Jun 2008 11:36:56 +0200 Subject: x86: 32-bit numa, build fix on Summit it's possible to have: CONFIG_ACPI_SRAT=y CONFIG_HAVE_ARCH_PARSE_SRAT=y in which case acpi.h defines the acpi_numa_slit_init() and acpi_numa_processor_affinity_init() methods as a macro. --- arch/x86/mm/discontig_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index bb8544a7cb9..0d28b4bff81 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -455,7 +455,7 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif -#ifdef CONFIG_ACPI_NUMA +#if defined(CONFIG_ACPI_NUMA) && !defined(CONFIG_HAVE_ARCH_PARSE_SRAT) /* * Dummy on 32-bit, for now: */ -- cgit v1.2.3 From 471b3c1b011f807d16f1e19d1d4ecf703f1e7d1a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 3 Jun 2008 12:39:43 +0200 Subject: x86, numaq 32-bit: build fix fix: drivers/built-in.o: In function `acpi_numa_init': : undefined reference to `acpi_numa_arch_fixup' which can happen with ACPI && NUMAQ. --- arch/x86/mm/discontig_32.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 0d28b4bff81..8b4eac0ca07 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -467,4 +467,8 @@ void __init acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) { } + +void __init acpi_numa_arch_fixup(void) +{ +} #endif -- cgit v1.2.3 From ba924c81dd5a7a7fb5ded025af7fdd3b61f8ca67 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 31 May 2008 22:51:51 -0700 Subject: x86, numa, 32-bit: increase max_elements to 1024 so every element will represent 64M instead of 256M. AMD opteron could have HW memory hole remapping, so could have [0, 8g + 64M) on node0. Reduce element size to 64M to keep that on node 0 Later we need to use find_e820_area() to allocate memory_node_map like on 64-bit. But need to move memory_present out of populate_mem_map... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 55fdbab6b01..922af20d1d2 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -59,14 +59,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; /* * 4) physnode_map - the mapping between a pfn and owning node * physnode_map keeps track of the physical memory layout of a generic - * numa node on a 256Mb break (each element of the array will - * represent 256Mb of memory and will be marked by the node id. so, + * numa node on a 64Mb break (each element of the array will + * represent 64Mb of memory and will be marked by the node id. so, * if the first gig is on node 0, and the second gig is on node 1 * physnode_map will contain: * - * physnode_map[0-3] = 0; - * physnode_map[4-7] = 1; - * physnode_map[8- ] = -1; + * physnode_map[0-15] = 0; + * physnode_map[16-31] = 1; + * physnode_map[32- ] = -1; */ s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1}; EXPORT_SYMBOL(physnode_map); @@ -81,9 +81,9 @@ void memory_present(int nid, unsigned long start, unsigned long end) printk(KERN_DEBUG " "); for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { physnode_map[pfn / PAGES_PER_ELEMENT] = nid; - printk("%ld ", pfn); + printk(KERN_CONT "%ld ", pfn); } - printk("\n"); + printk(KERN_CONT "\n"); } unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, -- cgit v1.2.3 From b66cd7207387b9b428aaf1988e21dd263c6a4928 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 31 May 2008 22:53:47 -0700 Subject: x86: set node_remap_size[0] in fallback path ... otherwise alloc_remap will not get node_mem_map from kva area, and alloc_node_mem_map has to alloc_bootmem_node to get mem_map. It will use two low address copies ... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 922af20d1d2..98b099eeab3 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -124,6 +124,7 @@ int __init get_memcfg_numa_flat(void) node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; memory_present(0, 0, max_pfn); + node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); /* Indicate there is one node available. */ nodes_clear(node_online_map); -- cgit v1.2.3 From 0596152388e234efebce464355186ad9e16c8cb6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 31 May 2008 22:52:47 -0700 Subject: x86, 32-bit: change propagate_e820_map() back to find_max_pfn() we don't need to call memory_present that early. numa and sparse will call memory_present later and might even fail, it will call memory_present for the full range. also for sparse it will call alloc_bootmem ... before we set up bootmem. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 98b099eeab3..ebbbba33815 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -120,7 +120,7 @@ int __init get_memcfg_numa_flat(void) printk("NUMA - single node, flat memory mode\n"); /* Run the memory configuration and find the top of memory. */ - propagate_e820_map(); + find_max_pfn(); node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; memory_present(0, 0, max_pfn); -- cgit v1.2.3 From e8c27ac9191ab9e6506ae5cbe70d87ac50f8e960 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 1 Jun 2008 13:15:22 -0700 Subject: x86, numa, 32-bit: print out debug info on all kvas also fix the print out of node_remap_end_vaddr Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index ebbbba33815..3150ad38567 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -168,6 +168,8 @@ static void __init allocate_pgdat(int nid) reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), "NODE_DATA"); } + printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", + nid, (unsigned long)NODE_DATA(nid)); } #ifdef CONFIG_DISCONTIGMEM @@ -208,8 +210,12 @@ void __init remap_numa_kva(void) int node; for_each_online_node(node) { + printk(KERN_DEBUG "remap_numa_kva: node %d\n", node); for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { vaddr = node_remap_start_vaddr[node]+(pfn< Date: Sun, 1 Jun 2008 21:06:31 -0700 Subject: x86, numa, 32-bit: avoid clash between ramdisk and kva use find_e820_area to get address space... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 3150ad38567..73a983489c6 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -38,6 +38,7 @@ #include #include #include +#include struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -326,7 +327,6 @@ unsigned long __init setup_memory(void) { int nid; unsigned long system_start_pfn, system_max_low_pfn; - unsigned long wasted_pages; /* * When mapping a NUMA machine we allocate the node_mem_map arrays @@ -337,29 +337,18 @@ unsigned long __init setup_memory(void) */ get_memcfg_numa(); - kva_pages = calculate_numa_remap_pages(); + kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); /* partially used pages are not usable - thus round upwards */ system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); - kva_start_pfn = find_max_low_pfn() - kva_pages; - -#ifdef CONFIG_BLK_DEV_INITRD - /* Numa kva area is below the initrd */ - if (initrd_start) - kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET) - - kva_pages; -#endif - - /* - * We waste pages past at the end of the KVA for no good reason other - * than how it is located. This is bad. - */ - wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1); - kva_start_pfn -= wasted_pages; - kva_pages += wasted_pages; - system_max_low_pfn = max_low_pfn = find_max_low_pfn(); + kva_start_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); + kva_start_pfn = find_e820_area(kva_start_pfn<> PAGE_SHIFT; + printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n", kva_start_pfn, max_low_pfn); printk("max_pfn = %ld\n", max_pfn); -- cgit v1.2.3 From 6af61a7614a306fe882a0c2b4ddc63b65aa66efc Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 1 Jun 2008 23:53:50 -0700 Subject: x86: clean up max_pfn_mapped usage - 32-bit on 32-bit in head_32.S after initial page table is done, we get initial max_pfn_mapped, and then kernel_physical_mapping_init will give us a final one. We need to use that to make sure find_e820_area will get valid addresses for boot_map and for NODE_DATA(0) on numa32. XEN PV and lguest may need to assign max_pfn_mapped too. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 73a983489c6..914a81ee785 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -163,7 +163,8 @@ static void __init allocate_pgdat(int nid) else { unsigned long pgdat_phys; pgdat_phys = find_e820_area(min_low_pfn<>PAGE_SHIFT)); reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), -- cgit v1.2.3 From 84b56fa46b36c2df508e7d421feab514fad30f81 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 3 Jun 2008 19:32:30 -0700 Subject: x86, numa, 32-bit: make sure get we kva space when 1/3 user/kernel split is used, and less memory is installed, or if we have a big hole below 4g, max_low_pfn is still using 3g-128m try to go down from max_low_pfn until we get it. otherwise will panic. need to make 32-bit code to use register_e820_active_regions ... later. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 914a81ee785..7ced26ab9ae 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -328,6 +328,7 @@ unsigned long __init setup_memory(void) { int nid; unsigned long system_start_pfn, system_max_low_pfn; + long kva_target_pfn; /* * When mapping a NUMA machine we allocate the node_mem_map arrays @@ -344,11 +345,17 @@ unsigned long __init setup_memory(void) system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); system_max_low_pfn = max_low_pfn = find_max_low_pfn(); - kva_start_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); - kva_start_pfn = find_e820_area(kva_start_pfn<> PAGE_SHIFT; + kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); + do { + kva_start_pfn = find_e820_area(kva_target_pfn<> PAGE_SHIFT; + kva_target_pfn -= PTRS_PER_PTE; + } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn); + + if (kva_start_pfn == -1UL) + panic("Can not get kva space\n"); printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n", kva_start_pfn, max_low_pfn); -- cgit v1.2.3 From 7b2a0a6c4866cac146dcb0433e6984eb19a81335 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 3 Jun 2008 19:35:04 -0700 Subject: x86: make 32-bit use e820_register_active_regions() this way 32-bit is more similar to 64-bit, and smarter e820 and numa. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 7ced26ab9ae..a89ccf3d4c1 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -120,10 +120,9 @@ int __init get_memcfg_numa_flat(void) { printk("NUMA - single node, flat memory mode\n"); - /* Run the memory configuration and find the top of memory. */ - find_max_pfn(); node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; + e820_register_active_regions(0, 0, max_pfn); memory_present(0, 0, max_pfn); node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); @@ -337,6 +336,11 @@ unsigned long __init setup_memory(void) * this space and use it to adjust the boundary between ZONE_NORMAL * and ZONE_HIGHMEM. */ + + /* call find_max_low_pfn at first, it could update max_pfn */ + system_max_low_pfn = max_low_pfn = find_max_low_pfn(); + + remove_all_active_ranges(); get_memcfg_numa(); kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); @@ -344,7 +348,6 @@ unsigned long __init setup_memory(void) /* partially used pages are not usable - thus round upwards */ system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); - system_max_low_pfn = max_low_pfn = find_max_low_pfn(); kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); do { kva_start_pfn = find_e820_area(kva_target_pfn< Date: Sun, 8 Jun 2008 19:39:16 -0700 Subject: mm, x86: shrink_active_range() should check all Now we are using register_e820_active_regions() instead of add_active_range() directly. So end_pfn could be different between the value in early_node_map to node_end_pfn. So we need to make shrink_active_range() smarter. shrink_active_range() is a generic MM function in mm/page_alloc.c but it is only used on 32-bit x86. Should we move it back to some file in arch/x86? Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index a89ccf3d4c1..489605bab85 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -282,7 +282,7 @@ static unsigned long calculate_numa_remap_pages(void) node_end_pfn[nid] -= size; node_remap_start_pfn[nid] = node_end_pfn[nid]; - shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]); + shrink_active_range(nid, node_end_pfn[nid]); } printk("Reserving total of %ld pages for numa KVA remap\n", reserve_pages); -- cgit v1.2.3 From 9043f007963f4039befa3c31f47173f74a0b1c70 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 6 Jun 2008 18:53:33 -0700 Subject: x86, numa, 32-bit: use find_e820_area() to find KVA RAM on node don't assume we can use RAM near the end of every node. Esp systems that have few memory and they could have kva address and kva RAM all below max_low_pfn. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 59 ++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 26 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 489605bab85..accc7c6c57f 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -228,17 +228,21 @@ static unsigned long calculate_numa_remap_pages(void) { int nid; unsigned long size, reserve_pages = 0; - unsigned long pfn; for_each_online_node(nid) { - unsigned old_end_pfn = node_end_pfn[nid]; + u64 node_end_target; + u64 node_end_final; /* * The acpi/srat node info can show hot-add memroy zones * where memory could be added but not currently present. */ + printk("node %d pfn: [%lx - %lx]\n", + nid, node_start_pfn[nid], node_end_pfn[nid]); if (node_start_pfn[nid] > max_pfn) continue; + if (!node_end_pfn[nid]) + continue; if (node_end_pfn[nid] > max_pfn) node_end_pfn[nid] = max_pfn; @@ -250,37 +254,40 @@ static unsigned long calculate_numa_remap_pages(void) /* now the roundup is correct, convert to PAGE_SIZE pages */ size = size * PTRS_PER_PTE; - /* - * Validate the region we are allocating only contains valid - * pages. - */ - for (pfn = node_end_pfn[nid] - size; - pfn < node_end_pfn[nid]; pfn++) - if (!page_is_ram(pfn)) - break; - - if (pfn != node_end_pfn[nid]) - size = 0; + node_end_target = round_down(node_end_pfn[nid] - size, + PTRS_PER_PTE); + node_end_target <<= PAGE_SHIFT; + do { + node_end_final = find_e820_area(node_end_target, + ((u64)node_end_pfn[nid])<>PAGE_SHIFT) > (node_start_pfn[nid])); + + if (node_end_final == -1ULL) + panic("Can not get kva ram\n"); printk("Reserving %ld pages of KVA for lmem_map of node %d\n", size, nid); node_remap_size[nid] = size; node_remap_offset[nid] = reserve_pages; reserve_pages += size; - printk("Shrinking node %d from %ld pages to %ld pages\n", - nid, node_end_pfn[nid], node_end_pfn[nid] - size); - - if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) { - /* - * Align node_end_pfn[] and node_remap_start_pfn[] to - * pmd boundary. remap_numa_kva will barf otherwise. - */ - printk("Shrinking node %d further by %ld pages for proper alignment\n", - nid, node_end_pfn[nid] & (PTRS_PER_PTE-1)); - size += node_end_pfn[nid] & (PTRS_PER_PTE-1); - } + printk("Shrinking node %d from %ld pages to %lld pages\n", + nid, node_end_pfn[nid], node_end_final>>PAGE_SHIFT); + + /* + * prevent kva address below max_low_pfn want it on system + * with less memory later. + * layout will be: KVA address , KVA RAM + */ + if ((node_end_final>>PAGE_SHIFT) < max_low_pfn) + reserve_early(node_end_final, + node_end_final+(((u64)size)<>PAGE_SHIFT; node_remap_start_pfn[nid] = node_end_pfn[nid]; shrink_active_range(nid, node_end_pfn[nid]); } -- cgit v1.2.3 From cc1050bafebfb1d7935331282e948b5016318192 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 13 Jun 2008 19:08:52 -0700 Subject: x86: replace shrink_active_range() with remove_active_range() in case we have kva before ramdisk on a node, we still need to use those ranges. v2: reserve_early kva ram area, in case there are holes in highmem, to avoid those area could be treat as free high pages. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index accc7c6c57f..c3f119e99e0 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -230,8 +230,8 @@ static unsigned long calculate_numa_remap_pages(void) unsigned long size, reserve_pages = 0; for_each_online_node(nid) { - u64 node_end_target; - u64 node_end_final; + u64 node_kva_target; + u64 node_kva_final; /* * The acpi/srat node info can show hot-add memroy zones @@ -254,42 +254,45 @@ static unsigned long calculate_numa_remap_pages(void) /* now the roundup is correct, convert to PAGE_SIZE pages */ size = size * PTRS_PER_PTE; - node_end_target = round_down(node_end_pfn[nid] - size, + node_kva_target = round_down(node_end_pfn[nid] - size, PTRS_PER_PTE); - node_end_target <<= PAGE_SHIFT; + node_kva_target <<= PAGE_SHIFT; do { - node_end_final = find_e820_area(node_end_target, + node_kva_final = find_e820_area(node_kva_target, ((u64)node_end_pfn[nid])<>PAGE_SHIFT) > (node_start_pfn[nid])); + node_kva_target -= LARGE_PAGE_BYTES; + } while (node_kva_final == -1ULL && + (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid])); - if (node_end_final == -1ULL) + if (node_kva_final == -1ULL) panic("Can not get kva ram\n"); - printk("Reserving %ld pages of KVA for lmem_map of node %d\n", - size, nid); node_remap_size[nid] = size; node_remap_offset[nid] = reserve_pages; reserve_pages += size; - printk("Shrinking node %d from %ld pages to %lld pages\n", - nid, node_end_pfn[nid], node_end_final>>PAGE_SHIFT); + printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", + size, nid, node_kva_final>>PAGE_SHIFT); /* * prevent kva address below max_low_pfn want it on system * with less memory later. * layout will be: KVA address , KVA RAM + * + * we are supposed to only record the one less then max_low_pfn + * but we could have some hole in high memory, and it will only + * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide + * to use it as free. + * So reserve_early here, hope we don't run out of that array */ - if ((node_end_final>>PAGE_SHIFT) < max_low_pfn) - reserve_early(node_end_final, - node_end_final+(((u64)size)<>PAGE_SHIFT; - node_remap_start_pfn[nid] = node_end_pfn[nid]; - shrink_active_range(nid, node_end_pfn[nid]); + reserve_early(node_kva_final, + node_kva_final+(((u64)size)<>PAGE_SHIFT; + remove_active_range(nid, node_remap_start_pfn[nid], + node_remap_start_pfn[nid] + size); } printk("Reserving total of %ld pages for numa KVA remap\n", reserve_pages); -- cgit v1.2.3 From b5bc6c0e55000dab86b73f838f5ad02908b23755 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 14 Jun 2008 18:32:52 -0700 Subject: x86, mm: use add_highpages_with_active_regions() for high pages init v2 use early_node_map to init high pages, so we can remove page_is_ram() and page_is_reserved_early() in the big loop with add_one_highpage also remove page_is_reserved_early(), it is not needed anymore. v2: fix the build of other platforms Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index c3f119e99e0..7c4d0255f8d 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -100,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, #endif extern unsigned long find_max_low_pfn(void); -extern void add_one_highpage_init(struct page *, int, int); extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) @@ -432,10 +431,10 @@ void __init set_highmem_pages_init(int bad_ppro) { #ifdef CONFIG_HIGHMEM struct zone *zone; - struct page *page; + int nid; for_each_zone(zone) { - unsigned long node_pfn, zone_start_pfn, zone_end_pfn; + unsigned long zone_start_pfn, zone_end_pfn; if (!is_highmem(zone)) continue; @@ -443,16 +442,12 @@ void __init set_highmem_pages_init(int bad_ppro) zone_start_pfn = zone->zone_start_pfn; zone_end_pfn = zone_start_pfn + zone->spanned_pages; + nid = zone_to_nid(zone); printk("Initializing %s for node %d (%08lx:%08lx)\n", - zone->name, zone_to_nid(zone), - zone_start_pfn, zone_end_pfn); - - for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { - if (!pfn_valid(node_pfn)) - continue; - page = pfn_to_page(node_pfn); - add_one_highpage_init(page, node_pfn, bad_ppro); - } + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn, bad_ppro); } totalram_pages += totalhigh_pages; #endif -- cgit v1.2.3 From cc9f7a0ccf000d4db5fbdc7b0ae48eefea102f69 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 16 Jun 2008 16:11:08 -0700 Subject: x86: kill bad_ppro so don't punish all other cpus without that problem when init highmem Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 7c4d0255f8d..6216e43b6e9 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -427,7 +427,7 @@ void __init zone_sizes_init(void) return; } -void __init set_highmem_pages_init(int bad_ppro) +void __init set_highmem_pages_init(void) { #ifdef CONFIG_HIGHMEM struct zone *zone; @@ -447,7 +447,7 @@ void __init set_highmem_pages_init(int bad_ppro) zone->name, nid, zone_start_pfn, zone_end_pfn); add_highpages_with_active_regions(nid, zone_start_pfn, - zone_end_pfn, bad_ppro); + zone_end_pfn); } totalram_pages += totalhigh_pages; #endif -- cgit v1.2.3 From b2ac82a0909aea0d2620ba4c189f37c567c21fe5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 22 Jun 2008 02:45:39 -0700 Subject: x86: introduce initmem_init for 32 bit Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index a2f73ba42b8..3e75be46c4f 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -309,8 +309,8 @@ static void init_remap_allocator(int nid) (ulong) node_remap_end_vaddr[nid]); } -extern void setup_bootmem_allocator(void); -unsigned long __init setup_memory(void) +unsigned long __init initmem_init(unsigned long start_pfn, + unsigned long end_pfn) { int nid; unsigned long system_start_pfn, system_max_low_pfn; -- cgit v1.2.3 From 2ec65f8b89ea003c27ff7723525a2ee335a2b393 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 23 Jun 2008 03:05:30 -0700 Subject: x86: clean up using max_low_pfn on 32-bit so that max_low_pfn is not changed after it is set. so we can move that early and out of initmem_init. could call find_low_pfn_range just after max_pfn is set. also could move reserve_initrd out of setup_bootmem_allocator so 32bit is more like 64bit. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 3e75be46c4f..1dfff700264 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -309,11 +309,10 @@ static void init_remap_allocator(int nid) (ulong) node_remap_end_vaddr[nid]); } -unsigned long __init initmem_init(unsigned long start_pfn, +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) { int nid; - unsigned long system_start_pfn, system_max_low_pfn; long kva_target_pfn; /* @@ -324,17 +323,11 @@ unsigned long __init initmem_init(unsigned long start_pfn, * and ZONE_HIGHMEM. */ - /* call find_max_low_pfn at first, it could update max_pfn */ - system_max_low_pfn = max_low_pfn = find_max_low_pfn(); - remove_all_active_ranges(); get_memcfg_numa(); kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); - /* partially used pages are not usable - thus round upwards */ - system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); - kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); do { kva_start_pfn = find_e820_area(kva_target_pfn< system_max_low_pfn) - highstart_pfn = system_max_low_pfn; + if (max_pfn > max_low_pfn) + highstart_pfn = max_low_pfn; printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - num_physpages = system_max_low_pfn; - high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1; + num_physpages = max_low_pfn; + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(system_max_low_pfn)); - printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", + pages_to_mb(max_low_pfn)); + printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", min_low_pfn, max_low_pfn, highstart_pfn); printk("Low memory ends at vaddr %08lx\n", @@ -387,7 +380,6 @@ unsigned long __init initmem_init(unsigned long start_pfn, memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); NODE_DATA(0)->bdata = &node0_bdata; setup_bootmem_allocator(); - return max_low_pfn; } void __init zone_sizes_init(void) -- cgit v1.2.3 From c09434571d4b1d8abf530ba4ce28cb868b45f2e5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 23 Jun 2008 16:41:30 -0700 Subject: x86: numa32 pfn print out using hex instead Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 1dfff700264..f5ae31935ca 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -76,13 +76,13 @@ void memory_present(int nid, unsigned long start, unsigned long end) { unsigned long pfn; - printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n", + printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n", nid, start, end); printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); printk(KERN_DEBUG " "); for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { physnode_map[pfn / PAGES_PER_ELEMENT] = nid; - printk(KERN_CONT "%ld ", pfn); + printk(KERN_CONT "%lx ", pfn); } printk(KERN_CONT "\n"); } @@ -117,7 +117,7 @@ static unsigned long kva_pages; */ int __init get_memcfg_numa_flat(void) { - printk("NUMA - single node, flat memory mode\n"); + printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; @@ -233,7 +233,7 @@ static unsigned long calculate_numa_remap_pages(void) * The acpi/srat node info can show hot-add memroy zones * where memory could be added but not currently present. */ - printk("node %d pfn: [%lx - %lx]\n", + printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", nid, node_start_pfn[nid], node_end_pfn[nid]); if (node_start_pfn[nid] > max_pfn) continue; @@ -268,7 +268,8 @@ static unsigned long calculate_numa_remap_pages(void) node_remap_size[nid] = size; node_remap_offset[nid] = reserve_pages; reserve_pages += size; - printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", + printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of" + " node %d at %llx\n", size, nid, node_kva_final>>PAGE_SHIFT); /* @@ -290,7 +291,7 @@ static unsigned long calculate_numa_remap_pages(void) remove_active_range(nid, node_remap_start_pfn[nid], node_remap_start_pfn[nid] + size); } - printk("Reserving total of %ld pages for numa KVA remap\n", + printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", reserve_pages); return reserve_pages; } @@ -304,7 +305,7 @@ static void init_remap_allocator(int nid) node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + ALIGN(sizeof(pg_data_t), PAGE_SIZE); - printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, + printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, (ulong) node_remap_start_vaddr[nid], (ulong) node_remap_end_vaddr[nid]); } @@ -340,9 +341,9 @@ void __init initmem_init(unsigned long start_pfn, if (kva_start_pfn == -1UL) panic("Can not get kva space\n"); - printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n", + printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", kva_start_pfn, max_low_pfn); - printk("max_pfn = %ld\n", max_pfn); + printk(KERN_INFO "max_pfn = %lx\n", max_pfn); /* avoid clash with initrd */ reserve_early(kva_start_pfn<spanned_pages; nid = zone_to_nid(zone); - printk("Initializing %s for node %d (%08lx:%08lx)\n", + printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", zone->name, nid, zone_start_pfn, zone_end_pfn); add_highpages_with_active_regions(nid, zone_start_pfn, -- cgit v1.2.3 From 3a58a2a6c879b2e47daafd6e641661c50ac9da5a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 24 Jun 2008 12:19:41 -0700 Subject: x86: introduce init_memory_mapping for 32bit #3 move kva related early backto initmem_init for numa32 Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index f5ae31935ca..42484d446d0 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -200,7 +200,7 @@ void *alloc_remap(int nid, unsigned long size) return allocation; } -void __init remap_numa_kva(void) +static void __init remap_numa_kva(void) { void *vaddr; unsigned long pfn; @@ -373,12 +373,16 @@ void __init initmem_init(unsigned long start_pfn, allocate_pgdat(nid); } + remap_numa_kva(); + printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); for_each_online_node(nid) propagate_e820_map_node(nid); - memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); + for_each_online_node(nid) + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); + NODE_DATA(0)->bdata = &node0_bdata; setup_bootmem_allocator(); } -- cgit v1.2.3 From 996cf4438f9ed711c98a3cb5ab88f842a4102427 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 30 Jun 2008 18:34:58 -0700 Subject: x86: don't reallocate pgt for node0 kva ram already mapped right after away, so don't need to get that for low ram. avoid wasting one copy of pgdat. also add node id in early_res name in case we get it from find_e820_area. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 42484d446d0..e2f0dd13a45 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -156,17 +156,20 @@ static void __init propagate_e820_map_node(int nid) */ static void __init allocate_pgdat(int nid) { - if (nid && node_has_online_mem(nid) && node_remap_start_vaddr[nid]) + char buf[16]; + + if (node_has_online_mem(nid) && node_remap_start_vaddr[nid]) NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; else { unsigned long pgdat_phys; pgdat_phys = find_e820_area(min_low_pfn<>PAGE_SHIFT)); - reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), - "NODE_DATA"); + memset(buf, 0, sizeof(buf)); + sprintf(buf, "NODE_DATA %d", nid); + reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); } printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", nid, (unsigned long)NODE_DATA(nid)); -- cgit v1.2.3 From cb95a13a8ace8612ecab042a838e5aab2ec14ef0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 2 Jul 2008 00:31:02 -0700 Subject: x86: merge zones_sizes_init for numa and non numa on 32-bit move out e820_register_active_regions from non numa zones_sizes_init() and remove numa version zones_sizes_init(). and let 32 bit call remove_all_active_ranges() in setup_arch() directly like 64-bit Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index e2f0dd13a45..fa389d20383 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -327,7 +327,6 @@ void __init initmem_init(unsigned long start_pfn, * and ZONE_HIGHMEM. */ - remove_all_active_ranges(); get_memcfg_numa(); kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); @@ -390,21 +389,6 @@ void __init initmem_init(unsigned long start_pfn, setup_bootmem_allocator(); } -void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = - virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - max_zone_pfns[ZONE_NORMAL] = max_low_pfn; -#ifdef CONFIG_HIGHMEM - max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; -#endif - - free_area_init_nodes(max_zone_pfns); - return; -} - void __init set_highmem_pages_init(void) { #ifdef CONFIG_HIGHMEM -- cgit v1.2.3 From 698839fe04ae11f228074ad45614343c3921a6c6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 6 Jul 2008 11:42:11 -0700 Subject: x86: remove have_arch_parse_srat -v2 we already have the same srat handling interface for 32bit. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index fa389d20383..5dfef9fa061 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -443,20 +443,3 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif -#if defined(CONFIG_ACPI_NUMA) && !defined(CONFIG_HAVE_ARCH_PARSE_SRAT) -/* - * Dummy on 32-bit, for now: - */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ -} - -void __init -acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) -{ -} - -void __init acpi_numa_arch_fixup(void) -{ -} -#endif -- cgit v1.2.3