From 0bbbc0b33d141f78a0d9218a54a47f50621220d3 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:05 -0800 Subject: thp: add numa awareness to hugepage allocations It's mostly a matter of replacing alloc_pages with alloc_pages_vma after introducing alloc_pages_vma. khugepaged needs special handling as the allocation has to happen inside collapse_huge_page where the vma is known and an error has to be returned to the outer loop to sleep alloc_sleep_millisecs in case of failure. But it retains the more efficient logic of handling allocation failures in khugepaged in case of CONFIG_NUMA=n. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++--------- mm/mempolicy.c | 13 +++++---- 2 files changed, 82 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0415a83afd6..f6559e7711b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -620,11 +620,26 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, return ret; } +static inline gfp_t alloc_hugepage_gfpmask(int defrag) +{ + return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT); +} + +static inline struct page *alloc_hugepage_vma(int defrag, + struct vm_area_struct *vma, + unsigned long haddr) +{ + return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), + HPAGE_PMD_ORDER, vma, haddr); +} + +#ifndef CONFIG_NUMA static inline struct page *alloc_hugepage(int defrag) { - return alloc_pages(GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT), + return alloc_pages(alloc_hugepage_gfpmask(defrag), HPAGE_PMD_ORDER); } +#endif int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, @@ -639,7 +654,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma))) return VM_FAULT_OOM; - page = alloc_hugepage(transparent_hugepage_defrag(vma)); + page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr); if (unlikely(!page)) goto out; if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { @@ -862,7 +878,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) - new_page = alloc_hugepage(transparent_hugepage_defrag(vma)); + new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr); else new_page = NULL; @@ -1661,7 +1678,11 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long hstart, hend; VM_BUG_ON(address & ~HPAGE_PMD_MASK); +#ifndef CONFIG_NUMA VM_BUG_ON(!*hpage); +#else + VM_BUG_ON(*hpage); +#endif /* * Prevent all access to pagetables with the exception of @@ -1699,9 +1720,17 @@ static void collapse_huge_page(struct mm_struct *mm, if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) goto out; +#ifndef CONFIG_NUMA new_page = *hpage; - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) +#else + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + if (unlikely(!new_page)) { + *hpage = ERR_PTR(-ENOMEM); goto out; + } +#endif + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) + goto out_put_page; anon_vma_lock(vma->anon_vma); @@ -1730,7 +1759,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_unlock(&mm->page_table_lock); anon_vma_unlock(vma->anon_vma); mem_cgroup_uncharge_page(new_page); - goto out; + goto out_put_page; } /* @@ -1765,10 +1794,19 @@ static void collapse_huge_page(struct mm_struct *mm, mm->nr_ptes--; spin_unlock(&mm->page_table_lock); +#ifndef CONFIG_NUMA *hpage = NULL; +#endif khugepaged_pages_collapsed++; out: up_write(&mm->mmap_sem); + return; + +out_put_page: +#ifdef CONFIG_NUMA + put_page(new_page); +#endif + goto out; } static int khugepaged_scan_pmd(struct mm_struct *mm, @@ -2001,11 +2039,16 @@ static void khugepaged_do_scan(struct page **hpage) while (progress < pages) { cond_resched(); +#ifndef CONFIG_NUMA if (!*hpage) { *hpage = alloc_hugepage(khugepaged_defrag()); if (unlikely(!*hpage)) break; } +#else + if (IS_ERR(*hpage)) + break; +#endif spin_lock(&khugepaged_mm_lock); if (!khugepaged_scan.mm_slot) @@ -2020,37 +2063,55 @@ static void khugepaged_do_scan(struct page **hpage) } } +static void khugepaged_alloc_sleep(void) +{ + DEFINE_WAIT(wait); + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); +} + +#ifndef CONFIG_NUMA static struct page *khugepaged_alloc_hugepage(void) { struct page *hpage; do { hpage = alloc_hugepage(khugepaged_defrag()); - if (!hpage) { - DEFINE_WAIT(wait); - add_wait_queue(&khugepaged_wait, &wait); - schedule_timeout_interruptible( - msecs_to_jiffies( - khugepaged_alloc_sleep_millisecs)); - remove_wait_queue(&khugepaged_wait, &wait); - } + if (!hpage) + khugepaged_alloc_sleep(); } while (unlikely(!hpage) && likely(khugepaged_enabled())); return hpage; } +#endif static void khugepaged_loop(void) { struct page *hpage; +#ifdef CONFIG_NUMA + hpage = NULL; +#endif while (likely(khugepaged_enabled())) { +#ifndef CONFIG_NUMA hpage = khugepaged_alloc_hugepage(); if (unlikely(!hpage)) break; +#else + if (IS_ERR(hpage)) { + khugepaged_alloc_sleep(); + hpage = NULL; + } +#endif khugepaged_do_scan(&hpage); +#ifndef CONFIG_NUMA if (hpage) put_page(hpage); +#endif if (khugepaged_has_work()) { DEFINE_WAIT(wait); if (!khugepaged_scan_sleep_millisecs) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 83b7df309fc..368fc9d2361 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1796,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, } /** - * alloc_page_vma - Allocate a page for a VMA. + * alloc_pages_vma - Allocate a page for a VMA. * * @gfp: * %GFP_USER user allocation. @@ -1805,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * %GFP_FS allocation should not call back into a file system. * %GFP_ATOMIC don't sleep. * + * @order:Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @@ -1818,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * Should be called with the mm_sem of the vma hold. */ struct page * -alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) +alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; @@ -1830,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); mpol_cond_put(pol); - page = alloc_page_interleave(gfp, 0, nid); + page = alloc_page_interleave(gfp, order, nid); put_mems_allowed(); return page; } @@ -1839,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * slow path: ref counted shared policy */ - struct page *page = __alloc_pages_nodemask(gfp, 0, + struct page *page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); put_mems_allowed(); @@ -1848,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * fast path: default or task policy */ - page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + page = __alloc_pages_nodemask(gfp, order, zl, + policy_nodemask(gfp, pol)); put_mems_allowed(); return page; } -- cgit v1.2.3