diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c | 1 | ||||
-rw-r--r-- | mm/mempolicy.c | 99 | ||||
-rw-r--r-- | mm/page-writeback.c | 7 | ||||
-rw-r--r-- | mm/page_alloc.c | 17 | ||||
-rw-r--r-- | mm/rmap.c | 2 | ||||
-rw-r--r-- | mm/slab.c | 58 | ||||
-rw-r--r-- | mm/swap.c | 26 | ||||
-rw-r--r-- | mm/swapfile.c | 17 | ||||
-rw-r--r-- | mm/vmscan.c | 146 |
9 files changed, 246 insertions, 127 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index a965b6b35f2..44da3d47699 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -94,6 +94,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->private_lock (try_to_unmap_one) * ->tree_lock (try_to_unmap_one) * ->zone.lru_lock (follow_page->mark_page_accessed) + * ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) * ->inode_lock (page_remove_rmap->set_page_dirty) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3171f884d24..73790188b0e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -185,8 +185,8 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) } static void gather_stats(struct page *, void *); -static void migrate_page_add(struct vm_area_struct *vma, - struct page *page, struct list_head *pagelist, unsigned long flags); +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags); /* Scan through pages checking if pages follow certain conditions. */ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, @@ -208,6 +208,17 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page = vm_normal_page(vma, addr, *pte); if (!page) continue; + /* + * The check for PageReserved here is important to avoid + * handling zero pages and other pages that may have been + * marked special by the system. + * + * If the PageReserved would not be checked here then f.e. + * the location of the zero page could have an influence + * on MPOL_MF_STRICT, zero pages would be counted for + * the per node stats, and there would be useless attempts + * to put zero pages on the migration list. + */ if (PageReserved(page)) continue; nid = page_to_nid(page); @@ -216,11 +227,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (flags & MPOL_MF_STATS) gather_stats(page, private); - else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - spin_unlock(ptl); - migrate_page_add(vma, page, private, flags); - spin_lock(ptl); - } + else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + migrate_page_add(page, private, flags); else break; } while (pte++, addr += PAGE_SIZE, addr != end); @@ -309,6 +317,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, int err; struct vm_area_struct *first, *vma, *prev; + /* Clear the LRU lists so pages can be isolated */ + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + lru_add_drain_all(); + first = find_vma(mm, start); if (!first) return ERR_PTR(-EFAULT); @@ -519,51 +531,15 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, * page migration */ -/* Check if we are the only process mapping the page in question */ -static inline int single_mm_mapping(struct mm_struct *mm, - struct address_space *mapping) -{ - struct vm_area_struct *vma; - struct prio_tree_iter iter; - int rc = 1; - - spin_lock(&mapping->i_mmap_lock); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) - if (mm != vma->vm_mm) { - rc = 0; - goto out; - } - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) - if (mm != vma->vm_mm) { - rc = 0; - goto out; - } -out: - spin_unlock(&mapping->i_mmap_lock); - return rc; -} - -/* - * Add a page to be migrated to the pagelist - */ -static void migrate_page_add(struct vm_area_struct *vma, - struct page *page, struct list_head *pagelist, unsigned long flags) +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) { /* - * Avoid migrating a page that is shared by others and not writable. + * Avoid migrating a page that is shared with others. */ - if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || - mapping_writably_mapped(page->mapping) || - single_mm_mapping(vma->vm_mm, page->mapping)) { - int rc = isolate_lru_page(page); - - if (rc == 1) + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { + if (isolate_lru_page(page)) list_add(&page->lru, pagelist); - /* - * If the isolate attempt was not successful then we just - * encountered an unswappable page. Something must be wrong. - */ - WARN_ON(rc == 0); } } @@ -1000,6 +976,33 @@ static unsigned interleave_nodes(struct mempolicy *policy) return nid; } +/* + * Depending on the memory policy provide a node from which to allocate the + * next slab entry. + */ +unsigned slab_node(struct mempolicy *policy) +{ + switch (policy->policy) { + case MPOL_INTERLEAVE: + return interleave_nodes(policy); + + case MPOL_BIND: + /* + * Follow bind policy behavior and start allocation at the + * first node. + */ + return policy->v.zonelist->zones[0]->zone_pgdat->node_id; + + case MPOL_PREFERRED: + if (policy->v.preferred_node >= 0) + return policy->v.preferred_node; + /* Fall through */ + + default: + return numa_node_id(); + } +} + /* Do static interleaving for a VMA with known offset. */ static unsigned offset_il_node(struct mempolicy *pol, struct vm_area_struct *vma, unsigned long off) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5240e426c1f..945559fb63d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -46,7 +46,7 @@ static long ratelimit_pages = 32; static long total_pages; /* The total number of pages in the machine. */ -static int dirty_exceeded; /* Dirty mem may be over limit */ +static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ /* * When balance_dirty_pages decides that the caller needs to perform some @@ -212,7 +212,8 @@ static void balance_dirty_pages(struct address_space *mapping) if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) break; - dirty_exceeded = 1; + if (!dirty_exceeded) + dirty_exceeded = 1; /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. * Unstable writes are a feature of certain networked @@ -234,7 +235,7 @@ static void balance_dirty_pages(struct address_space *mapping) blk_congestion_wait(WRITE, HZ/10); } - if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) + if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded) dirty_exceeded = 0; if (writeback_in_progress(bdi)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c2e29743a8d..df54e2fc8ee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -878,7 +878,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, mark = (*z)->pages_high; if (!zone_watermark_ok(*z, order, mark, classzone_idx, alloc_flags)) - continue; + if (!zone_reclaim_mode || + !zone_reclaim(*z, gfp_mask, order)) + continue; } page = buffered_rmqueue(zonelist, *z, order, gfp_mask); @@ -1595,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat) prev_node = local_node; nodes_clear(used_mask); while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { + int distance = node_distance(local_node, node); + + /* + * If another node is sufficiently far away then it is better + * to reclaim pages in a zone before going off node. + */ + if (distance > RECLAIM_DISTANCE) + zone_reclaim_mode = 1; + /* * We don't want to pressure a particular node. * So adding penalty to the first node in same * distance group to make it round-robin. */ - if (node_distance(local_node, node) != - node_distance(local_node, prev_node)) + + if (distance != node_distance(local_node, prev_node)) node_load[node] += load; prev_node = node; load--; diff --git a/mm/rmap.c b/mm/rmap.c index dfbb89f99a1..d85a99d28c0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -33,7 +33,7 @@ * mapping->i_mmap_lock * anon_vma->lock * mm->page_table_lock or pte_lock - * zone->lru_lock (in mark_page_accessed) + * zone->lru_lock (in mark_page_accessed, isolate_lru_page) * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) diff --git a/mm/slab.c b/mm/slab.c index 9374293a301..6f8495e2185 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -68,7 +68,7 @@ * Further notes from the original documentation: * * 11 April '97. Started multi-threading - markhe - * The global cache-chain is protected by the semaphore 'cache_chain_sem'. + * The global cache-chain is protected by the mutex 'cache_chain_mutex'. * The sem is only needed when accessing/extending the cache-chain, which * can never happen inside an interrupt (kmem_cache_create(), * kmem_cache_shrink() and kmem_cache_reap()). @@ -103,6 +103,8 @@ #include <linux/rcupdate.h> #include <linux/string.h> #include <linux/nodemask.h> +#include <linux/mempolicy.h> +#include <linux/mutex.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -631,7 +633,7 @@ static kmem_cache_t cache_cache = { }; /* Guard access to the cache-chain. */ -static struct semaphore cache_chain_sem; +static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; /* @@ -772,6 +774,8 @@ static struct array_cache *alloc_arraycache(int node, int entries, } #ifdef CONFIG_NUMA +static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int); + static inline struct array_cache **alloc_alien_cache(int node, int limit) { struct array_cache **ac_ptr; @@ -857,7 +861,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: - down(&cache_chain_sem); + mutex_lock(&cache_chain_mutex); /* we need to do this right in the beginning since * alloc_arraycache's are going to use this list. * kmalloc_node allows us to add the slab to the right @@ -912,7 +916,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, l3->shared = nc; } } - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); break; case CPU_ONLINE: start_cpu_timer(cpu); @@ -921,7 +925,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, case CPU_DEAD: /* fall thru */ case CPU_UP_CANCELED: - down(&cache_chain_sem); + mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; @@ -973,13 +977,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, spin_unlock_irq(&cachep->spinlock); kfree(nc); } - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); break; #endif } return NOTIFY_OK; bad: - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); return NOTIFY_BAD; } @@ -1047,7 +1051,6 @@ void __init kmem_cache_init(void) */ /* 1) create the cache_cache */ - init_MUTEX(&cache_chain_sem); INIT_LIST_HEAD(&cache_chain); list_add(&cache_cache.next, &cache_chain); cache_cache.colour_off = cache_line_size(); @@ -1168,10 +1171,10 @@ void __init kmem_cache_init(void) /* 6) resize the head arrays to their final sizes */ { kmem_cache_t *cachep; - down(&cache_chain_sem); + mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) enable_cpucache(cachep); - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); } /* Done! */ @@ -1590,7 +1593,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, BUG(); } - down(&cache_chain_sem); + mutex_lock(&cache_chain_mutex); list_for_each(p, &cache_chain) { kmem_cache_t *pc = list_entry(p, kmem_cache_t, next); @@ -1856,7 +1859,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (!cachep && (flags & SLAB_PANIC)) panic("kmem_cache_create(): failed to create slab `%s'\n", name); - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); return cachep; } EXPORT_SYMBOL(kmem_cache_create); @@ -2044,18 +2047,18 @@ int kmem_cache_destroy(kmem_cache_t *cachep) lock_cpu_hotplug(); /* Find the cache in the chain of caches. */ - down(&cache_chain_sem); + mutex_lock(&cache_chain_mutex); /* * the chain is never empty, cache_cache is never destroyed */ list_del(&cachep->next); - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); if (__cache_shrink(cachep)) { slab_error(cachep, "Can't free all objects"); - down(&cache_chain_sem); + mutex_lock(&cache_chain_mutex); list_add(&cachep->next, &cache_chain); - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); unlock_cpu_hotplug(); return 1; } @@ -2570,6 +2573,15 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) void *objp; struct array_cache *ac; +#ifdef CONFIG_NUMA + if (unlikely(current->mempolicy && !in_interrupt())) { + int nid = slab_node(current->mempolicy); + + if (nid != numa_node_id()) + return __cache_alloc_node(cachep, flags, nid); + } +#endif + check_irq_off(); ac = ac_data(cachep); if (likely(ac->avail)) { @@ -3314,7 +3326,7 @@ static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, * - clear the per-cpu caches for this CPU. * - return freeable pages to the main free memory pool. * - * If we cannot acquire the cache chain semaphore then just give up - we'll + * If we cannot acquire the cache chain mutex then just give up - we'll * try again on the next iteration. */ static void cache_reap(void *unused) @@ -3322,7 +3334,7 @@ static void cache_reap(void *unused) struct list_head *walk; struct kmem_list3 *l3; - if (down_trylock(&cache_chain_sem)) { + if (!mutex_trylock(&cache_chain_mutex)) { /* Give up. Setup the next iteration. */ schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); @@ -3393,7 +3405,7 @@ static void cache_reap(void *unused) cond_resched(); } check_irq_on(); - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); drain_remote_pages(); /* Setup the next iteration */ schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); @@ -3429,7 +3441,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) loff_t n = *pos; struct list_head *p; - down(&cache_chain_sem); + mutex_lock(&cache_chain_mutex); if (!n) print_slabinfo_header(m); p = cache_chain.next; @@ -3451,7 +3463,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) static void s_stop(struct seq_file *m, void *p) { - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); } static int s_show(struct seq_file *m, void *p) @@ -3603,7 +3615,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, return -EINVAL; /* Find the cache in the chain of caches. */ - down(&cache_chain_sem); + mutex_lock(&cache_chain_mutex); res = -EINVAL; list_for_each(p, &cache_chain) { kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); @@ -3620,7 +3632,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, break; } } - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); if (res >= 0) res = count; return res; diff --git a/mm/swap.c b/mm/swap.c index cbb48e721ab..bc2442a7b0e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -174,6 +174,32 @@ void lru_add_drain(void) put_cpu(); } +#ifdef CONFIG_NUMA +static void lru_add_drain_per_cpu(void *dummy) +{ + lru_add_drain(); +} + +/* + * Returns 0 for success + */ +int lru_add_drain_all(void) +{ + return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); +} + +#else + +/* + * Returns 0 for success + */ +int lru_add_drain_all(void) +{ + lru_add_drain(); + return 0; +} +#endif + /* * This path almost never happens for VM activity - pages are normally * freed via pagevecs. But it gets used by networking. diff --git a/mm/swapfile.c b/mm/swapfile.c index 957fef43fa6..f1e69c30d20 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -25,6 +25,7 @@ #include <linux/rmap.h> #include <linux/security.h> #include <linux/backing-dev.h> +#include <linux/mutex.h> #include <linux/capability.h> #include <linux/syscalls.h> @@ -46,12 +47,12 @@ struct swap_list_t swap_list = {-1, -1}; struct swap_info_struct swap_info[MAX_SWAPFILES]; -static DECLARE_MUTEX(swapon_sem); +static DEFINE_MUTEX(swapon_mutex); /* * We need this because the bdev->unplug_fn can sleep and we cannot * hold swap_lock while calling the unplug_fn. And swap_lock - * cannot be turned into a semaphore. + * cannot be turned into a mutex. */ static DECLARE_RWSEM(swap_unplug_sem); @@ -1161,7 +1162,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) up_write(&swap_unplug_sem); destroy_swap_extents(p); - down(&swapon_sem); + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); drain_mmlist(); @@ -1180,7 +1181,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) p->swap_map = NULL; p->flags = 0; spin_unlock(&swap_lock); - up(&swapon_sem); + mutex_unlock(&swapon_mutex); vfree(swap_map); inode = mapping->host; if (S_ISBLK(inode->i_mode)) { @@ -1209,7 +1210,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) int i; loff_t l = *pos; - down(&swapon_sem); + mutex_lock(&swapon_mutex); for (i = 0; i < nr_swapfiles; i++, ptr++) { if (!(ptr->flags & SWP_USED) || !ptr->swap_map) @@ -1238,7 +1239,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) static void swap_stop(struct seq_file *swap, void *v) { - up(&swapon_sem); + mutex_unlock(&swapon_mutex); } static int swap_show(struct seq_file *swap, void *v) @@ -1540,7 +1541,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) goto bad_swap; } - down(&swapon_sem); + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); p->flags = SWP_ACTIVE; nr_swap_pages += nr_good_pages; @@ -1566,7 +1567,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) swap_info[prev].next = p - swap_info; } spin_unlock(&swap_lock); - up(&swapon_sem); + mutex_unlock(&swapon_mutex); error = 0; goto out; bad_swap: diff --git a/mm/vmscan.c b/mm/vmscan.c index bf903b2d198..2e34b61a70c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -71,6 +71,9 @@ struct scan_control { int may_writepage; + /* Can pages be swapped as part of reclaim? */ + int may_swap; + /* This context's SWAP_CLUSTER_MAX. If freeing memory for * suspend, we effectively ignore SWAP_CLUSTER_MAX. * In this context, it doesn't matter that we scan the @@ -458,6 +461,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * Try to allocate it some swap space here. */ if (PageAnon(page) && !PageSwapCache(page)) { + if (!sc->may_swap) + goto keep_locked; if (!add_to_swap(page, GFP_ATOMIC)) goto activate_locked; } @@ -586,7 +591,7 @@ static inline void move_to_lru(struct page *page) } /* - * Add isolated pages on the list back to the LRU + * Add isolated pages on the list back to the LRU. * * returns the number of pages put back. */ @@ -760,46 +765,33 @@ next: return nr_failed + retry; } -static void lru_add_drain_per_cpu(void *dummy) -{ - lru_add_drain(); -} - /* * Isolate one page from the LRU lists and put it on the - * indicated list. Do necessary cache draining if the - * page is not on the LRU lists yet. + * indicated list with elevated refcount. * * Result: * 0 = page not on LRU list * 1 = page removed from LRU list and added to the specified list. - * -ENOENT = page is being freed elsewhere. */ int isolate_lru_page(struct page *page) { - int rc = 0; - struct zone *zone = page_zone(page); + int ret = 0; -redo: - spin_lock_irq(&zone->lru_lock); - rc = __isolate_lru_page(page); - if (rc == 1) { - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); - } - spin_unlock_irq(&zone->lru_lock); - if (rc == 0) { - /* - * Maybe this page is still waiting for a cpu to drain it - * from one of the lru lists? - */ - rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); - if (rc == 0 && PageLRU(page)) - goto redo; + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + spin_lock_irq(&zone->lru_lock); + if (TestClearPageLRU(page)) { + ret = 1; + get_page(page); + if (PageActive(page)) + del_page_from_active_list(zone, page); + else + del_page_from_inactive_list(zone, page); + } + spin_unlock_irq(&zone->lru_lock); } - return rc; + + return ret; } #endif @@ -831,18 +823,20 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - switch (__isolate_lru_page(page)) { - case 1: - /* Succeeded to isolate page */ - list_move(&page->lru, dst); - nr_taken++; - break; - case -ENOENT: - /* Not possible to isolate */ - list_move(&page->lru, src); - break; - default: + if (!TestClearPageLRU(page)) BUG(); + list_del(&page->lru); + if (get_page_testone(page)) { + /* + * It is being freed elsewhere + */ + __put_page(page); + SetPageLRU(page); + list_add(&page->lru, src); + continue; + } else { + list_add(&page->lru, dst); + nr_taken++; } } @@ -1177,6 +1171,7 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) sc.gfp_mask = gfp_mask; sc.may_writepage = 0; + sc.may_swap = 1; inc_page_state(allocstall); @@ -1279,6 +1274,7 @@ loop_again: total_reclaimed = 0; sc.gfp_mask = GFP_KERNEL; sc.may_writepage = 0; + sc.may_swap = 1; sc.nr_mapped = read_page_state(nr_mapped); inc_page_state(pageoutrun); @@ -1576,3 +1572,71 @@ static int __init kswapd_init(void) } module_init(kswapd_init) + +#ifdef CONFIG_NUMA +/* + * Zone reclaim mode + * + * If non-zero call zone_reclaim when the number of free pages falls below + * the watermarks. + * + * In the future we may add flags to the mode. However, the page allocator + * should only have to check that zone_reclaim_mode != 0 before calling + * zone_reclaim(). + */ +int zone_reclaim_mode __read_mostly; + +/* + * Mininum time between zone reclaim scans + */ +#define ZONE_RECLAIM_INTERVAL HZ/2 +/* + * Try to free up some pages from this zone through reclaim. + */ +int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +{ + int nr_pages = 1 << order; + struct task_struct *p = current; + struct reclaim_state reclaim_state; + struct scan_control sc = { + .gfp_mask = gfp_mask, + .may_writepage = 0, + .may_swap = 0, + .nr_mapped = read_page_state(nr_mapped), + .nr_scanned = 0, + .nr_reclaimed = 0, + .priority = 0 + }; + + if (!(gfp_mask & __GFP_WAIT) || + zone->zone_pgdat->node_id != numa_node_id() || + zone->all_unreclaimable || + atomic_read(&zone->reclaim_in_progress) > 0) + return 0; + + if (time_before(jiffies, + zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) + return 0; + + disable_swap_token(); + + if (nr_pages > SWAP_CLUSTER_MAX) + sc.swap_cluster_max = nr_pages; + else + sc.swap_cluster_max = SWAP_CLUSTER_MAX; + + cond_resched(); + p->flags |= PF_MEMALLOC; + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + shrink_zone(zone, &sc); + p->reclaim_state = NULL; + current->flags &= ~PF_MEMALLOC; + + if (sc.nr_reclaimed == 0) + zone->last_unsuccessful_zone_reclaim = jiffies; + + return sc.nr_reclaimed > nr_pages; +} +#endif + |