From c129ef6d0f76a809b0a96fd52b9509856722e74b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 19 Jul 2012 18:51:22 -0400 Subject: mm: frontswap: remove unneeded headers Signed-off-by: Sasha Levin [v1: Rebased with tracing removed] Signed-off-by: Konrad Rzeszutek Wilk --- mm/frontswap.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'mm') diff --git a/mm/frontswap.c b/mm/frontswap.c index 7fb9538bec2..5318b3a5708 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -11,15 +11,11 @@ * This work is licensed under the terms of the GNU GPL, version 2. */ -#include #include #include #include -#include #include -#include #include -#include #include #include #include -- cgit v1.2.3 From 88610238d9a0a5998c4deba201332dd1e35b4199 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sat, 16 Jun 2012 20:37:48 +0800 Subject: mm/frontswap: cleanup doc and comment error Signed-off-by: Wanpeng Li Signed-off-by: Konrad Rzeszutek Wilk --- mm/frontswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/frontswap.c b/mm/frontswap.c index 5318b3a5708..6b3e71a2cd4 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -120,7 +120,7 @@ static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offse * "Store" data from a page to frontswap and associate it with the page's * swaptype and offset. Page must be locked and in the swap cache. * If frontswap already contains a page with matching swaptype and - * offset, the frontswap implmentation may either overwrite the data and + * offset, the frontswap implementation may either overwrite the data and * return success or invalidate the page from frontswap and return failure. */ int __frontswap_store(struct page *page) -- cgit v1.2.3 From 6b982fcf0260b8371604fcf6a12c6beecb731c57 Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Mon, 30 Jul 2012 14:47:44 -0500 Subject: mm/frontswap: fix uninit'ed variable warning Fixes uninitialized variable warning on 'type' in frontswap_shrink(). type is set before use by __frontswap_unuse_pages() called by __frontswap_shrink() called by frontswap_shrink() before use by try_to_unuse(). Signed-off-by: Seth Jennings Signed-off-by: Konrad Rzeszutek Wilk --- mm/frontswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/frontswap.c b/mm/frontswap.c index 6b3e71a2cd4..89dc399d332 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -292,7 +292,7 @@ static int __frontswap_shrink(unsigned long target_pages, void frontswap_shrink(unsigned long target_pages) { unsigned long pages_to_unuse = 0; - int type, ret; + int uninitialized_var(type), ret; /* * we don't want to hold swap_lock while doing a very -- cgit v1.2.3 From d9b7f22623b5fa9cc189581dcdfb2ac605933bf4 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 3 Aug 2012 22:51:37 +0400 Subject: slub: use free_page instead of put_page for freeing kmalloc allocation When freeing objects, the slub allocator will most of the time free empty pages by calling __free_pages(). But high-order kmalloc will be diposed by means of put_page() instead. It makes no sense to call put_page() in kernel pages that are provided by the object allocators, so we shouldn't be doing this ourselves. Aside from the consistency change, we don't change the flow too much. put_page()'s would call its dtor function, which is __free_pages. We also already do all of the Compound page tests ourselves, and the Mlock test we lose don't really matter. Signed-off-by: Glauber Costa Acked-by: Christoph Lameter CC: David Rientjes CC: Pekka Enberg Signed-off-by: Pekka Enberg --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 8f78e257703..c83fe96f5e4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3477,7 +3477,7 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kmemleak_free(x); - put_page(page); + __free_pages(page, compound_order(page)); return; } slab_free(page->slab, page, object, _RET_IP_); -- cgit v1.2.3 From 48f2474144ac708f1faad97e82a863ca8214b602 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 14 Aug 2012 14:53:20 -0700 Subject: slab: do not call compound_head() in page_get_cache() page_get_cache() does not need to call compound_head(), as its unique caller virt_to_slab() already makes sure to return a head page. Additionally, removing the compound_head() call makes page_get_cache() consistent with page_get_slab(). Signed-off-by: Michel Lespinasse Cc: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Pekka Enberg --- mm/slab.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index f8b0d539b48..cb6ce2dfc7c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -517,7 +517,6 @@ static bool slab_max_order_set __initdata; static inline struct kmem_cache *page_get_cache(struct page *page) { - page = compound_head(page); BUG_ON(!PageSlab(page)); return page->slab_cache; } -- cgit v1.2.3 From 455ce9eb1cfa083da0def023094190aeb133855a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 14 Aug 2012 14:53:22 -0700 Subject: mm/slab_common.c: cleanup Eliminate an ifdef and a label by moving all the CONFIG_DEBUG_VM checking inside the locked region. Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Pekka Enberg --- mm/slab_common.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index aa3ca5bb01b..281600b3010 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -53,19 +53,17 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align { struct kmem_cache *s = NULL; + get_online_cpus(); + mutex_lock(&slab_mutex); + #ifdef CONFIG_DEBUG_VM if (!name || in_interrupt() || size < sizeof(void *) || size > KMALLOC_MAX_SIZE) { printk(KERN_ERR "kmem_cache_create(%s) integrity check" - " failed\n", name); - goto out; + " failed\n", name); + goto oops; } -#endif - - get_online_cpus(); - mutex_lock(&slab_mutex); -#ifdef CONFIG_DEBUG_VM list_for_each_entry(s, &slab_caches, list) { char tmp; int res; @@ -104,9 +102,6 @@ oops: mutex_unlock(&slab_mutex); put_online_cpus(); -#ifdef CONFIG_DEBUG_VM -out: -#endif if (!s && (flags & SLAB_PANIC)) panic("kmem_cache_create: Failed to create slab '%s'\n", name); -- cgit v1.2.3 From 19c7ff9ecd89441096dab6a56f926f7df8ba850a Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 30 May 2012 12:54:46 -0500 Subject: slub: Take node lock during object free checks Only applies to scenarios where debugging is on: Validation of slabs can currently occur while debugging information is updated from the fast paths of the allocator. This results in various races where we get false reports about slab metadata not being in order. This patch makes the fast paths take the node lock so that serialization with slab validation will occur. Causes additional slowdown in debug scenarios. Reported-by: Waiman Long Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index c83fe96f5e4..e131084e87a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1069,13 +1069,13 @@ bad: return 0; } -static noinline int free_debug_processing(struct kmem_cache *s, - struct page *page, void *object, unsigned long addr) +static noinline struct kmem_cache_node *free_debug_processing( + struct kmem_cache *s, struct page *page, void *object, + unsigned long addr, unsigned long *flags) { - unsigned long flags; - int rc = 0; + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - local_irq_save(flags); + spin_lock_irqsave(&n->list_lock, *flags); slab_lock(page); if (!check_slab(s, page)) @@ -1113,15 +1113,19 @@ static noinline int free_debug_processing(struct kmem_cache *s, set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); init_object(s, object, SLUB_RED_INACTIVE); - rc = 1; out: slab_unlock(page); - local_irq_restore(flags); - return rc; + /* + * Keep node_lock to preserve integrity + * until the object is actually freed + */ + return n; fail: + slab_unlock(page); + spin_unlock_irqrestore(&n->list_lock, *flags); slab_fix(s, "Object at 0x%p not freed", object); - goto out; + return NULL; } static int __init setup_slub_debug(char *str) @@ -1214,8 +1218,9 @@ static inline void setup_object_debug(struct kmem_cache *s, static inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } -static inline int free_debug_processing(struct kmem_cache *s, - struct page *page, void *object, unsigned long addr) { return 0; } +static inline struct kmem_cache_node *free_debug_processing( + struct kmem_cache *s, struct page *page, void *object, + unsigned long addr, unsigned long *flags) { return NULL; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } @@ -2452,7 +2457,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_SLOWPATH); - if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) + if (kmem_cache_debug(s) && + !(n = free_debug_processing(s, page, x, addr, &flags))) return; do { -- cgit v1.2.3 From e24fc410f58cc7851188a6e996dc6ce5c4259eb4 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Sat, 23 Jun 2012 03:22:38 +0900 Subject: slub: reduce failure of this_cpu_cmpxchg in put_cpu_partial() after unfreezing In current implementation, after unfreezing, we doesn't touch oldpage, so it remain 'NOT NULL'. When we call this_cpu_cmpxchg() with this old oldpage, this_cpu_cmpxchg() is mostly be failed. We can change value of oldpage to NULL after unfreezing, because unfreeze_partial() ensure that all the cpu partial slabs is removed from cpu partial list. In this time, we could expect that this_cpu_cmpxchg is mostly succeed. Acked-by: Christoph Lameter Signed-off-by: Joonsoo Kim Signed-off-by: Pekka Enberg --- mm/slub.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index e131084e87a..c67bd0a4a95 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1962,6 +1962,7 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) local_irq_save(flags); unfreeze_partials(s); local_irq_restore(flags); + oldpage = NULL; pobjects = 0; pages = 0; stat(s, CPU_PARTIAL_DRAIN); -- cgit v1.2.3 From b920536aa0a4fd178b2957774adfe409027fe55b Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 16 Aug 2012 10:12:18 +0300 Subject: Revert "mm/slab_common.c: cleanup" This reverts commit 455ce9eb1cfa083da0def023094190aeb133855a. Andrew sent a better version. Signed-off-by: Pekka Enberg --- mm/slab_common.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 281600b3010..aa3ca5bb01b 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -53,17 +53,19 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align { struct kmem_cache *s = NULL; - get_online_cpus(); - mutex_lock(&slab_mutex); - #ifdef CONFIG_DEBUG_VM if (!name || in_interrupt() || size < sizeof(void *) || size > KMALLOC_MAX_SIZE) { printk(KERN_ERR "kmem_cache_create(%s) integrity check" - " failed\n", name); - goto oops; + " failed\n", name); + goto out; } +#endif + + get_online_cpus(); + mutex_lock(&slab_mutex); +#ifdef CONFIG_DEBUG_VM list_for_each_entry(s, &slab_caches, list) { char tmp; int res; @@ -102,6 +104,9 @@ oops: mutex_unlock(&slab_mutex); put_online_cpus(); +#ifdef CONFIG_DEBUG_VM +out: +#endif if (!s && (flags & SLAB_PANIC)) panic("kmem_cache_create: Failed to create slab '%s'\n", name); -- cgit v1.2.3 From 77be4b1366c97b95bb197e18187ff45b0e4f6bd3 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 16 Aug 2012 00:09:46 -0700 Subject: mm/slab: restructure kmem_cache_create() debug checks kmem_cache_create() does cache integrity checks when CONFIG_DEBUG_VM is defined. These checks interspersed with the regular code path has lead to compile time warnings when compiled without CONFIG_DEBUG_VM defined. Restructuring the code to move the integrity checks in to a new function would eliminate the current compile warning problem and also will allow for future changes to the debug only code to evolve without introducing new warnings in the regular path. This restructuring work is based on the discussion in the following thread: https://lkml.org/lkml/2012/7/13/424 [akpm@linux-foundation.org: fix build, cleanup] Signed-off-by: Shuah Khan Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Pekka Enberg --- mm/slab_common.c | 97 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 48 insertions(+), 49 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index aa3ca5bb01b..8cf8b4962d6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -23,49 +23,17 @@ enum slab_state slab_state; LIST_HEAD(slab_caches); DEFINE_MUTEX(slab_mutex); -/* - * kmem_cache_create - Create a cache. - * @name: A string which is used in /proc/slabinfo to identify this cache. - * @size: The size of objects to be created in this cache. - * @align: The required alignment for the objects. - * @flags: SLAB flags - * @ctor: A constructor for the objects. - * - * Returns a ptr to the cache on success, NULL on failure. - * Cannot be called within a interrupt, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache. - * - * The flags are - * - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) - * to catch references to uninitialised memory. - * - * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check - * for buffer overruns. - * - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware - * cacheline. This can be beneficial if you're counting cycles as closely - * as davem. - */ - -struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) +#ifdef CONFIG_DEBUG_VM +static int kmem_cache_sanity_check(const char *name, size_t size) { struct kmem_cache *s = NULL; -#ifdef CONFIG_DEBUG_VM if (!name || in_interrupt() || size < sizeof(void *) || size > KMALLOC_MAX_SIZE) { - printk(KERN_ERR "kmem_cache_create(%s) integrity check" - " failed\n", name); - goto out; + pr_err("kmem_cache_create(%s) integrity check failed\n", name); + return -EINVAL; } -#endif - get_online_cpus(); - mutex_lock(&slab_mutex); - -#ifdef CONFIG_DEBUG_VM list_for_each_entry(s, &slab_caches, list) { char tmp; int res; @@ -77,36 +45,67 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align */ res = probe_kernel_address(s->name, tmp); if (res) { - printk(KERN_ERR - "Slab cache with size %d has lost its name\n", + pr_err("Slab cache with size %d has lost its name\n", s->object_size); continue; } if (!strcmp(s->name, name)) { - printk(KERN_ERR "kmem_cache_create(%s): Cache name" - " already exists.\n", - name); + pr_err("%s (%s): Cache name already exists.\n", + __func__, name); dump_stack(); s = NULL; - goto oops; + return -EINVAL; } } WARN_ON(strchr(name, ' ')); /* It confuses parsers */ + return 0; +} +#else +static inline int kmem_cache_sanity_check(const char *name, size_t size) +{ + return 0; +} #endif - s = __kmem_cache_create(name, size, align, flags, ctor); +/* + * kmem_cache_create - Create a cache. + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @align: The required alignment for the objects. + * @flags: SLAB flags + * @ctor: A constructor for the objects. + * + * Returns a ptr to the cache on success, NULL on failure. + * Cannot be called within a interrupt, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache. + * + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check + * for buffer overruns. + * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. This can be beneficial if you're counting cycles as closely + * as davem. + */ + +struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) +{ + struct kmem_cache *s = NULL; -#ifdef CONFIG_DEBUG_VM -oops: -#endif + get_online_cpus(); + mutex_lock(&slab_mutex); + if (kmem_cache_sanity_check(name, size) == 0) + s = __kmem_cache_create(name, size, align, flags, ctor); mutex_unlock(&slab_mutex); put_online_cpus(); -#ifdef CONFIG_DEBUG_VM -out: -#endif if (!s && (flags & SLAB_PANIC)) panic("kmem_cache_create: Failed to create slab '%s'\n", name); -- cgit v1.2.3 From 5b74beb425e0eb009d16d4b9a0f55847a5f342fa Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 16 Aug 2012 12:25:31 -0700 Subject: mm, slab: remove page_get_cache page_get_cache() isn't called from anything, so remove it. Signed-off-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slab.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index cb6ce2dfc7c..3b4587bb7b1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -515,12 +515,6 @@ EXPORT_SYMBOL(slab_buffer_size); static int slab_max_order = SLAB_MAX_ORDER_LO; static bool slab_max_order_set __initdata; -static inline struct kmem_cache *page_get_cache(struct page *page) -{ - BUG_ON(!PageSlab(page)); - return page->slab_cache; -} - static inline struct kmem_cache *virt_to_cache(const void *obj) { struct page *page = virt_to_head_page(obj); -- cgit v1.2.3 From 203b42f7317494ae5e5efc7be6fb7f29c927f102 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 21 Aug 2012 13:18:23 -0700 Subject: workqueue: make deferrable delayed_work initializer names consistent Initalizers for deferrable delayed_work are confused. * __DEFERRED_WORK_INITIALIZER() * DECLARE_DEFERRED_WORK() * INIT_DELAYED_WORK_DEFERRABLE() Rename them to * __DEFERRABLE_WORK_INITIALIZER() * DECLARE_DEFERRABLE_WORK() * INIT_DEFERRABLE_WORK() This patch doesn't cause any functional changes. Signed-off-by: Tejun Heo --- mm/slab.c | 2 +- mm/vmstat.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index f8b0d539b48..35b5cb0da55 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -900,7 +900,7 @@ static void __cpuinit start_cpu_timer(int cpu) */ if (keventd_up() && reap_work->work.func == NULL) { init_reap_node(cpu); - INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap); + INIT_DEFERRABLE_WORK(reap_work, cache_reap); schedule_delayed_work_on(cpu, reap_work, __round_jiffies_relative(HZ, cpu)); } diff --git a/mm/vmstat.c b/mm/vmstat.c index df7a6748231..b3e3b9d525d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1157,7 +1157,7 @@ static void __cpuinit start_cpu_timer(int cpu) { struct delayed_work *work = &per_cpu(vmstat_work, cpu); - INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update); + INIT_DEFERRABLE_WORK(work, vmstat_update); schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); } -- cgit v1.2.3 From 38f38657444d15e1a8574eae80ed3de9f501737a Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Thu, 23 Aug 2012 16:53:28 -0400 Subject: xattr: extract simple_xattr code from tmpfs Extract in-memory xattr APIs from tmpfs. Will be used by cgroup. $ size vmlinux.o text data bss dec hex filename 4658782 880729 5195032 10734543 a3cbcf vmlinux.o $ size vmlinux.o text data bss dec hex filename 4658957 880729 5195032 10734718 a3cc7e vmlinux.o v7: - checkpatch warnings fixed - Implement the changes requested by Hugh Dickins: - make simple_xattrs_init and simple_xattrs_free inline - get rid of locking and list reinitialization in simple_xattrs_free, they're not needed v6: - no changes v5: - no changes v4: - move simple_xattrs_free() to fs/xattr.c v3: - in kmem_xattrs_free(), reinitialize the list - use simple_xattr_* prefix - introduce simple_xattr_add() to prevent direct list usage Original-patch-by: Li Zefan Cc: Li Zefan Cc: Hillf Danton Cc: Lennart Poettering Acked-by: Hugh Dickins Signed-off-by: Li Zefan Signed-off-by: Aristeu Rozanski Signed-off-by: Tejun Heo --- mm/shmem.c | 171 +++++-------------------------------------------------------- 1 file changed, 13 insertions(+), 158 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index d4e184e2a38..d3752110c8c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -77,13 +77,6 @@ static struct vfsmount *shm_mnt; /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ #define SHORT_SYMLINK_LEN 128 -struct shmem_xattr { - struct list_head list; /* anchored by shmem_inode_info->xattr_list */ - char *name; /* xattr name */ - size_t size; - char value[0]; -}; - /* * shmem_fallocate and shmem_writepage communicate via inode->i_private * (with i_mutex making sure that it has only one user at a time): @@ -636,7 +629,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_xattr *xattr, *nxattr; if (inode->i_mapping->a_ops == &shmem_aops) { shmem_unacct_size(info->flags, inode->i_size); @@ -650,10 +642,7 @@ static void shmem_evict_inode(struct inode *inode) } else kfree(info->symlink); - list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { - kfree(xattr->name); - kfree(xattr); - } + simple_xattrs_free(&info->xattrs); BUG_ON(inode->i_blocks); shmem_free_inode(inode->i_sb); clear_inode(inode); @@ -1377,7 +1366,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode spin_lock_init(&info->lock); info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); - INIT_LIST_HEAD(&info->xattr_list); + simple_xattrs_init(&info->xattrs); cache_no_acl(inode); switch (mode & S_IFMT) { @@ -2059,28 +2048,6 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co * filesystem level, though. */ -/* - * Allocate new xattr and copy in the value; but leave the name to callers. - */ -static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size) -{ - struct shmem_xattr *new_xattr; - size_t len; - - /* wrap around? */ - len = sizeof(*new_xattr) + size; - if (len <= sizeof(*new_xattr)) - return NULL; - - new_xattr = kmalloc(len, GFP_KERNEL); - if (!new_xattr) - return NULL; - - new_xattr->size = size; - memcpy(new_xattr->value, value, size); - return new_xattr; -} - /* * Callback for security_inode_init_security() for acquiring xattrs. */ @@ -2090,11 +2057,11 @@ static int shmem_initxattrs(struct inode *inode, { struct shmem_inode_info *info = SHMEM_I(inode); const struct xattr *xattr; - struct shmem_xattr *new_xattr; + struct simple_xattr *new_xattr; size_t len; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len); + new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); if (!new_xattr) return -ENOMEM; @@ -2111,91 +2078,12 @@ static int shmem_initxattrs(struct inode *inode, memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, xattr->name, len); - spin_lock(&info->lock); - list_add(&new_xattr->list, &info->xattr_list); - spin_unlock(&info->lock); + simple_xattr_list_add(&info->xattrs, new_xattr); } return 0; } -static int shmem_xattr_get(struct dentry *dentry, const char *name, - void *buffer, size_t size) -{ - struct shmem_inode_info *info; - struct shmem_xattr *xattr; - int ret = -ENODATA; - - info = SHMEM_I(dentry->d_inode); - - spin_lock(&info->lock); - list_for_each_entry(xattr, &info->xattr_list, list) { - if (strcmp(name, xattr->name)) - continue; - - ret = xattr->size; - if (buffer) { - if (size < xattr->size) - ret = -ERANGE; - else - memcpy(buffer, xattr->value, xattr->size); - } - break; - } - spin_unlock(&info->lock); - return ret; -} - -static int shmem_xattr_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_xattr *xattr; - struct shmem_xattr *new_xattr = NULL; - int err = 0; - - /* value == NULL means remove */ - if (value) { - new_xattr = shmem_xattr_alloc(value, size); - if (!new_xattr) - return -ENOMEM; - - new_xattr->name = kstrdup(name, GFP_KERNEL); - if (!new_xattr->name) { - kfree(new_xattr); - return -ENOMEM; - } - } - - spin_lock(&info->lock); - list_for_each_entry(xattr, &info->xattr_list, list) { - if (!strcmp(name, xattr->name)) { - if (flags & XATTR_CREATE) { - xattr = new_xattr; - err = -EEXIST; - } else if (new_xattr) { - list_replace(&xattr->list, &new_xattr->list); - } else { - list_del(&xattr->list); - } - goto out; - } - } - if (flags & XATTR_REPLACE) { - xattr = new_xattr; - err = -ENODATA; - } else { - list_add(&new_xattr->list, &info->xattr_list); - xattr = NULL; - } -out: - spin_unlock(&info->lock); - if (xattr) - kfree(xattr->name); - kfree(xattr); - return err; -} - static const struct xattr_handler *shmem_xattr_handlers[] = { #ifdef CONFIG_TMPFS_POSIX_ACL &generic_acl_access_handler, @@ -2226,6 +2114,7 @@ static int shmem_xattr_validate(const char *name) static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) { + struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); int err; /* @@ -2240,12 +2129,13 @@ static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, if (err) return err; - return shmem_xattr_get(dentry, name, buffer, size); + return simple_xattr_get(&info->xattrs, name, buffer, size); } static int shmem_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { + struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); int err; /* @@ -2260,15 +2150,12 @@ static int shmem_setxattr(struct dentry *dentry, const char *name, if (err) return err; - if (size == 0) - value = ""; /* empty EA, do not remove */ - - return shmem_xattr_set(dentry->d_inode, name, value, size, flags); - + return simple_xattr_set(&info->xattrs, name, value, size, flags); } static int shmem_removexattr(struct dentry *dentry, const char *name) { + struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); int err; /* @@ -2283,45 +2170,13 @@ static int shmem_removexattr(struct dentry *dentry, const char *name) if (err) return err; - return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); -} - -static bool xattr_is_trusted(const char *name) -{ - return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); + return simple_xattr_remove(&info->xattrs, name); } static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { - bool trusted = capable(CAP_SYS_ADMIN); - struct shmem_xattr *xattr; - struct shmem_inode_info *info; - size_t used = 0; - - info = SHMEM_I(dentry->d_inode); - - spin_lock(&info->lock); - list_for_each_entry(xattr, &info->xattr_list, list) { - size_t len; - - /* skip "trusted." attributes for unprivileged callers */ - if (!trusted && xattr_is_trusted(xattr->name)) - continue; - - len = strlen(xattr->name) + 1; - used += len; - if (buffer) { - if (size < used) { - used = -ERANGE; - break; - } - memcpy(buffer, xattr->name, len); - buffer += len; - } - } - spin_unlock(&info->lock); - - return used; + struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); + return simple_xattr_list(&info->xattrs, buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ -- cgit v1.2.3 From 7034ed132f0af8526f4a8eb7229dd25366a20bfa Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 25 Aug 2012 16:57:27 +0800 Subject: backing-dev: use kstrto* in preference to simple_strtoul Fix checkpatch warnings: WARNING: consider using kstrto* in preference to simple_strtoul for the below sys entry parsers: /sys/block//bdi/read_ahead_kb /sys/block//bdi/max_ratio /sys/block//bdi/min_ratio Signed-off-by: Namjae Jeon Signed-off-by: Vivek Trivedi --- mm/backing-dev.c | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index b41823cc05e..d3ca2b3ee17 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -158,16 +158,16 @@ static ssize_t read_ahead_kb_store(struct device *dev, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); - char *end; unsigned long read_ahead_kb; - ssize_t ret = -EINVAL; + ssize_t ret; - read_ahead_kb = simple_strtoul(buf, &end, 10); - if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { - bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); - ret = count; - } - return ret; + ret = kstrtoul(buf, 10, &read_ahead_kb); + if (ret < 0) + return ret; + + bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); + + return count; } #define K(pages) ((pages) << (PAGE_SHIFT - 10)) @@ -187,16 +187,17 @@ static ssize_t min_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); - char *end; unsigned int ratio; - ssize_t ret = -EINVAL; + ssize_t ret; + + ret = kstrtouint(buf, 10, &ratio); + if (ret < 0) + return ret; + + ret = bdi_set_min_ratio(bdi, ratio); + if (!ret) + ret = count; - ratio = simple_strtoul(buf, &end, 10); - if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { - ret = bdi_set_min_ratio(bdi, ratio); - if (!ret) - ret = count; - } return ret; } BDI_SHOW(min_ratio, bdi->min_ratio) @@ -205,16 +206,17 @@ static ssize_t max_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); - char *end; unsigned int ratio; - ssize_t ret = -EINVAL; + ssize_t ret; + + ret = kstrtouint(buf, 10, &ratio); + if (ret < 0) + return ret; + + ret = bdi_set_max_ratio(bdi, ratio); + if (!ret) + ret = count; - ratio = simple_strtoul(buf, &end, 10); - if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { - ret = bdi_set_max_ratio(bdi, ratio); - if (!ret) - ret = count; - } return ret; } BDI_SHOW(max_ratio, bdi->max_ratio) -- cgit v1.2.3 From e21827aadd77e33833eeb393de2e8675e9b399e2 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Tue, 14 Aug 2012 09:55:21 -0300 Subject: mm: Use __do_krealloc to do the krealloc job Without this patch we can get (many) kmem trace events with call site at krealloc(). This happens because krealloc is calling __krealloc, which performs the allocation through kmalloc_track_caller. Since neither krealloc nor __krealloc are marked inline explicitly, the caller can be traced as being krealloc, which clearly is not the intended behavior. This patch allows to get the real caller of krealloc, by creating an always inlined function __do_krealloc, thus tracing the call site accurately. Acked-by: Christoph Lameter Cc: Glauber Costa Signed-off-by: Ezequiel Garcia Signed-off-by: Pekka Enberg --- mm/util.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index 8c7265afa29..dc3036cdcc6 100644 --- a/mm/util.c +++ b/mm/util.c @@ -105,6 +105,25 @@ void *memdup_user(const void __user *src, size_t len) } EXPORT_SYMBOL(memdup_user); +static __always_inline void *__do_krealloc(const void *p, size_t new_size, + gfp_t flags) +{ + void *ret; + size_t ks = 0; + + if (p) + ks = ksize(p); + + if (ks >= new_size) + return (void *)p; + + ret = kmalloc_track_caller(new_size, flags); + if (ret && p) + memcpy(ret, p, ks); + + return ret; +} + /** * __krealloc - like krealloc() but don't free @p. * @p: object to reallocate memory for. @@ -117,23 +136,11 @@ EXPORT_SYMBOL(memdup_user); */ void *__krealloc(const void *p, size_t new_size, gfp_t flags) { - void *ret; - size_t ks = 0; - if (unlikely(!new_size)) return ZERO_SIZE_PTR; - if (p) - ks = ksize(p); + return __do_krealloc(p, new_size, flags); - if (ks >= new_size) - return (void *)p; - - ret = kmalloc_track_caller(new_size, flags); - if (ret && p) - memcpy(ret, p, ks); - - return ret; } EXPORT_SYMBOL(__krealloc); @@ -157,7 +164,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) return ZERO_SIZE_PTR; } - ret = __krealloc(p, new_size, flags); + ret = __do_krealloc(p, new_size, flags); if (ret && p != ret) kfree(p); -- cgit v1.2.3 From 79576102afc24fcc6627d7a15691e432d9a2eacb Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 4 Sep 2012 23:06:14 +0000 Subject: mm/slub: Add debugging to verify correct cache use on kmem_cache_free() Add additional debugging to check that the objects is actually from the cache the caller claims. Doing so currently trips up some other debugging code. It takes a lot to infer from that what was happening. Reviewed-by: Glauber Costa Signed-off-by: Christoph Lameter [ penberg@kernel.org: Use pr_err() ] Signed-off-by: Pekka Enberg --- mm/slub.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index c67bd0a4a95..99059217434 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2614,6 +2614,13 @@ void kmem_cache_free(struct kmem_cache *s, void *x) page = virt_to_head_page(x); + if (kmem_cache_debug(s) && page->slab != s) { + pr_err("kmem_cache_free: Wrong slab cache. %s but object" + " is from %s\n", page->slab->name, s->name); + WARN_ON_ONCE(1); + return; + } + slab_free(s, page, x, _RET_IP_); trace_kmem_cache_free(_RET_IP_, x); -- cgit v1.2.3 From 208c4358dc4a8f0fe99e49eb8d21a869b01e7d34 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 4 Sep 2012 23:06:14 +0000 Subject: mm/slub: Use kmem_cache for the kmem_cache structure Do not use kmalloc() but kmem_cache_alloc() for the allocation of the kmem_cache structures in slub. Reviewed-by: Glauber Costa Acked-by: David Rientjes Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 99059217434..c6690898321 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -213,7 +213,7 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) static inline void sysfs_slab_remove(struct kmem_cache *s) { kfree(s->name); - kfree(s); + kmem_cache_free(kmem_cache, s); } #endif @@ -3969,7 +3969,7 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, if (!n) return NULL; - s = kmalloc(kmem_size, GFP_KERNEL); + s = kmem_cache_alloc(kmem_cache, GFP_KERNEL); if (s) { if (kmem_cache_open(s, n, size, align, flags, ctor)) { @@ -3986,7 +3986,7 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, list_del(&s->list); kmem_cache_close(s); } - kfree(s); + kmem_cache_free(kmem_cache, s); } kfree(n); return NULL; @@ -5224,7 +5224,7 @@ static void kmem_cache_release(struct kobject *kobj) struct kmem_cache *s = to_slab(kobj); kfree(s->name); - kfree(s); + kmem_cache_free(kmem_cache, s); } static const struct sysfs_ops slab_sysfs_ops = { -- cgit v1.2.3 From 686d550d222e8f83f6e709debbedf9d8ef77aec7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 5 Sep 2012 00:20:33 +0000 Subject: mm/slab_common: Improve error handling in kmem_cache_create Instead of using s == NULL use an errorcode. This allows much more detailed diagnostics as to what went wrong. As we add more functionality from the slab allocators to the common kmem_cache_create() function we will also add more error conditions. Print the error code during the panic as well as in a warning if the module can handle failure. The API for kmem_cache_create() currently does not allow the returning of an error code. Return NULL but log the cause of the problem in the syslog. Reviewed-by: Glauber Costa Acked-by: David Rientjes Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slab_common.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 8cf8b4962d6..fe8dc943c28 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -98,16 +98,36 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s = NULL; + int err = 0; get_online_cpus(); mutex_lock(&slab_mutex); - if (kmem_cache_sanity_check(name, size) == 0) - s = __kmem_cache_create(name, size, align, flags, ctor); + + if (!kmem_cache_sanity_check(name, size) == 0) + goto out_locked; + + + s = __kmem_cache_create(name, size, align, flags, ctor); + if (!s) + err = -ENOSYS; /* Until __kmem_cache_create returns code */ + +out_locked: mutex_unlock(&slab_mutex); put_online_cpus(); - if (!s && (flags & SLAB_PANIC)) - panic("kmem_cache_create: Failed to create slab '%s'\n", name); + if (err) { + + if (flags & SLAB_PANIC) + panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", + name, err); + else { + printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d", + name, err); + dump_stack(); + } + + return NULL; + } return s; } -- cgit v1.2.3 From 7c9adf5a5471647f392169ef19d3e81dcfa76045 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 4 Sep 2012 23:38:33 +0000 Subject: mm/sl[aou]b: Move list_add() to slab_common.c Move the code to append the new kmem_cache to the list of slab caches to the kmem_cache_create code in the shared code. This is possible now since the acquisition of the mutex was moved into kmem_cache_create(). Acked-by: David Rientjes Reviewed-by: Glauber Costa Reviewed-by: Joonsoo Kim Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slab.c | 7 +++++-- mm/slab_common.c | 7 +++++++ mm/slob.c | 4 ++++ mm/slub.c | 2 -- 4 files changed, 16 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 3b4587bb7b1..a6990316849 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1680,6 +1680,7 @@ void __init kmem_cache_init(void) ARCH_KMALLOC_FLAGS|SLAB_PANIC, NULL); + list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches); if (INDEX_AC != INDEX_L3) { sizes[INDEX_L3].cs_cachep = __kmem_cache_create(names[INDEX_L3].name, @@ -1687,6 +1688,7 @@ void __init kmem_cache_init(void) ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, NULL); + list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches); } slab_early_init = 0; @@ -1705,6 +1707,7 @@ void __init kmem_cache_init(void) ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, NULL); + list_add(&sizes->cs_cachep->list, &slab_caches); } #ifdef CONFIG_ZONE_DMA sizes->cs_dmacachep = __kmem_cache_create( @@ -1714,6 +1717,7 @@ void __init kmem_cache_init(void) ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC, NULL); + list_add(&sizes->cs_dmacachep->list, &slab_caches); #endif sizes++; names++; @@ -2583,6 +2587,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align, } cachep->ctor = ctor; cachep->name = name; + cachep->refcount = 1; if (setup_cpu_cache(cachep, gfp)) { __kmem_cache_destroy(cachep); @@ -2599,8 +2604,6 @@ __kmem_cache_create (const char *name, size_t size, size_t align, slab_set_debugobj_lock_classes(cachep); } - /* cache setup completed, link it into the list */ - list_add(&cachep->list, &slab_caches); return cachep; } diff --git a/mm/slab_common.c b/mm/slab_common.c index fe8dc943c28..5190a7cd02b 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -111,6 +111,13 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align if (!s) err = -ENOSYS; /* Until __kmem_cache_create returns code */ + /* + * Check if the slab has actually been created and if it was a + * real instatiation. Aliases do not belong on the list + */ + if (s && s->refcount == 1) + list_add(&s->list, &slab_caches); + out_locked: mutex_unlock(&slab_mutex); put_online_cpus(); diff --git a/mm/slob.c b/mm/slob.c index 45d4ca79933..5225d28f269 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -540,6 +540,10 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, void kmem_cache_destroy(struct kmem_cache *c) { + mutex_lock(&slab_mutex); + list_del(&c->list); + mutex_unlock(&slab_mutex); + kmemleak_free(c); if (c->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); diff --git a/mm/slub.c b/mm/slub.c index c6690898321..24aa362edef 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3975,7 +3975,6 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, size, align, flags, ctor)) { int r; - list_add(&s->list, &slab_caches); mutex_unlock(&slab_mutex); r = sysfs_slab_add(s); mutex_lock(&slab_mutex); @@ -3983,7 +3982,6 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, if (!r) return s; - list_del(&s->list); kmem_cache_close(s); } kmem_cache_free(kmem_cache, s); -- cgit v1.2.3 From 945cf2b6199be70ff03102b9e642c3bb05d01de9 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 4 Sep 2012 23:18:33 +0000 Subject: mm/sl[aou]b: Extract a common function for kmem_cache_destroy kmem_cache_destroy does basically the same in all allocators. Extract common code which is easy since we already have common mutex handling. Reviewed-by: Glauber Costa Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slab.c | 45 +++------------------------------------------ mm/slab.h | 3 +++ mm/slab_common.c | 25 +++++++++++++++++++++++++ mm/slob.c | 15 +++++++-------- mm/slub.c | 36 +++++++++++------------------------- 5 files changed, 49 insertions(+), 75 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index a6990316849..49a74b349e3 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2206,7 +2206,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) } } -static void __kmem_cache_destroy(struct kmem_cache *cachep) +void __kmem_cache_destroy(struct kmem_cache *cachep) { int i; struct kmem_list3 *l3; @@ -2763,49 +2763,10 @@ int kmem_cache_shrink(struct kmem_cache *cachep) } EXPORT_SYMBOL(kmem_cache_shrink); -/** - * kmem_cache_destroy - delete a cache - * @cachep: the cache to destroy - * - * Remove a &struct kmem_cache object from the slab cache. - * - * It is expected this function will be called by a module when it is - * unloaded. This will remove the cache completely, and avoid a duplicate - * cache being allocated each time a module is loaded and unloaded, if the - * module doesn't have persistent in-kernel storage across loads and unloads. - * - * The cache must be empty before calling this function. - * - * The caller must guarantee that no one will allocate memory from the cache - * during the kmem_cache_destroy(). - */ -void kmem_cache_destroy(struct kmem_cache *cachep) +int __kmem_cache_shutdown(struct kmem_cache *cachep) { - BUG_ON(!cachep || in_interrupt()); - - /* Find the cache in the chain of caches. */ - get_online_cpus(); - mutex_lock(&slab_mutex); - /* - * the chain is never empty, cache_cache is never destroyed - */ - list_del(&cachep->list); - if (__cache_shrink(cachep)) { - slab_error(cachep, "Can't free all objects"); - list_add(&cachep->list, &slab_caches); - mutex_unlock(&slab_mutex); - put_online_cpus(); - return; - } - - if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) - rcu_barrier(); - - __kmem_cache_destroy(cachep); - mutex_unlock(&slab_mutex); - put_online_cpus(); + return __cache_shrink(cachep); } -EXPORT_SYMBOL(kmem_cache_destroy); /* * Get the memory for a slab management obj. diff --git a/mm/slab.h b/mm/slab.h index db7848caaa2..07a537ed5da 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -30,4 +30,7 @@ extern struct list_head slab_caches; struct kmem_cache *__kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)); +int __kmem_cache_shutdown(struct kmem_cache *); +void __kmem_cache_destroy(struct kmem_cache *); + #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index 5190a7cd02b..a1c4f0b5aae 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -140,6 +140,31 @@ out_locked: } EXPORT_SYMBOL(kmem_cache_create); +void kmem_cache_destroy(struct kmem_cache *s) +{ + get_online_cpus(); + mutex_lock(&slab_mutex); + s->refcount--; + if (!s->refcount) { + list_del(&s->list); + + if (!__kmem_cache_shutdown(s)) { + if (s->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); + + __kmem_cache_destroy(s); + } else { + list_add(&s->list, &slab_caches); + printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n", + s->name); + dump_stack(); + } + } + mutex_unlock(&slab_mutex); + put_online_cpus(); +} +EXPORT_SYMBOL(kmem_cache_destroy); + int slab_is_available(void) { return slab_state >= UP; diff --git a/mm/slob.c b/mm/slob.c index 5225d28f269..289be4f4681 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -538,18 +538,11 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, return c; } -void kmem_cache_destroy(struct kmem_cache *c) +void __kmem_cache_destroy(struct kmem_cache *c) { - mutex_lock(&slab_mutex); - list_del(&c->list); - mutex_unlock(&slab_mutex); - kmemleak_free(c); - if (c->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); slob_free(c, sizeof(struct kmem_cache)); } -EXPORT_SYMBOL(kmem_cache_destroy); void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { @@ -617,6 +610,12 @@ unsigned int kmem_cache_size(struct kmem_cache *c) } EXPORT_SYMBOL(kmem_cache_size); +int __kmem_cache_shutdown(struct kmem_cache *c) +{ + /* No way to check for remaining objects */ + return 0; +} + int kmem_cache_shrink(struct kmem_cache *d) { return 0; diff --git a/mm/slub.c b/mm/slub.c index 24aa362edef..724adea3438 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -624,7 +624,7 @@ static void object_err(struct kmem_cache *s, struct page *page, print_trailer(s, page, object); } -static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) +static void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...) { va_list args; char buf[100]; @@ -3146,7 +3146,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, sizeof(long), GFP_ATOMIC); if (!map) return; - slab_err(s, page, "%s", text); + slab_err(s, page, text, s->name); slab_lock(page); get_map(s, page, map); @@ -3178,7 +3178,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) discard_slab(s, page); } else { list_slab_objects(s, page, - "Objects remaining on kmem_cache_close()"); + "Objects remaining in %s on kmem_cache_close()"); } } } @@ -3191,7 +3191,6 @@ static inline int kmem_cache_close(struct kmem_cache *s) int node; flush_all(s); - free_percpu(s->cpu_slab); /* Attempt to free all objects */ for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -3200,33 +3199,20 @@ static inline int kmem_cache_close(struct kmem_cache *s) if (n->nr_partial || slabs_node(s, node)) return 1; } + free_percpu(s->cpu_slab); free_kmem_cache_nodes(s); return 0; } -/* - * Close a cache and release the kmem_cache structure - * (must be used for caches created using kmem_cache_create) - */ -void kmem_cache_destroy(struct kmem_cache *s) +int __kmem_cache_shutdown(struct kmem_cache *s) { - mutex_lock(&slab_mutex); - s->refcount--; - if (!s->refcount) { - list_del(&s->list); - mutex_unlock(&slab_mutex); - if (kmem_cache_close(s)) { - printk(KERN_ERR "SLUB %s: %s called for cache that " - "still has objects.\n", s->name, __func__); - dump_stack(); - } - if (s->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); - sysfs_slab_remove(s); - } else - mutex_unlock(&slab_mutex); + return kmem_cache_close(s); +} + +void __kmem_cache_destroy(struct kmem_cache *s) +{ + sysfs_slab_remove(s); } -EXPORT_SYMBOL(kmem_cache_destroy); /******************************************************************** * Kmalloc subsystem -- cgit v1.2.3 From 9b030cb865f137e1574596983face2a07e41e8b2 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 5 Sep 2012 00:20:33 +0000 Subject: mm/sl[aou]b: Use "kmem_cache" name for slab cache with kmem_cache struct Make all allocators use the "kmem_cache" slabname for the "kmem_cache" structure. Reviewed-by: Glauber Costa Reviewed-by: Joonsoo Kim Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slab.c | 72 +++++++++++++++++++++++++++++--------------------------- mm/slab.h | 6 +++++ mm/slab_common.c | 1 + mm/slob.c | 8 +++++++ mm/slub.c | 2 -- 5 files changed, 52 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 49a74b349e3..ef94799a1aa 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -578,9 +578,9 @@ static struct arraycache_init initarray_generic = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; /* internal cache of cache description objs */ -static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES]; -static struct kmem_cache cache_cache = { - .nodelists = cache_cache_nodelists, +static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES]; +static struct kmem_cache kmem_cache_boot = { + .nodelists = kmem_cache_nodelists, .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, @@ -1594,15 +1594,17 @@ void __init kmem_cache_init(void) int order; int node; + kmem_cache = &kmem_cache_boot; + if (num_possible_nodes() == 1) use_alien_caches = 0; for (i = 0; i < NUM_INIT_LISTS; i++) { kmem_list3_init(&initkmem_list3[i]); if (i < MAX_NUMNODES) - cache_cache.nodelists[i] = NULL; + kmem_cache->nodelists[i] = NULL; } - set_up_list3s(&cache_cache, CACHE_CACHE); + set_up_list3s(kmem_cache, CACHE_CACHE); /* * Fragmentation resistance on low memory - only use bigger @@ -1614,9 +1616,9 @@ void __init kmem_cache_init(void) /* Bootstrap is tricky, because several objects are allocated * from caches that do not exist yet: - * 1) initialize the cache_cache cache: it contains the struct - * kmem_cache structures of all caches, except cache_cache itself: - * cache_cache is statically allocated. + * 1) initialize the kmem_cache cache: it contains the struct + * kmem_cache structures of all caches, except kmem_cache itself: + * kmem_cache is statically allocated. * Initially an __init data area is used for the head array and the * kmem_list3 structures, it's replaced with a kmalloc allocated * array at the end of the bootstrap. @@ -1625,43 +1627,43 @@ void __init kmem_cache_init(void) * An __init data area is used for the head array. * 3) Create the remaining kmalloc caches, with minimally sized * head arrays. - * 4) Replace the __init data head arrays for cache_cache and the first + * 4) Replace the __init data head arrays for kmem_cache and the first * kmalloc cache with kmalloc allocated arrays. - * 5) Replace the __init data for kmem_list3 for cache_cache and + * 5) Replace the __init data for kmem_list3 for kmem_cache and * the other cache's with kmalloc allocated memory. * 6) Resize the head arrays of the kmalloc caches to their final sizes. */ node = numa_mem_id(); - /* 1) create the cache_cache */ + /* 1) create the kmem_cache */ INIT_LIST_HEAD(&slab_caches); - list_add(&cache_cache.list, &slab_caches); - cache_cache.colour_off = cache_line_size(); - cache_cache.array[smp_processor_id()] = &initarray_cache.cache; - cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; + list_add(&kmem_cache->list, &slab_caches); + kmem_cache->colour_off = cache_line_size(); + kmem_cache->array[smp_processor_id()] = &initarray_cache.cache; + kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; /* * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids */ - cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + + kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + nr_node_ids * sizeof(struct kmem_list3 *); - cache_cache.object_size = cache_cache.size; - cache_cache.size = ALIGN(cache_cache.size, + kmem_cache->object_size = kmem_cache->size; + kmem_cache->size = ALIGN(kmem_cache->object_size, cache_line_size()); - cache_cache.reciprocal_buffer_size = - reciprocal_value(cache_cache.size); + kmem_cache->reciprocal_buffer_size = + reciprocal_value(kmem_cache->size); for (order = 0; order < MAX_ORDER; order++) { - cache_estimate(order, cache_cache.size, - cache_line_size(), 0, &left_over, &cache_cache.num); - if (cache_cache.num) + cache_estimate(order, kmem_cache->size, + cache_line_size(), 0, &left_over, &kmem_cache->num); + if (kmem_cache->num) break; } - BUG_ON(!cache_cache.num); - cache_cache.gfporder = order; - cache_cache.colour = left_over / cache_cache.colour_off; - cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + + BUG_ON(!kmem_cache->num); + kmem_cache->gfporder = order; + kmem_cache->colour = left_over / kmem_cache->colour_off; + kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) + sizeof(struct slab), cache_line_size()); /* 2+3) create the kmalloc caches */ @@ -1728,15 +1730,15 @@ void __init kmem_cache_init(void) ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); - memcpy(ptr, cpu_cache_get(&cache_cache), + BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache); + memcpy(ptr, cpu_cache_get(kmem_cache), sizeof(struct arraycache_init)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->lock); - cache_cache.array[smp_processor_id()] = ptr; + kmem_cache->array[smp_processor_id()] = ptr; ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); @@ -1757,7 +1759,7 @@ void __init kmem_cache_init(void) int nid; for_each_online_node(nid) { - init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid); + init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid); init_list(malloc_sizes[INDEX_AC].cs_cachep, &initkmem_list3[SIZE_AC + nid], nid); @@ -2223,7 +2225,7 @@ void __kmem_cache_destroy(struct kmem_cache *cachep) kfree(l3); } } - kmem_cache_free(&cache_cache, cachep); + kmem_cache_free(kmem_cache, cachep); } @@ -2473,7 +2475,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align, gfp = GFP_NOWAIT; /* Get cache's description obj. */ - cachep = kmem_cache_zalloc(&cache_cache, gfp); + cachep = kmem_cache_zalloc(kmem_cache, gfp); if (!cachep) return NULL; @@ -2531,7 +2533,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align, if (!cachep->num) { printk(KERN_ERR "kmem_cache_create: couldn't create cache %s.\n", name); - kmem_cache_free(&cache_cache, cachep); + kmem_cache_free(kmem_cache, cachep); return NULL; } slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) @@ -3299,7 +3301,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) { - if (cachep == &cache_cache) + if (cachep == kmem_cache) return false; return should_failslab(cachep->object_size, flags, cachep->flags); diff --git a/mm/slab.h b/mm/slab.h index 07a537ed5da..6724aa6f662 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -25,8 +25,14 @@ extern enum slab_state slab_state; /* The slab cache mutex protects the management structures during changes */ extern struct mutex slab_mutex; + +/* The list of all slab caches on the system */ extern struct list_head slab_caches; +/* The slab cache that manages slab cache information */ +extern struct kmem_cache *kmem_cache; + +/* Functions provided by the slab allocators */ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)); diff --git a/mm/slab_common.c b/mm/slab_common.c index a1c4f0b5aae..5374150f548 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -22,6 +22,7 @@ enum slab_state slab_state; LIST_HEAD(slab_caches); DEFINE_MUTEX(slab_mutex); +struct kmem_cache *kmem_cache; #ifdef CONFIG_DEBUG_VM static int kmem_cache_sanity_check(const char *name, size_t size) diff --git a/mm/slob.c b/mm/slob.c index 289be4f4681..7d272c3dcc0 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -622,8 +622,16 @@ int kmem_cache_shrink(struct kmem_cache *d) } EXPORT_SYMBOL(kmem_cache_shrink); +struct kmem_cache kmem_cache_boot = { + .name = "kmem_cache", + .size = sizeof(struct kmem_cache), + .flags = SLAB_PANIC, + .align = ARCH_KMALLOC_MINALIGN, +}; + void __init kmem_cache_init(void) { + kmem_cache = &kmem_cache_boot; slab_state = UP; } diff --git a/mm/slub.c b/mm/slub.c index 724adea3438..e0d1e047030 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3221,8 +3221,6 @@ void __kmem_cache_destroy(struct kmem_cache *s) struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; EXPORT_SYMBOL(kmalloc_caches); -static struct kmem_cache *kmem_cache; - #ifdef CONFIG_ZONE_DMA static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; #endif -- cgit v1.2.3 From 8f4c765c22deee766319ae9a1db68325f14816e6 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 5 Sep 2012 00:18:32 +0000 Subject: mm/sl[aou]b: Move freeing of kmem_cache structure to common code The freeing action is basically the same in all slab allocators. Move to the common kmem_cache_destroy() function. Reviewed-by: Glauber Costa Reviewed-by: Joonsoo Kim Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slab.c | 1 - mm/slab_common.c | 1 + mm/slob.c | 2 -- mm/slub.c | 2 -- 4 files changed, 1 insertion(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index ef94799a1aa..8ca6ec6301f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2225,7 +2225,6 @@ void __kmem_cache_destroy(struct kmem_cache *cachep) kfree(l3); } } - kmem_cache_free(kmem_cache, cachep); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 5374150f548..d6deae9108c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -154,6 +154,7 @@ void kmem_cache_destroy(struct kmem_cache *s) rcu_barrier(); __kmem_cache_destroy(s); + kmem_cache_free(kmem_cache, s); } else { list_add(&s->list, &slab_caches); printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n", diff --git a/mm/slob.c b/mm/slob.c index 7d272c3dcc0..cb4ab967529 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -540,8 +540,6 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, void __kmem_cache_destroy(struct kmem_cache *c) { - kmemleak_free(c); - slob_free(c, sizeof(struct kmem_cache)); } void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) diff --git a/mm/slub.c b/mm/slub.c index e0d1e047030..6f932f7a821 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -213,7 +213,6 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) static inline void sysfs_slab_remove(struct kmem_cache *s) { kfree(s->name); - kmem_cache_free(kmem_cache, s); } #endif @@ -5206,7 +5205,6 @@ static void kmem_cache_release(struct kobject *kobj) struct kmem_cache *s = to_slab(kobj); kfree(s->name); - kmem_cache_free(kmem_cache, s); } static const struct sysfs_ops slab_sysfs_ops = { -- cgit v1.2.3 From 12c3667fb780e20360ad0bde32dfb3591ef609ad Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 4 Sep 2012 23:38:33 +0000 Subject: mm/sl[aou]b: Get rid of __kmem_cache_destroy What is done there can be done in __kmem_cache_shutdown. This affects RCU handling somewhat. On rcu free all slab allocators do not refer to other management structures than the kmem_cache structure. Therefore these other structures can be freed before the rcu deferred free to the page allocator occurs. Reviewed-by: Joonsoo Kim Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slab.c | 46 +++++++++++++++++++++------------------------- mm/slab.h | 1 - mm/slab_common.c | 1 - mm/slob.c | 4 ---- mm/slub.c | 10 +++++----- 5 files changed, 26 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 8ca6ec6301f..de961b48a6a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2208,26 +2208,6 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) } } -void __kmem_cache_destroy(struct kmem_cache *cachep) -{ - int i; - struct kmem_list3 *l3; - - for_each_online_cpu(i) - kfree(cachep->array[i]); - - /* NUMA: free the list3 structures */ - for_each_online_node(i) { - l3 = cachep->nodelists[i]; - if (l3) { - kfree(l3->shared); - free_alien_cache(l3->alien); - kfree(l3); - } - } -} - - /** * calculate_slab_order - calculate size (page order) of slabs * @cachep: pointer to the cache that is being created @@ -2364,9 +2344,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) * Cannot be called within a int, but can be interrupted. * The @ctor is run when new pages are allocated by the cache. * - * @name must be valid until the cache is destroyed. This implies that - * the module calling this has to destroy the cache before getting unloaded. - * * The flags are * * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) @@ -2591,7 +2568,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align, cachep->refcount = 1; if (setup_cpu_cache(cachep, gfp)) { - __kmem_cache_destroy(cachep); + __kmem_cache_shutdown(cachep); return NULL; } @@ -2766,7 +2743,26 @@ EXPORT_SYMBOL(kmem_cache_shrink); int __kmem_cache_shutdown(struct kmem_cache *cachep) { - return __cache_shrink(cachep); + int i; + struct kmem_list3 *l3; + int rc = __cache_shrink(cachep); + + if (rc) + return rc; + + for_each_online_cpu(i) + kfree(cachep->array[i]); + + /* NUMA: free the list3 structures */ + for_each_online_node(i) { + l3 = cachep->nodelists[i]; + if (l3) { + kfree(l3->shared); + free_alien_cache(l3->alien); + kfree(l3); + } + } + return 0; } /* diff --git a/mm/slab.h b/mm/slab.h index 6724aa6f662..c4f9a361bd1 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -37,6 +37,5 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)); int __kmem_cache_shutdown(struct kmem_cache *); -void __kmem_cache_destroy(struct kmem_cache *); #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index d6deae9108c..7df814e8fbe 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -153,7 +153,6 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); - __kmem_cache_destroy(s); kmem_cache_free(kmem_cache, s); } else { list_add(&s->list, &slab_caches); diff --git a/mm/slob.c b/mm/slob.c index cb4ab967529..50f60532270 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -538,10 +538,6 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, return c; } -void __kmem_cache_destroy(struct kmem_cache *c) -{ -} - void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { void *b; diff --git a/mm/slub.c b/mm/slub.c index 6f932f7a821..e5e09873f5e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3205,12 +3205,12 @@ static inline int kmem_cache_close(struct kmem_cache *s) int __kmem_cache_shutdown(struct kmem_cache *s) { - return kmem_cache_close(s); -} + int rc = kmem_cache_close(s); -void __kmem_cache_destroy(struct kmem_cache *s) -{ - sysfs_slab_remove(s); + if (!rc) + sysfs_slab_remove(s); + + return rc; } /******************************************************************** -- cgit v1.2.3 From db265eca77000c5dafc5608975afe8dafb2a02d5 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 4 Sep 2012 23:18:33 +0000 Subject: mm/sl[aou]b: Move duping of slab name to slab_common.c Duping of the slabname has to be done by each slab. Moving this code to slab_common avoids duplicate implementations. With this patch we have common string handling for all slab allocators. Strings passed to kmem_cache_create() are copied internally. Subsystems can create temporary strings to create slab caches. Slabs allocated in early states of bootstrap will never be freed (and those can never be freed since they are essential to slab allocator operations). During bootstrap we therefore do not have to worry about duping names. Reviewed-by: Glauber Costa Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slab_common.c | 30 +++++++++++++++++++++--------- mm/slub.c | 21 ++------------------- 2 files changed, 23 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 7df814e8fbe..f18c06fd97c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -100,6 +100,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align { struct kmem_cache *s = NULL; int err = 0; + char *n; get_online_cpus(); mutex_lock(&slab_mutex); @@ -108,16 +109,26 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align goto out_locked; - s = __kmem_cache_create(name, size, align, flags, ctor); - if (!s) - err = -ENOSYS; /* Until __kmem_cache_create returns code */ + n = kstrdup(name, GFP_KERNEL); + if (!n) { + err = -ENOMEM; + goto out_locked; + } + + s = __kmem_cache_create(n, size, align, flags, ctor); + + if (s) { + /* + * Check if the slab has actually been created and if it was a + * real instatiation. Aliases do not belong on the list + */ + if (s->refcount == 1) + list_add(&s->list, &slab_caches); - /* - * Check if the slab has actually been created and if it was a - * real instatiation. Aliases do not belong on the list - */ - if (s && s->refcount == 1) - list_add(&s->list, &slab_caches); + } else { + kfree(n); + err = -ENOSYS; /* Until __kmem_cache_create returns code */ + } out_locked: mutex_unlock(&slab_mutex); @@ -153,6 +164,7 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); + kfree(s->name); kmem_cache_free(kmem_cache, s); } else { list_add(&s->list, &slab_caches); diff --git a/mm/slub.c b/mm/slub.c index e5e09873f5e..91c9a2fe676 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -210,10 +210,7 @@ static void sysfs_slab_remove(struct kmem_cache *); static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } -static inline void sysfs_slab_remove(struct kmem_cache *s) -{ - kfree(s->name); -} +static inline void sysfs_slab_remove(struct kmem_cache *s) { } #endif @@ -3929,7 +3926,6 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; - char *n; s = find_mergeable(size, align, flags, name, ctor); if (s) { @@ -3948,13 +3944,9 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, return s; } - n = kstrdup(name, GFP_KERNEL); - if (!n) - return NULL; - s = kmem_cache_alloc(kmem_cache, GFP_KERNEL); if (s) { - if (kmem_cache_open(s, n, + if (kmem_cache_open(s, name, size, align, flags, ctor)) { int r; @@ -3969,7 +3961,6 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, } kmem_cache_free(kmem_cache, s); } - kfree(n); return NULL; } @@ -5200,13 +5191,6 @@ static ssize_t slab_attr_store(struct kobject *kobj, return err; } -static void kmem_cache_release(struct kobject *kobj) -{ - struct kmem_cache *s = to_slab(kobj); - - kfree(s->name); -} - static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, @@ -5214,7 +5198,6 @@ static const struct sysfs_ops slab_sysfs_ops = { static struct kobj_type slab_ktype = { .sysfs_ops = &slab_sysfs_ops, - .release = kmem_cache_release }; static int uevent_filter(struct kset *kset, struct kobject *kobj) -- cgit v1.2.3 From cbb79694d592e9a76880f6ef6db8feccaeee1c32 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 5 Sep 2012 00:18:32 +0000 Subject: mm/sl[aou]b: Do slab aliasing call from common code The slab aliasing logic causes some strange contortions in slub. So add a call to deal with aliases to slab_common.c but disable it for other slab allocators by providng stubs that fail to create aliases. Full general support for aliases will require additional cleanup passes and more standardization of fields in kmem_cache. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slab.h | 10 ++++++++++ mm/slab_common.c | 4 ++++ mm/slub.c | 15 +++++++++++---- 3 files changed, 25 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index c4f9a361bd1..84c28f451d2 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -36,6 +36,16 @@ extern struct kmem_cache *kmem_cache; struct kmem_cache *__kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)); +#ifdef CONFIG_SLUB +struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)); +#else +static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) +{ return NULL; } +#endif + + int __kmem_cache_shutdown(struct kmem_cache *); #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index f18c06fd97c..adc42b01b25 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -115,6 +115,10 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align goto out_locked; } + s = __kmem_cache_alias(name, size, align, flags, ctor); + if (s) + goto out_locked; + s = __kmem_cache_create(n, size, align, flags, ctor); if (s) { diff --git a/mm/slub.c b/mm/slub.c index 91c9a2fe676..64d445e7a27 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3708,7 +3708,7 @@ void __init kmem_cache_init(void) slub_max_order = 0; kmem_size = offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *); + nr_node_ids * sizeof(struct kmem_cache_node *); /* Allocate two kmem_caches from the page allocator */ kmalloc_size = ALIGN(kmem_size, cache_line_size()); @@ -3922,7 +3922,7 @@ static struct kmem_cache *find_mergeable(size_t size, return NULL; } -struct kmem_cache *__kmem_cache_create(const char *name, size_t size, +struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; @@ -3939,11 +3939,18 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, if (sysfs_slab_alias(s, name)) { s->refcount--; - return NULL; + s = NULL; } - return s; } + return s; +} + +struct kmem_cache *__kmem_cache_create(const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) +{ + struct kmem_cache *s; + s = kmem_cache_alloc(kmem_cache, GFP_KERNEL); if (s) { if (kmem_cache_open(s, name, -- cgit v1.2.3 From 96d17b7be0a9849d381442030886211dbb2a7061 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 5 Sep 2012 00:18:32 +0000 Subject: mm/sl[aou]b: Move sysfs_slab_add to common Simplify locking by moving the slab_add_sysfs after all locks have been dropped. Eases the upcoming move to provide sysfs support for all allocators. Reviewed-by: Glauber Costa Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slab.h | 3 +++ mm/slab_common.c | 8 ++++++++ mm/slub.c | 15 ++------------- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index 84c28f451d2..ec7b94429b9 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -39,10 +39,13 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, #ifdef CONFIG_SLUB struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)); +extern int sysfs_slab_add(struct kmem_cache *s); #else static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { return NULL; } +static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } + #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index adc42b01b25..4f722084bae 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -152,6 +152,14 @@ out_locked: return NULL; } + if (s->refcount == 1) { + err = sysfs_slab_add(s); + if (err) + printk(KERN_WARNING "kmem_cache_create(%s) failed to" + " create sysfs entry. Error %d\n", + name, err); + } + return s; } EXPORT_SYMBOL(kmem_cache_create); diff --git a/mm/slub.c b/mm/slub.c index 64d445e7a27..8d00fd78df2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -202,12 +202,10 @@ struct track { enum track_item { TRACK_ALLOC, TRACK_FREE }; #ifdef CONFIG_SYSFS -static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void sysfs_slab_remove(struct kmem_cache *); #else -static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } static inline void sysfs_slab_remove(struct kmem_cache *s) { } @@ -3955,16 +3953,7 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, if (s) { if (kmem_cache_open(s, name, size, align, flags, ctor)) { - int r; - - mutex_unlock(&slab_mutex); - r = sysfs_slab_add(s); - mutex_lock(&slab_mutex); - - if (!r) - return s; - - kmem_cache_close(s); + return s; } kmem_cache_free(kmem_cache, s); } @@ -5258,7 +5247,7 @@ static char *create_unique_id(struct kmem_cache *s) return name; } -static int sysfs_slab_add(struct kmem_cache *s) +int sysfs_slab_add(struct kmem_cache *s) { int err; const char *name; -- cgit v1.2.3 From 278b1bb1313664d4999a7f7d47a8a8d964862d02 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 5 Sep 2012 00:20:34 +0000 Subject: mm/sl[aou]b: Move kmem_cache allocations into common code Shift the allocations to common code. That way the allocation and freeing of the kmem_cache structures is handled by common code. Reviewed-by: Glauber Costa Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slab.c | 34 ++++++++++++++++------------------ mm/slab.h | 4 ++-- mm/slab_common.c | 18 ++++++++++-------- mm/slob.c | 42 +++++++++++++++++------------------------- mm/slub.c | 24 +++++++----------------- 5 files changed, 52 insertions(+), 70 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index de961b48a6a..abc83334e5f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1676,7 +1676,8 @@ void __init kmem_cache_init(void) * bug. */ - sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name, + sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + __kmem_cache_create(sizes[INDEX_AC].cs_cachep, names[INDEX_AC].name, sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, @@ -1684,8 +1685,8 @@ void __init kmem_cache_init(void) list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches); if (INDEX_AC != INDEX_L3) { - sizes[INDEX_L3].cs_cachep = - __kmem_cache_create(names[INDEX_L3].name, + sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + __kmem_cache_create(sizes[INDEX_L3].cs_cachep, names[INDEX_L3].name, sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, @@ -1704,7 +1705,8 @@ void __init kmem_cache_init(void) * allow tighter packing of the smaller caches. */ if (!sizes->cs_cachep) { - sizes->cs_cachep = __kmem_cache_create(names->name, + sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + __kmem_cache_create(sizes->cs_cachep, names->name, sizes->cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, @@ -1712,7 +1714,8 @@ void __init kmem_cache_init(void) list_add(&sizes->cs_cachep->list, &slab_caches); } #ifdef CONFIG_ZONE_DMA - sizes->cs_dmacachep = __kmem_cache_create( + sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + __kmem_cache_create(sizes->cs_dmacachep, names->name_dma, sizes->cs_size, ARCH_KMALLOC_MINALIGN, @@ -2356,13 +2359,13 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) * cacheline. This can be beneficial if you're counting cycles as closely * as davem. */ -struct kmem_cache * -__kmem_cache_create (const char *name, size_t size, size_t align, +int +__kmem_cache_create (struct kmem_cache *cachep, const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { size_t left_over, slab_size, ralign; - struct kmem_cache *cachep = NULL; gfp_t gfp; + int err; #if DEBUG #if FORCED_DEBUG @@ -2450,11 +2453,6 @@ __kmem_cache_create (const char *name, size_t size, size_t align, else gfp = GFP_NOWAIT; - /* Get cache's description obj. */ - cachep = kmem_cache_zalloc(kmem_cache, gfp); - if (!cachep) - return NULL; - cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; cachep->object_size = size; cachep->align = align; @@ -2509,8 +2507,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align, if (!cachep->num) { printk(KERN_ERR "kmem_cache_create: couldn't create cache %s.\n", name); - kmem_cache_free(kmem_cache, cachep); - return NULL; + return -E2BIG; } slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab), align); @@ -2567,9 +2564,10 @@ __kmem_cache_create (const char *name, size_t size, size_t align, cachep->name = name; cachep->refcount = 1; - if (setup_cpu_cache(cachep, gfp)) { + err = setup_cpu_cache(cachep, gfp); + if (err) { __kmem_cache_shutdown(cachep); - return NULL; + return err; } if (flags & SLAB_DEBUG_OBJECTS) { @@ -2582,7 +2580,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align, slab_set_debugobj_lock_classes(cachep); } - return cachep; + return 0; } #if DEBUG diff --git a/mm/slab.h b/mm/slab.h index ec7b94429b9..077b07a24ef 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -33,8 +33,8 @@ extern struct list_head slab_caches; extern struct kmem_cache *kmem_cache; /* Functions provided by the slab allocators */ -struct kmem_cache *__kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)); +extern int __kmem_cache_create(struct kmem_cache *, const char *name, + size_t size, size_t align, unsigned long flags, void (*ctor)(void *)); #ifdef CONFIG_SLUB struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, diff --git a/mm/slab_common.c b/mm/slab_common.c index 4f722084bae..f50d2ed4fbf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -119,19 +119,21 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align if (s) goto out_locked; - s = __kmem_cache_create(n, size, align, flags, ctor); - + s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); if (s) { - /* - * Check if the slab has actually been created and if it was a - * real instatiation. Aliases do not belong on the list - */ - if (s->refcount == 1) + err = __kmem_cache_create(s, n, size, align, flags, ctor); + if (!err) + list_add(&s->list, &slab_caches); + else { + kfree(n); + kmem_cache_free(kmem_cache, s); + } + } else { kfree(n); - err = -ENOSYS; /* Until __kmem_cache_create returns code */ + err = -ENOMEM; } out_locked: diff --git a/mm/slob.c b/mm/slob.c index 50f60532270..9b0cee1e847 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -508,34 +508,26 @@ size_t ksize(const void *block) } EXPORT_SYMBOL(ksize); -struct kmem_cache *__kmem_cache_create(const char *name, size_t size, +int __kmem_cache_create(struct kmem_cache *c, const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { - struct kmem_cache *c; - - c = slob_alloc(sizeof(struct kmem_cache), - GFP_KERNEL, ARCH_KMALLOC_MINALIGN, -1); - - if (c) { - c->name = name; - c->size = size; - if (flags & SLAB_DESTROY_BY_RCU) { - /* leave room for rcu footer at the end of object */ - c->size += sizeof(struct slob_rcu); - } - c->flags = flags; - c->ctor = ctor; - /* ignore alignment unless it's forced */ - c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; - if (c->align < ARCH_SLAB_MINALIGN) - c->align = ARCH_SLAB_MINALIGN; - if (c->align < align) - c->align = align; - - kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); - c->refcount = 1; + c->name = name; + c->size = size; + if (flags & SLAB_DESTROY_BY_RCU) { + /* leave room for rcu footer at the end of object */ + c->size += sizeof(struct slob_rcu); } - return c; + c->flags = flags; + c->ctor = ctor; + /* ignore alignment unless it's forced */ + c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; + if (c->align < ARCH_SLAB_MINALIGN) + c->align = ARCH_SLAB_MINALIGN; + if (c->align < align) + c->align = align; + + c->refcount = 1; + return 0; } void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) diff --git a/mm/slub.c b/mm/slub.c index 8d00fd78df2..0ad3fffc7d2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3034,7 +3034,6 @@ static int kmem_cache_open(struct kmem_cache *s, size_t align, unsigned long flags, void (*ctor)(void *)) { - memset(s, 0, kmem_size); s->name = name; s->ctor = ctor; s->object_size = size; @@ -3109,7 +3108,7 @@ static int kmem_cache_open(struct kmem_cache *s, goto error; if (alloc_kmem_cache_cpus(s)) - return 1; + return 0; free_kmem_cache_nodes(s); error: @@ -3118,7 +3117,7 @@ error: "order=%u offset=%u flags=%lx\n", s->name, (unsigned long)size, s->size, oo_order(s->oo), s->offset, flags); - return 0; + return -EINVAL; } /* @@ -3260,13 +3259,13 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name, { struct kmem_cache *s; - s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); + s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); /* * This function is called with IRQs disabled during early-boot on * single CPU so there's no need to take slab_mutex here. */ - if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, + if (kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, flags, NULL)) goto panic; @@ -3944,20 +3943,11 @@ struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, return s; } -struct kmem_cache *__kmem_cache_create(const char *name, size_t size, +int __kmem_cache_create(struct kmem_cache *s, + const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { - struct kmem_cache *s; - - s = kmem_cache_alloc(kmem_cache, GFP_KERNEL); - if (s) { - if (kmem_cache_open(s, name, - size, align, flags, ctor)) { - return s; - } - kmem_cache_free(kmem_cache, s); - } - return NULL; + return kmem_cache_open(s, name, size, align, flags, ctor); } #ifdef CONFIG_SMP -- cgit v1.2.3 From 8a13a4cc80bb25c9eab2e7e56bab724fcfa55fce Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 4 Sep 2012 23:18:33 +0000 Subject: mm/sl[aou]b: Shrink __kmem_cache_create() parameter lists Do the initial settings of the fields in common code. This will allow us to push more processing into common code later and improve readability. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slab.c | 73 +++++++++++++++++++++++++------------------------------- mm/slab.h | 3 +-- mm/slab_common.c | 26 ++++++++++---------- mm/slob.c | 8 +++---- mm/slub.c | 39 ++++++++++++++---------------- 5 files changed, 68 insertions(+), 81 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index abc83334e5f..f1f6d54e129 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1677,20 +1677,20 @@ void __init kmem_cache_init(void) */ sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - __kmem_cache_create(sizes[INDEX_AC].cs_cachep, names[INDEX_AC].name, - sizes[INDEX_AC].cs_size, - ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL); - + sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name; + sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size; + sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size; + sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN; + __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches); + if (INDEX_AC != INDEX_L3) { sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - __kmem_cache_create(sizes[INDEX_L3].cs_cachep, names[INDEX_L3].name, - sizes[INDEX_L3].cs_size, - ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL); + sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name; + sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size; + sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size; + sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN; + __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches); } @@ -1706,22 +1706,21 @@ void __init kmem_cache_init(void) */ if (!sizes->cs_cachep) { sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - __kmem_cache_create(sizes->cs_cachep, names->name, - sizes->cs_size, - ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL); + sizes->cs_cachep->name = names->name; + sizes->cs_cachep->size = sizes->cs_size; + sizes->cs_cachep->object_size = sizes->cs_size; + sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN; + __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); list_add(&sizes->cs_cachep->list, &slab_caches); } #ifdef CONFIG_ZONE_DMA sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + sizes->cs_dmacachep->name = names->name_dma; + sizes->cs_dmacachep->size = sizes->cs_size; + sizes->cs_dmacachep->object_size = sizes->cs_size; + sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN; __kmem_cache_create(sizes->cs_dmacachep, - names->name_dma, - sizes->cs_size, - ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| - SLAB_PANIC, - NULL); + ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC); list_add(&sizes->cs_dmacachep->list, &slab_caches); #endif sizes++; @@ -2360,12 +2359,12 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) * as davem. */ int -__kmem_cache_create (struct kmem_cache *cachep, const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) +__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) { size_t left_over, slab_size, ralign; gfp_t gfp; int err; + size_t size = cachep->size; #if DEBUG #if FORCED_DEBUG @@ -2437,8 +2436,8 @@ __kmem_cache_create (struct kmem_cache *cachep, const char *name, size_t size, s ralign = ARCH_SLAB_MINALIGN; } /* 3) caller mandated alignment */ - if (ralign < align) { - ralign = align; + if (ralign < cachep->align) { + ralign = cachep->align; } /* disable debug if necessary */ if (ralign > __alignof__(unsigned long long)) @@ -2446,7 +2445,7 @@ __kmem_cache_create (struct kmem_cache *cachep, const char *name, size_t size, s /* * 4) Store it. */ - align = ralign; + cachep->align = ralign; if (slab_is_available()) gfp = GFP_KERNEL; @@ -2454,8 +2453,6 @@ __kmem_cache_create (struct kmem_cache *cachep, const char *name, size_t size, s gfp = GFP_NOWAIT; cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; - cachep->object_size = size; - cachep->align = align; #if DEBUG /* @@ -2500,17 +2497,15 @@ __kmem_cache_create (struct kmem_cache *cachep, const char *name, size_t size, s */ flags |= CFLGS_OFF_SLAB; - size = ALIGN(size, align); + size = ALIGN(size, cachep->align); - left_over = calculate_slab_order(cachep, size, align, flags); + left_over = calculate_slab_order(cachep, size, cachep->align, flags); - if (!cachep->num) { - printk(KERN_ERR - "kmem_cache_create: couldn't create cache %s.\n", name); + if (!cachep->num) return -E2BIG; - } + slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) - + sizeof(struct slab), align); + + sizeof(struct slab), cachep->align); /* * If the slab has been placed off-slab, and we have enough space then @@ -2538,8 +2533,8 @@ __kmem_cache_create (struct kmem_cache *cachep, const char *name, size_t size, s cachep->colour_off = cache_line_size(); /* Offset must be a multiple of the alignment. */ - if (cachep->colour_off < align) - cachep->colour_off = align; + if (cachep->colour_off < cachep->align) + cachep->colour_off = cachep->align; cachep->colour = left_over / cachep->colour_off; cachep->slab_size = slab_size; cachep->flags = flags; @@ -2560,8 +2555,6 @@ __kmem_cache_create (struct kmem_cache *cachep, const char *name, size_t size, s */ BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); } - cachep->ctor = ctor; - cachep->name = name; cachep->refcount = 1; err = setup_cpu_cache(cachep, gfp); diff --git a/mm/slab.h b/mm/slab.h index 077b07a24ef..67aeaa2d39c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -33,8 +33,7 @@ extern struct list_head slab_caches; extern struct kmem_cache *kmem_cache; /* Functions provided by the slab allocators */ -extern int __kmem_cache_create(struct kmem_cache *, const char *name, - size_t size, size_t align, unsigned long flags, void (*ctor)(void *)); +extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); #ifdef CONFIG_SLUB struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, diff --git a/mm/slab_common.c b/mm/slab_common.c index f50d2ed4fbf..8a85a19d90e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -100,7 +100,6 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align { struct kmem_cache *s = NULL; int err = 0; - char *n; get_online_cpus(); mutex_lock(&slab_mutex); @@ -109,32 +108,33 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align goto out_locked; - n = kstrdup(name, GFP_KERNEL); - if (!n) { - err = -ENOMEM; - goto out_locked; - } - s = __kmem_cache_alias(name, size, align, flags, ctor); if (s) goto out_locked; s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); if (s) { - err = __kmem_cache_create(s, n, size, align, flags, ctor); + s->object_size = s->size = size; + s->align = align; + s->ctor = ctor; + s->name = kstrdup(name, GFP_KERNEL); + if (!s->name) { + kmem_cache_free(kmem_cache, s); + err = -ENOMEM; + goto out_locked; + } + + err = __kmem_cache_create(s, flags); if (!err) list_add(&s->list, &slab_caches); else { - kfree(n); + kfree(s->name); kmem_cache_free(kmem_cache, s); } - - } else { - kfree(n); + } else err = -ENOMEM; - } out_locked: mutex_unlock(&slab_mutex); diff --git a/mm/slob.c b/mm/slob.c index 9b0cee1e847..cac05d92f32 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -508,17 +508,15 @@ size_t ksize(const void *block) } EXPORT_SYMBOL(ksize); -int __kmem_cache_create(struct kmem_cache *c, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) { - c->name = name; - c->size = size; + size_t align = c->size; + if (flags & SLAB_DESTROY_BY_RCU) { /* leave room for rcu footer at the end of object */ c->size += sizeof(struct slob_rcu); } c->flags = flags; - c->ctor = ctor; /* ignore alignment unless it's forced */ c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; if (c->align < ARCH_SLAB_MINALIGN) diff --git a/mm/slub.c b/mm/slub.c index 0ad3fffc7d2..d8ee419d5a1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3029,16 +3029,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) } -static int kmem_cache_open(struct kmem_cache *s, - const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(void *)) +static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) { - s->name = name; - s->ctor = ctor; - s->object_size = size; - s->align = align; - s->flags = kmem_cache_flags(size, flags, name, ctor); + s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); s->reserved = 0; if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) @@ -3115,7 +3108,7 @@ error: if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u " "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)size, s->size, oo_order(s->oo), + s->name, (unsigned long)s->size, s->size, oo_order(s->oo), s->offset, flags); return -EINVAL; } @@ -3261,12 +3254,15 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name, s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + s->name = name; + s->size = s->object_size = size; + s->align = ARCH_KMALLOC_MINALIGN; + /* * This function is called with IRQs disabled during early-boot on * single CPU so there's no need to take slab_mutex here. */ - if (kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, - flags, NULL)) + if (kmem_cache_open(s, flags)) goto panic; list_add(&s->list, &slab_caches); @@ -3719,9 +3715,10 @@ void __init kmem_cache_init(void) */ kmem_cache_node = (void *)kmem_cache + kmalloc_size; - kmem_cache_open(kmem_cache_node, "kmem_cache_node", - sizeof(struct kmem_cache_node), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + kmem_cache_node->name = "kmem_cache_node"; + kmem_cache_node->size = kmem_cache_node->object_size = + sizeof(struct kmem_cache_node); + kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC); hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); @@ -3729,8 +3726,10 @@ void __init kmem_cache_init(void) slab_state = PARTIAL; temp_kmem_cache = kmem_cache; - kmem_cache_open(kmem_cache, "kmem_cache", kmem_size, - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + kmem_cache->name = "kmem_cache"; + kmem_cache->size = kmem_cache->object_size = kmem_size; + kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC); + kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); memcpy(kmem_cache, temp_kmem_cache, kmem_size); @@ -3943,11 +3942,9 @@ struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, return s; } -int __kmem_cache_create(struct kmem_cache *s, - const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) { - return kmem_cache_open(s, name, size, align, flags, ctor); + return kmem_cache_open(s, flags); } #ifdef CONFIG_SMP -- cgit v1.2.3 From cce89f4f6911286500cf7be0363f46c9b0a12ce0 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 4 Sep 2012 23:38:33 +0000 Subject: mm/sl[aou]b: Move kmem_cache refcounting to common code Get rid of the refcount stuff in the allocators and do that part of kmem_cache management in the common code. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slab.c | 1 - mm/slab_common.c | 5 +++-- mm/slob.c | 1 - mm/slub.c | 1 - 4 files changed, 3 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index f1f6d54e129..11d9af5f9d2 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2555,7 +2555,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) */ BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); } - cachep->refcount = 1; err = setup_cpu_cache(cachep, gfp); if (err) { diff --git a/mm/slab_common.c b/mm/slab_common.c index 8a85a19d90e..651a3c60847 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -125,11 +125,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align } err = __kmem_cache_create(s, flags); - if (!err) + if (!err) { + s->refcount = 1; list_add(&s->list, &slab_caches); - else { + } else { kfree(s->name); kmem_cache_free(kmem_cache, s); } diff --git a/mm/slob.c b/mm/slob.c index cac05d92f32..3edfeaac320 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -524,7 +524,6 @@ int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) if (c->align < align) c->align = align; - c->refcount = 1; return 0; } diff --git a/mm/slub.c b/mm/slub.c index d8ee419d5a1..0b122d8ec21 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3093,7 +3093,6 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) else s->cpu_partial = 30; - s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif -- cgit v1.2.3 From aac3a1664aba429f47c70edfc76ee10fcd808471 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 5 Sep 2012 12:07:44 +0300 Subject: Revert "mm/sl[aou]b: Move sysfs_slab_add to common" This reverts commit 96d17b7be0a9849d381442030886211dbb2a7061 which caused the following errors at boot: [ 1.114885] kobject (ffff88001a802578): tried to init an initialized object, something is seriously wrong. [ 1.114885] Pid: 1, comm: swapper/0 Tainted: G W 3.6.0-rc1+ #6 [ 1.114885] Call Trace: [ 1.114885] [] kobject_init+0x87/0xa0 [ 1.115555] [] kobject_init_and_add+0x2a/0x90 [ 1.115555] [] ? sprintf+0x40/0x50 [ 1.115555] [] sysfs_slab_add+0x80/0x210 [ 1.115555] [] kmem_cache_create+0xa5/0x250 [ 1.115555] [] ? md_init+0x144/0x144 [ 1.115555] [] local_init+0xa4/0x11b [ 1.115555] [] dm_init+0x14/0x45 [ 1.115836] [] do_one_initcall+0x3a/0x160 [ 1.116834] [] kernel_init+0x133/0x1b7 [ 1.117835] [] ? do_early_param+0x86/0x86 [ 1.117835] [] kernel_thread_helper+0x4/0x10 [ 1.118401] [] ? start_kernel+0x33f/0x33f [ 1.119832] [] ? gs_change+0xb/0xb [ 1.120325] ------------[ cut here ]------------ [ 1.120835] WARNING: at fs/sysfs/dir.c:536 sysfs_add_one+0xc1/0xf0() [ 1.121437] sysfs: cannot create duplicate filename '/kernel/slab/:t-0000016' [ 1.121831] Modules linked in: [ 1.122138] Pid: 1, comm: swapper/0 Tainted: G W 3.6.0-rc1+ #6 [ 1.122831] Call Trace: [ 1.123074] [] ? sysfs_add_one+0xc1/0xf0 [ 1.123833] [] warn_slowpath_common+0x7a/0xb0 [ 1.124405] [] warn_slowpath_fmt+0x41/0x50 [ 1.124832] [] sysfs_add_one+0xc1/0xf0 [ 1.125337] [] create_dir+0x73/0xd0 [ 1.125832] [] sysfs_create_dir+0x81/0xe0 [ 1.126363] [] kobject_add_internal+0x9d/0x210 [ 1.126832] [] kobject_init_and_add+0x63/0x90 [ 1.127406] [] sysfs_slab_add+0x80/0x210 [ 1.127832] [] kmem_cache_create+0xa5/0x250 [ 1.128384] [] ? md_init+0x144/0x144 [ 1.128833] [] local_init+0xa4/0x11b [ 1.129831] [] dm_init+0x14/0x45 [ 1.130305] [] do_one_initcall+0x3a/0x160 [ 1.130831] [] kernel_init+0x133/0x1b7 [ 1.131351] [] ? do_early_param+0x86/0x86 [ 1.131830] [] kernel_thread_helper+0x4/0x10 [ 1.132392] [] ? start_kernel+0x33f/0x33f [ 1.132830] [] ? gs_change+0xb/0xb [ 1.133315] ---[ end trace 2703540871c8fab7 ]--- [ 1.133830] ------------[ cut here ]------------ [ 1.134274] WARNING: at lib/kobject.c:196 kobject_add_internal+0x1f5/0x210() [ 1.134829] kobject_add_internal failed for :t-0000016 with -EEXIST, don't try to register things with the same name in the same directory. [ 1.135829] Modules linked in: [ 1.136135] Pid: 1, comm: swapper/0 Tainted: G W 3.6.0-rc1+ #6 [ 1.136828] Call Trace: [ 1.137071] [] ? kobject_add_internal+0x1f5/0x210 [ 1.137830] [] warn_slowpath_common+0x7a/0xb0 [ 1.138402] [] warn_slowpath_fmt+0x41/0x50 [ 1.138830] [] ? release_sysfs_dirent+0x73/0xf0 [ 1.139419] [] kobject_add_internal+0x1f5/0x210 [ 1.139830] [] kobject_init_and_add+0x63/0x90 [ 1.140429] [] sysfs_slab_add+0x80/0x210 [ 1.140830] [] kmem_cache_create+0xa5/0x250 [ 1.141829] [] ? md_init+0x144/0x144 [ 1.142307] [] local_init+0xa4/0x11b [ 1.142829] [] dm_init+0x14/0x45 [ 1.143307] [] do_one_initcall+0x3a/0x160 [ 1.143829] [] kernel_init+0x133/0x1b7 [ 1.144352] [] ? do_early_param+0x86/0x86 [ 1.144829] [] kernel_thread_helper+0x4/0x10 [ 1.145405] [] ? start_kernel+0x33f/0x33f [ 1.145828] [] ? gs_change+0xb/0xb [ 1.146313] ---[ end trace 2703540871c8fab8 ]--- Conflicts: mm/slub.c Signed-off-by: Pekka Enberg --- mm/slab.h | 3 --- mm/slab_common.c | 8 -------- mm/slub.c | 19 +++++++++++++++++-- 3 files changed, 17 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index 67aeaa2d39c..7deeb449a30 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -38,13 +38,10 @@ extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); #ifdef CONFIG_SLUB struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)); -extern int sysfs_slab_add(struct kmem_cache *s); #else static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { return NULL; } -static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } - #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index 651a3c60847..9c217255ac4 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -155,14 +155,6 @@ out_locked: return NULL; } - if (s->refcount == 1) { - err = sysfs_slab_add(s); - if (err) - printk(KERN_WARNING "kmem_cache_create(%s) failed to" - " create sysfs entry. Error %d\n", - name, err); - } - return s; } EXPORT_SYMBOL(kmem_cache_create); diff --git a/mm/slub.c b/mm/slub.c index 0b122d8ec21..dafd465f7a3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -202,10 +202,12 @@ struct track { enum track_item { TRACK_ALLOC, TRACK_FREE }; #ifdef CONFIG_SYSFS +static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void sysfs_slab_remove(struct kmem_cache *); #else +static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } static inline void sysfs_slab_remove(struct kmem_cache *s) { } @@ -3943,7 +3945,20 @@ struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) { - return kmem_cache_open(s, flags); + int err; + + err = kmem_cache_open(s, flags); + if (err) + return err; + + mutex_unlock(&slab_mutex); + err = sysfs_slab_add(s); + mutex_lock(&slab_mutex); + + if (err) + kmem_cache_close(s); + + return err; } #ifdef CONFIG_SMP @@ -5233,7 +5248,7 @@ static char *create_unique_id(struct kmem_cache *s) return name; } -int sysfs_slab_add(struct kmem_cache *s) +static int sysfs_slab_add(struct kmem_cache *s) { int err; const char *name; -- cgit v1.2.3 From 9df53b154ac712c87db1170057aa6df05eb7bdbd Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 8 Sep 2012 18:27:10 +0000 Subject: slub: Zero initial memory segment for kmem_cache and kmem_cache_node Tony Luck reported the following problem on IA-64: Worked fine yesterday on next-20120905, crashes today. First sign of trouble was an unaligned access, then a NULL dereference. SL*B related bits of my config: CONFIG_SLUB_DEBUG=y # CONFIG_SLAB is not set CONFIG_SLUB=y CONFIG_SLABINFO=y # CONFIG_SLUB_DEBUG_ON is not set # CONFIG_SLUB_STATS is not set And he console log. PID hash table entries: 4096 (order: 1, 32768 bytes) Dentry cache hash table entries: 262144 (order: 7, 2097152 bytes) Inode-cache hash table entries: 131072 (order: 6, 1048576 bytes) Memory: 2047920k/2086064k available (13992k code, 38144k reserved, 6012k data, 880k init) kernel unaligned access to 0xca2ffc55fb373e95, ip=0xa0000001001be550 swapper[0]: error during unaligned kernel access -1 [1] Modules linked in: Pid: 0, CPU 0, comm: swapper psr : 00001010084a2018 ifs : 800000000000060f ip : [] Not tainted (3.6.0-rc4-zx1-smp-next-20120906) ip is at new_slab+0x90/0x680 unat: 0000000000000000 pfs : 000000000000060f rsc : 0000000000000003 rnat: 9666960159966a59 bsps: a0000001001441c0 pr : 9666960159965a59 ldrs: 0000000000000000 ccv : 0000000000000000 fpsr: 0009804c8a70433f csd : 0000000000000000 ssd : 0000000000000000 b0 : a0000001001be500 b6 : a00000010112cb20 b7 : a0000001011660a0 f6 : 0fff7f0f0f0f0e54f0000 f7 : 0ffe8c5c1000000000000 f8 : 1000d8000000000000000 f9 : 100068800000000000000 f10 : 10005f0f0f0f0e54f0000 f11 : 1003e0000000000000078 r1 : a00000010155eef0 r2 : 0000000000000000 r3 : fffffffffffc1638 r8 : e0000040600081b8 r9 : ca2ffc55fb373e95 r10 : 0000000000000000 r11 : e000004040001646 r12 : a000000101287e20 r13 : a000000101280000 r14 : 0000000000004000 r15 : 0000000000000078 r16 : ca2ffc55fb373e75 r17 : e000004040040000 r18 : fffffffffffc1646 r19 : e000004040001646 r20 : fffffffffffc15f8 r21 : 000000000000004d r22 : a00000010132fa68 r23 : 00000000000000ed r24 : 0000000000000000 r25 : 0000000000000000 r26 : 0000000000000001 r27 : a0000001012b8500 r28 : a00000010135f4a0 r29 : 0000000000000000 r30 : 0000000000000000 r31 : 0000000000000001 Unable to handle kernel NULL pointer dereference (address 0000000000000018) swapper[0]: Oops 11003706212352 [2] Modules linked in: Pid: 0, CPU 0, comm: swapper psr : 0000121008022018 ifs : 800000000000cc18 ip : [] Not tainted (3.6.0-rc4-zx1-smp-next-20120906) ip is at __copy_user+0x891/0x960 unat: 0000000000000000 pfs : 0000000000000813 rsc : 0000000000000003 rnat: 0000000000000000 bsps: 0000000000000000 pr : 9666960159961765 ldrs: 0000000000000000 ccv : 0000000000000000 fpsr: 0009804c0270033f csd : 0000000000000000 ssd : 0000000000000000 b0 : a00000010004b550 b6 : a00000010004b740 b7 : a00000010000c750 f6 : 000000000000000000000 f7 : 1003e9e3779b97f4a7c16 f8 : 1003e0a00000010001550 f9 : 100068800000000000000 f10 : 10005f0f0f0f0e54f0000 f11 : 1003e0000000000000078 r1 : a00000010155eef0 r2 : a0000001012870b0 r3 : a0000001012870b8 r8 : 0000000000000298 r9 : 0000000000000013 r10 : 0000000000000000 r11 : 9666960159961a65 r12 : a000000101287010 r13 : a000000101280000 r14 : a000000101287068 r15 : a000000101287080 r16 : 0000000000000298 r17 : 0000000000000010 r18 : 0000000000000018 r19 : a000000101287310 r20 : 0000000000000290 r21 : 0000000000000000 r22 : 0000000000000000 r23 : a000000101386f58 r24 : 0000000000000000 r25 : 000000007fffffff r26 : a000000101287078 r27 : a0000001013c69b0 r28 : 0000000000000000 r29 : 0000000000000014 r30 : 0000000000000000 r31 : 0000000000000813 Sedat Dilek and Hugh Dickins reported similar problems as well. Earlier patches in the common set moved the zeroing of the kmem_cache structure into common code. See "Move allocation of kmem_cache into common code". The allocation for the two special structures is still done from SLUB specific code but no zeroing is done since the cache creation functions used to zero. This now needs to be updated so that the structures are zeroed during allocation in kmem_cache_init(). Otherwise random pointer values may be followed. Reported-by: Tony Luck Reported-by: Sedat Dilek Tested-by: Sedat Dilek Reported-by: Hugh Dickins Tested-by: Sedat Dilek Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index dafd465f7a3..2258ed82880 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3707,7 +3707,7 @@ void __init kmem_cache_init(void) /* Allocate two kmem_caches from the page allocator */ kmalloc_size = ALIGN(kmem_size, cache_line_size()); order = get_order(2 * kmalloc_size); - kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order); + kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order); /* * Must first have the slab cache available for the allocations of the -- cgit v1.2.3 From 947ca1856a7e60aa6d20536785e6a42dff25aa6e Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 5 Sep 2012 10:33:18 +0800 Subject: slab: fix the DEADLOCK issue on l3 alien lock DEADLOCK will be report while running a kernel with NUMA and LOCKDEP enabled, the process of this fake report is: kmem_cache_free() //free obj in cachep -> cache_free_alien() //acquire cachep's l3 alien lock -> __drain_alien_cache() -> free_block() -> slab_destroy() -> kmem_cache_free() //free slab in cachep->slabp_cache -> cache_free_alien() //acquire cachep->slabp_cache's l3 alien lock Since the cachep and cachep->slabp_cache's l3 alien are in the same lock class, fake report generated. This should not happen since we already have init_lock_keys() which will reassign the lock class for both l3 list and l3 alien. However, init_lock_keys() was invoked at a wrong position which is before we invoke enable_cpucache() on each cache. Since until set slab_state to be FULL, we won't invoke enable_cpucache() on caches to build their l3 alien while creating them, so although we invoked init_lock_keys(), the l3 alien lock class won't change since we don't have them until invoked enable_cpucache() later. This patch will invoke init_lock_keys() after we done enable_cpucache() instead of before to avoid the fake DEADLOCK report. Michael traced the problem back to a commit in release 3.0.0: commit 30765b92ada267c5395fc788623cb15233276f5c Author: Peter Zijlstra Date: Thu Jul 28 23:22:56 2011 +0200 slab, lockdep: Annotate the locks before using them Fernando found we hit the regular OFF_SLAB 'recursion' before we annotate the locks, cure this. The relevant portion of the stack-trace: > [ 0.000000] [] rt_spin_lock+0x50/0x56 > [ 0.000000] [] __cache_free+0x43/0xc3 > [ 0.000000] [] kmem_cache_free+0x6c/0xdc > [ 0.000000] [] slab_destroy+0x4f/0x53 > [ 0.000000] [] free_block+0x94/0xc1 > [ 0.000000] [] do_tune_cpucache+0x10b/0x2bb > [ 0.000000] [] enable_cpucache+0x7b/0xa7 > [ 0.000000] [] kmem_cache_init_late+0x1f/0x61 > [ 0.000000] [] start_kernel+0x24c/0x363 > [ 0.000000] [] i386_start_kernel+0xa9/0xaf Reported-by: Fernando Lopez-Lezcano Acked-by: Pekka Enberg Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1311888176.2617.379.camel@laptop Signed-off-by: Ingo Molnar The commit moved init_lock_keys() before we build up the alien, so we failed to reclass it. Cc: # 3.0+ Acked-by: Christoph Lameter Tested-by: Paul E. McKenney Signed-off-by: Michael Wang Signed-off-by: Pekka Enberg --- mm/slab.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 3b4587bb7b1..cd5a9265030 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1774,9 +1774,6 @@ void __init kmem_cache_init_late(void) slab_state = UP; - /* Annotate slab for lockdep -- annotate the malloc caches */ - init_lock_keys(); - /* 6) resize the head arrays to their final sizes */ mutex_lock(&slab_mutex); list_for_each_entry(cachep, &slab_caches, list) @@ -1784,6 +1781,9 @@ void __init kmem_cache_init_late(void) BUG(); mutex_unlock(&slab_mutex); + /* Annotate slab for lockdep -- annotate the malloc caches */ + init_lock_keys(); + /* Done! */ slab_state = FULL; -- cgit v1.2.3 From 8c7f6edbda01f1b1a2e60ad61f14fe38023e433b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Sep 2012 12:20:58 -0700 Subject: cgroup: mark subsystems with broken hierarchy support and whine if cgroups are nested for them Currently, cgroup hierarchy support is a mess. cpu related subsystems behave correctly - configuration, accounting and control on a parent properly cover its children. blkio and freezer completely ignore hierarchy and treat all cgroups as if they're directly under the root cgroup. Others show yet different behaviors. These differing interpretations of cgroup hierarchy make using cgroup confusing and it impossible to co-mount controllers into the same hierarchy and obtain sane behavior. Eventually, we want full hierarchy support from all subsystems and probably a unified hierarchy. Users using separate hierarchies expecting completely different behaviors depending on the mounted subsystem is deterimental to making any progress on this front. This patch adds cgroup_subsys.broken_hierarchy and sets it to %true for controllers which are lacking in hierarchy support. The goal of this patch is two-fold. * Move users away from using hierarchy on currently non-hierarchical subsystems, so that implementing proper hierarchy support on those doesn't surprise them. * Keep track of which controllers are broken how and nudge the subsystems to implement proper hierarchy support. For now, start with a single warning message. We can whine louder later on. v2: Fixed a typo spotted by Michal. Warning message updated. v3: Updated memcg part so that it doesn't generate warning in the cases where .use_hierarchy=false doesn't make the behavior different from root.use_hierarchy=true. Fixed a typo spotted by Glauber. v4: Check ->broken_hierarchy after cgroup creation is complete so that ->create() can affect the result per Michal. Dropped unnecessary memcg root handling per Michal. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Acked-by: Li Zefan Acked-by: Serge E. Hallyn Cc: Glauber Costa Cc: Peter Zijlstra Cc: Paul Turner Cc: Johannes Weiner Cc: Thomas Graf Cc: Vivek Goyal Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Neil Horman Cc: Aneesh Kumar K.V --- mm/memcontrol.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 795e525afab..a72f2ffdc3d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4973,6 +4973,13 @@ mem_cgroup_create(struct cgroup *cont) } else { res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); + /* + * Deeper hierachy with use_hierarchy == false doesn't make + * much sense so let cgroup subsystem know about this + * unfortunate state in our controller. + */ + if (parent && parent != root_mem_cgroup) + mem_cgroup_subsys.broken_hierarchy = true; } memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); -- cgit v1.2.3 From f28510d30c7f03daa290019fbc57ad8277347614 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 11 Sep 2012 19:49:38 +0000 Subject: slab: Only define slab_error for DEBUG On Tue, 11 Sep 2012, Stephen Rothwell wrote: > After merging the final tree, today's linux-next build (sparc64 defconfig) > produced this warning: > > mm/slab.c:808:13: warning: '__slab_error' defined but not used [-Wunused-function] > > Introduced by commit 945cf2b6199b ("mm/sl[aou]b: Extract a common > function for kmem_cache_destroy"). All uses of slab_error() are now > guarded by DEBUG. There is no use case left for slab builds without DEBUG. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slab.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 11d9af5f9d2..8524923966b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -803,6 +803,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, *left_over = slab_size - nr_objs*buffer_size - mgmt_size; } +#if DEBUG #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) static void __slab_error(const char *function, struct kmem_cache *cachep, @@ -812,6 +813,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, function, cachep->name, msg); dump_stack(); } +#endif /* * By default on NUMA we use alien caches to stage the freeing of -- cgit v1.2.3 From 645df230cacc48f4463037016e9dbd3633183fe8 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 18 Sep 2012 15:54:12 -0400 Subject: mm, sl[au]b: Taint kernel when we detect a corrupted slab It doesn't seem worth adding a new taint flag for this, so just re-use the one from 'bad page' Acked-by: Christoph Lameter # SLUB Acked-by: David Rientjes Signed-off-by: Dave Jones Signed-off-by: Pekka Enberg --- mm/slab.c | 1 + mm/slub.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index cd5a9265030..5c6abb831e6 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -811,6 +811,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", function, cachep->name, msg); dump_stack(); + add_taint(TAINT_BAD_PAGE); } /* diff --git a/mm/slub.c b/mm/slub.c index c67bd0a4a95..a6d043e1326 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -568,6 +568,8 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...) printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); printk(KERN_ERR "----------------------------------------" "-------------------------------------\n\n"); + + add_taint(TAINT_BAD_PAGE); } static void slab_fix(struct kmem_cache *s, char *fmt, ...) -- cgit v1.2.3 From a00bb1e9fc0925c0061e9d844523a3a47a7e2f7f Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Fri, 21 Sep 2012 16:40:30 +0800 Subject: mm: frontswap: fix a wrong if condition in frontswap_shrink pages_to_unuse is set to 0 to unuse all frontswap pages But that doesn't happen since a wrong condition in frontswap_shrink cancel it. -v2: Add comment to explain return value of __frontswap_shrink, as suggested by Dan Carpenter, thanks Signed-off-by: Zhenzhong Duan Signed-off-by: Konrad Rzeszutek Wilk --- mm/frontswap.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/frontswap.c b/mm/frontswap.c index 89dc399d332..0547a35f798 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -263,6 +263,11 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, return ret; } +/* + * Used to check if it's necessory and feasible to unuse pages. + * Return 1 when nothing to do, 0 when need to shink pages, + * error code when there is an error. + */ static int __frontswap_shrink(unsigned long target_pages, unsigned long *pages_to_unuse, int *type) @@ -275,7 +280,7 @@ static int __frontswap_shrink(unsigned long target_pages, if (total_pages <= target_pages) { /* Nothing to do */ *pages_to_unuse = 0; - return 0; + return 1; } total_pages_to_unuse = total_pages - target_pages; return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type); @@ -302,7 +307,7 @@ void frontswap_shrink(unsigned long target_pages) spin_lock(&swap_lock); ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); spin_unlock(&swap_lock); - if (ret == 0 && pages_to_unuse) + if (ret == 0) try_to_unuse(type, true, pages_to_unuse); return; } -- cgit v1.2.3 From e3483a5f3a8ef448c229a2aceca9b2ad6a46b8ec Mon Sep 17 00:00:00 2001 From: Dan Magenheimer Date: Thu, 20 Sep 2012 12:16:52 -0700 Subject: frontswap: support exclusive gets if tmem backend is capable Tmem, as originally specified, assumes that "get" operations performed on persistent pools never flush the page of data out of tmem on a successful get, waiting instead for a flush operation. This is intended to mimic the model of a swap disk, where a disk read is non-destructive. Unlike a disk, however, freeing up the RAM can be valuable. Over the years that frontswap was in the review process, several reviewers (and notably Hugh Dickins in 2010) pointed out that this would result, at least temporarily, in two copies of the data in RAM: one (compressed for zcache) copy in tmem, and one copy in the swap cache. We wondered if this could be done differently, at least optionally. This patch allows tmem backends to instruct the frontswap code that this backend performs exclusive gets. Zcache2 already contains hooks to support this feature. Other backends are completely unaffected unless/until they are updated to support this feature. While it is not clear that exclusive gets are a performance win on all workloads at all times, this small patch allows for experimentation by backends. P.S. Let's not quibble about the naming of "get" vs "read" vs "load" etc. The naming is currently horribly inconsistent between cleancache and frontswap and existing tmem backends, so will need to be straightened out as a separate patch. "Get" is used by the tmem architecture spec, existing backends, and all documentation and presentation material so I am using it in this patch. Signed-off-by: Dan Magenheimer Signed-off-by: Konrad Rzeszutek Wilk --- mm/frontswap.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/frontswap.c b/mm/frontswap.c index 0547a35f798..2890e67d602 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -44,6 +44,13 @@ EXPORT_SYMBOL(frontswap_enabled); */ static bool frontswap_writethrough_enabled __read_mostly; +/* + * If enabled, the underlying tmem implementation is capable of doing + * exclusive gets, so frontswap_load, on a successful tmem_get must + * mark the page as no longer in frontswap AND mark it dirty. + */ +static bool frontswap_tmem_exclusive_gets_enabled __read_mostly; + #ifdef CONFIG_DEBUG_FS /* * Counters available via /sys/kernel/debug/frontswap (if debugfs is @@ -96,6 +103,15 @@ void frontswap_writethrough(bool enable) } EXPORT_SYMBOL(frontswap_writethrough); +/* + * Enable/disable frontswap exclusive gets (see above). + */ +void frontswap_tmem_exclusive_gets(bool enable) +{ + frontswap_tmem_exclusive_gets_enabled = enable; +} +EXPORT_SYMBOL(frontswap_tmem_exclusive_gets); + /* * Called when a swap device is swapon'd. */ @@ -174,8 +190,13 @@ int __frontswap_load(struct page *page) BUG_ON(sis == NULL); if (frontswap_test(sis, offset)) ret = frontswap_ops.load(type, offset, page); - if (ret == 0) + if (ret == 0) { inc_frontswap_loads(); + if (frontswap_tmem_exclusive_gets_enabled) { + SetPageDirty(page); + frontswap_clear(sis, offset); + } + } return ret; } EXPORT_SYMBOL(__frontswap_load); -- cgit v1.2.3 From 58fac09566bb48592a09ef0fe0c7dbefa0cd2109 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 17 Aug 2012 12:33:34 +0800 Subject: kmemleak: Replace list_for_each_continue_rcu with new interface This patch replaces list_for_each_continue_rcu() with list_for_each_entry_continue_rcu() to save a few lines of code and allow removing list_for_each_continue_rcu(). Signed-off-by: Michael Wang Acked-by: Catalin Marinas Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- mm/kmemleak.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 45eb6217bf3..0de83b4541e 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1483,13 +1483,11 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct kmemleak_object *prev_obj = v; struct kmemleak_object *next_obj = NULL; - struct list_head *n = &prev_obj->object_list; + struct kmemleak_object *obj = prev_obj; ++(*pos); - list_for_each_continue_rcu(n, &object_list) { - struct kmemleak_object *obj = - list_entry(n, struct kmemleak_object, object_list); + list_for_each_entry_continue_rcu(obj, &object_list, object_list) { if (get_object(obj)) { next_obj = obj; break; -- cgit v1.2.3 From 90f2cbbc49a8fe5a49cea1d362d90e377b949d49 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Sat, 8 Sep 2012 17:47:51 -0300 Subject: mm, slob: Use NUMA_NO_NODE instead of -1 Acked-by: David Rientjes Signed-off-by: Ezequiel Garcia Signed-off-by: Pekka Enberg --- mm/slob.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index 45d4ca79933..191e1713a6d 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -194,7 +194,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) void *page; #ifdef CONFIG_NUMA - if (node != -1) + if (node != NUMA_NO_NODE) page = alloc_pages_exact_node(node, gfp, order); else #endif @@ -290,7 +290,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) * If there's a node specification, search for a partial * page with a matching node id in the freelist. */ - if (node != -1 && page_to_nid(sp) != node) + if (node != NUMA_NO_NODE && page_to_nid(sp) != node) continue; #endif /* Enough room on this page? */ @@ -514,7 +514,7 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, struct kmem_cache *c; c = slob_alloc(sizeof(struct kmem_cache), - GFP_KERNEL, ARCH_KMALLOC_MINALIGN, -1); + GFP_KERNEL, ARCH_KMALLOC_MINALIGN, NUMA_NO_NODE); if (c) { c->name = name; -- cgit v1.2.3 From ff4fcd01ec86d98d15d2fd96f22f19bb1d341b88 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Sat, 8 Sep 2012 17:47:52 -0300 Subject: mm, slab: Remove silly function slab_buffer_size() This function is seldom used, and can be simply replaced with cachep->size. Acked-by: David Rientjes Signed-off-by: Ezequiel Garcia Signed-off-by: Pekka Enberg --- mm/slab.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 5c6abb831e6..7072848701e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -498,14 +498,6 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif -#ifdef CONFIG_TRACING -size_t slab_buffer_size(struct kmem_cache *cachep) -{ - return cachep->size; -} -EXPORT_SYMBOL(slab_buffer_size); -#endif - /* * Do not go above this order unless 0 objects fit into the slab or * overridden on the command line. @@ -3850,7 +3842,7 @@ kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags) ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); trace_kmalloc(_RET_IP_, ret, - size, slab_buffer_size(cachep), flags); + size, cachep->size, flags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_trace); @@ -3881,7 +3873,7 @@ void *kmem_cache_alloc_node_trace(size_t size, ret = __cache_alloc_node(cachep, flags, nodeid, __builtin_return_address(0)); trace_kmalloc_node(_RET_IP_, ret, - size, slab_buffer_size(cachep), + size, cachep->size, flags, nodeid); return ret; } -- cgit v1.2.3 From f3f741019595f1e73564d985f5fe8abcbb98c769 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Sat, 8 Sep 2012 17:47:53 -0300 Subject: mm, slob: Add support for kmalloc_track_caller() Currently slob falls back to regular kmalloc for this case. With this patch kmalloc_track_caller() is correctly implemented, thus tracing the specified caller. This is important to trace accurately allocations performed by krealloc, kstrdup, kmemdup, etc. Signed-off-by: Ezequiel Garcia Signed-off-by: Pekka Enberg --- mm/slob.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index 191e1713a6d..dd47d16d57b 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -425,7 +425,8 @@ out: * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. */ -void *__kmalloc_node(size_t size, gfp_t gfp, int node) +static __always_inline void * +__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) { unsigned int *m; int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); @@ -446,7 +447,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) *m = size; ret = (void *)m + align; - trace_kmalloc_node(_RET_IP_, ret, + trace_kmalloc_node(caller, ret, size, size + align, gfp, node); } else { unsigned int order = get_order(size); @@ -460,15 +461,35 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) page->private = size; } - trace_kmalloc_node(_RET_IP_, ret, + trace_kmalloc_node(caller, ret, size, PAGE_SIZE << order, gfp, node); } kmemleak_alloc(ret, size, 1, gfp); return ret; } + +void *__kmalloc_node(size_t size, gfp_t gfp, int node) +{ + return __do_kmalloc_node(size, gfp, node, _RET_IP_); +} EXPORT_SYMBOL(__kmalloc_node); +#ifdef CONFIG_TRACING +void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) +{ + return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); +} + +#ifdef CONFIG_NUMA +void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, + int node, unsigned long caller) +{ + return __do_kmalloc_node(size, gfp, node, caller); +} +#endif +#endif + void kfree(const void *block) { struct page *sp; -- cgit v1.2.3 From 7c0cb9c64f83dd9bf18b139ac0de00daba6087c2 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Sat, 8 Sep 2012 17:47:55 -0300 Subject: mm, slab: Replace 'caller' type, void* -> unsigned long This allows to use _RET_IP_ instead of builtin_address(0), thus achiveing implementation consistency in all three allocators. Though maybe a nitpick, the real goal behind this patch is to be able to obtain common code between SLAB and SLUB. Signed-off-by: Ezequiel Garcia Signed-off-by: Pekka Enberg --- mm/slab.c | 50 ++++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 7072848701e..6c356793612 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3084,7 +3084,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) } static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, - void *caller) + unsigned long caller) { struct page *page; unsigned int objnr; @@ -3104,7 +3104,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, *dbg_redzone2(cachep, objp) = RED_INACTIVE; } if (cachep->flags & SLAB_STORE_USER) - *dbg_userword(cachep, objp) = caller; + *dbg_userword(cachep, objp) = (void *)caller; objnr = obj_to_index(cachep, slabp, objp); @@ -3117,7 +3117,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { - store_stackinfo(cachep, objp, (unsigned long)caller); + store_stackinfo(cachep, objp, caller); kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, 0); } else { @@ -3270,7 +3270,7 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, #if DEBUG static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, - gfp_t flags, void *objp, void *caller) + gfp_t flags, void *objp, unsigned long caller) { if (!objp) return objp; @@ -3287,7 +3287,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, poison_obj(cachep, objp, POISON_INUSE); } if (cachep->flags & SLAB_STORE_USER) - *dbg_userword(cachep, objp) = caller; + *dbg_userword(cachep, objp) = (void *)caller; if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || @@ -3562,7 +3562,7 @@ done: */ static __always_inline void * __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, - void *caller) + unsigned long caller) { unsigned long save_flags; void *ptr; @@ -3648,7 +3648,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) #endif /* CONFIG_NUMA */ static __always_inline void * -__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) +__cache_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) { unsigned long save_flags; void *objp; @@ -3784,7 +3784,7 @@ free_done: * be in this state _before_ it is released. Called with disabled ints. */ static inline void __cache_free(struct kmem_cache *cachep, void *objp, - void *caller) + unsigned long caller) { struct array_cache *ac = cpu_cache_get(cachep); @@ -3824,7 +3824,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { - void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); + void *ret = __cache_alloc(cachep, flags, _RET_IP_); trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags); @@ -3839,7 +3839,7 @@ kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags) { void *ret; - ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); + ret = __cache_alloc(cachep, flags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, cachep->size, flags); @@ -3851,8 +3851,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - void *ret = __cache_alloc_node(cachep, flags, nodeid, - __builtin_return_address(0)); + void *ret = __cache_alloc_node(cachep, flags, nodeid, _RET_IP_); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep->object_size, cachep->size, @@ -3870,8 +3869,8 @@ void *kmem_cache_alloc_node_trace(size_t size, { void *ret; - ret = __cache_alloc_node(cachep, flags, nodeid, - __builtin_return_address(0)); + ret = __cache_alloc_node(cachep, flags, nodeid, _RET_IP); + trace_kmalloc_node(_RET_IP_, ret, size, cachep->size, flags, nodeid); @@ -3881,7 +3880,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif static __always_inline void * -__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) +__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) { struct kmem_cache *cachep; @@ -3894,21 +3893,20 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) void *__kmalloc_node(size_t size, gfp_t flags, int node) { - return __do_kmalloc_node(size, flags, node, - __builtin_return_address(0)); + return __do_kmalloc_node(size, flags, node, _RET_IP_); } EXPORT_SYMBOL(__kmalloc_node); void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, unsigned long caller) { - return __do_kmalloc_node(size, flags, node, (void *)caller); + return __do_kmalloc_node(size, flags, node, caller); } EXPORT_SYMBOL(__kmalloc_node_track_caller); #else void *__kmalloc_node(size_t size, gfp_t flags, int node) { - return __do_kmalloc_node(size, flags, node, NULL); + return __do_kmalloc_node(size, flags, node, 0); } EXPORT_SYMBOL(__kmalloc_node); #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ @@ -3921,7 +3919,7 @@ EXPORT_SYMBOL(__kmalloc_node); * @caller: function caller for debug tracking of the caller */ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, - void *caller) + unsigned long caller) { struct kmem_cache *cachep; void *ret; @@ -3936,7 +3934,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, return cachep; ret = __cache_alloc(cachep, flags, caller); - trace_kmalloc((unsigned long) caller, ret, + trace_kmalloc(caller, ret, size, cachep->size, flags); return ret; @@ -3946,20 +3944,20 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) void *__kmalloc(size_t size, gfp_t flags) { - return __do_kmalloc(size, flags, __builtin_return_address(0)); + return __do_kmalloc(size, flags, _RET_IP_); } EXPORT_SYMBOL(__kmalloc); void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) { - return __do_kmalloc(size, flags, (void *)caller); + return __do_kmalloc(size, flags, caller); } EXPORT_SYMBOL(__kmalloc_track_caller); #else void *__kmalloc(size_t size, gfp_t flags) { - return __do_kmalloc(size, flags, NULL); + return __do_kmalloc(size, flags, 0); } EXPORT_SYMBOL(__kmalloc); #endif @@ -3980,7 +3978,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) debug_check_no_locks_freed(objp, cachep->object_size); if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(objp, cachep->object_size); - __cache_free(cachep, objp, __builtin_return_address(0)); + __cache_free(cachep, objp, _RET_IP_); local_irq_restore(flags); trace_kmem_cache_free(_RET_IP_, objp); @@ -4011,7 +4009,7 @@ void kfree(const void *objp) debug_check_no_locks_freed(objp, c->object_size); debug_check_no_obj_freed(objp, c->object_size); - __cache_free(c, (void *)objp, __builtin_return_address(0)); + __cache_free(c, (void *)objp, _RET_IP_); local_irq_restore(flags); } EXPORT_SYMBOL(kfree); -- cgit v1.2.3 From 4052147c0afa1cf05780ed846f37e87cdde9f628 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Sat, 8 Sep 2012 17:47:56 -0300 Subject: mm, slab: Match SLAB and SLUB kmem_cache_alloc_xxx_trace() prototype This long (seemingly unnecessary) patch does not fix anything and its only goal is to produce common code between SLAB and SLUB. Cc: Christoph Lameter Signed-off-by: Ezequiel Garcia Signed-off-by: Pekka Enberg --- mm/slab.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 6c356793612..4c54a235793 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3835,7 +3835,7 @@ EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING void * -kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags) +kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) { void *ret; @@ -3862,10 +3862,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) EXPORT_SYMBOL(kmem_cache_alloc_node); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_trace(size_t size, - struct kmem_cache *cachep, +void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, gfp_t flags, - int nodeid) + int nodeid, + size_t size) { void *ret; @@ -3887,7 +3887,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) cachep = kmem_find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return kmem_cache_alloc_node_trace(size, cachep, flags, node); + return kmem_cache_alloc_node_trace(cachep, flags, node, size); } #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) -- cgit v1.2.3 From 48356303ff8cce7036f13a23df9119d3f47461ce Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Sat, 8 Sep 2012 17:47:57 -0300 Subject: mm, slab: Rename __cache_alloc() -> slab_alloc() This patch does not fix anything and its only goal is to produce common code between SLAB and SLUB. Cc: Christoph Lameter Signed-off-by: Ezequiel Garcia Signed-off-by: Pekka Enberg --- mm/slab.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 4c54a235793..d011030e961 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3561,7 +3561,7 @@ done: * Fallback to other node is possible if __GFP_THISNODE is not set. */ static __always_inline void * -__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, +slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, unsigned long caller) { unsigned long save_flags; @@ -3648,7 +3648,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) #endif /* CONFIG_NUMA */ static __always_inline void * -__cache_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) +slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) { unsigned long save_flags; void *objp; @@ -3824,7 +3824,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { - void *ret = __cache_alloc(cachep, flags, _RET_IP_); + void *ret = slab_alloc(cachep, flags, _RET_IP_); trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags); @@ -3839,7 +3839,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) { void *ret; - ret = __cache_alloc(cachep, flags, _RET_IP_); + ret = slab_alloc(cachep, flags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, cachep->size, flags); @@ -3851,7 +3851,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - void *ret = __cache_alloc_node(cachep, flags, nodeid, _RET_IP_); + void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep->object_size, cachep->size, @@ -3869,7 +3869,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, { void *ret; - ret = __cache_alloc_node(cachep, flags, nodeid, _RET_IP); + ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP); trace_kmalloc_node(_RET_IP_, ret, size, cachep->size, @@ -3932,7 +3932,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, cachep = __find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - ret = __cache_alloc(cachep, flags, caller); + ret = slab_alloc(cachep, flags, caller); trace_kmalloc(caller, ret, size, cachep->size, flags); -- cgit v1.2.3 From 2b847c3cb4f8565911bd8ce59b8428e635c90594 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Sat, 8 Sep 2012 17:47:58 -0300 Subject: mm, slub: Rename slab_alloc() -> slab_alloc_node() to match SLAB This patch does not fix anything, and its only goal is to enable us to obtain some common code between SLAB and SLUB. Neither behavior nor produced code is affected. Cc: Christoph Lameter Signed-off-by: Ezequiel Garcia Signed-off-by: Pekka Enberg --- mm/slub.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index a6d043e1326..f074f756405 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2313,7 +2313,7 @@ new_slab: * * Otherwise we can simply pick the next object from the lockless free list. */ -static __always_inline void *slab_alloc(struct kmem_cache *s, +static __always_inline void *slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr) { void **object; @@ -2383,9 +2383,15 @@ redo: return object; } +static __always_inline void *slab_alloc(struct kmem_cache *s, + gfp_t gfpflags, unsigned long addr) +{ + return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr); +} + void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, _RET_IP_); trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); @@ -2396,7 +2402,7 @@ EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { - void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); return ret; } @@ -2414,7 +2420,7 @@ EXPORT_SYMBOL(kmalloc_order_trace); #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); trace_kmem_cache_alloc_node(_RET_IP_, ret, s->object_size, s->size, gfpflags, node); @@ -2428,7 +2434,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) { - void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); @@ -3366,7 +3372,7 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_); + ret = slab_alloc(s, flags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, flags); @@ -3409,7 +3415,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, flags, node, _RET_IP_); + ret = slab_alloc_node(s, flags, node, _RET_IP_); trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); @@ -4037,7 +4043,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); + ret = slab_alloc(s, gfpflags, caller); /* Honor the call site pointer we received. */ trace_kmalloc(caller, ret, size, s->size, gfpflags); @@ -4067,7 +4073,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, gfpflags, node, caller); + ret = slab_alloc_node(s, gfpflags, node, caller); /* Honor the call site pointer we received. */ trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); -- cgit v1.2.3 From 592f41450d03f74747edd81779ae206a0647152f Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Tue, 25 Sep 2012 08:07:08 -0300 Subject: mm/slab: Fix typo _RET_IP -> _RET_IP_ The bug was introduced by commit 7c0cb9c64f83 ("mm, slab: Replace 'caller' type, void* -> unsigned long"). Reported-by: Fengguang Wu Signed-off-by: Ezequiel Garcia Signed-off-by: Pekka Enberg --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index d011030e961..ca3849fe058 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3869,7 +3869,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, { void *ret; - ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP); + ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); trace_kmalloc_node(_RET_IP_, ret, size, cachep->size, -- cgit v1.2.3 From 1e5965bf1f018cc30a4659fa3f1a40146e4276f6 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Tue, 25 Sep 2012 08:07:09 -0300 Subject: mm/slab: Fix kmem_cache_alloc_node_trace() declaration The bug was introduced in commit 4052147c0afa ("mm, slab: Match SLAB and SLUB kmem_cache_alloc_xxx_trace() prototype"). Reported-by: Fengguang Wu Signed-off-by: Ezequiel Garcia Signed-off-by: Pekka Enberg --- mm/slab.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index ca3849fe058..3409eada242 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3862,10 +3862,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) EXPORT_SYMBOL(kmem_cache_alloc_node); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, +void *kmem_cache_alloc_node_trace(size_t size, + struct kmem_cache *cachep, gfp_t flags, - int nodeid, - size_t size) + int nodeid) { void *ret; @@ -3887,7 +3887,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) cachep = kmem_find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return kmem_cache_alloc_node_trace(cachep, flags, node, size); + return kmem_cache_alloc_node_trace(size, cachep, flags, node); } #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) -- cgit v1.2.3 From 82bd5508b4080e851ac1a9b62bed6d727b1b4a84 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 25 Sep 2012 12:53:51 -0700 Subject: mm, slob: fix build breakage in __kmalloc_node_track_caller On Sat, 8 Sep 2012, Ezequiel Garcia wrote: > @@ -454,15 +455,35 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) > gfp |= __GFP_COMP; > ret = slob_new_pages(gfp, order, node); > > - trace_kmalloc_node(_RET_IP_, ret, > + trace_kmalloc_node(caller, ret, > size, PAGE_SIZE << order, gfp, node); > } > > kmemleak_alloc(ret, size, 1, gfp); > return ret; > } > + > +void *__kmalloc_node(size_t size, gfp_t gfp, int node) > +{ > + return __do_kmalloc_node(size, gfp, node, _RET_IP_); > +} > EXPORT_SYMBOL(__kmalloc_node); > > +#ifdef CONFIG_TRACING > +void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) > +{ > + return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); > +} > + > +#ifdef CONFIG_NUMA > +void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, > + int node, unsigned long caller) > +{ > + return __do_kmalloc_node(size, gfp, node, caller); > +} > +#endif This breaks Pekka's slab/next tree with this: mm/slob.c: In function '__kmalloc_node_track_caller': mm/slob.c:488: error: 'gfp' undeclared (first use in this function) mm/slob.c:488: error: (Each undeclared identifier is reported only once mm/slob.c:488: error: for each function it appears in.) mm, slob: fix build breakage in __kmalloc_node_track_caller "mm, slob: Add support for kmalloc_track_caller()" breaks the build because gfp is undeclared. Fix it. Acked-by: Ezequiel Garcia Signed-off-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slob.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index dd47d16d57b..8c00d22afc9 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -482,7 +482,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) } #ifdef CONFIG_NUMA -void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, +void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, int node, unsigned long caller) { return __do_kmalloc_node(size, gfp, node, caller); -- cgit v1.2.3 From 611443783ada3464c44e9168f6b7de2cb3fed39a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 26 Aug 2012 20:27:09 -0400 Subject: switch fadvise(2) to fget_light() Signed-off-by: Al Viro --- mm/fadvise.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/fadvise.c b/mm/fadvise.c index 9b75a045dbf..a83245763cf 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -26,7 +26,8 @@ */ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) { - struct file *file = fget(fd); + int fput_needed; + struct file *file = fget_light(fd, &fput_needed); struct address_space *mapping; struct backing_dev_info *bdi; loff_t endbyte; /* inclusive */ @@ -128,7 +129,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) ret = -EINVAL; } out: - fput(file); + fput_light(file, fput_needed); return ret; } #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -- cgit v1.2.3 From 132ea2479f77dd598f11a77bdfebbd23d244ce6f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 26 Aug 2012 20:30:57 -0400 Subject: switch readahead(2) to fget_light() Signed-off-by: Al Viro --- mm/readahead.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index ea8f8fa2164..1011111e2bf 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -580,9 +580,10 @@ SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) { ssize_t ret; struct file *file; + int fput_needed; ret = -EBADF; - file = fget(fd); + file = fget_light(fd, &fput_needed); if (file) { if (file->f_mode & FMODE_READ) { struct address_space *mapping = file->f_mapping; @@ -591,7 +592,7 @@ SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) unsigned long len = end - start + 1; ret = do_readahead(mapping, file, start, len); } - fput(file); + fput_light(file, fput_needed); } return ret; } -- cgit v1.2.3 From cb0942b81249798e15c3f04eee2946ef543e8115 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Aug 2012 14:48:26 -0400 Subject: make get_file() return its argument simplifies a bunch of callers... Signed-off-by: Al Viro --- mm/fremap.c | 3 +-- mm/mmap.c | 3 +-- mm/nommu.c | 6 ++---- 3 files changed, 4 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index 9ed4fd43246..048659c0c03 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -195,10 +195,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, */ if (mapping_cap_account_dirty(mapping)) { unsigned long addr; - struct file *file = vma->vm_file; + struct file *file = get_file(vma->vm_file); flags &= MAP_NONBLOCK; - get_file(file); addr = mmap_region(file, start, size, flags, vma->vm_flags, pgoff); fput(file); diff --git a/mm/mmap.c b/mm/mmap.c index ae18a48e7e4..872441e8191 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1301,8 +1301,7 @@ munmap_back: goto free_vma; correct_wcount = 1; } - vma->vm_file = file; - get_file(file); + vma->vm_file = get_file(file); error = file->f_op->mmap(file, vma); if (error) goto unmap_and_free_vma; diff --git a/mm/nommu.c b/mm/nommu.c index d4b0c10872d..dee2ff89fd5 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1282,10 +1282,8 @@ unsigned long do_mmap_pgoff(struct file *file, vma->vm_pgoff = pgoff; if (file) { - region->vm_file = file; - get_file(file); - vma->vm_file = file; - get_file(file); + region->vm_file = get_file(file); + vma->vm_file = get_file(file); if (vm_flags & VM_EXECUTABLE) { added_exe_file_vma(current->mm); vma->vm_mm = current->mm; -- cgit v1.2.3 From 2903ff019b346ab8d36ebbf54853c3aaf6590608 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 28 Aug 2012 12:52:22 -0400 Subject: switch simple cases of fget_light to fdget Signed-off-by: Al Viro --- mm/fadvise.c | 35 +++++++++++++++++------------------ mm/readahead.c | 15 +++++++-------- 2 files changed, 24 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/fadvise.c b/mm/fadvise.c index a83245763cf..a47f0f50c89 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -26,8 +26,7 @@ */ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) { - int fput_needed; - struct file *file = fget_light(fd, &fput_needed); + struct fd f = fdget(fd); struct address_space *mapping; struct backing_dev_info *bdi; loff_t endbyte; /* inclusive */ @@ -36,15 +35,15 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) unsigned long nrpages; int ret = 0; - if (!file) + if (!f.file) return -EBADF; - if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) { + if (S_ISFIFO(f.file->f_path.dentry->d_inode->i_mode)) { ret = -ESPIPE; goto out; } - mapping = file->f_mapping; + mapping = f.file->f_mapping; if (!mapping || len < 0) { ret = -EINVAL; goto out; @@ -77,21 +76,21 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) switch (advice) { case POSIX_FADV_NORMAL: - file->f_ra.ra_pages = bdi->ra_pages; - spin_lock(&file->f_lock); - file->f_mode &= ~FMODE_RANDOM; - spin_unlock(&file->f_lock); + f.file->f_ra.ra_pages = bdi->ra_pages; + spin_lock(&f.file->f_lock); + f.file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&f.file->f_lock); break; case POSIX_FADV_RANDOM: - spin_lock(&file->f_lock); - file->f_mode |= FMODE_RANDOM; - spin_unlock(&file->f_lock); + spin_lock(&f.file->f_lock); + f.file->f_mode |= FMODE_RANDOM; + spin_unlock(&f.file->f_lock); break; case POSIX_FADV_SEQUENTIAL: - file->f_ra.ra_pages = bdi->ra_pages * 2; - spin_lock(&file->f_lock); - file->f_mode &= ~FMODE_RANDOM; - spin_unlock(&file->f_lock); + f.file->f_ra.ra_pages = bdi->ra_pages * 2; + spin_lock(&f.file->f_lock); + f.file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&f.file->f_lock); break; case POSIX_FADV_WILLNEED: /* First and last PARTIAL page! */ @@ -107,7 +106,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) * Ignore return value because fadvise() shall return * success even if filesystem can't retrieve a hint, */ - force_page_cache_readahead(mapping, file, start_index, + force_page_cache_readahead(mapping, f.file, start_index, nrpages); break; case POSIX_FADV_NOREUSE: @@ -129,7 +128,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) ret = -EINVAL; } out: - fput_light(file, fput_needed); + fdput(f); return ret; } #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS diff --git a/mm/readahead.c b/mm/readahead.c index 1011111e2bf..7963f239123 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -579,20 +579,19 @@ do_readahead(struct address_space *mapping, struct file *filp, SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) { ssize_t ret; - struct file *file; - int fput_needed; + struct fd f; ret = -EBADF; - file = fget_light(fd, &fput_needed); - if (file) { - if (file->f_mode & FMODE_READ) { - struct address_space *mapping = file->f_mapping; + f = fdget(fd); + if (f.file) { + if (f.file->f_mode & FMODE_READ) { + struct address_space *mapping = f.file->f_mapping; pgoff_t start = offset >> PAGE_CACHE_SHIFT; pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; unsigned long len = end - start + 1; - ret = do_readahead(mapping, file, start, len); + ret = do_readahead(mapping, f.file, start, len); } - fput_light(file, fput_needed); + fdput(f); } return ret; } -- cgit v1.2.3 From 2f60d628ffd042e65e0b1d3431fb3e38d6f7c1be Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Fri, 28 Sep 2012 20:27:49 +0800 Subject: CPU hotplug, writeback: Don't call writeback_set_ratelimit() too often during hotplug The CPU hotplug callback related to writeback calls writeback_set_ratelimit() during every state change in the hotplug sequence. This is unnecessary since num_online_cpus() changes only once during the entire hotplug operation. So invoke the function only once per hotplug, thereby avoiding the unnecessary repetition of those costly calculations. Signed-off-by: Srivatsa S. Bhat --- mm/page-writeback.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5ad5ce23c1e..830893b2b3c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1602,10 +1602,18 @@ void writeback_set_ratelimit(void) } static int __cpuinit -ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) +ratelimit_handler(struct notifier_block *self, unsigned long action, + void *hcpu) { - writeback_set_ratelimit(); - return NOTIFY_DONE; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + case CPU_DEAD: + writeback_set_ratelimit(); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } } static struct notifier_block __cpuinitdata ratelimit_nb = { -- cgit v1.2.3 From c0b24b5100cb96cba71666953a8619a616684967 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sat, 29 Sep 2012 10:00:59 +0300 Subject: Revert "mm/slab: Fix kmem_cache_alloc_node_trace() declaration" This reverts commit 1e5965bf1f018cc30a4659fa3f1a40146e4276f6. Ezequiel Garcia has a better fix. --- mm/slab.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 3409eada242..ca3849fe058 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3862,10 +3862,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) EXPORT_SYMBOL(kmem_cache_alloc_node); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_trace(size_t size, - struct kmem_cache *cachep, +void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, gfp_t flags, - int nodeid) + int nodeid, + size_t size) { void *ret; @@ -3887,7 +3887,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) cachep = kmem_find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return kmem_cache_alloc_node_trace(size, cachep, flags, node); + return kmem_cache_alloc_node_trace(cachep, flags, node, size); } #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) -- cgit v1.2.3 From 788e1aadadd0d5a9cbffce10c34840b4072bc733 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Fri, 28 Sep 2012 16:34:05 +0800 Subject: slub: init_kmem_cache_cpus() and put_cpu_partial() can be static Acked-by: Glauber Costa Acked-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Fengguang Wu Signed-off-by: Pekka Enberg --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index f074f756405..944b4edaeb1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1716,7 +1716,7 @@ static inline void note_cmpxchg_failure(const char *n, stat(s, CMPXCHG_DOUBLE_CPU_FAIL); } -void init_kmem_cache_cpus(struct kmem_cache *s) +static void init_kmem_cache_cpus(struct kmem_cache *s) { int cpu; @@ -1941,7 +1941,7 @@ static void unfreeze_partials(struct kmem_cache *s) * If we did not find a slot then simply move all the partials to the * per node partial list. */ -int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) +static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) { struct page *oldpage; int pages; -- cgit v1.2.3 From 608da7e3fc7259eca0d983b31bc8915af14cf15e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 30 Sep 2012 17:28:25 +0900 Subject: slab: Fix build failure in __kmem_cache_create() Fix build failure with CONFIG_DEBUG_SLAB=y && CONFIG_DEBUG_PAGEALLOC=y caused by commit 8a13a4cc "mm/sl[aou]b: Shrink __kmem_cache_create() parameter lists". mm/slab.c: In function '__kmem_cache_create': mm/slab.c:2474: error: 'align' undeclared (first use in this function) mm/slab.c:2474: error: (Each undeclared identifier is reported only once mm/slab.c:2474: error: for each function it appears in.) make[1]: *** [mm/slab.o] Error 1 make: *** [mm] Error 2 Acked-by: Christoph Lameter Signed-off-by: Tetsuo Handa Signed-off-by: Pekka Enberg --- mm/slab.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index d011030e961..c4f6e14c2c7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2492,8 +2492,9 @@ __kmem_cache_create (const char *name, size_t size, size_t align, } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) if (size >= malloc_sizes[INDEX_L3 + 1].cs_size - && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { - cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); + && cachep->object_size > cache_line_size() + && ALIGN(size, cachep->align) < PAGE_SIZE) { + cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); size = PAGE_SIZE; } #endif -- cgit v1.2.3 From 17f3609c21706b377ad80b5251558ed700c2af17 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 4 Oct 2012 17:12:07 -0700 Subject: sections: fix section conflicts in mm/percpu.c Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index bb4be7435ce..ddc5efb9c5b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1370,7 +1370,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, #ifdef CONFIG_SMP -const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { +const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = { [PCPU_FC_AUTO] = "auto", [PCPU_FC_EMBED] = "embed", [PCPU_FC_PAGE] = "page", -- cgit v1.2.3 From c654345924f7cce87bb221b89db91cba890421ba Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 8 Oct 2012 16:28:21 -0700 Subject: mm: remove __GFP_NO_KSWAPD When transparent huge pages were introduced, memory compaction and swap storms were an issue, and the kernel had to be careful to not make THP allocations cause pageout or compaction. Now that we have working compaction deferral, kswapd is smart enough to invoke compaction and the quadratic behaviour around isolate_free_pages has been fixed, it should be safe to remove __GFP_NO_KSWAPD. [minchan@kernel.org: Comment fix] [mgorman@suse.de: Avoid direct reclaim for deferred compaction] Cc: Andrea Arcangeli Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c13ea753889..5e92698e539 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2362,9 +2362,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - if (!(gfp_mask & __GFP_NO_KSWAPD)) - wake_all_kswapd(order, zonelist, high_zoneidx, - zone_idx(preferred_zone)); + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're below the kswapd watermark and have kicked background @@ -2441,7 +2440,7 @@ rebalance: * system then fail the allocation instead of entering direct reclaim. */ if ((deferred_compaction || contended_compaction) && - (gfp_mask & __GFP_NO_KSWAPD)) + (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) goto nopage; /* Try direct reclaim and then allocating */ -- cgit v1.2.3 From 5180da410db6369d1f95c9014da1c9bc33fb043e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 8 Oct 2012 16:28:29 -0700 Subject: x86, pat: separate the pfn attribute tracking for remap_pfn_range and vm_insert_pfn With PAT enabled, vm_insert_pfn() looks up the existing pfn memory attribute and uses it. Expectation is that the driver reserves the memory attributes for the pfn before calling vm_insert_pfn(). remap_pfn_range() (when called for the whole vma) will setup a new attribute (based on the prot argument) for the specified pfn range. This addresses the legacy usage which typically calls remap_pfn_range() with a desired memory attribute. For ranges smaller than the vma size (which is typically not the case), remap_pfn_range() will use the existing memory attribute for the pfn range. Expose two different API's for these different behaviors. track_pfn_insert() for tracking the pfn attribute set by vm_insert_pfn() and track_pfn_remap() for the remap_pfn_range(). This cleanup also prepares the ground for the track/untrack pfn vma routines to take over the ownership of setting PAT specific vm_flag in the 'vma'. [khlebnikov@openvz.org: Clear checks in track_pfn_remap()] [akpm@linux-foundation.org: tweak a few comments] Signed-off-by: Suresh Siddha Signed-off-by: Konstantin Khlebnikov Cc: Venkatesh Pallipadi Cc: H. Peter Anvin Cc: Nick Piggin Cc: Ingo Molnar Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Hugh Dickins Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Konstantin Khlebnikov Cc: Matt Helsley Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 57361708d1a..6bef278ad30 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1060,7 +1060,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * We do not free on error cases below as remove_vma * gets called on error from higher level routine */ - ret = track_pfn_vma_copy(vma); + ret = track_pfn_copy(vma); if (ret) return ret; } @@ -1328,7 +1328,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, uprobe_munmap(vma, start, end); if (unlikely(is_pfn_mapping(vma))) - untrack_pfn_vma(vma, 0, 0); + untrack_pfn(vma, 0, 0); if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { @@ -2162,14 +2162,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) + if (track_pfn_insert(vma, &pgprot, pfn)) return -EINVAL; ret = insert_pfn(vma, addr, pfn, pgprot); - if (ret) - untrack_pfn_vma(vma, pfn, PAGE_SIZE); - return ret; } EXPORT_SYMBOL(vm_insert_pfn); @@ -2311,7 +2308,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; - err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); + err = track_pfn_remap(vma, &prot, pfn, PAGE_ALIGN(size)); if (err) { /* * To indicate that track_pfn related cleanup is not @@ -2335,7 +2332,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } while (pgd++, addr = next, addr != end); if (err) - untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); + untrack_pfn(vma, pfn, PAGE_ALIGN(size)); return err; } -- cgit v1.2.3 From b3b9c2932c32e0692018ed5f12f3fd8c70eea8ce Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:34 -0700 Subject: mm, x86, pat: rework linear pfn-mmap tracking Replace the generic vma-flag VM_PFN_AT_MMAP with x86-only VM_PAT. We can toss mapping address from remap_pfn_range() into track_pfn_vma_new(), and collect all PAT-related logic together in arch/x86/. This patch also restores orignal frustration-free is_cow_mapping() check in remap_pfn_range(), as it was before commit v2.6.28-rc8-88-g3c8bb73 ("x86: PAT: store vm_pgoff for all linear_over_vma_region mappings - v3") is_linear_pfn_mapping() checks can be removed from mm/huge_memory.c, because it already handled by VM_PFNMAP in VM_NO_THP bit-mask. [suresh.b.siddha@intel.com: Reset the VM_PAT flag as part of untrack_pfn_vma()] Signed-off-by: Konstantin Khlebnikov Signed-off-by: Suresh Siddha Cc: Venkatesh Pallipadi Cc: H. Peter Anvin Cc: Nick Piggin Cc: Ingo Molnar Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Hugh Dickins Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 19 +++---------------- mm/memory.c | 26 ++++++++++---------------- 2 files changed, 13 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 141dbb69509..73cb22ee966 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1655,11 +1655,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) if (vma->vm_ops) /* khugepaged not yet working on file or special mappings */ return 0; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() must be - * true too, verify it here. - */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); + VM_BUG_ON(vma->vm_flags & VM_NO_THP); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) @@ -1912,11 +1908,7 @@ static void collapse_huge_page(struct mm_struct *mm, goto out; if (is_vma_temporary_stack(vma)) goto out; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() must be - * true too, verify it here. - */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); + VM_BUG_ON(vma->vm_flags & VM_NO_THP); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -2154,12 +2146,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, goto skip; if (is_vma_temporary_stack(vma)) goto skip; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() - * must be true too, verify it here. - */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || - vma->vm_flags & VM_NO_THP); + VM_BUG_ON(vma->vm_flags & VM_NO_THP); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; diff --git a/mm/memory.c b/mm/memory.c index 6bef278ad30..655e1429388 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1055,7 +1055,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); - if (unlikely(is_pfn_mapping(vma))) { + if (unlikely(vma->vm_flags & VM_PFNMAP)) { /* * We do not free on error cases below as remove_vma * gets called on error from higher level routine @@ -1327,7 +1327,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) uprobe_munmap(vma, start, end); - if (unlikely(is_pfn_mapping(vma))) + if (unlikely(vma->vm_flags & VM_PFNMAP)) untrack_pfn(vma, 0, 0); if (start != end) { @@ -2299,26 +2299,20 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * There's a horrible special case to handle copy-on-write * behaviour that some programs depend on. We mark the "original" * un-COW'ed pages by matching them up with "vma->vm_pgoff". + * See vm_normal_page() for details. */ - if (addr == vma->vm_start && end == vma->vm_end) { + if (is_cow_mapping(vma->vm_flags)) { + if (addr != vma->vm_start || end != vma->vm_end) + return -EINVAL; vma->vm_pgoff = pfn; - vma->vm_flags |= VM_PFN_AT_MMAP; - } else if (is_cow_mapping(vma->vm_flags)) + } + + err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); + if (err) return -EINVAL; vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; - err = track_pfn_remap(vma, &prot, pfn, PAGE_ALIGN(size)); - if (err) { - /* - * To indicate that track_pfn related cleanup is not - * needed from higher level routine calling unmap_vmas - */ - vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); - vma->vm_flags &= ~VM_PFN_AT_MMAP; - return -EINVAL; - } - BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr); -- cgit v1.2.3 From cc2383ec06be093789469852e1fe96e1148e9a2c Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:37 -0700 Subject: mm: introduce arch-specific vma flag VM_ARCH_1 Combine several arch-specific vma flags into one. before patch: 0x00000200 0x01000000 0x20000000 0x40000000 x86 VM_NOHUGEPAGE VM_HUGEPAGE - VM_PAT powerpc - - VM_SAO - parisc VM_GROWSUP - - - ia64 VM_GROWSUP - - - nommu - VM_MAPPED_COPY - - others - - - - after patch: 0x00000200 0x01000000 0x20000000 0x40000000 x86 - VM_PAT VM_HUGEPAGE VM_NOHUGEPAGE powerpc - VM_SAO - - parisc - VM_GROWSUP - - ia64 - VM_GROWSUP - - nommu - VM_MAPPED_COPY - - others - VM_ARCH_1 - - And voila! One completely free bit. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- mm/ksm.c | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 73cb22ee966..47206692cf8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1491,7 +1491,7 @@ out: return ret; } -#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \ +#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP| \ VM_HUGETLB|VM_SHARED|VM_MAYSHARE) int hugepage_madvise(struct vm_area_struct *vma, diff --git a/mm/ksm.c b/mm/ksm.c index 47c88536889..d1cbe2aa6b3 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1470,9 +1470,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | - VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) + VM_NONLINEAR | VM_MIXEDMAP)) return 0; /* just ignore the advice */ +#ifdef VM_SAO + if (*vm_flags & VM_SAO) + return 0; +#endif + if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); if (err) -- cgit v1.2.3 From 4b6e1e37026ec7dae9b23d78ffcebdd5ddb1bfa1 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:40 -0700 Subject: mm: kill vma flag VM_INSERTPAGE Merge VM_INSERTPAGE into VM_MIXEDMAP. VM_MIXEDMAP VMA can mix pure-pfn ptes, special ptes and normal ptes. Now copy_page_range() always copies VM_MIXEDMAP VMA on fork like VM_PFNMAP. If driver populates whole VMA at mmap() it probably not expects page-faults. This patch removes special check from vma_wants_writenotify() which disables pages write tracking for VMA populated via vm_instert_page(). BDI below mapped file should not use dirty-accounting, moreover do_wp_page() can handle this. vm_insert_page() still marks vma after first usage. Usually it is called from f_op->mmap() handler under mm->mmap_sem write-lock, so it able to change vma->vm_flags. Caller must set VM_MIXEDMAP at mmap time if it wants to call this function from other places, for example from page-fault handler. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 3 +-- mm/ksm.c | 2 +- mm/memory.c | 14 ++++++++++++-- mm/mmap.c | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 47206692cf8..9b72d127051 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1491,8 +1491,7 @@ out: return ret; } -#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP| \ - VM_HUGETLB|VM_SHARED|VM_MAYSHARE) +#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) diff --git a/mm/ksm.c b/mm/ksm.c index d1cbe2aa6b3..f9ccb16559e 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1469,7 +1469,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, */ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_RESERVED | VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) return 0; /* just ignore the advice */ diff --git a/mm/memory.c b/mm/memory.c index 655e1429388..7b1e4feaec0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1047,7 +1047,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { + if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | + VM_PFNMAP | VM_MIXEDMAP))) { if (!vma->anon_vma) return 0; } @@ -2085,6 +2086,11 @@ out: * ask for a shared writable mapping! * * The page does not need to be reserved. + * + * Usually this function is called from f_op->mmap() handler + * under mm->mmap_sem write-lock, so it can change vma->vm_flags. + * Caller must set VM_MIXEDMAP on vma if it wants to call this + * function from other places, for example from page-fault handler. */ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) @@ -2093,7 +2099,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, return -EFAULT; if (!page_count(page)) return -EINVAL; - vma->vm_flags |= VM_INSERTPAGE; + if (!(vma->vm_flags & VM_MIXEDMAP)) { + BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); + BUG_ON(vma->vm_flags & VM_PFNMAP); + vma->vm_flags |= VM_MIXEDMAP; + } return insert_page(vma, addr, page, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_page); diff --git a/mm/mmap.c b/mm/mmap.c index 872441e8191..b0989f4d4f0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1190,7 +1190,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma) return 0; /* Specialty mapping? */ - if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) + if (vm_flags & VM_PFNMAP) return 0; /* Can the mapping track the dirty pages? */ -- cgit v1.2.3 From 0b173bc4daa8f8ec03a85abf5e47b23502ff80af Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:46 -0700 Subject: mm: kill vma flag VM_CAN_NONLINEAR Move actual pte filling for non-linear file mappings into the new special vma operation: ->remap_pages(). Filesystems must implement this method to get non-linear mapping support, if it uses filemap_fault() then generic_file_remap_pages() can be used. Now device drivers can implement this method and obtain nonlinear vma support. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf #arch/tile Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- mm/filemap_xip.c | 3 ++- mm/fremap.c | 14 ++++++++------ mm/mmap.c | 3 +-- mm/nommu.c | 8 ++++++++ mm/shmem.c | 3 +-- 6 files changed, 21 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 384344575c3..a9827b42556 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1737,6 +1737,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite); const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = filemap_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; /* This is used for a general mmap of a disk file */ @@ -1749,7 +1750,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 13e013b1270..91750227a19 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -305,6 +305,7 @@ out: static const struct vm_operations_struct xip_file_vm_ops = { .fault = xip_file_fault, .page_mkwrite = filemap_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int xip_file_mmap(struct file * file, struct vm_area_struct * vma) @@ -313,7 +314,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma) file_accessed(file); vma->vm_ops = &xip_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP; + vma->vm_flags |= VM_MIXEDMAP; return 0; } EXPORT_SYMBOL_GPL(xip_file_mmap); diff --git a/mm/fremap.c b/mm/fremap.c index 048659c0c03..3d731a49878 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -5,6 +5,7 @@ * * started by Ingo Molnar, Copyright (C) 2002, 2003 */ +#include #include #include #include @@ -80,9 +81,10 @@ out: return err; } -static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, unsigned long size, pgoff_t pgoff) +int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, + unsigned long size, pgoff_t pgoff) { + struct mm_struct *mm = vma->vm_mm; int err; do { @@ -95,9 +97,9 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, pgoff++; } while (size); - return 0; - + return 0; } +EXPORT_SYMBOL(generic_file_remap_pages); /** * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma @@ -167,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) goto out; - if (!(vma->vm_flags & VM_CAN_NONLINEAR)) + if (!vma->vm_ops->remap_pages) goto out; if (start < vma->vm_start || start + size > vma->vm_end) @@ -228,7 +230,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, } mmu_notifier_invalidate_range_start(mm, start, start + size); - err = populate_range(mm, vma, start, size, pgoff); + err = vma->vm_ops->remap_pages(vma, start, size, pgoff); mmu_notifier_invalidate_range_end(mm, start, start + size); if (!err && !(flags & MAP_NONBLOCK)) { if (vma->vm_flags & VM_LOCKED) { diff --git a/mm/mmap.c b/mm/mmap.c index b0989f4d4f0..d0686d35511 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -669,8 +669,7 @@ again: remove_next = 1 + (end > next->vm_end); static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { - /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ - if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR) + if (vma->vm_flags ^ vm_flags) return 0; if (vma->vm_file != file) return 0; diff --git a/mm/nommu.c b/mm/nommu.c index dee2ff89fd5..98318dcff74 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1961,6 +1961,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); +int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, + unsigned long size, pgoff_t pgoff) +{ + BUG(); + return 0; +} +EXPORT_SYMBOL(generic_file_remap_pages); + static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, int write) { diff --git a/mm/shmem.c b/mm/shmem.c index d3752110c8c..cc12072f878 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1339,7 +1339,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &shmem_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } @@ -2643,6 +2642,7 @@ static const struct vm_operations_struct shmem_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif + .remap_pages = generic_file_remap_pages, }; static struct dentry *shmem_mount(struct file_system_type *fs_type, @@ -2836,7 +2836,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } -- cgit v1.2.3 From e9714acf8c439688884234dcac2bfc38bb607d38 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:54 -0700 Subject: mm: kill vma flag VM_EXECUTABLE and mm->num_exe_file_vmas Currently the kernel sets mm->exe_file during sys_execve() and then tracks number of vmas with VM_EXECUTABLE flag in mm->num_exe_file_vmas, as soon as this counter drops to zero kernel resets mm->exe_file to NULL. Plus it resets mm->exe_file at last mmput() when mm->mm_users drops to zero. VMA with VM_EXECUTABLE flag appears after mapping file with flag MAP_EXECUTABLE, such vmas can appears only at sys_execve() or after vma splitting, because sys_mmap ignores this flag. Usually binfmt module sets mm->exe_file and mmaps executable vmas with this file, they hold mm->exe_file while task is running. comment from v2.6.25-6245-g925d1c4 ("procfs task exe symlink"), where all this stuff was introduced: > The kernel implements readlink of /proc/pid/exe by getting the file from > the first executable VMA. Then the path to the file is reconstructed and > reported as the result. > > Because of the VMA walk the code is slightly different on nommu systems. > This patch avoids separate /proc/pid/exe code on nommu systems. Instead of > walking the VMAs to find the first executable file-backed VMA we store a > reference to the exec'd file in the mm_struct. > > That reference would prevent the filesystem holding the executable file > from being unmounted even after unmapping the VMAs. So we track the number > of VM_EXECUTABLE VMAs and drop the new reference when the last one is > unmapped. This avoids pinning the mounted filesystem. exe_file's vma accounting is hooked into every file mmap/unmmap and vma split/merge just to fix some hypothetical pinning fs from umounting by mm, which already unmapped all its executable files, but still alive. Seems like currently nobody depends on this behaviour. We can try to remove this logic and keep mm->exe_file until final mmput(). mm->exe_file is still protected with mm->mmap_sem, because we want to change it via new sys_prctl(PR_SET_MM_EXE_FILE). Also via this syscall task can change its mm->exe_file and unpin mountpoint explicitly. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 25 ++++--------------------- mm/nommu.c | 11 +---------- 2 files changed, 5 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index d0686d35511..c1ad2e78ea5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -231,11 +231,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); - if (vma->vm_file) { + if (vma->vm_file) fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(vma->vm_mm); - } mpol_put(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); return next; @@ -636,8 +633,6 @@ again: remove_next = 1 + (end > next->vm_end); if (file) { uprobe_munmap(next, next->vm_start, next->vm_end); fput(file); - if (next->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); } if (next->anon_vma) anon_vma_merge(vma, next); @@ -1304,8 +1299,6 @@ munmap_back: error = file->f_op->mmap(file, vma); if (error) goto unmap_and_free_vma; - if (vm_flags & VM_EXECUTABLE) - added_exe_file_vma(mm); /* Can addr have changed?? * @@ -1987,11 +1980,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, if (anon_vma_clone(new, vma)) goto out_free_mpol; - if (new->vm_file) { + if (new->vm_file) get_file(new->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - added_exe_file_vma(mm); - } if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); @@ -2009,11 +1999,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, /* Clean everything up if vma_adjust failed. */ if (new->vm_ops && new->vm_ops->close) new->vm_ops->close(new); - if (new->vm_file) { - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); + if (new->vm_file) fput(new->vm_file); - } unlink_anon_vmas(new); out_free_mpol: mpol_put(pol); @@ -2408,12 +2395,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma->vm_start = addr; new_vma->vm_end = addr + len; new_vma->vm_pgoff = pgoff; - if (new_vma->vm_file) { + if (new_vma->vm_file) get_file(new_vma->vm_file); - - if (vma->vm_flags & VM_EXECUTABLE) - added_exe_file_vma(mm); - } if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); vma_link(mm, new_vma, prev, rb_link, rb_parent); diff --git a/mm/nommu.c b/mm/nommu.c index 98318dcff74..9c4a7b63a4d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -789,11 +789,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) kenter("%p", vma); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); - if (vma->vm_file) { + if (vma->vm_file) fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); - } put_nommu_region(vma->vm_region); kmem_cache_free(vm_area_cachep, vma); } @@ -1284,10 +1281,6 @@ unsigned long do_mmap_pgoff(struct file *file, if (file) { region->vm_file = get_file(file); vma->vm_file = get_file(file); - if (vm_flags & VM_EXECUTABLE) { - added_exe_file_vma(current->mm); - vma->vm_mm = current->mm; - } } down_write(&nommu_region_sem); @@ -1440,8 +1433,6 @@ error: kmem_cache_free(vm_region_jar, region); if (vma->vm_file) fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(vma->vm_mm); kmem_cache_free(vm_area_cachep, vma); kleave(" = %d", ret); return ret; -- cgit v1.2.3 From 0103bd16fb90bc741c7a03fd1ea4e8a505abad23 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:59 -0700 Subject: mm: prepare VM_DONTDUMP for using in drivers Rename VM_NODUMP into VM_DONTDUMP: this name matches other negative flags: VM_DONTEXPAND, VM_DONTCOPY. Currently this flag used only for sys_madvise. The next patch will use it for replacing the outdated flag VM_RESERVED. Also forbid madvise(MADV_DODUMP) for special kernel mappings VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index 14d260fa0d1..03dfa5c7adb 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -69,10 +69,14 @@ static long madvise_behavior(struct vm_area_struct * vma, new_flags &= ~VM_DONTCOPY; break; case MADV_DONTDUMP: - new_flags |= VM_NODUMP; + new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: - new_flags &= ~VM_NODUMP; + if (new_flags & VM_SPECIAL) { + error = -EINVAL; + goto out; + } + new_flags &= ~VM_DONTDUMP; break; case MADV_MERGEABLE: case MADV_UNMERGEABLE: -- cgit v1.2.3 From 314e51b9851b4f4e8ab302243ff5a6fc6147f379 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:29:02 -0700 Subject: mm: kill vma flag VM_RESERVED and mm->reserved_vm counter A long time ago, in v2.4, VM_RESERVED kept swapout process off VMA, currently it lost original meaning but still has some effects: | effect | alternative flags -+------------------------+--------------------------------------------- 1| account as reserved_vm | VM_IO 2| skip in core dump | VM_IO, VM_DONTDUMP 3| do not merge or expand | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP 4| do not mlock | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP This patch removes reserved_vm counter from mm_struct. Seems like nobody cares about it, it does not exported into userspace directly, it only reduces total_vm showed in proc. Thus VM_RESERVED can be replaced with VM_IO or pair VM_DONTEXPAND | VM_DONTDUMP. remap_pfn_range() and io_remap_pfn_range() set VM_IO|VM_DONTEXPAND|VM_DONTDUMP. remap_vmalloc_range() set VM_DONTEXPAND | VM_DONTDUMP. [akpm@linux-foundation.org: drivers/vfio/pci/vfio_pci.c fixup] Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 3 +-- mm/memory.c | 11 +++++------ mm/mlock.c | 2 +- mm/mmap.c | 2 -- mm/nommu.c | 2 +- mm/vmalloc.c | 3 +-- 6 files changed, 9 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index f9ccb16559e..9638620a753 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1469,8 +1469,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, */ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | - VM_NONLINEAR | VM_MIXEDMAP)) + VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) return 0; /* just ignore the advice */ #ifdef VM_SAO diff --git a/mm/memory.c b/mm/memory.c index 7b1e4feaec0..e09c0481318 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2297,14 +2297,13 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * rest of the world about it: * VM_IO tells people not to look at these pages * (accesses can have side effects). - * VM_RESERVED is specified all over the place, because - * in 2.4 it kept swapout's vma scan off this vma; but - * in 2.6 the LRU scan won't even find its pages, so this - * flag means no more than count its pages in reserved_vm, - * and omit it from core dump, even when VM_IO turned off. * VM_PFNMAP tells the core MM that the base pages are just * raw PFN mappings, and do not have a "struct page" associated * with them. + * VM_DONTEXPAND + * Disable vma merging and expanding with mremap(). + * VM_DONTDUMP + * Omit vma from core dump, even when VM_IO turned off. * * There's a horrible special case to handle copy-on-write * behaviour that some programs depend on. We mark the "original" @@ -2321,7 +2320,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (err) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; diff --git a/mm/mlock.c b/mm/mlock.c index ef726e8aa8e..a948be4b7ba 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & (VM_IO | VM_PFNMAP)) goto no_mlock; - if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + if (!((vma->vm_flags & VM_DONTEXPAND) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))) { diff --git a/mm/mmap.c b/mm/mmap.c index c1ad2e78ea5..a76042dc806 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -945,8 +945,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, mm->exec_vm += pages; } else if (flags & stack_flags) mm->stack_vm += pages; - if (flags & (VM_RESERVED|VM_IO)) - mm->reserved_vm += pages; } #endif /* CONFIG_PROC_FS */ diff --git a/mm/nommu.c b/mm/nommu.c index 9c4a7b63a4d..12e84e69dd0 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1811,7 +1811,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (addr != (pfn << PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; return 0; } EXPORT_SYMBOL(remap_pfn_range); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2bb90b1d241..8de704679bf 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, usize -= PAGE_SIZE; } while (usize > 0); - /* Prevent "things" like memory migration? VM_flags need a cleanup... */ - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; return 0; } -- cgit v1.2.3 From d741c9cdeee6a569dae0dbbaf028065402955b59 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Mon, 8 Oct 2012 16:29:05 -0700 Subject: mm: fix nonuniform page status when writing new file with small buffer When writing a new file with 2048 bytes buffer, such as write(fd, buffer, 2048), it will call generic_perform_write() twice for every page: write_begin mark_page_accessed(page) write_end write_begin mark_page_accessed(page) write_end Pages 1-13 will be added to lru-pvecs in write_begin() and will *NOT* be added to active_list even they have be accessed twice because they are not PageLRU(page). But when page 14th comes, all pages in lru-pvecs will be moved to inactive_list (by __lru_cache_add() ) in first write_begin(), now page 14th *is* PageLRU(page). And after second write_end() only page 14th will be in active_list. In Hadoop environment, we do comes to this situation: after writing a file, we find out that only 14th, 28th, 42th... page are in active_list and others in inactive_list. Now kswapd works, shrinks the inactive_list, the file only have 14th, 28th...pages in memory, the readahead request size will be broken to only 52k (13*4k), system's performance falls dramatically. This problem can also replay by below steps (the machine has 8G memory): 1. dd if=/dev/zero of=/test/file.out bs=1024 count=1048576 2. cat another 7.5G file to /dev/null 3. vmtouch -m 1G -v /test/file.out, it will show: /test/file.out [oooooooooooooooooooOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO] 187847/262144 the 'o' means same pages are in memory but same are not. The solution for this problem is simple: the 14th page should be added to lru_add_pvecs before mark_page_accessed() just as other pages. [akpm@linux-foundation.org: tweak comment] [akpm@linux-foundation.org: grab better comment from the v3 patch] Signed-off-by: Robin Dong Reviewed-by: Minchan Kim Cc: KOSAKI Motohiro Reviewed-by: Johannes Weiner Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 77825883298..f76c76c7501 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -446,13 +446,22 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); +/* + * Order of operations is important: flush the pagevec when it's already + * full, not when adding the last page, to make sure that last page is + * not added to the LRU directly when passed to this function. Because + * mark_page_accessed() (called after this when writing) only activates + * pages that are on the LRU, linear writes in subpage chunks would see + * every PAGEVEC_SIZE page activated, which is unexpected. + */ void __lru_cache_add(struct page *page, enum lru_list lru) { struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; page_cache_get(page); - if (!pagevec_add(pvec, page)) + if (!pagevec_space(pvec)) __pagevec_lru_add(pvec, lru); + pagevec_add(pvec, page); put_cpu_var(lru_add_pvecs); } EXPORT_SYMBOL(__lru_cache_add); -- cgit v1.2.3 From 6597d783397aebb793fb529474cce5089aa4c67f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:29:07 -0700 Subject: mm/mmap.c: replace find_vma_prepare() with clearer find_vma_links() People get confused by find_vma_prepare(), because it doesn't care about what it returns in its output args, when its callers won't be interested. Clarify by passing in end-of-range address too, and returning failure if any existing vma overlaps the new range: instead of returning an ambiguous vma which most callers then must check. find_vma_links() is a clearer name. This does revert 2.6.27's dfe195fb79e88 ("mm: fix uninitialized variables for find_vma_prepare callers"), but it looks like gcc 4.3.0 was one of those releases too eager to shout about uninitialized variables: only copy_vma() warns with 4.5.1 and 4.7.1, which a BUG on error silences. [hughd@google.com: fix warning, remove BUG()] Signed-off-by: Hugh Dickins Cc: Benny Halevy Acked-by: Hillf Danton Signed-off-by: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index a76042dc806..2ba27434e84 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -353,17 +353,14 @@ void validate_mm(struct mm_struct *mm) #define validate_mm(mm) do { } while (0) #endif -static struct vm_area_struct * -find_vma_prepare(struct mm_struct *mm, unsigned long addr, - struct vm_area_struct **pprev, struct rb_node ***rb_link, - struct rb_node ** rb_parent) +static int find_vma_links(struct mm_struct *mm, unsigned long addr, + unsigned long end, struct vm_area_struct **pprev, + struct rb_node ***rb_link, struct rb_node **rb_parent) { - struct vm_area_struct * vma; - struct rb_node ** __rb_link, * __rb_parent, * rb_prev; + struct rb_node **__rb_link, *__rb_parent, *rb_prev; __rb_link = &mm->mm_rb.rb_node; rb_prev = __rb_parent = NULL; - vma = NULL; while (*__rb_link) { struct vm_area_struct *vma_tmp; @@ -372,9 +369,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); if (vma_tmp->vm_end > addr) { - vma = vma_tmp; - if (vma_tmp->vm_start <= addr) - break; + /* Fail if an existing vma overlaps the area */ + if (vma_tmp->vm_start < end) + return -ENOMEM; __rb_link = &__rb_parent->rb_left; } else { rb_prev = __rb_parent; @@ -387,7 +384,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); *rb_link = __rb_link; *rb_parent = __rb_parent; - return vma; + return 0; } void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, @@ -456,11 +453,12 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, */ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *__vma, *prev; + struct vm_area_struct *prev; struct rb_node **rb_link, *rb_parent; - __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); - BUG_ON(__vma && __vma->vm_start < vma->vm_end); + if (find_vma_links(mm, vma->vm_start, vma->vm_end, + &prev, &rb_link, &rb_parent)) + BUG(); __vma_link(mm, vma, prev, rb_link, rb_parent); mm->map_count++; } @@ -1221,8 +1219,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Clear old maps */ error = -ENOMEM; munmap_back: - vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); - if (vma && vma->vm_start < addr + len) { + if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { if (do_munmap(mm, addr, len)) return -ENOMEM; goto munmap_back; @@ -2183,8 +2180,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) * Clear old maps. this also does some error checking for us */ munmap_back: - vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); - if (vma && vma->vm_start < addr + len) { + if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { if (do_munmap(mm, addr, len)) return -ENOMEM; goto munmap_back; @@ -2298,10 +2294,10 @@ void exit_mmap(struct mm_struct *mm) * and into the inode's i_mmap tree. If vm_file is non-NULL * then i_mmap_mutex is taken here. */ -int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct * __vma, * prev; - struct rb_node ** rb_link, * rb_parent; + struct vm_area_struct *prev; + struct rb_node **rb_link, *rb_parent; /* * The vm_pgoff of a purely anonymous vma should be irrelevant @@ -2319,8 +2315,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) BUG_ON(vma->anon_vma); vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } - __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); - if (__vma && __vma->vm_start < vma->vm_end) + if (find_vma_links(mm, vma->vm_start, vma->vm_end, + &prev, &rb_link, &rb_parent)) return -ENOMEM; if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, vma_pages(vma))) @@ -2354,7 +2350,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, faulted_in_anon_vma = false; } - find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) + return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); if (new_vma) { -- cgit v1.2.3 From 4ffb6335da87b51c17e7ff6495170785f21558dd Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:29:09 -0700 Subject: mm: compaction: update comment in try_to_compact_pages Allocation success rates have been far lower since 3.4 due to commit fe2c2a106663 ("vmscan: reclaim at order 0 when compaction is enabled"). This commit was introduced for good reasons and it was known in advance that the success rates would suffer but it was justified on the grounds that the high allocation success rates were achieved by aggressive reclaim. Success rates are expected to suffer even more in 3.6 due to commit 7db8889ab05b ("mm: have order > 0 compaction start off where it left") which testing has shown to severely reduce allocation success rates under load - to 0% in one case. This series aims to improve the allocation success rates without regressing the benefits of commit fe2c2a106663. The series is based on latest mmotm and takes into account the __GFP_NO_KSWAPD flag is going away. Patch 1 updates a stale comment seeing as I was in the general area. Patch 2 updates reclaim/compaction to reclaim pages scaled on the number of recent failures. Patch 3 captures suitable high-order pages freed by compaction to reduce races with parallel allocation requests. Patch 4 fixes the upstream commit [7db8889a: mm: have order > 0 compaction start off where it left] to enable compaction again Patch 5 identifies when compacion is taking too long due to contention and aborts. STRESS-HIGHALLOC 3.6-rc1-akpm full-series Pass 1 36.00 ( 0.00%) 51.00 (15.00%) Pass 2 42.00 ( 0.00%) 63.00 (21.00%) while Rested 86.00 ( 0.00%) 86.00 ( 0.00%) From http://www.csn.ul.ie/~mel/postings/mmtests-20120424/global-dhp__stress-highalloc-performance-ext3/hydra/comparison.html I know that the allocation success rates in 3.3.6 was 78% in comparison to 36% in in the current akpm tree. With the full series applied, the success rates are up to around 51% with some variability in the results. This is not as high a success rate but it does not reclaim excessively which is a key point. MMTests Statistics: vmstat Page Ins 3050912 3078892 Page Outs 8033528 8039096 Swap Ins 0 0 Swap Outs 0 0 Note that swap in/out rates remain at 0. In 3.3.6 with 78% success rates there were 71881 pages swapped out. Direct pages scanned 70942 122976 Kswapd pages scanned 1366300 1520122 Kswapd pages reclaimed 1366214 1484629 Direct pages reclaimed 70936 105716 Kswapd efficiency 99% 97% Kswapd velocity 1072.550 1182.615 Direct efficiency 99% 85% Direct velocity 55.690 95.672 The kswapd velocity changes very little as expected. kswapd velocity is around the 1000 pages/sec mark where as in kernel 3.3.6 with the high allocation success rates it was 8140 pages/second. Direct velocity is higher as a result of patch 2 of the series but this is expected and is acceptable. The direct reclaim and kswapd velocities change very little. If these get accepted for merging then there is a difficulty in how they should be handled. 7db8889a ("mm: have order > 0 compaction start off where it left") is broken but it is already in 3.6-rc1 and needs to be fixed. However, if just patch 4 from this series is applied then Jim Schutt's workload is known to break again as his workload also requires patch 5. While it would be preferred to have all these patches in 3.6 to improve compaction in general, it would at least be acceptable if just patches 4 and 5 were merged to 3.6 to fix a known problem without breaking compaction completely. On the face of it, that would force __GFP_NO_KSWAPD patches to be merged at the same time but I can do a version of this series with __GFP_NO_KSWAPD change reverted and then rebase it on top of this series. That might be best overall because I note that the __GFP_NO_KSWAPD patch should have removed deferred_compaction from page_alloc.c but it didn't but fixing that causes collisions with this series. This patch: The comment about order applied when the check was order > PAGE_ALLOC_COSTLY_ORDER which has not been the case since c5a73c3d ("thp: use compaction for all allocation orders"). Fixing the comment while I'm in the general area. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 7fcd3a52e68..7168edc7592 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -869,11 +869,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zone *zone; int rc = COMPACT_SKIPPED; - /* - * Check whether it is worth even starting compaction. The order check is - * made because an assumption is made that the page allocator can satisfy - * the "cheaper" orders without taking special steps - */ + /* Check if the GFP flags allow compaction */ if (!order || !may_enter_fs || !may_perform_io) return rc; -- cgit v1.2.3 From 83fde0f22872aa8c1d46f775cc7bdfa864499e65 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:29:11 -0700 Subject: mm: vmscan: scale number of pages reclaimed by reclaim/compaction based on failures If allocation fails after compaction then compaction may be deferred for a number of allocation attempts. If there are subsequent failures, compact_defer_shift is increased to defer for longer periods. This patch uses that information to scale the number of pages reclaimed with compact_defer_shift until allocations succeed again. The rationale is that reclaiming the normal number of pages still allowed compaction to fail and its success depends on the number of pages. If it's failing, reclaim more pages until it succeeds again. Note that this is not implying that VM reclaim is not reclaiming enough pages or that its logic is broken. try_to_free_pages() always asks for SWAP_CLUSTER_MAX pages to be reclaimed regardless of order and that is what it does. Direct reclaim stops normally with this check. if (sc->nr_reclaimed >= sc->nr_to_reclaim) goto out; should_continue_reclaim delays when that check is made until a minimum number of pages for reclaim/compaction are reclaimed. It is possible that this patch could instead set nr_to_reclaim in try_to_free_pages() and drive it from there but that's behaves differently and not necessarily for the better. If driven from do_try_to_free_pages(), it is also possible that priorities will rise. When they reach DEF_PRIORITY-2, it will also start stalling and setting pages for immediate reclaim which is more disruptive than not desirable in this case. That is a more wide-reaching change that could cause another regression related to THP requests causing interactive jitter. [akpm@linux-foundation.org: fix build] Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 99b434b674c..f6fdae7a8d4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1729,6 +1729,28 @@ static bool in_reclaim_compaction(struct scan_control *sc) return false; } +#ifdef CONFIG_COMPACTION +/* + * If compaction is deferred for sc->order then scale the number of pages + * reclaimed based on the number of consecutive allocation failures + */ +static unsigned long scale_for_compaction(unsigned long pages_for_compaction, + struct lruvec *lruvec, struct scan_control *sc) +{ + struct zone *zone = lruvec_zone(lruvec); + + if (zone->compact_order_failed <= sc->order) + pages_for_compaction <<= zone->compact_defer_shift; + return pages_for_compaction; +} +#else +static unsigned long scale_for_compaction(unsigned long pages_for_compaction, + struct lruvec *lruvec, struct scan_control *sc) +{ + return pages_for_compaction; +} +#endif + /* * Reclaim/compaction is used for high-order allocation requests. It reclaims * order-0 pages before compacting the zone. should_continue_reclaim() returns @@ -1776,6 +1798,9 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); + + pages_for_compaction = scale_for_compaction(pages_for_compaction, + lruvec, sc); inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); if (nr_swap_pages > 0) inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); -- cgit v1.2.3 From 1fb3f8ca0e9222535a39b884cb67a34628411b9f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:29:12 -0700 Subject: mm: compaction: capture a suitable high-order page immediately when it is made available While compaction is migrating pages to free up large contiguous blocks for allocation it races with other allocation requests that may steal these blocks or break them up. This patch alters direct compaction to capture a suitable free page as soon as it becomes available to reduce this race. It uses similar logic to split_free_page() to ensure that watermarks are still obeyed. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++------- mm/internal.h | 1 + mm/page_alloc.c | 63 ++++++++++++++++++++++++++++++---------- 3 files changed, 127 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 7168edc7592..0fbc6b73a52 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -91,6 +91,60 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock, return compact_checklock_irqsave(lock, flags, false, cc); } +static void compact_capture_page(struct compact_control *cc) +{ + unsigned long flags; + int mtype, mtype_low, mtype_high; + + if (!cc->page || *cc->page) + return; + + /* + * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP + * regardless of the migratetype of the freelist is is captured from. + * This is fine because the order for a high-order MIGRATE_MOVABLE + * allocation is typically at least a pageblock size and overall + * fragmentation is not impaired. Other allocation types must + * capture pages from their own migratelist because otherwise they + * could pollute other pageblocks like MIGRATE_MOVABLE with + * difficult to move pages and making fragmentation worse overall. + */ + if (cc->migratetype == MIGRATE_MOVABLE) { + mtype_low = 0; + mtype_high = MIGRATE_PCPTYPES; + } else { + mtype_low = cc->migratetype; + mtype_high = cc->migratetype + 1; + } + + /* Speculatively examine the free lists without zone lock */ + for (mtype = mtype_low; mtype < mtype_high; mtype++) { + int order; + for (order = cc->order; order < MAX_ORDER; order++) { + struct page *page; + struct free_area *area; + area = &(cc->zone->free_area[order]); + if (list_empty(&area->free_list[mtype])) + continue; + + /* Take the lock and attempt capture of the page */ + if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) + return; + if (!list_empty(&area->free_list[mtype])) { + page = list_entry(area->free_list[mtype].next, + struct page, lru); + if (capture_free_page(page, cc->order, mtype)) { + spin_unlock_irqrestore(&cc->zone->lock, + flags); + *cc->page = page; + return; + } + } + spin_unlock_irqrestore(&cc->zone->lock, flags); + } + } +} + /* * Isolate free pages onto a private freelist. Caller must hold zone->lock. * If @strict is true, will abort returning 0 on any invalid PFNs or non-free @@ -645,7 +699,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, static int compact_finished(struct zone *zone, struct compact_control *cc) { - unsigned int order; unsigned long watermark; if (fatal_signal_pending(current)) @@ -688,14 +741,22 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ - for (order = cc->order; order < MAX_ORDER; order++) { - /* Job done if page is free of the right migratetype */ - if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) - return COMPACT_PARTIAL; - - /* Job done if allocation would set block type */ - if (order >= pageblock_order && zone->free_area[order].nr_free) + if (cc->page) { + /* Was a suitable page captured? */ + if (*cc->page) return COMPACT_PARTIAL; + } else { + unsigned int order; + for (order = cc->order; order < MAX_ORDER; order++) { + struct free_area *area = &zone->free_area[cc->order]; + /* Job done if page is free of the right migratetype */ + if (!list_empty(&area->free_list[cc->migratetype])) + return COMPACT_PARTIAL; + + /* Job done if allocation would set block type */ + if (cc->order >= pageblock_order && area->nr_free) + return COMPACT_PARTIAL; + } } return COMPACT_CONTINUE; @@ -817,6 +878,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) goto out; } } + + /* Capture a page now if it is a suitable size */ + compact_capture_page(cc); } out: @@ -829,7 +893,8 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync, bool *contended) + bool sync, bool *contended, + struct page **page) { struct compact_control cc = { .nr_freepages = 0, @@ -839,6 +904,7 @@ static unsigned long compact_zone_order(struct zone *zone, .zone = zone, .sync = sync, .contended = contended, + .page = page, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -860,7 +926,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended) + bool sync, bool *contended, struct page **page) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -881,7 +947,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, int status; status = compact_zone_order(zone, order, gfp_mask, sync, - contended); + contended, page); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ @@ -936,6 +1002,7 @@ int compact_pgdat(pg_data_t *pgdat, int order) struct compact_control cc = { .order = order, .sync = false, + .page = NULL, }; return __compact_pgdat(pgdat, &cc); @@ -946,6 +1013,7 @@ static int compact_node(int nid) struct compact_control cc = { .order = -1, .sync = true, + .page = NULL, }; return __compact_pgdat(NODE_DATA(nid), &cc); diff --git a/mm/internal.h b/mm/internal.h index b8c91b342e2..e549a7fbc29 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -131,6 +131,7 @@ struct compact_control { int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; bool *contended; /* True if a lock was contended */ + struct page **page; /* Page captured of requested size */ }; unsigned long diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5e92698e539..cfd565dbe12 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1380,16 +1380,11 @@ void split_page(struct page *page, unsigned int order) } /* - * Similar to split_page except the page is already free. As this is only - * being used for migration, the migratetype of the block also changes. - * As this is called with interrupts disabled, the caller is responsible - * for calling arch_alloc_page() and kernel_map_page() after interrupts - * are enabled. - * - * Note: this is probably too low level an operation for use in drivers. - * Please consult with lkml before using this in your driver. + * Similar to the split_page family of functions except that the page + * required at the given order and being isolated now to prevent races + * with parallel allocators */ -int split_free_page(struct page *page) +int capture_free_page(struct page *page, int alloc_order, int migratetype) { unsigned int order; unsigned long watermark; @@ -1411,10 +1406,11 @@ int split_free_page(struct page *page) rmv_page_order(page); __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); - /* Split into individual pages */ - set_page_refcounted(page); - split_page(page, order); + if (alloc_order != order) + expand(zone, page, alloc_order, order, + &zone->free_area[order], migratetype); + /* Set the pageblock if the captured page is at least a pageblock */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { @@ -1425,7 +1421,35 @@ int split_free_page(struct page *page) } } - return 1 << order; + return 1UL << order; +} + +/* + * Similar to split_page except the page is already free. As this is only + * being used for migration, the migratetype of the block also changes. + * As this is called with interrupts disabled, the caller is responsible + * for calling arch_alloc_page() and kernel_map_page() after interrupts + * are enabled. + * + * Note: this is probably too low level an operation for use in drivers. + * Please consult with lkml before using this in your driver. + */ +int split_free_page(struct page *page) +{ + unsigned int order; + int nr_pages; + + BUG_ON(!PageBuddy(page)); + order = page_order(page); + + nr_pages = capture_free_page(page, order, 0); + if (!nr_pages) + return 0; + + /* Split into individual pages */ + set_page_refcounted(page); + split_page(page, order); + return nr_pages; } /* @@ -2105,7 +2129,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { - struct page *page; + struct page *page = NULL; if (!order) return NULL; @@ -2118,10 +2142,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration, - contended_compaction); + contended_compaction, &page); current->flags &= ~PF_MEMALLOC; - if (*did_some_progress != COMPACT_SKIPPED) { + /* If compaction captured a page, prep and use it */ + if (page) { + prep_new_page(page, order, gfp_mask); + goto got_page; + } + + if (*did_some_progress != COMPACT_SKIPPED) { /* Page migration frees to the PCP lists but we want merging */ drain_pages(get_cpu()); put_cpu(); @@ -2131,6 +2161,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, alloc_flags & ~ALLOC_NO_WATERMARKS, preferred_zone, migratetype); if (page) { +got_page: preferred_zone->compact_considered = 0; preferred_zone->compact_defer_shift = 0; if (order >= preferred_zone->compact_order_failed) -- cgit v1.2.3 From 8d34694c1abf29df1f3c7317936b7e3e2e308d9b Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 8 Oct 2012 16:29:14 -0700 Subject: revert "mm: mempolicy: Let vma_merge and vma_split handle vma->vm_policy linkages" Commit 05f144a0d5c2 ("mm: mempolicy: Let vma_merge and vma_split handle vma->vm_policy linkages") removed vma->vm_policy updates code but it is the purpose of mbind_range(). Now, mbind_range() is virtually a no-op and while it does not allow memory corruption it is not the right fix. This patch is a revert. [mgorman@suse.de: Edited changelog] Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Cc: Christoph Lameter Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4ada3be6e25..92daa267baf 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -607,6 +607,27 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return first; } +/* Apply policy to a single VMA */ +static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) +{ + int err = 0; + struct mempolicy *old = vma->vm_policy; + + pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", + vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_ops, vma->vm_file, + vma->vm_ops ? vma->vm_ops->set_policy : NULL); + + if (vma->vm_ops && vma->vm_ops->set_policy) + err = vma->vm_ops->set_policy(vma, new); + if (!err) { + mpol_get(new); + vma->vm_policy = new; + mpol_put(old); + } + return err; +} + /* Step 2: apply policy to a range and do splits. */ static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) @@ -655,23 +676,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (err) goto out; } - - /* - * Apply policy to a single VMA. The reference counting of - * policy for vma_policy linkages has already been handled by - * vma_merge and split_vma as necessary. If this is a shared - * policy then ->set_policy will increment the reference count - * for an sp node. - */ - pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", - vma->vm_start, vma->vm_end, vma->vm_pgoff, - vma->vm_ops, vma->vm_file, - vma->vm_ops ? vma->vm_ops->set_policy : NULL); - if (vma->vm_ops && vma->vm_ops->set_policy) { - err = vma->vm_ops->set_policy(vma, new_pol); - if (err) - goto out; - } + err = policy_vma(vma, new_pol); + if (err) + goto out; } out: -- cgit v1.2.3 From 869833f2c5c6e4dd09a5378cfc665ffb4615e5d2 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 8 Oct 2012 16:29:16 -0700 Subject: mempolicy: remove mempolicy sharing Dave Jones' system call fuzz testing tool "trinity" triggered the following bug error with slab debugging enabled ============================================================================= BUG numa_policy (Not tainted): Poison overwritten ----------------------------------------------------------------------------- INFO: 0xffff880146498250-0xffff880146498250. First byte 0x6a instead of 0x6b INFO: Allocated in mpol_new+0xa3/0x140 age=46310 cpu=6 pid=32154 __slab_alloc+0x3d3/0x445 kmem_cache_alloc+0x29d/0x2b0 mpol_new+0xa3/0x140 sys_mbind+0x142/0x620 system_call_fastpath+0x16/0x1b INFO: Freed in __mpol_put+0x27/0x30 age=46268 cpu=6 pid=32154 __slab_free+0x2e/0x1de kmem_cache_free+0x25a/0x260 __mpol_put+0x27/0x30 remove_vma+0x68/0x90 exit_mmap+0x118/0x140 mmput+0x73/0x110 exit_mm+0x108/0x130 do_exit+0x162/0xb90 do_group_exit+0x4f/0xc0 sys_exit_group+0x17/0x20 system_call_fastpath+0x16/0x1b INFO: Slab 0xffffea0005192600 objects=27 used=27 fp=0x (null) flags=0x20000000004080 INFO: Object 0xffff880146498250 @offset=592 fp=0xffff88014649b9d0 The problem is that the structure is being prematurely freed due to a reference count imbalance. In the following case mbind(addr, len) should replace the memory policies of both vma1 and vma2 and thus they will become to share the same mempolicy and the new mempolicy will have the MPOL_F_SHARED flag. +-------------------+-------------------+ | vma1 | vma2(shmem) | +-------------------+-------------------+ | | addr addr+len alloc_pages_vma() uses get_vma_policy() and mpol_cond_put() pair for maintaining the mempolicy reference count. The current rule is that get_vma_policy() only increments refcount for shmem VMA and mpol_conf_put() only decrements refcount if the policy has MPOL_F_SHARED. In above case, vma1 is not shmem vma and vma->policy has MPOL_F_SHARED! The reference count will be decreased even though was not increased whenever alloc_page_vma() is called. This has been broken since commit [52cd3b07: mempolicy: rework mempolicy Reference Counting] in 2008. There is another serious bug with the sharing of memory policies. Currently, mempolicy rebind logic (it is called from cpuset rebinding) ignores a refcount of mempolicy and override it forcibly. Thus, any mempolicy sharing may cause mempolicy corruption. The bug was introduced by commit [68860ec1: cpusets: automatic numa mempolicy rebinding]. Ideally, the shared policy handling would be rewritten to either properly handle COW of the policy structures or at least reference count MPOL_F_SHARED based exclusively on information within the policy. However, this patch takes the easier approach of disabling any policy sharing between VMAs. Each new range allocated with sp_alloc will allocate a new policy, set the reference count to 1 and drop the reference count of the old policy. This increases the memory footprint but is not expected to be a major problem as mbind() is unlikely to be used for fine-grained ranges. It is also inefficient because it means we allocate a new policy even in cases where mbind_range() could use the new_policy passed to it. However, it is more straight-forward and the change should be invisible to the user. [mgorman@suse.de: Edited changelog] Reported-by: Dave Jones , Cc: Christoph Lameter , Reviewed-by: Christoph Lameter Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 52 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 92daa267baf..f0728ae7467 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -607,24 +607,39 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return first; } -/* Apply policy to a single VMA */ -static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) +/* + * Apply policy to a single VMA + * This must be called with the mmap_sem held for writing. + */ +static int vma_replace_policy(struct vm_area_struct *vma, + struct mempolicy *pol) { - int err = 0; - struct mempolicy *old = vma->vm_policy; + int err; + struct mempolicy *old; + struct mempolicy *new; pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", vma->vm_start, vma->vm_end, vma->vm_pgoff, vma->vm_ops, vma->vm_file, vma->vm_ops ? vma->vm_ops->set_policy : NULL); - if (vma->vm_ops && vma->vm_ops->set_policy) + new = mpol_dup(pol); + if (IS_ERR(new)) + return PTR_ERR(new); + + if (vma->vm_ops && vma->vm_ops->set_policy) { err = vma->vm_ops->set_policy(vma, new); - if (!err) { - mpol_get(new); - vma->vm_policy = new; - mpol_put(old); + if (err) + goto err_out; } + + old = vma->vm_policy; + vma->vm_policy = new; /* protected by mmap_sem */ + mpol_put(old); + + return 0; + err_out: + mpol_put(new); return err; } @@ -676,7 +691,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (err) goto out; } - err = policy_vma(vma, new_pol); + err = vma_replace_policy(vma, new_pol); if (err) goto out; } @@ -2153,15 +2168,24 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n) static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { - struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); + struct sp_node *n; + struct mempolicy *newpol; + n = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n) return NULL; + + newpol = mpol_dup(pol); + if (IS_ERR(newpol)) { + kmem_cache_free(sn_cache, n); + return NULL; + } + newpol->flags |= MPOL_F_SHARED; + n->start = start; n->end = end; - mpol_get(pol); - pol->flags |= MPOL_F_SHARED; /* for unref */ - n->policy = pol; + n->policy = newpol; + return n; } -- cgit v1.2.3 From b22d127a39ddd10d93deee3d96e643657ad53a49 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:29:17 -0700 Subject: mempolicy: fix a race in shared_policy_replace() shared_policy_replace() use of sp_alloc() is unsafe. 1) sp_node cannot be dereferenced if sp->lock is not held and 2) another thread can modify sp_node between spin_unlock for allocating a new sp node and next spin_lock. The bug was introduced before 2.6.12-rc2. Kosaki's original patch for this problem was to allocate an sp node and policy within shared_policy_replace and initialise it when the lock is reacquired. I was not keen on this approach because it partially duplicates sp_alloc(). As the paths were sp->lock is taken are not that performance critical this patch converts sp->lock to sp->mutex so it can sleep when calling sp_alloc(). [kosaki.motohiro@jp.fujitsu.com: Original patch] Signed-off-by: Mel Gorman Acked-by: KOSAKI Motohiro Reviewed-by: Christoph Lameter Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f0728ae7467..b2f12ecc1b3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2083,7 +2083,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) */ /* lookup first element intersecting start-end */ -/* Caller holds sp->lock */ +/* Caller holds sp->mutex */ static struct sp_node * sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) { @@ -2147,13 +2147,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) if (!sp->root.rb_node) return NULL; - spin_lock(&sp->lock); + mutex_lock(&sp->mutex); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } - spin_unlock(&sp->lock); + mutex_unlock(&sp->mutex); return pol; } @@ -2193,10 +2193,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, static int shared_policy_replace(struct shared_policy *sp, unsigned long start, unsigned long end, struct sp_node *new) { - struct sp_node *n, *new2 = NULL; + struct sp_node *n; + int ret = 0; -restart: - spin_lock(&sp->lock); + mutex_lock(&sp->mutex); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { @@ -2209,16 +2209,14 @@ restart: } else { /* Old policy spanning whole new range. */ if (n->end > end) { + struct sp_node *new2; + new2 = sp_alloc(end, n->end, n->policy); if (!new2) { - spin_unlock(&sp->lock); - new2 = sp_alloc(end, n->end, n->policy); - if (!new2) - return -ENOMEM; - goto restart; + ret = -ENOMEM; + goto out; } n->end = start; sp_insert(sp, new2); - new2 = NULL; break; } else n->end = start; @@ -2229,12 +2227,9 @@ restart: } if (new) sp_insert(sp, new); - spin_unlock(&sp->lock); - if (new2) { - mpol_put(new2->policy); - kmem_cache_free(sn_cache, new2); - } - return 0; +out: + mutex_unlock(&sp->mutex); + return ret; } /** @@ -2252,7 +2247,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ - spin_lock_init(&sp->lock); + mutex_init(&sp->mutex); if (mpol) { struct vm_area_struct pvma; @@ -2318,7 +2313,7 @@ void mpol_free_shared_policy(struct shared_policy *p) if (!p->root.rb_node) return; - spin_lock(&p->lock); + mutex_lock(&p->mutex); next = rb_first(&p->root); while (next) { n = rb_entry(next, struct sp_node, nd); @@ -2327,7 +2322,7 @@ void mpol_free_shared_policy(struct shared_policy *p) mpol_put(n->policy); kmem_cache_free(sn_cache, n); } - spin_unlock(&p->lock); + mutex_unlock(&p->mutex); } /* assumes fs == KERNEL_DS */ -- cgit v1.2.3 From 63f74ca21f1fad36d075e063f06dcc6d39fe86b2 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 8 Oct 2012 16:29:19 -0700 Subject: mempolicy: fix refcount leak in mpol_set_shared_policy() When shared_policy_replace() fails to allocate new->policy is not freed correctly by mpol_set_shared_policy(). The problem is that shared mempolicy code directly call kmem_cache_free() in multiple places where it is easy to make a mistake. This patch creates an sp_free wrapper function and uses it. The bug was introduced pre-git age (IOW, before 2.6.12-rc2). [mgorman@suse.de: Editted changelog] Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b2f12ecc1b3..1763418f1ad 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2157,12 +2157,17 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) return pol; } +static void sp_free(struct sp_node *n) +{ + mpol_put(n->policy); + kmem_cache_free(sn_cache, n); +} + static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); - mpol_put(n->policy); - kmem_cache_free(sn_cache, n); + sp_free(n); } static struct sp_node *sp_alloc(unsigned long start, unsigned long end, @@ -2301,7 +2306,7 @@ int mpol_set_shared_policy(struct shared_policy *info, } err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); if (err && new) - kmem_cache_free(sn_cache, new); + sp_free(new); return err; } @@ -2318,9 +2323,7 @@ void mpol_free_shared_policy(struct shared_policy *p) while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); - rb_erase(&n->nd, &p->root); - mpol_put(n->policy); - kmem_cache_free(sn_cache, n); + sp_delete(p, n); } mutex_unlock(&p->mutex); } -- cgit v1.2.3 From 00442ad04a5eac08a98255697c510e708f6082e2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:29:20 -0700 Subject: mempolicy: fix a memory corruption by refcount imbalance in alloc_pages_vma() Commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a potential memory corruption. shmem_alloc_page() uses a pseudo vma and it has one significant unique combination, vma->vm_ops=NULL and vma->policy->flags & MPOL_F_SHARED. get_vma_policy() does NOT increase a policy ref when vma->vm_ops=NULL and mpol_cond_put() DOES decrease a policy ref when a policy has MPOL_F_SHARED. Therefore, when a cpuset update race occurs, alloc_pages_vma() falls in 'goto retry_cpuset' path, decrements the reference count and frees the policy prematurely. Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1763418f1ad..3d64b369180 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1552,8 +1552,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task, addr); if (vpol) pol = vpol; - } else if (vma->vm_policy) + } else if (vma->vm_policy) { pol = vma->vm_policy; + + /* + * shmem_alloc_page() passes MPOL_F_SHARED policy with + * a pseudo vma whose vma->vm_ops=NULL. Take a reference + * count on these policies which will be dropped by + * mpol_cond_put() later + */ + if (mpol_needs_cond_ref(pol)) + mpol_get(pol); + } } if (!pol) pol = &default_policy; -- cgit v1.2.3 From 21a92735f660eaecf69a6f2e777f18463760ec32 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 8 Oct 2012 16:29:24 -0700 Subject: mm: mmu_notifier: have mmu_notifiers use a global SRCU so they may safely schedule With an RCU based mmu_notifier implementation, any callout to mmu_notifier_invalidate_range_{start,end}() or mmu_notifier_invalidate_page() would not be allowed to call schedule() as that could potentially allow a modification to the mmu_notifier structure while it is currently being used. Since srcu allocs 4 machine words per instance per cpu, we may end up with memory exhaustion if we use srcu per mm. So all mms share a global srcu. Note that during large mmu_notifier activity exit & unregister paths might hang for longer periods, but it is tolerable for current mmu_notifier clients. Signed-off-by: Sagi Grimberg Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Haggai Eran Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 73 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 862b60822d9..35ff447d8d1 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -14,10 +14,14 @@ #include #include #include +#include #include #include #include +/* global SRCU for all MMs */ +struct srcu_struct srcu; + /* * This function can't run concurrently against mmu_notifier_register * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap @@ -25,8 +29,8 @@ * in parallel despite there being no task using this mm any more, * through the vmas outside of the exit_mmap context, such as with * vmtruncate. This serializes against mmu_notifier_unregister with - * the mmu_notifier_mm->lock in addition to RCU and it serializes - * against the other mmu notifiers with RCU. struct mmu_notifier_mm + * the mmu_notifier_mm->lock in addition to SRCU and it serializes + * against the other mmu notifiers with SRCU. struct mmu_notifier_mm * can't go away from under us as exit_mmap holds an mm_count pin * itself. */ @@ -34,12 +38,13 @@ void __mmu_notifier_release(struct mm_struct *mm) { struct mmu_notifier *mn; struct hlist_node *n; + int id; /* * RCU here will block mmu_notifier_unregister until * ->release returns. */ - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) /* * if ->release runs before mmu_notifier_unregister it @@ -50,7 +55,7 @@ void __mmu_notifier_release(struct mm_struct *mm) */ if (mn->ops->release) mn->ops->release(mn, mm); - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); spin_lock(&mm->mmu_notifier_mm->lock); while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { @@ -68,7 +73,7 @@ void __mmu_notifier_release(struct mm_struct *mm) spin_unlock(&mm->mmu_notifier_mm->lock); /* - * synchronize_rcu here prevents mmu_notifier_release to + * synchronize_srcu here prevents mmu_notifier_release to * return to exit_mmap (which would proceed freeing all pages * in the mm) until the ->release method returns, if it was * invoked by mmu_notifier_unregister. @@ -76,7 +81,7 @@ void __mmu_notifier_release(struct mm_struct *mm) * The mmu_notifier_mm can't go away from under us because one * mm_count is hold by exit_mmap. */ - synchronize_rcu(); + synchronize_srcu(&srcu); } /* @@ -89,14 +94,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; - int young = 0; + int young = 0, id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->clear_flush_young) young |= mn->ops->clear_flush_young(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); return young; } @@ -106,9 +111,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; - int young = 0; + int young = 0, id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->test_young) { young = mn->ops->test_young(mn, mm, address); @@ -116,7 +121,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm, break; } } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); return young; } @@ -126,8 +131,9 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->change_pte) mn->ops->change_pte(mn, mm, address, pte); @@ -138,7 +144,7 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, else if (mn->ops->invalidate_page) mn->ops->invalidate_page(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } void __mmu_notifier_invalidate_page(struct mm_struct *mm, @@ -146,13 +152,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_page) mn->ops->invalidate_page(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, @@ -160,13 +167,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_start) mn->ops->invalidate_range_start(mn, mm, start, end); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, @@ -174,13 +182,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_end) mn->ops->invalidate_range_end(mn, mm, start, end); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } static int do_mmu_notifier_register(struct mmu_notifier *mn, @@ -192,6 +201,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, BUG_ON(atomic_read(&mm->mm_users) <= 0); + /* + * Verify that mmu_notifier_init() already run and the global srcu is + * initialized. + */ + BUG_ON(!srcu.per_cpu_ref); + ret = -ENOMEM; mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); if (unlikely(!mmu_notifier_mm)) @@ -274,8 +289,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm) /* * This releases the mm_count pin automatically and frees the mm * structure if it was the last user of it. It serializes against - * running mmu notifiers with RCU and against mmu_notifier_unregister - * with the unregister lock + RCU. All sptes must be dropped before + * running mmu notifiers with SRCU and against mmu_notifier_unregister + * with the unregister lock + SRCU. All sptes must be dropped before * calling mmu_notifier_unregister. ->release or any other notifier * method may be invoked concurrently with mmu_notifier_unregister, * and only after mmu_notifier_unregister returned we're guaranteed @@ -290,8 +305,9 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) * RCU here will force exit_mmap to wait ->release to finish * before freeing the pages. */ - rcu_read_lock(); + int id; + id = srcu_read_lock(&srcu); /* * exit_mmap will block in mmu_notifier_release to * guarantee ->release is called before freeing the @@ -299,7 +315,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) */ if (mn->ops->release) mn->ops->release(mn, mm); - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); spin_lock(&mm->mmu_notifier_mm->lock); hlist_del_rcu(&mn->hlist); @@ -310,10 +326,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) * Wait any running method to finish, of course including * ->release if it was run by mmu_notifier_relase instead of us. */ - synchronize_rcu(); + synchronize_srcu(&srcu); BUG_ON(atomic_read(&mm->mm_count) <= 0); mmdrop(mm); } EXPORT_SYMBOL_GPL(mmu_notifier_unregister); + +static int __init mmu_notifier_init(void) +{ + return init_srcu_struct(&srcu); +} + +module_init(mmu_notifier_init); -- cgit v1.2.3 From e0f3c3f78da29b114e7c1c68019036559f715948 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 8 Oct 2012 16:29:26 -0700 Subject: mm/mmu_notifier: init notifier if necessary While registering MMU notifier, new instance of MMU notifier_mm will be allocated and later free'd if currrent mm_struct's MMU notifier_mm has been initialized. That causes some overhead. The patch tries to elominate that. Signed-off-by: Gavin Shan Signed-off-by: Wanpeng Li Cc: Andrea Arcangeli Cc: Avi Kivity Cc: Hugh Dickins Cc: Marcelo Tosatti Cc: Xiao Guangrong Cc: Sagi Grimberg Cc: Haggai Eran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 35ff447d8d1..947df83dccb 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -207,22 +207,23 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, */ BUG_ON(!srcu.per_cpu_ref); - ret = -ENOMEM; - mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); - if (unlikely(!mmu_notifier_mm)) - goto out; - if (take_mmap_sem) down_write(&mm->mmap_sem); ret = mm_take_all_locks(mm); if (unlikely(ret)) - goto out_cleanup; + goto out; if (!mm_has_notifiers(mm)) { + mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), + GFP_KERNEL); + if (unlikely(!mmu_notifier_mm)) { + ret = -ENOMEM; + goto out_of_mem; + } INIT_HLIST_HEAD(&mmu_notifier_mm->list); spin_lock_init(&mmu_notifier_mm->lock); + mm->mmu_notifier_mm = mmu_notifier_mm; - mmu_notifier_mm = NULL; } atomic_inc(&mm->mm_count); @@ -238,13 +239,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); spin_unlock(&mm->mmu_notifier_mm->lock); +out_of_mem: mm_drop_all_locks(mm); -out_cleanup: +out: if (take_mmap_sem) up_write(&mm->mmap_sem); - /* kfree() does nothing if mmu_notifier_mm is NULL */ - kfree(mmu_notifier_mm); -out: + BUG_ON(atomic_read(&mm->mm_users) <= 0); return ret; } -- cgit v1.2.3 From d5dc0ad928fb9e972001e552597fd0b794863f34 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 8 Oct 2012 16:29:27 -0700 Subject: mm/vmscan: fix error number for failed kthread Fix the return value while failing to create the kswapd kernel thread. Also, the error message is prioritized as KERN_ERR. Signed-off-by: Gavin Shan Signed-off-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index f6fdae7a8d4..d16bf5a5326 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3126,9 +3126,9 @@ int kswapd_run(int nid) if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ BUG_ON(system_state == SYSTEM_BOOTING); - printk("Failed to start kswapd on node %d\n",nid); pgdat->kswapd = NULL; - ret = -1; + pr_err("Failed to start kswapd on node %d\n", nid); + ret = PTR_ERR(pgdat->kswapd); } return ret; } -- cgit v1.2.3 From 01dc52ebdf472f77cca623ca693ca24cfc0f1bbe Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Oct 2012 16:29:30 -0700 Subject: oom: remove deprecated oom_adj The deprecated /proc//oom_adj is scheduled for removal this month. Signed-off-by: Davidlohr Bueso Acked-by: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 19860086163..79e0f3e2483 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -428,8 +428,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_adj=%d, oom_score_adj=%d\n", - current->comm, gfp_mask, order, current->signal->oom_adj, + "oom_score_adj=%d\n", + current->comm, gfp_mask, order, current->signal->oom_score_adj); cpuset_print_task_mems_allowed(current); task_unlock(current); -- cgit v1.2.3 From 5d3a551c28c6669dc43be40d8fafafbc2ec8f42b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 8 Oct 2012 16:29:32 -0700 Subject: mm: hugetlb: add arch hook for clearing page flags before entering pool The core page allocator ensures that page flags are zeroed when freeing pages via free_pages_check. A number of architectures (ARM, PPC, MIPS) rely on this property to treat new pages as dirty with respect to the data cache and perform the appropriate flushing before mapping the pages into userspace. This can lead to cache synchronisation problems when using hugepages, since the allocator keeps its own pool of pages above the usual page allocator and does not reset the page flags when freeing a page into the pool. This patch adds a new architecture hook, arch_clear_hugepage_flags, so that architectures which rely on the page flags being in a particular state for fresh allocations can adjust the flags accordingly when a page is freed into the pool. Signed-off-by: Will Deacon Cc: Michal Hocko Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc727122dd4..f1bb534254f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -637,6 +637,7 @@ static void free_huge_page(struct page *page) h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; } else { + arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); -- cgit v1.2.3 From db971418824381d3583c73751181fec76c743bf6 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:29:34 -0700 Subject: mm: adjust final #endif position in mm/internal.h Make sure the #endif that terminates the standard #ifndef / #define / #endif construct gets labeled, and gets positioned at the end of the file as is normally the case. Signed-off-by: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index e549a7fbc29..bbd7b34db4e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -341,7 +341,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, #define ZONE_RECLAIM_FULL -1 #define ZONE_RECLAIM_SOME 0 #define ZONE_RECLAIM_SUCCESS 1 -#endif extern int hwpoison_filter(struct page *p); @@ -357,3 +356,5 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); + +#endif /* __MM_INTERNAL_H */ -- cgit v1.2.3 From 65b3c07b43f7f8a5cbf8923011bd4e6650e3d1dc Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:35 -0700 Subject: thp: fix the count of THP_COLLAPSE_ALLOC THP_COLLAPSE_ALLOC is double counted if NUMA is disabled since it has already been calculated in khugepaged_alloc_hugepage Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9b72d127051..860ea912785 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1874,9 +1874,9 @@ static void collapse_huge_page(struct mm_struct *mm, *hpage = ERR_PTR(-ENOMEM); return; } + count_vm_event(THP_COLLAPSE_ALLOC); #endif - count_vm_event(THP_COLLAPSE_ALLOC); if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { #ifdef CONFIG_NUMA put_page(new_page); -- cgit v1.2.3 From 637e3a27ec2c84f7ecd083fa6943da2f19eb5e9f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:38 -0700 Subject: thp: remove unnecessary check in start_khugepaged The check is unnecessary since if mm_slot_cache or mm_slots_hash initialize failed, no sysfs interface will be created Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 860ea912785..9833d8ecf38 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -140,10 +140,7 @@ static int start_khugepaged(void) int err = 0; if (khugepaged_enabled()) { int wakeup; - if (unlikely(!mm_slot_cache || !mm_slots_hash)) { - err = -ENOMEM; - goto out; - } + mutex_lock(&khugepaged_mutex); if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, @@ -163,7 +160,7 @@ static int start_khugepaged(void) } else /* wakeup to exit */ wake_up_interruptible(&khugepaged_wait); -out: + return err; } -- cgit v1.2.3 From 911891afe1c3104adf0f802189909868239ebbfd Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:41 -0700 Subject: thp: move khugepaged_mutex out of khugepaged Currently, hugepaged_mutex is used really complexly and hard to understand, actually, it is just used to serialize start_khugepaged and khugepaged for these reasons: - khugepaged_thread is shared between them - the thp disable path (echo never > transparent_hugepage/enabled) is nonblocking, so we need to protect khugepaged_thread to get a stable running state These can be avoided by: - use the lock to serialize the thread creation and cancel - thp disable path can not finised until the thread exits Then khugepaged_thread is fully controlled by start_khugepaged, khugepaged will be happy without the lock Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9833d8ecf38..0931b2b19c5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -139,9 +139,6 @@ static int start_khugepaged(void) { int err = 0; if (khugepaged_enabled()) { - int wakeup; - - mutex_lock(&khugepaged_mutex); if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); @@ -151,15 +148,17 @@ static int start_khugepaged(void) err = PTR_ERR(khugepaged_thread); khugepaged_thread = NULL; } - wakeup = !list_empty(&khugepaged_scan.mm_head); - mutex_unlock(&khugepaged_mutex); - if (wakeup) + + if (!list_empty(&khugepaged_scan.mm_head)) wake_up_interruptible(&khugepaged_wait); set_recommended_min_free_kbytes(); - } else + } else if (khugepaged_thread) { /* wakeup to exit */ wake_up_interruptible(&khugepaged_wait); + kthread_stop(khugepaged_thread); + khugepaged_thread = NULL; + } return err; } @@ -221,7 +220,12 @@ static ssize_t enabled_store(struct kobject *kobj, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); if (ret > 0) { - int err = start_khugepaged(); + int err; + + mutex_lock(&khugepaged_mutex); + err = start_khugepaged(); + mutex_unlock(&khugepaged_mutex); + if (err) ret = err; } @@ -2329,20 +2333,10 @@ static int khugepaged(void *none) set_freezable(); set_user_nice(current, 19); - /* serialize with start_khugepaged() */ - mutex_lock(&khugepaged_mutex); - - for (;;) { - mutex_unlock(&khugepaged_mutex); + while (!kthread_should_stop()) { VM_BUG_ON(khugepaged_thread != current); khugepaged_loop(); VM_BUG_ON(khugepaged_thread != current); - - mutex_lock(&khugepaged_mutex); - if (!khugepaged_enabled()) - break; - if (unlikely(kthread_should_stop())) - break; } spin_lock(&khugepaged_mm_lock); @@ -2351,10 +2345,6 @@ static int khugepaged(void *none) if (mm_slot) collect_mm_slot(mm_slot); spin_unlock(&khugepaged_mm_lock); - - khugepaged_thread = NULL; - mutex_unlock(&khugepaged_mutex); - return 0; } -- cgit v1.2.3 From e060f0e0139b83f05bb90fa05563d14179b9a7ff Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:42 -0700 Subject: thp: remove unnecessary khugepaged_thread check Now, khugepaged creation and cancel are completely serial under the protection of khugepaged_mutex, it is impossible that many khugepaged entities are running Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0931b2b19c5..46e3f150a6c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2333,11 +2333,8 @@ static int khugepaged(void *none) set_freezable(); set_user_nice(current, 19); - while (!kthread_should_stop()) { - VM_BUG_ON(khugepaged_thread != current); + while (!kthread_should_stop()) khugepaged_loop(); - VM_BUG_ON(khugepaged_thread != current); - } spin_lock(&khugepaged_mm_lock); mm_slot = khugepaged_scan.mm_slot; -- cgit v1.2.3 From 2017c0bff8ba79ea527361adbe19471e174775d6 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:44 -0700 Subject: thp: remove wake_up_interruptible in the exit path Add the check of kthread_should_stop() to the conditions which are used to wakeup on khugepaged_wait, then kthread_stop is enough to let the thread exit Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 46e3f150a6c..a94c07a1b3c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -154,8 +154,6 @@ static int start_khugepaged(void) set_recommended_min_free_kbytes(); } else if (khugepaged_thread) { - /* wakeup to exit */ - wake_up_interruptible(&khugepaged_wait); kthread_stop(khugepaged_thread); khugepaged_thread = NULL; } @@ -2221,7 +2219,7 @@ static int khugepaged_has_work(void) static int khugepaged_wait_event(void) { return !list_empty(&khugepaged_scan.mm_head) || - !khugepaged_enabled(); + kthread_should_stop(); } static void khugepaged_do_scan(struct page **hpage) @@ -2288,6 +2286,24 @@ static struct page *khugepaged_alloc_hugepage(void) } #endif +static void khugepaged_wait_work(void) +{ + try_to_freeze(); + + if (khugepaged_has_work()) { + if (!khugepaged_scan_sleep_millisecs) + return; + + wait_event_freezable_timeout(khugepaged_wait, + kthread_should_stop(), + msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); + return; + } + + if (khugepaged_enabled()) + wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); +} + static void khugepaged_loop(void) { struct page *hpage; @@ -2312,17 +2328,8 @@ static void khugepaged_loop(void) if (hpage) put_page(hpage); #endif - try_to_freeze(); - if (unlikely(kthread_should_stop())) - break; - if (khugepaged_has_work()) { - if (!khugepaged_scan_sleep_millisecs) - continue; - wait_event_freezable_timeout(khugepaged_wait, false, - msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); - } else if (khugepaged_enabled()) - wait_event_freezable(khugepaged_wait, - khugepaged_wait_event()); + + khugepaged_wait_work(); } } -- cgit v1.2.3 From 9817626e722a5e5699cf38f5d3a4c9851e054436 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:46 -0700 Subject: thp: remove some code depend on CONFIG_NUMA If NUMA is disabled, hpage is used as page pre-alloc, so there are two cases for hpage: - it is !NULL, means the page is not consumed otherwise, - the page has been consumed If NUMA is enabled, hpage is just used as alloc-fail indicator which is not a real page, NULL means not fail triggered. So, we can release the page only if !IS_ERR_OR_NULL Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a94c07a1b3c..1e21b4cf4c7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2306,11 +2306,8 @@ static void khugepaged_wait_work(void) static void khugepaged_loop(void) { - struct page *hpage; + struct page *hpage = NULL; -#ifdef CONFIG_NUMA - hpage = NULL; -#endif while (likely(khugepaged_enabled())) { #ifndef CONFIG_NUMA hpage = khugepaged_alloc_hugepage(); @@ -2324,10 +2321,9 @@ static void khugepaged_loop(void) #endif khugepaged_do_scan(&hpage); -#ifndef CONFIG_NUMA - if (hpage) + + if (!IS_ERR_OR_NULL(hpage)) put_page(hpage); -#endif khugepaged_wait_work(); } -- cgit v1.2.3 From d516904bd239fe2c9f1bd46cf146bb4b8831321c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:48 -0700 Subject: thp: merge page pre-alloc in khugepaged_loop into khugepaged_do_scan There are two pre-alloc operations in these two function, the different is: - it allows to sleep if page alloc fail in khugepaged_loop - it exits immediately if page alloc fail in khugepaged_do_scan Actually, in khugepaged_do_scan, we can allow the pre-alloc to sleep on the first failure, then the operation in khugepaged_loop can be removed Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 97 ++++++++++++++++++++++++++------------------------------ 1 file changed, 45 insertions(+), 52 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1e21b4cf4c7..d5b5fcc73c4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2222,10 +2222,40 @@ static int khugepaged_wait_event(void) kthread_should_stop(); } -static void khugepaged_do_scan(struct page **hpage) +static void khugepaged_alloc_sleep(void) +{ + wait_event_freezable_timeout(khugepaged_wait, false, + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); +} + +#ifndef CONFIG_NUMA +static struct page *khugepaged_alloc_hugepage(bool *wait) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + if (!*wait) + return NULL; + + *wait = false; + khugepaged_alloc_sleep(); + } else + count_vm_event(THP_COLLAPSE_ALLOC); + } while (unlikely(!hpage) && likely(khugepaged_enabled())); + + return hpage; +} +#endif + +static void khugepaged_do_scan(void) { + struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; unsigned int pages = khugepaged_pages_to_scan; + bool wait = true; barrier(); /* write khugepaged_pages_to_scan to local stack */ @@ -2233,17 +2263,18 @@ static void khugepaged_do_scan(struct page **hpage) cond_resched(); #ifndef CONFIG_NUMA - if (!*hpage) { - *hpage = alloc_hugepage(khugepaged_defrag()); - if (unlikely(!*hpage)) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + if (!hpage) + hpage = khugepaged_alloc_hugepage(&wait); + + if (unlikely(!hpage)) + break; +#else + if (IS_ERR(hpage)) { + if (!wait) break; - } - count_vm_event(THP_COLLAPSE_ALLOC); + wait = false; + khugepaged_alloc_sleep(); } -#else - if (IS_ERR(*hpage)) - break; #endif if (unlikely(kthread_should_stop() || freezing(current))) @@ -2255,37 +2286,16 @@ static void khugepaged_do_scan(struct page **hpage) if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, - hpage); + &hpage); else progress = pages; spin_unlock(&khugepaged_mm_lock); } -} -static void khugepaged_alloc_sleep(void) -{ - wait_event_freezable_timeout(khugepaged_wait, false, - msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); + if (!IS_ERR_OR_NULL(hpage)) + put_page(hpage); } -#ifndef CONFIG_NUMA -static struct page *khugepaged_alloc_hugepage(void) -{ - struct page *hpage; - - do { - hpage = alloc_hugepage(khugepaged_defrag()); - if (!hpage) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - khugepaged_alloc_sleep(); - } else - count_vm_event(THP_COLLAPSE_ALLOC); - } while (unlikely(!hpage) && - likely(khugepaged_enabled())); - return hpage; -} -#endif - static void khugepaged_wait_work(void) { try_to_freeze(); @@ -2306,25 +2316,8 @@ static void khugepaged_wait_work(void) static void khugepaged_loop(void) { - struct page *hpage = NULL; - while (likely(khugepaged_enabled())) { -#ifndef CONFIG_NUMA - hpage = khugepaged_alloc_hugepage(); - if (unlikely(!hpage)) - break; -#else - if (IS_ERR(hpage)) { - khugepaged_alloc_sleep(); - hpage = NULL; - } -#endif - - khugepaged_do_scan(&hpage); - - if (!IS_ERR_OR_NULL(hpage)) - put_page(hpage); - + khugepaged_do_scan(); khugepaged_wait_work(); } } -- cgit v1.2.3 From 420256ef02660af0acf28c12fe4b7d514ca88a4d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:49 -0700 Subject: thp: release page in page pre-alloc path If NUMA is enabled, we can release the page in the page pre-alloc operation, then the CONFIG_NUMA dependent code can be reduced Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d5b5fcc73c4..9c4390f60c3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1873,15 +1873,12 @@ static void collapse_huge_page(struct mm_struct *mm, *hpage = ERR_PTR(-ENOMEM); return; } + *hpage = new_page; count_vm_event(THP_COLLAPSE_ALLOC); #endif - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { -#ifdef CONFIG_NUMA - put_page(new_page); -#endif + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) return; - } /* * Prevent all access to pagetables with the exception of @@ -1982,9 +1979,8 @@ static void collapse_huge_page(struct mm_struct *mm, prepare_pmd_huge_pte(pgtable, mm); spin_unlock(&mm->page_table_lock); -#ifndef CONFIG_NUMA *hpage = NULL; -#endif + khugepaged_pages_collapsed++; out_up_write: up_write(&mm->mmap_sem); @@ -1992,9 +1988,6 @@ out_up_write: out: mem_cgroup_uncharge_page(new_page); -#ifdef CONFIG_NUMA - put_page(new_page); -#endif goto out_up_write; } @@ -2260,8 +2253,6 @@ static void khugepaged_do_scan(void) barrier(); /* write khugepaged_pages_to_scan to local stack */ while (progress < pages) { - cond_resched(); - #ifndef CONFIG_NUMA if (!hpage) hpage = khugepaged_alloc_hugepage(&wait); @@ -2274,8 +2265,12 @@ static void khugepaged_do_scan(void) break; wait = false; khugepaged_alloc_sleep(); + } else if (hpage) { + put_page(hpage); + hpage = NULL; } #endif + cond_resched(); if (unlikely(kthread_should_stop() || freezing(current))) break; -- cgit v1.2.3 From 26234f36ef3ec7efcfa9acb181427849c1f9db7c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:51 -0700 Subject: thp: introduce khugepaged_prealloc_page and khugepaged_alloc_page They are used to abstract the difference between NUMA enabled and NUMA disabled to make the code more readable Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 166 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 98 insertions(+), 68 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9c4390f60c3..f0e999379dd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1827,28 +1827,34 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, } } -static void collapse_huge_page(struct mm_struct *mm, - unsigned long address, - struct page **hpage, - struct vm_area_struct *vma, - int node) +static void khugepaged_alloc_sleep(void) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd, _pmd; - pte_t *pte; - pgtable_t pgtable; - struct page *new_page; - spinlock_t *ptl; - int isolated; - unsigned long hstart, hend; + wait_event_freezable_timeout(khugepaged_wait, false, + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); +} - VM_BUG_ON(address & ~HPAGE_PMD_MASK); -#ifndef CONFIG_NUMA - up_read(&mm->mmap_sem); - VM_BUG_ON(!*hpage); - new_page = *hpage; -#else +#ifdef CONFIG_NUMA +static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +{ + if (IS_ERR(*hpage)) { + if (!*wait) + return false; + + *wait = false; + khugepaged_alloc_sleep(); + } else if (*hpage) { + put_page(*hpage); + *hpage = NULL; + } + + return true; +} + +static struct page +*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + int node) +{ VM_BUG_ON(*hpage); /* * Allocate the page while the vma is still valid and under @@ -1860,7 +1866,7 @@ static void collapse_huge_page(struct mm_struct *mm, * mmap_sem in read mode is good idea also to allow greater * scalability. */ - new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, + *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, node, __GFP_OTHER_NODE); /* @@ -1868,15 +1874,81 @@ static void collapse_huge_page(struct mm_struct *mm, * preparation for taking it in write mode. */ up_read(&mm->mmap_sem); - if (unlikely(!new_page)) { + if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); *hpage = ERR_PTR(-ENOMEM); - return; + return NULL; } - *hpage = new_page; + count_vm_event(THP_COLLAPSE_ALLOC); + return *hpage; +} +#else +static struct page *khugepaged_alloc_hugepage(bool *wait) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + if (!*wait) + return NULL; + + *wait = false; + khugepaged_alloc_sleep(); + } else + count_vm_event(THP_COLLAPSE_ALLOC); + } while (unlikely(!hpage) && likely(khugepaged_enabled())); + + return hpage; +} + +static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +{ + if (!*hpage) + *hpage = khugepaged_alloc_hugepage(wait); + + if (unlikely(!*hpage)) + return false; + + return true; +} + +static struct page +*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + int node) +{ + up_read(&mm->mmap_sem); + VM_BUG_ON(!*hpage); + return *hpage; +} #endif +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage, + struct vm_area_struct *vma, + int node) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + pgtable_t pgtable; + struct page *new_page; + spinlock_t *ptl; + int isolated; + unsigned long hstart, hend; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + /* release the mmap_sem read lock. */ + new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); + if (!new_page) + return; + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) return; @@ -2215,34 +2287,6 @@ static int khugepaged_wait_event(void) kthread_should_stop(); } -static void khugepaged_alloc_sleep(void) -{ - wait_event_freezable_timeout(khugepaged_wait, false, - msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); -} - -#ifndef CONFIG_NUMA -static struct page *khugepaged_alloc_hugepage(bool *wait) -{ - struct page *hpage; - - do { - hpage = alloc_hugepage(khugepaged_defrag()); - if (!hpage) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - if (!*wait) - return NULL; - - *wait = false; - khugepaged_alloc_sleep(); - } else - count_vm_event(THP_COLLAPSE_ALLOC); - } while (unlikely(!hpage) && likely(khugepaged_enabled())); - - return hpage; -} -#endif - static void khugepaged_do_scan(void) { struct page *hpage = NULL; @@ -2253,23 +2297,9 @@ static void khugepaged_do_scan(void) barrier(); /* write khugepaged_pages_to_scan to local stack */ while (progress < pages) { -#ifndef CONFIG_NUMA - if (!hpage) - hpage = khugepaged_alloc_hugepage(&wait); - - if (unlikely(!hpage)) + if (!khugepaged_prealloc_page(&hpage, &wait)) break; -#else - if (IS_ERR(hpage)) { - if (!wait) - break; - wait = false; - khugepaged_alloc_sleep(); - } else if (hpage) { - put_page(hpage); - hpage = NULL; - } -#endif + cond_resched(); if (unlikely(kthread_should_stop() || freezing(current))) -- cgit v1.2.3 From b7231789b0224e73af4efc7973f8bcf17fc16edd Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:54 -0700 Subject: thp: remove khugepaged_loop Merge khugepaged_loop into khugepaged Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f0e999379dd..77b470b7e3c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2339,14 +2339,6 @@ static void khugepaged_wait_work(void) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } -static void khugepaged_loop(void) -{ - while (likely(khugepaged_enabled())) { - khugepaged_do_scan(); - khugepaged_wait_work(); - } -} - static int khugepaged(void *none) { struct mm_slot *mm_slot; @@ -2354,8 +2346,10 @@ static int khugepaged(void *none) set_freezable(); set_user_nice(current, 19); - while (!kthread_should_stop()) - khugepaged_loop(); + while (!kthread_should_stop()) { + khugepaged_do_scan(); + khugepaged_wait_work(); + } spin_lock(&khugepaged_mm_lock); mm_slot = khugepaged_scan.mm_slot; -- cgit v1.2.3 From 17c230afa58a6d013a4949d5c04b823a281d40fa Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:56 -0700 Subject: thp: use khugepaged_enabled to remove duplicate code Use khugepaged_enabled to see whether thp is enabled Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 77b470b7e3c..a169c4dbcdd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -102,10 +102,7 @@ static int set_recommended_min_free_kbytes(void) unsigned long recommended_min; extern int min_free_kbytes; - if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, - &transparent_hugepage_flags) && - !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, - &transparent_hugepage_flags)) + if (!khugepaged_enabled()) return 0; for_each_populated_zone(zone) @@ -228,11 +225,7 @@ static ssize_t enabled_store(struct kobject *kobj, ret = err; } - if (ret > 0 && - (test_bit(TRANSPARENT_HUGEPAGE_FLAG, - &transparent_hugepage_flags) || - test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, - &transparent_hugepage_flags))) + if (ret > 0 && khugepaged_enabled()) set_recommended_min_free_kbytes(); return ret; -- cgit v1.2.3 From 227e4047488d3ee0173a914275a7fd207ad51e5b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:59 -0700 Subject: thp: remove unnecessary set_recommended_min_free_kbytes Since it is called in start_khugepaged Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a169c4dbcdd..e19cc426c52 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -225,9 +225,6 @@ static ssize_t enabled_store(struct kobject *kobj, ret = err; } - if (ret > 0 && khugepaged_enabled()) - set_recommended_min_free_kbytes(); - return ret; } static struct kobj_attribute enabled_attr = @@ -562,8 +559,6 @@ static int __init hugepage_init(void) start_khugepaged(); - set_recommended_min_free_kbytes(); - return 0; out: hugepage_exit_sysfs(hugepage_kobj); -- cgit v1.2.3 From ca42b26ab285edc5ee3f9faa48379d258db53c35 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:01 -0700 Subject: mm: fix potential anon_vma locking issue in mprotect() Fix an anon_vma locking issue in the following situation: - vma has no anon_vma - next has an anon_vma - vma is being shrunk / next is being expanded, due to an mprotect call We need to take next's anon_vma lock to avoid races with rmap users (such as page migration) while next is being expanded. Signed-off-by: Michel Lespinasse Reviewed-by: Andrea Arcangeli Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 2ba27434e84..e3c365ff1b6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -573,8 +573,12 @@ again: remove_next = 1 + (end > next->vm_end); */ if (vma->anon_vma && (importer || start != vma->vm_start)) { anon_vma = vma->anon_vma; + VM_BUG_ON(adjust_next && next->anon_vma && + anon_vma != next->anon_vma); + } else if (adjust_next && next->anon_vma) + anon_vma = next->anon_vma; + if (anon_vma) anon_vma_lock(anon_vma); - } if (root) { flush_dcache_mmap_lock(mapping); -- cgit v1.2.3 From 15626062f4a98279c59a2a5208c496cf65cbf8c0 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Oct 2012 16:30:04 -0700 Subject: thp, x86: introduce HAVE_ARCH_TRANSPARENT_HUGEPAGE Cleanup patch in preparation for transparent hugepage support on s390. Adding new architectures to the TRANSPARENT_HUGEPAGE config option can make the "depends" line rather ugly, like "depends on (X86 || (S390 && 64BIT)) && MMU". This patch adds a HAVE_ARCH_TRANSPARENT_HUGEPAGE instead. x86 already has MMU "def_bool y", so the MMU check is superfluous there and HAVE_ARCH_TRANSPARENT_HUGEPAGE can be selected in arch/x86/Kconfig. Signed-off-by: Gerald Schaefer Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Hugh Dickins Cc: Hillf Danton Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index d5c8019c662..3322342a1ff 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -318,7 +318,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" - depends on X86 && MMU + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE select COMPACTION help Transparent Hugepages allows the kernel to use huge pages and -- cgit v1.2.3 From e3ebcf64381188a2744a9829a4eb5c2b60f1974c Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Oct 2012 16:30:07 -0700 Subject: thp: remove assumptions on pgtable_t type The thp page table pre-allocation code currently assumes that pgtable_t is of type "struct page *". This may not be true for all architectures, so this patch removes that assumption by replacing the functions prepare_pmd_huge_pte() and get_pmd_huge_pte() with two new functions that can be defined architecture-specific. It also removes two VM_BUG_ON checks for page_count() and page_mapcount() operating on a pgtable_t. Apart from the VM_BUG_ON removal, there will be no functional change introduced by this patch. Signed-off-by: Gerald Schaefer Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Hugh Dickins Cc: Hillf Danton Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 50 ++++++++------------------------------------------ mm/pgtable-generic.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 42 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e19cc426c52..9ea6d195376 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -598,19 +598,6 @@ out: } __setup("transparent_hugepage=", setup_transparent_hugepage); -static void prepare_pmd_huge_pte(pgtable_t pgtable, - struct mm_struct *mm) -{ - assert_spin_locked(&mm->page_table_lock); - - /* FIFO */ - if (!mm->pmd_huge_pte) - INIT_LIST_HEAD(&pgtable->lru); - else - list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); - mm->pmd_huge_pte = pgtable; -} - static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) @@ -652,7 +639,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, */ page_add_new_anon_rmap(page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - prepare_pmd_huge_pte(pgtable, mm); + pgtable_trans_huge_deposit(mm, pgtable); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); mm->nr_ptes++; spin_unlock(&mm->page_table_lock); @@ -778,7 +765,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmdp_set_wrprotect(src_mm, addr, src_pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - prepare_pmd_huge_pte(pgtable, dst_mm); + pgtable_trans_huge_deposit(dst_mm, pgtable); dst_mm->nr_ptes++; ret = 0; @@ -789,25 +776,6 @@ out: return ret; } -/* no "address" argument so destroys page coloring of some arch */ -pgtable_t get_pmd_huge_pte(struct mm_struct *mm) -{ - pgtable_t pgtable; - - assert_spin_locked(&mm->page_table_lock); - - /* FIFO */ - pgtable = mm->pmd_huge_pte; - if (list_empty(&pgtable->lru)) - mm->pmd_huge_pte = NULL; - else { - mm->pmd_huge_pte = list_entry(pgtable->lru.next, - struct page, lru); - list_del(&pgtable->lru); - } - return pgtable; -} - static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -863,7 +831,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, pmdp_clear_flush_notify(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = get_pmd_huge_pte(mm); + pgtable = pgtable_trans_huge_withdraw(mm); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1028,7 +996,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (__pmd_trans_huge_lock(pmd, vma) == 1) { struct page *page; pgtable_t pgtable; - pgtable = get_pmd_huge_pte(tlb->mm); + pgtable = pgtable_trans_huge_withdraw(tlb->mm); page = pmd_page(*pmd); pmd_clear(pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); @@ -1345,11 +1313,11 @@ static int __split_huge_page_map(struct page *page, pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); if (pmd) { - pgtable = get_pmd_huge_pte(mm); + pgtable = pgtable_trans_huge_withdraw(mm); pmd_populate(mm, &_pmd, pgtable); - for (i = 0, haddr = address; i < HPAGE_PMD_NR; - i++, haddr += PAGE_SIZE) { + haddr = address; + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { pte_t *pte, entry; BUG_ON(PageCompound(page+i)); entry = mk_pte(page + i, vma->vm_page_prot); @@ -2017,8 +1985,6 @@ static void collapse_huge_page(struct mm_struct *mm, pte_unmap(pte); __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); - VM_BUG_ON(page_count(pgtable) != 1); - VM_BUG_ON(page_mapcount(pgtable) != 0); _pmd = mk_pmd(new_page, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); @@ -2036,7 +2002,7 @@ static void collapse_huge_page(struct mm_struct *mm, page_add_new_anon_rmap(new_page, vma, address); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache(vma, address, _pmd); - prepare_pmd_huge_pte(pgtable, mm); + pgtable_trans_huge_deposit(mm, pgtable); spin_unlock(&mm->page_table_lock); *hpage = NULL; diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 74c0ddaa6fa..29867e083d3 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -120,3 +120,42 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif + +#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +{ + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + if (!mm->pmd_huge_pte) + INIT_LIST_HEAD(&pgtable->lru); + else + list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); + mm->pmd_huge_pte = pgtable; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + +#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* no "address" argument so destroys page coloring of some arch */ +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +{ + pgtable_t pgtable; + + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + pgtable = mm->pmd_huge_pte; + if (list_empty(&pgtable->lru)) + mm->pmd_huge_pte = NULL; + else { + mm->pmd_huge_pte = list_entry(pgtable->lru.next, + struct page, lru); + list_del(&pgtable->lru); + } + return pgtable; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif -- cgit v1.2.3 From 46dcde735c9d8953bbd8d105ca6779e5b5300c28 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Oct 2012 16:30:09 -0700 Subject: thp: introduce pmdp_invalidate() On s390, a valid page table entry must not be changed while it is attached to any CPU. So instead of pmd_mknotpresent() and set_pmd_at(), an IDTE operation would be necessary there. This patch introduces the pmdp_invalidate() function, to allow architecture-specific implementations. Signed-off-by: Gerald Schaefer Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Hugh Dickins Cc: Hillf Danton Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 3 +-- mm/pgtable-generic.c | 11 +++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9ea6d195376..bec6243b696 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1361,8 +1361,7 @@ static int __split_huge_page_map(struct page *page, * SMP TLB and finally we write the non-huge version * of the pmd entry with pmd_populate. */ - set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + pmdp_invalidate(vma, address, pmd); pmd_populate(mm, pmd, pgtable); ret = 1; } diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 29867e083d3..e642627da6b 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -159,3 +159,14 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif + +#ifndef __HAVE_ARCH_PMDP_INVALIDATE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif -- cgit v1.2.3 From 8e72033f2a489b6c98c4e3c7cc281b1afd6cb85c Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Oct 2012 16:30:12 -0700 Subject: thp: make MADV_HUGEPAGE check for mm->def_flags This adds a check to hugepage_madvise(), to refuse MADV_HUGEPAGE if VM_NOHUGEPAGE is set in mm->def_flags. On s390, the VM_NOHUGEPAGE flag will be set in mm->def_flags for kvm processes, to prevent any future thp mappings. In order to also prevent MADV_HUGEPAGE on such an mm, hugepage_madvise() should check mm->def_flags. Signed-off-by: Gerald Schaefer Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Hugh Dickins Cc: Hillf Danton Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bec6243b696..010d32944d1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1450,6 +1450,8 @@ out: int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { + struct mm_struct *mm = vma->vm_mm; + switch (advice) { case MADV_HUGEPAGE: /* @@ -1457,6 +1459,8 @@ int hugepage_madvise(struct vm_area_struct *vma, */ if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) return -EINVAL; + if (mm->def_flags & VM_NOHUGEPAGE) + return -EINVAL; *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; /* -- cgit v1.2.3 From 6b2dbba8b6ac4df26f72eda1e5ea7bab9f950e08 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:25 -0700 Subject: mm: replace vma prio_tree with an interval tree Implement an interval tree as a replacement for the VMA prio_tree. The algorithms are similar to lib/interval_tree.c; however that code can't be directly reused as the interval endpoints are not explicitly stored in the VMA. So instead, the common algorithm is moved into a template and the details (node type, how to get interval endpoints from the node, etc) are filled in using the C preprocessor. Once the interval tree functions are available, using them as a replacement to the VMA prio tree is a relatively simple, mechanical job. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 4 +- mm/filemap_xip.c | 3 +- mm/fremap.c | 2 +- mm/hugetlb.c | 3 +- mm/interval_tree.c | 61 +++++++++++++++ mm/memory-failure.c | 3 +- mm/memory.c | 9 +-- mm/mmap.c | 22 +++--- mm/nommu.c | 12 ++- mm/prio_tree.c | 208 ---------------------------------------------------- mm/rmap.c | 18 ++--- 11 files changed, 94 insertions(+), 251 deletions(-) create mode 100644 mm/interval_tree.c delete mode 100644 mm/prio_tree.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index 92753e2d82d..6b025f80af3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -14,9 +14,9 @@ endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ - prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ + util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o $(mmu-y) + compaction.o interval_tree.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 91750227a19..a52daee11d3 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping, { struct vm_area_struct *vma; struct mm_struct *mm; - struct prio_tree_iter iter; unsigned long address; pte_t *pte; pte_t pteval; @@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping, retry: mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); diff --git a/mm/fremap.c b/mm/fremap.c index 3d731a49878..3899a86851c 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -214,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); vma->vm_flags |= VM_NONLINEAR; - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f1bb534254f..c9b40e3a993 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2474,7 +2474,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; struct address_space *mapping; - struct prio_tree_iter iter; pgoff_t pgoff; /* @@ -2491,7 +2490,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * __unmap_hugepage_range() is called as the lock is already held */ mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) continue; diff --git a/mm/interval_tree.c b/mm/interval_tree.c new file mode 100644 index 00000000000..7dc565660e5 --- /dev/null +++ b/mm/interval_tree.c @@ -0,0 +1,61 @@ +/* + * mm/interval_tree.c - interval tree for mapping->i_mmap + * + * Copyright (C) 2012, Michel Lespinasse + * + * This file is released under the GPL v2. + */ + +#include +#include + +#define ITSTRUCT struct vm_area_struct +#define ITRB shared.linear.rb +#define ITTYPE unsigned long +#define ITSUBTREE shared.linear.rb_subtree_last +#define ITSTART(n) ((n)->vm_pgoff) +#define ITLAST(n) ((n)->vm_pgoff + \ + (((n)->vm_end - (n)->vm_start) >> PAGE_SHIFT) - 1) +#define ITSTATIC +#define ITPREFIX vma_interval_tree + +#include + +/* Insert old immediately after vma in the interval tree */ +void vma_interval_tree_add(struct vm_area_struct *vma, + struct vm_area_struct *old, + struct address_space *mapping) +{ + struct rb_node **link; + struct vm_area_struct *parent; + unsigned long last; + + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + list_add(&vma->shared.nonlinear, &old->shared.nonlinear); + return; + } + + last = ITLAST(vma); + + if (!old->shared.linear.rb.rb_right) { + parent = old; + link = &old->shared.linear.rb.rb_right; + } else { + parent = rb_entry(old->shared.linear.rb.rb_right, + struct vm_area_struct, shared.linear.rb); + if (parent->shared.linear.rb_subtree_last < last) + parent->shared.linear.rb_subtree_last = last; + while (parent->shared.linear.rb.rb_left) { + parent = rb_entry(parent->shared.linear.rb.rb_left, + struct vm_area_struct, shared.linear.rb); + if (parent->shared.linear.rb_subtree_last < last) + parent->shared.linear.rb_subtree_last = last; + } + link = &parent->shared.linear.rb.rb_left; + } + + vma->shared.linear.rb_subtree_last = last; + rb_link_node(&vma->shared.linear.rb, &parent->shared.linear.rb, link); + rb_insert_augmented(&vma->shared.linear.rb, &mapping->i_mmap, + &vma_interval_tree_augment_callbacks); +} diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a6e2141a661..c38a6257d08 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -431,7 +431,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, { struct vm_area_struct *vma; struct task_struct *tsk; - struct prio_tree_iter iter; struct address_space *mapping = page->mapping; mutex_lock(&mapping->i_mmap_mutex); @@ -442,7 +441,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, if (!task_early_kill(tsk)) continue; - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { /* * Send early kill signal to tasks where a vma covers diff --git a/mm/memory.c b/mm/memory.c index e09c0481318..d205e4381a3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2801,14 +2801,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } -static inline void unmap_mapping_range_tree(struct prio_tree_root *root, +static inline void unmap_mapping_range_tree(struct rb_root *root, struct zap_details *details) { struct vm_area_struct *vma; - struct prio_tree_iter iter; pgoff_t vba, vea, zba, zea; - vma_prio_tree_foreach(vma, &iter, root, + vma_interval_tree_foreach(vma, root, details->first_index, details->last_index) { vba = vma->vm_pgoff; @@ -2839,7 +2838,7 @@ static inline void unmap_mapping_range_list(struct list_head *head, * across *all* the pages in each nonlinear VMA, not just the pages * whose virtual address lies outside the file truncation point. */ - list_for_each_entry(vma, head, shared.vm_set.list) { + list_for_each_entry(vma, head, shared.nonlinear) { details->nonlinear_vma = vma; unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); } @@ -2883,7 +2882,7 @@ void unmap_mapping_range(struct address_space *mapping, mutex_lock(&mapping->i_mmap_mutex); - if (unlikely(!prio_tree_empty(&mapping->i_mmap))) + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); diff --git a/mm/mmap.c b/mm/mmap.c index e3c365ff1b6..5ac533f88e9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -199,14 +199,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) - list_del_init(&vma->shared.vm_set.list); + list_del_init(&vma->shared.nonlinear); else - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* - * Unlink a file-based vm structure from its prio_tree, to hide + * Unlink a file-based vm structure from its interval tree, to hide * vma from rmap and vmtruncate before freeing its page tables. */ void unlink_file_vma(struct vm_area_struct *vma) @@ -411,7 +411,7 @@ static void __vma_link_file(struct vm_area_struct *vma) if (unlikely(vma->vm_flags & VM_NONLINEAR)) vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); else - vma_prio_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } } @@ -449,7 +449,7 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, /* * Helper for vma_adjust() in the split_vma insert case: insert a vma into the - * mm's list and rbtree. It has already been inserted into the prio_tree. + * mm's list and rbtree. It has already been inserted into the interval tree. */ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { @@ -491,7 +491,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *next = vma->vm_next; struct vm_area_struct *importer = NULL; struct address_space *mapping = NULL; - struct prio_tree_root *root = NULL; + struct rb_root *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; long adjust_next = 0; @@ -554,7 +554,7 @@ again: remove_next = 1 + (end > next->vm_end); mutex_lock(&mapping->i_mmap_mutex); if (insert) { /* - * Put into prio_tree now, so instantiated pages + * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. @@ -582,9 +582,9 @@ again: remove_next = 1 + (end > next->vm_end); if (root) { flush_dcache_mmap_lock(mapping); - vma_prio_tree_remove(vma, root); + vma_interval_tree_remove(vma, root); if (adjust_next) - vma_prio_tree_remove(next, root); + vma_interval_tree_remove(next, root); } vma->vm_start = start; @@ -597,8 +597,8 @@ again: remove_next = 1 + (end > next->vm_end); if (root) { if (adjust_next) - vma_prio_tree_insert(next, root); - vma_prio_tree_insert(vma, root); + vma_interval_tree_insert(next, root); + vma_interval_tree_insert(vma, root); flush_dcache_mmap_unlock(mapping); } diff --git a/mm/nommu.c b/mm/nommu.c index 12e84e69dd0..45131b41bcd 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); - vma_prio_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -2044,7 +2044,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, size_t newsize) { struct vm_area_struct *vma; - struct prio_tree_iter iter; struct vm_region *region; pgoff_t low, high; size_t r_size, r_top; @@ -2056,8 +2055,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, mutex_lock(&inode->i_mapping->i_mmap_mutex); /* search for VMAs that fall within the dead zone */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - low, high) { + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { /* found one - only interested if it's shared out of the page * cache */ if (vma->vm_flags & VM_SHARED) { @@ -2073,8 +2071,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, * we don't check for any regions that start beyond the EOF as there * shouldn't be any */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - 0, ULONG_MAX) { + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, + 0, ULONG_MAX) { if (!(vma->vm_flags & VM_SHARED)) continue; diff --git a/mm/prio_tree.c b/mm/prio_tree.c deleted file mode 100644 index 799dcfd7cd8..00000000000 --- a/mm/prio_tree.c +++ /dev/null @@ -1,208 +0,0 @@ -/* - * mm/prio_tree.c - priority search tree for mapping->i_mmap - * - * Copyright (C) 2004, Rajesh Venkatasubramanian - * - * This file is released under the GPL v2. - * - * Based on the radix priority search tree proposed by Edward M. McCreight - * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 - * - * 02Feb2004 Initial version - */ - -#include -#include -#include - -/* - * See lib/prio_tree.c for details on the general radix priority search tree - * code. - */ - -/* - * The following #defines are mirrored from lib/prio_tree.c. They're only used - * for debugging, and should be removed (along with the debugging code using - * them) when switching also VMAs to the regular prio_tree code. - */ - -#define RADIX_INDEX(vma) ((vma)->vm_pgoff) -#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) -/* avoid overflow */ -#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) - -/* - * Radix priority search tree for address_space->i_mmap - * - * For each vma that map a unique set of file pages i.e., unique [radix_index, - * heap_index] value, we have a corresponding priority search tree node. If - * multiple vmas have identical [radix_index, heap_index] value, then one of - * them is used as a tree node and others are stored in a vm_set list. The tree - * node points to the first vma (head) of the list using vm_set.head. - * - * prio_tree_root - * | - * A vm_set.head - * / \ / - * L R -> H-I-J-K-M-N-O-P-Q-S - * ^ ^ <-- vm_set.list --> - * tree nodes - * - * We need some way to identify whether a vma is a tree node, head of a vm_set - * list, or just a member of a vm_set list. We cannot use vm_flags to store - * such information. The reason is, in the above figure, it is possible that - * vm_flags' of R and H are covered by the different mmap_sems. When R is - * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold - * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now. - * That's why some trick involving shared.vm_set.parent is used for identifying - * tree nodes and list head nodes. - * - * vma radix priority search tree node rules: - * - * vma->shared.vm_set.parent != NULL ==> a tree node - * vma->shared.vm_set.head != NULL ==> list of others mapping same range - * vma->shared.vm_set.head == NULL ==> no others map the same range - * - * vma->shared.vm_set.parent == NULL - * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range - * vma->shared.vm_set.head == NULL ==> a list node - */ - -/* - * Add a new vma known to map the same set of pages as the old vma: - * useful for fork's dup_mmap as well as vma_prio_tree_insert below. - * Note that it just happens to work correctly on i_mmap_nonlinear too. - */ -void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old) -{ - /* Leave these BUG_ONs till prio_tree patch stabilizes */ - BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); - BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); - - vma->shared.vm_set.head = NULL; - vma->shared.vm_set.parent = NULL; - - if (!old->shared.vm_set.parent) - list_add(&vma->shared.vm_set.list, - &old->shared.vm_set.list); - else if (old->shared.vm_set.head) - list_add_tail(&vma->shared.vm_set.list, - &old->shared.vm_set.head->shared.vm_set.list); - else { - INIT_LIST_HEAD(&vma->shared.vm_set.list); - vma->shared.vm_set.head = old; - old->shared.vm_set.head = vma; - } -} - -void vma_prio_tree_insert(struct vm_area_struct *vma, - struct prio_tree_root *root) -{ - struct prio_tree_node *ptr; - struct vm_area_struct *old; - - vma->shared.vm_set.head = NULL; - - ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node); - if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) { - old = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - vma_prio_tree_add(vma, old); - } -} - -void vma_prio_tree_remove(struct vm_area_struct *vma, - struct prio_tree_root *root) -{ - struct vm_area_struct *node, *head, *new_head; - - if (!vma->shared.vm_set.head) { - if (!vma->shared.vm_set.parent) - list_del_init(&vma->shared.vm_set.list); - else - raw_prio_tree_remove(root, &vma->shared.prio_tree_node); - } else { - /* Leave this BUG_ON till prio_tree patch stabilizes */ - BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma); - if (vma->shared.vm_set.parent) { - head = vma->shared.vm_set.head; - if (!list_empty(&head->shared.vm_set.list)) { - new_head = list_entry( - head->shared.vm_set.list.next, - struct vm_area_struct, - shared.vm_set.list); - list_del_init(&head->shared.vm_set.list); - } else - new_head = NULL; - - raw_prio_tree_replace(root, &vma->shared.prio_tree_node, - &head->shared.prio_tree_node); - head->shared.vm_set.head = new_head; - if (new_head) - new_head->shared.vm_set.head = head; - - } else { - node = vma->shared.vm_set.head; - if (!list_empty(&vma->shared.vm_set.list)) { - new_head = list_entry( - vma->shared.vm_set.list.next, - struct vm_area_struct, - shared.vm_set.list); - list_del_init(&vma->shared.vm_set.list); - node->shared.vm_set.head = new_head; - new_head->shared.vm_set.head = node; - } else - node->shared.vm_set.head = NULL; - } - } -} - -/* - * Helper function to enumerate vmas that map a given file page or a set of - * contiguous file pages. The function returns vmas that at least map a single - * page in the given range of contiguous file pages. - */ -struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, - struct prio_tree_iter *iter) -{ - struct prio_tree_node *ptr; - struct vm_area_struct *next; - - if (!vma) { - /* - * First call is with NULL vma - */ - ptr = prio_tree_next(iter); - if (ptr) { - next = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - prefetch(next->shared.vm_set.head); - return next; - } else - return NULL; - } - - if (vma->shared.vm_set.parent) { - if (vma->shared.vm_set.head) { - next = vma->shared.vm_set.head; - prefetch(next->shared.vm_set.list.next); - return next; - } - } else { - next = list_entry(vma->shared.vm_set.list.next, - struct vm_area_struct, shared.vm_set.list); - if (!next->shared.vm_set.head) { - prefetch(next->shared.vm_set.list.next); - return next; - } - } - - ptr = prio_tree_next(iter); - if (ptr) { - next = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - prefetch(next->shared.vm_set.head); - return next; - } else - return NULL; -} diff --git a/mm/rmap.c b/mm/rmap.c index 0f3b7cda2a2..7b5b51d25fc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -820,7 +820,6 @@ static int page_referenced_file(struct page *page, struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int referenced = 0; /* @@ -846,7 +845,7 @@ static int page_referenced_file(struct page *page, */ mapcount = page_mapcount(page); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; @@ -945,13 +944,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = 0; BUG_ON(PageAnon(page)); mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1547,7 +1545,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = SWAP_AGAIN; unsigned long cursor; unsigned long max_nl_cursor = 0; @@ -1555,7 +1552,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned int mapcount; mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; @@ -1576,7 +1573,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) goto out; list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { + shared.nonlinear) { cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) max_nl_cursor = cursor; @@ -1608,7 +1605,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { + shared.nonlinear) { cursor = (unsigned long) vma->vm_private_data; while ( cursor < max_nl_cursor && cursor < vma->vm_end - vma->vm_start) { @@ -1631,7 +1628,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) * in locked vmas). Reset cursor on all unreserved nonlinear * vmas, now forgetting on which ones it had fallen behind. */ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) vma->vm_private_data = NULL; out: mutex_unlock(&mapping->i_mmap_mutex); @@ -1748,13 +1745,12 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = SWAP_AGAIN; if (!mapping) return ret; mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; -- cgit v1.2.3 From 85d3a316c714197f94e75c1e5b2d37607d66e5de Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:27 -0700 Subject: kmemleak: use rbtree instead of prio tree kmemleak uses a tree where each node represents an allocated memory object in order to quickly find out what object a given address is part of. However, the objects don't overlap, so rbtrees are a better choice than prio tree for this use. They are both faster and have lower memory overhead. Tested by booting a kernel with kmemleak enabled, loading the kmemleak_test module, and looking for the expected messages. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Acked-by: Catalin Marinas Tested-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 100 ++++++++++++++++++++++++++++++---------------------------- 1 file changed, 51 insertions(+), 49 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 0de83b4541e..a217cc54406 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -29,7 +29,7 @@ * - kmemleak_lock (rwlock): protects the object_list modifications and * accesses to the object_tree_root. The object_list is the main list * holding the metadata (struct kmemleak_object) for the allocated memory - * blocks. The object_tree_root is a priority search tree used to look-up + * blocks. The object_tree_root is a red black tree used to look-up * metadata based on a pointer to the corresponding memory block. The * kmemleak_object structures are added to the object_list and * object_tree_root in the create_object() function called from the @@ -71,7 +71,7 @@ #include #include #include -#include +#include #include #include #include @@ -132,7 +132,7 @@ struct kmemleak_scan_area { * Structure holding the metadata for each allocated memory block. * Modifications to such objects should be made while holding the * object->lock. Insertions or deletions from object_list, gray_list or - * tree_node are already protected by the corresponding locks or mutex (see + * rb_node are already protected by the corresponding locks or mutex (see * the notes on locking above). These objects are reference-counted * (use_count) and freed using the RCU mechanism. */ @@ -141,7 +141,7 @@ struct kmemleak_object { unsigned long flags; /* object status flags */ struct list_head object_list; struct list_head gray_list; - struct prio_tree_node tree_node; + struct rb_node rb_node; struct rcu_head rcu; /* object_list lockless traversal */ /* object usage count; object freed when use_count == 0 */ atomic_t use_count; @@ -182,9 +182,9 @@ struct kmemleak_object { static LIST_HEAD(object_list); /* the list of gray-colored objects (see color_gray comment below) */ static LIST_HEAD(gray_list); -/* prio search tree for object boundaries */ -static struct prio_tree_root object_tree_root; -/* rw_lock protecting the access to object_list and prio_tree_root */ +/* search tree for object boundaries */ +static struct rb_root object_tree_root = RB_ROOT; +/* rw_lock protecting the access to object_list and object_tree_root */ static DEFINE_RWLOCK(kmemleak_lock); /* allocation caches for kmemleak internal data */ @@ -380,7 +380,7 @@ static void dump_object_info(struct kmemleak_object *object) trace.entries = object->trace; pr_notice("Object 0x%08lx (size %zu):\n", - object->tree_node.start, object->size); + object->pointer, object->size); pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); @@ -392,32 +392,32 @@ static void dump_object_info(struct kmemleak_object *object) } /* - * Look-up a memory block metadata (kmemleak_object) in the priority search + * Look-up a memory block metadata (kmemleak_object) in the object search * tree based on a pointer value. If alias is 0, only values pointing to the * beginning of the memory block are allowed. The kmemleak_lock must be held * when calling this function. */ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) { - struct prio_tree_node *node; - struct prio_tree_iter iter; - struct kmemleak_object *object; - - prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); - node = prio_tree_next(&iter); - if (node) { - object = prio_tree_entry(node, struct kmemleak_object, - tree_node); - if (!alias && object->pointer != ptr) { + struct rb_node *rb = object_tree_root.rb_node; + + while (rb) { + struct kmemleak_object *object = + rb_entry(rb, struct kmemleak_object, rb_node); + if (ptr < object->pointer) + rb = object->rb_node.rb_left; + else if (object->pointer + object->size <= ptr) + rb = object->rb_node.rb_right; + else if (object->pointer == ptr || alias) + return object; + else { kmemleak_warn("Found object by alias at 0x%08lx\n", ptr); dump_object_info(object); - object = NULL; + break; } - } else - object = NULL; - - return object; + } + return NULL; } /* @@ -471,7 +471,7 @@ static void put_object(struct kmemleak_object *object) } /* - * Look up an object in the prio search tree and increase its use_count. + * Look up an object in the object search tree and increase its use_count. */ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) { @@ -516,8 +516,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, int min_count, gfp_t gfp) { unsigned long flags; - struct kmemleak_object *object; - struct prio_tree_node *node; + struct kmemleak_object *object, *parent; + struct rb_node **link, *rb_parent; object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { @@ -560,31 +560,34 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, /* kernel backtrace */ object->trace_len = __save_stack_trace(object->trace); - INIT_PRIO_TREE_NODE(&object->tree_node); - object->tree_node.start = ptr; - object->tree_node.last = ptr + size - 1; - write_lock_irqsave(&kmemleak_lock, flags); min_addr = min(min_addr, ptr); max_addr = max(max_addr, ptr + size); - node = prio_tree_insert(&object_tree_root, &object->tree_node); - /* - * The code calling the kernel does not yet have the pointer to the - * memory block to be able to free it. However, we still hold the - * kmemleak_lock here in case parts of the kernel started freeing - * random memory blocks. - */ - if (node != &object->tree_node) { - kmemleak_stop("Cannot insert 0x%lx into the object search tree " - "(already existing)\n", ptr); - object = lookup_object(ptr, 1); - spin_lock(&object->lock); - dump_object_info(object); - spin_unlock(&object->lock); - - goto out; + link = &object_tree_root.rb_node; + rb_parent = NULL; + while (*link) { + rb_parent = *link; + parent = rb_entry(rb_parent, struct kmemleak_object, rb_node); + if (ptr + size <= parent->pointer) + link = &parent->rb_node.rb_left; + else if (parent->pointer + parent->size <= ptr) + link = &parent->rb_node.rb_right; + else { + kmemleak_stop("Cannot insert 0x%lx into the object " + "search tree (overlaps existing)\n", + ptr); + kmem_cache_free(object_cache, object); + object = parent; + spin_lock(&object->lock); + dump_object_info(object); + spin_unlock(&object->lock); + goto out; + } } + rb_link_node(&object->rb_node, rb_parent, link); + rb_insert_color(&object->rb_node, &object_tree_root); + list_add_tail_rcu(&object->object_list, &object_list); out: write_unlock_irqrestore(&kmemleak_lock, flags); @@ -600,7 +603,7 @@ static void __delete_object(struct kmemleak_object *object) unsigned long flags; write_lock_irqsave(&kmemleak_lock, flags); - prio_tree_remove(&object_tree_root, &object->tree_node); + rb_erase(&object->rb_node, &object_tree_root); list_del_rcu(&object->object_list); write_unlock_irqrestore(&kmemleak_lock, flags); @@ -1766,7 +1769,6 @@ void __init kmemleak_init(void) object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); - INIT_PRIO_TREE_ROOT(&object_tree_root); if (crt_early_log >= ARRAY_SIZE(early_log)) pr_warning("Early log buffer exceeded (%d), please increase " -- cgit v1.2.3 From 9826a516ff77c5820e591211e4f3e58ff36f46be Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:35 -0700 Subject: mm: interval tree updates Update the generic interval tree code that was introduced in "mm: replace vma prio_tree with an interval tree". Changes: - fixed 'endpoing' typo noticed by Andrew Morton - replaced include/linux/interval_tree_tmpl.h, which was used as a template (including it automatically defined the interval tree functions) with include/linux/interval_tree_generic.h, which only defines a preprocessor macro INTERVAL_TREE_DEFINE(), which itself defines the interval tree functions when invoked. Now that is a very long macro which is unfortunate, but it does make the usage sites (lib/interval_tree.c and mm/interval_tree.c) a bit nicer than previously. - make use of RB_DECLARE_CALLBACKS() in the INTERVAL_TREE_DEFINE() macro, instead of duplicating that code in the interval tree template. - replaced vma_interval_tree_add(), which was actually handling the nonlinear and interval tree cases, with vma_interval_tree_insert_after() which handles only the interval tree case and has an API that is more consistent with the other interval tree handling functions. The nonlinear case is now handled explicitly in kernel/fork.c dup_mmap(). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/interval_tree.c | 60 ++++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 7dc565660e5..4ab7b9ec3a5 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -8,40 +8,38 @@ #include #include +#include -#define ITSTRUCT struct vm_area_struct -#define ITRB shared.linear.rb -#define ITTYPE unsigned long -#define ITSUBTREE shared.linear.rb_subtree_last -#define ITSTART(n) ((n)->vm_pgoff) -#define ITLAST(n) ((n)->vm_pgoff + \ - (((n)->vm_end - (n)->vm_start) >> PAGE_SHIFT) - 1) -#define ITSTATIC -#define ITPREFIX vma_interval_tree - -#include - -/* Insert old immediately after vma in the interval tree */ -void vma_interval_tree_add(struct vm_area_struct *vma, - struct vm_area_struct *old, - struct address_space *mapping) +static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) +{ + return v->vm_pgoff; +} + +static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) +{ + return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; +} + +INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, + unsigned long, shared.linear.rb_subtree_last, + vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) + +/* Insert node immediately after prev in the interval tree */ +void vma_interval_tree_insert_after(struct vm_area_struct *node, + struct vm_area_struct *prev, + struct rb_root *root) { struct rb_node **link; struct vm_area_struct *parent; - unsigned long last; - - if (unlikely(vma->vm_flags & VM_NONLINEAR)) { - list_add(&vma->shared.nonlinear, &old->shared.nonlinear); - return; - } + unsigned long last = vma_last_pgoff(node); - last = ITLAST(vma); + VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev)); - if (!old->shared.linear.rb.rb_right) { - parent = old; - link = &old->shared.linear.rb.rb_right; + if (!prev->shared.linear.rb.rb_right) { + parent = prev; + link = &prev->shared.linear.rb.rb_right; } else { - parent = rb_entry(old->shared.linear.rb.rb_right, + parent = rb_entry(prev->shared.linear.rb.rb_right, struct vm_area_struct, shared.linear.rb); if (parent->shared.linear.rb_subtree_last < last) parent->shared.linear.rb_subtree_last = last; @@ -54,8 +52,8 @@ void vma_interval_tree_add(struct vm_area_struct *vma, link = &parent->shared.linear.rb.rb_left; } - vma->shared.linear.rb_subtree_last = last; - rb_link_node(&vma->shared.linear.rb, &parent->shared.linear.rb, link); - rb_insert_augmented(&vma->shared.linear.rb, &mapping->i_mmap, - &vma_interval_tree_augment_callbacks); + node->shared.linear.rb_subtree_last = last; + rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); + rb_insert_augmented(&node->shared.linear.rb, root, + &vma_interval_tree_augment); } -- cgit v1.2.3 From 108d6642ad81bb1d62b401490a334d2c12397517 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:36 -0700 Subject: mm anon rmap: remove anon_vma_moveto_tail mremap() had a clever optimization where move_ptes() did not take the anon_vma lock to avoid a race with anon rmap users such as page migration. Instead, the avc's were ordered in such a way that the origin vma was always visited by rmap before the destination. This ordering and the use of page table locks rmap usage safe. However, we want to replace the use of linked lists in anon rmap with an interval tree, and this will make it harder to impose such ordering as the interval tree will always be sorted by the avc->vma->vm_pgoff value. For now, let's replace the anon_vma_moveto_tail() ordering function with proper anon_vma locking in move_ptes(). Once we have the anon interval tree in place, we will re-introduce an optimization to avoid taking these locks in the most common cases. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 3 +-- mm/mremap.c | 14 +++++--------- mm/rmap.c | 45 --------------------------------------------- 3 files changed, 6 insertions(+), 56 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 5ac533f88e9..66984aab791 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2378,8 +2378,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, */ VM_BUG_ON(faulted_in_anon_vma); *vmap = new_vma; - } else - anon_vma_moveto_tail(new_vma); + } } else { new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { diff --git a/mm/mremap.c b/mm/mremap.c index cc06d0e48d0..5588bb6e929 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -74,6 +74,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long new_addr) { struct address_space *mapping = NULL; + struct anon_vma *anon_vma = vma->anon_vma; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; @@ -88,6 +89,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, mapping = vma->vm_file->f_mapping; mutex_lock(&mapping->i_mmap_mutex); } + if (anon_vma) + anon_vma_lock(anon_vma); /* * We don't have to worry about the ordering of src and dst @@ -114,6 +117,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, spin_unlock(new_ptl); pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); + if (anon_vma) + anon_vma_unlock(anon_vma); if (mapping) mutex_unlock(&mapping->i_mmap_mutex); } @@ -220,15 +225,6 @@ static unsigned long move_vma(struct vm_area_struct *vma, moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); if (moved_len < old_len) { - /* - * Before moving the page tables from the new vma to - * the old vma, we need to be sure the old vma is - * queued after new vma in the same_anon_vma list to - * prevent SMP races with rmap_walk (that could lead - * rmap_walk to miss some page table). - */ - anon_vma_moveto_tail(vma); - /* * On error, move entries back from new area to old, * which will succeed since page tables still there, diff --git a/mm/rmap.c b/mm/rmap.c index 7b5b51d25fc..8cbd62fde0f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -268,51 +268,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) return -ENOMEM; } -/* - * Some rmap walk that needs to find all ptes/hugepmds without false - * negatives (like migrate and split_huge_page) running concurrent - * with operations that copy or move pagetables (like mremap() and - * fork()) to be safe. They depend on the anon_vma "same_anon_vma" - * list to be in a certain order: the dst_vma must be placed after the - * src_vma in the list. This is always guaranteed by fork() but - * mremap() needs to call this function to enforce it in case the - * dst_vma isn't newly allocated and chained with the anon_vma_clone() - * function but just an extension of a pre-existing vma through - * vma_merge. - * - * NOTE: the same_anon_vma list can still be changed by other - * processes while mremap runs because mremap doesn't hold the - * anon_vma mutex to prevent modifications to the list while it - * runs. All we need to enforce is that the relative order of this - * process vmas isn't changing (we don't care about other vmas - * order). Each vma corresponds to an anon_vma_chain structure so - * there's no risk that other processes calling anon_vma_moveto_tail() - * and changing the same_anon_vma list under mremap() will screw with - * the relative order of this process vmas in the list, because we - * they can't alter the order of any vma that belongs to this - * process. And there can't be another anon_vma_moveto_tail() running - * concurrently with mremap() coming from this process because we hold - * the mmap_sem for the whole mremap(). fork() ordering dependency - * also shouldn't be affected because fork() only cares that the - * parent vmas are placed in the list before the child vmas and - * anon_vma_moveto_tail() won't reorder vmas from either the fork() - * parent or child. - */ -void anon_vma_moveto_tail(struct vm_area_struct *dst) -{ - struct anon_vma_chain *pavc; - struct anon_vma *root = NULL; - - list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) { - struct anon_vma *anon_vma = pavc->anon_vma; - VM_BUG_ON(pavc->vma != dst); - root = lock_anon_vma_root(root, anon_vma); - list_del(&pavc->same_anon_vma); - list_add_tail(&pavc->same_anon_vma, &anon_vma->head); - } - unlock_anon_vma_root(root); -} - /* * Attach vma to its own anon_vma, as well as to the anon_vmas that * the corresponding VMA in the parent process is attached to. -- cgit v1.2.3 From bf181b9f9d8dfbba58b23441ad60d0bc33806d64 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:39 -0700 Subject: mm anon rmap: replace same_anon_vma linked list with an interval tree. When a large VMA (anon or private file mapping) is first touched, which will populate its anon_vma field, and then split into many regions through the use of mprotect(), the original anon_vma ends up linking all of the vmas on a linked list. This can cause rmap to become inefficient, as we have to walk potentially thousands of irrelevent vmas before finding the one a given anon page might fall into. By replacing the same_anon_vma linked list with an interval tree (where each avc's interval is determined by its vma's start and last pgoffs), we can make rmap efficient for this use case again. While the change is large, all of its pieces are fairly simple. Most places that were walking the same_anon_vma list were looking for a known pgoff, so they can just use the anon_vma_interval_tree_foreach() interval tree iterator instead. The exception here is ksm, where the page's index is not known. It would probably be possible to rework ksm so that the index would be known, but for now I have decided to keep things simple and just walk the entirety of the interval tree there. When updating vma's that already have an anon_vma assigned, we must take care to re-index the corresponding avc's on their interval tree. This is done through the use of anon_vma_interval_tree_pre_update_vma() and anon_vma_interval_tree_post_update_vma(), which remove the avc's from their interval tree before the update and re-insert them after the update. The anon_vma stays locked during the update, so there is no chance that rmap would miss the vmas that are being updated. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 ++-- mm/interval_tree.c | 14 ++++++++++ mm/ksm.c | 9 ++++--- mm/memory-failure.c | 5 +++- mm/mmap.c | 73 ++++++++++++++++++++++++++++++++++++++++------------- mm/rmap.c | 24 +++++++++--------- 6 files changed, 94 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 010d32944d1..ce59ada0946 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1375,13 +1375,14 @@ static void __split_huge_page(struct page *page, struct anon_vma *anon_vma) { int mapcount, mapcount2; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct anon_vma_chain *avc; BUG_ON(!PageHead(page)); BUG_ON(PageTail(page)); mapcount = 0; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); @@ -1407,7 +1408,7 @@ static void __split_huge_page(struct page *page, __split_huge_page_refcount(page); mapcount2 = 0; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 4ab7b9ec3a5..f7c72cd35e1 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -8,6 +8,7 @@ #include #include +#include #include static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) @@ -57,3 +58,16 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, rb_insert_augmented(&node->shared.linear.rb, root, &vma_interval_tree_augment); } + +static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc) +{ + return vma_start_pgoff(avc->vma); +} + +static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc) +{ + return vma_last_pgoff(avc->vma); +} + +INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, + avc_start_pgoff, avc_last_pgoff,, anon_vma_interval_tree) diff --git a/mm/ksm.c b/mm/ksm.c index 9638620a753..14ee5cf8a51 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1618,7 +1618,8 @@ again: struct vm_area_struct *vma; anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) @@ -1671,7 +1672,8 @@ again: struct vm_area_struct *vma; anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) @@ -1723,7 +1725,8 @@ again: struct vm_area_struct *vma; anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c38a6257d08..6c5899b9034 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct vm_area_struct *vma; struct task_struct *tsk; struct anon_vma *av; + pgoff_t pgoff; av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ return; + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); read_lock(&tasklist_lock); for_each_process (tsk) { struct anon_vma_chain *vmac; if (!task_early_kill(tsk)) continue; - list_for_each_entry(vmac, &av->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &av->rb_root, + pgoff, pgoff) { vma = vmac->vma; if (!page_mapped_in_vma(page, vma)) continue; diff --git a/mm/mmap.c b/mm/mmap.c index 66984aab791..2e580ed7921 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -353,6 +353,38 @@ void validate_mm(struct mm_struct *mm) #define validate_mm(mm) do { } while (0) #endif +/* + * vma has some anon_vma assigned, and is already inserted on that + * anon_vma's interval trees. + * + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the + * vma must be removed from the anon_vma's interval trees using + * anon_vma_interval_tree_pre_update_vma(). + * + * After the update, the vma will be reinserted using + * anon_vma_interval_tree_post_update_vma(). + * + * The entire update must be protected by exclusive mmap_sem and by + * the root anon_vma's mutex. + */ +static inline void +anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); +} + +static inline void +anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); +} + static int find_vma_links(struct mm_struct *mm, unsigned long addr, unsigned long end, struct vm_area_struct **pprev, struct rb_node ***rb_link, struct rb_node **rb_parent) @@ -565,20 +597,17 @@ again: remove_next = 1 + (end > next->vm_end); vma_adjust_trans_huge(vma, start, end, adjust_next); - /* - * When changing only vma->vm_end, we don't really need anon_vma - * lock. This is a fairly rare case by itself, but the anon_vma - * lock may be shared between many sibling processes. Skipping - * the lock for brk adjustments makes a difference sometimes. - */ - if (vma->anon_vma && (importer || start != vma->vm_start)) { - anon_vma = vma->anon_vma; + anon_vma = vma->anon_vma; + if (!anon_vma && adjust_next) + anon_vma = next->anon_vma; + if (anon_vma) { VM_BUG_ON(adjust_next && next->anon_vma && anon_vma != next->anon_vma); - } else if (adjust_next && next->anon_vma) - anon_vma = next->anon_vma; - if (anon_vma) anon_vma_lock(anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + if (adjust_next) + anon_vma_interval_tree_pre_update_vma(next); + } if (root) { flush_dcache_mmap_lock(mapping); @@ -619,8 +648,12 @@ again: remove_next = 1 + (end > next->vm_end); __insert_vm_struct(mm, insert); } - if (anon_vma) + if (anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + if (adjust_next) + anon_vma_interval_tree_post_update_vma(next); anon_vma_unlock(anon_vma); + } if (mapping) mutex_unlock(&mapping->i_mmap_mutex); @@ -1748,7 +1781,9 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { + anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; + anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } @@ -1798,8 +1833,10 @@ int expand_downwards(struct vm_area_struct *vma, if (grow <= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { + anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; + anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } @@ -2515,7 +2552,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { - if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { + if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. @@ -2531,7 +2568,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * anon_vma->root->mutex. */ if (__test_and_set_bit(0, (unsigned long *) - &anon_vma->root->head.next)) + &anon_vma->root->rb_root.rb_node)) BUG(); } } @@ -2572,7 +2609,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * A single task can't take more than one mm_take_all_locks() in a row * or it would deadlock. * - * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in + * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in * mapping->flags avoid to take the same lock twice, if more than one * vma in this mm is backed by the same anon_vma or address_space. * @@ -2619,13 +2656,13 @@ out_unlock: static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { - if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { + if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. * * We must however clear the bitflag before unlocking - * the vma so the users using the anon_vma->head will + * the vma so the users using the anon_vma->rb_root will * never see our bitflag. * * No need of atomic instructions here, head.next @@ -2633,7 +2670,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * anon_vma->root->mutex. */ if (!__test_and_clear_bit(0, (unsigned long *) - &anon_vma->root->head.next)) + &anon_vma->root->rb_root.rb_node)) BUG(); anon_vma_unlock(anon_vma); } diff --git a/mm/rmap.c b/mm/rmap.c index 8cbd62fde0f..9c61bf387fd 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -127,12 +127,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, avc->vma = vma; avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); - - /* - * It's critical to add new vmas to the tail of the anon_vma, - * see comment in huge_memory.c:__split_huge_page(). - */ - list_add_tail(&avc->same_anon_vma, &anon_vma->head); + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); } /** @@ -336,13 +331,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma) struct anon_vma *anon_vma = avc->anon_vma; root = lock_anon_vma_root(root, anon_vma); - list_del(&avc->same_anon_vma); + anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); /* * Leave empty anon_vmas on the list - we'll need * to free them outside the lock. */ - if (list_empty(&anon_vma->head)) + if (RB_EMPTY_ROOT(&anon_vma->rb_root)) continue; list_del(&avc->same_vma); @@ -371,7 +366,7 @@ static void anon_vma_ctor(void *data) mutex_init(&anon_vma->mutex); atomic_set(&anon_vma->refcount, 0); - INIT_LIST_HEAD(&anon_vma->head); + anon_vma->rb_root = RB_ROOT; } void __init anon_vma_init(void) @@ -724,6 +719,7 @@ static int page_referenced_anon(struct page *page, { unsigned int mapcount; struct anon_vma *anon_vma; + pgoff_t pgoff; struct anon_vma_chain *avc; int referenced = 0; @@ -732,7 +728,8 @@ static int page_referenced_anon(struct page *page, return referenced; mapcount = page_mapcount(page); - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1445,6 +1442,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma) static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) { struct anon_vma *anon_vma; + pgoff_t pgoff; struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1452,7 +1450,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) if (!anon_vma) return ret; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address; @@ -1668,6 +1667,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, struct vm_area_struct *, unsigned long, void *), void *arg) { struct anon_vma *anon_vma; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1681,7 +1681,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, if (!anon_vma) return ret; anon_vma_lock(anon_vma); - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); if (address == -EFAULT) -- cgit v1.2.3 From 86c2ad19956f84f2191e062fcb979367b6365871 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:42 -0700 Subject: mm rmap: remove vma_address check for address inside vma In file and anon rmap, we use interval trees to find potentially relevant vmas and then call vma_address() to find the virtual address the given page might be found at in these vmas. vma_address() used to include a check that the returned address falls within the limits of the vma, but this check isn't necessary now that we always use interval trees in rmap: the interval tree just doesn't return any vmas which this check would find to be irrelevant. As a result, we can replace the use of -EFAULT error code (which then needed to be checked in every call site) with a VM_BUG_ON(). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ---- mm/rmap.c | 48 +++++++++++++++++++++--------------------------- 2 files changed, 21 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ce59ada0946..7cf8b0ec11e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1386,8 +1386,6 @@ static void __split_huge_page(struct page *page, struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); - if (addr == -EFAULT) - continue; mapcount += __split_huge_page_splitting(page, vma, addr); } /* @@ -1412,8 +1410,6 @@ static void __split_huge_page(struct page *page, struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); - if (addr == -EFAULT) - continue; mapcount2 += __split_huge_page_map(page, vma, addr); } if (mapcount != mapcount2) diff --git a/mm/rmap.c b/mm/rmap.c index 9c61bf387fd..28777412de6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -510,22 +510,26 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) /* * At what user virtual address is page expected in @vma? - * Returns virtual address or -EFAULT if page's index/offset is not - * within the range mapped the @vma. */ -inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) +static inline unsigned long +__vma_address(struct page *page, struct vm_area_struct *vma) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - unsigned long address; if (unlikely(is_vm_hugetlb_page(vma))) pgoff = page->index << huge_page_order(page_hstate(page)); - address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { - /* page should be within @vma mapping range */ - return -EFAULT; - } + + return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); +} + +inline unsigned long +vma_address(struct page *page, struct vm_area_struct *vma) +{ + unsigned long address = __vma_address(page, vma); + + /* page should be within @vma mapping range */ + VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); + return address; } @@ -535,6 +539,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { + unsigned long address; if (PageAnon(page)) { struct anon_vma *page__anon_vma = page_anon_vma(page); /* @@ -550,7 +555,10 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return -EFAULT; } else return -EFAULT; - return vma_address(page, vma); + address = __vma_address(page, vma); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + return -EFAULT; + return address; } /* @@ -624,8 +632,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) pte_t *pte; spinlock_t *ptl; - address = vma_address(page, vma); - if (address == -EFAULT) /* out of vma range */ + address = __vma_address(page, vma); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) return 0; pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); if (!pte) /* the page is not in this mm */ @@ -732,8 +740,6 @@ static int page_referenced_anon(struct page *page, anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; /* * If we are reclaiming on behalf of a cgroup, skip * counting on behalf of references from different @@ -799,8 +805,6 @@ static int page_referenced_file(struct page *page, vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; /* * If we are reclaiming on behalf of a cgroup, skip * counting on behalf of references from different @@ -904,8 +908,6 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret += page_mkclean_one(page, vma, address); } } @@ -1468,8 +1470,6 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) continue; address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = try_to_unmap_one(page, vma, address, flags); if (ret != SWAP_AGAIN || !page_mapped(page)) break; @@ -1508,8 +1508,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = try_to_unmap_one(page, vma, address, flags); if (ret != SWAP_AGAIN || !page_mapped(page)) goto out; @@ -1684,8 +1682,6 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = rmap_one(page, vma, address, arg); if (ret != SWAP_AGAIN) break; @@ -1707,8 +1703,6 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = rmap_one(page, vma, address, arg); if (ret != SWAP_AGAIN) break; -- cgit v1.2.3 From ed8ea8150182f8d715fceb3b175ef0a9ebacd872 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:45 -0700 Subject: mm: add CONFIG_DEBUG_VM_RB build option Add a CONFIG_DEBUG_VM_RB build option for the previously existing DEBUG_MM_RB code. Now that Andi Kleen modified it to avoid using recursive algorithms, we can expose it a bit more. Also extend this code to validate_mm() after stack expansion, and to check that the vma's start and last pgoffs have not changed since the nodes were inserted on the anon vma interval tree (as it is important that the nodes be reindexed after each such update). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/interval_tree.c | 41 ++++++++++++++++++++++++++++++++++++++++- mm/mmap.c | 19 +++++++++---------- 2 files changed, 49 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/interval_tree.c b/mm/interval_tree.c index f7c72cd35e1..4a5822a586e 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -70,4 +70,43 @@ static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc) } INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, - avc_start_pgoff, avc_last_pgoff,, anon_vma_interval_tree) + avc_start_pgoff, avc_last_pgoff, + static inline, __anon_vma_interval_tree) + +void anon_vma_interval_tree_insert(struct anon_vma_chain *node, + struct rb_root *root) +{ +#ifdef CONFIG_DEBUG_VM_RB + node->cached_vma_start = avc_start_pgoff(node); + node->cached_vma_last = avc_last_pgoff(node); +#endif + __anon_vma_interval_tree_insert(node, root); +} + +void anon_vma_interval_tree_remove(struct anon_vma_chain *node, + struct rb_root *root) +{ + __anon_vma_interval_tree_remove(node, root); +} + +struct anon_vma_chain * +anon_vma_interval_tree_iter_first(struct rb_root *root, + unsigned long first, unsigned long last) +{ + return __anon_vma_interval_tree_iter_first(root, first, last); +} + +struct anon_vma_chain * +anon_vma_interval_tree_iter_next(struct anon_vma_chain *node, + unsigned long first, unsigned long last) +{ + return __anon_vma_interval_tree_iter_next(node, first, last); +} + +#ifdef CONFIG_DEBUG_VM_RB +void anon_vma_interval_tree_verify(struct anon_vma_chain *node) +{ + WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node)); + WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node)); +} +#endif diff --git a/mm/mmap.c b/mm/mmap.c index 2e580ed7921..deb422c39e2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); -/* - * WARNING: the debugging will use recursive algorithms so never enable this - * unless you know what you are doing. - */ -#undef DEBUG_MM_RB - /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected * behavior is in parens: @@ -303,7 +297,7 @@ out: return retval; } -#ifdef DEBUG_MM_RB +#ifdef CONFIG_DEBUG_VM_RB static int browse_rb(struct rb_root *root) { int i = 0, j; @@ -337,9 +331,12 @@ void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; - struct vm_area_struct *tmp = mm->mmap; - while (tmp) { - tmp = tmp->vm_next; + struct vm_area_struct *vma = mm->mmap; + while (vma) { + struct anon_vma_chain *avc; + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_verify(avc); + vma = vma->vm_next; i++; } if (i != mm->map_count) @@ -1790,6 +1787,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } vma_unlock_anon_vma(vma); khugepaged_enter_vma_merge(vma); + validate_mm(vma->vm_mm); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -1843,6 +1841,7 @@ int expand_downwards(struct vm_area_struct *vma, } vma_unlock_anon_vma(vma); khugepaged_enter_vma_merge(vma); + validate_mm(vma->vm_mm); return error; } -- cgit v1.2.3 From 523d4e2008fd4a68b1a164e63e8c75b7b20f07e0 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:48 -0700 Subject: mm anon rmap: in mremap, set the new vma's position before anon_vma_clone() anon_vma_clone() expects new_vma->vm_{start,end,pgoff} to be correctly set so that the new vma can be indexed on the anon interval tree. copy_vma() was failing to do that, which broke mremap(). Signed-off-by: Michel Lespinasse Cc: Jiri Slaby Cc: Hugh Dickins Tested-by: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index deb422c39e2..81248992120 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2419,16 +2419,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { *new_vma = *vma; + new_vma->vm_start = addr; + new_vma->vm_end = addr + len; + new_vma->vm_pgoff = pgoff; pol = mpol_dup(vma_policy(vma)); if (IS_ERR(pol)) goto out_free_vma; + vma_set_policy(new_vma, pol); INIT_LIST_HEAD(&new_vma->anon_vma_chain); if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; - vma_set_policy(new_vma, pol); - new_vma->vm_start = addr; - new_vma->vm_end = addr + len; - new_vma->vm_pgoff = pgoff; if (new_vma->vm_file) get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) -- cgit v1.2.3 From 38a76013ad809beb0b52f60d365c960d035bd83c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:50 -0700 Subject: mm: avoid taking rmap locks in move_ptes() During mremap(), the destination VMA is generally placed after the original vma in rmap traversal order: in move_vma(), we always have new_pgoff >= vma->vm_pgoff, and as a result new_vma->vm_pgoff >= vma->vm_pgoff unless vma_merge() merged the new vma with an adjacent one. When the destination VMA is placed after the original in rmap traversal order, we can avoid taking the rmap locks in move_ptes(). Essentially, this reintroduces the optimization that had been disabled in "mm anon rmap: remove anon_vma_moveto_tail". The difference is that we don't try to impose the rmap traversal order; instead we just rely on things being in the desired order in the common case and fall back to taking locks in the uncommon case. Also we skip the i_mmap_mutex in addition to the anon_vma lock: in both cases, the vmas are traversed in increasing vm_pgoff order with ties resolved in tree insertion order. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 7 +++++-- mm/mremap.c | 57 +++++++++++++++++++++++++++++++++++++++------------------ 2 files changed, 44 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 81248992120..2d942353d68 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2371,7 +2371,8 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) * prior to moving page table entries, to effect an mremap move. */ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, - unsigned long addr, unsigned long len, pgoff_t pgoff) + unsigned long addr, unsigned long len, pgoff_t pgoff, + bool *need_rmap_locks) { struct vm_area_struct *vma = *vmap; unsigned long vma_start = vma->vm_start; @@ -2413,8 +2414,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, * linear if there are no pages mapped yet. */ VM_BUG_ON(faulted_in_anon_vma); - *vmap = new_vma; + *vmap = vma = new_vma; } + *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { @@ -2434,6 +2436,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); vma_link(mm, new_vma, prev, rb_link, rb_parent); + *need_rmap_locks = false; } } return new_vma; diff --git a/mm/mremap.c b/mm/mremap.c index 5588bb6e929..3b639a4b26b 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -71,26 +71,42 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long old_addr, unsigned long old_end, struct vm_area_struct *new_vma, pmd_t *new_pmd, - unsigned long new_addr) + unsigned long new_addr, bool need_rmap_locks) { struct address_space *mapping = NULL; - struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma *anon_vma = NULL; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; - if (vma->vm_file) { - /* - * Subtle point from Rajesh Venkatasubramanian: before - * moving file-based ptes, we must lock truncate_pagecache - * out, since it might clean the dst vma before the src vma, - * and we propagate stale pages into the dst afterward. - */ - mapping = vma->vm_file->f_mapping; - mutex_lock(&mapping->i_mmap_mutex); + /* + * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma + * locks to ensure that rmap will always observe either the old or the + * new ptes. This is the easiest way to avoid races with + * truncate_pagecache(), page migration, etc... + * + * When need_rmap_locks is false, we use other ways to avoid + * such races: + * + * - During exec() shift_arg_pages(), we use a specially tagged vma + * which rmap call sites look for using is_vma_temporary_stack(). + * + * - During mremap(), new_vma is often known to be placed after vma + * in rmap traversal order. This ensures rmap will always observe + * either the old pte, or the new pte, or both (the page table locks + * serialize access to individual ptes, but only rmap traversal + * order guarantees that we won't miss both the old and new ptes). + */ + if (need_rmap_locks) { + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + mutex_lock(&mapping->i_mmap_mutex); + } + if (vma->anon_vma) { + anon_vma = vma->anon_vma; + anon_vma_lock(anon_vma); + } } - if (anon_vma) - anon_vma_lock(anon_vma); /* * We don't have to worry about the ordering of src and dst @@ -127,7 +143,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, - unsigned long new_addr, unsigned long len) + unsigned long new_addr, unsigned long len, + bool need_rmap_locks) { unsigned long extent, next, old_end; pmd_t *old_pmd, *new_pmd; @@ -174,7 +191,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (extent > LATENCY_LIMIT) extent = LATENCY_LIMIT; move_ptes(vma, old_pmd, old_addr, old_addr + extent, - new_vma, new_pmd, new_addr); + new_vma, new_pmd, new_addr, need_rmap_locks); need_flush = true; } if (likely(need_flush)) @@ -198,6 +215,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long hiwater_vm; int split = 0; int err; + bool need_rmap_locks; /* * We'd prefer to avoid failure later on in do_munmap: @@ -219,18 +237,21 @@ static unsigned long move_vma(struct vm_area_struct *vma, return err; new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); - new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); + new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, + &need_rmap_locks); if (!new_vma) return -ENOMEM; - moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); + moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, + need_rmap_locks); if (moved_len < old_len) { /* * On error, move entries back from new area to old, * which will succeed since page tables still there, * and then proceed to unmap new area instead of old. */ - move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); + move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, + true); vma = new_vma; old_len = new_len; old_addr = new_addr; -- cgit v1.2.3 From 1e8537baacd59e96bbe5f8d3d32feafd11f509fe Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Mon, 8 Oct 2012 16:31:51 -0700 Subject: memory-hotplug: build zonelists when offlining pages online_pages() does build_all_zonelists() and zone_pcp_update(), I think offline_pages() should do it too. When the zone has no memory to allocate, remove it from other nodes' zonelists. zone_batchsize() depends on zone's present pages, if zone's present pages are changed, zone's pcp should be updated. Signed-off-by: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6a5b90d0cfd..b35016156c1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -970,8 +970,13 @@ repeat: init_per_zone_wmark_min(); - if (!populated_zone(zone)) + if (!populated_zone(zone)) { zone_pcp_reset(zone); + mutex_lock(&zonelists_mutex); + build_all_zonelists(NULL, NULL); + mutex_unlock(&zonelists_mutex); + } else + zone_pcp_update(zone); if (!node_present_pages(node)) { node_clear_state(node, N_HIGH_MEMORY); -- cgit v1.2.3 From 70400303ce0c4ced3139499c676d5c79636b0c72 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 8 Oct 2012 16:31:52 -0700 Subject: mm: mmu_notifier: make the mmu_notifier srcu static The variable must be static especially given the variable name. s/RCU/SRCU/ over a few comments. Signed-off-by: Andrea Arcangeli Cc: Xiao Guangrong Cc: Sagi Grimberg Cc: Peter Zijlstra Cc: Haggai Eran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 947df83dccb..c297142f0fe 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -20,7 +20,7 @@ #include /* global SRCU for all MMs */ -struct srcu_struct srcu; +static struct srcu_struct srcu; /* * This function can't run concurrently against mmu_notifier_register @@ -41,7 +41,7 @@ void __mmu_notifier_release(struct mm_struct *mm) int id; /* - * RCU here will block mmu_notifier_unregister until + * SRCU here will block mmu_notifier_unregister until * ->release returns. */ id = srcu_read_lock(&srcu); @@ -302,7 +302,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) if (!hlist_unhashed(&mn->hlist)) { /* - * RCU here will force exit_mmap to wait ->release to finish + * SRCU here will force exit_mmap to wait ->release to finish * before freeing the pages. */ int id; -- cgit v1.2.3 From 02c6de8d757cb32c0829a45d81c3dfcbcafd998b Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:31:55 -0700 Subject: mm: cma: discard clean pages during contiguous allocation instead of migration Drop clean cache pages instead of migration during alloc_contig_range() to minimise allocation latency by reducing the amount of migration that is necessary. It's useful for CMA because latency of migration is more important than evicting the background process's working set. In addition, as pages are reclaimed then fewer free pages for migration targets are required so it avoids memory reclaiming to get free pages, which is a contributory factor to increased latency. I measured elapsed time of __alloc_contig_migrate_range() which migrates 10M in 40M movable zone in QEMU machine. Before - 146ms, After - 7ms [akpm@linux-foundation.org: fix nommu build] Signed-off-by: Mel Gorman Signed-off-by: Minchan Kim Reviewed-by: Mel Gorman Cc: Marek Szyprowski Acked-by: Michal Nazarewicz Cc: Rik van Riel Tested-by: Kyungmin Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 3 ++- mm/page_alloc.c | 2 ++ mm/vmscan.c | 43 +++++++++++++++++++++++++++++++++++++------ 3 files changed, 41 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index bbd7b34db4e..8312d4fadf5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -356,5 +356,6 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); - +unsigned long reclaim_clean_pages_from_list(struct zone *zone, + struct list_head *page_list); #endif /* __MM_INTERNAL_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cfd565dbe12..cefd14e6dcf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5700,6 +5700,8 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) break; } + reclaim_clean_pages_from_list(cc.zone, &cc.migratepages); + ret = migrate_pages(&cc.migratepages, __alloc_contig_migrate_alloc, 0, false, MIGRATE_SYNC); diff --git a/mm/vmscan.c b/mm/vmscan.c index d16bf5a5326..1ee4b69a28a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -674,8 +674,10 @@ static enum page_references page_check_references(struct page *page, static unsigned long shrink_page_list(struct list_head *page_list, struct zone *zone, struct scan_control *sc, + enum ttu_flags ttu_flags, unsigned long *ret_nr_dirty, - unsigned long *ret_nr_writeback) + unsigned long *ret_nr_writeback, + bool force_reclaim) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); @@ -689,10 +691,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, mem_cgroup_uncharge_start(); while (!list_empty(page_list)) { - enum page_references references; struct address_space *mapping; struct page *page; int may_enter_fs; + enum page_references references = PAGEREF_RECLAIM_CLEAN; cond_resched(); @@ -758,7 +760,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, wait_on_page_writeback(page); } - references = page_check_references(page, sc); + if (!force_reclaim) + references = page_check_references(page, sc); + switch (references) { case PAGEREF_ACTIVATE: goto activate_locked; @@ -788,7 +792,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, TTU_UNMAP)) { + switch (try_to_unmap(page, ttu_flags)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -960,6 +964,33 @@ keep: return nr_reclaimed; } +unsigned long reclaim_clean_pages_from_list(struct zone *zone, + struct list_head *page_list) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, + .may_unmap = 1, + }; + unsigned long ret, dummy1, dummy2; + struct page *page, *next; + LIST_HEAD(clean_pages); + + list_for_each_entry_safe(page, next, page_list, lru) { + if (page_is_file_cache(page) && !PageDirty(page)) { + ClearPageActive(page); + list_move(&page->lru, &clean_pages); + } + } + + ret = shrink_page_list(&clean_pages, zone, &sc, + TTU_UNMAP|TTU_IGNORE_ACCESS, + &dummy1, &dummy2, true); + list_splice(&clean_pages, page_list); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + return ret; +} + /* * Attempt to remove the specified page from its LRU. Only take this page * if it is of the appropriate PageActive status. Pages which are being @@ -1278,8 +1309,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, zone, sc, - &nr_dirty, &nr_writeback); + nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, + &nr_dirty, &nr_writeback, false); spin_lock_irq(&zone->lru_lock); -- cgit v1.2.3 From 770c8aaaf6f04a87e6765f24d497132de9152a46 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 8 Oct 2012 16:31:57 -0700 Subject: mm: fix tracing in free_pcppages_bulk() page->private gets re-used in __free_one_page() to store page order (so trace_mm_page_pcpu_drain() may print order instead of migratetype) thus migratetype value must be cached locally. Fixes regression introduced in commit a7016235a61d ("mm: fix migratetype bug which slowed swapping"). This caused incorrect data to be attached to the mm_page_pcpu_drain trace event. [akpm@linux-foundation.org: add comment] Cc: Marek Szyprowski Cc: Michal Nazarewicz Acked-by: Minchan Kim Acked-by: Mel Gorman Cc: Hugh Dickins Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kyungmin Park Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cefd14e6dcf..3f18a14effb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -668,12 +668,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, batch_free = to_free; do { + int mt; /* migratetype of the to-be-freed page */ + page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); + mt = page_private(page); /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ - __free_one_page(page, zone, 0, page_private(page)); - trace_mm_page_pcpu_drain(page, 0, page_private(page)); + __free_one_page(page, zone, 0, mt); + trace_mm_page_pcpu_drain(page, 0, mt); } while (--to_free && --batch_free && !list_empty(list)); } __mod_zone_page_state(zone, NR_FREE_PAGES, count); -- cgit v1.2.3 From 2139cbe627b8910ded55148f87ee10f7485408ed Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 8 Oct 2012 16:32:00 -0700 Subject: cma: fix counting of isolated pages Isolated free pages shouldn't be accounted to NR_FREE_PAGES counter. Fix it by properly decreasing/increasing NR_FREE_PAGES counter in set_migratetype_isolate()/unset_migratetype_isolate() and removing counter adjustment for isolated pages from free_one_page() and split_free_page(). Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kyungmin Park Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 +++++++-- mm/page_isolation.c | 12 +++++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3f18a14effb..d259cc2b69c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -691,7 +691,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order, zone->pages_scanned = 0; __free_one_page(page, zone, order, migratetype); - __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + if (unlikely(migratetype != MIGRATE_ISOLATE)) + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); spin_unlock(&zone->lock); } @@ -1392,6 +1393,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) unsigned int order; unsigned long watermark; struct zone *zone; + int mt; BUG_ON(!PageBuddy(page)); @@ -1407,7 +1409,10 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) list_del(&page->lru); zone->free_area[order].nr_free--; rmv_page_order(page); - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); + + mt = get_pageblock_migratetype(page); + if (unlikely(mt != MIGRATE_ISOLATE)) + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); if (alloc_order != order) expand(zone, page, alloc_order, order, diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 247d1f17573..3ca1716471b 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -76,8 +76,12 @@ int set_migratetype_isolate(struct page *page) out: if (!ret) { + unsigned long nr_pages; + set_pageblock_isolate(page); - move_freepages_block(zone, page, MIGRATE_ISOLATE); + nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); + + __mod_zone_page_state(zone, NR_FREE_PAGES, -nr_pages); } spin_unlock_irqrestore(&zone->lock, flags); @@ -89,12 +93,14 @@ out: void unset_migratetype_isolate(struct page *page, unsigned migratetype) { struct zone *zone; - unsigned long flags; + unsigned long flags, nr_pages; + zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) goto out; - move_freepages_block(zone, page, migratetype); + nr_pages = move_freepages_block(zone, page, migratetype); + __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); restore_pageblock_isolate(page, migratetype); out: spin_unlock_irqrestore(&zone->lock, flags); -- cgit v1.2.3 From d1ce749a0db12202b711d1aba1d29e823034648d Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 8 Oct 2012 16:32:02 -0700 Subject: cma: count free CMA pages Add NR_FREE_CMA_PAGES counter to be later used for checking watermark in __zone_watermark_ok(). For simplicity and to avoid #ifdef hell make this counter always available (not only when CONFIG_CMA=y). [akpm@linux-foundation.org: use conventional migratetype naming] Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kyungmin Park Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 26 +++++++++++++++++++------- mm/page_isolation.c | 5 +++-- mm/vmstat.c | 1 + 3 files changed, 23 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d259cc2b69c..6969a8abdba 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -558,7 +558,8 @@ static inline void __free_one_page(struct page *page, if (page_is_guard(buddy)) { clear_page_guard_flag(buddy); set_page_private(page, 0); - __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + __mod_zone_freepage_state(zone, 1 << order, + migratetype); } else { list_del(&buddy->lru); zone->free_area[order].nr_free--; @@ -677,6 +678,8 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); + if (is_migrate_cma(mt)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); } while (--to_free && --batch_free && !list_empty(list)); } __mod_zone_page_state(zone, NR_FREE_PAGES, count); @@ -692,7 +695,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order, __free_one_page(page, zone, order, migratetype); if (unlikely(migratetype != MIGRATE_ISOLATE)) - __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); } @@ -815,7 +818,8 @@ static inline void expand(struct zone *zone, struct page *page, set_page_guard_flag(&page[size]); set_page_private(&page[size], high); /* Guard pages are not available for any usage */ - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); + __mod_zone_freepage_state(zone, -(1 << high), + migratetype); continue; } #endif @@ -1141,6 +1145,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, } set_page_private(page, mt); list = &page->lru; + if (is_migrate_cma(mt)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, + -(1 << order)); } __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); @@ -1412,7 +1419,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) mt = get_pageblock_migratetype(page); if (unlikely(mt != MIGRATE_ISOLATE)) - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); + __mod_zone_freepage_state(zone, -(1UL << order), mt); if (alloc_order != order) expand(zone, page, alloc_order, order, @@ -1516,7 +1523,8 @@ again: spin_unlock(&zone->lock); if (!page) goto failed; - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); + __mod_zone_freepage_state(zone, -(1 << order), + get_pageblock_migratetype(page)); } __count_zone_vm_events(PGALLOC, zone, 1 << order); @@ -2890,7 +2898,8 @@ void show_free_areas(unsigned int filter) " unevictable:%lu" " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" - " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", + " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" + " free_cma:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_ISOLATED_ANON), @@ -2907,7 +2916,8 @@ void show_free_areas(unsigned int filter) global_page_state(NR_FILE_MAPPED), global_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), - global_page_state(NR_BOUNCE)); + global_page_state(NR_BOUNCE), + global_page_state(NR_FREE_CMA_PAGES)); for_each_populated_zone(zone) { int i; @@ -2939,6 +2949,7 @@ void show_free_areas(unsigned int filter) " pagetables:%lukB" " unstable:%lukB" " bounce:%lukB" + " free_cma:%lukB" " writeback_tmp:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" @@ -2968,6 +2979,7 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_UNSTABLE_NFS)), K(zone_page_state(zone, NR_BOUNCE)), + K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), zone->pages_scanned, (zone->all_unreclaimable ? "yes" : "no") diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 3ca1716471b..345643b85bd 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -77,11 +77,12 @@ int set_migratetype_isolate(struct page *page) out: if (!ret) { unsigned long nr_pages; + int migratetype = get_pageblock_migratetype(page); set_pageblock_isolate(page); nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); - __mod_zone_page_state(zone, NR_FREE_PAGES, -nr_pages); + __mod_zone_freepage_state(zone, -nr_pages, migratetype); } spin_unlock_irqrestore(&zone->lock, flags); @@ -100,7 +101,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) goto out; nr_pages = move_freepages_block(zone, page, migratetype); - __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); + __mod_zone_freepage_state(zone, nr_pages, migratetype); restore_pageblock_isolate(page, migratetype); out: spin_unlock_irqrestore(&zone->lock, flags); diff --git a/mm/vmstat.c b/mm/vmstat.c index b3e3b9d525d..acbd85c983e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -722,6 +722,7 @@ const char * const vmstat_text[] = { "numa_other", #endif "nr_anon_transparent_hugepages", + "nr_free_cma", "nr_dirty_threshold", "nr_dirty_background_threshold", -- cgit v1.2.3 From d95ea5d18e699515468368415c93ed49b1a3221b Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 8 Oct 2012 16:32:05 -0700 Subject: cma: fix watermark checking * Add ALLOC_CMA alloc flag and pass it to [__]zone_watermark_ok() (from Minchan Kim). * During watermark check decrease available free pages number by free CMA pages number if necessary (unmovable allocations cannot use pages from CMA areas). Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kyungmin Park Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 8 +++++++- mm/internal.h | 14 ++++++++++++++ mm/page_alloc.c | 31 +++++++++++++++---------------- 3 files changed, 36 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 0fbc6b73a52..1f61bcbd626 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -934,6 +934,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; + int alloc_flags = 0; /* Check if the GFP flags allow compaction */ if (!order || !may_enter_fs || !may_perform_io) @@ -941,6 +942,10 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, count_vm_event(COMPACTSTALL); +#ifdef CONFIG_CMA + if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; +#endif /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { @@ -951,7 +956,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, + alloc_flags)) break; } diff --git a/mm/internal.h b/mm/internal.h index 8312d4fadf5..96cda4c6ac5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -358,4 +358,18 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, extern void set_pageblock_order(void); unsigned long reclaim_clean_pages_from_list(struct zone *zone, struct list_head *page_list); +/* The ALLOC_WMARK bits are used as an index to zone->watermark */ +#define ALLOC_WMARK_MIN WMARK_MIN +#define ALLOC_WMARK_LOW WMARK_LOW +#define ALLOC_WMARK_HIGH WMARK_HIGH +#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ + +/* Mask to get the watermark bits */ +#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) + +#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ +#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ + #endif /* __MM_INTERNAL_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6969a8abdba..f2c7cc6a303 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1541,19 +1541,6 @@ failed: return NULL; } -/* The ALLOC_WMARK bits are used as an index to zone->watermark */ -#define ALLOC_WMARK_MIN WMARK_MIN -#define ALLOC_WMARK_LOW WMARK_LOW -#define ALLOC_WMARK_HIGH WMARK_HIGH -#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ - -/* Mask to get the watermark bits */ -#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) - -#define ALLOC_HARDER 0x10 /* try to alloc harder */ -#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ -#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ - #ifdef CONFIG_FAIL_PAGE_ALLOC static struct { @@ -1648,7 +1635,11 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, min -= min / 2; if (alloc_flags & ALLOC_HARDER) min -= min / 4; - +#ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ + if (!(alloc_flags & ALLOC_CMA)) + free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); +#endif if (free_pages <= min + lowmem_reserve) return false; for (o = 0; o < order; o++) { @@ -2362,7 +2353,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask) unlikely(test_thread_flag(TIF_MEMDIE)))) alloc_flags |= ALLOC_NO_WATERMARKS; } - +#ifdef CONFIG_CMA + if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; +#endif return alloc_flags; } @@ -2587,6 +2581,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; + int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; gfp_mask &= gfp_allowed_mask; @@ -2615,9 +2610,13 @@ retry_cpuset: if (!preferred_zone) goto out; +#ifdef CONFIG_CMA + if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; +#endif /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, - zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, + zonelist, high_zoneidx, alloc_flags, preferred_zone, migratetype); if (unlikely(!page)) page = __alloc_pages_slowpath(gfp_mask, order, -- cgit v1.2.3 From b12c4ad14ee0232ad47c2bef404b6d42a3578332 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:08 -0700 Subject: mm: page_alloc: use get_freepage_migratetype() instead of page_private() The page allocator uses set_page_private and page_private for handling migratetype when it frees page. Let's replace them with [set|get] _freepage_migratetype to make it more clear. Signed-off-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Mel Gorman Cc: Xishi Qiu Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- mm/page_isolation.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f2c7cc6a303..6aa0a8e89c5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -674,7 +674,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); - mt = page_private(page); + mt = get_freepage_migratetype(page); /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); @@ -1143,7 +1143,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) mt = migratetype; } - set_page_private(page, mt); + set_freepage_migratetype(page, mt); list = &page->lru; if (is_migrate_cma(mt)) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, @@ -1313,7 +1313,7 @@ void free_hot_cold_page(struct page *page, int cold) return; migratetype = get_pageblock_migratetype(page); - set_page_private(page, migratetype); + set_freepage_migratetype(page, migratetype); local_irq_save(flags); if (unlikely(wasMlocked)) free_page_mlock(page); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 345643b85bd..9c03dca8c2a 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -203,7 +203,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) if (PageBuddy(page)) pfn += 1 << page_order(page); else if (page_count(page) == 0 && - page_private(page) == MIGRATE_ISOLATE) + get_freepage_migratetype(page) == MIGRATE_ISOLATE) pfn += 1; else break; -- cgit v1.2.3 From 95e3441248053fc06bbb1dbbd34409a84211619e Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:11 -0700 Subject: mm: remain migratetype in freed page The page allocator caches the pageblock information in page->private while it is in the PCP freelists but this is overwritten with the order of the page when freed to the buddy allocator. This patch stores the migratetype of the page in the page->index field so that it is available at all times when the page remain in free_list. This patch adds a new call site in __free_pages_ok so it might be overhead a bit but it's for high order allocation. So I believe damage isn't hurt. Signed-off-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Mel Gorman Cc: Xishi Qiu Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6aa0a8e89c5..94fd283dde9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -729,6 +729,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; int wasMlocked = __TestClearPageMlocked(page); + int migratetype; if (!free_pages_prepare(page, order)) return; @@ -737,8 +738,9 @@ static void __free_pages_ok(struct page *page, unsigned int order) if (unlikely(wasMlocked)) free_page_mlock(page); __count_vm_events(PGFREE, 1 << order); - free_one_page(page_zone(page), page, order, - get_pageblock_migratetype(page)); + migratetype = get_pageblock_migratetype(page); + set_freepage_migratetype(page, migratetype); + free_one_page(page_zone(page), page, order, migratetype); local_irq_restore(flags); } @@ -959,6 +961,7 @@ static int move_freepages(struct zone *zone, order = page_order(page); list_move(&page->lru, &zone->free_area[order].free_list[migratetype]); + set_freepage_migratetype(page, migratetype); page += 1 << order; pages_moved += 1 << order; } -- cgit v1.2.3 From 41d575ad4a511b71a4a41c8313004212f5c229b1 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:14 -0700 Subject: memory-hotplug: bug fix race between isolation and allocation Like below, memory-hotplug makes race between page-isolation and page-allocation so it can hit BUG_ON in __offline_isolated_pages. CPU A CPU B start_isolate_page_range set_migratetype_isolate spin_lock_irqsave(zone->lock) free_hot_cold_page(Page A) /* without zone->lock */ migratetype = get_pageblock_migratetype(Page A); /* * Page could be moved into MIGRATE_MOVABLE * of per_cpu_pages */ list_add_tail(&page->lru, &pcp->lists[migratetype]); set_pageblock_isolate move_freepages_block drain_all_pages /* Page A could be in MIGRATE_MOVABLE of free_list. */ check_pages_isolated __test_page_isolated_in_pageblock /* * We can't catch freed page which * is free_list[MIGRATE_MOVABLE] */ if (PageBuddy(page A)) pfn += 1 << page_order(page A); /* So, Page A could be allocated */ __offline_isolated_pages /* * BUG_ON hit or offline page * which is used by someone */ BUG_ON(!PageBuddy(page A)); This patch checks page's migratetype in freelist in __test_page_isolated_in_pageblock. So now __test_page_isolated_in_pageblock can check the page caused by above race and can fail of memory offlining. Signed-off-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Mel Gorman Cc: Xishi Qiu Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 9c03dca8c2a..6744235d2d0 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -200,8 +200,11 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) continue; } page = pfn_to_page(pfn); - if (PageBuddy(page)) + if (PageBuddy(page)) { + if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) + break; pfn += 1 << page_order(page); + } else if (page_count(page) == 0 && get_freepage_migratetype(page) == MIGRATE_ISOLATE) pfn += 1; -- cgit v1.2.3 From 435b405c06119d93333738172b8060b0ed12af41 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:16 -0700 Subject: memory-hotplug: fix pages missed by race rather than failing If race between allocation and isolation in memory-hotplug offline happens, some pages could be in MIGRATE_MOVABLE of free_list although the pageblock's migratetype of the page is MIGRATE_ISOLATE. The race could be detected by get_freepage_migratetype in __test_page_isolated_in_pageblock. If it is detected, now EBUSY gets bubbled all the way up and the hotplug operations fails. But better idea is instead of returning and failing memory-hotremove, move the free page to the correct list at the time it is detected. It could enhance memory-hotremove operation success ratio although the race is really rare. Suggested by Mel Gorman. [akpm@linux-foundation.org: small cleanup] Signed-off-by: Minchan Kim Cc: KAMEZAWA Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Mel Gorman Cc: Xishi Qiu Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- mm/page_isolation.c | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94fd283dde9..82f0b2f54f8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -925,7 +925,7 @@ static int fallbacks[MIGRATE_TYPES][4] = { * Note that start_page and end_pages are not aligned on a pageblock * boundary. If alignment is required, use move_freepages_block() */ -static int move_freepages(struct zone *zone, +int move_freepages(struct zone *zone, struct page *start_page, struct page *end_page, int migratetype) { diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 6744235d2d0..5f34a9053ce 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -201,8 +201,20 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) } page = pfn_to_page(pfn); if (PageBuddy(page)) { - if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) - break; + /* + * If race between isolatation and allocation happens, + * some free pages could be in MIGRATE_MOVABLE list + * although pageblock's migratation type of the page + * is MIGRATE_ISOLATE. Catch it and move the page into + * MIGRATE_ISOLATE list. + */ + if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) { + struct page *end_page; + + end_page = page + (1 << page_order(page)) - 1; + move_freepages(page_zone(page), page, end_page, + MIGRATE_ISOLATE); + } pfn += 1 << page_order(page); } else if (page_count(page) == 0 && -- cgit v1.2.3 From 45cac65b0fcd287ebb877b141d40ba9bbe8e5da7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 8 Oct 2012 16:32:19 -0700 Subject: readahead: fault retry breaks mmap file read random detection .fault now can retry. The retry can break state machine of .fault. In filemap_fault, if page is miss, ra->mmap_miss is increased. In the second try, since the page is in page cache now, ra->mmap_miss is decreased. And these are done in one fault, so we can't detect random mmap file access. Add a new flag to indicate .fault is tried once. In the second try, skip ra->mmap_miss decreasing. The filemap_fault state machine is ok with it. I only tested x86, didn't test other archs, but looks the change for other archs is obvious, but who knows :) Signed-off-by: Shaohua Li Cc: Rik van Riel Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index a9827b42556..83efee76a5c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1607,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Do we have something in the page cache already? */ page = find_get_page(mapping, offset); - if (likely(page)) { + if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { /* * We found the page, so try async readahead before * waiting for the lock. */ do_async_mmap_readahead(vma, ra, file, page, offset); - } else { + } else if (!page) { /* No page in the page cache at all */ do_sync_mmap_readahead(vma, ra, file, offset); count_vm_event(PGMAJFAULT); -- cgit v1.2.3 From e9d24ad30fc5c4c601824fb39712350b053ca812 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 8 Oct 2012 16:32:21 -0700 Subject: mm/memblock: use existing interface to set nid Use the existing interface function to set the NUMA node ID (NID) for the regions, either memory or reserved region. Signed-off-by: Wanpeng Li Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Gavin Shan Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 82aa349d2f7..d809f17ef75 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -756,7 +756,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, return ret; for (i = start_rgn; i < end_rgn; i++) - type->regions[i].nid = nid; + memblock_set_region_node(&type->regions[i], nid); memblock_merge_regions(type); return 0; -- cgit v1.2.3 From f2d52fe51c8c0a18cf5fbe583bad51090d12c146 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 8 Oct 2012 16:32:24 -0700 Subject: mm/memblock: cleanup early_node_map[] related comments Commit 0ee332c14518 ("memblock: Kill early_node_map[]") removed early_node_map[]. Clean up the comments to comply with that change. Signed-off-by: Wanpeng Li Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Gavin Shan Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nobootmem.c | 2 -- mm/page_alloc.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 405573010f9..bd82f6b3141 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -162,8 +162,6 @@ unsigned long __init free_all_bootmem(void) * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id * because in some case like Node0 doesn't have RAM installed * low ram will be on Node1 - * Use MAX_NUMNODES will make sure all ranges in early_node_map[] - * will be used instead of only Node0 related */ return free_low_memory_core_early(MAX_NUMNODES); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 82f0b2f54f8..ca002b39b9b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4931,7 +4931,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) zone_movable_pfn[i] << PAGE_SHIFT); } - /* Print out the early_node_map[] */ + /* Print out the early node map */ printk("Early memory node ranges\n"); for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, -- cgit v1.2.3 From e64c5237cf6ff474cb2f3f832f48f2b441dd9979 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 8 Oct 2012 16:32:27 -0700 Subject: mm: compaction: abort compaction loop if lock is contended or run too long isolate_migratepages_range() might isolate no pages if for example when zone->lru_lock is contended and running asynchronous compaction. In this case, we should abort compaction, otherwise, compact_zone will run a useless loop and make zone->lru_lock is even contended. An additional check is added to ensure that cc.migratepages and cc.freepages get properly drained whan compaction is aborted. [minchan@kernel.org: Putback pages isolated for migration if aborting] [akpm@linux-foundation.org: compact_zone_order requires non-NULL arg contended] [akpm@linux-foundation.org: make compact_zone_order() require non-NULL arg `contended'] [minchan@kernel.org: Putback pages isolated for migration if aborting] Signed-off-by: Andrea Arcangeli Signed-off-by: Shaohua Li Signed-off-by: Mel Gorman Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 17 ++++++++++++----- mm/internal.h | 2 +- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 1f61bcbd626..0649cc1b347 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -70,8 +70,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, /* async aborts if taking too long or contended */ if (!cc->sync) { - if (cc->contended) - *cc->contended = true; + cc->contended = true; return false; } @@ -688,7 +687,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, /* Perform the isolation */ low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); - if (!low_pfn) + if (!low_pfn || cc->contended) return ISOLATE_ABORT; cc->migrate_pfn = low_pfn; @@ -848,6 +847,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) switch (isolate_migratepages(zone, cc)) { case ISOLATE_ABORT: ret = COMPACT_PARTIAL; + putback_lru_pages(&cc->migratepages); + cc->nr_migratepages = 0; goto out; case ISOLATE_NONE: continue; @@ -896,6 +897,7 @@ static unsigned long compact_zone_order(struct zone *zone, bool sync, bool *contended, struct page **page) { + unsigned long ret; struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, @@ -903,13 +905,18 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, - .contended = contended, .page = page, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); - return compact_zone(zone, &cc); + ret = compact_zone(zone, &cc); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + + *contended = cc.contended; + return ret; } int sysctl_extfrag_threshold = 500; diff --git a/mm/internal.h b/mm/internal.h index 96cda4c6ac5..97664be2ca3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -130,7 +130,7 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - bool *contended; /* True if a lock was contended */ + bool contended; /* True if a lock was contended */ struct page **page; /* Page captured of requested size */ }; -- cgit v1.2.3 From 3cc668f4e30fbd97b3c0574d8cac7a83903c9bc7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:30 -0700 Subject: mm: compaction: move fatal signal check out of compact_checklock_irqsave Commit c67fe3752abe ("mm: compaction: Abort async compaction if locks are contended or taking too long") addressed a lock contention problem in compaction by introducing compact_checklock_irqsave() that effecively aborting async compaction in the event of compaction. To preserve existing behaviour it also moved a fatal_signal_pending() check into compact_checklock_irqsave() but that is very misleading. It "hides" the check within a locking function but has nothing to do with locking as such. It just happens to work in a desirable fashion. This patch moves the fatal_signal_pending() check to isolate_migratepages_range() where it belongs. Arguably the same check should also happen when isolating pages for freeing but it's overkill. Signed-off-by: Mel Gorman Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Cc: Shaohua Li Cc: Minchan Kim Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 0649cc1b347..78075a26839 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -75,8 +75,6 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, } cond_resched(); - if (fatal_signal_pending(current)) - return false; } if (!locked) @@ -363,7 +361,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Check if it is ok to still hold the lock */ locked = compact_checklock_irqsave(&zone->lru_lock, &flags, locked, cc); - if (!locked) + if (!locked || fatal_signal_pending(current)) break; /* -- cgit v1.2.3 From 661c4cb9b829110cb68c18ea05a56be39f75a4d2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:31 -0700 Subject: mm: compaction: Update try_to_compact_pages()kerneldoc comment Parameters were added without documentation, tut tut. Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 78075a26839..b16dd382299 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -926,6 +926,8 @@ int sysctl_extfrag_threshold = 500; * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from * @sync: Whether migration is synchronous or not + * @contended: Return value that is true if compaction was aborted due to lock contention + * @page: Optionally capture a free page of the requested order during compaction * * This is the main entry point for direct page compaction. */ -- cgit v1.2.3 From 2a1402aa044b55c2d30ab0ed9405693ef06fb07c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:33 -0700 Subject: mm: compaction: acquire the zone->lru_lock as late as possible Richard Davies and Shaohua Li have both reported lock contention problems in compaction on the zone and LRU locks as well as significant amounts of time being spent in compaction. This series aims to reduce lock contention and scanning rates to reduce that CPU usage. Richard reported at https://lkml.org/lkml/2012/9/21/91 that this series made a big different to a problem he reported in August: http://marc.info/?l=kvm&m=134511507015614&w=2 Patch 1 defers acquiring the zone->lru_lock as long as possible. Patch 2 defers acquiring the zone->lock as lock as possible. Patch 3 reverts Rik's "skip-free" patches as the core concept gets reimplemented later and the remaining patches are easier to understand if this is reverted first. Patch 4 adds a pageblock-skip bit to the pageblock flags to cache what pageblocks should be skipped by the migrate and free scanners. This drastically reduces the amount of scanning compaction has to do. Patch 5 reimplements something similar to Rik's idea except it uses the pageblock-skip information to decide where the scanners should restart from and does not need to wrap around. I tested this on 3.6-rc6 + linux-next/akpm. Kernels tested were akpm-20120920 3.6-rc6 + linux-next/akpm as of Septeber 20th, 2012 lesslock Patches 1-6 revert Patches 1-7 cachefail Patches 1-8 skipuseless Patches 1-9 Stress high-order allocation tests looked ok. Success rates are more or less the same with the full series applied but there is an expectation that there is less opportunity to race with other allocation requests if there is less scanning. The time to complete the tests did not vary that much and are uninteresting as were the vmstat statistics so I will not present them here. Using ftrace I recorded how much scanning was done by compaction and got this 3.6.0-rc6 3.6.0-rc6 3.6.0-rc6 3.6.0-rc6 3.6.0-rc6 akpm-20120920 lockless revert-v2r2 cachefail skipuseless Total free scanned 360753976 515414028 565479007 17103281 18916589 Total free isolated 2852429 3597369 4048601 670493 727840 Total free efficiency 0.0079% 0.0070% 0.0072% 0.0392% 0.0385% Total migrate scanned 247728664 822729112 1004645830 17946827 14118903 Total migrate isolated 2555324 3245937 3437501 616359 658616 Total migrate efficiency 0.0103% 0.0039% 0.0034% 0.0343% 0.0466% The efficiency is worthless because of the nature of the test and the number of failures. The really interesting point as far as this patch series is concerned is the number of pages scanned. Note that reverting Rik's patches massively increases the number of pages scanned indicating that those patches really did make a difference to CPU usage. However, caching what pageblocks should be skipped has a much higher impact. With patches 1-8 applied, free page and migrate page scanning are both reduced by 95% in comparison to the akpm kernel. If the basic concept of Rik's patches are implemened on top then scanning then the free scanner barely changed but migrate scanning was further reduced. That said, tests on 3.6-rc5 indicated that the last patch had greater impact than what was measured here so it is a bit variable. One way or the other, this series has a large impact on the amount of scanning compaction does when there is a storm of THP allocations. This patch: Compaction's migrate scanner acquires the zone->lru_lock when scanning a range of pages looking for LRU pages to acquire. It does this even if there are no LRU pages in the range. If multiple processes are compacting then this can cause severe locking contention. To make matters worse commit b2eef8c0 ("mm: compaction: minimise the time IRQs are disabled while isolating pages for migration") releases the lru_lock every SWAP_CLUSTER_MAX pages that are scanned. This patch makes two changes to how the migrate scanner acquires the LRU lock. First, it only releases the LRU lock every SWAP_CLUSTER_MAX pages if the lock is contended. This reduces the number of times it unnecessarily disables and re-enables IRQs. The second is that it defers acquiring the LRU lock for as long as possible. If there are no LRU pages or the only LRU pages are transhuge then the LRU lock will not be acquired at all which reduces contention on zone->lru_lock. [minchan@kernel.org: augment comment] [akpm@linux-foundation.org: tweak comment text] Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Acked-by: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 65 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index b16dd382299..832c4183dcc 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -50,6 +50,11 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } +static inline bool should_release_lock(spinlock_t *lock) +{ + return need_resched() || spin_is_contended(lock); +} + /* * Compaction requires the taking of some coarse locks that are potentially * very heavily contended. Check if the process needs to be scheduled or @@ -62,7 +67,7 @@ static inline bool migrate_async_suitable(int migratetype) static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, bool locked, struct compact_control *cc) { - if (need_resched() || spin_is_contended(lock)) { + if (should_release_lock(lock)) { if (locked) { spin_unlock_irqrestore(lock, *flags); locked = false; @@ -327,7 +332,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, isolate_mode_t mode = 0; struct lruvec *lruvec; unsigned long flags; - bool locked; + bool locked = false; /* * Ensure that there are not too many pages isolated from the LRU @@ -347,23 +352,17 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Time to isolate some pages for migration */ cond_resched(); - spin_lock_irqsave(&zone->lru_lock, flags); - locked = true; for (; low_pfn < end_pfn; low_pfn++) { struct page *page; /* give a chance to irqs before checking need_resched() */ - if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { - spin_unlock_irqrestore(&zone->lru_lock, flags); - locked = false; + if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { + if (should_release_lock(&zone->lru_lock)) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + locked = false; + } } - /* Check if it is ok to still hold the lock */ - locked = compact_checklock_irqsave(&zone->lru_lock, &flags, - locked, cc); - if (!locked || fatal_signal_pending(current)) - break; - /* * migrate_pfn does not necessarily start aligned to a * pageblock. Ensure that pfn_valid is called when moving @@ -403,20 +402,39 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, pageblock_nr = low_pfn >> pageblock_order; if (!cc->sync && last_pageblock_nr != pageblock_nr && !migrate_async_suitable(get_pageblock_migratetype(page))) { - low_pfn += pageblock_nr_pages; - low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; - last_pageblock_nr = pageblock_nr; - continue; + goto next_pageblock; } + /* Check may be lockless but that's ok as we recheck later */ if (!PageLRU(page)) continue; /* - * PageLRU is set, and lru_lock excludes isolation, - * splitting and collapsing (collapsing has already - * happened if PageLRU is set). + * PageLRU is set. lru_lock normally excludes isolation + * splitting and collapsing (collapsing has already happened + * if PageLRU is set) but the lock is not necessarily taken + * here and it is wasteful to take it just to check transhuge. + * Check TransHuge without lock and skip the whole pageblock if + * it's either a transhuge or hugetlbfs page, as calling + * compound_order() without preventing THP from splitting the + * page underneath us may return surprising results. */ + if (PageTransHuge(page)) { + if (!locked) + goto next_pageblock; + low_pfn += (1 << compound_order(page)) - 1; + continue; + } + + /* Check if it is ok to still hold the lock */ + locked = compact_checklock_irqsave(&zone->lru_lock, &flags, + locked, cc); + if (!locked || fatal_signal_pending(current)) + break; + + /* Recheck PageLRU and PageTransHuge under lock */ + if (!PageLRU(page)) + continue; if (PageTransHuge(page)) { low_pfn += (1 << compound_order(page)) - 1; continue; @@ -444,6 +462,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, ++low_pfn; break; } + + continue; + +next_pageblock: + low_pfn += pageblock_nr_pages; + low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; + last_pageblock_nr = pageblock_nr; } acct_isolated(zone, locked, cc); -- cgit v1.2.3 From f40d1e42bb988d2a26e8e111ea4c4c7bac819b7e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:36 -0700 Subject: mm: compaction: acquire the zone->lock as late as possible Compaction's free scanner acquires the zone->lock when checking for PageBuddy pages and isolating them. It does this even if there are no PageBuddy pages in the range. This patch defers acquiring the zone lock for as long as possible. In the event there are no free pages in the pageblock then the lock will not be acquired at all which reduces contention on zone->lock. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Acked-by: Rafael Aquini Acked-by: Minchan Kim Tested-by: Peter Ujfalusi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 140 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 76 insertions(+), 64 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 832c4183dcc..bdf6e13045e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -93,6 +93,27 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock, return compact_checklock_irqsave(lock, flags, false, cc); } +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) +{ + int migratetype = get_pageblock_migratetype(page); + + /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ + if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) + return false; + + /* If the page is a large free page, then allow migration */ + if (PageBuddy(page) && page_order(page) >= pageblock_order) + return true; + + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ + if (migrate_async_suitable(migratetype)) + return true; + + /* Otherwise skip the block */ + return false; +} + static void compact_capture_page(struct compact_control *cc) { unsigned long flags; @@ -153,38 +174,56 @@ static void compact_capture_page(struct compact_control *cc) * pages inside of the pageblock (even though it may still end up isolating * some pages). */ -static unsigned long isolate_freepages_block(unsigned long blockpfn, +static unsigned long isolate_freepages_block(struct compact_control *cc, + unsigned long blockpfn, unsigned long end_pfn, struct list_head *freelist, bool strict) { int nr_scanned = 0, total_isolated = 0; struct page *cursor; + unsigned long nr_strict_required = end_pfn - blockpfn; + unsigned long flags; + bool locked = false; cursor = pfn_to_page(blockpfn); - /* Isolate free pages. This assumes the block is valid */ + /* Isolate free pages. */ for (; blockpfn < end_pfn; blockpfn++, cursor++) { int isolated, i; struct page *page = cursor; - if (!pfn_valid_within(blockpfn)) { - if (strict) - return 0; - continue; - } nr_scanned++; + if (!pfn_valid_within(blockpfn)) + continue; + if (!PageBuddy(page)) + continue; + + /* + * The zone lock must be held to isolate freepages. + * Unfortunately this is a very coarse lock and can be + * heavily contended if there are parallel allocations + * or parallel compactions. For async compaction do not + * spin on the lock and we acquire the lock as late as + * possible. + */ + locked = compact_checklock_irqsave(&cc->zone->lock, &flags, + locked, cc); + if (!locked) + break; + + /* Recheck this is a suitable migration target under lock */ + if (!strict && !suitable_migration_target(page)) + break; - if (!PageBuddy(page)) { - if (strict) - return 0; + /* Recheck this is a buddy page under lock */ + if (!PageBuddy(page)) continue; - } /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); if (!isolated && strict) - return 0; + break; total_isolated += isolated; for (i = 0; i < isolated; i++) { list_add(&page->lru, freelist); @@ -199,6 +238,18 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, } trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); + + /* + * If strict isolation is requested by CMA then check that all the + * pages requested were isolated. If there were any failures, 0 is + * returned and CMA will fail. + */ + if (strict && nr_strict_required != total_isolated) + total_isolated = 0; + + if (locked) + spin_unlock_irqrestore(&cc->zone->lock, flags); + return total_isolated; } @@ -218,12 +269,17 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, unsigned long isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long isolated, pfn, block_end_pfn, flags; + unsigned long isolated, pfn, block_end_pfn; struct zone *zone = NULL; LIST_HEAD(freelist); + /* cc needed for isolate_freepages_block to acquire zone->lock */ + struct compact_control cc = { + .sync = true, + }; + if (pfn_valid(start_pfn)) - zone = page_zone(pfn_to_page(start_pfn)); + cc.zone = zone = page_zone(pfn_to_page(start_pfn)); for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) @@ -236,10 +292,8 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); - spin_lock_irqsave(&zone->lock, flags); - isolated = isolate_freepages_block(pfn, block_end_pfn, + isolated = isolate_freepages_block(&cc, pfn, block_end_pfn, &freelist, true); - spin_unlock_irqrestore(&zone->lock, flags); /* * In strict mode, isolate_freepages_block() returns 0 if @@ -483,29 +537,6 @@ next_pageblock: #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION - -/* Returns true if the page is within a block suitable for migration to */ -static bool suitable_migration_target(struct page *page) -{ - - int migratetype = get_pageblock_migratetype(page); - - /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ - if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) - return false; - - /* If the page is a large free page, then allow migration */ - if (PageBuddy(page) && page_order(page) >= pageblock_order) - return true; - - /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (migrate_async_suitable(migratetype)) - return true; - - /* Otherwise skip the block */ - return false; -} - /* * Returns the start pfn of the last page block in a zone. This is the starting * point for full compaction of a zone. Compaction searches for free pages from @@ -529,7 +560,6 @@ static void isolate_freepages(struct zone *zone, { struct page *page; unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; - unsigned long flags; int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; @@ -577,30 +607,12 @@ static void isolate_freepages(struct zone *zone, if (!suitable_migration_target(page)) continue; - /* - * Found a block suitable for isolating free pages from. Now - * we disabled interrupts, double check things are ok and - * isolate the pages. This is to minimise the time IRQs - * are disabled - */ + /* Found a block suitable for isolating free pages from */ isolated = 0; - - /* - * The zone lock must be held to isolate freepages. This - * unfortunately this is a very coarse lock and can be - * heavily contended if there are parallel allocations - * or parallel compactions. For async compaction do not - * spin on the lock - */ - if (!compact_trylock_irqsave(&zone->lock, &flags, cc)) - break; - if (suitable_migration_target(page)) { - end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); - isolated = isolate_freepages_block(pfn, end_pfn, - freelist, false); - nr_freepages += isolated; - } - spin_unlock_irqrestore(&zone->lock, flags); + end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); + isolated = isolate_freepages_block(cc, pfn, end_pfn, + freelist, false); + nr_freepages += isolated; /* * Record the highest PFN we isolated pages from. When next -- cgit v1.2.3 From 753341a4b85ff337487b9959c71c529f522004f4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:40 -0700 Subject: revert "mm: have order > 0 compaction start off where it left" This reverts commit 7db8889ab05b ("mm: have order > 0 compaction start off where it left") and commit de74f1cc ("mm: have order > 0 compaction start near a pageblock with free pages"). These patches were a good idea and tests confirmed that they massively reduced the amount of scanning but the implementation is complex and tricky to understand. A later patch will cache what pageblocks should be skipped and reimplements the concept of compact_cached_free_pfn on top for both migration and free scanners. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Acked-by: Rafael Aquini Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 65 +++++---------------------------------------------------- mm/internal.h | 6 ------ mm/page_alloc.c | 5 ----- 3 files changed, 5 insertions(+), 71 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index bdf6e13045e..db76361a311 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -537,20 +537,6 @@ next_pageblock: #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION -/* - * Returns the start pfn of the last page block in a zone. This is the starting - * point for full compaction of a zone. Compaction searches for free pages from - * the end of each zone, while isolate_freepages_block scans forward inside each - * page block. - */ -static unsigned long start_free_pfn(struct zone *zone) -{ - unsigned long free_pfn; - free_pfn = zone->zone_start_pfn + zone->spanned_pages; - free_pfn &= ~(pageblock_nr_pages-1); - return free_pfn; -} - /* * Based on information in the current compact_control, find blocks * suitable for isolating free pages from and then isolate them. @@ -619,19 +605,8 @@ static void isolate_freepages(struct zone *zone, * looking for free pages, the search will restart here as * page migration may have returned some pages to the allocator */ - if (isolated) { + if (isolated) high_pfn = max(high_pfn, pfn); - - /* - * If the free scanner has wrapped, update - * compact_cached_free_pfn to point to the highest - * pageblock with free pages. This reduces excessive - * scanning of full pageblocks near the end of the - * zone - */ - if (cc->order > 0 && cc->wrapped) - zone->compact_cached_free_pfn = high_pfn; - } } /* split_free_page does not map the pages */ @@ -639,11 +614,6 @@ static void isolate_freepages(struct zone *zone, cc->free_pfn = high_pfn; cc->nr_freepages = nr_freepages; - - /* If compact_cached_free_pfn is reset then set it now */ - if (cc->order > 0 && !cc->wrapped && - zone->compact_cached_free_pfn == start_free_pfn(zone)) - zone->compact_cached_free_pfn = high_pfn; } /* @@ -738,26 +708,8 @@ static int compact_finished(struct zone *zone, if (fatal_signal_pending(current)) return COMPACT_PARTIAL; - /* - * A full (order == -1) compaction run starts at the beginning and - * end of a zone; it completes when the migrate and free scanner meet. - * A partial (order > 0) compaction can start with the free scanner - * at a random point in the zone, and may have to restart. - */ - if (cc->free_pfn <= cc->migrate_pfn) { - if (cc->order > 0 && !cc->wrapped) { - /* We started partway through; restart at the end. */ - unsigned long free_pfn = start_free_pfn(zone); - zone->compact_cached_free_pfn = free_pfn; - cc->free_pfn = free_pfn; - cc->wrapped = 1; - return COMPACT_CONTINUE; - } - return COMPACT_COMPLETE; - } - - /* We wrapped around and ended up where we started. */ - if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn) + /* Compaction run completes if the migrate and free scanner meet */ + if (cc->free_pfn <= cc->migrate_pfn) return COMPACT_COMPLETE; /* @@ -863,15 +815,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) /* Setup to move all movable pages to the end of the zone */ cc->migrate_pfn = zone->zone_start_pfn; - - if (cc->order > 0) { - /* Incremental compaction. Start where the last one stopped. */ - cc->free_pfn = zone->compact_cached_free_pfn; - cc->start_free_pfn = cc->free_pfn; - } else { - /* Order == -1 starts at the end of the zone. */ - cc->free_pfn = start_free_pfn(zone); - } + cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; + cc->free_pfn &= ~(pageblock_nr_pages-1); migrate_prep_local(); diff --git a/mm/internal.h b/mm/internal.h index 97664be2ca3..6f6bb9ab938 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -118,14 +118,8 @@ struct compact_control { unsigned long nr_freepages; /* Number of isolated free pages */ unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ - unsigned long start_free_pfn; /* where we started the search */ unsigned long migrate_pfn; /* isolate_migratepages search base */ bool sync; /* Synchronous migration */ - bool wrapped; /* Order > 0 compactions are - incremental, once free_pfn - and migrate_pfn meet, we restart - from the top of the zone; - remember we wrapped around. */ int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ca002b39b9b..628968c1ccf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4490,11 +4490,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->spanned_pages = size; zone->present_pages = realsize; -#if defined CONFIG_COMPACTION || defined CONFIG_CMA - zone->compact_cached_free_pfn = zone->zone_start_pfn + - zone->spanned_pages; - zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); -#endif #ifdef CONFIG_NUMA zone->node = nid; zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) -- cgit v1.2.3 From bb13ffeb9f6bfeb301443994dfbf29f91117dfb3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:41 -0700 Subject: mm: compaction: cache if a pageblock was scanned and no pages were isolated When compaction was implemented it was known that scanning could potentially be excessive. The ideal was that a counter be maintained for each pageblock but maintaining this information would incur a severe penalty due to a shared writable cache line. It has reached the point where the scanning costs are a serious problem, particularly on long-lived systems where a large process starts and allocates a large number of THPs at the same time. Instead of using a shared counter, this patch adds another bit to the pageblock flags called PG_migrate_skip. If a pageblock is scanned by either migrate or free scanner and 0 pages were isolated, the pageblock is marked to be skipped in the future. When scanning, this bit is checked before any scanning takes place and the block skipped if set. The main difficulty with a patch like this is "when to ignore the cached information?" If it's ignored too often, the scanning rates will still be excessive. If the information is too stale then allocations will fail that might have otherwise succeeded. In this patch o CMA always ignores the information o If the migrate and free scanner meet then the cached information will be discarded if it's at least 5 seconds since the last time the cache was discarded o If there are a large number of allocation failures, discard the cache. The time-based heuristic is very clumsy but there are few choices for a better event. Depending solely on multiple allocation failures still allows excessive scanning when THP allocations are failing in quick succession due to memory pressure. Waiting until memory pressure is relieved would cause compaction to continually fail instead of using reclaim/compaction to try allocate the page. The time-based mechanism is clumsy but a better option is not obvious. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Acked-by: Rafael Aquini Cc: Fengguang Wu Cc: Michal Nazarewicz Cc: Bartlomiej Zolnierkiewicz Cc: Kyungmin Park Cc: Mark Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++-------- mm/internal.h | 4 +- mm/page_alloc.c | 38 +++++++++-------- 3 files changed, 131 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index db76361a311..d9dbb97e607 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -50,6 +50,79 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } +#ifdef CONFIG_COMPACTION +/* Returns true if the pageblock should be scanned for pages to isolate. */ +static inline bool isolation_suitable(struct compact_control *cc, + struct page *page) +{ + if (cc->ignore_skip_hint) + return true; + + return !get_pageblock_skip(page); +} + +/* + * This function is called to clear all cached information on pageblocks that + * should be skipped for page isolation when the migrate and free page scanner + * meet. + */ +static void reset_isolation_suitable(struct zone *zone) +{ + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; + unsigned long pfn; + + /* + * Do not reset more than once every five seconds. If allocations are + * failing sufficiently quickly to allow this to happen then continually + * scanning for compaction is not going to help. The choice of five + * seconds is arbitrary but will mitigate excessive scanning. + */ + if (time_before(jiffies, zone->compact_blockskip_expire)) + return; + zone->compact_blockskip_expire = jiffies + (HZ * 5); + + /* Walk the zone and mark every pageblock as suitable for isolation */ + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + struct page *page; + + cond_resched(); + + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); + if (zone != page_zone(page)) + continue; + + clear_pageblock_skip(page); + } +} + +/* + * If no pages were isolated then mark this pageblock to be skipped in the + * future. The information is later cleared by reset_isolation_suitable(). + */ +static void update_pageblock_skip(struct page *page, unsigned long nr_isolated) +{ + if (!page) + return; + + if (!nr_isolated) + set_pageblock_skip(page); +} +#else +static inline bool isolation_suitable(struct compact_control *cc, + struct page *page) +{ + return true; +} + +static void update_pageblock_skip(struct page *page, unsigned long nr_isolated) +{ +} +#endif /* CONFIG_COMPACTION */ + static inline bool should_release_lock(spinlock_t *lock) { return need_resched() || spin_is_contended(lock); @@ -181,7 +254,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, bool strict) { int nr_scanned = 0, total_isolated = 0; - struct page *cursor; + struct page *cursor, *valid_page = NULL; unsigned long nr_strict_required = end_pfn - blockpfn; unsigned long flags; bool locked = false; @@ -196,6 +269,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, nr_scanned++; if (!pfn_valid_within(blockpfn)) continue; + if (!valid_page) + valid_page = page; if (!PageBuddy(page)) continue; @@ -250,6 +325,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (locked) spin_unlock_irqrestore(&cc->zone->lock, flags); + /* Update the pageblock-skip if the whole pageblock was scanned */ + if (blockpfn == end_pfn) + update_pageblock_skip(valid_page, total_isolated); + return total_isolated; } @@ -267,22 +346,14 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, * a free page). */ unsigned long -isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) +isolate_freepages_range(struct compact_control *cc, + unsigned long start_pfn, unsigned long end_pfn) { unsigned long isolated, pfn, block_end_pfn; - struct zone *zone = NULL; LIST_HEAD(freelist); - /* cc needed for isolate_freepages_block to acquire zone->lock */ - struct compact_control cc = { - .sync = true, - }; - - if (pfn_valid(start_pfn)) - cc.zone = zone = page_zone(pfn_to_page(start_pfn)); - for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { - if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) + if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn))) break; /* @@ -292,7 +363,7 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); - isolated = isolate_freepages_block(&cc, pfn, block_end_pfn, + isolated = isolate_freepages_block(cc, pfn, block_end_pfn, &freelist, true); /* @@ -387,6 +458,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, struct lruvec *lruvec; unsigned long flags; bool locked = false; + struct page *page = NULL, *valid_page = NULL; /* * Ensure that there are not too many pages isolated from the LRU @@ -407,8 +479,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Time to isolate some pages for migration */ cond_resched(); for (; low_pfn < end_pfn; low_pfn++) { - struct page *page; - /* give a chance to irqs before checking need_resched() */ if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { if (should_release_lock(&zone->lru_lock)) { @@ -444,6 +514,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (page_zone(page) != zone) continue; + if (!valid_page) + valid_page = page; + + /* If isolation recently failed, do not retry */ + pageblock_nr = low_pfn >> pageblock_order; + if (!isolation_suitable(cc, page)) + goto next_pageblock; + /* Skip if free */ if (PageBuddy(page)) continue; @@ -453,7 +531,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, * migration is optimistic to see if the minimum amount of work * satisfies the allocation */ - pageblock_nr = low_pfn >> pageblock_order; if (!cc->sync && last_pageblock_nr != pageblock_nr && !migrate_async_suitable(get_pageblock_migratetype(page))) { goto next_pageblock; @@ -530,6 +607,10 @@ next_pageblock: if (locked) spin_unlock_irqrestore(&zone->lru_lock, flags); + /* Update the pageblock-skip if the whole pageblock was scanned */ + if (low_pfn == end_pfn) + update_pageblock_skip(valid_page, nr_isolated); + trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); return low_pfn; @@ -593,6 +674,10 @@ static void isolate_freepages(struct zone *zone, if (!suitable_migration_target(page)) continue; + /* If isolation recently failed, do not retry */ + if (!isolation_suitable(cc, page)) + continue; + /* Found a block suitable for isolating free pages from */ isolated = 0; end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); @@ -709,8 +794,10 @@ static int compact_finished(struct zone *zone, return COMPACT_PARTIAL; /* Compaction run completes if the migrate and free scanner meet */ - if (cc->free_pfn <= cc->migrate_pfn) + if (cc->free_pfn <= cc->migrate_pfn) { + reset_isolation_suitable(cc->zone); return COMPACT_COMPLETE; + } /* * order == -1 is expected when compacting via @@ -818,6 +905,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; cc->free_pfn &= ~(pageblock_nr_pages-1); + /* Clear pageblock skip if there are numerous alloc failures */ + if (zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT) + reset_isolation_suitable(zone); + migrate_prep_local(); while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { diff --git a/mm/internal.h b/mm/internal.h index 6f6bb9ab938..7ba56ac360b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -120,6 +120,7 @@ struct compact_control { unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ bool sync; /* Synchronous migration */ + bool ignore_skip_hint; /* Scan blocks even if marked skip */ int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ @@ -129,7 +130,8 @@ struct compact_control { }; unsigned long -isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); +isolate_freepages_range(struct compact_control *cc, + unsigned long start_pfn, unsigned long end_pfn); unsigned long isolate_migratepages_range(struct zone *zone, struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 628968c1ccf..44c56049edf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5679,7 +5679,8 @@ __alloc_contig_migrate_alloc(struct page *page, unsigned long private, } /* [start, end) must belong to a single zone. */ -static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) +static int __alloc_contig_migrate_range(struct compact_control *cc, + unsigned long start, unsigned long end) { /* This function is based on compact_zone() from compaction.c. */ @@ -5687,25 +5688,17 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) unsigned int tries = 0; int ret = 0; - struct compact_control cc = { - .nr_migratepages = 0, - .order = -1, - .zone = page_zone(pfn_to_page(start)), - .sync = true, - }; - INIT_LIST_HEAD(&cc.migratepages); - migrate_prep_local(); - while (pfn < end || !list_empty(&cc.migratepages)) { + while (pfn < end || !list_empty(&cc->migratepages)) { if (fatal_signal_pending(current)) { ret = -EINTR; break; } - if (list_empty(&cc.migratepages)) { - cc.nr_migratepages = 0; - pfn = isolate_migratepages_range(cc.zone, &cc, + if (list_empty(&cc->migratepages)) { + cc->nr_migratepages = 0; + pfn = isolate_migratepages_range(cc->zone, cc, pfn, end); if (!pfn) { ret = -EINTR; @@ -5717,14 +5710,14 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) break; } - reclaim_clean_pages_from_list(cc.zone, &cc.migratepages); + reclaim_clean_pages_from_list(cc->zone, &cc->migratepages); - ret = migrate_pages(&cc.migratepages, + ret = migrate_pages(&cc->migratepages, __alloc_contig_migrate_alloc, 0, false, MIGRATE_SYNC); } - putback_lru_pages(&cc.migratepages); + putback_lru_pages(&cc->migratepages); return ret > 0 ? 0 : ret; } @@ -5803,6 +5796,15 @@ int alloc_contig_range(unsigned long start, unsigned long end, unsigned long outer_start, outer_end; int ret = 0, order; + struct compact_control cc = { + .nr_migratepages = 0, + .order = -1, + .zone = page_zone(pfn_to_page(start)), + .sync = true, + .ignore_skip_hint = true, + }; + INIT_LIST_HEAD(&cc.migratepages); + /* * What we do here is we mark all pageblocks in range as * MIGRATE_ISOLATE. Because pageblock and max order pages may @@ -5832,7 +5834,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, if (ret) goto done; - ret = __alloc_contig_migrate_range(start, end); + ret = __alloc_contig_migrate_range(&cc, start, end); if (ret) goto done; @@ -5881,7 +5883,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); /* Grab isolated pages from freelists. */ - outer_end = isolate_freepages_range(outer_start, end); + outer_end = isolate_freepages_range(&cc, outer_start, end); if (!outer_end) { ret = -EBUSY; goto done; -- cgit v1.2.3 From c89511ab2f8fe2b47585e60da8af7fd213ec877e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:45 -0700 Subject: mm: compaction: Restart compaction from near where it left off This is almost entirely based on Rik's previous patches and discussions with him about how this might be implemented. Order > 0 compaction stops when enough free pages of the correct page order have been coalesced. When doing subsequent higher order allocations, it is possible for compaction to be invoked many times. However, the compaction code always starts out looking for things to compact at the start of the zone, and for free pages to compact things to at the end of the zone. This can cause quadratic behaviour, with isolate_freepages starting at the end of the zone each time, even though previous invocations of the compaction code already filled up all free memory on that end of the zone. This can cause isolate_freepages to take enormous amounts of CPU with certain workloads on larger memory systems. This patch caches where the migration and free scanner should start from on subsequent compaction invocations using the pageblock-skip information. When compaction starts it begins from the cached restart points and will update the cached restart points until a page is isolated or a pageblock is skipped that would have been scanned by synchronous compaction. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Acked-by: Rafael Aquini Cc: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++---------- mm/internal.h | 4 ++++ 2 files changed, 52 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index d9dbb97e607..f94cbc0b99a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -80,6 +80,9 @@ static void reset_isolation_suitable(struct zone *zone) */ if (time_before(jiffies, zone->compact_blockskip_expire)) return; + + zone->compact_cached_migrate_pfn = start_pfn; + zone->compact_cached_free_pfn = end_pfn; zone->compact_blockskip_expire = jiffies + (HZ * 5); /* Walk the zone and mark every pageblock as suitable for isolation */ @@ -103,13 +106,29 @@ static void reset_isolation_suitable(struct zone *zone) * If no pages were isolated then mark this pageblock to be skipped in the * future. The information is later cleared by reset_isolation_suitable(). */ -static void update_pageblock_skip(struct page *page, unsigned long nr_isolated) +static void update_pageblock_skip(struct compact_control *cc, + struct page *page, unsigned long nr_isolated, + bool migrate_scanner) { + struct zone *zone = cc->zone; if (!page) return; - if (!nr_isolated) + if (!nr_isolated) { + unsigned long pfn = page_to_pfn(page); set_pageblock_skip(page); + + /* Update where compaction should restart */ + if (migrate_scanner) { + if (!cc->finished_update_migrate && + pfn > zone->compact_cached_migrate_pfn) + zone->compact_cached_migrate_pfn = pfn; + } else { + if (!cc->finished_update_free && + pfn < zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = pfn; + } + } } #else static inline bool isolation_suitable(struct compact_control *cc, @@ -118,7 +137,9 @@ static inline bool isolation_suitable(struct compact_control *cc, return true; } -static void update_pageblock_skip(struct page *page, unsigned long nr_isolated) +static void update_pageblock_skip(struct compact_control *cc, + struct page *page, unsigned long nr_isolated, + bool migrate_scanner) { } #endif /* CONFIG_COMPACTION */ @@ -327,7 +348,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) - update_pageblock_skip(valid_page, total_isolated); + update_pageblock_skip(cc, valid_page, total_isolated, false); return total_isolated; } @@ -533,6 +554,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ if (!cc->sync && last_pageblock_nr != pageblock_nr && !migrate_async_suitable(get_pageblock_migratetype(page))) { + cc->finished_update_migrate = true; goto next_pageblock; } @@ -583,6 +605,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, VM_BUG_ON(PageTransCompound(page)); /* Successfully isolated */ + cc->finished_update_migrate = true; del_page_from_lru_list(page, lruvec, page_lru(page)); list_add(&page->lru, migratelist); cc->nr_migratepages++; @@ -609,7 +632,7 @@ next_pageblock: /* Update the pageblock-skip if the whole pageblock was scanned */ if (low_pfn == end_pfn) - update_pageblock_skip(valid_page, nr_isolated); + update_pageblock_skip(cc, valid_page, nr_isolated, true); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); @@ -690,8 +713,10 @@ static void isolate_freepages(struct zone *zone, * looking for free pages, the search will restart here as * page migration may have returned some pages to the allocator */ - if (isolated) + if (isolated) { + cc->finished_update_free = true; high_pfn = max(high_pfn, pfn); + } } /* split_free_page does not map the pages */ @@ -888,6 +913,8 @@ unsigned long compaction_suitable(struct zone *zone, int order) static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; ret = compaction_suitable(zone, cc->order); switch (ret) { @@ -900,10 +927,21 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ; } - /* Setup to move all movable pages to the end of the zone */ - cc->migrate_pfn = zone->zone_start_pfn; - cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; - cc->free_pfn &= ~(pageblock_nr_pages-1); + /* + * Setup to move all movable pages to the end of the zone. Used cached + * information on where the scanners should start but check that it + * is initialised by ensuring the values are within zone boundaries. + */ + cc->migrate_pfn = zone->compact_cached_migrate_pfn; + cc->free_pfn = zone->compact_cached_free_pfn; + if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { + cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); + zone->compact_cached_free_pfn = cc->free_pfn; + } + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { + cc->migrate_pfn = start_pfn; + zone->compact_cached_migrate_pfn = cc->migrate_pfn; + } /* Clear pageblock skip if there are numerous alloc failures */ if (zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT) diff --git a/mm/internal.h b/mm/internal.h index 7ba56ac360b..7f72f249bc2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -121,6 +121,10 @@ struct compact_control { unsigned long migrate_pfn; /* isolate_migratepages search base */ bool sync; /* Synchronous migration */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool finished_update_free; /* True when the zone cached pfns are + * no longer being updated + */ + bool finished_update_migrate; int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ -- cgit v1.2.3 From 62997027ca5b3d4618198ed8b1aba40b61b1137b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:47 -0700 Subject: mm: compaction: clear PG_migrate_skip based on compaction and reclaim activity Compaction caches if a pageblock was scanned and no pages were isolated so that the pageblocks can be skipped in the future to reduce scanning. This information is not cleared by the page allocator based on activity due to the impact it would have to the page allocator fast paths. Hence there is a requirement that something clear the cache or pageblocks will be skipped forever. Currently the cache is cleared if there were a number of recent allocation failures and it has not been cleared within the last 5 seconds. Time-based decisions like this are terrible as they have no relationship to VM activity and is basically a big hammer. Unfortunately, accurate heuristics would add cost to some hot paths so this patch implements a rough heuristic. There are two cases where the cache is cleared. 1. If a !kswapd process completes a compaction cycle (migrate and free scanner meet), the zone is marked compact_blockskip_flush. When kswapd goes to sleep, it will clear the cache. This is expected to be the common case where the cache is cleared. It does not really matter if kswapd happens to be asleep or going to sleep when the flag is set as it will be woken on the next allocation request. 2. If there have been multiple failures recently and compaction just finished being deferred then a process will clear the cache and start a full scan. This situation happens if there are multiple high-order allocation requests under heavy memory pressure. The clearing of the PG_migrate_skip bits and other scans is inherently racy but the race is harmless. For allocations that can fail such as THP, they will simply fail. For requests that cannot fail, they will retry the allocation. Tests indicated that scanning rates were roughly similar to when the time-based heuristic was used and the allocation success rates were similar. Signed-off-by: Mel Gorman Cc: Rik van Riel Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 50 ++++++++++++++++++++++++++++++++++---------------- mm/page_alloc.c | 1 + mm/vmscan.c | 8 ++++++++ 3 files changed, 43 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index f94cbc0b99a..d8187f9cabb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -66,24 +66,15 @@ static inline bool isolation_suitable(struct compact_control *cc, * should be skipped for page isolation when the migrate and free page scanner * meet. */ -static void reset_isolation_suitable(struct zone *zone) +static void __reset_isolation_suitable(struct zone *zone) { unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; unsigned long pfn; - /* - * Do not reset more than once every five seconds. If allocations are - * failing sufficiently quickly to allow this to happen then continually - * scanning for compaction is not going to help. The choice of five - * seconds is arbitrary but will mitigate excessive scanning. - */ - if (time_before(jiffies, zone->compact_blockskip_expire)) - return; - zone->compact_cached_migrate_pfn = start_pfn; zone->compact_cached_free_pfn = end_pfn; - zone->compact_blockskip_expire = jiffies + (HZ * 5); + zone->compact_blockskip_flush = false; /* Walk the zone and mark every pageblock as suitable for isolation */ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { @@ -102,9 +93,24 @@ static void reset_isolation_suitable(struct zone *zone) } } +void reset_isolation_suitable(pg_data_t *pgdat) +{ + int zoneid; + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct zone *zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + /* Only flush if a full compaction finished recently */ + if (zone->compact_blockskip_flush) + __reset_isolation_suitable(zone); + } +} + /* * If no pages were isolated then mark this pageblock to be skipped in the - * future. The information is later cleared by reset_isolation_suitable(). + * future. The information is later cleared by __reset_isolation_suitable(). */ static void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, @@ -820,7 +826,15 @@ static int compact_finished(struct zone *zone, /* Compaction run completes if the migrate and free scanner meet */ if (cc->free_pfn <= cc->migrate_pfn) { - reset_isolation_suitable(cc->zone); + /* + * Mark that the PG_migrate_skip information should be cleared + * by kswapd when it goes to sleep. kswapd does not set the + * flag itself as the decision to be clear should be directly + * based on an allocation request. + */ + if (!current_is_kswapd()) + zone->compact_blockskip_flush = true; + return COMPACT_COMPLETE; } @@ -943,9 +957,13 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) zone->compact_cached_migrate_pfn = cc->migrate_pfn; } - /* Clear pageblock skip if there are numerous alloc failures */ - if (zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT) - reset_isolation_suitable(zone); + /* + * Clear pageblock skip if there were failures recently and compaction + * is about to be retried after being deferred. kswapd does not do + * this reset as it'll reset the cached information when going to sleep. + */ + if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) + __reset_isolation_suitable(zone); migrate_prep_local(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 44c56049edf..b97cf12f07a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2172,6 +2172,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (page) { got_page: + preferred_zone->compact_blockskip_flush = false; preferred_zone->compact_considered = 0; preferred_zone->compact_defer_shift = 0; if (order >= preferred_zone->compact_order_failed) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1ee4b69a28a..b010efc4389 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2895,6 +2895,14 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) */ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); + /* + * Compaction records what page blocks it recently failed to + * isolate pages from and skips them in the future scanning. + * When kswapd is going to sleep, it is reasonable to assume + * that pages and compaction may succeed so reset the cache. + */ + reset_isolation_suitable(pgdat); + if (!kthread_should_stop()) schedule(); -- cgit v1.2.3 From 3f6d4caeb9a9d8f7e5bbf3f49f1fd71e1414ff64 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Mon, 8 Oct 2012 16:32:50 -0700 Subject: mm/hugetlb.c: remove duplicate inclusion of header file Signed-off-by: Sachin Kamat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c9b40e3a993..8536741f069 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -30,7 +30,6 @@ #include #include #include -#include #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -- cgit v1.2.3 From 723a0644a7255f532575fd43245f9ef976491328 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:52 -0700 Subject: mm/page_alloc: refactor out __alloc_contig_migrate_alloc() __alloc_contig_migrate_alloc() can be used by memory-hotplug so refactor it out (move + rename as a common name) into page_isolation.c. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Minchan Kim Cc: Kamezawa Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Michal Nazarewicz Cc: Marek Szyprowski Cc: Wen Congyang Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +------------- mm/page_isolation.c | 11 +++++++++++ 2 files changed, 12 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b97cf12f07a..8ac593893e6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5667,18 +5667,6 @@ static unsigned long pfn_max_align_up(unsigned long pfn) pageblock_nr_pages)); } -static struct page * -__alloc_contig_migrate_alloc(struct page *page, unsigned long private, - int **resultp) -{ - gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; - - if (PageHighMem(page)) - gfp_mask |= __GFP_HIGHMEM; - - return alloc_page(gfp_mask); -} - /* [start, end) must belong to a single zone. */ static int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end) @@ -5714,7 +5702,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, reclaim_clean_pages_from_list(cc->zone, &cc->migratepages); ret = migrate_pages(&cc->migratepages, - __alloc_contig_migrate_alloc, + alloc_migrate_target, 0, false, MIGRATE_SYNC); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5f34a9053ce..f2f5b4818e9 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -255,3 +255,14 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) spin_unlock_irqrestore(&zone->lock, flags); return ret ? 0 : -EBUSY; } + +struct page *alloc_migrate_target(struct page *page, unsigned long private, + int **resultp) +{ + gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + + if (PageHighMem(page)) + gfp_mask |= __GFP_HIGHMEM; + + return alloc_page(gfp_mask); +} -- cgit v1.2.3 From 74c08f982674cfd5dfeb2702d631db9bcdabf788 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:54 -0700 Subject: memory-hotplug: don't replace lowmem pages with highmem The changelog for commit 6a6dccba2fdc ("mm: cma: don't replace lowmem pages with highmem") mentioned that lowmem pages can be replaced by highmem pages during CMA migration. 6a6dccba2fdc fixed that issue. Quote from that changelog: : The filesystem layer expects pages in the block device's mapping to not : be in highmem (the mapping's gfp mask is set in bdget()), but CMA can : currently replace lowmem pages with highmem pages, leading to crashes in : filesystem code such as the one below: : : Unable to handle kernel NULL pointer dereference at virtual address 00000400 : pgd = c0c98000 : [00000400] *pgd=00c91831, *pte=00000000, *ppte=00000000 : Internal error: Oops: 817 [#1] PREEMPT SMP ARM : CPU: 0 Not tainted (3.5.0-rc5+ #80) : PC is at __memzero+0x24/0x80 : ... : Process fsstress (pid: 323, stack limit = 0xc0cbc2f0) : Backtrace: : [] (ext4_getblk+0x0/0x180) from [] (ext4_bread+0x1c/0x98) : [] (ext4_bread+0x0/0x98) from [] (ext4_mkdir+0x160/0x3bc) : r4:c15337f0 : [] (ext4_mkdir+0x0/0x3bc) from [] (vfs_mkdir+0x8c/0x98) : [] (vfs_mkdir+0x0/0x98) from [] (sys_mkdirat+0x74/0xac) : r6:00000000 r5:c152eb40 r4:000001ff r3:c14b43f0 : [] (sys_mkdirat+0x0/0xac) from [] (sys_mkdir+0x20/0x24) : r6:beccdcf0 r5:00074000 r4:beccdbbc : [] (sys_mkdir+0x0/0x24) from [] (ret_fast_syscall+0x0/0x30) Memory-hotplug has same problem as CMA has so the same fix can be applied to memory-hotplug as well. Fix it by reusing. Signed-off-by: Minchan Kim Cc: Kamezawa Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Michal Nazarewicz Cc: Marek Szyprowski Cc: Wen Congyang Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b35016156c1..f9ac0955e10 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -756,13 +756,6 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end) return 0; } -static struct page * -hotremove_migrate_alloc(struct page *page, unsigned long private, int **x) -{ - /* This should be improooooved!! */ - return alloc_page(GFP_HIGHUSER_MOVABLE); -} - #define NR_OFFLINE_AT_ONCE_PAGES (256) static int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) @@ -813,8 +806,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) putback_lru_pages(&source); goto out; } - /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0, + + /* + * alloc_migrate_target should be improooooved!! + * migrate_pages returns # of failed pages. + */ + ret = migrate_pages(&source, alloc_migrate_target, 0, true, MIGRATE_SYNC); if (ret) putback_lru_pages(&source); -- cgit v1.2.3 From e3b4126c556ca3a07699adf202d44bed3f453638 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:32:57 -0700 Subject: thp: khugepaged_prealloc_page() forgot to reset the page alloc indicator If NUMA is enabled, the indicator is not reset if the previous page request failed, ausing us to trigger the BUG_ON() in khugepaged_alloc_page(). Signed-off-by: Xiao Guangrong Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Michel Lespinasse Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7cf8b0ec11e..7153e0d1867 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1797,6 +1797,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) return false; *wait = false; + *hpage = NULL; khugepaged_alloc_sleep(); } else if (*hpage) { put_page(*hpage); -- cgit v1.2.3 From eab1eef9911c36966b5d5934e6970581b3316013 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 8 Oct 2012 16:33:01 -0700 Subject: mm: thp: fix the update_mmu_cache() last argument passing in mm/huge_memory.c The update_mmu_cache() takes a pointer (to pte_t by default) as the last argument but the huge_memory.c passes a pmd_t value. The patch changes the argument to the pmd_t * pointer. Signed-off-by: Catalin Marinas Signed-off-by: Steve Capper Signed-off-by: Will Deacon Cc: Arnd Bergmann Reviewed-by: Kirill A. Shutemov Cc: Michal Hocko Cc: Gerald Schaefer Reviewed-by: Andrea Arcangeli Cc: Chris Metcalf Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7153e0d1867..0e7740923fb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -889,7 +889,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } @@ -941,7 +941,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, pmdp_clear_flush_notify(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, pmd); page_remove_rmap(page); put_page(page); ret |= VM_FAULT_WRITE; @@ -2002,7 +2002,7 @@ static void collapse_huge_page(struct mm_struct *mm, BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache(vma, address, _pmd); + update_mmu_cache(vma, address, pmd); pgtable_trans_huge_deposit(mm, pgtable); spin_unlock(&mm->page_table_lock); -- cgit v1.2.3 From 05106e6a54aed321191b4bb5c9ee09538cbad3b1 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 8 Oct 2012 16:33:03 -0700 Subject: mm: enable CONFIG_COMPACTION by default Now that lumpy reclaim has been removed, compaction is the only way left to free up contiguous memory areas. It is time to just enable CONFIG_COMPACTION by default. Signed-off-by: Rik van Riel Cc: Mel Gorman Acked-by: Rafael Aquini Acked-by: Johannes Weiner Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 3322342a1ff..a3f8dddaaab 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -191,6 +191,7 @@ config SPLIT_PTLOCK_CPUS # support for memory compaction config COMPACTION bool "Allow for memory compaction" + def_bool y select MIGRATION depends on MMU help -- cgit v1.2.3 From 7f1290f2f2a4d2c3f1b7ce8e87256e052ca23125 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Mon, 8 Oct 2012 16:33:06 -0700 Subject: mm: fix-up zone present pages I think zone->present_pages indicates pages that buddy system can management, it should be: zone->present_pages = spanned pages - absent pages - bootmem pages, but is now: zone->present_pages = spanned pages - absent pages - memmap pages. spanned pages: total size, including holes. absent pages: holes. bootmem pages: pages used in system boot, managed by bootmem allocator. memmap pages: pages used by page structs. This may cause zone->present_pages less than it should be. For example, numa node 1 has ZONE_NORMAL and ZONE_MOVABLE, it's memmap and other bootmem will be allocated from ZONE_MOVABLE, so ZONE_NORMAL's present_pages should be spanned pages - absent pages, but now it also minus memmap pages(free_area_init_core), which are actually allocated from ZONE_MOVABLE. When offlining all memory of a zone, this will cause zone->present_pages less than 0, because present_pages is unsigned long type, it is actually a very large integer, it indirectly caused zone->watermark[WMARK_MIN] becomes a large integer(setup_per_zone_wmarks()), than cause totalreserve_pages become a large integer(calculate_totalreserve_pages()), and finally cause memory allocating failure when fork process(__vm_enough_memory()). [root@localhost ~]# dmesg -bash: fork: Cannot allocate memory I think the bug described in http://marc.info/?l=linux-mm&m=134502182714186&w=2 is also caused by wrong zone present pages. This patch intends to fix-up zone->present_pages when memory are freed to buddy system on x86_64 and IA64 platforms. Signed-off-by: Jianguo Wu Signed-off-by: Jiang Liu Reported-by: Petr Tesarik Tested-by: Petr Tesarik Cc: "Luck, Tony" Cc: Mel Gorman Cc: Yinghai Lu Cc: Minchan Kim Cc: Johannes Weiner Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 10 +++++++++- mm/memory_hotplug.c | 7 +++++++ mm/nobootmem.c | 3 +++ mm/page_alloc.c | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index f468185b3b2..434be4ae7a0 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -198,6 +198,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) int order = ilog2(BITS_PER_LONG); __free_pages_bootmem(pfn_to_page(start), order); + fixup_zone_present_pages(page_to_nid(pfn_to_page(start)), + start, start + BITS_PER_LONG); count += BITS_PER_LONG; start += BITS_PER_LONG; } else { @@ -208,6 +210,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) if (vec & 1) { page = pfn_to_page(start + off); __free_pages_bootmem(page, 0); + fixup_zone_present_pages( + page_to_nid(page), + start + off, start + off + 1); count++; } vec >>= 1; @@ -221,8 +226,11 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) pages = bdata->node_low_pfn - bdata->node_min_pfn; pages = bootmem_bootmap_pages(pages); count += pages; - while (pages--) + while (pages--) { + fixup_zone_present_pages(page_to_nid(page), + page_to_pfn(page), page_to_pfn(page) + 1); __free_pages_bootmem(page++, 0); + } bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f9ac0955e10..ce690a911f1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page, void __ref put_page_bootmem(struct page *page) { unsigned long type; + struct zone *zone; type = (unsigned long) page->lru.next; BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || @@ -116,6 +117,12 @@ void __ref put_page_bootmem(struct page *page) set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); __free_pages_bootmem(page, 0); + + zone = page_zone(page); + zone_span_writelock(zone); + zone->present_pages++; + zone_span_writeunlock(zone); + totalram_pages++; } } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bd82f6b3141..714d5d65047 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -116,6 +116,8 @@ static unsigned long __init __free_memory_core(phys_addr_t start, return 0; __free_pages_memory(start_pfn, end_pfn); + fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT), + start_pfn, end_pfn); return end_pfn - start_pfn; } @@ -126,6 +128,7 @@ unsigned long __init free_low_memory_core_early(int nodeid) phys_addr_t start, end, size; u64 i; + reset_zone_present_pages(); for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) count += __free_memory_core(start, end); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8ac593893e6..00750bc08a3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6087,3 +6087,37 @@ void dump_page(struct page *page) dump_page_flags(page->flags); mem_cgroup_print_bad_page(page); } + +/* reset zone->present_pages */ +void reset_zone_present_pages(void) +{ + struct zone *z; + int i, nid; + + for_each_node_state(nid, N_HIGH_MEMORY) { + for (i = 0; i < MAX_NR_ZONES; i++) { + z = NODE_DATA(nid)->node_zones + i; + z->present_pages = 0; + } + } +} + +/* calculate zone's present pages in buddy system */ +void fixup_zone_present_pages(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + struct zone *z; + unsigned long zone_start_pfn, zone_end_pfn; + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) { + z = NODE_DATA(nid)->node_zones + i; + zone_start_pfn = z->zone_start_pfn; + zone_end_pfn = zone_start_pfn + z->spanned_pages; + + /* if the two regions intersect */ + if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn)) + z->present_pages += min(end_pfn, zone_end_pfn) - + max(start_pfn, zone_start_pfn); + } +} -- cgit v1.2.3 From 4bd2c1ee4b439d926e437a4841e8145230df98c9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Oct 2012 16:33:10 -0700 Subject: memcg: cleanup kmem tcp ifdefs TCP kmem accounting is currently guarded by CONFIG_MEMCG_KMEM ifdefs but the code is not used if !CONFIG_INET so we should rather test for both. The same applies to net/sock.h, net/ip.h and net/tcp_memcontrol.h but let's keep those outside of any ifdefs because it is considered safer wrt. future maintainability. Tested with - CONFIG_INET && CONFIG_MEMCG_KMEM - !CONFIG_INET && CONFIG_MEMCG_KMEM - CONFIG_INET && !CONFIG_MEMCG_KMEM - !CONFIG_INET && !CONFIG_MEMCG_KMEM Signed-off-by: Sachin Kamat Signed-off-by: Michal Hocko Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a72f2ffdc3d..4f5f9365724 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -51,6 +51,7 @@ #include #include "internal.h" #include +#include #include #include @@ -326,7 +327,7 @@ struct mem_cgroup { struct mem_cgroup_stat_cpu nocpu_base; spinlock_t pcp_counter_lock; -#ifdef CONFIG_INET +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct tcp_memcontrol tcp_mem; #endif }; @@ -412,9 +413,7 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) } /* Writing them here to avoid exposing memcg's inner layout */ -#ifdef CONFIG_MEMCG_KMEM -#include -#include +#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) static bool mem_cgroup_is_root(struct mem_cgroup *memcg); void sock_update_memcg(struct sock *sk) @@ -461,7 +460,6 @@ void sock_release_memcg(struct sock *sk) } } -#ifdef CONFIG_INET struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) { if (!memcg || mem_cgroup_is_root(memcg)) @@ -470,10 +468,7 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) return &memcg->tcp_mem.cg_proto; } EXPORT_SYMBOL(tcp_proto_cgroup); -#endif /* CONFIG_INET */ -#endif /* CONFIG_MEMCG_KMEM */ -#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) static void disarm_sock_keys(struct mem_cgroup *memcg) { if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) -- cgit v1.2.3 From 7ffc0edc49d0df5dac077c1830e2533b27d3a4ed Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Oct 2012 16:33:13 -0700 Subject: memcg: move mem_cgroup_is_root upwards kmem code uses this function and it is better to not use forward declarations for static inline functions as some (older) compilers don't like it: gcc version 4.3.4 [gcc-4_3-branch revision 152973] (SUSE Linux) mm/memcontrol.c:421: warning: `mem_cgroup_is_root' declared inline after being called mm/memcontrol.c:421: warning: previous declaration of `mem_cgroup_is_root' was here Signed-off-by: Michal Hocko Cc: Glauber Costa Cc: Sachin Kamat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4f5f9365724..7acf43bf04a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -412,10 +412,14 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) return container_of(s, struct mem_cgroup, css); } +static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) +{ + return (memcg == root_mem_cgroup); +} + /* Writing them here to avoid exposing memcg's inner layout */ #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) -static bool mem_cgroup_is_root(struct mem_cgroup *memcg); void sock_update_memcg(struct sock *sk) { if (mem_cgroup_sockets_enabled) { @@ -1011,11 +1015,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) -{ - return (memcg == root_mem_cgroup); -} - void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) { struct mem_cgroup *memcg; -- cgit v1.2.3 From ec4d9f626d5908b6052c2973f37992f1db52e967 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:33:14 -0700 Subject: mm: fix invalidate_complete_page2() lock ordering In fuzzing with trinity, lockdep protested "possible irq lock inversion dependency detected" when isolate_lru_page() reenabled interrupts while still holding the supposedly irq-safe tree_lock: invalidate_inode_pages2 invalidate_complete_page2 spin_lock_irq(&mapping->tree_lock) clear_page_mlock isolate_lru_page spin_unlock_irq(&zone->lru_lock) isolate_lru_page() is correct to enable interrupts unconditionally: invalidate_complete_page2() is incorrect to call clear_page_mlock() while holding tree_lock, which is supposed to nest inside lru_lock. Both truncate_complete_page() and invalidate_complete_page() call clear_page_mlock() before taking tree_lock to remove page from radix_tree. I guess invalidate_complete_page2() preferred to test PageDirty (again) under tree_lock before committing to the munlock; but since the page has already been unmapped, its state is already somewhat inconsistent, and no worse if clear_page_mlock() moved up. Reported-by: Sasha Levin Deciphered-by: Andrew Morton Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Cc: Michel Lespinasse Cc: Ying Han Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index 75801acdaac..f38055cb8af 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -394,11 +394,12 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; + clear_page_mlock(page); + spin_lock_irq(&mapping->tree_lock); if (PageDirty(page)) goto failed; - clear_page_mlock(page); BUG_ON(page_has_private(page)); __delete_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); -- cgit v1.2.3 From 39b5f29ac1f988c1615fbc9c69f6651ab0d0c3c7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:33:18 -0700 Subject: mm: remove vma arg from page_evictable page_evictable(page, vma) is an irritant: almost all its callers pass NULL for vma. Remove the vma arg and use mlocked_vma_newpage(vma, page) explicitly in the couple of places it's needed. But in those places we don't even need page_evictable() itself! They're dealing with a freshly allocated anonymous page, which has no "mapping" and cannot be mlocked yet. Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Michel Lespinasse Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 5 ++--- mm/ksm.c | 2 +- mm/rmap.c | 2 +- mm/swap.c | 2 +- mm/vmscan.c | 27 +++++++++------------------ 5 files changed, 14 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 7f72f249bc2..78f25d6cc6a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -168,9 +168,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) } /* - * Called only in fault path via page_evictable() for a new page - * to determine if it's being mapped into a LOCKED vma. - * If so, mark page as mlocked. + * Called only in fault path, to determine if a new page is being + * mapped into a LOCKED vma. If it is, mark page as mlocked. */ static inline int mlocked_vma_newpage(struct vm_area_struct *vma, struct page *page) diff --git a/mm/ksm.c b/mm/ksm.c index 14ee5cf8a51..ecbc090cdaa 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1586,7 +1586,7 @@ struct page *ksm_does_need_to_copy(struct page *page, SetPageSwapBacked(new_page); __set_page_locked(new_page); - if (page_evictable(new_page, vma)) + if (!mlocked_vma_newpage(vma, new_page)) lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); else add_page_to_unevictable_list(new_page); diff --git a/mm/rmap.c b/mm/rmap.c index 28777412de6..0d86433e42d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1080,7 +1080,7 @@ void page_add_new_anon_rmap(struct page *page, else __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __page_set_anon_rmap(page, vma, address, 1); - if (page_evictable(page, vma)) + if (!mlocked_vma_newpage(vma, page)) lru_cache_add_lru(page, LRU_ACTIVE_ANON); else add_page_to_unevictable_list(page); diff --git a/mm/swap.c b/mm/swap.c index f76c76c7501..6310dc2008f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -751,7 +751,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, SetPageLRU(page_tail); - if (page_evictable(page_tail, NULL)) { + if (page_evictable(page_tail)) { if (PageActive(page)) { SetPageActive(page_tail); active = 1; diff --git a/mm/vmscan.c b/mm/vmscan.c index b010efc4389..8b627309dd4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -553,7 +553,7 @@ void putback_lru_page(struct page *page) redo: ClearPageUnevictable(page); - if (page_evictable(page, NULL)) { + if (page_evictable(page)) { /* * For evictable pages, we can use the cache. * In event of a race, worst case is we end up with an @@ -587,7 +587,7 @@ redo: * page is on unevictable list, it never be freed. To avoid that, * check after we added it to the list, again. */ - if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { + if (lru == LRU_UNEVICTABLE && page_evictable(page)) { if (!isolate_lru_page(page)) { put_page(page); goto redo; @@ -709,7 +709,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, sc->nr_scanned++; - if (unlikely(!page_evictable(page, NULL))) + if (unlikely(!page_evictable(page))) goto cull_mlocked; if (!sc->may_unmap && page_mapped(page)) @@ -1217,7 +1217,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) VM_BUG_ON(PageLRU(page)); list_del(&page->lru); - if (unlikely(!page_evictable(page, NULL))) { + if (unlikely(!page_evictable(page))) { spin_unlock_irq(&zone->lru_lock); putback_lru_page(page); spin_lock_irq(&zone->lru_lock); @@ -1470,7 +1470,7 @@ static void shrink_active_list(unsigned long nr_to_scan, page = lru_to_page(&l_hold); list_del(&page->lru); - if (unlikely(!page_evictable(page, NULL))) { + if (unlikely(!page_evictable(page))) { putback_lru_page(page); continue; } @@ -3414,27 +3414,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) /* * page_evictable - test whether a page is evictable * @page: the page to test - * @vma: the VMA in which the page is or will be mapped, may be NULL * * Test whether page is evictable--i.e., should be placed on active/inactive - * lists vs unevictable list. The vma argument is !NULL when called from the - * fault path to determine how to instantate a new page. + * lists vs unevictable list. * * Reasons page might not be evictable: * (1) page's mapping marked unevictable * (2) page is part of an mlocked VMA * */ -int page_evictable(struct page *page, struct vm_area_struct *vma) +int page_evictable(struct page *page) { - - if (mapping_unevictable(page_mapping(page))) - return 0; - - if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page))) - return 0; - - return 1; + return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); } #ifdef CONFIG_SHMEM @@ -3472,7 +3463,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) if (!PageLRU(page) || !PageUnevictable(page)) continue; - if (page_evictable(page, NULL)) { + if (page_evictable(page)) { enum lru_list lru = page_lru_base_type(page); VM_BUG_ON(PageActive(page)); -- cgit v1.2.3 From e6c509f85455041d3d7c4b863bf80bc294288cc1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:33:19 -0700 Subject: mm: use clear_page_mlock() in page_remove_rmap() We had thought that pages could no longer get freed while still marked as mlocked; but Johannes Weiner posted this program to demonstrate that truncating an mlocked private file mapping containing COWed pages is still mishandled: #include #include #include #include #include #include #include int main(void) { char *map; int fd; system("grep mlockfreed /proc/vmstat"); fd = open("chigurh", O_CREAT|O_EXCL|O_RDWR); unlink("chigurh"); ftruncate(fd, 4096); map = mmap(NULL, 4096, PROT_WRITE, MAP_PRIVATE, fd, 0); map[0] = 11; mlock(map, sizeof(fd)); ftruncate(fd, 0); close(fd); munlock(map, sizeof(fd)); munmap(map, 4096); system("grep mlockfreed /proc/vmstat"); return 0; } The anon COWed pages are not caught by truncation's clear_page_mlock() of the pagecache pages; but unmap_mapping_range() unmaps them, so we ought to look out for them there in page_remove_rmap(). Indeed, why should truncation or invalidation be doing the clear_page_mlock() when removing from pagecache? mlock is a property of mapping in userspace, not a property of pagecache: an mlocked unmapped page is nonsensical. Reported-by: Johannes Weiner Signed-off-by: Hugh Dickins Cc: Mel Gorman Cc: Rik van Riel Cc: Michel Lespinasse Cc: Ying Han Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 7 +------ mm/memory.c | 10 +++++----- mm/mlock.c | 16 +++------------- mm/rmap.c | 4 ++++ mm/truncate.c | 4 ---- 5 files changed, 13 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 78f25d6cc6a..4dc93e2fe69 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -201,12 +201,7 @@ extern void munlock_vma_page(struct page *page); * If called for a page that is still mapped by mlocked vmas, all we do * is revert to lazy LRU behaviour -- semantics are not broken. */ -extern void __clear_page_mlock(struct page *page); -static inline void clear_page_mlock(struct page *page) -{ - if (unlikely(TestClearPageMlocked(page))) - __clear_page_mlock(page); -} +extern void clear_page_mlock(struct page *page); /* * mlock_migrate_page - called only from migrate_page_copy() to diff --git a/mm/memory.c b/mm/memory.c index d205e4381a3..5f5d1f039bf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1577,12 +1577,12 @@ split_fallthrough: if (page->mapping && trylock_page(page)) { lru_add_drain(); /* push cached pages to LRU */ /* - * Because we lock page here and migration is - * blocked by the pte's page reference, we need - * only check for file-cache page truncation. + * Because we lock page here, and migration is + * blocked by the pte's page reference, and we + * know the page is still mapped, we don't even + * need to check for file-cache page truncation. */ - if (page->mapping) - mlock_vma_page(page); + mlock_vma_page(page); unlock_page(page); } } diff --git a/mm/mlock.c b/mm/mlock.c index a948be4b7ba..de732159289 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -51,13 +51,10 @@ EXPORT_SYMBOL(can_do_mlock); /* * LRU accounting for clear_page_mlock() */ -void __clear_page_mlock(struct page *page) +void clear_page_mlock(struct page *page) { - VM_BUG_ON(!PageLocked(page)); - - if (!page->mapping) { /* truncated ? */ + if (!TestClearPageMlocked(page)) return; - } dec_zone_page_state(page, NR_MLOCK); count_vm_event(UNEVICTABLE_PGCLEARED); @@ -290,14 +287,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); if (page && !IS_ERR(page)) { lock_page(page); - /* - * Like in __mlock_vma_pages_range(), - * because we lock page here and migration is - * blocked by the elevated reference, we need - * only check for file-cache page truncation. - */ - if (page->mapping) - munlock_vma_page(page); + munlock_vma_page(page); unlock_page(page); put_page(page); } diff --git a/mm/rmap.c b/mm/rmap.c index 0d86433e42d..bf03149f495 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1155,7 +1155,10 @@ void page_remove_rmap(struct page *page) } else { __dec_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); + mem_cgroup_end_update_page_stat(page, &locked, &flags); } + if (unlikely(PageMlocked(page))) + clear_page_mlock(page); /* * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap @@ -1165,6 +1168,7 @@ void page_remove_rmap(struct page *page) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ + return; out: if (!anon) mem_cgroup_end_update_page_stat(page, &locked, &flags); diff --git a/mm/truncate.c b/mm/truncate.c index f38055cb8af..d51ce92d6e8 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -107,7 +107,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) cancel_dirty_page(page, PAGE_CACHE_SIZE); - clear_page_mlock(page); ClearPageMappedToDisk(page); delete_from_page_cache(page); return 0; @@ -132,7 +131,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, 0)) return 0; - clear_page_mlock(page); ret = remove_mapping(mapping, page); return ret; @@ -394,8 +392,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - clear_page_mlock(page); - spin_lock_irq(&mapping->tree_lock); if (PageDirty(page)) goto failed; -- cgit v1.2.3 From a0c5e813f087dffc0d9b173d2e7d3328b1482fd5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:33:21 -0700 Subject: mm: remove free_page_mlock We should not be seeing non-0 unevictable_pgs_mlockfreed any longer. So remove free_page_mlock() from the page freeing paths: __PG_MLOCKED is already in PAGE_FLAGS_CHECK_AT_FREE, so free_pages_check() will now be checking it, reporting "BUG: Bad page state" if it's ever found set. Comment UNEVICTABLE_MLOCKFREED and unevictable_pgs_mlockfreed always 0. Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Cc: Michel Lespinasse Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 ----------------- mm/vmstat.c | 2 +- 2 files changed, 1 insertion(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 00750bc08a3..dbb53866c3a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -598,17 +598,6 @@ out: zone->free_area[order].nr_free++; } -/* - * free_page_mlock() -- clean up attempts to free and mlocked() page. - * Page should not be on lru, so no need to fix that up. - * free_pages_check() will verify... - */ -static inline void free_page_mlock(struct page *page) -{ - __dec_zone_page_state(page, NR_MLOCK); - __count_vm_event(UNEVICTABLE_MLOCKFREED); -} - static inline int free_pages_check(struct page *page) { if (unlikely(page_mapcount(page) | @@ -728,15 +717,12 @@ static bool free_pages_prepare(struct page *page, unsigned int order) static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; - int wasMlocked = __TestClearPageMlocked(page); int migratetype; if (!free_pages_prepare(page, order)) return; local_irq_save(flags); - if (unlikely(wasMlocked)) - free_page_mlock(page); __count_vm_events(PGFREE, 1 << order); migratetype = get_pageblock_migratetype(page); set_freepage_migratetype(page, migratetype); @@ -1310,7 +1296,6 @@ void free_hot_cold_page(struct page *page, int cold) struct per_cpu_pages *pcp; unsigned long flags; int migratetype; - int wasMlocked = __TestClearPageMlocked(page); if (!free_pages_prepare(page, 0)) return; @@ -1318,8 +1303,6 @@ void free_hot_cold_page(struct page *page, int cold) migratetype = get_pageblock_migratetype(page); set_freepage_migratetype(page, migratetype); local_irq_save(flags); - if (unlikely(wasMlocked)) - free_page_mlock(page); __count_vm_event(PGFREE); /* diff --git a/mm/vmstat.c b/mm/vmstat.c index acbd85c983e..05e3a991374 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -782,7 +782,7 @@ const char * const vmstat_text[] = { "unevictable_pgs_munlocked", "unevictable_pgs_cleared", "unevictable_pgs_stranded", - "unevictable_pgs_mlockfreed", + "unevictable_pgs_mlockfreed", /* no longer useful: always zero */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE "thp_fault_alloc", -- cgit v1.2.3 From 957f822a0ab95e88b146638bad6209bbc315bedd Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 8 Oct 2012 16:33:24 -0700 Subject: mm, numa: reclaim from all nodes within reclaim distance RECLAIM_DISTANCE represents the distance between nodes at which it is deemed too costly to allocate from; it's preferred to try to reclaim from a local zone before falling back to allocating on a remote node with such a distance. To do this, zone_reclaim_mode is set if the distance between any two nodes on the system is greather than this distance. This, however, ends up causing the page allocator to reclaim from every zone regardless of its affinity. What we really want is to reclaim only from zones that are closer than RECLAIM_DISTANCE. This patch adds a nodemask to each node that represents the set of nodes that are within this distance. During the zone iteration, if the bit for a zone's node is set for the local node, then reclaim is attempted; otherwise, the zone is skipped. [akpm@linux-foundation.org: fix CONFIG_NUMA=n build] Signed-off-by: David Rientjes Cc: Mel Gorman Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dbb53866c3a..9b8e6243a52 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1799,6 +1799,22 @@ static void zlc_clear_zones_full(struct zonelist *zonelist) bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); } +static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) +{ + return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); +} + +static void __paginginit init_zone_allows_reclaim(int nid) +{ + int i; + + for_each_online_node(i) + if (node_distance(nid, i) <= RECLAIM_DISTANCE) { + node_set(i, NODE_DATA(nid)->reclaim_nodes); + zone_reclaim_mode = 1; + } +} + #else /* CONFIG_NUMA */ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) @@ -1819,6 +1835,15 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) static void zlc_clear_zones_full(struct zonelist *zonelist) { } + +static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) +{ + return true; +} + +static inline void init_zone_allows_reclaim(int nid) +{ +} #endif /* CONFIG_NUMA */ /* @@ -1903,7 +1928,8 @@ zonelist_scan: did_zlc_setup = 1; } - if (zone_reclaim_mode == 0) + if (zone_reclaim_mode == 0 || + !zone_allows_reclaim(preferred_zone, zone)) goto this_zone_full; /* @@ -3364,21 +3390,13 @@ static void build_zonelists(pg_data_t *pgdat) j = 0; while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { - int distance = node_distance(local_node, node); - - /* - * If another node is sufficiently far away then it is better - * to reclaim pages in a zone before going off node. - */ - if (distance > RECLAIM_DISTANCE) - zone_reclaim_mode = 1; - /* * We don't want to pressure a particular node. * So adding penalty to the first node in same * distance group to make it round-robin. */ - if (distance != node_distance(local_node, prev_node)) + if (node_distance(local_node, node) != + node_distance(local_node, prev_node)) node_load[node] = load; prev_node = node; @@ -4552,6 +4570,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; + init_zone_allows_reclaim(nid); calculate_node_totalpages(pgdat, zones_size, zholes_size); alloc_node_mem_map(pgdat); -- cgit v1.2.3 From 36e4f20af833d1ce196e6a4ade05dc26c44652d1 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Oct 2012 16:33:31 -0700 Subject: hugetlb: do not use vma_hugecache_offset() for vma_prio_tree_foreach Commit 0c176d52b0b2 ("mm: hugetlb: fix pgoff computation when unmapping page from vma") fixed pgoff calculation but it has replaced it by vma_hugecache_offset() which is not approapriate for offsets used for vma_prio_tree_foreach() because that one expects index in page units rather than in huge_page_shift. Johannes said: : The resulting index may not be too big, but it can be too small: assume : hpage size of 2M and the address to unmap to be 0x200000. This is regular : page index 512 and hpage index 1. If you have a VMA that maps the file : only starting at the second huge page, that VMAs vm_pgoff will be 512 but : you ask for offset 1 and miss it even though it does map the page of : interest. hugetlb_cow() will try to unmap, miss the vma, and retry the : cow until the allocation succeeds or the skipped vma(s) go away. Signed-off-by: Michal Hocko Acked-by: Hillf Danton Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Andrea Arcangeli Cc: David Rientjes Acked-by: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8536741f069..de5d1dcf34f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2480,7 +2480,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * from page cache lookup which is in HPAGE_SIZE units. */ address = address & huge_page_mask(h); - pgoff = vma_hugecache_offset(h, vma, address); + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; mapping = vma->vm_file->f_dentry->d_inode->i_mapping; /* -- cgit v1.2.3 From 2ec74c3ef2d8c58d71e0e00336fb6b891192155a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 8 Oct 2012 16:33:33 -0700 Subject: mm: move all mmu notifier invocations to be done outside the PT lock In order to allow sleeping during mmu notifier calls, we need to avoid invoking them under the page table spinlock. This patch solves the problem by calling invalidate_page notification after releasing the lock (but before freeing the page itself), or by wrapping the page invalidation with calls to invalidate_range_begin and invalidate_range_end. To prevent accidental changes to the invalidate_range_end arguments after the call to invalidate_range_begin, the patch introduces a convention of saving the arguments in consistently named locals: unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ ... mmun_start = ... mmun_end = ... mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ... mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); The patch changes code to use this convention for all calls to mmu_notifier_invalidate_range_start/end, except those where the calls are close enough so that anyone who glances at the code can see the values aren't changing. This patchset is a preliminary step towards on-demand paging design to be added to the RDMA stack. Why do we want on-demand paging for Infiniband? Applications register memory with an RDMA adapter using system calls, and subsequently post IO operations that refer to the corresponding virtual addresses directly to HW. Until now, this was achieved by pinning the memory during the registration calls. The goal of on demand paging is to avoid pinning the pages of registered memory regions (MRs). This will allow users the same flexibility they get when swapping any other part of their processes address spaces. Instead of requiring the entire MR to fit in physical memory, we can allow the MR to be larger, and only fit the current working set in physical memory. Why should anyone care? What problems are users currently experiencing? This can make programming with RDMA much simpler. Today, developers that are working with more data than their RAM can hold need either to deregister and reregister memory regions throughout their process's life, or keep a single memory region and copy the data to it. On demand paging will allow these developers to register a single MR at the beginning of their process's life, and let the operating system manage which pages needs to be fetched at a given time. In the future, we might be able to provide a single memory access key for each process that would provide the entire process's address as one large memory region, and the developers wouldn't need to register memory regions at all. Is there any prospect that any other subsystems will utilise these infrastructural changes? If so, which and how, etc? As for other subsystems, I understand that XPMEM wanted to sleep in MMU notifiers, as Christoph Lameter wrote at http://lkml.indiana.edu/hypermail/linux/kernel/0802.1/0460.html and perhaps Andrea knows about other use cases. Scheduling in mmu notifications is required since we need to sync the hardware with the secondary page tables change. A TLB flush of an IO device is inherently slower than a CPU TLB flush, so our design works by sending the invalidation request to the device, and waiting for an interrupt before exiting the mmu notifier handler. Avi said: kvm may be a buyer. kvm::mmu_lock, which serializes guest page faults, also protects long operations such as destroying large ranges. It would be good to convert it into a spinlock, but as it is used inside mmu notifiers, this cannot be done. (there are alternatives, such as keeping the spinlock and using a generation counter to do the teardown in O(1), which is what the "may" is doing up there). [akpm@linux-foundation.orgpossible speed tweak in hugetlb_cow(), cleanups] Signed-off-by: Andrea Arcangeli Signed-off-by: Sagi Grimberg Signed-off-by: Haggai Eran Cc: Peter Zijlstra Cc: Xiao Guangrong Cc: Or Gerlitz Cc: Haggai Eran Cc: Shachar Raindel Cc: Liran Liss Cc: Christoph Lameter Cc: Avi Kivity Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap_xip.c | 4 +++- mm/huge_memory.c | 42 ++++++++++++++++++++++++++++++++++++------ mm/hugetlb.c | 21 +++++++++++++-------- mm/memory.c | 28 +++++++++++++++++++--------- mm/mremap.c | 8 ++++++-- mm/rmap.c | 18 +++++++++++++++--- 6 files changed, 92 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index a52daee11d3..a912da6ddfd 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -192,11 +192,13 @@ retry: if (pte) { /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); page_remove_rmap(page); dec_mm_counter(mm, MM_FILEPAGES); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); + /* must invalidate_page _before_ freeing the page */ + mmu_notifier_invalidate_page(mm, address); page_cache_release(page); } } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0e7740923fb..08a943b9cf9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -787,6 +787,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, pmd_t _pmd; int ret = 0, i; struct page **pages; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, GFP_KERNEL); @@ -823,12 +825,16 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, cond_resched(); } + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON(!PageHead(page)); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(mm); @@ -851,6 +857,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, page_remove_rmap(page); spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + ret |= VM_FAULT_WRITE; put_page(page); @@ -859,6 +867,7 @@ out: out_free_pages: spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_start(); for (i = 0; i < HPAGE_PMD_NR; i++) { mem_cgroup_uncharge_page(pages[i]); @@ -875,6 +884,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, int ret = 0; struct page *page, *new_page; unsigned long haddr; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ VM_BUG_ON(!vma->anon_vma); spin_lock(&mm->page_table_lock); @@ -925,20 +936,24 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + spin_lock(&mm->page_table_lock); put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(&mm->page_table_lock); mem_cgroup_uncharge_page(new_page); put_page(new_page); - goto out; + goto out_mn; } else { pmd_t entry; VM_BUG_ON(!PageHead(page)); entry = mk_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); entry = pmd_mkhuge(entry); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); update_mmu_cache(vma, address, pmd); @@ -946,10 +961,14 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, put_page(page); ret |= VM_FAULT_WRITE; } -out_unlock: spin_unlock(&mm->page_table_lock); +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return ret; +out_unlock: + spin_unlock(&mm->page_table_lock); + return ret; } struct page *follow_trans_huge_pmd(struct mm_struct *mm, @@ -1162,7 +1181,11 @@ static int __split_huge_page_splitting(struct page *page, struct mm_struct *mm = vma->vm_mm; pmd_t *pmd; int ret = 0; + /* For mmu_notifiers */ + const unsigned long mmun_start = address; + const unsigned long mmun_end = address + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); @@ -1174,10 +1197,11 @@ static int __split_huge_page_splitting(struct page *page, * and it won't wait on the anon_vma->root->mutex to * serialize against split_huge_page*. */ - pmdp_splitting_flush_notify(vma, address, pmd); + pmdp_splitting_flush(vma, address, pmd); ret = 1; } spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return ret; } @@ -1898,6 +1922,8 @@ static void collapse_huge_page(struct mm_struct *mm, spinlock_t *ptl; int isolated; unsigned long hstart, hend; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -1952,6 +1978,9 @@ static void collapse_huge_page(struct mm_struct *mm, pte = pte_offset_map(pmd, address); ptl = pte_lockptr(mm, pmd); + mmun_start = address; + mmun_end = address + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); spin_lock(&mm->page_table_lock); /* probably unnecessary */ /* * After this gup_fast can't run anymore. This also removes @@ -1959,8 +1988,9 @@ static void collapse_huge_page(struct mm_struct *mm, * huge and small TLB entries for the same virtual address * to avoid the risk of CPU bugs in that area. */ - _pmd = pmdp_clear_flush_notify(vma, address, pmd); + _pmd = pmdp_clear_flush(vma, address, pmd); spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); spin_lock(ptl); isolated = __collapse_huge_page_isolate(vma, address, pte); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index de5d1dcf34f..993f7c1820a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2355,13 +2355,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + const unsigned long mmun_start = start; /* For mmu_notifiers */ + const unsigned long mmun_end = end; /* For mmu_notifiers */ WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); tlb_start_vma(tlb, vma); - mmu_notifier_invalidate_range_start(mm, start, end); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); again: spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { @@ -2425,7 +2427,7 @@ again: if (address < end && !ref_page) goto again; } - mmu_notifier_invalidate_range_end(mm, start, end); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); tlb_end_vma(tlb, vma); } @@ -2525,6 +2527,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, struct page *old_page, *new_page; int avoidcopy; int outside_reserve = 0; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ old_page = pte_page(pte); @@ -2611,6 +2615,9 @@ retry_avoidcopy: pages_per_huge_page(h)); __SetPageUptodate(new_page); + mmun_start = address & huge_page_mask(h); + mmun_end = mmun_start + huge_page_size(h); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* * Retake the page_table_lock to check for racing updates * before the page tables are altered @@ -2619,9 +2626,6 @@ retry_avoidcopy: ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ - mmu_notifier_invalidate_range_start(mm, - address & huge_page_mask(h), - (address & huge_page_mask(h)) + huge_page_size(h)); huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); @@ -2629,10 +2633,11 @@ retry_avoidcopy: hugepage_add_new_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; - mmu_notifier_invalidate_range_end(mm, - address & huge_page_mask(h), - (address & huge_page_mask(h)) + huge_page_size(h)); } + spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); page_cache_release(new_page); page_cache_release(old_page); return 0; diff --git a/mm/memory.c b/mm/memory.c index 5f5d1f039bf..b03a4a21c1d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -712,7 +712,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, add_taint(TAINT_BAD_PAGE); } -static inline int is_cow_mapping(vm_flags_t flags) +static inline bool is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } @@ -1039,6 +1039,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long next; unsigned long addr = vma->vm_start; unsigned long end = vma->vm_end; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + bool is_cow; int ret; /* @@ -1072,8 +1075,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * parent mm. And a permission downgrade will only happen if * is_cow_mapping() returns true. */ - if (is_cow_mapping(vma->vm_flags)) - mmu_notifier_invalidate_range_start(src_mm, addr, end); + is_cow = is_cow_mapping(vma->vm_flags); + mmun_start = addr; + mmun_end = end; + if (is_cow) + mmu_notifier_invalidate_range_start(src_mm, mmun_start, + mmun_end); ret = 0; dst_pgd = pgd_offset(dst_mm, addr); @@ -1089,9 +1096,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, } } while (dst_pgd++, src_pgd++, addr = next, addr != end); - if (is_cow_mapping(vma->vm_flags)) - mmu_notifier_invalidate_range_end(src_mm, - vma->vm_start, end); + if (is_cow) + mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); return ret; } @@ -2516,7 +2522,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl, pte_t orig_pte) __releases(ptl) { - struct page *old_page, *new_page; + struct page *old_page, *new_page = NULL; pte_t entry; int ret = 0; int page_mkwrite = 0; @@ -2760,10 +2766,14 @@ gotten: } else mem_cgroup_uncharge_page(new_page); - if (new_page) - page_cache_release(new_page); unlock: pte_unmap_unlock(page_table, ptl); + if (new_page) { + if (new_page == old_page) + /* cow happened, notify before releasing old_page */ + mmu_notifier_invalidate_page(mm, address); + page_cache_release(new_page); + } if (old_page) { /* * Don't let another task, with possibly unlocked vma, diff --git a/mm/mremap.c b/mm/mremap.c index 3b639a4b26b..1b61c2d3307 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -149,11 +149,15 @@ unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long extent, next, old_end; pmd_t *old_pmd, *new_pmd; bool need_flush = false; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); - mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); + mmun_start = old_addr; + mmun_end = old_end; + mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); for (; old_addr < old_end; old_addr += extent, new_addr += extent) { cond_resched(); @@ -197,7 +201,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (likely(need_flush)) flush_tlb_range(vma, old_end-len, old_addr); - mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end); + mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); return len + old_addr - old_end; /* how much done */ } diff --git a/mm/rmap.c b/mm/rmap.c index bf03149f495..7df7984d476 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -884,7 +884,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, pte_t entry; flush_cache_page(vma, address, pte_pfn(*pte)); - entry = ptep_clear_flush_notify(vma, address, pte); + entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(mm, address, pte, entry); @@ -892,6 +892,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, } pte_unmap_unlock(pte, ptl); + + if (ret) + mmu_notifier_invalidate_page(mm, address); out: return ret; } @@ -1212,7 +1215,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) @@ -1274,6 +1277,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, out_unmap: pte_unmap_unlock(pte, ptl); + if (ret != SWAP_FAIL) + mmu_notifier_invalidate_page(mm, address); out: return ret; @@ -1338,6 +1343,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, spinlock_t *ptl; struct page *page; unsigned long address; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ unsigned long end; int ret = SWAP_AGAIN; int locked_vma = 0; @@ -1361,6 +1368,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, if (!pmd_present(*pmd)) return ret; + mmun_start = address; + mmun_end = end; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + /* * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, * keep the sem while scanning the cluster for mlocking pages. @@ -1394,7 +1405,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); /* If nonlinear, store the file page offset in the pte. */ if (page->index != linear_page_index(vma, address)) @@ -1410,6 +1421,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (locked_vma) up_read(&vma->vm_mm->mmap_sem); return ret; -- cgit v1.2.3 From 6bdb913f0a70a4dfb7f066fb15e2d6f960701d00 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Mon, 8 Oct 2012 16:33:35 -0700 Subject: mm: wrap calls to set_pte_at_notify with invalidate_range_start and invalidate_range_end In order to allow sleeping during invalidate_page mmu notifier calls, we need to avoid calling when holding the PT lock. In addition to its direct calls, invalidate_page can also be called as a substitute for a change_pte call, in case the notifier client hasn't implemented change_pte. This patch drops the invalidate_page call from change_pte, and instead wraps all calls to change_pte with invalidate_range_start and invalidate_range_end calls. Note that change_pte still cannot sleep after this patch, and that clients implementing change_pte should not take action on it in case the number of outstanding invalidate_range_start calls is larger than one, otherwise they might miss a later invalidation. Signed-off-by: Haggai Eran Cc: Andrea Arcangeli Cc: Sagi Grimberg Cc: Peter Zijlstra Cc: Xiao Guangrong Cc: Or Gerlitz Cc: Haggai Eran Cc: Shachar Raindel Cc: Liran Liss Cc: Christoph Lameter Cc: Avi Kivity Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 21 +++++++++++++++++++-- mm/memory.c | 18 ++++++++++++------ mm/mmu_notifier.c | 6 ------ 3 files changed, 31 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index ecbc090cdaa..ae539f0b8aa 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -709,15 +709,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, spinlock_t *ptl; int swapped; int err = -EFAULT; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ addr = page_address_in_vma(page, vma); if (addr == -EFAULT) goto out; BUG_ON(PageTransCompound(page)); + + mmun_start = addr; + mmun_end = addr + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) - goto out; + goto out_mn; if (pte_write(*ptep) || pte_dirty(*ptep)) { pte_t entry; @@ -752,6 +759,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, out_unlock: pte_unmap_unlock(ptep, ptl); +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return err; } @@ -776,6 +785,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, spinlock_t *ptl; unsigned long addr; int err = -EFAULT; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ addr = page_address_in_vma(page, vma); if (addr == -EFAULT) @@ -794,10 +805,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd_present(*pmd)) goto out; + mmun_start = addr; + mmun_end = addr + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte_same(*ptep, orig_pte)) { pte_unmap_unlock(ptep, ptl); - goto out; + goto out_mn; } get_page(kpage); @@ -814,6 +829,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, pte_unmap_unlock(ptep, ptl); err = 0; +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return err; } diff --git a/mm/memory.c b/mm/memory.c index b03a4a21c1d..01ec048ece8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2527,6 +2527,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, int ret = 0; int page_mkwrite = 0; struct page *dirty_page = NULL; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + bool mmun_called = false; /* For mmu_notifiers */ old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) { @@ -2704,6 +2707,11 @@ gotten: if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; + mmun_start = address & PAGE_MASK; + mmun_end = (address & PAGE_MASK) + PAGE_SIZE; + mmun_called = true; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + /* * Re-check the pte - we dropped the lock */ @@ -2766,14 +2774,12 @@ gotten: } else mem_cgroup_uncharge_page(new_page); + if (new_page) + page_cache_release(new_page); unlock: pte_unmap_unlock(page_table, ptl); - if (new_page) { - if (new_page == old_page) - /* cow happened, notify before releasing old_page */ - mmu_notifier_invalidate_page(mm, address); - page_cache_release(new_page); - } + if (mmun_called) + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (old_page) { /* * Don't let another task, with possibly unlocked vma, diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index c297142f0fe..479a1e751a7 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -137,12 +137,6 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->change_pte) mn->ops->change_pte(mn, mm, address, pte); - /* - * Some drivers don't have change_pte, - * so we must call invalidate_page in that case. - */ - else if (mn->ops->invalidate_page) - mn->ops->invalidate_page(mn, mm, address); } srcu_read_unlock(&srcu, id); } -- cgit v1.2.3 From 082708072a4250f5c4dbc62065e7af93f5e45646 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:33:38 -0700 Subject: mm: revert 0def08e3 ("mm/mempolicy.c: check return code of check_range") Revert commit 0def08e3acc2 because check_range can't fail in migrate_to_node with considering current usecases. Quote from Johannes : I think it makes sense to revert. Not because of the semantics, but I : just don't see how check_range() could even fail for this callsite: : : 1. we pass mm->mmap->vm_start in there, so we should not fail due to : find_vma() : : 2. we pass MPOL_MF_DISCONTIG_OK, so the discontig checks do not apply : and so can not fail : : 3. we pass MPOL_MF_MOVE | MPOL_MF_MOVE_ALL, the page table loops will : continue until addr == end, so we never fail with -EIO And I added a new VM_BUG_ON for checking migrate_to_node's future usecase which might pass to MPOL_MF_STRICT. Suggested-by: Johannes Weiner Signed-off-by: Minchan Kim Acked-by: KOSAKI Motohiro Cc: Mel Gorman Cc: Christoph Lameter Cc: David Rientjes Cc: Vasiliy Kulikov Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3d64b369180..0b78fb9ea65 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -946,15 +946,18 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodemask_t nmask; LIST_HEAD(pagelist); int err = 0; - struct vm_area_struct *vma; nodes_clear(nmask); node_set(source, nmask); - vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + /* + * This does not "check" the range but isolates all pages that + * need migration. Between passing in the full user address + * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. + */ + VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); + check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); - if (IS_ERR(vma)) - return PTR_ERR(vma); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, -- cgit v1.2.3 From 5a883813845a2bb5ed2bd8c9240736c0740b156f Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:33:39 -0700 Subject: memory-hotplug: fix zone stat mismatch During memory-hotplug, I found NR_ISOLATED_[ANON|FILE] are increasing, causing the kernel to hang. When the system doesn't have enough free pages, it enters reclaim but never reclaim any pages due to too_many_isolated()==true and loops forever. The cause is that when we do memory-hotadd after memory-remove, __zone_pcp_update() clears a zone's ZONE_STAT_ITEMS in setup_pageset() although the vm_stat_diff of all CPUs still have values. In addtion, when we offline all pages of the zone, we reset them in zone_pcp_reset without draining so we loss some zone stat item. Reviewed-by: Wen Congyang Signed-off-by: Minchan Kim Cc: Kamezawa Hiroyuki Cc: Yasuaki Ishimatsu Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++++++ mm/vmstat.c | 12 ++++++++++++ 2 files changed, 19 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9b8e6243a52..5485f0ef4ec 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5916,6 +5916,7 @@ static int __meminit __zone_pcp_update(void *data) local_irq_save(flags); if (pcp->count > 0) free_pcppages_bulk(zone, pcp->count, pcp); + drain_zonestat(zone, pset); setup_pageset(pset, batch); local_irq_restore(flags); } @@ -5932,10 +5933,16 @@ void __meminit zone_pcp_update(struct zone *zone) void zone_pcp_reset(struct zone *zone) { unsigned long flags; + int cpu; + struct per_cpu_pageset *pset; /* avoid races with drain_pages() */ local_irq_save(flags); if (zone->pageset != &boot_pageset) { + for_each_online_cpu(cpu) { + pset = per_cpu_ptr(zone->pageset, cpu); + drain_zonestat(zone, pset); + } free_percpu(zone->pageset); zone->pageset = &boot_pageset; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 05e3a991374..2f11309955c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -495,6 +495,18 @@ void refresh_cpu_vm_stats(int cpu) atomic_long_add(global_diff[i], &vm_stat[i]); } +void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) +{ + int i; + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (pset->vm_stat_diff[i]) { + int v = pset->vm_stat_diff[i]; + pset->vm_stat_diff[i] = 0; + atomic_long_add(v, &zone->vm_stat[i]); + atomic_long_add(v, &vm_stat[i]); + } +} #endif #ifdef CONFIG_NUMA -- cgit v1.2.3 From 8befedfe678ff84d5bc356be4f3cb1fd84959d02 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:33:42 -0700 Subject: mm: remove unevictable_pgs_mlockfreed Simply remove UNEVICTABLE_MLOCKFREED and unevictable_pgs_mlockfreed line from /proc/vmstat: Johannes and Mel point out that it was very unlikely to have been used by any tool, and of course we can restore it easily enough if that turns out to be wrong. Signed-off-by: Hugh Dickins Cc: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Cc: Michel Lespinasse Cc: Ying Han Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 2f11309955c..c7370579111 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -794,7 +794,6 @@ const char * const vmstat_text[] = { "unevictable_pgs_munlocked", "unevictable_pgs_cleared", "unevictable_pgs_stranded", - "unevictable_pgs_mlockfreed", /* no longer useful: always zero */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE "thp_fault_alloc", -- cgit v1.2.3 From c462f179e4d273f0da0e7cf302e29b0edf43844e Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Mon, 8 Oct 2012 16:33:43 -0700 Subject: mm/memory.c: fix typo in comment Signed-off-by: Robert P. J. Day Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 01ec048ece8..45bb6d296b6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2148,7 +2148,7 @@ out: * @addr: target user address of this page * @pfn: source kernel pfn * - * Similar to vm_inert_page, this allows drivers to insert individual pages + * Similar to vm_insert_page, this allows drivers to insert individual pages * they've allocated into a user vma. Same comments apply. * * This function should only be called from a vm_ops->fault handler, and -- cgit v1.2.3 From e46a28790e594c0876d1a84270926abf75460f61 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:33:48 -0700 Subject: CMA: migrate mlocked pages Presently CMA cannot migrate mlocked pages so it ends up failing to allocate contiguous memory space. This patch makes mlocked pages be migrated out. Of course, it can affect realtime processes but in CMA usecase, contiguous memory allocation failing is far worse than access latency to an mlocked page being variable while CMA is running. If someone wants to make the system realtime, he shouldn't enable CMA because stalls can still happen at random times. [akpm@linux-foundation.org: tweak comment text, per Mel] Signed-off-by: Minchan Kim Acked-by: Mel Gorman Cc: Michal Nazarewicz Cc: Bartlomiej Zolnierkiewicz Cc: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 8 ++++++-- mm/internal.h | 2 +- mm/page_alloc.c | 2 +- mm/vmscan.c | 4 ++-- 4 files changed, 10 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index d8187f9cabb..2c4ce17651d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -461,6 +461,7 @@ static bool too_many_isolated(struct zone *zone) * @cc: Compaction control structure. * @low_pfn: The first PFN of the range. * @end_pfn: The one-past-the-last PFN of the range. + * @unevictable: true if it allows to isolate unevictable pages * * Isolate all pages that can be migrated from the range specified by * [low_pfn, end_pfn). Returns zero if there is a fatal signal @@ -476,7 +477,7 @@ static bool too_many_isolated(struct zone *zone) */ unsigned long isolate_migratepages_range(struct zone *zone, struct compact_control *cc, - unsigned long low_pfn, unsigned long end_pfn) + unsigned long low_pfn, unsigned long end_pfn, bool unevictable) { unsigned long last_pageblock_nr = 0, pageblock_nr; unsigned long nr_scanned = 0, nr_isolated = 0; @@ -602,6 +603,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (!cc->sync) mode |= ISOLATE_ASYNC_MIGRATE; + if (unevictable) + mode |= ISOLATE_UNEVICTABLE; + lruvec = mem_cgroup_page_lruvec(page, zone); /* Try isolate the page */ @@ -807,7 +811,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } /* Perform the isolation */ - low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); + low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false); if (!low_pfn || cc->contended) return ISOLATE_ABORT; diff --git a/mm/internal.h b/mm/internal.h index 4dc93e2fe69..f5f295fe11e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -138,7 +138,7 @@ isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); unsigned long isolate_migratepages_range(struct zone *zone, struct compact_control *cc, - unsigned long low_pfn, unsigned long end_pfn); + unsigned long low_pfn, unsigned long end_pfn, bool unevictable); #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5485f0ef4ec..fd86c47de86 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5690,7 +5690,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, if (list_empty(&cc->migratepages)) { cc->nr_migratepages = 0; pfn = isolate_migratepages_range(cc->zone, cc, - pfn, end); + pfn, end, true); if (!pfn) { ret = -EINTR; break; diff --git a/mm/vmscan.c b/mm/vmscan.c index 8b627309dd4..2624edcfb42 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1009,8 +1009,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) if (!PageLRU(page)) return ret; - /* Do not give back unevictable pages for compaction */ - if (PageUnevictable(page)) + /* Compaction should not handle unevictable pages but CMA can do so */ + if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) return ret; ret = -EBUSY; -- cgit v1.2.3 From beb51eaa88238daba698ad837222ad277d440b6d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:33:51 -0700 Subject: cma: decrease cc.nr_migratepages after reclaiming pagelist reclaim_clean_pages_from_list() reclaims clean pages before migration so cc.nr_migratepages should be updated. Currently, there is no problem but it can be wrong if we try to use the value in future. Signed-off-by: Minchan Kim Acked-by: Mel Gorman Cc: Michal Nazarewicz Cc: Bartlomiej Zolnierkiewicz Cc: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd86c47de86..bb90971182b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5674,7 +5674,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end) { /* This function is based on compact_zone() from compaction.c. */ - + unsigned long nr_reclaimed; unsigned long pfn = start; unsigned int tries = 0; int ret = 0; @@ -5701,7 +5701,9 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, break; } - reclaim_clean_pages_from_list(cc->zone, &cc->migratepages); + nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, + &cc->migratepages); + cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, -- cgit v1.2.3 From c22331166b49b94b51424bb7fa1f83e09ad82734 Mon Sep 17 00:00:00 2001 From: Raghavendra D Prabhu Date: Mon, 8 Oct 2012 16:33:55 -0700 Subject: mm: avoid section mismatch warning for memblock_type_name Following section mismatch warning is thrown during build; WARNING: vmlinux.o(.text+0x32408f): Section mismatch in reference from the function memblock_type_name() to the variable .meminit.data:memblock The function memblock_type_name() references the variable __meminitdata memblock. This is often because memblock_type_name lacks a __meminitdata annotation or the annotation of memblock is wrong. This is because memblock_type_name makes reference to memblock variable with attribute __meminitdata. Hence, the warning (even if the function is inline). [akpm@linux-foundation.org: remove inline] Signed-off-by: Raghavendra D Prabhu Cc: Tejun Heo Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index d809f17ef75..931eef145af 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -41,7 +41,8 @@ static int memblock_memory_in_slab __initdata_memblock = 0; static int memblock_reserved_in_slab __initdata_memblock = 0; /* inline so we don't get a warning when pr_debug is compiled out */ -static inline const char *memblock_type_name(struct memblock_type *type) +static __init_memblock const char * +memblock_type_name(struct memblock_type *type) { if (type == &memblock.memory) return "memory"; -- cgit v1.2.3 From a16cee10c7ab994546ed98d9abfd4de74050124a Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Mon, 8 Oct 2012 16:33:58 -0700 Subject: memory-hotplug: preparation to notify memory block's state at memory hot remove remove_memory() is called in two cases: 1. echo offline >/sys/devices/system/memory/memoryXX/state 2. hot remove a memory device In the 1st case, the memory block's state is changed and the notification that memory block's state changed is sent to userland after calling remove_memory(). So user can notice memory block is changed. But in the 2nd case, the memory block's state is not changed and the notification is not also sent to userspcae even if calling remove_memory(). So user cannot notice memory block is changed. For adding the notification at memory hot remove, the patch just prepare as follows: 1st case uses offline_pages() for offlining memory. 2nd case uses remove_memory() for offlining memory and changing memory block's state and notifing the information. The patch does not implement notification to remove_memory(). Signed-off-by: Wen Congyang Signed-off-by: Yasuaki Ishimatsu Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Christoph Lameter Cc: Minchan Kim Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ce690a911f1..dfc0a6134c7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -874,7 +874,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } -static int __ref offline_pages(unsigned long start_pfn, +static int __ref __offline_pages(unsigned long start_pfn, unsigned long end_pfn, unsigned long timeout) { unsigned long pfn, nr_pages, expire; @@ -1007,15 +1007,24 @@ out: return ret; } +int offline_pages(unsigned long start_pfn, unsigned long nr_pages) +{ + return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); +} + int remove_memory(u64 start, u64 size) { unsigned long start_pfn, end_pfn; start_pfn = PFN_DOWN(start); end_pfn = start_pfn + PFN_DOWN(size); - return offline_pages(start_pfn, end_pfn, 120 * HZ); + return __offline_pages(start_pfn, end_pfn, 120 * HZ); } #else +int offline_pages(unsigned long start_pfn, unsigned long nr_pages) +{ + return -EINVAL; +} int remove_memory(u64 start, u64 size) { return -EINVAL; -- cgit v1.2.3 From e90bdb7f52f94204c78fb40b0804645defdebd71 Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Mon, 8 Oct 2012 16:34:01 -0700 Subject: memory-hotplug: update memory block's state and notify userspace remove_memory() will be called when hot removing a memory device. But even if offlining memory, we cannot notice it. So the patch updates the memory block's state and sends notification to userspace. Additionally, the memory device may contain more than one memory block. If the memory block has been offlined, __offline_pages() will fail. So we should try to offline one memory block at a time. Thus remove_memory() also check each memory block's state. So there is no need to check the memory block's state before calling remove_memory(). Signed-off-by: Wen Congyang Signed-off-by: Yasuaki Ishimatsu Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Christoph Lameter Cc: Minchan Kim Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index dfc0a6134c7..7d0797475a4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1014,11 +1014,42 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) int remove_memory(u64 start, u64 size) { + struct memory_block *mem = NULL; + struct mem_section *section; unsigned long start_pfn, end_pfn; + unsigned long pfn, section_nr; + int ret; start_pfn = PFN_DOWN(start); end_pfn = start_pfn + PFN_DOWN(size); - return __offline_pages(start_pfn, end_pfn, 120 * HZ); + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + section_nr = pfn_to_section_nr(pfn); + if (!present_section_nr(section_nr)) + continue; + + section = __nr_to_section(section_nr); + /* same memblock? */ + if (mem) + if ((section_nr >= mem->start_section_nr) && + (section_nr <= mem->end_section_nr)) + continue; + + mem = find_memory_block_hinted(section, mem); + if (!mem) + continue; + + ret = offline_memory_block(mem); + if (ret) { + kobject_put(&mem->dev.kobj); + return ret; + } + } + + if (mem) + kobject_put(&mem->dev.kobj); + + return 0; } #else int offline_pages(unsigned long start_pfn, unsigned long nr_pages) -- cgit v1.2.3 From b676b293fb48672904ee1b9828cb50b4eed01717 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 8 Oct 2012 16:34:03 -0700 Subject: mm, thp: fix mapped pages avoiding unevictable list on mlock When a transparent hugepage is mapped and it is included in an mlock() range, follow_page() incorrectly avoids setting the page's mlock bit and moving it to the unevictable lru. This is evident if you try to mlock(), munlock(), and then mlock() a range again. Currently: #define MAP_SIZE (4 << 30) /* 4GB */ void *ptr = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); mlock(ptr, MAP_SIZE); $ grep -E "Unevictable|Inactive\(anon" /proc/meminfo Inactive(anon): 6304 kB Unevictable: 4213924 kB munlock(ptr, MAP_SIZE); Inactive(anon): 4186252 kB Unevictable: 19652 kB mlock(ptr, MAP_SIZE); Inactive(anon): 4198556 kB Unevictable: 21684 kB Notice that less than 2MB was added to the unevictable list; this is because these pages in the range are not transparent hugepages since the 4GB range was allocated with mmap() and has no specific alignment. If posix_memalign() were used instead, unevictable would not have grown at all on the second mlock(). The fix is to call mlock_vma_page() so that the mlock bit is set and the page is added to the unevictable list. With this patch: mlock(ptr, MAP_SIZE); Inactive(anon): 4056 kB Unevictable: 4213940 kB munlock(ptr, MAP_SIZE); Inactive(anon): 4198268 kB Unevictable: 19636 kB mlock(ptr, MAP_SIZE); Inactive(anon): 4008 kB Unevictable: 4213940 kB Signed-off-by: David Rientjes Acked-by: Hugh Dickins Reviewed-by: Andrea Arcangeli Cc: Naoya Horiguchi Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 11 ++++++++++- mm/memory.c | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 08a943b9cf9..3a8d6b7d95d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -971,11 +971,12 @@ out_unlock: return ret; } -struct page *follow_trans_huge_pmd(struct mm_struct *mm, +struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags) { + struct mm_struct *mm = vma->vm_mm; struct page *page = NULL; assert_spin_locked(&mm->page_table_lock); @@ -998,6 +999,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); } + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + if (page->mapping && trylock_page(page)) { + lru_add_drain(); + if (page->mapping) + mlock_vma_page(page); + unlock_page(page); + } + } page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON(!PageCompound(page)); if (flags & FOLL_GET) diff --git a/mm/memory.c b/mm/memory.c index 45bb6d296b6..fb135ba4aba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1528,7 +1528,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, spin_unlock(&mm->page_table_lock); wait_split_huge_page(vma->anon_vma, pmd); } else { - page = follow_trans_huge_pmd(mm, address, + page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(&mm->page_table_lock); goto out; -- cgit v1.2.3 From 8449d21fb49e9824e2736c5febd6b5d287cd2ba1 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 8 Oct 2012 16:34:06 -0700 Subject: mm, thp: fix mlock statistics NR_MLOCK is only accounted in single page units: there's no logic to handle transparent hugepages. This patch checks the appropriate number of pages to adjust the statistics by so that the correct amount of memory is reflected. Currently: $ grep Mlocked /proc/meminfo Mlocked: 19636 kB #define MAP_SIZE (4 << 30) /* 4GB */ void *ptr = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); mlock(ptr, MAP_SIZE); $ grep Mlocked /proc/meminfo Mlocked: 29844 kB munlock(ptr, MAP_SIZE); $ grep Mlocked /proc/meminfo Mlocked: 19636 kB And with this patch: $ grep Mlock /proc/meminfo Mlocked: 19636 kB mlock(ptr, MAP_SIZE); $ grep Mlock /proc/meminfo Mlocked: 4213664 kB munlock(ptr, MAP_SIZE); $ grep Mlock /proc/meminfo Mlocked: 19636 kB Signed-off-by: David Rientjes Reported-by: Hugh Dickens Acked-by: Hugh Dickins Reviewed-by: Andrea Arcangeli Cc: Naoya Horiguchi Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Reviewed-by: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 3 ++- mm/mlock.c | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index f5f295fe11e..a4fa284f6bc 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -180,7 +180,8 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma, return 0; if (!TestSetPageMlocked(page)) { - inc_zone_page_state(page, NR_MLOCK); + mod_zone_page_state(page_zone(page), NR_MLOCK, + hpage_nr_pages(page)); count_vm_event(UNEVICTABLE_PGMLOCKED); } return 1; diff --git a/mm/mlock.c b/mm/mlock.c index de732159289..f0b9ce572fc 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -56,7 +56,8 @@ void clear_page_mlock(struct page *page) if (!TestClearPageMlocked(page)) return; - dec_zone_page_state(page, NR_MLOCK); + mod_zone_page_state(page_zone(page), NR_MLOCK, + -hpage_nr_pages(page)); count_vm_event(UNEVICTABLE_PGCLEARED); if (!isolate_lru_page(page)) { putback_lru_page(page); @@ -78,7 +79,8 @@ void mlock_vma_page(struct page *page) BUG_ON(!PageLocked(page)); if (!TestSetPageMlocked(page)) { - inc_zone_page_state(page, NR_MLOCK); + mod_zone_page_state(page_zone(page), NR_MLOCK, + hpage_nr_pages(page)); count_vm_event(UNEVICTABLE_PGMLOCKED); if (!isolate_lru_page(page)) putback_lru_page(page); @@ -105,7 +107,8 @@ void munlock_vma_page(struct page *page) BUG_ON(!PageLocked(page)); if (TestClearPageMlocked(page)) { - dec_zone_page_state(page, NR_MLOCK); + mod_zone_page_state(page_zone(page), NR_MLOCK, + -hpage_nr_pages(page)); if (!isolate_lru_page(page)) { int ret = SWAP_AGAIN; -- cgit v1.2.3 From 45ec16908e84e00d847e9063231fb3a75e6366bc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 8 Oct 2012 16:34:09 -0700 Subject: mm: use %pK for /proc/vmallocinfo In the paranoid case of sysctl kernel.kptr_restrict=2, mask the kernel virtual addresses in /proc/vmallocinfo too. Signed-off-by: Kees Cook Reported-by: Brad Spengler Acked-by: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8de704679bf..78e08300db2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2571,7 +2571,7 @@ static int s_show(struct seq_file *m, void *p) { struct vm_struct *v = p; - seq_printf(m, "0x%p-0x%p %7ld", + seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); if (v->caller) -- cgit v1.2.3 From 7795912c257bc068445f1db429c94d6b4b6ee604 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 8 Oct 2012 16:34:11 -0700 Subject: mm: document PageHuge somewhat Acked-by: David Rientjes Cc: Mel Gorman Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 993f7c1820a..59a0059b39e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -671,6 +671,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) } } +/* + * PageHuge() only returns true for hugetlbfs pages, but not for normal or + * transparent huge pages. See the PageTransHuge() documentation for more + * details. + */ int PageHuge(struct page *page) { compound_page_dtor *dtor; -- cgit v1.2.3 From d760afd4d2570653891f94e13b848e97150dc5a6 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Mon, 8 Oct 2012 16:34:14 -0700 Subject: memory-hotplug: suppress "Trying to free nonexistent resource " warning When our x86 box calls __remove_pages(), release_mem_region() shows many warnings. And x86 box cannot unregister iomem_resource. "Trying to free nonexistent resource " release_mem_region() has been changed to be called in each PAGES_PER_SECTION by commit de7f0cba9678 ("memory hotplug: release memory regions in PAGES_PER_SECTION chunks"). Because powerpc registers iomem_resource in each PAGES_PER_SECTION chunk. But when I hot add memory on x86 box, iomem_resource is register in each _CRS not PAGES_PER_SECTION chunk. So x86 box unregisters iomem_resource. The patch fixes the problem. Signed-off-by: Yasuaki Ishimatsu Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Christoph Lameter Cc: Minchan Kim Cc: Andrew Morton Cc: KOSAKI Motohiro Cc: Wen Congyang Cc: Dave Hansen Cc: Nathan Fontenot Cc: Badari Pulavarty Cc: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7d0797475a4..56b758ae57d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -369,11 +369,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); BUG_ON(nr_pages % PAGES_PER_SECTION); + release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); + sections_to_remove = nr_pages / PAGES_PER_SECTION; for (i = 0; i < sections_to_remove; i++) { unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; - release_mem_region(pfn << PAGE_SHIFT, - PAGES_PER_SECTION << PAGE_SHIFT); ret = __remove_section(zone, __pfn_to_section(pfn)); if (ret) break; -- cgit v1.2.3 From b113da65785d5f3f9ff1451ec0fe43d6d76da25b Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 8 Oct 2012 16:34:25 -0700 Subject: mm: Add and use update_mmu_cache_pmd() in transparent huge page code. The transparent huge page code passes a PMD pointer in as the third argument of update_mmu_cache(), which expects a PTE pointer. This never got noticed because X86 implements update_mmu_cache() as a macro and thus we don't get any type checking, and X86 is the only architecture which supports transparent huge pages currently. Before other architectures can support transparent huge pages properly we need to add a new interface which will take a PMD pointer as the third argument rather than a PTE pointer. [akpm@linux-foundation.org: implement update_mm_cache_pmd() for s390] Signed-off-by: David S. Miller Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Gerald Schaefer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3a8d6b7d95d..68a3c93036f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -900,7 +900,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) - update_mmu_cache(vma, address, pmd); + update_mmu_cache_pmd(vma, address, pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } @@ -956,7 +956,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache(vma, address, pmd); + update_mmu_cache_pmd(vma, address, pmd); page_remove_rmap(page); put_page(page); ret |= VM_FAULT_WRITE; @@ -2041,7 +2041,7 @@ static void collapse_huge_page(struct mm_struct *mm, BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache(vma, address, pmd); + update_mmu_cache_pmd(vma, address, pmd); pgtable_trans_huge_deposit(mm, pgtable); spin_unlock(&mm->page_table_lock); -- cgit v1.2.3 From f5c8ad47284ca01dafc37da5a72bb9644174d387 Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 8 Oct 2012 16:34:26 -0700 Subject: mm: thp: Use more portable PMD clearing sequenece in zap_huge_pmd(). Invalidation sequences are handled in various ways on various architectures. One way, which sparc64 uses, is to let the set_*_at() functions accumulate pending flushes into a per-cpu array. Then the flush_tlb_range() et al. calls process the pending TLB flushes. In this regime, the __tlb_remove_*tlb_entry() implementations are essentially NOPs. The canonical PTE zap in mm/memory.c is: ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); With a subsequent tlb_flush_mmu() if needed. Mirror this in the THP PMD zapping using: orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); page = pmd_page(orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); And we properly accomodate TLB flush mechanims like the one described above. Signed-off-by: David S. Miller Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Gerald Schaefer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 68a3c93036f..a863af26c79 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1024,9 +1024,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (__pmd_trans_huge_lock(pmd, vma) == 1) { struct page *page; pgtable_t pgtable; + pmd_t orig_pmd; pgtable = pgtable_trans_huge_withdraw(tlb->mm); - page = pmd_page(*pmd); - pmd_clear(pmd); + orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); + page = pmd_page(orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); page_remove_rmap(page); VM_BUG_ON(page_mapcount(page) < 0); -- cgit v1.2.3 From baaf1dd491433a78826150aff7411015de7e9b65 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 5 Aug 2012 15:00:58 +0000 Subject: mm/slob: use min_t() to compare ARCH_SLAB_MINALIGN The definition of ARCH_SLAB_MINALIGN is architecture dependent and can be either of type size_t or int. Comparing that value with ARCH_KMALLOC_MINALIGN can cause harmless warnings on platforms where they are different. Since both are always small positive integer numbers, using the size_t type to compare them is safe and gets rid of the warning. Without this patch, building ARM collie_defconfig results in: mm/slob.c: In function '__kmalloc_node': mm/slob.c:431:152: warning: comparison of distinct pointer types lacks a cast [enabled by default] mm/slob.c: In function 'kfree': mm/slob.c:484:153: warning: comparison of distinct pointer types lacks a cast [enabled by default] mm/slob.c: In function 'ksize': mm/slob.c:503:153: warning: comparison of distinct pointer types lacks a cast [enabled by default] Signed-off-by: Arnd Bergmann Acked-by: Christoph Lameter Cc: Pekka Enberg --- mm/slob.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index 45d4ca79933..497c55e318a 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -428,7 +428,7 @@ out: void *__kmalloc_node(size_t size, gfp_t gfp, int node) { unsigned int *m; - int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); void *ret; gfp &= gfp_allowed_mask; @@ -481,7 +481,7 @@ void kfree(const void *block) sp = virt_to_page(block); if (PageSlab(sp)) { - int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); slob_free(m, *m + align); } else @@ -500,7 +500,7 @@ size_t ksize(const void *block) sp = virt_to_page(block); if (PageSlab(sp)) { - int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); return SLOB_UNITS(*m) * SLOB_UNIT; } else -- cgit v1.2.3 From 35c2a7f4908d404c9124c2efc6ada4640ca4d5d5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 7 Oct 2012 20:32:51 -0700 Subject: tmpfs,ceph,gfs2,isofs,reiserfs,xfs: fix fh_len checking Fuzzing with trinity oopsed on the 1st instruction of shmem_fh_to_dentry(), u64 inum = fid->raw[2]; which is unhelpfully reported as at the end of shmem_alloc_inode(): BUG: unable to handle kernel paging request at ffff880061cd3000 IP: [] shmem_alloc_inode+0x40/0x40 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Call Trace: [] ? exportfs_decode_fh+0x79/0x2d0 [] do_handle_open+0x163/0x2c0 [] sys_open_by_handle_at+0xc/0x10 [] tracesys+0xe1/0xe6 Right, tmpfs is being stupid to access fid->raw[2] before validating that fh_len includes it: the buffer kmalloc'ed by do_sys_name_to_handle() may fall at the end of a page, and the next page not be present. But some other filesystems (ceph, gfs2, isofs, reiserfs, xfs) are being careless about fh_len too, in fh_to_dentry() and/or fh_to_parent(), and could oops in the same way: add the missing fh_len checks to those. Reported-by: Sasha Levin Signed-off-by: Hugh Dickins Cc: Al Viro Cc: Sage Weil Cc: Steven Whitehouse Cc: Christoph Hellwig Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- mm/shmem.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index cc12072f878..67afba5117f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2220,12 +2220,14 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb, { struct inode *inode; struct dentry *dentry = NULL; - u64 inum = fid->raw[2]; - inum = (inum << 32) | fid->raw[1]; + u64 inum; if (fh_len < 3) return NULL; + inum = fid->raw[2]; + inum = (inum << 32) | fid->raw[1]; + inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), shmem_match, fid->raw); if (inode) { -- cgit v1.2.3 From 210ed9defffca13b909f040d7338d8062e5594a3 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Mon, 8 Oct 2012 09:26:01 +0200 Subject: mm, slab: release slab_mutex earlier in kmem_cache_destroy() Commit 1331e7a1bbe1 ("rcu: Remove _rcu_barrier() dependency on __stop_machine()") introduced slab_mutex -> cpu_hotplug.lock dependency through kmem_cache_destroy() -> rcu_barrier() -> _rcu_barrier() -> get_online_cpus(). Lockdep thinks that this might actually result in ABBA deadlock, and reports it as below: === [ cut here ] === ====================================================== [ INFO: possible circular locking dependency detected ] 3.6.0-rc5-00004-g0d8ee37 #143 Not tainted ------------------------------------------------------- kworker/u:2/40 is trying to acquire lock: (rcu_sched_state.barrier_mutex){+.+...}, at: [] _rcu_barrier+0x26/0x1e0 but task is already holding lock: (slab_mutex){+.+.+.}, at: [] kmem_cache_destroy+0x45/0xe0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (slab_mutex){+.+.+.}: [] validate_chain+0x632/0x720 [] __lock_acquire+0x309/0x530 [] lock_acquire+0x121/0x190 [] __mutex_lock_common+0x5c/0x450 [] mutex_lock_nested+0x3e/0x50 [] cpuup_callback+0x2f/0xbe [] notifier_call_chain+0x93/0x140 [] __raw_notifier_call_chain+0x9/0x10 [] _cpu_up+0xba/0x14e [] cpu_up+0xbc/0x117 [] smp_init+0x6b/0x9f [] kernel_init+0x147/0x1dc [] kernel_thread_helper+0x4/0x10 -> #1 (cpu_hotplug.lock){+.+.+.}: [] validate_chain+0x632/0x720 [] __lock_acquire+0x309/0x530 [] lock_acquire+0x121/0x190 [] __mutex_lock_common+0x5c/0x450 [] mutex_lock_nested+0x3e/0x50 [] get_online_cpus+0x37/0x50 [] _rcu_barrier+0xbb/0x1e0 [] rcu_barrier_sched+0x10/0x20 [] rcu_barrier+0x9/0x10 [] deactivate_locked_super+0x49/0x90 [] deactivate_super+0x61/0x70 [] mntput_no_expire+0x127/0x180 [] sys_umount+0x6e/0xd0 [] system_call_fastpath+0x16/0x1b -> #0 (rcu_sched_state.barrier_mutex){+.+...}: [] check_prev_add+0x3de/0x440 [] validate_chain+0x632/0x720 [] __lock_acquire+0x309/0x530 [] lock_acquire+0x121/0x190 [] __mutex_lock_common+0x5c/0x450 [] mutex_lock_nested+0x3e/0x50 [] _rcu_barrier+0x26/0x1e0 [] rcu_barrier_sched+0x10/0x20 [] rcu_barrier+0x9/0x10 [] kmem_cache_destroy+0xd1/0xe0 [] nf_conntrack_cleanup_net+0xe4/0x110 [nf_conntrack] [] nf_conntrack_cleanup+0x2a/0x70 [nf_conntrack] [] nf_conntrack_net_exit+0x5e/0x80 [nf_conntrack] [] ops_exit_list+0x39/0x60 [] cleanup_net+0xfb/0x1b0 [] process_one_work+0x26b/0x4c0 [] worker_thread+0x12e/0x320 [] kthread+0x9e/0xb0 [] kernel_thread_helper+0x4/0x10 other info that might help us debug this: Chain exists of: rcu_sched_state.barrier_mutex --> cpu_hotplug.lock --> slab_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(slab_mutex); lock(cpu_hotplug.lock); lock(slab_mutex); lock(rcu_sched_state.barrier_mutex); *** DEADLOCK *** === [ cut here ] === This is actually a false positive. Lockdep has no way of knowing the fact that the ABBA can actually never happen, because of special semantics of cpu_hotplug.refcount and its handling in cpu_hotplug_begin(); the mutual exclusion there is not achieved through mutex, but through cpu_hotplug.refcount. The "neither cpu_up() nor cpu_down() will proceed past cpu_hotplug_begin() until everyone who called get_online_cpus() will call put_online_cpus()" semantics is totally invisible to lockdep. This patch therefore moves the unlock of slab_mutex so that rcu_barrier() is being called with it unlocked. It has two advantages: - it slightly reduces hold time of slab_mutex; as it's used to protect the cachep list, it's not necessary to hold it over kmem_cache_free() call any more - it silences the lockdep false positive warning, as it avoids lockdep ever learning about slab_mutex -> cpu_hotplug.lock dependency Reviewed-by: Paul E. McKenney Reviewed-by: Srivatsa S. Bhat Acked-by: David Rientjes Signed-off-by: Jiri Kosina Signed-off-by: Pekka Enberg --- mm/slab_common.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 9c217255ac4..069a24e6440 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -168,6 +168,7 @@ void kmem_cache_destroy(struct kmem_cache *s) list_del(&s->list); if (!__kmem_cache_shutdown(s)) { + mutex_unlock(&slab_mutex); if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); @@ -175,12 +176,14 @@ void kmem_cache_destroy(struct kmem_cache *s) kmem_cache_free(kmem_cache, s); } else { list_add(&s->list, &slab_caches); + mutex_unlock(&slab_mutex); printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n", s->name); dump_stack(); } + } else { + mutex_unlock(&slab_mutex); } - mutex_unlock(&slab_mutex); put_online_cpus(); } EXPORT_SYMBOL(kmem_cache_destroy); -- cgit v1.2.3 From 91a27b2a756784714e924e5e854b919273082d26 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Oct 2012 15:25:28 -0400 Subject: vfs: define struct filename and have getname() return it getname() is intended to copy pathname strings from userspace into a kernel buffer. The result is just a string in kernel space. It would however be quite helpful to be able to attach some ancillary info to the string. For instance, we could attach some audit-related info to reduce the amount of audit-related processing needed. When auditing is enabled, we could also call getname() on the string more than once and not need to recopy it from userspace. This patchset converts the getname()/putname() interfaces to return a struct instead of a string. For now, the struct just tracks the string in kernel space and the original userland pointer for it. Later, we'll add other information to the struct as it becomes convenient. Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- mm/swapfile.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 14e254c768f..90d2ed591de 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1483,7 +1483,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct file *swap_file, *victim; struct address_space *mapping; struct inode *inode; - char *pathname; + struct filename *pathname; int oom_score_adj; int i, type, prev; int err; @@ -1498,8 +1498,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (IS_ERR(pathname)) goto out; - victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); - putname(pathname); + victim = filp_open(pathname->name, O_RDWR|O_LARGEFILE, 0); err = PTR_ERR(victim); if (IS_ERR(victim)) goto out; @@ -1936,7 +1935,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *p; - char *name; + struct filename *name; struct file *swap_file = NULL; struct address_space *mapping; int i; @@ -1967,7 +1966,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) name = NULL; goto bad_swap; } - swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); + swap_file = filp_open(name->name, O_RDWR|O_LARGEFILE, 0); if (IS_ERR(swap_file)) { error = PTR_ERR(swap_file); swap_file = NULL; @@ -2053,7 +2052,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) printk(KERN_INFO "Adding %uk swap on %s. " "Priority:%d extents:%d across:%lluk %s%s%s\n", - p->pages<<(PAGE_SHIFT-10), name, p->prio, + p->pages<<(PAGE_SHIFT-10), name->name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", (p->flags & SWP_DISCARDABLE) ? "D" : "", -- cgit v1.2.3 From 669abf4e5539c8aa48bf28c965be05c0a7b58a27 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Oct 2012 16:43:10 -0400 Subject: vfs: make path_openat take a struct filename pointer ...and fix up the callers. For do_file_open_root, just declare a struct filename on the stack and fill out the .name field. For do_filp_open, make it also take a struct filename pointer, and fix up its callers to call it appropriately. For filp_open, add a variant that takes a struct filename pointer and turn filp_open into a wrapper around it. Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- mm/swapfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 90d2ed591de..71cd288b200 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1498,7 +1498,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (IS_ERR(pathname)) goto out; - victim = filp_open(pathname->name, O_RDWR|O_LARGEFILE, 0); + victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); err = PTR_ERR(victim); if (IS_ERR(victim)) goto out; @@ -1966,7 +1966,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) name = NULL; goto bad_swap; } - swap_file = filp_open(name->name, O_RDWR|O_LARGEFILE, 0); + swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0); if (IS_ERR(swap_file)) { error = PTR_ERR(swap_file); swap_file = NULL; -- cgit v1.2.3 From 325adeb55e32c50055c654e5b06a49f0bd88420b Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Mon, 15 Oct 2012 13:44:56 +0200 Subject: mm: huge_memory: Fix build error. Certain configurations won't implicitly pull in resulting in the following build error: mm/huge_memory.c: In function 'release_pte_page': mm/huge_memory.c:1697:2: error: implicit declaration of function 'unlock_page' [-Werror=implicit-function-declaration] mm/huge_memory.c: In function '__collapse_huge_page_isolate': mm/huge_memory.c:1757:3: error: implicit declaration of function 'trylock_page' [-Werror=implicit-function-declaration] cc1: some warnings being treated as errors Reported-by: David Daney Signed-off-by: Ralf Baechle Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a863af26c79..40f17c34b41 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include "internal.h" -- cgit v1.2.3 From 32f8516a8c733d281faa9f6666b509035246505c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 16 Oct 2012 17:31:23 -0700 Subject: mm, mempolicy: fix printing stack contents in numa_maps When reading /proc/pid/numa_maps, it's possible to return the contents of the stack where the mempolicy string should be printed if the policy gets freed from beneath us. This happens because mpol_to_str() may return an error the stack-allocated buffer is then printed without ever being stored. There are two possible error conditions in mpol_to_str(): - if the buffer allocated is insufficient for the string to be stored, and - if the mempolicy has an invalid mode. The first error condition is not triggered in any of the callers to mpol_to_str(): at least 50 bytes is always allocated on the stack and this is sufficient for the string to be written. A future patch should convert this into BUILD_BUG_ON() since we know the maximum strlen possible, but that's not -rc material. The second error condition is possible if a race occurs in dropping a reference to a task's mempolicy causing it to be freed during the read(). The slab poison value is then used for the mode and mpol_to_str() returns -EINVAL. This race is only possible because get_vma_policy() believes that mm->mmap_sem protects task->mempolicy, which isn't true. The exit path does not hold mm->mmap_sem when dropping the reference or setting task->mempolicy to NULL: it uses task_lock(task) instead. Thus, it's required for the caller of a task mempolicy to hold task_lock(task) while grabbing the mempolicy and reading it. Callers with a vma policy store their mempolicy earlier and can simply increment the reference count so it's guaranteed not to be freed. Reported-by: Dave Jones Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0b78fb9ea65..d04a8a54c29 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1536,9 +1536,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, * * Returns effective policy for a VMA at specified address. * Falls back to @task or system default policy, as necessary. - * Current or other task's task mempolicy and non-shared vma policies - * are protected by the task's mmap_sem, which must be held for read by - * the caller. + * Current or other task's task mempolicy and non-shared vma policies must be + * protected by task_lock(task) by the caller. * Shared policies [those marked as MPOL_F_SHARED] require an extra reference * count--added by the get_policy() vm_op, as appropriate--to protect against * freeing by another task. It is the caller's responsibility to free the -- cgit v1.2.3 From deb521c44fa529b24cc78a64702757a683f82487 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 19 Oct 2012 13:37:57 -0700 Subject: remap_file_pages: correctly handle the case of a NULL vm_ops pointer In commit 0b173bc4daa8 ("mm: kill vma flag VM_CAN_NONLINEAR") we replaced the VM_CAN_NONLINEAR test with checking whether the mapping has a '->remap_pages()' vm operation, but there is no guarantee that there it even has a vm_ops pointer at all. Add the appropriate test for NULL vm_ops. Reported-by: Sasha Levin Cc: Konstantin Khlebnikov Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index 3899a86851c..a0aaf0e5680 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -169,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) goto out; - if (!vma->vm_ops->remap_pages) + if (!vma->vm_ops || !vma->vm_ops->remap_pages) goto out; if (start < vma->vm_start || start + size > vma->vm_end) -- cgit v1.2.3 From 0db63d7e25f96e2c6da925c002badf6f144ddf30 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 19 Oct 2012 13:56:57 -0700 Subject: mm: compaction: correct the nr_strict va isolated check for CMA Thierry reported that the "iron out" patch for isolate_freepages_block() had problems due to the strict check being too strict with "mm: compaction: Iron out isolate_freepages_block() and isolate_freepages_range() -fix1". It's possible that more pages than necessary are isolated but the check still fails and I missed that this fix was not picked up before RC1. This same problem has been identified in 3.7-RC1 by Tony Prisk and should be addressed by the following patch. Signed-off-by: Mel Gorman Tested-by: Tony Prisk Reported-by: Thierry Reding Acked-by: Rik van Riel Acked-by: Minchan Kim Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 2c4ce17651d..9eef55838fc 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -346,7 +346,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, * pages requested were isolated. If there were any failures, 0 is * returned and CMA will fail. */ - if (strict && nr_strict_required != total_isolated) + if (strict && nr_strict_required > total_isolated) total_isolated = 0; if (locked) -- cgit v1.2.3 From 6ede1fd3cb404c0016de6ac529df46d561bd558b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 22 Oct 2012 16:35:18 -0700 Subject: x86, mm: Trim memory in memblock to be page aligned We will not map partial pages, so need to make sure memblock allocation will not allocate those bytes out. Also we will use for_each_mem_pfn_range() to loop to map memory range to keep them consistent. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/CAE9FiQVZirvaBMFYRfXMmWEcHbKSicQEHz4VAwUv0xFCk51ZNw@mail.gmail.com Acked-by: Jacob Shin Signed-off-by: H. Peter Anvin Cc: --- mm/memblock.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 931eef145af..625905523c2 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -930,6 +930,30 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; } +void __init_memblock memblock_trim_memory(phys_addr_t align) +{ + int i; + phys_addr_t start, end, orig_start, orig_end; + struct memblock_type *mem = &memblock.memory; + + for (i = 0; i < mem->cnt; i++) { + orig_start = mem->regions[i].base; + orig_end = mem->regions[i].base + mem->regions[i].size; + start = round_up(orig_start, align); + end = round_down(orig_end, align); + + if (start == orig_start && end == orig_end) + continue; + + if (start < end) { + mem->regions[i].base = start; + mem->regions[i].size = end - start; + } else { + memblock_remove_region(mem, i); + i--; + } + } +} void __init_memblock memblock_set_current_limit(phys_addr_t limit) { -- cgit v1.2.3 From ef5d437f71afdf4afdbab99213add99f4b1318fd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 25 Oct 2012 13:37:31 -0700 Subject: mm: fix XFS oops due to dirty pages without buffers on s390 On s390 any write to a page (even from kernel itself) sets architecture specific page dirty bit. Thus when a page is written to via buffered write, HW dirty bit gets set and when we later map and unmap the page, page_remove_rmap() finds the dirty bit and calls set_page_dirty(). Dirtying of a page which shouldn't be dirty can cause all sorts of problems to filesystems. The bug we observed in practice is that buffers from the page get freed, so when the page gets later marked as dirty and writeback writes it, XFS crashes due to an assertion BUG_ON(!PagePrivate(page)) in page_buffers() called from xfs_count_page_state(). Similar problem can also happen when zero_user_segment() call from xfs_vm_writepage() (or block_write_full_page() for that matter) set the hardware dirty bit during writeback, later buffers get freed, and then page unmapped. Fix the issue by ignoring s390 HW dirty bit for page cache pages of mappings with mapping_cap_account_dirty(). This is safe because for such mappings when a page gets marked as writeable in PTE it is also marked dirty in do_wp_page() or do_page_fault(). When the dirty bit is cleared by clear_page_dirty_for_io(), the page gets writeprotected in page_mkclean(). So pagecache page is writeable if and only if it is dirty. Thanks to Hugh Dickins for pointing out mapping has to have mapping_cap_account_dirty() for things to work and proposing a cleaned up variant of the patch. The patch has survived about two hours of running fsx-linux on tmpfs while heavily swapping and several days of running on out build machines where the original problem was triggered. Signed-off-by: Jan Kara Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Hugh Dickins Cc: Heiko Carstens Cc: [3.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 7df7984d476..2ee1ef0f317 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -56,6 +56,7 @@ #include #include #include +#include #include @@ -926,11 +927,8 @@ int page_mkclean(struct page *page) if (page_mapped(page)) { struct address_space *mapping = page_mapping(page); - if (mapping) { + if (mapping) ret = page_mkclean_file(mapping, page); - if (page_test_and_clear_dirty(page_to_pfn(page), 1)) - ret = 1; - } } return ret; @@ -1116,6 +1114,7 @@ void page_add_file_rmap(struct page *page) */ void page_remove_rmap(struct page *page) { + struct address_space *mapping = page_mapping(page); bool anon = PageAnon(page); bool locked; unsigned long flags; @@ -1138,8 +1137,19 @@ void page_remove_rmap(struct page *page) * this if the page is anon, so about to be freed; but perhaps * not if it's in swapcache - there might be another pte slot * containing the swap entry, but page not yet written to swap. + * + * And we can skip it on file pages, so long as the filesystem + * participates in dirty tracking; but need to catch shm and tmpfs + * and ramfs pages which have been modified since creation by read + * fault. + * + * Note that mapping must be decided above, before decrementing + * mapcount (which luckily provides a barrier): once page is unmapped, + * it could be truncated and page->mapping reset to NULL at any moment. + * Note also that we are relying on page_mapping(page) to set mapping + * to &swapper_space when PageSwapCache(page). */ - if ((!anon || PageSwapCache(page)) && + if (mapping && !mapping_cap_account_dirty(mapping) && page_test_and_clear_dirty(page_to_pfn(page), 1)) set_page_dirty(page); /* -- cgit v1.2.3 From 86a595f961360c9c31d00fb111dd09e5d5ba9a40 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Thu, 25 Oct 2012 13:37:56 -0700 Subject: mm/page_alloc.c:alloc_contig_range(): return early for err path If start_isolate_page_range() failed, unset_migratetype_isolate() has been done inside it. Signed-off-by: Bob Liu Cc: Ni zhan Chen Cc: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bb90971182b..b0012ab372a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5825,7 +5825,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, ret = start_isolate_page_range(pfn_max_align_down(start), pfn_max_align_up(end), migratetype); if (ret) - goto done; + return ret; ret = __alloc_contig_migrate_range(&cc, start, end); if (ret) -- cgit v1.2.3 From 35cfa2b0b491c37e23527822bf365610dbb188e5 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 25 Oct 2012 13:38:01 -0700 Subject: mm/mmu_notifier: allocate mmu_notifier in advance While allocating mmu_notifier with parameter GFP_KERNEL, swap would start to work in case of tight available memory. Eventually, that would lead to a deadlock while the swap deamon swaps anonymous pages. It was caused by commit e0f3c3f78da29b ("mm/mmu_notifier: init notifier if necessary"). ================================= [ INFO: inconsistent lock state ] 3.7.0-rc1+ #518 Not tainted --------------------------------- inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-W} usage. kswapd0/35 [HC0[0]:SC0[0]:HE1:SE1] takes: (&mapping->i_mmap_mutex){+.+.?.}, at: page_referenced+0x9c/0x2e0 {RECLAIM_FS-ON-W} state was registered at: mark_held_locks+0x86/0x150 lockdep_trace_alloc+0x67/0xc0 kmem_cache_alloc_trace+0x33/0x230 do_mmu_notifier_register+0x87/0x180 mmu_notifier_register+0x13/0x20 kvm_dev_ioctl+0x428/0x510 do_vfs_ioctl+0x98/0x570 sys_ioctl+0x91/0xb0 system_call_fastpath+0x16/0x1b irq event stamp: 825 hardirqs last enabled at (825): _raw_spin_unlock_irq+0x30/0x60 hardirqs last disabled at (824): _raw_spin_lock_irq+0x19/0x80 softirqs last enabled at (0): copy_process+0x630/0x17c0 softirqs last disabled at (0): (null) ... Simply back out the above commit, which was a small performance optimization. Signed-off-by: Gavin Shan Reported-by: Andrea Righi Tested-by: Andrea Righi Cc: Wanpeng Li Cc: Andrea Arcangeli Cc: Avi Kivity Cc: Hugh Dickins Cc: Marcelo Tosatti Cc: Xiao Guangrong Cc: Sagi Grimberg Cc: Haggai Eran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 479a1e751a7..8a5ac8c686b 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -196,28 +196,28 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, BUG_ON(atomic_read(&mm->mm_users) <= 0); /* - * Verify that mmu_notifier_init() already run and the global srcu is - * initialized. - */ + * Verify that mmu_notifier_init() already run and the global srcu is + * initialized. + */ BUG_ON(!srcu.per_cpu_ref); + ret = -ENOMEM; + mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); + if (unlikely(!mmu_notifier_mm)) + goto out; + if (take_mmap_sem) down_write(&mm->mmap_sem); ret = mm_take_all_locks(mm); if (unlikely(ret)) - goto out; + goto out_clean; if (!mm_has_notifiers(mm)) { - mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), - GFP_KERNEL); - if (unlikely(!mmu_notifier_mm)) { - ret = -ENOMEM; - goto out_of_mem; - } INIT_HLIST_HEAD(&mmu_notifier_mm->list); spin_lock_init(&mmu_notifier_mm->lock); mm->mmu_notifier_mm = mmu_notifier_mm; + mmu_notifier_mm = NULL; } atomic_inc(&mm->mm_count); @@ -233,12 +233,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); spin_unlock(&mm->mmu_notifier_mm->lock); -out_of_mem: mm_drop_all_locks(mm); -out: +out_clean: if (take_mmap_sem) up_write(&mm->mmap_sem); - + kfree(mmu_notifier_mm); +out: BUG_ON(atomic_read(&mm->mm_users) <= 0); return ret; } -- cgit v1.2.3 From 6b187d0260b6cd1d0904309f32659b7ed5948af8 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 25 Oct 2012 13:38:08 -0700 Subject: mm, numa: avoid setting zone_reclaim_mode unless a node is sufficiently distant Commit 957f822a0ab9 ("mm, numa: reclaim from all nodes within reclaim distance") caused zone_reclaim_mode to be set for all systems where two nodes are within RECLAIM_DISTANCE of each other. This is the opposite of what we actually want: zone_reclaim_mode should be set if two nodes are sufficiently distant. Signed-off-by: David Rientjes Reported-by: Julian Wollrath Tested-by: Julian Wollrath Cc: Hugh Dickins Cc: Patrik Kullman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b0012ab372a..5b74de6702e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1809,10 +1809,10 @@ static void __paginginit init_zone_allows_reclaim(int nid) int i; for_each_online_node(i) - if (node_distance(nid, i) <= RECLAIM_DISTANCE) { + if (node_distance(nid, i) <= RECLAIM_DISTANCE) node_set(i, NODE_DATA(nid)->reclaim_nodes); + else zone_reclaim_mode = 1; - } } #else /* CONFIG_NUMA */ -- cgit v1.2.3