From c129ef6d0f76a809b0a96fd52b9509856722e74b Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 19 Jul 2012 18:51:22 -0400
Subject: mm: frontswap: remove unneeded headers

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
[v1: Rebased with tracing removed]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 mm/frontswap.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'mm')

diff --git a/mm/frontswap.c b/mm/frontswap.c
index 7fb9538bec2..5318b3a5708 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -11,15 +11,11 @@
  * This work is licensed under the terms of the GNU GPL, version 2.
  */
 
-#include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
-#include <linux/proc_fs.h>
 #include <linux/security.h>
-#include <linux/capability.h>
 #include <linux/module.h>
-#include <linux/uaccess.h>
 #include <linux/debugfs.h>
 #include <linux/frontswap.h>
 #include <linux/swapfile.h>
-- 
cgit v1.2.3


From 88610238d9a0a5998c4deba201332dd1e35b4199 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <liwp@linux.vnet.ibm.com>
Date: Sat, 16 Jun 2012 20:37:48 +0800
Subject: mm/frontswap: cleanup doc and comment error

Signed-off-by: Wanpeng Li <liwp.linux@gmail.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 mm/frontswap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/frontswap.c b/mm/frontswap.c
index 5318b3a5708..6b3e71a2cd4 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -120,7 +120,7 @@ static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offse
  * "Store" data from a page to frontswap and associate it with the page's
  * swaptype and offset.  Page must be locked and in the swap cache.
  * If frontswap already contains a page with matching swaptype and
- * offset, the frontswap implmentation may either overwrite the data and
+ * offset, the frontswap implementation may either overwrite the data and
  * return success or invalidate the page from frontswap and return failure.
  */
 int __frontswap_store(struct page *page)
-- 
cgit v1.2.3


From 6b982fcf0260b8371604fcf6a12c6beecb731c57 Mon Sep 17 00:00:00 2001
From: Seth Jennings <sjenning@linux.vnet.ibm.com>
Date: Mon, 30 Jul 2012 14:47:44 -0500
Subject: mm/frontswap: fix uninit'ed variable warning

Fixes uninitialized variable warning on 'type' in frontswap_shrink().
type is set before use by __frontswap_unuse_pages() called by
__frontswap_shrink() called by frontswap_shrink() before use by
try_to_unuse().

Signed-off-by: Seth Jennings <sjenning@linux.vnet.ibm.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 mm/frontswap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/frontswap.c b/mm/frontswap.c
index 6b3e71a2cd4..89dc399d332 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -292,7 +292,7 @@ static int __frontswap_shrink(unsigned long target_pages,
 void frontswap_shrink(unsigned long target_pages)
 {
 	unsigned long pages_to_unuse = 0;
-	int type, ret;
+	int uninitialized_var(type), ret;
 
 	/*
 	 * we don't want to hold swap_lock while doing a very
-- 
cgit v1.2.3


From d9b7f22623b5fa9cc189581dcdfb2ac605933bf4 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@parallels.com>
Date: Fri, 3 Aug 2012 22:51:37 +0400
Subject: slub: use free_page instead of put_page for freeing kmalloc
 allocation

When freeing objects, the slub allocator will most of the time free
empty pages by calling __free_pages(). But high-order kmalloc will be
diposed by means of put_page() instead. It makes no sense to call
put_page() in kernel pages that are provided by the object allocators,
so we shouldn't be doing this ourselves. Aside from the consistency
change, we don't change the flow too much. put_page()'s would call its
dtor function, which is __free_pages. We also already do all of the
Compound page tests ourselves, and the Mlock test we lose don't really
matter.

Signed-off-by: Glauber Costa <glommer@parallels.com>
Acked-by: Christoph Lameter <cl@linux.com>
CC: David Rientjes <rientjes@google.com>
CC: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 8f78e257703..c83fe96f5e4 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3477,7 +3477,7 @@ void kfree(const void *x)
 	if (unlikely(!PageSlab(page))) {
 		BUG_ON(!PageCompound(page));
 		kmemleak_free(x);
-		put_page(page);
+		__free_pages(page, compound_order(page));
 		return;
 	}
 	slab_free(page->slab, page, object, _RET_IP_);
-- 
cgit v1.2.3


From 48f2474144ac708f1faad97e82a863ca8214b602 Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Tue, 14 Aug 2012 14:53:20 -0700
Subject: slab: do not call compound_head() in page_get_cache()

page_get_cache() does not need to call compound_head(), as its unique
caller virt_to_slab() already makes sure to return a head page.

Additionally, removing the compound_head() call makes page_get_cache()
consistent with page_get_slab().

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Pekka Enberg <penberg@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index f8b0d539b48..cb6ce2dfc7c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -517,7 +517,6 @@ static bool slab_max_order_set __initdata;
 
 static inline struct kmem_cache *page_get_cache(struct page *page)
 {
-	page = compound_head(page);
 	BUG_ON(!PageSlab(page));
 	return page->slab_cache;
 }
-- 
cgit v1.2.3


From 455ce9eb1cfa083da0def023094190aeb133855a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 14 Aug 2012 14:53:22 -0700
Subject: mm/slab_common.c: cleanup

Eliminate an ifdef and a label by moving all the CONFIG_DEBUG_VM checking
inside the locked region.

Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab_common.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/mm/slab_common.c b/mm/slab_common.c
index aa3ca5bb01b..281600b3010 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -53,19 +53,17 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
 {
 	struct kmem_cache *s = NULL;
 
+	get_online_cpus();
+	mutex_lock(&slab_mutex);
+
 #ifdef CONFIG_DEBUG_VM
 	if (!name || in_interrupt() || size < sizeof(void *) ||
 		size > KMALLOC_MAX_SIZE) {
 		printk(KERN_ERR "kmem_cache_create(%s) integrity check"
-			" failed\n", name);
-		goto out;
+				" failed\n", name);
+		goto oops;
 	}
-#endif
-
-	get_online_cpus();
-	mutex_lock(&slab_mutex);
 
-#ifdef CONFIG_DEBUG_VM
 	list_for_each_entry(s, &slab_caches, list) {
 		char tmp;
 		int res;
@@ -104,9 +102,6 @@ oops:
 	mutex_unlock(&slab_mutex);
 	put_online_cpus();
 
-#ifdef CONFIG_DEBUG_VM
-out:
-#endif
 	if (!s && (flags & SLAB_PANIC))
 		panic("kmem_cache_create: Failed to create slab '%s'\n", name);
 
-- 
cgit v1.2.3


From 19c7ff9ecd89441096dab6a56f926f7df8ba850a Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Wed, 30 May 2012 12:54:46 -0500
Subject: slub: Take node lock during object free checks

Only applies to scenarios where debugging is on:

Validation of slabs can currently occur while debugging
information is updated from the fast paths of the allocator.
This results in various races where we get false reports about
slab metadata not being in order.

This patch makes the fast paths take the node lock so that
serialization with slab validation will occur. Causes additional
slowdown in debug scenarios.

Reported-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slub.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index c83fe96f5e4..e131084e87a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1069,13 +1069,13 @@ bad:
 	return 0;
 }
 
-static noinline int free_debug_processing(struct kmem_cache *s,
-		 struct page *page, void *object, unsigned long addr)
+static noinline struct kmem_cache_node *free_debug_processing(
+	struct kmem_cache *s, struct page *page, void *object,
+	unsigned long addr, unsigned long *flags)
 {
-	unsigned long flags;
-	int rc = 0;
+	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
 
-	local_irq_save(flags);
+	spin_lock_irqsave(&n->list_lock, *flags);
 	slab_lock(page);
 
 	if (!check_slab(s, page))
@@ -1113,15 +1113,19 @@ static noinline int free_debug_processing(struct kmem_cache *s,
 		set_track(s, object, TRACK_FREE, addr);
 	trace(s, page, object, 0);
 	init_object(s, object, SLUB_RED_INACTIVE);
-	rc = 1;
 out:
 	slab_unlock(page);
-	local_irq_restore(flags);
-	return rc;
+	/*
+	 * Keep node_lock to preserve integrity
+	 * until the object is actually freed
+	 */
+	return n;
 
 fail:
+	slab_unlock(page);
+	spin_unlock_irqrestore(&n->list_lock, *flags);
 	slab_fix(s, "Object at 0x%p not freed", object);
-	goto out;
+	return NULL;
 }
 
 static int __init setup_slub_debug(char *str)
@@ -1214,8 +1218,9 @@ static inline void setup_object_debug(struct kmem_cache *s,
 static inline int alloc_debug_processing(struct kmem_cache *s,
 	struct page *page, void *object, unsigned long addr) { return 0; }
 
-static inline int free_debug_processing(struct kmem_cache *s,
-	struct page *page, void *object, unsigned long addr) { return 0; }
+static inline struct kmem_cache_node *free_debug_processing(
+	struct kmem_cache *s, struct page *page, void *object,
+	unsigned long addr, unsigned long *flags) { return NULL; }
 
 static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
 			{ return 1; }
@@ -2452,7 +2457,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 
 	stat(s, FREE_SLOWPATH);
 
-	if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
+	if (kmem_cache_debug(s) &&
+		!(n = free_debug_processing(s, page, x, addr, &flags)))
 		return;
 
 	do {
-- 
cgit v1.2.3


From e24fc410f58cc7851188a6e996dc6ce5c4259eb4 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <js1304@gmail.com>
Date: Sat, 23 Jun 2012 03:22:38 +0900
Subject: slub: reduce failure of this_cpu_cmpxchg in put_cpu_partial() after
 unfreezing

In current implementation, after unfreezing, we doesn't touch oldpage,
so it remain 'NOT NULL'. When we call this_cpu_cmpxchg()
with this old oldpage, this_cpu_cmpxchg() is mostly be failed.

We can change value of oldpage to NULL after unfreezing,
because unfreeze_partial() ensure that all the cpu partial slabs is removed
from cpu partial list. In this time, we could expect that
this_cpu_cmpxchg is mostly succeed.

Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Joonsoo Kim <js1304@gmail.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slub.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index e131084e87a..c67bd0a4a95 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1962,6 +1962,7 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 				local_irq_save(flags);
 				unfreeze_partials(s);
 				local_irq_restore(flags);
+				oldpage = NULL;
 				pobjects = 0;
 				pages = 0;
 				stat(s, CPU_PARTIAL_DRAIN);
-- 
cgit v1.2.3


From b920536aa0a4fd178b2957774adfe409027fe55b Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Thu, 16 Aug 2012 10:12:18 +0300
Subject: Revert "mm/slab_common.c: cleanup"

This reverts commit 455ce9eb1cfa083da0def023094190aeb133855a. Andrew
sent a better version.

Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab_common.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/slab_common.c b/mm/slab_common.c
index 281600b3010..aa3ca5bb01b 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -53,17 +53,19 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
 {
 	struct kmem_cache *s = NULL;
 
-	get_online_cpus();
-	mutex_lock(&slab_mutex);
-
 #ifdef CONFIG_DEBUG_VM
 	if (!name || in_interrupt() || size < sizeof(void *) ||
 		size > KMALLOC_MAX_SIZE) {
 		printk(KERN_ERR "kmem_cache_create(%s) integrity check"
-				" failed\n", name);
-		goto oops;
+			" failed\n", name);
+		goto out;
 	}
+#endif
+
+	get_online_cpus();
+	mutex_lock(&slab_mutex);
 
+#ifdef CONFIG_DEBUG_VM
 	list_for_each_entry(s, &slab_caches, list) {
 		char tmp;
 		int res;
@@ -102,6 +104,9 @@ oops:
 	mutex_unlock(&slab_mutex);
 	put_online_cpus();
 
+#ifdef CONFIG_DEBUG_VM
+out:
+#endif
 	if (!s && (flags & SLAB_PANIC))
 		panic("kmem_cache_create: Failed to create slab '%s'\n", name);
 
-- 
cgit v1.2.3


From 77be4b1366c97b95bb197e18187ff45b0e4f6bd3 Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuah.khan@hp.com>
Date: Thu, 16 Aug 2012 00:09:46 -0700
Subject: mm/slab: restructure kmem_cache_create() debug checks

kmem_cache_create() does cache integrity checks when CONFIG_DEBUG_VM is
defined.  These checks interspersed with the regular code path has lead
to compile time warnings when compiled without CONFIG_DEBUG_VM defined.
Restructuring the code to move the integrity checks in to a new function
would eliminate the current compile warning problem and also will allow
for future changes to the debug only code to evolve without introducing
new warnings in the regular path.

This restructuring work is based on the discussion in the following
thread:

https://lkml.org/lkml/2012/7/13/424

[akpm@linux-foundation.org: fix build, cleanup]
Signed-off-by: Shuah Khan <shuah.khan@hp.com>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab_common.c | 97 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 48 insertions(+), 49 deletions(-)

(limited to 'mm')

diff --git a/mm/slab_common.c b/mm/slab_common.c
index aa3ca5bb01b..8cf8b4962d6 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -23,49 +23,17 @@ enum slab_state slab_state;
 LIST_HEAD(slab_caches);
 DEFINE_MUTEX(slab_mutex);
 
-/*
- * kmem_cache_create - Create a cache.
- * @name: A string which is used in /proc/slabinfo to identify this cache.
- * @size: The size of objects to be created in this cache.
- * @align: The required alignment for the objects.
- * @flags: SLAB flags
- * @ctor: A constructor for the objects.
- *
- * Returns a ptr to the cache on success, NULL on failure.
- * Cannot be called within a interrupt, but can be interrupted.
- * The @ctor is run when new pages are allocated by the cache.
- *
- * The flags are
- *
- * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
- * to catch references to uninitialised memory.
- *
- * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
- * for buffer overruns.
- *
- * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
- * cacheline.  This can be beneficial if you're counting cycles as closely
- * as davem.
- */
-
-struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align,
-		unsigned long flags, void (*ctor)(void *))
+#ifdef CONFIG_DEBUG_VM
+static int kmem_cache_sanity_check(const char *name, size_t size)
 {
 	struct kmem_cache *s = NULL;
 
-#ifdef CONFIG_DEBUG_VM
 	if (!name || in_interrupt() || size < sizeof(void *) ||
 		size > KMALLOC_MAX_SIZE) {
-		printk(KERN_ERR "kmem_cache_create(%s) integrity check"
-			" failed\n", name);
-		goto out;
+		pr_err("kmem_cache_create(%s) integrity check failed\n", name);
+		return -EINVAL;
 	}
-#endif
 
-	get_online_cpus();
-	mutex_lock(&slab_mutex);
-
-#ifdef CONFIG_DEBUG_VM
 	list_for_each_entry(s, &slab_caches, list) {
 		char tmp;
 		int res;
@@ -77,36 +45,67 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
 		 */
 		res = probe_kernel_address(s->name, tmp);
 		if (res) {
-			printk(KERN_ERR
-			       "Slab cache with size %d has lost its name\n",
+			pr_err("Slab cache with size %d has lost its name\n",
 			       s->object_size);
 			continue;
 		}
 
 		if (!strcmp(s->name, name)) {
-			printk(KERN_ERR "kmem_cache_create(%s): Cache name"
-				" already exists.\n",
-				name);
+			pr_err("%s (%s): Cache name already exists.\n",
+			       __func__, name);
 			dump_stack();
 			s = NULL;
-			goto oops;
+			return -EINVAL;
 		}
 	}
 
 	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
+	return 0;
+}
+#else
+static inline int kmem_cache_sanity_check(const char *name, size_t size)
+{
+	return 0;
+}
 #endif
 
-	s = __kmem_cache_create(name, size, align, flags, ctor);
+/*
+ * kmem_cache_create - Create a cache.
+ * @name: A string which is used in /proc/slabinfo to identify this cache.
+ * @size: The size of objects to be created in this cache.
+ * @align: The required alignment for the objects.
+ * @flags: SLAB flags
+ * @ctor: A constructor for the objects.
+ *
+ * Returns a ptr to the cache on success, NULL on failure.
+ * Cannot be called within a interrupt, but can be interrupted.
+ * The @ctor is run when new pages are allocated by the cache.
+ *
+ * The flags are
+ *
+ * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
+ * to catch references to uninitialised memory.
+ *
+ * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
+ * for buffer overruns.
+ *
+ * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
+ * cacheline.  This can be beneficial if you're counting cycles as closely
+ * as davem.
+ */
+
+struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align,
+		unsigned long flags, void (*ctor)(void *))
+{
+	struct kmem_cache *s = NULL;
 
-#ifdef CONFIG_DEBUG_VM
-oops:
-#endif
+	get_online_cpus();
+	mutex_lock(&slab_mutex);
+	if (kmem_cache_sanity_check(name, size) == 0)
+		s = __kmem_cache_create(name, size, align, flags, ctor);
 	mutex_unlock(&slab_mutex);
 	put_online_cpus();
 
-#ifdef CONFIG_DEBUG_VM
-out:
-#endif
 	if (!s && (flags & SLAB_PANIC))
 		panic("kmem_cache_create: Failed to create slab '%s'\n", name);
 
-- 
cgit v1.2.3


From 5b74beb425e0eb009d16d4b9a0f55847a5f342fa Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 16 Aug 2012 12:25:31 -0700
Subject: mm, slab: remove page_get_cache

page_get_cache() isn't called from anything, so remove it.

Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index cb6ce2dfc7c..3b4587bb7b1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -515,12 +515,6 @@ EXPORT_SYMBOL(slab_buffer_size);
 static int slab_max_order = SLAB_MAX_ORDER_LO;
 static bool slab_max_order_set __initdata;
 
-static inline struct kmem_cache *page_get_cache(struct page *page)
-{
-	BUG_ON(!PageSlab(page));
-	return page->slab_cache;
-}
-
 static inline struct kmem_cache *virt_to_cache(const void *obj)
 {
 	struct page *page = virt_to_head_page(obj);
-- 
cgit v1.2.3


From 203b42f7317494ae5e5efc7be6fb7f29c927f102 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 21 Aug 2012 13:18:23 -0700
Subject: workqueue: make deferrable delayed_work initializer names consistent

Initalizers for deferrable delayed_work are confused.

* __DEFERRED_WORK_INITIALIZER()
* DECLARE_DEFERRED_WORK()
* INIT_DELAYED_WORK_DEFERRABLE()

Rename them to

* __DEFERRABLE_WORK_INITIALIZER()
* DECLARE_DEFERRABLE_WORK()
* INIT_DEFERRABLE_WORK()

This patch doesn't cause any functional changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 mm/slab.c   | 2 +-
 mm/vmstat.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index f8b0d539b48..35b5cb0da55 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -900,7 +900,7 @@ static void __cpuinit start_cpu_timer(int cpu)
 	 */
 	if (keventd_up() && reap_work->work.func == NULL) {
 		init_reap_node(cpu);
-		INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap);
+		INIT_DEFERRABLE_WORK(reap_work, cache_reap);
 		schedule_delayed_work_on(cpu, reap_work,
 					__round_jiffies_relative(HZ, cpu));
 	}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index df7a6748231..b3e3b9d525d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1157,7 +1157,7 @@ static void __cpuinit start_cpu_timer(int cpu)
 {
 	struct delayed_work *work = &per_cpu(vmstat_work, cpu);
 
-	INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update);
+	INIT_DEFERRABLE_WORK(work, vmstat_update);
 	schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
 }
 
-- 
cgit v1.2.3


From 38f38657444d15e1a8574eae80ed3de9f501737a Mon Sep 17 00:00:00 2001
From: Aristeu Rozanski <aris@redhat.com>
Date: Thu, 23 Aug 2012 16:53:28 -0400
Subject: xattr: extract simple_xattr code from tmpfs

Extract in-memory xattr APIs from tmpfs. Will be used by cgroup.

$ size vmlinux.o
   text    data     bss     dec     hex filename
4658782  880729 5195032 10734543         a3cbcf vmlinux.o
$ size vmlinux.o
   text    data     bss     dec     hex filename
4658957  880729 5195032 10734718         a3cc7e vmlinux.o

v7:
- checkpatch warnings fixed
- Implement the changes requested by Hugh Dickins:
	- make simple_xattrs_init and simple_xattrs_free inline
	- get rid of locking and list reinitialization in simple_xattrs_free,
	  they're not needed
v6:
- no changes
v5:
- no changes
v4:
- move simple_xattrs_free() to fs/xattr.c
v3:
- in kmem_xattrs_free(), reinitialize the list
- use simple_xattr_* prefix
- introduce simple_xattr_add() to prevent direct list usage

Original-patch-by: Li Zefan <lizefan@huawei.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Lennart Poettering <lpoetter@redhat.com>
Acked-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Aristeu Rozanski <aris@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 mm/shmem.c | 171 +++++--------------------------------------------------------
 1 file changed, 13 insertions(+), 158 deletions(-)

(limited to 'mm')

diff --git a/mm/shmem.c b/mm/shmem.c
index d4e184e2a38..d3752110c8c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -77,13 +77,6 @@ static struct vfsmount *shm_mnt;
 /* Symlink up to this size is kmalloc'ed instead of using a swappable page */
 #define SHORT_SYMLINK_LEN 128
 
-struct shmem_xattr {
-	struct list_head list;	/* anchored by shmem_inode_info->xattr_list */
-	char *name;		/* xattr name */
-	size_t size;
-	char value[0];
-};
-
 /*
  * shmem_fallocate and shmem_writepage communicate via inode->i_private
  * (with i_mutex making sure that it has only one user at a time):
@@ -636,7 +629,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 static void shmem_evict_inode(struct inode *inode)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
-	struct shmem_xattr *xattr, *nxattr;
 
 	if (inode->i_mapping->a_ops == &shmem_aops) {
 		shmem_unacct_size(info->flags, inode->i_size);
@@ -650,10 +642,7 @@ static void shmem_evict_inode(struct inode *inode)
 	} else
 		kfree(info->symlink);
 
-	list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
-		kfree(xattr->name);
-		kfree(xattr);
-	}
+	simple_xattrs_free(&info->xattrs);
 	BUG_ON(inode->i_blocks);
 	shmem_free_inode(inode->i_sb);
 	clear_inode(inode);
@@ -1377,7 +1366,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 		spin_lock_init(&info->lock);
 		info->flags = flags & VM_NORESERVE;
 		INIT_LIST_HEAD(&info->swaplist);
-		INIT_LIST_HEAD(&info->xattr_list);
+		simple_xattrs_init(&info->xattrs);
 		cache_no_acl(inode);
 
 		switch (mode & S_IFMT) {
@@ -2059,28 +2048,6 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
  * filesystem level, though.
  */
 
-/*
- * Allocate new xattr and copy in the value; but leave the name to callers.
- */
-static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size)
-{
-	struct shmem_xattr *new_xattr;
-	size_t len;
-
-	/* wrap around? */
-	len = sizeof(*new_xattr) + size;
-	if (len <= sizeof(*new_xattr))
-		return NULL;
-
-	new_xattr = kmalloc(len, GFP_KERNEL);
-	if (!new_xattr)
-		return NULL;
-
-	new_xattr->size = size;
-	memcpy(new_xattr->value, value, size);
-	return new_xattr;
-}
-
 /*
  * Callback for security_inode_init_security() for acquiring xattrs.
  */
@@ -2090,11 +2057,11 @@ static int shmem_initxattrs(struct inode *inode,
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	const struct xattr *xattr;
-	struct shmem_xattr *new_xattr;
+	struct simple_xattr *new_xattr;
 	size_t len;
 
 	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
-		new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len);
+		new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
 		if (!new_xattr)
 			return -ENOMEM;
 
@@ -2111,91 +2078,12 @@ static int shmem_initxattrs(struct inode *inode,
 		memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
 		       xattr->name, len);
 
-		spin_lock(&info->lock);
-		list_add(&new_xattr->list, &info->xattr_list);
-		spin_unlock(&info->lock);
+		simple_xattr_list_add(&info->xattrs, new_xattr);
 	}
 
 	return 0;
 }
 
-static int shmem_xattr_get(struct dentry *dentry, const char *name,
-			   void *buffer, size_t size)
-{
-	struct shmem_inode_info *info;
-	struct shmem_xattr *xattr;
-	int ret = -ENODATA;
-
-	info = SHMEM_I(dentry->d_inode);
-
-	spin_lock(&info->lock);
-	list_for_each_entry(xattr, &info->xattr_list, list) {
-		if (strcmp(name, xattr->name))
-			continue;
-
-		ret = xattr->size;
-		if (buffer) {
-			if (size < xattr->size)
-				ret = -ERANGE;
-			else
-				memcpy(buffer, xattr->value, xattr->size);
-		}
-		break;
-	}
-	spin_unlock(&info->lock);
-	return ret;
-}
-
-static int shmem_xattr_set(struct inode *inode, const char *name,
-			   const void *value, size_t size, int flags)
-{
-	struct shmem_inode_info *info = SHMEM_I(inode);
-	struct shmem_xattr *xattr;
-	struct shmem_xattr *new_xattr = NULL;
-	int err = 0;
-
-	/* value == NULL means remove */
-	if (value) {
-		new_xattr = shmem_xattr_alloc(value, size);
-		if (!new_xattr)
-			return -ENOMEM;
-
-		new_xattr->name = kstrdup(name, GFP_KERNEL);
-		if (!new_xattr->name) {
-			kfree(new_xattr);
-			return -ENOMEM;
-		}
-	}
-
-	spin_lock(&info->lock);
-	list_for_each_entry(xattr, &info->xattr_list, list) {
-		if (!strcmp(name, xattr->name)) {
-			if (flags & XATTR_CREATE) {
-				xattr = new_xattr;
-				err = -EEXIST;
-			} else if (new_xattr) {
-				list_replace(&xattr->list, &new_xattr->list);
-			} else {
-				list_del(&xattr->list);
-			}
-			goto out;
-		}
-	}
-	if (flags & XATTR_REPLACE) {
-		xattr = new_xattr;
-		err = -ENODATA;
-	} else {
-		list_add(&new_xattr->list, &info->xattr_list);
-		xattr = NULL;
-	}
-out:
-	spin_unlock(&info->lock);
-	if (xattr)
-		kfree(xattr->name);
-	kfree(xattr);
-	return err;
-}
-
 static const struct xattr_handler *shmem_xattr_handlers[] = {
 #ifdef CONFIG_TMPFS_POSIX_ACL
 	&generic_acl_access_handler,
@@ -2226,6 +2114,7 @@ static int shmem_xattr_validate(const char *name)
 static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
 			      void *buffer, size_t size)
 {
+	struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
 	int err;
 
 	/*
@@ -2240,12 +2129,13 @@ static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
 	if (err)
 		return err;
 
-	return shmem_xattr_get(dentry, name, buffer, size);
+	return simple_xattr_get(&info->xattrs, name, buffer, size);
 }
 
 static int shmem_setxattr(struct dentry *dentry, const char *name,
 			  const void *value, size_t size, int flags)
 {
+	struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
 	int err;
 
 	/*
@@ -2260,15 +2150,12 @@ static int shmem_setxattr(struct dentry *dentry, const char *name,
 	if (err)
 		return err;
 
-	if (size == 0)
-		value = "";  /* empty EA, do not remove */
-
-	return shmem_xattr_set(dentry->d_inode, name, value, size, flags);
-
+	return simple_xattr_set(&info->xattrs, name, value, size, flags);
 }
 
 static int shmem_removexattr(struct dentry *dentry, const char *name)
 {
+	struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
 	int err;
 
 	/*
@@ -2283,45 +2170,13 @@ static int shmem_removexattr(struct dentry *dentry, const char *name)
 	if (err)
 		return err;
 
-	return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
-}
-
-static bool xattr_is_trusted(const char *name)
-{
-	return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
+	return simple_xattr_remove(&info->xattrs, name);
 }
 
 static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
 {
-	bool trusted = capable(CAP_SYS_ADMIN);
-	struct shmem_xattr *xattr;
-	struct shmem_inode_info *info;
-	size_t used = 0;
-
-	info = SHMEM_I(dentry->d_inode);
-
-	spin_lock(&info->lock);
-	list_for_each_entry(xattr, &info->xattr_list, list) {
-		size_t len;
-
-		/* skip "trusted." attributes for unprivileged callers */
-		if (!trusted && xattr_is_trusted(xattr->name))
-			continue;
-
-		len = strlen(xattr->name) + 1;
-		used += len;
-		if (buffer) {
-			if (size < used) {
-				used = -ERANGE;
-				break;
-			}
-			memcpy(buffer, xattr->name, len);
-			buffer += len;
-		}
-	}
-	spin_unlock(&info->lock);
-
-	return used;
+	struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
+	return simple_xattr_list(&info->xattrs, buffer, size);
 }
 #endif /* CONFIG_TMPFS_XATTR */
 
-- 
cgit v1.2.3


From 7034ed132f0af8526f4a8eb7229dd25366a20bfa Mon Sep 17 00:00:00 2001
From: Namjae Jeon <linkinjeon@gmail.com>
Date: Sat, 25 Aug 2012 16:57:27 +0800
Subject: backing-dev: use kstrto* in preference to simple_strtoul

Fix checkpatch warnings:

WARNING: consider using kstrto* in preference to simple_strtoul

for the below sys entry parsers:
/sys/block/<block device>/bdi/read_ahead_kb
/sys/block/<block device>/bdi/max_ratio
/sys/block/<block device>/bdi/min_ratio

Signed-off-by: Namjae Jeon <linkinjeon@gmail.com>
Signed-off-by: Vivek Trivedi <vtrivedi018@gmail.com>
---
 mm/backing-dev.c | 50 ++++++++++++++++++++++++++------------------------
 1 file changed, 26 insertions(+), 24 deletions(-)

(limited to 'mm')

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index b41823cc05e..d3ca2b3ee17 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -158,16 +158,16 @@ static ssize_t read_ahead_kb_store(struct device *dev,
 				  const char *buf, size_t count)
 {
 	struct backing_dev_info *bdi = dev_get_drvdata(dev);
-	char *end;
 	unsigned long read_ahead_kb;
-	ssize_t ret = -EINVAL;
+	ssize_t ret;
 
-	read_ahead_kb = simple_strtoul(buf, &end, 10);
-	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
-		bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
-		ret = count;
-	}
-	return ret;
+	ret = kstrtoul(buf, 10, &read_ahead_kb);
+	if (ret < 0)
+		return ret;
+
+	bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
+
+	return count;
 }
 
 #define K(pages) ((pages) << (PAGE_SHIFT - 10))
@@ -187,16 +187,17 @@ static ssize_t min_ratio_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
 	struct backing_dev_info *bdi = dev_get_drvdata(dev);
-	char *end;
 	unsigned int ratio;
-	ssize_t ret = -EINVAL;
+	ssize_t ret;
+
+	ret = kstrtouint(buf, 10, &ratio);
+	if (ret < 0)
+		return ret;
+
+	ret = bdi_set_min_ratio(bdi, ratio);
+	if (!ret)
+		ret = count;
 
-	ratio = simple_strtoul(buf, &end, 10);
-	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
-		ret = bdi_set_min_ratio(bdi, ratio);
-		if (!ret)
-			ret = count;
-	}
 	return ret;
 }
 BDI_SHOW(min_ratio, bdi->min_ratio)
@@ -205,16 +206,17 @@ static ssize_t max_ratio_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
 	struct backing_dev_info *bdi = dev_get_drvdata(dev);
-	char *end;
 	unsigned int ratio;
-	ssize_t ret = -EINVAL;
+	ssize_t ret;
+
+	ret = kstrtouint(buf, 10, &ratio);
+	if (ret < 0)
+		return ret;
+
+	ret = bdi_set_max_ratio(bdi, ratio);
+	if (!ret)
+		ret = count;
 
-	ratio = simple_strtoul(buf, &end, 10);
-	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
-		ret = bdi_set_max_ratio(bdi, ratio);
-		if (!ret)
-			ret = count;
-	}
 	return ret;
 }
 BDI_SHOW(max_ratio, bdi->max_ratio)
-- 
cgit v1.2.3


From e21827aadd77e33833eeb393de2e8675e9b399e2 Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <elezegarcia@gmail.com>
Date: Tue, 14 Aug 2012 09:55:21 -0300
Subject: mm: Use __do_krealloc to do the krealloc job

Without this patch we can get (many) kmem trace events
with call site at krealloc().

This happens because krealloc is calling __krealloc,
which performs the allocation through kmalloc_track_caller.

Since neither krealloc nor __krealloc are marked inline explicitly,
the caller can be traced as being krealloc, which clearly is not
the intended behavior.

This patch allows to get the real caller of krealloc, by creating
an always inlined function __do_krealloc, thus tracing the
call site accurately.

Acked-by: Christoph Lameter <cl@linux.com>
Cc: Glauber Costa <glommer@parallels.com>
Signed-off-by: Ezequiel Garcia <elezegarcia@gmail.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/util.c | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/mm/util.c b/mm/util.c
index 8c7265afa29..dc3036cdcc6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -105,6 +105,25 @@ void *memdup_user(const void __user *src, size_t len)
 }
 EXPORT_SYMBOL(memdup_user);
 
+static __always_inline void *__do_krealloc(const void *p, size_t new_size,
+					   gfp_t flags)
+{
+	void *ret;
+	size_t ks = 0;
+
+	if (p)
+		ks = ksize(p);
+
+	if (ks >= new_size)
+		return (void *)p;
+
+	ret = kmalloc_track_caller(new_size, flags);
+	if (ret && p)
+		memcpy(ret, p, ks);
+
+	return ret;
+}
+
 /**
  * __krealloc - like krealloc() but don't free @p.
  * @p: object to reallocate memory for.
@@ -117,23 +136,11 @@ EXPORT_SYMBOL(memdup_user);
  */
 void *__krealloc(const void *p, size_t new_size, gfp_t flags)
 {
-	void *ret;
-	size_t ks = 0;
-
 	if (unlikely(!new_size))
 		return ZERO_SIZE_PTR;
 
-	if (p)
-		ks = ksize(p);
+	return __do_krealloc(p, new_size, flags);
 
-	if (ks >= new_size)
-		return (void *)p;
-
-	ret = kmalloc_track_caller(new_size, flags);
-	if (ret && p)
-		memcpy(ret, p, ks);
-
-	return ret;
 }
 EXPORT_SYMBOL(__krealloc);
 
@@ -157,7 +164,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
 		return ZERO_SIZE_PTR;
 	}
 
-	ret = __krealloc(p, new_size, flags);
+	ret = __do_krealloc(p, new_size, flags);
 	if (ret && p != ret)
 		kfree(p);
 
-- 
cgit v1.2.3


From 79576102afc24fcc6627d7a15691e432d9a2eacb Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Tue, 4 Sep 2012 23:06:14 +0000
Subject: mm/slub: Add debugging to verify correct cache use on
 kmem_cache_free()

Add additional debugging to check that the objects is actually from the cache
the caller claims. Doing so currently trips up some other debugging code. It
takes a lot to infer from that what was happening.

Reviewed-by: Glauber Costa <glommer@parallels.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
[ penberg@kernel.org: Use pr_err() ]
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slub.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index c67bd0a4a95..99059217434 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2614,6 +2614,13 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
 
 	page = virt_to_head_page(x);
 
+	if (kmem_cache_debug(s) && page->slab != s) {
+		pr_err("kmem_cache_free: Wrong slab cache. %s but object"
+			" is from  %s\n", page->slab->name, s->name);
+		WARN_ON_ONCE(1);
+		return;
+	}
+
 	slab_free(s, page, x, _RET_IP_);
 
 	trace_kmem_cache_free(_RET_IP_, x);
-- 
cgit v1.2.3


From 208c4358dc4a8f0fe99e49eb8d21a869b01e7d34 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Tue, 4 Sep 2012 23:06:14 +0000
Subject: mm/slub: Use kmem_cache for the kmem_cache structure

Do not use kmalloc() but kmem_cache_alloc() for the allocation
of the kmem_cache structures in slub.

Reviewed-by: Glauber Costa <glommer@parallels.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slub.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 99059217434..c6690898321 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -213,7 +213,7 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
 static inline void sysfs_slab_remove(struct kmem_cache *s)
 {
 	kfree(s->name);
-	kfree(s);
+	kmem_cache_free(kmem_cache, s);
 }
 
 #endif
@@ -3969,7 +3969,7 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 	if (!n)
 		return NULL;
 
-	s = kmalloc(kmem_size, GFP_KERNEL);
+	s = kmem_cache_alloc(kmem_cache, GFP_KERNEL);
 	if (s) {
 		if (kmem_cache_open(s, n,
 				size, align, flags, ctor)) {
@@ -3986,7 +3986,7 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 			list_del(&s->list);
 			kmem_cache_close(s);
 		}
-		kfree(s);
+		kmem_cache_free(kmem_cache, s);
 	}
 	kfree(n);
 	return NULL;
@@ -5224,7 +5224,7 @@ static void kmem_cache_release(struct kobject *kobj)
 	struct kmem_cache *s = to_slab(kobj);
 
 	kfree(s->name);
-	kfree(s);
+	kmem_cache_free(kmem_cache, s);
 }
 
 static const struct sysfs_ops slab_sysfs_ops = {
-- 
cgit v1.2.3


From 686d550d222e8f83f6e709debbedf9d8ef77aec7 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Wed, 5 Sep 2012 00:20:33 +0000
Subject: mm/slab_common: Improve error handling in kmem_cache_create

Instead of using s == NULL use an errorcode. This allows much more
detailed diagnostics as to what went wrong. As we add more functionality
from the slab allocators to the common kmem_cache_create() function we will
also add more error conditions.

Print the error code during the panic as well as in a warning if the module
can handle failure. The API for kmem_cache_create() currently does not allow
the returning of an error code. Return NULL but log the cause of the problem
in the syslog.

Reviewed-by: Glauber Costa <glommer@parallels.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab_common.c | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/slab_common.c b/mm/slab_common.c
index 8cf8b4962d6..fe8dc943c28 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -98,16 +98,36 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
 		unsigned long flags, void (*ctor)(void *))
 {
 	struct kmem_cache *s = NULL;
+	int err = 0;
 
 	get_online_cpus();
 	mutex_lock(&slab_mutex);
-	if (kmem_cache_sanity_check(name, size) == 0)
-		s = __kmem_cache_create(name, size, align, flags, ctor);
+
+	if (!kmem_cache_sanity_check(name, size) == 0)
+		goto out_locked;
+
+
+	s = __kmem_cache_create(name, size, align, flags, ctor);
+	if (!s)
+		err = -ENOSYS; /* Until __kmem_cache_create returns code */
+
+out_locked:
 	mutex_unlock(&slab_mutex);
 	put_online_cpus();
 
-	if (!s && (flags & SLAB_PANIC))
-		panic("kmem_cache_create: Failed to create slab '%s'\n", name);
+	if (err) {
+
+		if (flags & SLAB_PANIC)
+			panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
+				name, err);
+		else {
+			printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d",
+				name, err);
+			dump_stack();
+		}
+
+		return NULL;
+	}
 
 	return s;
 }
-- 
cgit v1.2.3


From 7c9adf5a5471647f392169ef19d3e81dcfa76045 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Tue, 4 Sep 2012 23:38:33 +0000
Subject: mm/sl[aou]b: Move list_add() to slab_common.c

Move the code to append the new kmem_cache to the list of slab caches to
the kmem_cache_create code in the shared code.

This is possible now since the acquisition of the mutex was moved into
kmem_cache_create().

Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: Glauber Costa <glommer@parallels.com>
Reviewed-by: Joonsoo Kim <js1304@gmail.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c        | 7 +++++--
 mm/slab_common.c | 7 +++++++
 mm/slob.c        | 4 ++++
 mm/slub.c        | 2 --
 4 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 3b4587bb7b1..a6990316849 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1680,6 +1680,7 @@ void __init kmem_cache_init(void)
 					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
 					NULL);
 
+	list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches);
 	if (INDEX_AC != INDEX_L3) {
 		sizes[INDEX_L3].cs_cachep =
 			__kmem_cache_create(names[INDEX_L3].name,
@@ -1687,6 +1688,7 @@ void __init kmem_cache_init(void)
 				ARCH_KMALLOC_MINALIGN,
 				ARCH_KMALLOC_FLAGS|SLAB_PANIC,
 				NULL);
+		list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches);
 	}
 
 	slab_early_init = 0;
@@ -1705,6 +1707,7 @@ void __init kmem_cache_init(void)
 					ARCH_KMALLOC_MINALIGN,
 					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
 					NULL);
+			list_add(&sizes->cs_cachep->list, &slab_caches);
 		}
 #ifdef CONFIG_ZONE_DMA
 		sizes->cs_dmacachep = __kmem_cache_create(
@@ -1714,6 +1717,7 @@ void __init kmem_cache_init(void)
 					ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
 						SLAB_PANIC,
 					NULL);
+		list_add(&sizes->cs_dmacachep->list, &slab_caches);
 #endif
 		sizes++;
 		names++;
@@ -2583,6 +2587,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
 	}
 	cachep->ctor = ctor;
 	cachep->name = name;
+	cachep->refcount = 1;
 
 	if (setup_cpu_cache(cachep, gfp)) {
 		__kmem_cache_destroy(cachep);
@@ -2599,8 +2604,6 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
 		slab_set_debugobj_lock_classes(cachep);
 	}
 
-	/* cache setup completed, link it into the list */
-	list_add(&cachep->list, &slab_caches);
 	return cachep;
 }
 
diff --git a/mm/slab_common.c b/mm/slab_common.c
index fe8dc943c28..5190a7cd02b 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -111,6 +111,13 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
 	if (!s)
 		err = -ENOSYS; /* Until __kmem_cache_create returns code */
 
+	/*
+	 * Check if the slab has actually been created and if it was a
+	 * real instatiation. Aliases do not belong on the list
+	 */
+	if (s && s->refcount == 1)
+		list_add(&s->list, &slab_caches);
+
 out_locked:
 	mutex_unlock(&slab_mutex);
 	put_online_cpus();
diff --git a/mm/slob.c b/mm/slob.c
index 45d4ca79933..5225d28f269 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -540,6 +540,10 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 
 void kmem_cache_destroy(struct kmem_cache *c)
 {
+	mutex_lock(&slab_mutex);
+	list_del(&c->list);
+	mutex_unlock(&slab_mutex);
+
 	kmemleak_free(c);
 	if (c->flags & SLAB_DESTROY_BY_RCU)
 		rcu_barrier();
diff --git a/mm/slub.c b/mm/slub.c
index c6690898321..24aa362edef 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3975,7 +3975,6 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 				size, align, flags, ctor)) {
 			int r;
 
-			list_add(&s->list, &slab_caches);
 			mutex_unlock(&slab_mutex);
 			r = sysfs_slab_add(s);
 			mutex_lock(&slab_mutex);
@@ -3983,7 +3982,6 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 			if (!r)
 				return s;
 
-			list_del(&s->list);
 			kmem_cache_close(s);
 		}
 		kmem_cache_free(kmem_cache, s);
-- 
cgit v1.2.3


From 945cf2b6199be70ff03102b9e642c3bb05d01de9 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Tue, 4 Sep 2012 23:18:33 +0000
Subject: mm/sl[aou]b: Extract a common function for kmem_cache_destroy

kmem_cache_destroy does basically the same in all allocators.

Extract common code which is easy since we already have common mutex
handling.

Reviewed-by: Glauber Costa <glommer@parallels.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c        | 45 +++------------------------------------------
 mm/slab.h        |  3 +++
 mm/slab_common.c | 25 +++++++++++++++++++++++++
 mm/slob.c        | 15 +++++++--------
 mm/slub.c        | 36 +++++++++++-------------------------
 5 files changed, 49 insertions(+), 75 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index a6990316849..49a74b349e3 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2206,7 +2206,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
 	}
 }
 
-static void __kmem_cache_destroy(struct kmem_cache *cachep)
+void __kmem_cache_destroy(struct kmem_cache *cachep)
 {
 	int i;
 	struct kmem_list3 *l3;
@@ -2763,49 +2763,10 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
 }
 EXPORT_SYMBOL(kmem_cache_shrink);
 
-/**
- * kmem_cache_destroy - delete a cache
- * @cachep: the cache to destroy
- *
- * Remove a &struct kmem_cache object from the slab cache.
- *
- * It is expected this function will be called by a module when it is
- * unloaded.  This will remove the cache completely, and avoid a duplicate
- * cache being allocated each time a module is loaded and unloaded, if the
- * module doesn't have persistent in-kernel storage across loads and unloads.
- *
- * The cache must be empty before calling this function.
- *
- * The caller must guarantee that no one will allocate memory from the cache
- * during the kmem_cache_destroy().
- */
-void kmem_cache_destroy(struct kmem_cache *cachep)
+int __kmem_cache_shutdown(struct kmem_cache *cachep)
 {
-	BUG_ON(!cachep || in_interrupt());
-
-	/* Find the cache in the chain of caches. */
-	get_online_cpus();
-	mutex_lock(&slab_mutex);
-	/*
-	 * the chain is never empty, cache_cache is never destroyed
-	 */
-	list_del(&cachep->list);
-	if (__cache_shrink(cachep)) {
-		slab_error(cachep, "Can't free all objects");
-		list_add(&cachep->list, &slab_caches);
-		mutex_unlock(&slab_mutex);
-		put_online_cpus();
-		return;
-	}
-
-	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
-		rcu_barrier();
-
-	__kmem_cache_destroy(cachep);
-	mutex_unlock(&slab_mutex);
-	put_online_cpus();
+	return __cache_shrink(cachep);
 }
-EXPORT_SYMBOL(kmem_cache_destroy);
 
 /*
  * Get the memory for a slab management obj.
diff --git a/mm/slab.h b/mm/slab.h
index db7848caaa2..07a537ed5da 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -30,4 +30,7 @@ extern struct list_head slab_caches;
 struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 	size_t align, unsigned long flags, void (*ctor)(void *));
 
+int __kmem_cache_shutdown(struct kmem_cache *);
+void __kmem_cache_destroy(struct kmem_cache *);
+
 #endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 5190a7cd02b..a1c4f0b5aae 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -140,6 +140,31 @@ out_locked:
 }
 EXPORT_SYMBOL(kmem_cache_create);
 
+void kmem_cache_destroy(struct kmem_cache *s)
+{
+	get_online_cpus();
+	mutex_lock(&slab_mutex);
+	s->refcount--;
+	if (!s->refcount) {
+		list_del(&s->list);
+
+		if (!__kmem_cache_shutdown(s)) {
+			if (s->flags & SLAB_DESTROY_BY_RCU)
+				rcu_barrier();
+
+			__kmem_cache_destroy(s);
+		} else {
+			list_add(&s->list, &slab_caches);
+			printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
+				s->name);
+			dump_stack();
+		}
+	}
+	mutex_unlock(&slab_mutex);
+	put_online_cpus();
+}
+EXPORT_SYMBOL(kmem_cache_destroy);
+
 int slab_is_available(void)
 {
 	return slab_state >= UP;
diff --git a/mm/slob.c b/mm/slob.c
index 5225d28f269..289be4f4681 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -538,18 +538,11 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 	return c;
 }
 
-void kmem_cache_destroy(struct kmem_cache *c)
+void __kmem_cache_destroy(struct kmem_cache *c)
 {
-	mutex_lock(&slab_mutex);
-	list_del(&c->list);
-	mutex_unlock(&slab_mutex);
-
 	kmemleak_free(c);
-	if (c->flags & SLAB_DESTROY_BY_RCU)
-		rcu_barrier();
 	slob_free(c, sizeof(struct kmem_cache));
 }
-EXPORT_SYMBOL(kmem_cache_destroy);
 
 void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 {
@@ -617,6 +610,12 @@ unsigned int kmem_cache_size(struct kmem_cache *c)
 }
 EXPORT_SYMBOL(kmem_cache_size);
 
+int __kmem_cache_shutdown(struct kmem_cache *c)
+{
+	/* No way to check for remaining objects */
+	return 0;
+}
+
 int kmem_cache_shrink(struct kmem_cache *d)
 {
 	return 0;
diff --git a/mm/slub.c b/mm/slub.c
index 24aa362edef..724adea3438 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -624,7 +624,7 @@ static void object_err(struct kmem_cache *s, struct page *page,
 	print_trailer(s, page, object);
 }
 
-static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
+static void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...)
 {
 	va_list args;
 	char buf[100];
@@ -3146,7 +3146,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
 				     sizeof(long), GFP_ATOMIC);
 	if (!map)
 		return;
-	slab_err(s, page, "%s", text);
+	slab_err(s, page, text, s->name);
 	slab_lock(page);
 
 	get_map(s, page, map);
@@ -3178,7 +3178,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 			discard_slab(s, page);
 		} else {
 			list_slab_objects(s, page,
-				"Objects remaining on kmem_cache_close()");
+			"Objects remaining in %s on kmem_cache_close()");
 		}
 	}
 }
@@ -3191,7 +3191,6 @@ static inline int kmem_cache_close(struct kmem_cache *s)
 	int node;
 
 	flush_all(s);
-	free_percpu(s->cpu_slab);
 	/* Attempt to free all objects */
 	for_each_node_state(node, N_NORMAL_MEMORY) {
 		struct kmem_cache_node *n = get_node(s, node);
@@ -3200,33 +3199,20 @@ static inline int kmem_cache_close(struct kmem_cache *s)
 		if (n->nr_partial || slabs_node(s, node))
 			return 1;
 	}
+	free_percpu(s->cpu_slab);
 	free_kmem_cache_nodes(s);
 	return 0;
 }
 
-/*
- * Close a cache and release the kmem_cache structure
- * (must be used for caches created using kmem_cache_create)
- */
-void kmem_cache_destroy(struct kmem_cache *s)
+int __kmem_cache_shutdown(struct kmem_cache *s)
 {
-	mutex_lock(&slab_mutex);
-	s->refcount--;
-	if (!s->refcount) {
-		list_del(&s->list);
-		mutex_unlock(&slab_mutex);
-		if (kmem_cache_close(s)) {
-			printk(KERN_ERR "SLUB %s: %s called for cache that "
-				"still has objects.\n", s->name, __func__);
-			dump_stack();
-		}
-		if (s->flags & SLAB_DESTROY_BY_RCU)
-			rcu_barrier();
-		sysfs_slab_remove(s);
-	} else
-		mutex_unlock(&slab_mutex);
+	return kmem_cache_close(s);
+}
+
+void __kmem_cache_destroy(struct kmem_cache *s)
+{
+	sysfs_slab_remove(s);
 }
-EXPORT_SYMBOL(kmem_cache_destroy);
 
 /********************************************************************
  *		Kmalloc subsystem
-- 
cgit v1.2.3


From 9b030cb865f137e1574596983face2a07e41e8b2 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Wed, 5 Sep 2012 00:20:33 +0000
Subject: mm/sl[aou]b: Use "kmem_cache" name for slab cache with kmem_cache
 struct

Make all allocators use the "kmem_cache" slabname for the "kmem_cache"
structure.

Reviewed-by: Glauber Costa <glommer@parallels.com>
Reviewed-by: Joonsoo Kim <js1304@gmail.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c        | 72 +++++++++++++++++++++++++++++---------------------------
 mm/slab.h        |  6 +++++
 mm/slab_common.c |  1 +
 mm/slob.c        |  8 +++++++
 mm/slub.c        |  2 --
 5 files changed, 52 insertions(+), 37 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 49a74b349e3..ef94799a1aa 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -578,9 +578,9 @@ static struct arraycache_init initarray_generic =
     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 
 /* internal cache of cache description objs */
-static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
-static struct kmem_cache cache_cache = {
-	.nodelists = cache_cache_nodelists,
+static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES];
+static struct kmem_cache kmem_cache_boot = {
+	.nodelists = kmem_cache_nodelists,
 	.batchcount = 1,
 	.limit = BOOT_CPUCACHE_ENTRIES,
 	.shared = 1,
@@ -1594,15 +1594,17 @@ void __init kmem_cache_init(void)
 	int order;
 	int node;
 
+	kmem_cache = &kmem_cache_boot;
+
 	if (num_possible_nodes() == 1)
 		use_alien_caches = 0;
 
 	for (i = 0; i < NUM_INIT_LISTS; i++) {
 		kmem_list3_init(&initkmem_list3[i]);
 		if (i < MAX_NUMNODES)
-			cache_cache.nodelists[i] = NULL;
+			kmem_cache->nodelists[i] = NULL;
 	}
-	set_up_list3s(&cache_cache, CACHE_CACHE);
+	set_up_list3s(kmem_cache, CACHE_CACHE);
 
 	/*
 	 * Fragmentation resistance on low memory - only use bigger
@@ -1614,9 +1616,9 @@ void __init kmem_cache_init(void)
 
 	/* Bootstrap is tricky, because several objects are allocated
 	 * from caches that do not exist yet:
-	 * 1) initialize the cache_cache cache: it contains the struct
-	 *    kmem_cache structures of all caches, except cache_cache itself:
-	 *    cache_cache is statically allocated.
+	 * 1) initialize the kmem_cache cache: it contains the struct
+	 *    kmem_cache structures of all caches, except kmem_cache itself:
+	 *    kmem_cache is statically allocated.
 	 *    Initially an __init data area is used for the head array and the
 	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
 	 *    array at the end of the bootstrap.
@@ -1625,43 +1627,43 @@ void __init kmem_cache_init(void)
 	 *    An __init data area is used for the head array.
 	 * 3) Create the remaining kmalloc caches, with minimally sized
 	 *    head arrays.
-	 * 4) Replace the __init data head arrays for cache_cache and the first
+	 * 4) Replace the __init data head arrays for kmem_cache and the first
 	 *    kmalloc cache with kmalloc allocated arrays.
-	 * 5) Replace the __init data for kmem_list3 for cache_cache and
+	 * 5) Replace the __init data for kmem_list3 for kmem_cache and
 	 *    the other cache's with kmalloc allocated memory.
 	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
 	 */
 
 	node = numa_mem_id();
 
-	/* 1) create the cache_cache */
+	/* 1) create the kmem_cache */
 	INIT_LIST_HEAD(&slab_caches);
-	list_add(&cache_cache.list, &slab_caches);
-	cache_cache.colour_off = cache_line_size();
-	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
-	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
+	list_add(&kmem_cache->list, &slab_caches);
+	kmem_cache->colour_off = cache_line_size();
+	kmem_cache->array[smp_processor_id()] = &initarray_cache.cache;
+	kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
 
 	/*
 	 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
 	 */
-	cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
+	kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
 				  nr_node_ids * sizeof(struct kmem_list3 *);
-	cache_cache.object_size = cache_cache.size;
-	cache_cache.size = ALIGN(cache_cache.size,
+	kmem_cache->object_size = kmem_cache->size;
+	kmem_cache->size = ALIGN(kmem_cache->object_size,
 					cache_line_size());
-	cache_cache.reciprocal_buffer_size =
-		reciprocal_value(cache_cache.size);
+	kmem_cache->reciprocal_buffer_size =
+		reciprocal_value(kmem_cache->size);
 
 	for (order = 0; order < MAX_ORDER; order++) {
-		cache_estimate(order, cache_cache.size,
-			cache_line_size(), 0, &left_over, &cache_cache.num);
-		if (cache_cache.num)
+		cache_estimate(order, kmem_cache->size,
+			cache_line_size(), 0, &left_over, &kmem_cache->num);
+		if (kmem_cache->num)
 			break;
 	}
-	BUG_ON(!cache_cache.num);
-	cache_cache.gfporder = order;
-	cache_cache.colour = left_over / cache_cache.colour_off;
-	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
+	BUG_ON(!kmem_cache->num);
+	kmem_cache->gfporder = order;
+	kmem_cache->colour = left_over / kmem_cache->colour_off;
+	kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) +
 				      sizeof(struct slab), cache_line_size());
 
 	/* 2+3) create the kmalloc caches */
@@ -1728,15 +1730,15 @@ void __init kmem_cache_init(void)
 
 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
 
-		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
-		memcpy(ptr, cpu_cache_get(&cache_cache),
+		BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache);
+		memcpy(ptr, cpu_cache_get(kmem_cache),
 		       sizeof(struct arraycache_init));
 		/*
 		 * Do not assume that spinlocks can be initialized via memcpy:
 		 */
 		spin_lock_init(&ptr->lock);
 
-		cache_cache.array[smp_processor_id()] = ptr;
+		kmem_cache->array[smp_processor_id()] = ptr;
 
 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
 
@@ -1757,7 +1759,7 @@ void __init kmem_cache_init(void)
 		int nid;
 
 		for_each_online_node(nid) {
-			init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
+			init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
 
 			init_list(malloc_sizes[INDEX_AC].cs_cachep,
 				  &initkmem_list3[SIZE_AC + nid], nid);
@@ -2223,7 +2225,7 @@ void __kmem_cache_destroy(struct kmem_cache *cachep)
 			kfree(l3);
 		}
 	}
-	kmem_cache_free(&cache_cache, cachep);
+	kmem_cache_free(kmem_cache, cachep);
 }
 
 
@@ -2473,7 +2475,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
 		gfp = GFP_NOWAIT;
 
 	/* Get cache's description obj. */
-	cachep = kmem_cache_zalloc(&cache_cache, gfp);
+	cachep = kmem_cache_zalloc(kmem_cache, gfp);
 	if (!cachep)
 		return NULL;
 
@@ -2531,7 +2533,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
 	if (!cachep->num) {
 		printk(KERN_ERR
 		       "kmem_cache_create: couldn't create cache %s.\n", name);
-		kmem_cache_free(&cache_cache, cachep);
+		kmem_cache_free(kmem_cache, cachep);
 		return NULL;
 	}
 	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
@@ -3299,7 +3301,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 
 static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
 {
-	if (cachep == &cache_cache)
+	if (cachep == kmem_cache)
 		return false;
 
 	return should_failslab(cachep->object_size, flags, cachep->flags);
diff --git a/mm/slab.h b/mm/slab.h
index 07a537ed5da..6724aa6f662 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -25,8 +25,14 @@ extern enum slab_state slab_state;
 
 /* The slab cache mutex protects the management structures during changes */
 extern struct mutex slab_mutex;
+
+/* The list of all slab caches on the system */
 extern struct list_head slab_caches;
 
+/* The slab cache that manages slab cache information */
+extern struct kmem_cache *kmem_cache;
+
+/* Functions provided by the slab allocators */
 struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 	size_t align, unsigned long flags, void (*ctor)(void *));
 
diff --git a/mm/slab_common.c b/mm/slab_common.c
index a1c4f0b5aae..5374150f548 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -22,6 +22,7 @@
 enum slab_state slab_state;
 LIST_HEAD(slab_caches);
 DEFINE_MUTEX(slab_mutex);
+struct kmem_cache *kmem_cache;
 
 #ifdef CONFIG_DEBUG_VM
 static int kmem_cache_sanity_check(const char *name, size_t size)
diff --git a/mm/slob.c b/mm/slob.c
index 289be4f4681..7d272c3dcc0 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -622,8 +622,16 @@ int kmem_cache_shrink(struct kmem_cache *d)
 }
 EXPORT_SYMBOL(kmem_cache_shrink);
 
+struct kmem_cache kmem_cache_boot = {
+	.name = "kmem_cache",
+	.size = sizeof(struct kmem_cache),
+	.flags = SLAB_PANIC,
+	.align = ARCH_KMALLOC_MINALIGN,
+};
+
 void __init kmem_cache_init(void)
 {
+	kmem_cache = &kmem_cache_boot;
 	slab_state = UP;
 }
 
diff --git a/mm/slub.c b/mm/slub.c
index 724adea3438..e0d1e047030 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3221,8 +3221,6 @@ void __kmem_cache_destroy(struct kmem_cache *s)
 struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
 EXPORT_SYMBOL(kmalloc_caches);
 
-static struct kmem_cache *kmem_cache;
-
 #ifdef CONFIG_ZONE_DMA
 static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
 #endif
-- 
cgit v1.2.3


From 8f4c765c22deee766319ae9a1db68325f14816e6 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Wed, 5 Sep 2012 00:18:32 +0000
Subject: mm/sl[aou]b: Move freeing of kmem_cache structure to common code

The freeing action is basically the same in all slab allocators.
Move to the common kmem_cache_destroy() function.

Reviewed-by: Glauber Costa <glommer@parallels.com>
Reviewed-by: Joonsoo Kim <js1304@gmail.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c        | 1 -
 mm/slab_common.c | 1 +
 mm/slob.c        | 2 --
 mm/slub.c        | 2 --
 4 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index ef94799a1aa..8ca6ec6301f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2225,7 +2225,6 @@ void __kmem_cache_destroy(struct kmem_cache *cachep)
 			kfree(l3);
 		}
 	}
-	kmem_cache_free(kmem_cache, cachep);
 }
 
 
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 5374150f548..d6deae9108c 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -154,6 +154,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
 				rcu_barrier();
 
 			__kmem_cache_destroy(s);
+			kmem_cache_free(kmem_cache, s);
 		} else {
 			list_add(&s->list, &slab_caches);
 			printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
diff --git a/mm/slob.c b/mm/slob.c
index 7d272c3dcc0..cb4ab967529 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -540,8 +540,6 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 
 void __kmem_cache_destroy(struct kmem_cache *c)
 {
-	kmemleak_free(c);
-	slob_free(c, sizeof(struct kmem_cache));
 }
 
 void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
diff --git a/mm/slub.c b/mm/slub.c
index e0d1e047030..6f932f7a821 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -213,7 +213,6 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
 static inline void sysfs_slab_remove(struct kmem_cache *s)
 {
 	kfree(s->name);
-	kmem_cache_free(kmem_cache, s);
 }
 
 #endif
@@ -5206,7 +5205,6 @@ static void kmem_cache_release(struct kobject *kobj)
 	struct kmem_cache *s = to_slab(kobj);
 
 	kfree(s->name);
-	kmem_cache_free(kmem_cache, s);
 }
 
 static const struct sysfs_ops slab_sysfs_ops = {
-- 
cgit v1.2.3


From 12c3667fb780e20360ad0bde32dfb3591ef609ad Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Tue, 4 Sep 2012 23:38:33 +0000
Subject: mm/sl[aou]b: Get rid of __kmem_cache_destroy

What is done there can be done in __kmem_cache_shutdown.

This affects RCU handling somewhat. On rcu free all slab allocators do
not refer to other management structures than the kmem_cache structure.
Therefore these other structures can be freed before the rcu deferred
free to the page allocator occurs.

Reviewed-by: Joonsoo Kim <js1304@gmail.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c        | 46 +++++++++++++++++++++-------------------------
 mm/slab.h        |  1 -
 mm/slab_common.c |  1 -
 mm/slob.c        |  4 ----
 mm/slub.c        | 10 +++++-----
 5 files changed, 26 insertions(+), 36 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 8ca6ec6301f..de961b48a6a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2208,26 +2208,6 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
 	}
 }
 
-void __kmem_cache_destroy(struct kmem_cache *cachep)
-{
-	int i;
-	struct kmem_list3 *l3;
-
-	for_each_online_cpu(i)
-	    kfree(cachep->array[i]);
-
-	/* NUMA: free the list3 structures */
-	for_each_online_node(i) {
-		l3 = cachep->nodelists[i];
-		if (l3) {
-			kfree(l3->shared);
-			free_alien_cache(l3->alien);
-			kfree(l3);
-		}
-	}
-}
-
-
 /**
  * calculate_slab_order - calculate size (page order) of slabs
  * @cachep: pointer to the cache that is being created
@@ -2364,9 +2344,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
  * Cannot be called within a int, but can be interrupted.
  * The @ctor is run when new pages are allocated by the cache.
  *
- * @name must be valid until the cache is destroyed. This implies that
- * the module calling this has to destroy the cache before getting unloaded.
- *
  * The flags are
  *
  * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
@@ -2591,7 +2568,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
 	cachep->refcount = 1;
 
 	if (setup_cpu_cache(cachep, gfp)) {
-		__kmem_cache_destroy(cachep);
+		__kmem_cache_shutdown(cachep);
 		return NULL;
 	}
 
@@ -2766,7 +2743,26 @@ EXPORT_SYMBOL(kmem_cache_shrink);
 
 int __kmem_cache_shutdown(struct kmem_cache *cachep)
 {
-	return __cache_shrink(cachep);
+	int i;
+	struct kmem_list3 *l3;
+	int rc = __cache_shrink(cachep);
+
+	if (rc)
+		return rc;
+
+	for_each_online_cpu(i)
+	    kfree(cachep->array[i]);
+
+	/* NUMA: free the list3 structures */
+	for_each_online_node(i) {
+		l3 = cachep->nodelists[i];
+		if (l3) {
+			kfree(l3->shared);
+			free_alien_cache(l3->alien);
+			kfree(l3);
+		}
+	}
+	return 0;
 }
 
 /*
diff --git a/mm/slab.h b/mm/slab.h
index 6724aa6f662..c4f9a361bd1 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -37,6 +37,5 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 	size_t align, unsigned long flags, void (*ctor)(void *));
 
 int __kmem_cache_shutdown(struct kmem_cache *);
-void __kmem_cache_destroy(struct kmem_cache *);
 
 #endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
index d6deae9108c..7df814e8fbe 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -153,7 +153,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
 			if (s->flags & SLAB_DESTROY_BY_RCU)
 				rcu_barrier();
 
-			__kmem_cache_destroy(s);
 			kmem_cache_free(kmem_cache, s);
 		} else {
 			list_add(&s->list, &slab_caches);
diff --git a/mm/slob.c b/mm/slob.c
index cb4ab967529..50f60532270 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -538,10 +538,6 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 	return c;
 }
 
-void __kmem_cache_destroy(struct kmem_cache *c)
-{
-}
-
 void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 {
 	void *b;
diff --git a/mm/slub.c b/mm/slub.c
index 6f932f7a821..e5e09873f5e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3205,12 +3205,12 @@ static inline int kmem_cache_close(struct kmem_cache *s)
 
 int __kmem_cache_shutdown(struct kmem_cache *s)
 {
-	return kmem_cache_close(s);
-}
+	int rc = kmem_cache_close(s);
 
-void __kmem_cache_destroy(struct kmem_cache *s)
-{
-	sysfs_slab_remove(s);
+	if (!rc)
+		sysfs_slab_remove(s);
+
+	return rc;
 }
 
 /********************************************************************
-- 
cgit v1.2.3


From db265eca77000c5dafc5608975afe8dafb2a02d5 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Tue, 4 Sep 2012 23:18:33 +0000
Subject: mm/sl[aou]b: Move duping of slab name to slab_common.c

Duping of the slabname has to be done by each slab. Moving this code to
slab_common avoids duplicate implementations.

With this patch we have common string handling for all slab allocators.
Strings passed to kmem_cache_create() are copied internally. Subsystems
can create temporary strings to create slab caches.

Slabs allocated in early states of bootstrap will never be freed (and
those can never be freed since they are essential to slab allocator
operations).  During bootstrap we therefore do not have to worry about
duping names.

Reviewed-by: Glauber Costa <glommer@parallels.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab_common.c | 30 +++++++++++++++++++++---------
 mm/slub.c        | 21 ++-------------------
 2 files changed, 23 insertions(+), 28 deletions(-)

(limited to 'mm')

diff --git a/mm/slab_common.c b/mm/slab_common.c
index 7df814e8fbe..f18c06fd97c 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -100,6 +100,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
 {
 	struct kmem_cache *s = NULL;
 	int err = 0;
+	char *n;
 
 	get_online_cpus();
 	mutex_lock(&slab_mutex);
@@ -108,16 +109,26 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
 		goto out_locked;
 
 
-	s = __kmem_cache_create(name, size, align, flags, ctor);
-	if (!s)
-		err = -ENOSYS; /* Until __kmem_cache_create returns code */
+	n = kstrdup(name, GFP_KERNEL);
+	if (!n) {
+		err = -ENOMEM;
+		goto out_locked;
+	}
+
+	s = __kmem_cache_create(n, size, align, flags, ctor);
+
+	if (s) {
+		/*
+		 * Check if the slab has actually been created and if it was a
+		 * real instatiation. Aliases do not belong on the list
+		 */
+		if (s->refcount == 1)
+			list_add(&s->list, &slab_caches);
 
-	/*
-	 * Check if the slab has actually been created and if it was a
-	 * real instatiation. Aliases do not belong on the list
-	 */
-	if (s && s->refcount == 1)
-		list_add(&s->list, &slab_caches);
+	} else {
+		kfree(n);
+		err = -ENOSYS; /* Until __kmem_cache_create returns code */
+	}
 
 out_locked:
 	mutex_unlock(&slab_mutex);
@@ -153,6 +164,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
 			if (s->flags & SLAB_DESTROY_BY_RCU)
 				rcu_barrier();
 
+			kfree(s->name);
 			kmem_cache_free(kmem_cache, s);
 		} else {
 			list_add(&s->list, &slab_caches);
diff --git a/mm/slub.c b/mm/slub.c
index e5e09873f5e..91c9a2fe676 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -210,10 +210,7 @@ static void sysfs_slab_remove(struct kmem_cache *);
 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
 							{ return 0; }
-static inline void sysfs_slab_remove(struct kmem_cache *s)
-{
-	kfree(s->name);
-}
+static inline void sysfs_slab_remove(struct kmem_cache *s) { }
 
 #endif
 
@@ -3929,7 +3926,6 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 		size_t align, unsigned long flags, void (*ctor)(void *))
 {
 	struct kmem_cache *s;
-	char *n;
 
 	s = find_mergeable(size, align, flags, name, ctor);
 	if (s) {
@@ -3948,13 +3944,9 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 		return s;
 	}
 
-	n = kstrdup(name, GFP_KERNEL);
-	if (!n)
-		return NULL;
-
 	s = kmem_cache_alloc(kmem_cache, GFP_KERNEL);
 	if (s) {
-		if (kmem_cache_open(s, n,
+		if (kmem_cache_open(s, name,
 				size, align, flags, ctor)) {
 			int r;
 
@@ -3969,7 +3961,6 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 		}
 		kmem_cache_free(kmem_cache, s);
 	}
-	kfree(n);
 	return NULL;
 }
 
@@ -5200,13 +5191,6 @@ static ssize_t slab_attr_store(struct kobject *kobj,
 	return err;
 }
 
-static void kmem_cache_release(struct kobject *kobj)
-{
-	struct kmem_cache *s = to_slab(kobj);
-
-	kfree(s->name);
-}
-
 static const struct sysfs_ops slab_sysfs_ops = {
 	.show = slab_attr_show,
 	.store = slab_attr_store,
@@ -5214,7 +5198,6 @@ static const struct sysfs_ops slab_sysfs_ops = {
 
 static struct kobj_type slab_ktype = {
 	.sysfs_ops = &slab_sysfs_ops,
-	.release = kmem_cache_release
 };
 
 static int uevent_filter(struct kset *kset, struct kobject *kobj)
-- 
cgit v1.2.3


From cbb79694d592e9a76880f6ef6db8feccaeee1c32 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Wed, 5 Sep 2012 00:18:32 +0000
Subject: mm/sl[aou]b: Do slab aliasing call from common code

The slab aliasing logic causes some strange contortions in slub. So add
a call to deal with aliases to slab_common.c but disable it for other
slab allocators by providng stubs that fail to create aliases.

Full general support for aliases will require additional cleanup passes
and more standardization of fields in kmem_cache.

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.h        | 10 ++++++++++
 mm/slab_common.c |  4 ++++
 mm/slub.c        | 15 +++++++++++----
 3 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.h b/mm/slab.h
index c4f9a361bd1..84c28f451d2 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -36,6 +36,16 @@ extern struct kmem_cache *kmem_cache;
 struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 	size_t align, unsigned long flags, void (*ctor)(void *));
 
+#ifdef CONFIG_SLUB
+struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
+	size_t align, unsigned long flags, void (*ctor)(void *));
+#else
+static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
+	size_t align, unsigned long flags, void (*ctor)(void *))
+{ return NULL; }
+#endif
+
+
 int __kmem_cache_shutdown(struct kmem_cache *);
 
 #endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f18c06fd97c..adc42b01b25 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -115,6 +115,10 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
 		goto out_locked;
 	}
 
+	s = __kmem_cache_alias(name, size, align, flags, ctor);
+	if (s)
+		goto out_locked;
+
 	s = __kmem_cache_create(n, size, align, flags, ctor);
 
 	if (s) {
diff --git a/mm/slub.c b/mm/slub.c
index 91c9a2fe676..64d445e7a27 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3708,7 +3708,7 @@ void __init kmem_cache_init(void)
 		slub_max_order = 0;
 
 	kmem_size = offsetof(struct kmem_cache, node) +
-				nr_node_ids * sizeof(struct kmem_cache_node *);
+			nr_node_ids * sizeof(struct kmem_cache_node *);
 
 	/* Allocate two kmem_caches from the page allocator */
 	kmalloc_size = ALIGN(kmem_size, cache_line_size());
@@ -3922,7 +3922,7 @@ static struct kmem_cache *find_mergeable(size_t size,
 	return NULL;
 }
 
-struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
+struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
 		size_t align, unsigned long flags, void (*ctor)(void *))
 {
 	struct kmem_cache *s;
@@ -3939,11 +3939,18 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 
 		if (sysfs_slab_alias(s, name)) {
 			s->refcount--;
-			return NULL;
+			s = NULL;
 		}
-		return s;
 	}
 
+	return s;
+}
+
+struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
+		size_t align, unsigned long flags, void (*ctor)(void *))
+{
+	struct kmem_cache *s;
+
 	s = kmem_cache_alloc(kmem_cache, GFP_KERNEL);
 	if (s) {
 		if (kmem_cache_open(s, name,
-- 
cgit v1.2.3


From 96d17b7be0a9849d381442030886211dbb2a7061 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Wed, 5 Sep 2012 00:18:32 +0000
Subject: mm/sl[aou]b: Move sysfs_slab_add to common

Simplify locking by moving the slab_add_sysfs after all locks have been
dropped. Eases the upcoming move to provide sysfs support for all
allocators.

Reviewed-by: Glauber Costa <glommer@parallels.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.h        |  3 +++
 mm/slab_common.c |  8 ++++++++
 mm/slub.c        | 15 ++-------------
 3 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.h b/mm/slab.h
index 84c28f451d2..ec7b94429b9 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -39,10 +39,13 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 #ifdef CONFIG_SLUB
 struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
 	size_t align, unsigned long flags, void (*ctor)(void *));
+extern int sysfs_slab_add(struct kmem_cache *s);
 #else
 static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
 	size_t align, unsigned long flags, void (*ctor)(void *))
 { return NULL; }
+static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
+
 #endif
 
 
diff --git a/mm/slab_common.c b/mm/slab_common.c
index adc42b01b25..4f722084bae 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -152,6 +152,14 @@ out_locked:
 		return NULL;
 	}
 
+	if (s->refcount == 1) {
+		err = sysfs_slab_add(s);
+		if (err)
+			printk(KERN_WARNING "kmem_cache_create(%s) failed to"
+				" create sysfs entry. Error %d\n",
+					name, err);
+	}
+
 	return s;
 }
 EXPORT_SYMBOL(kmem_cache_create);
diff --git a/mm/slub.c b/mm/slub.c
index 64d445e7a27..8d00fd78df2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -202,12 +202,10 @@ struct track {
 enum track_item { TRACK_ALLOC, TRACK_FREE };
 
 #ifdef CONFIG_SYSFS
-static int sysfs_slab_add(struct kmem_cache *);
 static int sysfs_slab_alias(struct kmem_cache *, const char *);
 static void sysfs_slab_remove(struct kmem_cache *);
 
 #else
-static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
 							{ return 0; }
 static inline void sysfs_slab_remove(struct kmem_cache *s) { }
@@ -3955,16 +3953,7 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 	if (s) {
 		if (kmem_cache_open(s, name,
 				size, align, flags, ctor)) {
-			int r;
-
-			mutex_unlock(&slab_mutex);
-			r = sysfs_slab_add(s);
-			mutex_lock(&slab_mutex);
-
-			if (!r)
-				return s;
-
-			kmem_cache_close(s);
+			return s;
 		}
 		kmem_cache_free(kmem_cache, s);
 	}
@@ -5258,7 +5247,7 @@ static char *create_unique_id(struct kmem_cache *s)
 	return name;
 }
 
-static int sysfs_slab_add(struct kmem_cache *s)
+int sysfs_slab_add(struct kmem_cache *s)
 {
 	int err;
 	const char *name;
-- 
cgit v1.2.3


From 278b1bb1313664d4999a7f7d47a8a8d964862d02 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Wed, 5 Sep 2012 00:20:34 +0000
Subject: mm/sl[aou]b: Move kmem_cache allocations into common code

Shift the allocations to common code. That way the allocation and
freeing of the kmem_cache structures is handled by common code.

Reviewed-by: Glauber Costa <glommer@parallels.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c        | 34 ++++++++++++++++------------------
 mm/slab.h        |  4 ++--
 mm/slab_common.c | 18 ++++++++++--------
 mm/slob.c        | 42 +++++++++++++++++-------------------------
 mm/slub.c        | 24 +++++++-----------------
 5 files changed, 52 insertions(+), 70 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index de961b48a6a..abc83334e5f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1676,7 +1676,8 @@ void __init kmem_cache_init(void)
 	 * bug.
 	 */
 
-	sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name,
+	sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+	__kmem_cache_create(sizes[INDEX_AC].cs_cachep, names[INDEX_AC].name,
 					sizes[INDEX_AC].cs_size,
 					ARCH_KMALLOC_MINALIGN,
 					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
@@ -1684,8 +1685,8 @@ void __init kmem_cache_init(void)
 
 	list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches);
 	if (INDEX_AC != INDEX_L3) {
-		sizes[INDEX_L3].cs_cachep =
-			__kmem_cache_create(names[INDEX_L3].name,
+		sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+		__kmem_cache_create(sizes[INDEX_L3].cs_cachep, names[INDEX_L3].name,
 				sizes[INDEX_L3].cs_size,
 				ARCH_KMALLOC_MINALIGN,
 				ARCH_KMALLOC_FLAGS|SLAB_PANIC,
@@ -1704,7 +1705,8 @@ void __init kmem_cache_init(void)
 		 * allow tighter packing of the smaller caches.
 		 */
 		if (!sizes->cs_cachep) {
-			sizes->cs_cachep = __kmem_cache_create(names->name,
+			sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+			__kmem_cache_create(sizes->cs_cachep, names->name,
 					sizes->cs_size,
 					ARCH_KMALLOC_MINALIGN,
 					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
@@ -1712,7 +1714,8 @@ void __init kmem_cache_init(void)
 			list_add(&sizes->cs_cachep->list, &slab_caches);
 		}
 #ifdef CONFIG_ZONE_DMA
-		sizes->cs_dmacachep = __kmem_cache_create(
+		sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+		__kmem_cache_create(sizes->cs_dmacachep,
 					names->name_dma,
 					sizes->cs_size,
 					ARCH_KMALLOC_MINALIGN,
@@ -2356,13 +2359,13 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
  * cacheline.  This can be beneficial if you're counting cycles as closely
  * as davem.
  */
-struct kmem_cache *
-__kmem_cache_create (const char *name, size_t size, size_t align,
+int
+__kmem_cache_create (struct kmem_cache *cachep, const char *name, size_t size, size_t align,
 	unsigned long flags, void (*ctor)(void *))
 {
 	size_t left_over, slab_size, ralign;
-	struct kmem_cache *cachep = NULL;
 	gfp_t gfp;
+	int err;
 
 #if DEBUG
 #if FORCED_DEBUG
@@ -2450,11 +2453,6 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
 	else
 		gfp = GFP_NOWAIT;
 
-	/* Get cache's description obj. */
-	cachep = kmem_cache_zalloc(kmem_cache, gfp);
-	if (!cachep)
-		return NULL;
-
 	cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
 	cachep->object_size = size;
 	cachep->align = align;
@@ -2509,8 +2507,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
 	if (!cachep->num) {
 		printk(KERN_ERR
 		       "kmem_cache_create: couldn't create cache %s.\n", name);
-		kmem_cache_free(kmem_cache, cachep);
-		return NULL;
+		return -E2BIG;
 	}
 	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
 			  + sizeof(struct slab), align);
@@ -2567,9 +2564,10 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
 	cachep->name = name;
 	cachep->refcount = 1;
 
-	if (setup_cpu_cache(cachep, gfp)) {
+	err = setup_cpu_cache(cachep, gfp);
+	if (err) {
 		__kmem_cache_shutdown(cachep);
-		return NULL;
+		return err;
 	}
 
 	if (flags & SLAB_DEBUG_OBJECTS) {
@@ -2582,7 +2580,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
 		slab_set_debugobj_lock_classes(cachep);
 	}
 
-	return cachep;
+	return 0;
 }
 
 #if DEBUG
diff --git a/mm/slab.h b/mm/slab.h
index ec7b94429b9..077b07a24ef 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -33,8 +33,8 @@ extern struct list_head slab_caches;
 extern struct kmem_cache *kmem_cache;
 
 /* Functions provided by the slab allocators */
-struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
-	size_t align, unsigned long flags, void (*ctor)(void *));
+extern int __kmem_cache_create(struct kmem_cache *, const char *name,
+	size_t size, size_t align, unsigned long flags, void (*ctor)(void *));
 
 #ifdef CONFIG_SLUB
 struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 4f722084bae..f50d2ed4fbf 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -119,19 +119,21 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
 	if (s)
 		goto out_locked;
 
-	s = __kmem_cache_create(n, size, align, flags, ctor);
-
+	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
 	if (s) {
-		/*
-		 * Check if the slab has actually been created and if it was a
-		 * real instatiation. Aliases do not belong on the list
-		 */
-		if (s->refcount == 1)
+		err = __kmem_cache_create(s, n, size, align, flags, ctor);
+		if (!err)
+
 			list_add(&s->list, &slab_caches);
 
+		else {
+			kfree(n);
+			kmem_cache_free(kmem_cache, s);
+		}
+
 	} else {
 		kfree(n);
-		err = -ENOSYS; /* Until __kmem_cache_create returns code */
+		err = -ENOMEM;
 	}
 
 out_locked:
diff --git a/mm/slob.c b/mm/slob.c
index 50f60532270..9b0cee1e847 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -508,34 +508,26 @@ size_t ksize(const void *block)
 }
 EXPORT_SYMBOL(ksize);
 
-struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
+int __kmem_cache_create(struct kmem_cache *c, const char *name, size_t size,
 	size_t align, unsigned long flags, void (*ctor)(void *))
 {
-	struct kmem_cache *c;
-
-	c = slob_alloc(sizeof(struct kmem_cache),
-		GFP_KERNEL, ARCH_KMALLOC_MINALIGN, -1);
-
-	if (c) {
-		c->name = name;
-		c->size = size;
-		if (flags & SLAB_DESTROY_BY_RCU) {
-			/* leave room for rcu footer at the end of object */
-			c->size += sizeof(struct slob_rcu);
-		}
-		c->flags = flags;
-		c->ctor = ctor;
-		/* ignore alignment unless it's forced */
-		c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
-		if (c->align < ARCH_SLAB_MINALIGN)
-			c->align = ARCH_SLAB_MINALIGN;
-		if (c->align < align)
-			c->align = align;
-
-		kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL);
-		c->refcount = 1;
+	c->name = name;
+	c->size = size;
+	if (flags & SLAB_DESTROY_BY_RCU) {
+		/* leave room for rcu footer at the end of object */
+		c->size += sizeof(struct slob_rcu);
 	}
-	return c;
+	c->flags = flags;
+	c->ctor = ctor;
+	/* ignore alignment unless it's forced */
+	c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
+	if (c->align < ARCH_SLAB_MINALIGN)
+		c->align = ARCH_SLAB_MINALIGN;
+	if (c->align < align)
+		c->align = align;
+
+	c->refcount = 1;
+	return 0;
 }
 
 void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
diff --git a/mm/slub.c b/mm/slub.c
index 8d00fd78df2..0ad3fffc7d2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3034,7 +3034,6 @@ static int kmem_cache_open(struct kmem_cache *s,
 		size_t align, unsigned long flags,
 		void (*ctor)(void *))
 {
-	memset(s, 0, kmem_size);
 	s->name = name;
 	s->ctor = ctor;
 	s->object_size = size;
@@ -3109,7 +3108,7 @@ static int kmem_cache_open(struct kmem_cache *s,
 		goto error;
 
 	if (alloc_kmem_cache_cpus(s))
-		return 1;
+		return 0;
 
 	free_kmem_cache_nodes(s);
 error:
@@ -3118,7 +3117,7 @@ error:
 			"order=%u offset=%u flags=%lx\n",
 			s->name, (unsigned long)size, s->size, oo_order(s->oo),
 			s->offset, flags);
-	return 0;
+	return -EINVAL;
 }
 
 /*
@@ -3260,13 +3259,13 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name,
 {
 	struct kmem_cache *s;
 
-	s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
+	s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
 
 	/*
 	 * This function is called with IRQs disabled during early-boot on
 	 * single CPU so there's no need to take slab_mutex here.
 	 */
-	if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
+	if (kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
 								flags, NULL))
 		goto panic;
 
@@ -3944,20 +3943,11 @@ struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
 	return s;
 }
 
-struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
+int __kmem_cache_create(struct kmem_cache *s,
+		const char *name, size_t size,
 		size_t align, unsigned long flags, void (*ctor)(void *))
 {
-	struct kmem_cache *s;
-
-	s = kmem_cache_alloc(kmem_cache, GFP_KERNEL);
-	if (s) {
-		if (kmem_cache_open(s, name,
-				size, align, flags, ctor)) {
-			return s;
-		}
-		kmem_cache_free(kmem_cache, s);
-	}
-	return NULL;
+	return kmem_cache_open(s, name, size, align, flags, ctor);
 }
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From 8a13a4cc80bb25c9eab2e7e56bab724fcfa55fce Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Tue, 4 Sep 2012 23:18:33 +0000
Subject: mm/sl[aou]b: Shrink __kmem_cache_create() parameter lists

Do the initial settings of the fields in common code. This will allow us
to push more processing into common code later and improve readability.

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c        | 73 +++++++++++++++++++++++++-------------------------------
 mm/slab.h        |  3 +--
 mm/slab_common.c | 26 ++++++++++----------
 mm/slob.c        |  8 +++----
 mm/slub.c        | 39 ++++++++++++++----------------
 5 files changed, 68 insertions(+), 81 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index abc83334e5f..f1f6d54e129 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1677,20 +1677,20 @@ void __init kmem_cache_init(void)
 	 */
 
 	sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
-	__kmem_cache_create(sizes[INDEX_AC].cs_cachep, names[INDEX_AC].name,
-					sizes[INDEX_AC].cs_size,
-					ARCH_KMALLOC_MINALIGN,
-					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
-					NULL);
-
+	sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name;
+	sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size;
+	sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size;
+	sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
+	__kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
 	list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches);
+
 	if (INDEX_AC != INDEX_L3) {
 		sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
-		__kmem_cache_create(sizes[INDEX_L3].cs_cachep, names[INDEX_L3].name,
-				sizes[INDEX_L3].cs_size,
-				ARCH_KMALLOC_MINALIGN,
-				ARCH_KMALLOC_FLAGS|SLAB_PANIC,
-				NULL);
+		sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name;
+		sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size;
+		sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size;
+		sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
+		__kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
 		list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches);
 	}
 
@@ -1706,22 +1706,21 @@ void __init kmem_cache_init(void)
 		 */
 		if (!sizes->cs_cachep) {
 			sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
-			__kmem_cache_create(sizes->cs_cachep, names->name,
-					sizes->cs_size,
-					ARCH_KMALLOC_MINALIGN,
-					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
-					NULL);
+			sizes->cs_cachep->name = names->name;
+			sizes->cs_cachep->size = sizes->cs_size;
+			sizes->cs_cachep->object_size = sizes->cs_size;
+			sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN;
+			__kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
 			list_add(&sizes->cs_cachep->list, &slab_caches);
 		}
 #ifdef CONFIG_ZONE_DMA
 		sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+		sizes->cs_dmacachep->name = names->name_dma;
+		sizes->cs_dmacachep->size = sizes->cs_size;
+		sizes->cs_dmacachep->object_size = sizes->cs_size;
+		sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN;
 		__kmem_cache_create(sizes->cs_dmacachep,
-					names->name_dma,
-					sizes->cs_size,
-					ARCH_KMALLOC_MINALIGN,
-					ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
-						SLAB_PANIC,
-					NULL);
+			       ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC);
 		list_add(&sizes->cs_dmacachep->list, &slab_caches);
 #endif
 		sizes++;
@@ -2360,12 +2359,12 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
  * as davem.
  */
 int
-__kmem_cache_create (struct kmem_cache *cachep, const char *name, size_t size, size_t align,
-	unsigned long flags, void (*ctor)(void *))
+__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 {
 	size_t left_over, slab_size, ralign;
 	gfp_t gfp;
 	int err;
+	size_t size = cachep->size;
 
 #if DEBUG
 #if FORCED_DEBUG
@@ -2437,8 +2436,8 @@ __kmem_cache_create (struct kmem_cache *cachep, const char *name, size_t size, s
 		ralign = ARCH_SLAB_MINALIGN;
 	}
 	/* 3) caller mandated alignment */
-	if (ralign < align) {
-		ralign = align;
+	if (ralign < cachep->align) {
+		ralign = cachep->align;
 	}
 	/* disable debug if necessary */
 	if (ralign > __alignof__(unsigned long long))
@@ -2446,7 +2445,7 @@ __kmem_cache_create (struct kmem_cache *cachep, const char *name, size_t size, s
 	/*
 	 * 4) Store it.
 	 */
-	align = ralign;
+	cachep->align = ralign;
 
 	if (slab_is_available())
 		gfp = GFP_KERNEL;
@@ -2454,8 +2453,6 @@ __kmem_cache_create (struct kmem_cache *cachep, const char *name, size_t size, s
 		gfp = GFP_NOWAIT;
 
 	cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
-	cachep->object_size = size;
-	cachep->align = align;
 #if DEBUG
 
 	/*
@@ -2500,17 +2497,15 @@ __kmem_cache_create (struct kmem_cache *cachep, const char *name, size_t size, s
 		 */
 		flags |= CFLGS_OFF_SLAB;
 
-	size = ALIGN(size, align);
+	size = ALIGN(size, cachep->align);
 
-	left_over = calculate_slab_order(cachep, size, align, flags);
+	left_over = calculate_slab_order(cachep, size, cachep->align, flags);
 
-	if (!cachep->num) {
-		printk(KERN_ERR
-		       "kmem_cache_create: couldn't create cache %s.\n", name);
+	if (!cachep->num)
 		return -E2BIG;
-	}
+
 	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
-			  + sizeof(struct slab), align);
+			  + sizeof(struct slab), cachep->align);
 
 	/*
 	 * If the slab has been placed off-slab, and we have enough space then
@@ -2538,8 +2533,8 @@ __kmem_cache_create (struct kmem_cache *cachep, const char *name, size_t size, s
 
 	cachep->colour_off = cache_line_size();
 	/* Offset must be a multiple of the alignment. */
-	if (cachep->colour_off < align)
-		cachep->colour_off = align;
+	if (cachep->colour_off < cachep->align)
+		cachep->colour_off = cachep->align;
 	cachep->colour = left_over / cachep->colour_off;
 	cachep->slab_size = slab_size;
 	cachep->flags = flags;
@@ -2560,8 +2555,6 @@ __kmem_cache_create (struct kmem_cache *cachep, const char *name, size_t size, s
 		 */
 		BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
 	}
-	cachep->ctor = ctor;
-	cachep->name = name;
 	cachep->refcount = 1;
 
 	err = setup_cpu_cache(cachep, gfp);
diff --git a/mm/slab.h b/mm/slab.h
index 077b07a24ef..67aeaa2d39c 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -33,8 +33,7 @@ extern struct list_head slab_caches;
 extern struct kmem_cache *kmem_cache;
 
 /* Functions provided by the slab allocators */
-extern int __kmem_cache_create(struct kmem_cache *, const char *name,
-	size_t size, size_t align, unsigned long flags, void (*ctor)(void *));
+extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags);
 
 #ifdef CONFIG_SLUB
 struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f50d2ed4fbf..8a85a19d90e 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -100,7 +100,6 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
 {
 	struct kmem_cache *s = NULL;
 	int err = 0;
-	char *n;
 
 	get_online_cpus();
 	mutex_lock(&slab_mutex);
@@ -109,32 +108,33 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
 		goto out_locked;
 
 
-	n = kstrdup(name, GFP_KERNEL);
-	if (!n) {
-		err = -ENOMEM;
-		goto out_locked;
-	}
-
 	s = __kmem_cache_alias(name, size, align, flags, ctor);
 	if (s)
 		goto out_locked;
 
 	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
 	if (s) {
-		err = __kmem_cache_create(s, n, size, align, flags, ctor);
+		s->object_size = s->size = size;
+		s->align = align;
+		s->ctor = ctor;
+		s->name = kstrdup(name, GFP_KERNEL);
+		if (!s->name) {
+			kmem_cache_free(kmem_cache, s);
+			err = -ENOMEM;
+			goto out_locked;
+		}
+
+		err = __kmem_cache_create(s, flags);
 		if (!err)
 
 			list_add(&s->list, &slab_caches);
 
 		else {
-			kfree(n);
+			kfree(s->name);
 			kmem_cache_free(kmem_cache, s);
 		}
-
-	} else {
-		kfree(n);
+	} else
 		err = -ENOMEM;
-	}
 
 out_locked:
 	mutex_unlock(&slab_mutex);
diff --git a/mm/slob.c b/mm/slob.c
index 9b0cee1e847..cac05d92f32 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -508,17 +508,15 @@ size_t ksize(const void *block)
 }
 EXPORT_SYMBOL(ksize);
 
-int __kmem_cache_create(struct kmem_cache *c, const char *name, size_t size,
-	size_t align, unsigned long flags, void (*ctor)(void *))
+int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
 {
-	c->name = name;
-	c->size = size;
+	size_t align = c->size;
+
 	if (flags & SLAB_DESTROY_BY_RCU) {
 		/* leave room for rcu footer at the end of object */
 		c->size += sizeof(struct slob_rcu);
 	}
 	c->flags = flags;
-	c->ctor = ctor;
 	/* ignore alignment unless it's forced */
 	c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
 	if (c->align < ARCH_SLAB_MINALIGN)
diff --git a/mm/slub.c b/mm/slub.c
index 0ad3fffc7d2..d8ee419d5a1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3029,16 +3029,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 
 }
 
-static int kmem_cache_open(struct kmem_cache *s,
-		const char *name, size_t size,
-		size_t align, unsigned long flags,
-		void (*ctor)(void *))
+static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
 {
-	s->name = name;
-	s->ctor = ctor;
-	s->object_size = size;
-	s->align = align;
-	s->flags = kmem_cache_flags(size, flags, name, ctor);
+	s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
 	s->reserved = 0;
 
 	if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
@@ -3115,7 +3108,7 @@ error:
 	if (flags & SLAB_PANIC)
 		panic("Cannot create slab %s size=%lu realsize=%u "
 			"order=%u offset=%u flags=%lx\n",
-			s->name, (unsigned long)size, s->size, oo_order(s->oo),
+			s->name, (unsigned long)s->size, s->size, oo_order(s->oo),
 			s->offset, flags);
 	return -EINVAL;
 }
@@ -3261,12 +3254,15 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name,
 
 	s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
 
+	s->name = name;
+	s->size = s->object_size = size;
+	s->align = ARCH_KMALLOC_MINALIGN;
+
 	/*
 	 * This function is called with IRQs disabled during early-boot on
 	 * single CPU so there's no need to take slab_mutex here.
 	 */
-	if (kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
-								flags, NULL))
+	if (kmem_cache_open(s, flags))
 		goto panic;
 
 	list_add(&s->list, &slab_caches);
@@ -3719,9 +3715,10 @@ void __init kmem_cache_init(void)
 	 */
 	kmem_cache_node = (void *)kmem_cache + kmalloc_size;
 
-	kmem_cache_open(kmem_cache_node, "kmem_cache_node",
-		sizeof(struct kmem_cache_node),
-		0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+	kmem_cache_node->name = "kmem_cache_node";
+	kmem_cache_node->size = kmem_cache_node->object_size =
+		sizeof(struct kmem_cache_node);
+	kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
 
 	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
 
@@ -3729,8 +3726,10 @@ void __init kmem_cache_init(void)
 	slab_state = PARTIAL;
 
 	temp_kmem_cache = kmem_cache;
-	kmem_cache_open(kmem_cache, "kmem_cache", kmem_size,
-		0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+	kmem_cache->name = "kmem_cache";
+	kmem_cache->size = kmem_cache->object_size = kmem_size;
+	kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+
 	kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
 	memcpy(kmem_cache, temp_kmem_cache, kmem_size);
 
@@ -3943,11 +3942,9 @@ struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
 	return s;
 }
 
-int __kmem_cache_create(struct kmem_cache *s,
-		const char *name, size_t size,
-		size_t align, unsigned long flags, void (*ctor)(void *))
+int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
 {
-	return kmem_cache_open(s, name, size, align, flags, ctor);
+	return kmem_cache_open(s, flags);
 }
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From cce89f4f6911286500cf7be0363f46c9b0a12ce0 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Tue, 4 Sep 2012 23:38:33 +0000
Subject: mm/sl[aou]b: Move kmem_cache refcounting to common code

Get rid of the refcount stuff in the allocators and do that part of
kmem_cache management in the common code.

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c        | 1 -
 mm/slab_common.c | 5 +++--
 mm/slob.c        | 1 -
 mm/slub.c        | 1 -
 4 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index f1f6d54e129..11d9af5f9d2 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2555,7 +2555,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 		 */
 		BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
 	}
-	cachep->refcount = 1;
 
 	err = setup_cpu_cache(cachep, gfp);
 	if (err) {
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 8a85a19d90e..651a3c60847 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -125,11 +125,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
 		}
 
 		err = __kmem_cache_create(s, flags);
-		if (!err)
+		if (!err) {
 
+			s->refcount = 1;
 			list_add(&s->list, &slab_caches);
 
-		else {
+		} else {
 			kfree(s->name);
 			kmem_cache_free(kmem_cache, s);
 		}
diff --git a/mm/slob.c b/mm/slob.c
index cac05d92f32..3edfeaac320 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -524,7 +524,6 @@ int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
 	if (c->align < align)
 		c->align = align;
 
-	c->refcount = 1;
 	return 0;
 }
 
diff --git a/mm/slub.c b/mm/slub.c
index d8ee419d5a1..0b122d8ec21 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3093,7 +3093,6 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
 	else
 		s->cpu_partial = 30;
 
-	s->refcount = 1;
 #ifdef CONFIG_NUMA
 	s->remote_node_defrag_ratio = 1000;
 #endif
-- 
cgit v1.2.3


From aac3a1664aba429f47c70edfc76ee10fcd808471 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Wed, 5 Sep 2012 12:07:44 +0300
Subject: Revert "mm/sl[aou]b: Move sysfs_slab_add to common"

This reverts commit 96d17b7be0a9849d381442030886211dbb2a7061 which
caused the following errors at boot:

  [    1.114885] kobject (ffff88001a802578): tried to init an initialized object, something is seriously wrong.
  [    1.114885] Pid: 1, comm: swapper/0 Tainted: G        W    3.6.0-rc1+ #6
  [    1.114885] Call Trace:
  [    1.114885]  [<ffffffff81273f37>] kobject_init+0x87/0xa0
  [    1.115555]  [<ffffffff8127426a>] kobject_init_and_add+0x2a/0x90
  [    1.115555]  [<ffffffff8127c870>] ? sprintf+0x40/0x50
  [    1.115555]  [<ffffffff81124c60>] sysfs_slab_add+0x80/0x210
  [    1.115555]  [<ffffffff81100175>] kmem_cache_create+0xa5/0x250
  [    1.115555]  [<ffffffff81cf24cd>] ? md_init+0x144/0x144
  [    1.115555]  [<ffffffff81cf25b6>] local_init+0xa4/0x11b
  [    1.115555]  [<ffffffff81cf24e1>] dm_init+0x14/0x45
  [    1.115836]  [<ffffffff810001ba>] do_one_initcall+0x3a/0x160
  [    1.116834]  [<ffffffff81cc2c90>] kernel_init+0x133/0x1b7
  [    1.117835]  [<ffffffff81cc25c4>] ? do_early_param+0x86/0x86
  [    1.117835]  [<ffffffff8171aff4>] kernel_thread_helper+0x4/0x10
  [    1.118401]  [<ffffffff81cc2b5d>] ? start_kernel+0x33f/0x33f
  [    1.119832]  [<ffffffff8171aff0>] ? gs_change+0xb/0xb
  [    1.120325] ------------[ cut here ]------------
  [    1.120835] WARNING: at fs/sysfs/dir.c:536 sysfs_add_one+0xc1/0xf0()
  [    1.121437] sysfs: cannot create duplicate filename '/kernel/slab/:t-0000016'
  [    1.121831] Modules linked in:
  [    1.122138] Pid: 1, comm: swapper/0 Tainted: G        W    3.6.0-rc1+ #6
  [    1.122831] Call Trace:
  [    1.123074]  [<ffffffff81195ce1>] ? sysfs_add_one+0xc1/0xf0
  [    1.123833]  [<ffffffff8103adfa>] warn_slowpath_common+0x7a/0xb0
  [    1.124405]  [<ffffffff8103aed1>] warn_slowpath_fmt+0x41/0x50
  [    1.124832]  [<ffffffff81195ce1>] sysfs_add_one+0xc1/0xf0
  [    1.125337]  [<ffffffff81195eb3>] create_dir+0x73/0xd0
  [    1.125832]  [<ffffffff81196221>] sysfs_create_dir+0x81/0xe0
  [    1.126363]  [<ffffffff81273d3d>] kobject_add_internal+0x9d/0x210
  [    1.126832]  [<ffffffff812742a3>] kobject_init_and_add+0x63/0x90
  [    1.127406]  [<ffffffff81124c60>] sysfs_slab_add+0x80/0x210
  [    1.127832]  [<ffffffff81100175>] kmem_cache_create+0xa5/0x250
  [    1.128384]  [<ffffffff81cf24cd>] ? md_init+0x144/0x144
  [    1.128833]  [<ffffffff81cf25b6>] local_init+0xa4/0x11b
  [    1.129831]  [<ffffffff81cf24e1>] dm_init+0x14/0x45
  [    1.130305]  [<ffffffff810001ba>] do_one_initcall+0x3a/0x160
  [    1.130831]  [<ffffffff81cc2c90>] kernel_init+0x133/0x1b7
  [    1.131351]  [<ffffffff81cc25c4>] ? do_early_param+0x86/0x86
  [    1.131830]  [<ffffffff8171aff4>] kernel_thread_helper+0x4/0x10
  [    1.132392]  [<ffffffff81cc2b5d>] ? start_kernel+0x33f/0x33f
  [    1.132830]  [<ffffffff8171aff0>] ? gs_change+0xb/0xb
  [    1.133315] ---[ end trace 2703540871c8fab7 ]---
  [    1.133830] ------------[ cut here ]------------
  [    1.134274] WARNING: at lib/kobject.c:196 kobject_add_internal+0x1f5/0x210()
  [    1.134829] kobject_add_internal failed for :t-0000016 with -EEXIST, don't try to register things with the same name in the same directory.
  [    1.135829] Modules linked in:
  [    1.136135] Pid: 1, comm: swapper/0 Tainted: G        W    3.6.0-rc1+ #6
  [    1.136828] Call Trace:
  [    1.137071]  [<ffffffff81273e95>] ? kobject_add_internal+0x1f5/0x210
  [    1.137830]  [<ffffffff8103adfa>] warn_slowpath_common+0x7a/0xb0
  [    1.138402]  [<ffffffff8103aed1>] warn_slowpath_fmt+0x41/0x50
  [    1.138830]  [<ffffffff811955a3>] ? release_sysfs_dirent+0x73/0xf0
  [    1.139419]  [<ffffffff81273e95>] kobject_add_internal+0x1f5/0x210
  [    1.139830]  [<ffffffff812742a3>] kobject_init_and_add+0x63/0x90
  [    1.140429]  [<ffffffff81124c60>] sysfs_slab_add+0x80/0x210
  [    1.140830]  [<ffffffff81100175>] kmem_cache_create+0xa5/0x250
  [    1.141829]  [<ffffffff81cf24cd>] ? md_init+0x144/0x144
  [    1.142307]  [<ffffffff81cf25b6>] local_init+0xa4/0x11b
  [    1.142829]  [<ffffffff81cf24e1>] dm_init+0x14/0x45
  [    1.143307]  [<ffffffff810001ba>] do_one_initcall+0x3a/0x160
  [    1.143829]  [<ffffffff81cc2c90>] kernel_init+0x133/0x1b7
  [    1.144352]  [<ffffffff81cc25c4>] ? do_early_param+0x86/0x86
  [    1.144829]  [<ffffffff8171aff4>] kernel_thread_helper+0x4/0x10
  [    1.145405]  [<ffffffff81cc2b5d>] ? start_kernel+0x33f/0x33f
  [    1.145828]  [<ffffffff8171aff0>] ? gs_change+0xb/0xb
  [    1.146313] ---[ end trace 2703540871c8fab8 ]---

Conflicts:

	mm/slub.c

Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.h        |  3 ---
 mm/slab_common.c |  8 --------
 mm/slub.c        | 19 +++++++++++++++++--
 3 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.h b/mm/slab.h
index 67aeaa2d39c..7deeb449a30 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -38,13 +38,10 @@ extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags);
 #ifdef CONFIG_SLUB
 struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
 	size_t align, unsigned long flags, void (*ctor)(void *));
-extern int sysfs_slab_add(struct kmem_cache *s);
 #else
 static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
 	size_t align, unsigned long flags, void (*ctor)(void *))
 { return NULL; }
-static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
-
 #endif
 
 
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 651a3c60847..9c217255ac4 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -155,14 +155,6 @@ out_locked:
 		return NULL;
 	}
 
-	if (s->refcount == 1) {
-		err = sysfs_slab_add(s);
-		if (err)
-			printk(KERN_WARNING "kmem_cache_create(%s) failed to"
-				" create sysfs entry. Error %d\n",
-					name, err);
-	}
-
 	return s;
 }
 EXPORT_SYMBOL(kmem_cache_create);
diff --git a/mm/slub.c b/mm/slub.c
index 0b122d8ec21..dafd465f7a3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -202,10 +202,12 @@ struct track {
 enum track_item { TRACK_ALLOC, TRACK_FREE };
 
 #ifdef CONFIG_SYSFS
+static int sysfs_slab_add(struct kmem_cache *);
 static int sysfs_slab_alias(struct kmem_cache *, const char *);
 static void sysfs_slab_remove(struct kmem_cache *);
 
 #else
+static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
 							{ return 0; }
 static inline void sysfs_slab_remove(struct kmem_cache *s) { }
@@ -3943,7 +3945,20 @@ struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
 
 int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
 {
-	return kmem_cache_open(s, flags);
+	int err;
+
+	err = kmem_cache_open(s, flags);
+	if (err)
+		return err;
+
+	mutex_unlock(&slab_mutex);
+	err = sysfs_slab_add(s);
+	mutex_lock(&slab_mutex);
+
+	if (err)
+		kmem_cache_close(s);
+
+	return err;
 }
 
 #ifdef CONFIG_SMP
@@ -5233,7 +5248,7 @@ static char *create_unique_id(struct kmem_cache *s)
 	return name;
 }
 
-int sysfs_slab_add(struct kmem_cache *s)
+static int sysfs_slab_add(struct kmem_cache *s)
 {
 	int err;
 	const char *name;
-- 
cgit v1.2.3


From 9df53b154ac712c87db1170057aa6df05eb7bdbd Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Sat, 8 Sep 2012 18:27:10 +0000
Subject: slub: Zero initial memory segment for kmem_cache and kmem_cache_node

Tony Luck reported the following problem on IA-64:

  Worked fine yesterday on next-20120905, crashes today. First sign of
  trouble was an unaligned access, then a NULL dereference. SL*B related
  bits of my config:

  CONFIG_SLUB_DEBUG=y
  # CONFIG_SLAB is not set
  CONFIG_SLUB=y
  CONFIG_SLABINFO=y
  # CONFIG_SLUB_DEBUG_ON is not set
  # CONFIG_SLUB_STATS is not set

  And he console log.

  PID hash table entries: 4096 (order: 1, 32768 bytes)
  Dentry cache hash table entries: 262144 (order: 7, 2097152 bytes)
  Inode-cache hash table entries: 131072 (order: 6, 1048576 bytes)
  Memory: 2047920k/2086064k available (13992k code, 38144k reserved,
  6012k data, 880k init)
  kernel unaligned access to 0xca2ffc55fb373e95, ip=0xa0000001001be550
  swapper[0]: error during unaligned kernel access
   -1 [1]
  Modules linked in:

  Pid: 0, CPU 0, comm:              swapper
  psr : 00001010084a2018 ifs : 800000000000060f ip  :
  [<a0000001001be550>]    Not tainted (3.6.0-rc4-zx1-smp-next-20120906)
  ip is at new_slab+0x90/0x680
  unat: 0000000000000000 pfs : 000000000000060f rsc : 0000000000000003
  rnat: 9666960159966a59 bsps: a0000001001441c0 pr  : 9666960159965a59
  ldrs: 0000000000000000 ccv : 0000000000000000 fpsr: 0009804c8a70433f
  csd : 0000000000000000 ssd : 0000000000000000
  b0  : a0000001001be500 b6  : a00000010112cb20 b7  : a0000001011660a0
  f6  : 0fff7f0f0f0f0e54f0000 f7  : 0ffe8c5c1000000000000
  f8  : 1000d8000000000000000 f9  : 100068800000000000000
  f10 : 10005f0f0f0f0e54f0000 f11 : 1003e0000000000000078
  r1  : a00000010155eef0 r2  : 0000000000000000 r3  : fffffffffffc1638
  r8  : e0000040600081b8 r9  : ca2ffc55fb373e95 r10 : 0000000000000000
  r11 : e000004040001646 r12 : a000000101287e20 r13 : a000000101280000
  r14 : 0000000000004000 r15 : 0000000000000078 r16 : ca2ffc55fb373e75
  r17 : e000004040040000 r18 : fffffffffffc1646 r19 : e000004040001646
  r20 : fffffffffffc15f8 r21 : 000000000000004d r22 : a00000010132fa68
  r23 : 00000000000000ed r24 : 0000000000000000 r25 : 0000000000000000
  r26 : 0000000000000001 r27 : a0000001012b8500 r28 : a00000010135f4a0
  r29 : 0000000000000000 r30 : 0000000000000000 r31 : 0000000000000001
  Unable to handle kernel NULL pointer dereference (address
  0000000000000018)
  swapper[0]: Oops 11003706212352 [2]
  Modules linked in:

  Pid: 0, CPU 0, comm:              swapper
  psr : 0000121008022018 ifs : 800000000000cc18 ip  :
  [<a0000001004dc8f1>]    Not tainted (3.6.0-rc4-zx1-smp-next-20120906)
  ip is at __copy_user+0x891/0x960
  unat: 0000000000000000 pfs : 0000000000000813 rsc : 0000000000000003
  rnat: 0000000000000000 bsps: 0000000000000000 pr  : 9666960159961765
  ldrs: 0000000000000000 ccv : 0000000000000000 fpsr: 0009804c0270033f
  csd : 0000000000000000 ssd : 0000000000000000
  b0  : a00000010004b550 b6  : a00000010004b740 b7  : a00000010000c750
  f6  : 000000000000000000000 f7  : 1003e9e3779b97f4a7c16
  f8  : 1003e0a00000010001550 f9  : 100068800000000000000
  f10 : 10005f0f0f0f0e54f0000 f11 : 1003e0000000000000078
  r1  : a00000010155eef0 r2  : a0000001012870b0 r3  : a0000001012870b8
  r8  : 0000000000000298 r9  : 0000000000000013 r10 : 0000000000000000
  r11 : 9666960159961a65 r12 : a000000101287010 r13 : a000000101280000
  r14 : a000000101287068 r15 : a000000101287080 r16 : 0000000000000298
  r17 : 0000000000000010 r18 : 0000000000000018 r19 : a000000101287310
  r20 : 0000000000000290 r21 : 0000000000000000 r22 : 0000000000000000
  r23 : a000000101386f58 r24 : 0000000000000000 r25 : 000000007fffffff
  r26 : a000000101287078 r27 : a0000001013c69b0 r28 : 0000000000000000
  r29 : 0000000000000014 r30 : 0000000000000000 r31 : 0000000000000813

Sedat Dilek and Hugh Dickins reported similar problems as well.

Earlier patches in the common set moved the zeroing of the kmem_cache
structure into common code. See "Move allocation of kmem_cache into
common code".

The allocation for the two special structures is still done from SLUB
specific code but no zeroing is done since the cache creation functions
used to zero. This now needs to be updated so that the structures are
zeroed during allocation in kmem_cache_init().  Otherwise random pointer
values may be followed.

Reported-by: Tony Luck <tony.luck@intel.com>
Reported-by: Sedat Dilek <sedat.dilek@gmail.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Reported-by: Hugh Dickins <hughd@google.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index dafd465f7a3..2258ed82880 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3707,7 +3707,7 @@ void __init kmem_cache_init(void)
 	/* Allocate two kmem_caches from the page allocator */
 	kmalloc_size = ALIGN(kmem_size, cache_line_size());
 	order = get_order(2 * kmalloc_size);
-	kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order);
+	kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order);
 
 	/*
 	 * Must first have the slab cache available for the allocations of the
-- 
cgit v1.2.3


From 947ca1856a7e60aa6d20536785e6a42dff25aa6e Mon Sep 17 00:00:00 2001
From: Michael Wang <wangyun@linux.vnet.ibm.com>
Date: Wed, 5 Sep 2012 10:33:18 +0800
Subject: slab: fix the DEADLOCK issue on l3 alien lock

DEADLOCK will be report while running a kernel with NUMA and LOCKDEP enabled,
the process of this fake report is:

	   kmem_cache_free()	//free obj in cachep
	-> cache_free_alien()	//acquire cachep's l3 alien lock
	-> __drain_alien_cache()
	-> free_block()
	-> slab_destroy()
	-> kmem_cache_free()	//free slab in cachep->slabp_cache
	-> cache_free_alien()	//acquire cachep->slabp_cache's l3 alien lock

Since the cachep and cachep->slabp_cache's l3 alien are in the same lock class,
fake report generated.

This should not happen since we already have init_lock_keys() which will
reassign the lock class for both l3 list and l3 alien.

However, init_lock_keys() was invoked at a wrong position which is before we
invoke enable_cpucache() on each cache.

Since until set slab_state to be FULL, we won't invoke enable_cpucache()
on caches to build their l3 alien while creating them, so although we invoked
init_lock_keys(), the l3 alien lock class won't change since we don't have
them until invoked enable_cpucache() later.

This patch will invoke init_lock_keys() after we done enable_cpucache()
instead of before to avoid the fake DEADLOCK report.

Michael traced the problem back to a commit in release 3.0.0:

commit 30765b92ada267c5395fc788623cb15233276f5c
Author: Peter Zijlstra <peterz@infradead.org>
Date:   Thu Jul 28 23:22:56 2011 +0200

    slab, lockdep: Annotate the locks before using them

    Fernando found we hit the regular OFF_SLAB 'recursion' before we
    annotate the locks, cure this.

    The relevant portion of the stack-trace:

    > [    0.000000]  [<c085e24f>] rt_spin_lock+0x50/0x56
    > [    0.000000]  [<c04fb406>] __cache_free+0x43/0xc3
    > [    0.000000]  [<c04fb23f>] kmem_cache_free+0x6c/0xdc
    > [    0.000000]  [<c04fb2fe>] slab_destroy+0x4f/0x53
    > [    0.000000]  [<c04fb396>] free_block+0x94/0xc1
    > [    0.000000]  [<c04fc551>] do_tune_cpucache+0x10b/0x2bb
    > [    0.000000]  [<c04fc8dc>] enable_cpucache+0x7b/0xa7
    > [    0.000000]  [<c0bd9d3c>] kmem_cache_init_late+0x1f/0x61
    > [    0.000000]  [<c0bba687>] start_kernel+0x24c/0x363
    > [    0.000000]  [<c0bba0ba>] i386_start_kernel+0xa9/0xaf

    Reported-by: Fernando Lopez-Lezcano <nando@ccrma.Stanford.EDU>
    Acked-by: Pekka Enberg <penberg@kernel.org>
    Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Link: http://lkml.kernel.org/r/1311888176.2617.379.camel@laptop
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

The commit moved init_lock_keys() before we build up the alien, so we
failed to reclass it.

Cc: <stable@vger.kernel.org> # 3.0+
Acked-by: Christoph Lameter <cl@linux.com>
Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Michael Wang <wangyun@linux.vnet.ibm.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 3b4587bb7b1..cd5a9265030 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1774,9 +1774,6 @@ void __init kmem_cache_init_late(void)
 
 	slab_state = UP;
 
-	/* Annotate slab for lockdep -- annotate the malloc caches */
-	init_lock_keys();
-
 	/* 6) resize the head arrays to their final sizes */
 	mutex_lock(&slab_mutex);
 	list_for_each_entry(cachep, &slab_caches, list)
@@ -1784,6 +1781,9 @@ void __init kmem_cache_init_late(void)
 			BUG();
 	mutex_unlock(&slab_mutex);
 
+	/* Annotate slab for lockdep -- annotate the malloc caches */
+	init_lock_keys();
+
 	/* Done! */
 	slab_state = FULL;
 
-- 
cgit v1.2.3


From 8c7f6edbda01f1b1a2e60ad61f14fe38023e433b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 13 Sep 2012 12:20:58 -0700
Subject: cgroup: mark subsystems with broken hierarchy support and whine if
 cgroups are nested for them

Currently, cgroup hierarchy support is a mess.  cpu related subsystems
behave correctly - configuration, accounting and control on a parent
properly cover its children.  blkio and freezer completely ignore
hierarchy and treat all cgroups as if they're directly under the root
cgroup.  Others show yet different behaviors.

These differing interpretations of cgroup hierarchy make using cgroup
confusing and it impossible to co-mount controllers into the same
hierarchy and obtain sane behavior.

Eventually, we want full hierarchy support from all subsystems and
probably a unified hierarchy.  Users using separate hierarchies
expecting completely different behaviors depending on the mounted
subsystem is deterimental to making any progress on this front.

This patch adds cgroup_subsys.broken_hierarchy and sets it to %true
for controllers which are lacking in hierarchy support.  The goal of
this patch is two-fold.

* Move users away from using hierarchy on currently non-hierarchical
  subsystems, so that implementing proper hierarchy support on those
  doesn't surprise them.

* Keep track of which controllers are broken how and nudge the
  subsystems to implement proper hierarchy support.

For now, start with a single warning message.  We can whine louder
later on.

v2: Fixed a typo spotted by Michal. Warning message updated.

v3: Updated memcg part so that it doesn't generate warning in the
    cases where .use_hierarchy=false doesn't make the behavior
    different from root.use_hierarchy=true.  Fixed a typo spotted by
    Glauber.

v4: Check ->broken_hierarchy after cgroup creation is complete so that
    ->create() can affect the result per Michal.  Dropped unnecessary
    memcg root handling per Michal.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Li Zefan <lizefan@huawei.com>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Turner <pjt@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 mm/memcontrol.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 795e525afab..a72f2ffdc3d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4973,6 +4973,13 @@ mem_cgroup_create(struct cgroup *cont)
 	} else {
 		res_counter_init(&memcg->res, NULL);
 		res_counter_init(&memcg->memsw, NULL);
+		/*
+		 * Deeper hierachy with use_hierarchy == false doesn't make
+		 * much sense so let cgroup subsystem know about this
+		 * unfortunate state in our controller.
+		 */
+		if (parent && parent != root_mem_cgroup)
+			mem_cgroup_subsys.broken_hierarchy = true;
 	}
 	memcg->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&memcg->oom_notify);
-- 
cgit v1.2.3


From f28510d30c7f03daa290019fbc57ad8277347614 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Tue, 11 Sep 2012 19:49:38 +0000
Subject: slab: Only define slab_error for DEBUG

On Tue, 11 Sep 2012, Stephen Rothwell wrote:
> After merging the final tree, today's linux-next build (sparc64 defconfig)
> produced this warning:
>
> mm/slab.c:808:13: warning: '__slab_error' defined but not used [-Wunused-function]
>
> Introduced by commit 945cf2b6199b ("mm/sl[aou]b: Extract a common
> function for kmem_cache_destroy").  All uses of slab_error() are now
> guarded by DEBUG.

There is no use case left for slab builds without DEBUG.

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 11d9af5f9d2..8524923966b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -803,6 +803,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
 	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
 }
 
+#if DEBUG
 #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
 
 static void __slab_error(const char *function, struct kmem_cache *cachep,
@@ -812,6 +813,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
 	       function, cachep->name, msg);
 	dump_stack();
 }
+#endif
 
 /*
  * By default on NUMA we use alien caches to stage the freeing of
-- 
cgit v1.2.3


From 645df230cacc48f4463037016e9dbd3633183fe8 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Tue, 18 Sep 2012 15:54:12 -0400
Subject: mm, sl[au]b: Taint kernel when we detect a corrupted slab

It doesn't seem worth adding a new taint flag for this, so just re-use
the one from 'bad page'

Acked-by: Christoph Lameter <cl@linux.com> # SLUB
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c | 1 +
 mm/slub.c | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index cd5a9265030..5c6abb831e6 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -811,6 +811,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
 	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
 	       function, cachep->name, msg);
 	dump_stack();
+	add_taint(TAINT_BAD_PAGE);
 }
 
 /*
diff --git a/mm/slub.c b/mm/slub.c
index c67bd0a4a95..a6d043e1326 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -568,6 +568,8 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...)
 	printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf);
 	printk(KERN_ERR "----------------------------------------"
 			"-------------------------------------\n\n");
+
+	add_taint(TAINT_BAD_PAGE);
 }
 
 static void slab_fix(struct kmem_cache *s, char *fmt, ...)
-- 
cgit v1.2.3


From a00bb1e9fc0925c0061e9d844523a3a47a7e2f7f Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Date: Fri, 21 Sep 2012 16:40:30 +0800
Subject: mm: frontswap: fix a wrong if condition in frontswap_shrink

pages_to_unuse is set to 0 to unuse all frontswap pages
But that doesn't happen since a wrong condition in frontswap_shrink
cancel it.

-v2: Add comment to explain return value of __frontswap_shrink,
as suggested by Dan Carpenter, thanks

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 mm/frontswap.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/frontswap.c b/mm/frontswap.c
index 89dc399d332..0547a35f798 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -263,6 +263,11 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
 	return ret;
 }
 
+/*
+ * Used to check if it's necessory and feasible to unuse pages.
+ * Return 1 when nothing to do, 0 when need to shink pages,
+ * error code when there is an error.
+ */
 static int __frontswap_shrink(unsigned long target_pages,
 				unsigned long *pages_to_unuse,
 				int *type)
@@ -275,7 +280,7 @@ static int __frontswap_shrink(unsigned long target_pages,
 	if (total_pages <= target_pages) {
 		/* Nothing to do */
 		*pages_to_unuse = 0;
-		return 0;
+		return 1;
 	}
 	total_pages_to_unuse = total_pages - target_pages;
 	return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
@@ -302,7 +307,7 @@ void frontswap_shrink(unsigned long target_pages)
 	spin_lock(&swap_lock);
 	ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
 	spin_unlock(&swap_lock);
-	if (ret == 0 && pages_to_unuse)
+	if (ret == 0)
 		try_to_unuse(type, true, pages_to_unuse);
 	return;
 }
-- 
cgit v1.2.3


From e3483a5f3a8ef448c229a2aceca9b2ad6a46b8ec Mon Sep 17 00:00:00 2001
From: Dan Magenheimer <dan.magenheimer@oracle.com>
Date: Thu, 20 Sep 2012 12:16:52 -0700
Subject: frontswap: support exclusive gets if tmem backend is capable

Tmem, as originally specified, assumes that "get" operations
performed on persistent pools never flush the page of data out
of tmem on a successful get, waiting instead for a flush
operation.  This is intended to mimic the model of a swap
disk, where a disk read is non-destructive.  Unlike a
disk, however, freeing up the RAM can be valuable.  Over
the years that frontswap was in the review process, several
reviewers (and notably Hugh Dickins in 2010) pointed out that
this would result, at least temporarily, in two copies of the
data in RAM: one (compressed for zcache) copy in tmem,
and one copy in the swap cache.  We wondered if this could
be done differently, at least optionally.

This patch allows tmem backends to instruct the frontswap
code that this backend performs exclusive gets.  Zcache2
already contains hooks to support this feature.  Other
backends are completely unaffected unless/until they are
updated to support this feature.

While it is not clear that exclusive gets are a performance
win on all workloads at all times, this small patch allows for
experimentation by backends.

P.S. Let's not quibble about the naming of "get" vs "read" vs
"load" etc.  The naming is currently horribly inconsistent between
cleancache and frontswap and existing tmem backends, so will need
to be straightened out as a separate patch.  "Get" is used
by the tmem architecture spec, existing backends, and
all documentation and presentation material so I am
using it in this patch.

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 mm/frontswap.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/frontswap.c b/mm/frontswap.c
index 0547a35f798..2890e67d602 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -44,6 +44,13 @@ EXPORT_SYMBOL(frontswap_enabled);
  */
 static bool frontswap_writethrough_enabled __read_mostly;
 
+/*
+ * If enabled, the underlying tmem implementation is capable of doing
+ * exclusive gets, so frontswap_load, on a successful tmem_get must
+ * mark the page as no longer in frontswap AND mark it dirty.
+ */
+static bool frontswap_tmem_exclusive_gets_enabled __read_mostly;
+
 #ifdef CONFIG_DEBUG_FS
 /*
  * Counters available via /sys/kernel/debug/frontswap (if debugfs is
@@ -96,6 +103,15 @@ void frontswap_writethrough(bool enable)
 }
 EXPORT_SYMBOL(frontswap_writethrough);
 
+/*
+ * Enable/disable frontswap exclusive gets (see above).
+ */
+void frontswap_tmem_exclusive_gets(bool enable)
+{
+	frontswap_tmem_exclusive_gets_enabled = enable;
+}
+EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
+
 /*
  * Called when a swap device is swapon'd.
  */
@@ -174,8 +190,13 @@ int __frontswap_load(struct page *page)
 	BUG_ON(sis == NULL);
 	if (frontswap_test(sis, offset))
 		ret = frontswap_ops.load(type, offset, page);
-	if (ret == 0)
+	if (ret == 0) {
 		inc_frontswap_loads();
+		if (frontswap_tmem_exclusive_gets_enabled) {
+			SetPageDirty(page);
+			frontswap_clear(sis, offset);
+		}
+	}
 	return ret;
 }
 EXPORT_SYMBOL(__frontswap_load);
-- 
cgit v1.2.3


From 58fac09566bb48592a09ef0fe0c7dbefa0cd2109 Mon Sep 17 00:00:00 2001
From: Michael Wang <wangyun@linux.vnet.ibm.com>
Date: Fri, 17 Aug 2012 12:33:34 +0800
Subject: kmemleak: Replace list_for_each_continue_rcu with new interface

This patch replaces list_for_each_continue_rcu() with
list_for_each_entry_continue_rcu() to save a few lines
of code and allow removing list_for_each_continue_rcu().

Signed-off-by: Michael Wang <wangyun@linux.vnet.ibm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
---
 mm/kmemleak.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 45eb6217bf3..0de83b4541e 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1483,13 +1483,11 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct kmemleak_object *prev_obj = v;
 	struct kmemleak_object *next_obj = NULL;
-	struct list_head *n = &prev_obj->object_list;
+	struct kmemleak_object *obj = prev_obj;
 
 	++(*pos);
 
-	list_for_each_continue_rcu(n, &object_list) {
-		struct kmemleak_object *obj =
-			list_entry(n, struct kmemleak_object, object_list);
+	list_for_each_entry_continue_rcu(obj, &object_list, object_list) {
 		if (get_object(obj)) {
 			next_obj = obj;
 			break;
-- 
cgit v1.2.3


From 90f2cbbc49a8fe5a49cea1d362d90e377b949d49 Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <elezegarcia@gmail.com>
Date: Sat, 8 Sep 2012 17:47:51 -0300
Subject: mm, slob: Use NUMA_NO_NODE instead of -1

Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Ezequiel Garcia <elezegarcia@gmail.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slob.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/slob.c b/mm/slob.c
index 45d4ca79933..191e1713a6d 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -194,7 +194,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
 	void *page;
 
 #ifdef CONFIG_NUMA
-	if (node != -1)
+	if (node != NUMA_NO_NODE)
 		page = alloc_pages_exact_node(node, gfp, order);
 	else
 #endif
@@ -290,7 +290,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
 		 * If there's a node specification, search for a partial
 		 * page with a matching node id in the freelist.
 		 */
-		if (node != -1 && page_to_nid(sp) != node)
+		if (node != NUMA_NO_NODE && page_to_nid(sp) != node)
 			continue;
 #endif
 		/* Enough room on this page? */
@@ -514,7 +514,7 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 	struct kmem_cache *c;
 
 	c = slob_alloc(sizeof(struct kmem_cache),
-		GFP_KERNEL, ARCH_KMALLOC_MINALIGN, -1);
+		GFP_KERNEL, ARCH_KMALLOC_MINALIGN, NUMA_NO_NODE);
 
 	if (c) {
 		c->name = name;
-- 
cgit v1.2.3


From ff4fcd01ec86d98d15d2fd96f22f19bb1d341b88 Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <elezegarcia@gmail.com>
Date: Sat, 8 Sep 2012 17:47:52 -0300
Subject: mm, slab: Remove silly function slab_buffer_size()

This function is seldom used, and can be simply replaced with cachep->size.

Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Ezequiel Garcia <elezegarcia@gmail.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 5c6abb831e6..7072848701e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -498,14 +498,6 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 
 #endif
 
-#ifdef CONFIG_TRACING
-size_t slab_buffer_size(struct kmem_cache *cachep)
-{
-	return cachep->size;
-}
-EXPORT_SYMBOL(slab_buffer_size);
-#endif
-
 /*
  * Do not go above this order unless 0 objects fit into the slab or
  * overridden on the command line.
@@ -3850,7 +3842,7 @@ kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
 	ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
 
 	trace_kmalloc(_RET_IP_, ret,
-		      size, slab_buffer_size(cachep), flags);
+		      size, cachep->size, flags);
 	return ret;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_trace);
@@ -3881,7 +3873,7 @@ void *kmem_cache_alloc_node_trace(size_t size,
 	ret = __cache_alloc_node(cachep, flags, nodeid,
 				  __builtin_return_address(0));
 	trace_kmalloc_node(_RET_IP_, ret,
-			   size, slab_buffer_size(cachep),
+			   size, cachep->size,
 			   flags, nodeid);
 	return ret;
 }
-- 
cgit v1.2.3


From f3f741019595f1e73564d985f5fe8abcbb98c769 Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <elezegarcia@gmail.com>
Date: Sat, 8 Sep 2012 17:47:53 -0300
Subject: mm, slob: Add support for kmalloc_track_caller()

Currently slob falls back to regular kmalloc for this case.
With this patch kmalloc_track_caller() is correctly implemented,
thus tracing the specified caller.

This is important to trace accurately allocations performed by
krealloc, kstrdup, kmemdup, etc.

Signed-off-by: Ezequiel Garcia <elezegarcia@gmail.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slob.c | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/slob.c b/mm/slob.c
index 191e1713a6d..dd47d16d57b 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -425,7 +425,8 @@ out:
  * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
  */
 
-void *__kmalloc_node(size_t size, gfp_t gfp, int node)
+static __always_inline void *
+__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
 {
 	unsigned int *m;
 	int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
@@ -446,7 +447,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 		*m = size;
 		ret = (void *)m + align;
 
-		trace_kmalloc_node(_RET_IP_, ret,
+		trace_kmalloc_node(caller, ret,
 				   size, size + align, gfp, node);
 	} else {
 		unsigned int order = get_order(size);
@@ -460,15 +461,35 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 			page->private = size;
 		}
 
-		trace_kmalloc_node(_RET_IP_, ret,
+		trace_kmalloc_node(caller, ret,
 				   size, PAGE_SIZE << order, gfp, node);
 	}
 
 	kmemleak_alloc(ret, size, 1, gfp);
 	return ret;
 }
+
+void *__kmalloc_node(size_t size, gfp_t gfp, int node)
+{
+	return __do_kmalloc_node(size, gfp, node, _RET_IP_);
+}
 EXPORT_SYMBOL(__kmalloc_node);
 
+#ifdef CONFIG_TRACING
+void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
+{
+	return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
+}
+
+#ifdef CONFIG_NUMA
+void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
+					int node, unsigned long caller)
+{
+	return __do_kmalloc_node(size, gfp, node, caller);
+}
+#endif
+#endif
+
 void kfree(const void *block)
 {
 	struct page *sp;
-- 
cgit v1.2.3


From 7c0cb9c64f83dd9bf18b139ac0de00daba6087c2 Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <elezegarcia@gmail.com>
Date: Sat, 8 Sep 2012 17:47:55 -0300
Subject: mm, slab: Replace 'caller' type, void* -> unsigned long

This allows to use _RET_IP_ instead of builtin_address(0), thus
achiveing implementation consistency in all three allocators.
Though maybe a nitpick, the real goal behind this patch is
to be able to obtain common code between SLAB and SLUB.

Signed-off-by: Ezequiel Garcia <elezegarcia@gmail.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c | 50 ++++++++++++++++++++++++--------------------------
 1 file changed, 24 insertions(+), 26 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 7072848701e..6c356793612 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3084,7 +3084,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
 }
 
 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
-				   void *caller)
+				   unsigned long caller)
 {
 	struct page *page;
 	unsigned int objnr;
@@ -3104,7 +3104,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
 	}
 	if (cachep->flags & SLAB_STORE_USER)
-		*dbg_userword(cachep, objp) = caller;
+		*dbg_userword(cachep, objp) = (void *)caller;
 
 	objnr = obj_to_index(cachep, slabp, objp);
 
@@ -3117,7 +3117,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 	if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
 		if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
-			store_stackinfo(cachep, objp, (unsigned long)caller);
+			store_stackinfo(cachep, objp, caller);
 			kernel_map_pages(virt_to_page(objp),
 					 cachep->size / PAGE_SIZE, 0);
 		} else {
@@ -3270,7 +3270,7 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
 
 #if DEBUG
 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
-				gfp_t flags, void *objp, void *caller)
+				gfp_t flags, void *objp, unsigned long caller)
 {
 	if (!objp)
 		return objp;
@@ -3287,7 +3287,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 		poison_obj(cachep, objp, POISON_INUSE);
 	}
 	if (cachep->flags & SLAB_STORE_USER)
-		*dbg_userword(cachep, objp) = caller;
+		*dbg_userword(cachep, objp) = (void *)caller;
 
 	if (cachep->flags & SLAB_RED_ZONE) {
 		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
@@ -3562,7 +3562,7 @@ done:
  */
 static __always_inline void *
 __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
-		   void *caller)
+		   unsigned long caller)
 {
 	unsigned long save_flags;
 	void *ptr;
@@ -3648,7 +3648,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 #endif /* CONFIG_NUMA */
 
 static __always_inline void *
-__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
+__cache_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
 {
 	unsigned long save_flags;
 	void *objp;
@@ -3784,7 +3784,7 @@ free_done:
  * be in this state _before_ it is released.  Called with disabled ints.
  */
 static inline void __cache_free(struct kmem_cache *cachep, void *objp,
-    void *caller)
+				unsigned long caller)
 {
 	struct array_cache *ac = cpu_cache_get(cachep);
 
@@ -3824,7 +3824,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
  */
 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 {
-	void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
+	void *ret = __cache_alloc(cachep, flags, _RET_IP_);
 
 	trace_kmem_cache_alloc(_RET_IP_, ret,
 			       cachep->object_size, cachep->size, flags);
@@ -3839,7 +3839,7 @@ kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
 {
 	void *ret;
 
-	ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
+	ret = __cache_alloc(cachep, flags, _RET_IP_);
 
 	trace_kmalloc(_RET_IP_, ret,
 		      size, cachep->size, flags);
@@ -3851,8 +3851,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
 #ifdef CONFIG_NUMA
 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 {
-	void *ret = __cache_alloc_node(cachep, flags, nodeid,
-				       __builtin_return_address(0));
+	void *ret = __cache_alloc_node(cachep, flags, nodeid, _RET_IP_);
 
 	trace_kmem_cache_alloc_node(_RET_IP_, ret,
 				    cachep->object_size, cachep->size,
@@ -3870,8 +3869,8 @@ void *kmem_cache_alloc_node_trace(size_t size,
 {
 	void *ret;
 
-	ret = __cache_alloc_node(cachep, flags, nodeid,
-				  __builtin_return_address(0));
+	ret = __cache_alloc_node(cachep, flags, nodeid, _RET_IP);
+
 	trace_kmalloc_node(_RET_IP_, ret,
 			   size, cachep->size,
 			   flags, nodeid);
@@ -3881,7 +3880,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
 #endif
 
 static __always_inline void *
-__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
+__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
 {
 	struct kmem_cache *cachep;
 
@@ -3894,21 +3893,20 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
 void *__kmalloc_node(size_t size, gfp_t flags, int node)
 {
-	return __do_kmalloc_node(size, flags, node,
-			__builtin_return_address(0));
+	return __do_kmalloc_node(size, flags, node, _RET_IP_);
 }
 EXPORT_SYMBOL(__kmalloc_node);
 
 void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
 		int node, unsigned long caller)
 {
-	return __do_kmalloc_node(size, flags, node, (void *)caller);
+	return __do_kmalloc_node(size, flags, node, caller);
 }
 EXPORT_SYMBOL(__kmalloc_node_track_caller);
 #else
 void *__kmalloc_node(size_t size, gfp_t flags, int node)
 {
-	return __do_kmalloc_node(size, flags, node, NULL);
+	return __do_kmalloc_node(size, flags, node, 0);
 }
 EXPORT_SYMBOL(__kmalloc_node);
 #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
@@ -3921,7 +3919,7 @@ EXPORT_SYMBOL(__kmalloc_node);
  * @caller: function caller for debug tracking of the caller
  */
 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
-					  void *caller)
+					  unsigned long caller)
 {
 	struct kmem_cache *cachep;
 	void *ret;
@@ -3936,7 +3934,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
 		return cachep;
 	ret = __cache_alloc(cachep, flags, caller);
 
-	trace_kmalloc((unsigned long) caller, ret,
+	trace_kmalloc(caller, ret,
 		      size, cachep->size, flags);
 
 	return ret;
@@ -3946,20 +3944,20 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
 void *__kmalloc(size_t size, gfp_t flags)
 {
-	return __do_kmalloc(size, flags, __builtin_return_address(0));
+	return __do_kmalloc(size, flags, _RET_IP_);
 }
 EXPORT_SYMBOL(__kmalloc);
 
 void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
 {
-	return __do_kmalloc(size, flags, (void *)caller);
+	return __do_kmalloc(size, flags, caller);
 }
 EXPORT_SYMBOL(__kmalloc_track_caller);
 
 #else
 void *__kmalloc(size_t size, gfp_t flags)
 {
-	return __do_kmalloc(size, flags, NULL);
+	return __do_kmalloc(size, flags, 0);
 }
 EXPORT_SYMBOL(__kmalloc);
 #endif
@@ -3980,7 +3978,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
 	debug_check_no_locks_freed(objp, cachep->object_size);
 	if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
 		debug_check_no_obj_freed(objp, cachep->object_size);
-	__cache_free(cachep, objp, __builtin_return_address(0));
+	__cache_free(cachep, objp, _RET_IP_);
 	local_irq_restore(flags);
 
 	trace_kmem_cache_free(_RET_IP_, objp);
@@ -4011,7 +4009,7 @@ void kfree(const void *objp)
 	debug_check_no_locks_freed(objp, c->object_size);
 
 	debug_check_no_obj_freed(objp, c->object_size);
-	__cache_free(c, (void *)objp, __builtin_return_address(0));
+	__cache_free(c, (void *)objp, _RET_IP_);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(kfree);
-- 
cgit v1.2.3


From 4052147c0afa1cf05780ed846f37e87cdde9f628 Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <elezegarcia@gmail.com>
Date: Sat, 8 Sep 2012 17:47:56 -0300
Subject: mm, slab: Match SLAB and SLUB kmem_cache_alloc_xxx_trace() prototype

This long (seemingly unnecessary) patch does not fix anything and
its only goal is to produce common code between SLAB and SLUB.

Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Ezequiel Garcia <elezegarcia@gmail.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 6c356793612..4c54a235793 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3835,7 +3835,7 @@ EXPORT_SYMBOL(kmem_cache_alloc);
 
 #ifdef CONFIG_TRACING
 void *
-kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
+kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
 {
 	void *ret;
 
@@ -3862,10 +3862,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 EXPORT_SYMBOL(kmem_cache_alloc_node);
 
 #ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_trace(size_t size,
-				  struct kmem_cache *cachep,
+void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
 				  gfp_t flags,
-				  int nodeid)
+				  int nodeid,
+				  size_t size)
 {
 	void *ret;
 
@@ -3887,7 +3887,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
 	cachep = kmem_find_general_cachep(size, flags);
 	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
 		return cachep;
-	return kmem_cache_alloc_node_trace(size, cachep, flags, node);
+	return kmem_cache_alloc_node_trace(cachep, flags, node, size);
 }
 
 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
-- 
cgit v1.2.3


From 48356303ff8cce7036f13a23df9119d3f47461ce Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <elezegarcia@gmail.com>
Date: Sat, 8 Sep 2012 17:47:57 -0300
Subject: mm, slab: Rename __cache_alloc() -> slab_alloc()

This patch does not fix anything and its only goal is to
produce common code between SLAB and SLUB.

Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Ezequiel Garcia <elezegarcia@gmail.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 4c54a235793..d011030e961 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3561,7 +3561,7 @@ done:
  * Fallback to other node is possible if __GFP_THISNODE is not set.
  */
 static __always_inline void *
-__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
+slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 		   unsigned long caller)
 {
 	unsigned long save_flags;
@@ -3648,7 +3648,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 #endif /* CONFIG_NUMA */
 
 static __always_inline void *
-__cache_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
+slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
 {
 	unsigned long save_flags;
 	void *objp;
@@ -3824,7 +3824,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
  */
 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 {
-	void *ret = __cache_alloc(cachep, flags, _RET_IP_);
+	void *ret = slab_alloc(cachep, flags, _RET_IP_);
 
 	trace_kmem_cache_alloc(_RET_IP_, ret,
 			       cachep->object_size, cachep->size, flags);
@@ -3839,7 +3839,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
 {
 	void *ret;
 
-	ret = __cache_alloc(cachep, flags, _RET_IP_);
+	ret = slab_alloc(cachep, flags, _RET_IP_);
 
 	trace_kmalloc(_RET_IP_, ret,
 		      size, cachep->size, flags);
@@ -3851,7 +3851,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
 #ifdef CONFIG_NUMA
 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 {
-	void *ret = __cache_alloc_node(cachep, flags, nodeid, _RET_IP_);
+	void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
 
 	trace_kmem_cache_alloc_node(_RET_IP_, ret,
 				    cachep->object_size, cachep->size,
@@ -3869,7 +3869,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
 {
 	void *ret;
 
-	ret = __cache_alloc_node(cachep, flags, nodeid, _RET_IP);
+	ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP);
 
 	trace_kmalloc_node(_RET_IP_, ret,
 			   size, cachep->size,
@@ -3932,7 +3932,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
 	cachep = __find_general_cachep(size, flags);
 	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
 		return cachep;
-	ret = __cache_alloc(cachep, flags, caller);
+	ret = slab_alloc(cachep, flags, caller);
 
 	trace_kmalloc(caller, ret,
 		      size, cachep->size, flags);
-- 
cgit v1.2.3


From 2b847c3cb4f8565911bd8ce59b8428e635c90594 Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <elezegarcia@gmail.com>
Date: Sat, 8 Sep 2012 17:47:58 -0300
Subject: mm, slub: Rename slab_alloc() -> slab_alloc_node() to match SLAB

This patch does not fix anything, and its only goal is to enable us
to obtain some common code between SLAB and SLUB.
Neither behavior nor produced code is affected.

Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Ezequiel Garcia <elezegarcia@gmail.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slub.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index a6d043e1326..f074f756405 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2313,7 +2313,7 @@ new_slab:
  *
  * Otherwise we can simply pick the next object from the lockless free list.
  */
-static __always_inline void *slab_alloc(struct kmem_cache *s,
+static __always_inline void *slab_alloc_node(struct kmem_cache *s,
 		gfp_t gfpflags, int node, unsigned long addr)
 {
 	void **object;
@@ -2383,9 +2383,15 @@ redo:
 	return object;
 }
 
+static __always_inline void *slab_alloc(struct kmem_cache *s,
+		gfp_t gfpflags, unsigned long addr)
+{
+	return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
+}
+
 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
 {
-	void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
+	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
 
 	trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags);
 
@@ -2396,7 +2402,7 @@ EXPORT_SYMBOL(kmem_cache_alloc);
 #ifdef CONFIG_TRACING
 void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
 {
-	void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
+	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
 	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
 	return ret;
 }
@@ -2414,7 +2420,7 @@ EXPORT_SYMBOL(kmalloc_order_trace);
 #ifdef CONFIG_NUMA
 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
 {
-	void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
+	void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
 
 	trace_kmem_cache_alloc_node(_RET_IP_, ret,
 				    s->object_size, s->size, gfpflags, node);
@@ -2428,7 +2434,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
 				    gfp_t gfpflags,
 				    int node, size_t size)
 {
-	void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
+	void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
 
 	trace_kmalloc_node(_RET_IP_, ret,
 			   size, s->size, gfpflags, node);
@@ -3366,7 +3372,7 @@ void *__kmalloc(size_t size, gfp_t flags)
 	if (unlikely(ZERO_OR_NULL_PTR(s)))
 		return s;
 
-	ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_);
+	ret = slab_alloc(s, flags, _RET_IP_);
 
 	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
 
@@ -3409,7 +3415,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 	if (unlikely(ZERO_OR_NULL_PTR(s)))
 		return s;
 
-	ret = slab_alloc(s, flags, node, _RET_IP_);
+	ret = slab_alloc_node(s, flags, node, _RET_IP_);
 
 	trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
 
@@ -4037,7 +4043,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
 	if (unlikely(ZERO_OR_NULL_PTR(s)))
 		return s;
 
-	ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller);
+	ret = slab_alloc(s, gfpflags, caller);
 
 	/* Honor the call site pointer we received. */
 	trace_kmalloc(caller, ret, size, s->size, gfpflags);
@@ -4067,7 +4073,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
 	if (unlikely(ZERO_OR_NULL_PTR(s)))
 		return s;
 
-	ret = slab_alloc(s, gfpflags, node, caller);
+	ret = slab_alloc_node(s, gfpflags, node, caller);
 
 	/* Honor the call site pointer we received. */
 	trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
-- 
cgit v1.2.3


From 592f41450d03f74747edd81779ae206a0647152f Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <elezegarcia@gmail.com>
Date: Tue, 25 Sep 2012 08:07:08 -0300
Subject: mm/slab: Fix typo _RET_IP -> _RET_IP_

The bug was introduced by commit 7c0cb9c64f83 ("mm, slab: Replace
'caller' type, void* -> unsigned long").

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Ezequiel Garcia <elezegarcia@gmail.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index d011030e961..ca3849fe058 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3869,7 +3869,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
 {
 	void *ret;
 
-	ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP);
+	ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
 
 	trace_kmalloc_node(_RET_IP_, ret,
 			   size, cachep->size,
-- 
cgit v1.2.3


From 1e5965bf1f018cc30a4659fa3f1a40146e4276f6 Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <elezegarcia@gmail.com>
Date: Tue, 25 Sep 2012 08:07:09 -0300
Subject: mm/slab: Fix kmem_cache_alloc_node_trace() declaration

The bug was introduced in commit 4052147c0afa ("mm, slab: Match SLAB
and SLUB kmem_cache_alloc_xxx_trace() prototype").

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Ezequiel Garcia <elezegarcia@gmail.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index ca3849fe058..3409eada242 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3862,10 +3862,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 EXPORT_SYMBOL(kmem_cache_alloc_node);
 
 #ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
+void *kmem_cache_alloc_node_trace(size_t size,
+				  struct kmem_cache *cachep,
 				  gfp_t flags,
-				  int nodeid,
-				  size_t size)
+				  int nodeid)
 {
 	void *ret;
 
@@ -3887,7 +3887,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
 	cachep = kmem_find_general_cachep(size, flags);
 	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
 		return cachep;
-	return kmem_cache_alloc_node_trace(cachep, flags, node, size);
+	return kmem_cache_alloc_node_trace(size, cachep, flags, node);
 }
 
 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
-- 
cgit v1.2.3


From 82bd5508b4080e851ac1a9b62bed6d727b1b4a84 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 25 Sep 2012 12:53:51 -0700
Subject: mm, slob: fix build breakage in __kmalloc_node_track_caller

On Sat, 8 Sep 2012, Ezequiel Garcia wrote:

> @@ -454,15 +455,35 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
>  			gfp |= __GFP_COMP;
>  		ret = slob_new_pages(gfp, order, node);
>
> -		trace_kmalloc_node(_RET_IP_, ret,
> +		trace_kmalloc_node(caller, ret,
>  				   size, PAGE_SIZE << order, gfp, node);
>  	}
>
>  	kmemleak_alloc(ret, size, 1, gfp);
>  	return ret;
>  }
> +
> +void *__kmalloc_node(size_t size, gfp_t gfp, int node)
> +{
> +	return __do_kmalloc_node(size, gfp, node, _RET_IP_);
> +}
>  EXPORT_SYMBOL(__kmalloc_node);
>
> +#ifdef CONFIG_TRACING
> +void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
> +{
> +	return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
> +}
> +
> +#ifdef CONFIG_NUMA
> +void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
> +					int node, unsigned long caller)
> +{
> +	return __do_kmalloc_node(size, gfp, node, caller);
> +}
> +#endif

This breaks Pekka's slab/next tree with this:

mm/slob.c: In function '__kmalloc_node_track_caller':
mm/slob.c:488: error: 'gfp' undeclared (first use in this function)
mm/slob.c:488: error: (Each undeclared identifier is reported only once
mm/slob.c:488: error: for each function it appears in.)

mm, slob: fix build breakage in __kmalloc_node_track_caller

"mm, slob: Add support for kmalloc_track_caller()" breaks the build
because gfp is undeclared.  Fix it.

Acked-by: Ezequiel Garcia <elezegarcia@gmail.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slob.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slob.c b/mm/slob.c
index dd47d16d57b..8c00d22afc9 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -482,7 +482,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
 }
 
 #ifdef CONFIG_NUMA
-void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
+void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
 					int node, unsigned long caller)
 {
 	return __do_kmalloc_node(size, gfp, node, caller);
-- 
cgit v1.2.3


From 611443783ada3464c44e9168f6b7de2cb3fed39a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Aug 2012 20:27:09 -0400
Subject: switch fadvise(2) to fget_light()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/fadvise.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/fadvise.c b/mm/fadvise.c
index 9b75a045dbf..a83245763cf 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -26,7 +26,8 @@
  */
 SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 {
-	struct file *file = fget(fd);
+	int fput_needed;
+	struct file *file = fget_light(fd, &fput_needed);
 	struct address_space *mapping;
 	struct backing_dev_info *bdi;
 	loff_t endbyte;			/* inclusive */
@@ -128,7 +129,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 		ret = -EINVAL;
 	}
 out:
-	fput(file);
+	fput_light(file, fput_needed);
 	return ret;
 }
 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-- 
cgit v1.2.3


From 132ea2479f77dd598f11a77bdfebbd23d244ce6f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Aug 2012 20:30:57 -0400
Subject: switch readahead(2) to fget_light()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/readahead.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/readahead.c b/mm/readahead.c
index ea8f8fa2164..1011111e2bf 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -580,9 +580,10 @@ SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
 {
 	ssize_t ret;
 	struct file *file;
+	int fput_needed;
 
 	ret = -EBADF;
-	file = fget(fd);
+	file = fget_light(fd, &fput_needed);
 	if (file) {
 		if (file->f_mode & FMODE_READ) {
 			struct address_space *mapping = file->f_mapping;
@@ -591,7 +592,7 @@ SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
 			unsigned long len = end - start + 1;
 			ret = do_readahead(mapping, file, start, len);
 		}
-		fput(file);
+		fput_light(file, fput_needed);
 	}
 	return ret;
 }
-- 
cgit v1.2.3


From cb0942b81249798e15c3f04eee2946ef543e8115 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 27 Aug 2012 14:48:26 -0400
Subject: make get_file() return its argument

simplifies a bunch of callers...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/fremap.c | 3 +--
 mm/mmap.c   | 3 +--
 mm/nommu.c  | 6 ++----
 3 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/fremap.c b/mm/fremap.c
index 9ed4fd43246..048659c0c03 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -195,10 +195,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 		 */
 		if (mapping_cap_account_dirty(mapping)) {
 			unsigned long addr;
-			struct file *file = vma->vm_file;
+			struct file *file = get_file(vma->vm_file);
 
 			flags &= MAP_NONBLOCK;
-			get_file(file);
 			addr = mmap_region(file, start, size,
 					flags, vma->vm_flags, pgoff);
 			fput(file);
diff --git a/mm/mmap.c b/mm/mmap.c
index ae18a48e7e4..872441e8191 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1301,8 +1301,7 @@ munmap_back:
 				goto free_vma;
 			correct_wcount = 1;
 		}
-		vma->vm_file = file;
-		get_file(file);
+		vma->vm_file = get_file(file);
 		error = file->f_op->mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
diff --git a/mm/nommu.c b/mm/nommu.c
index d4b0c10872d..dee2ff89fd5 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1282,10 +1282,8 @@ unsigned long do_mmap_pgoff(struct file *file,
 	vma->vm_pgoff = pgoff;
 
 	if (file) {
-		region->vm_file = file;
-		get_file(file);
-		vma->vm_file = file;
-		get_file(file);
+		region->vm_file = get_file(file);
+		vma->vm_file = get_file(file);
 		if (vm_flags & VM_EXECUTABLE) {
 			added_exe_file_vma(current->mm);
 			vma->vm_mm = current->mm;
-- 
cgit v1.2.3


From 2903ff019b346ab8d36ebbf54853c3aaf6590608 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 28 Aug 2012 12:52:22 -0400
Subject: switch simple cases of fget_light to fdget

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/fadvise.c   | 35 +++++++++++++++++------------------
 mm/readahead.c | 15 +++++++--------
 2 files changed, 24 insertions(+), 26 deletions(-)

(limited to 'mm')

diff --git a/mm/fadvise.c b/mm/fadvise.c
index a83245763cf..a47f0f50c89 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -26,8 +26,7 @@
  */
 SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 {
-	int fput_needed;
-	struct file *file = fget_light(fd, &fput_needed);
+	struct fd f = fdget(fd);
 	struct address_space *mapping;
 	struct backing_dev_info *bdi;
 	loff_t endbyte;			/* inclusive */
@@ -36,15 +35,15 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 	unsigned long nrpages;
 	int ret = 0;
 
-	if (!file)
+	if (!f.file)
 		return -EBADF;
 
-	if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
+	if (S_ISFIFO(f.file->f_path.dentry->d_inode->i_mode)) {
 		ret = -ESPIPE;
 		goto out;
 	}
 
-	mapping = file->f_mapping;
+	mapping = f.file->f_mapping;
 	if (!mapping || len < 0) {
 		ret = -EINVAL;
 		goto out;
@@ -77,21 +76,21 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
-		file->f_ra.ra_pages = bdi->ra_pages;
-		spin_lock(&file->f_lock);
-		file->f_mode &= ~FMODE_RANDOM;
-		spin_unlock(&file->f_lock);
+		f.file->f_ra.ra_pages = bdi->ra_pages;
+		spin_lock(&f.file->f_lock);
+		f.file->f_mode &= ~FMODE_RANDOM;
+		spin_unlock(&f.file->f_lock);
 		break;
 	case POSIX_FADV_RANDOM:
-		spin_lock(&file->f_lock);
-		file->f_mode |= FMODE_RANDOM;
-		spin_unlock(&file->f_lock);
+		spin_lock(&f.file->f_lock);
+		f.file->f_mode |= FMODE_RANDOM;
+		spin_unlock(&f.file->f_lock);
 		break;
 	case POSIX_FADV_SEQUENTIAL:
-		file->f_ra.ra_pages = bdi->ra_pages * 2;
-		spin_lock(&file->f_lock);
-		file->f_mode &= ~FMODE_RANDOM;
-		spin_unlock(&file->f_lock);
+		f.file->f_ra.ra_pages = bdi->ra_pages * 2;
+		spin_lock(&f.file->f_lock);
+		f.file->f_mode &= ~FMODE_RANDOM;
+		spin_unlock(&f.file->f_lock);
 		break;
 	case POSIX_FADV_WILLNEED:
 		/* First and last PARTIAL page! */
@@ -107,7 +106,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 		 * Ignore return value because fadvise() shall return
 		 * success even if filesystem can't retrieve a hint,
 		 */
-		force_page_cache_readahead(mapping, file, start_index,
+		force_page_cache_readahead(mapping, f.file, start_index,
 					   nrpages);
 		break;
 	case POSIX_FADV_NOREUSE:
@@ -129,7 +128,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 		ret = -EINVAL;
 	}
 out:
-	fput_light(file, fput_needed);
+	fdput(f);
 	return ret;
 }
 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
diff --git a/mm/readahead.c b/mm/readahead.c
index 1011111e2bf..7963f239123 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -579,20 +579,19 @@ do_readahead(struct address_space *mapping, struct file *filp,
 SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
 {
 	ssize_t ret;
-	struct file *file;
-	int fput_needed;
+	struct fd f;
 
 	ret = -EBADF;
-	file = fget_light(fd, &fput_needed);
-	if (file) {
-		if (file->f_mode & FMODE_READ) {
-			struct address_space *mapping = file->f_mapping;
+	f = fdget(fd);
+	if (f.file) {
+		if (f.file->f_mode & FMODE_READ) {
+			struct address_space *mapping = f.file->f_mapping;
 			pgoff_t start = offset >> PAGE_CACHE_SHIFT;
 			pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
 			unsigned long len = end - start + 1;
-			ret = do_readahead(mapping, file, start, len);
+			ret = do_readahead(mapping, f.file, start, len);
 		}
-		fput_light(file, fput_needed);
+		fdput(f);
 	}
 	return ret;
 }
-- 
cgit v1.2.3


From 2f60d628ffd042e65e0b1d3431fb3e38d6f7c1be Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Fri, 28 Sep 2012 20:27:49 +0800
Subject: CPU hotplug, writeback: Don't call writeback_set_ratelimit() too
 often during hotplug

The CPU hotplug callback related to writeback calls writeback_set_ratelimit()
during every state change in the hotplug sequence. This is unnecessary
since num_online_cpus() changes only once during the entire hotplug operation.

So invoke the function only once per hotplug, thereby avoiding the
unnecessary repetition of those costly calculations.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---
 mm/page-writeback.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5ad5ce23c1e..830893b2b3c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1602,10 +1602,18 @@ void writeback_set_ratelimit(void)
 }
 
 static int __cpuinit
-ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
+ratelimit_handler(struct notifier_block *self, unsigned long action,
+		  void *hcpu)
 {
-	writeback_set_ratelimit();
-	return NOTIFY_DONE;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_ONLINE:
+	case CPU_DEAD:
+		writeback_set_ratelimit();
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
 }
 
 static struct notifier_block __cpuinitdata ratelimit_nb = {
-- 
cgit v1.2.3


From c0b24b5100cb96cba71666953a8619a616684967 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Sat, 29 Sep 2012 10:00:59 +0300
Subject: Revert "mm/slab: Fix kmem_cache_alloc_node_trace() declaration"

This reverts commit 1e5965bf1f018cc30a4659fa3f1a40146e4276f6. Ezequiel
Garcia has a better fix.
---
 mm/slab.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 3409eada242..ca3849fe058 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3862,10 +3862,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 EXPORT_SYMBOL(kmem_cache_alloc_node);
 
 #ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_trace(size_t size,
-				  struct kmem_cache *cachep,
+void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
 				  gfp_t flags,
-				  int nodeid)
+				  int nodeid,
+				  size_t size)
 {
 	void *ret;
 
@@ -3887,7 +3887,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
 	cachep = kmem_find_general_cachep(size, flags);
 	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
 		return cachep;
-	return kmem_cache_alloc_node_trace(size, cachep, flags, node);
+	return kmem_cache_alloc_node_trace(cachep, flags, node, size);
 }
 
 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
-- 
cgit v1.2.3


From 788e1aadadd0d5a9cbffce10c34840b4072bc733 Mon Sep 17 00:00:00 2001
From: Fengguang Wu <fengguang.wu@intel.com>
Date: Fri, 28 Sep 2012 16:34:05 +0800
Subject: slub: init_kmem_cache_cpus() and put_cpu_partial() can be static

Acked-by: Glauber Costa <glommer@parallels.com>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slub.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index f074f756405..944b4edaeb1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1716,7 +1716,7 @@ static inline void note_cmpxchg_failure(const char *n,
 	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
 }
 
-void init_kmem_cache_cpus(struct kmem_cache *s)
+static void init_kmem_cache_cpus(struct kmem_cache *s)
 {
 	int cpu;
 
@@ -1941,7 +1941,7 @@ static void unfreeze_partials(struct kmem_cache *s)
  * If we did not find a slot then simply move all the partials to the
  * per node partial list.
  */
-int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
+static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 {
 	struct page *oldpage;
 	int pages;
-- 
cgit v1.2.3


From 608da7e3fc7259eca0d983b31bc8915af14cf15e Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Sun, 30 Sep 2012 17:28:25 +0900
Subject: slab: Fix build failure in __kmem_cache_create()

Fix build failure with CONFIG_DEBUG_SLAB=y && CONFIG_DEBUG_PAGEALLOC=y caused
by commit 8a13a4cc "mm/sl[aou]b: Shrink __kmem_cache_create() parameter lists".

mm/slab.c: In function '__kmem_cache_create':
mm/slab.c:2474: error: 'align' undeclared (first use in this function)
mm/slab.c:2474: error: (Each undeclared identifier is reported only once
mm/slab.c:2474: error: for each function it appears in.)
make[1]: *** [mm/slab.o] Error 1
make: *** [mm] Error 2

Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index d011030e961..c4f6e14c2c7 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2492,8 +2492,9 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
 	}
 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
 	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
-	    && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
-		cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
+	    && cachep->object_size > cache_line_size()
+	    && ALIGN(size, cachep->align) < PAGE_SIZE) {
+		cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
 		size = PAGE_SIZE;
 	}
 #endif
-- 
cgit v1.2.3


From 17f3609c21706b377ad80b5251558ed700c2af17 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Thu, 4 Oct 2012 17:12:07 -0700
Subject: sections: fix section conflicts in mm/percpu.c

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/percpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/percpu.c b/mm/percpu.c
index bb4be7435ce..ddc5efb9c5b 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1370,7 +1370,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 
 #ifdef CONFIG_SMP
 
-const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
+const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
 	[PCPU_FC_AUTO]	= "auto",
 	[PCPU_FC_EMBED]	= "embed",
 	[PCPU_FC_PAGE]	= "page",
-- 
cgit v1.2.3


From c654345924f7cce87bb221b89db91cba890421ba Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Mon, 8 Oct 2012 16:28:21 -0700
Subject: mm: remove __GFP_NO_KSWAPD

When transparent huge pages were introduced, memory compaction and swap
storms were an issue, and the kernel had to be careful to not make THP
allocations cause pageout or compaction.

Now that we have working compaction deferral, kswapd is smart enough to
invoke compaction and the quadratic behaviour around isolate_free_pages
has been fixed, it should be safe to remove __GFP_NO_KSWAPD.

[minchan@kernel.org: Comment fix]
[mgorman@suse.de: Avoid direct reclaim for deferred compaction]
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c13ea753889..5e92698e539 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2362,9 +2362,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto nopage;
 
 restart:
-	if (!(gfp_mask & __GFP_NO_KSWAPD))
-		wake_all_kswapd(order, zonelist, high_zoneidx,
-						zone_idx(preferred_zone));
+	wake_all_kswapd(order, zonelist, high_zoneidx,
+					zone_idx(preferred_zone));
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -2441,7 +2440,7 @@ rebalance:
 	 * system then fail the allocation instead of entering direct reclaim.
 	 */
 	if ((deferred_compaction || contended_compaction) &&
-						(gfp_mask & __GFP_NO_KSWAPD))
+	    (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
 		goto nopage;
 
 	/* Try direct reclaim and then allocating */
-- 
cgit v1.2.3


From 5180da410db6369d1f95c9014da1c9bc33fb043e Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 8 Oct 2012 16:28:29 -0700
Subject: x86, pat: separate the pfn attribute tracking for remap_pfn_range and
 vm_insert_pfn

With PAT enabled, vm_insert_pfn() looks up the existing pfn memory
attribute and uses it.  Expectation is that the driver reserves the
memory attributes for the pfn before calling vm_insert_pfn().

remap_pfn_range() (when called for the whole vma) will setup a new
attribute (based on the prot argument) for the specified pfn range.
This addresses the legacy usage which typically calls remap_pfn_range()
with a desired memory attribute.  For ranges smaller than the vma size
(which is typically not the case), remap_pfn_range() will use the
existing memory attribute for the pfn range.

Expose two different API's for these different behaviors.
track_pfn_insert() for tracking the pfn attribute set by vm_insert_pfn()
and track_pfn_remap() for the remap_pfn_range().

This cleanup also prepares the ground for the track/untrack pfn vma
routines to take over the ownership of setting PAT specific vm_flag in
the 'vma'.

[khlebnikov@openvz.org: Clear checks in track_pfn_remap()]
[akpm@linux-foundation.org: tweak a few comments]
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Venkatesh Pallipadi <venki@google.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Carsten Otte <cotte@de.ibm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Eric Paris <eparis@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morris <james.l.morris@oracle.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Kentaro Takeda <takedakn@nttdata.co.jp>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 57361708d1a..6bef278ad30 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1060,7 +1060,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		 * We do not free on error cases below as remove_vma
 		 * gets called on error from higher level routine
 		 */
-		ret = track_pfn_vma_copy(vma);
+		ret = track_pfn_copy(vma);
 		if (ret)
 			return ret;
 	}
@@ -1328,7 +1328,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 		uprobe_munmap(vma, start, end);
 
 	if (unlikely(is_pfn_mapping(vma)))
-		untrack_pfn_vma(vma, 0, 0);
+		untrack_pfn(vma, 0, 0);
 
 	if (start != end) {
 		if (unlikely(is_vm_hugetlb_page(vma))) {
@@ -2162,14 +2162,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 
 	if (addr < vma->vm_start || addr >= vma->vm_end)
 		return -EFAULT;
-	if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
+	if (track_pfn_insert(vma, &pgprot, pfn))
 		return -EINVAL;
 
 	ret = insert_pfn(vma, addr, pfn, pgprot);
 
-	if (ret)
-		untrack_pfn_vma(vma, pfn, PAGE_SIZE);
-
 	return ret;
 }
 EXPORT_SYMBOL(vm_insert_pfn);
@@ -2311,7 +2308,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 
 	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
 
-	err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
+	err = track_pfn_remap(vma, &prot, pfn, PAGE_ALIGN(size));
 	if (err) {
 		/*
 		 * To indicate that track_pfn related cleanup is not
@@ -2335,7 +2332,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	} while (pgd++, addr = next, addr != end);
 
 	if (err)
-		untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
+		untrack_pfn(vma, pfn, PAGE_ALIGN(size));
 
 	return err;
 }
-- 
cgit v1.2.3


From b3b9c2932c32e0692018ed5f12f3fd8c70eea8ce Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon, 8 Oct 2012 16:28:34 -0700
Subject: mm, x86, pat: rework linear pfn-mmap tracking

Replace the generic vma-flag VM_PFN_AT_MMAP with x86-only VM_PAT.

We can toss mapping address from remap_pfn_range() into
track_pfn_vma_new(), and collect all PAT-related logic together in
arch/x86/.

This patch also restores orignal frustration-free is_cow_mapping() check
in remap_pfn_range(), as it was before commit v2.6.28-rc8-88-g3c8bb73
("x86: PAT: store vm_pgoff for all linear_over_vma_region mappings - v3")

is_linear_pfn_mapping() checks can be removed from mm/huge_memory.c,
because it already handled by VM_PFNMAP in VM_NO_THP bit-mask.

[suresh.b.siddha@intel.com: Reset the VM_PAT flag as part of untrack_pfn_vma()]
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Venkatesh Pallipadi <venki@google.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Carsten Otte <cotte@de.ibm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Eric Paris <eparis@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morris <james.l.morris@oracle.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Kentaro Takeda <takedakn@nttdata.co.jp>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Venkatesh Pallipadi <venki@google.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 19 +++----------------
 mm/memory.c      | 26 ++++++++++----------------
 2 files changed, 13 insertions(+), 32 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 141dbb69509..73cb22ee966 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1655,11 +1655,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
 	if (vma->vm_ops)
 		/* khugepaged not yet working on file or special mappings */
 		return 0;
-	/*
-	 * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
-	 * true too, verify it here.
-	 */
-	VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
+	VM_BUG_ON(vma->vm_flags & VM_NO_THP);
 	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
 	hend = vma->vm_end & HPAGE_PMD_MASK;
 	if (hstart < hend)
@@ -1912,11 +1908,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 		goto out;
 	if (is_vma_temporary_stack(vma))
 		goto out;
-	/*
-	 * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
-	 * true too, verify it here.
-	 */
-	VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
+	VM_BUG_ON(vma->vm_flags & VM_NO_THP);
 
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
@@ -2154,12 +2146,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 			goto skip;
 		if (is_vma_temporary_stack(vma))
 			goto skip;
-		/*
-		 * If is_pfn_mapping() is true is_learn_pfn_mapping()
-		 * must be true too, verify it here.
-		 */
-		VM_BUG_ON(is_linear_pfn_mapping(vma) ||
-			  vma->vm_flags & VM_NO_THP);
+		VM_BUG_ON(vma->vm_flags & VM_NO_THP);
 
 		hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
 		hend = vma->vm_end & HPAGE_PMD_MASK;
diff --git a/mm/memory.c b/mm/memory.c
index 6bef278ad30..655e1429388 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1055,7 +1055,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
-	if (unlikely(is_pfn_mapping(vma))) {
+	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
 		/*
 		 * We do not free on error cases below as remove_vma
 		 * gets called on error from higher level routine
@@ -1327,7 +1327,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 	if (vma->vm_file)
 		uprobe_munmap(vma, start, end);
 
-	if (unlikely(is_pfn_mapping(vma)))
+	if (unlikely(vma->vm_flags & VM_PFNMAP))
 		untrack_pfn(vma, 0, 0);
 
 	if (start != end) {
@@ -2299,26 +2299,20 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	 * There's a horrible special case to handle copy-on-write
 	 * behaviour that some programs depend on. We mark the "original"
 	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
+	 * See vm_normal_page() for details.
 	 */
-	if (addr == vma->vm_start && end == vma->vm_end) {
+	if (is_cow_mapping(vma->vm_flags)) {
+		if (addr != vma->vm_start || end != vma->vm_end)
+			return -EINVAL;
 		vma->vm_pgoff = pfn;
-		vma->vm_flags |= VM_PFN_AT_MMAP;
-	} else if (is_cow_mapping(vma->vm_flags))
+	}
+
+	err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
+	if (err)
 		return -EINVAL;
 
 	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
 
-	err = track_pfn_remap(vma, &prot, pfn, PAGE_ALIGN(size));
-	if (err) {
-		/*
-		 * To indicate that track_pfn related cleanup is not
-		 * needed from higher level routine calling unmap_vmas
-		 */
-		vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
-		vma->vm_flags &= ~VM_PFN_AT_MMAP;
-		return -EINVAL;
-	}
-
 	BUG_ON(addr >= end);
 	pfn -= addr >> PAGE_SHIFT;
 	pgd = pgd_offset(mm, addr);
-- 
cgit v1.2.3


From cc2383ec06be093789469852e1fe96e1148e9a2c Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon, 8 Oct 2012 16:28:37 -0700
Subject: mm: introduce arch-specific vma flag VM_ARCH_1

Combine several arch-specific vma flags into one.

before patch:

        0x00000200      0x01000000      0x20000000      0x40000000
x86     VM_NOHUGEPAGE   VM_HUGEPAGE     -               VM_PAT
powerpc -               -               VM_SAO          -
parisc  VM_GROWSUP      -               -               -
ia64    VM_GROWSUP      -               -               -
nommu   -               VM_MAPPED_COPY  -               -
others  -               -               -               -

after patch:

        0x00000200      0x01000000      0x20000000      0x40000000
x86     -               VM_PAT          VM_HUGEPAGE     VM_NOHUGEPAGE
powerpc -               VM_SAO          -               -
parisc  -               VM_GROWSUP      -               -
ia64    -               VM_GROWSUP      -               -
nommu   -               VM_MAPPED_COPY  -               -
others  -               VM_ARCH_1       -               -

And voila! One completely free bit.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Carsten Otte <cotte@de.ibm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Eric Paris <eparis@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Morris <james.l.morris@oracle.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Kentaro Takeda <takedakn@nttdata.co.jp>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Venkatesh Pallipadi <venki@google.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 2 +-
 mm/ksm.c         | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 73cb22ee966..47206692cf8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1491,7 +1491,7 @@ out:
 	return ret;
 }
 
-#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \
+#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP| \
 		   VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
 
 int hugepage_madvise(struct vm_area_struct *vma,
diff --git a/mm/ksm.c b/mm/ksm.c
index 47c88536889..d1cbe2aa6b3 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1470,9 +1470,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
 				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
 				 VM_RESERVED  | VM_HUGETLB | VM_INSERTPAGE |
-				 VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
+				 VM_NONLINEAR | VM_MIXEDMAP))
 			return 0;		/* just ignore the advice */
 
+#ifdef VM_SAO
+		if (*vm_flags & VM_SAO)
+			return 0;
+#endif
+
 		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
 			err = __ksm_enter(mm);
 			if (err)
-- 
cgit v1.2.3


From 4b6e1e37026ec7dae9b23d78ffcebdd5ddb1bfa1 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon, 8 Oct 2012 16:28:40 -0700
Subject: mm: kill vma flag VM_INSERTPAGE

Merge VM_INSERTPAGE into VM_MIXEDMAP.  VM_MIXEDMAP VMA can mix pure-pfn
ptes, special ptes and normal ptes.

Now copy_page_range() always copies VM_MIXEDMAP VMA on fork like
VM_PFNMAP.  If driver populates whole VMA at mmap() it probably not
expects page-faults.

This patch removes special check from vma_wants_writenotify() which
disables pages write tracking for VMA populated via vm_instert_page().
BDI below mapped file should not use dirty-accounting, moreover
do_wp_page() can handle this.

vm_insert_page() still marks vma after first usage.  Usually it is called
from f_op->mmap() handler under mm->mmap_sem write-lock, so it able to
change vma->vm_flags.  Caller must set VM_MIXEDMAP at mmap time if it
wants to call this function from other places, for example from page-fault
handler.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Carsten Otte <cotte@de.ibm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Eric Paris <eparis@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Morris <james.l.morris@oracle.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Kentaro Takeda <takedakn@nttdata.co.jp>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Venkatesh Pallipadi <venki@google.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c |  3 +--
 mm/ksm.c         |  2 +-
 mm/memory.c      | 14 ++++++++++++--
 mm/mmap.c        |  2 +-
 4 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 47206692cf8..9b72d127051 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1491,8 +1491,7 @@ out:
 	return ret;
 }
 
-#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP| \
-		   VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
+#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
 
 int hugepage_madvise(struct vm_area_struct *vma,
 		     unsigned long *vm_flags, int advice)
diff --git a/mm/ksm.c b/mm/ksm.c
index d1cbe2aa6b3..f9ccb16559e 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1469,7 +1469,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		 */
 		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
 				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
-				 VM_RESERVED  | VM_HUGETLB | VM_INSERTPAGE |
+				 VM_RESERVED  | VM_HUGETLB |
 				 VM_NONLINEAR | VM_MIXEDMAP))
 			return 0;		/* just ignore the advice */
 
diff --git a/mm/memory.c b/mm/memory.c
index 655e1429388..7b1e4feaec0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1047,7 +1047,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * readonly mappings. The tradeoff is that copy_page_range is more
 	 * efficient than faulting.
 	 */
-	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
+	if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
+			       VM_PFNMAP | VM_MIXEDMAP))) {
 		if (!vma->anon_vma)
 			return 0;
 	}
@@ -2085,6 +2086,11 @@ out:
  * ask for a shared writable mapping!
  *
  * The page does not need to be reserved.
+ *
+ * Usually this function is called from f_op->mmap() handler
+ * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
+ * Caller must set VM_MIXEDMAP on vma if it wants to call this
+ * function from other places, for example from page-fault handler.
  */
 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
 			struct page *page)
@@ -2093,7 +2099,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
 		return -EFAULT;
 	if (!page_count(page))
 		return -EINVAL;
-	vma->vm_flags |= VM_INSERTPAGE;
+	if (!(vma->vm_flags & VM_MIXEDMAP)) {
+		BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
+		BUG_ON(vma->vm_flags & VM_PFNMAP);
+		vma->vm_flags |= VM_MIXEDMAP;
+	}
 	return insert_page(vma, addr, page, vma->vm_page_prot);
 }
 EXPORT_SYMBOL(vm_insert_page);
diff --git a/mm/mmap.c b/mm/mmap.c
index 872441e8191..b0989f4d4f0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1190,7 +1190,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
 		return 0;
 
 	/* Specialty mapping? */
-	if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE))
+	if (vm_flags & VM_PFNMAP)
 		return 0;
 
 	/* Can the mapping track the dirty pages? */
-- 
cgit v1.2.3


From 0b173bc4daa8f8ec03a85abf5e47b23502ff80af Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon, 8 Oct 2012 16:28:46 -0700
Subject: mm: kill vma flag VM_CAN_NONLINEAR

Move actual pte filling for non-linear file mappings into the new special
vma operation: ->remap_pages().

Filesystems must implement this method to get non-linear mapping support,
if it uses filemap_fault() then generic_file_remap_pages() can be used.

Now device drivers can implement this method and obtain nonlinear vma support.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Carsten Otte <cotte@de.ibm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>	#arch/tile
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Eric Paris <eparis@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Morris <james.l.morris@oracle.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Kentaro Takeda <takedakn@nttdata.co.jp>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Venkatesh Pallipadi <venki@google.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c     |  2 +-
 mm/filemap_xip.c |  3 ++-
 mm/fremap.c      | 14 ++++++++------
 mm/mmap.c        |  3 +--
 mm/nommu.c       |  8 ++++++++
 mm/shmem.c       |  3 +--
 6 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index 384344575c3..a9827b42556 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1737,6 +1737,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
 const struct vm_operations_struct generic_file_vm_ops = {
 	.fault		= filemap_fault,
 	.page_mkwrite	= filemap_page_mkwrite,
+	.remap_pages	= generic_file_remap_pages,
 };
 
 /* This is used for a general mmap of a disk file */
@@ -1749,7 +1750,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
 		return -ENOEXEC;
 	file_accessed(file);
 	vma->vm_ops = &generic_file_vm_ops;
-	vma->vm_flags |= VM_CAN_NONLINEAR;
 	return 0;
 }
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 13e013b1270..91750227a19 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -305,6 +305,7 @@ out:
 static const struct vm_operations_struct xip_file_vm_ops = {
 	.fault	= xip_file_fault,
 	.page_mkwrite	= filemap_page_mkwrite,
+	.remap_pages = generic_file_remap_pages,
 };
 
 int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -313,7 +314,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
 
 	file_accessed(file);
 	vma->vm_ops = &xip_file_vm_ops;
-	vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP;
+	vma->vm_flags |= VM_MIXEDMAP;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xip_file_mmap);
diff --git a/mm/fremap.c b/mm/fremap.c
index 048659c0c03..3d731a49878 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -5,6 +5,7 @@
  *
  * started by Ingo Molnar, Copyright (C) 2002, 2003
  */
+#include <linux/export.h>
 #include <linux/backing-dev.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
@@ -80,9 +81,10 @@ out:
 	return err;
 }
 
-static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long addr, unsigned long size, pgoff_t pgoff)
+int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
+			     unsigned long size, pgoff_t pgoff)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	int err;
 
 	do {
@@ -95,9 +97,9 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
 		pgoff++;
 	} while (size);
 
-        return 0;
-
+	return 0;
 }
+EXPORT_SYMBOL(generic_file_remap_pages);
 
 /**
  * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
@@ -167,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
 		goto out;
 
-	if (!(vma->vm_flags & VM_CAN_NONLINEAR))
+	if (!vma->vm_ops->remap_pages)
 		goto out;
 
 	if (start < vma->vm_start || start + size > vma->vm_end)
@@ -228,7 +230,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	}
 
 	mmu_notifier_invalidate_range_start(mm, start, start + size);
-	err = populate_range(mm, vma, start, size, pgoff);
+	err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
 	mmu_notifier_invalidate_range_end(mm, start, start + size);
 	if (!err && !(flags & MAP_NONBLOCK)) {
 		if (vma->vm_flags & VM_LOCKED) {
diff --git a/mm/mmap.c b/mm/mmap.c
index b0989f4d4f0..d0686d35511 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -669,8 +669,7 @@ again:			remove_next = 1 + (end > next->vm_end);
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
 			struct file *file, unsigned long vm_flags)
 {
-	/* VM_CAN_NONLINEAR may get set later by f_op->mmap() */
-	if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR)
+	if (vma->vm_flags ^ vm_flags)
 		return 0;
 	if (vma->vm_file != file)
 		return 0;
diff --git a/mm/nommu.c b/mm/nommu.c
index dee2ff89fd5..98318dcff74 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1961,6 +1961,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 }
 EXPORT_SYMBOL(filemap_fault);
 
+int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
+			     unsigned long size, pgoff_t pgoff)
+{
+	BUG();
+	return 0;
+}
+EXPORT_SYMBOL(generic_file_remap_pages);
+
 static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long addr, void *buf, int len, int write)
 {
diff --git a/mm/shmem.c b/mm/shmem.c
index d3752110c8c..cc12072f878 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1339,7 +1339,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	file_accessed(file);
 	vma->vm_ops = &shmem_vm_ops;
-	vma->vm_flags |= VM_CAN_NONLINEAR;
 	return 0;
 }
 
@@ -2643,6 +2642,7 @@ static const struct vm_operations_struct shmem_vm_ops = {
 	.set_policy     = shmem_set_policy,
 	.get_policy     = shmem_get_policy,
 #endif
+	.remap_pages	= generic_file_remap_pages,
 };
 
 static struct dentry *shmem_mount(struct file_system_type *fs_type,
@@ -2836,7 +2836,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
 		fput(vma->vm_file);
 	vma->vm_file = file;
 	vma->vm_ops = &shmem_vm_ops;
-	vma->vm_flags |= VM_CAN_NONLINEAR;
 	return 0;
 }
 
-- 
cgit v1.2.3


From e9714acf8c439688884234dcac2bfc38bb607d38 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon, 8 Oct 2012 16:28:54 -0700
Subject: mm: kill vma flag VM_EXECUTABLE and mm->num_exe_file_vmas

Currently the kernel sets mm->exe_file during sys_execve() and then tracks
number of vmas with VM_EXECUTABLE flag in mm->num_exe_file_vmas, as soon
as this counter drops to zero kernel resets mm->exe_file to NULL.  Plus it
resets mm->exe_file at last mmput() when mm->mm_users drops to zero.

VMA with VM_EXECUTABLE flag appears after mapping file with flag
MAP_EXECUTABLE, such vmas can appears only at sys_execve() or after vma
splitting, because sys_mmap ignores this flag.  Usually binfmt module sets
mm->exe_file and mmaps executable vmas with this file, they hold
mm->exe_file while task is running.

comment from v2.6.25-6245-g925d1c4 ("procfs task exe symlink"),
where all this stuff was introduced:

> The kernel implements readlink of /proc/pid/exe by getting the file from
> the first executable VMA.  Then the path to the file is reconstructed and
> reported as the result.
>
> Because of the VMA walk the code is slightly different on nommu systems.
> This patch avoids separate /proc/pid/exe code on nommu systems.  Instead of
> walking the VMAs to find the first executable file-backed VMA we store a
> reference to the exec'd file in the mm_struct.
>
> That reference would prevent the filesystem holding the executable file
> from being unmounted even after unmapping the VMAs.  So we track the number
> of VM_EXECUTABLE VMAs and drop the new reference when the last one is
> unmapped.  This avoids pinning the mounted filesystem.

exe_file's vma accounting is hooked into every file mmap/unmmap and vma
split/merge just to fix some hypothetical pinning fs from umounting by mm,
which already unmapped all its executable files, but still alive.

Seems like currently nobody depends on this behaviour.  We can try to
remove this logic and keep mm->exe_file until final mmput().

mm->exe_file is still protected with mm->mmap_sem, because we want to
change it via new sys_prctl(PR_SET_MM_EXE_FILE).  Also via this syscall
task can change its mm->exe_file and unpin mountpoint explicitly.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Carsten Otte <cotte@de.ibm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Eric Paris <eparis@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Morris <james.l.morris@oracle.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Kentaro Takeda <takedakn@nttdata.co.jp>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Venkatesh Pallipadi <venki@google.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c  | 25 ++++---------------------
 mm/nommu.c | 11 +----------
 2 files changed, 5 insertions(+), 31 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index d0686d35511..c1ad2e78ea5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -231,11 +231,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	might_sleep();
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
-	if (vma->vm_file) {
+	if (vma->vm_file)
 		fput(vma->vm_file);
-		if (vma->vm_flags & VM_EXECUTABLE)
-			removed_exe_file_vma(vma->vm_mm);
-	}
 	mpol_put(vma_policy(vma));
 	kmem_cache_free(vm_area_cachep, vma);
 	return next;
@@ -636,8 +633,6 @@ again:			remove_next = 1 + (end > next->vm_end);
 		if (file) {
 			uprobe_munmap(next, next->vm_start, next->vm_end);
 			fput(file);
-			if (next->vm_flags & VM_EXECUTABLE)
-				removed_exe_file_vma(mm);
 		}
 		if (next->anon_vma)
 			anon_vma_merge(vma, next);
@@ -1304,8 +1299,6 @@ munmap_back:
 		error = file->f_op->mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
-		if (vm_flags & VM_EXECUTABLE)
-			added_exe_file_vma(mm);
 
 		/* Can addr have changed??
 		 *
@@ -1987,11 +1980,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	if (anon_vma_clone(new, vma))
 		goto out_free_mpol;
 
-	if (new->vm_file) {
+	if (new->vm_file)
 		get_file(new->vm_file);
-		if (vma->vm_flags & VM_EXECUTABLE)
-			added_exe_file_vma(mm);
-	}
 
 	if (new->vm_ops && new->vm_ops->open)
 		new->vm_ops->open(new);
@@ -2009,11 +1999,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	/* Clean everything up if vma_adjust failed. */
 	if (new->vm_ops && new->vm_ops->close)
 		new->vm_ops->close(new);
-	if (new->vm_file) {
-		if (vma->vm_flags & VM_EXECUTABLE)
-			removed_exe_file_vma(mm);
+	if (new->vm_file)
 		fput(new->vm_file);
-	}
 	unlink_anon_vmas(new);
  out_free_mpol:
 	mpol_put(pol);
@@ -2408,12 +2395,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			new_vma->vm_start = addr;
 			new_vma->vm_end = addr + len;
 			new_vma->vm_pgoff = pgoff;
-			if (new_vma->vm_file) {
+			if (new_vma->vm_file)
 				get_file(new_vma->vm_file);
-
-				if (vma->vm_flags & VM_EXECUTABLE)
-					added_exe_file_vma(mm);
-			}
 			if (new_vma->vm_ops && new_vma->vm_ops->open)
 				new_vma->vm_ops->open(new_vma);
 			vma_link(mm, new_vma, prev, rb_link, rb_parent);
diff --git a/mm/nommu.c b/mm/nommu.c
index 98318dcff74..9c4a7b63a4d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -789,11 +789,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
 	kenter("%p", vma);
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
-	if (vma->vm_file) {
+	if (vma->vm_file)
 		fput(vma->vm_file);
-		if (vma->vm_flags & VM_EXECUTABLE)
-			removed_exe_file_vma(mm);
-	}
 	put_nommu_region(vma->vm_region);
 	kmem_cache_free(vm_area_cachep, vma);
 }
@@ -1284,10 +1281,6 @@ unsigned long do_mmap_pgoff(struct file *file,
 	if (file) {
 		region->vm_file = get_file(file);
 		vma->vm_file = get_file(file);
-		if (vm_flags & VM_EXECUTABLE) {
-			added_exe_file_vma(current->mm);
-			vma->vm_mm = current->mm;
-		}
 	}
 
 	down_write(&nommu_region_sem);
@@ -1440,8 +1433,6 @@ error:
 	kmem_cache_free(vm_region_jar, region);
 	if (vma->vm_file)
 		fput(vma->vm_file);
-	if (vma->vm_flags & VM_EXECUTABLE)
-		removed_exe_file_vma(vma->vm_mm);
 	kmem_cache_free(vm_area_cachep, vma);
 	kleave(" = %d", ret);
 	return ret;
-- 
cgit v1.2.3


From 0103bd16fb90bc741c7a03fd1ea4e8a505abad23 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon, 8 Oct 2012 16:28:59 -0700
Subject: mm: prepare VM_DONTDUMP for using in drivers

Rename VM_NODUMP into VM_DONTDUMP: this name matches other negative flags:
VM_DONTEXPAND, VM_DONTCOPY.  Currently this flag used only for
sys_madvise.  The next patch will use it for replacing the outdated flag
VM_RESERVED.

Also forbid madvise(MADV_DODUMP) for special kernel mappings VM_SPECIAL
(VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)

Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Carsten Otte <cotte@de.ibm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Eric Paris <eparis@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Morris <james.l.morris@oracle.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Kentaro Takeda <takedakn@nttdata.co.jp>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Venkatesh Pallipadi <venki@google.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/madvise.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/madvise.c b/mm/madvise.c
index 14d260fa0d1..03dfa5c7adb 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -69,10 +69,14 @@ static long madvise_behavior(struct vm_area_struct * vma,
 		new_flags &= ~VM_DONTCOPY;
 		break;
 	case MADV_DONTDUMP:
-		new_flags |= VM_NODUMP;
+		new_flags |= VM_DONTDUMP;
 		break;
 	case MADV_DODUMP:
-		new_flags &= ~VM_NODUMP;
+		if (new_flags & VM_SPECIAL) {
+			error = -EINVAL;
+			goto out;
+		}
+		new_flags &= ~VM_DONTDUMP;
 		break;
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
-- 
cgit v1.2.3


From 314e51b9851b4f4e8ab302243ff5a6fc6147f379 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon, 8 Oct 2012 16:29:02 -0700
Subject: mm: kill vma flag VM_RESERVED and mm->reserved_vm counter

A long time ago, in v2.4, VM_RESERVED kept swapout process off VMA,
currently it lost original meaning but still has some effects:

 | effect                 | alternative flags
-+------------------------+---------------------------------------------
1| account as reserved_vm | VM_IO
2| skip in core dump      | VM_IO, VM_DONTDUMP
3| do not merge or expand | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP
4| do not mlock           | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP

This patch removes reserved_vm counter from mm_struct.  Seems like nobody
cares about it, it does not exported into userspace directly, it only
reduces total_vm showed in proc.

Thus VM_RESERVED can be replaced with VM_IO or pair VM_DONTEXPAND | VM_DONTDUMP.

remap_pfn_range() and io_remap_pfn_range() set VM_IO|VM_DONTEXPAND|VM_DONTDUMP.
remap_vmalloc_range() set VM_DONTEXPAND | VM_DONTDUMP.

[akpm@linux-foundation.org: drivers/vfio/pci/vfio_pci.c fixup]
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Carsten Otte <cotte@de.ibm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Eric Paris <eparis@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Morris <james.l.morris@oracle.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Kentaro Takeda <takedakn@nttdata.co.jp>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Venkatesh Pallipadi <venki@google.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/ksm.c     |  3 +--
 mm/memory.c  | 11 +++++------
 mm/mlock.c   |  2 +-
 mm/mmap.c    |  2 --
 mm/nommu.c   |  2 +-
 mm/vmalloc.c |  3 +--
 6 files changed, 9 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/mm/ksm.c b/mm/ksm.c
index f9ccb16559e..9638620a753 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1469,8 +1469,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		 */
 		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
 				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
-				 VM_RESERVED  | VM_HUGETLB |
-				 VM_NONLINEAR | VM_MIXEDMAP))
+				 VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))
 			return 0;		/* just ignore the advice */
 
 #ifdef VM_SAO
diff --git a/mm/memory.c b/mm/memory.c
index 7b1e4feaec0..e09c0481318 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2297,14 +2297,13 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	 * rest of the world about it:
 	 *   VM_IO tells people not to look at these pages
 	 *	(accesses can have side effects).
-	 *   VM_RESERVED is specified all over the place, because
-	 *	in 2.4 it kept swapout's vma scan off this vma; but
-	 *	in 2.6 the LRU scan won't even find its pages, so this
-	 *	flag means no more than count its pages in reserved_vm,
-	 * 	and omit it from core dump, even when VM_IO turned off.
 	 *   VM_PFNMAP tells the core MM that the base pages are just
 	 *	raw PFN mappings, and do not have a "struct page" associated
 	 *	with them.
+	 *   VM_DONTEXPAND
+	 *      Disable vma merging and expanding with mremap().
+	 *   VM_DONTDUMP
+	 *      Omit vma from core dump, even when VM_IO turned off.
 	 *
 	 * There's a horrible special case to handle copy-on-write
 	 * behaviour that some programs depend on. We mark the "original"
@@ -2321,7 +2320,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	if (err)
 		return -EINVAL;
 
-	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
 
 	BUG_ON(addr >= end);
 	pfn -= addr >> PAGE_SHIFT;
diff --git a/mm/mlock.c b/mm/mlock.c
index ef726e8aa8e..a948be4b7ba 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
 		goto no_mlock;
 
-	if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
+	if (!((vma->vm_flags & VM_DONTEXPAND) ||
 			is_vm_hugetlb_page(vma) ||
 			vma == get_gate_vma(current->mm))) {
 
diff --git a/mm/mmap.c b/mm/mmap.c
index c1ad2e78ea5..a76042dc806 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -945,8 +945,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
 			mm->exec_vm += pages;
 	} else if (flags & stack_flags)
 		mm->stack_vm += pages;
-	if (flags & (VM_RESERVED|VM_IO))
-		mm->reserved_vm += pages;
 }
 #endif /* CONFIG_PROC_FS */
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 9c4a7b63a4d..12e84e69dd0 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1811,7 +1811,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	if (addr != (pfn << PAGE_SHIFT))
 		return -EINVAL;
 
-	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
 	return 0;
 }
 EXPORT_SYMBOL(remap_pfn_range);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2bb90b1d241..8de704679bf 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
 		usize -= PAGE_SIZE;
 	} while (usize > 0);
 
-	/* Prevent "things" like memory migration? VM_flags need a cleanup... */
-	vma->vm_flags |= VM_RESERVED;
+	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
 
 	return 0;
 }
-- 
cgit v1.2.3


From d741c9cdeee6a569dae0dbbaf028065402955b59 Mon Sep 17 00:00:00 2001
From: Robin Dong <sanbai@taobao.com>
Date: Mon, 8 Oct 2012 16:29:05 -0700
Subject: mm: fix nonuniform page status when writing new file with small
 buffer

When writing a new file with 2048 bytes buffer, such as write(fd, buffer,
2048), it will call generic_perform_write() twice for every page:

	write_begin
	mark_page_accessed(page)
	write_end

	write_begin
	mark_page_accessed(page)
	write_end

Pages 1-13 will be added to lru-pvecs in write_begin() and will *NOT* be
added to active_list even they have be accessed twice because they are not
PageLRU(page).  But when page 14th comes, all pages in lru-pvecs will be
moved to inactive_list (by __lru_cache_add() ) in first write_begin(), now
page 14th *is* PageLRU(page).  And after second write_end() only page 14th
will be in active_list.

In Hadoop environment, we do comes to this situation: after writing a
file, we find out that only 14th, 28th, 42th...  page are in active_list
and others in inactive_list.  Now kswapd works, shrinks the inactive_list,
the file only have 14th, 28th...pages in memory, the readahead request
size will be broken to only 52k (13*4k), system's performance falls
dramatically.

This problem can also replay by below steps (the machine has 8G memory):

	1. dd if=/dev/zero of=/test/file.out bs=1024 count=1048576
	2. cat another 7.5G file to /dev/null
	3. vmtouch -m 1G -v /test/file.out, it will show:

	/test/file.out
	[oooooooooooooooooooOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO] 187847/262144

	the 'o' means same pages are in memory but same are not.

The solution for this problem is simple: the 14th page should be added to
lru_add_pvecs before mark_page_accessed() just as other pages.

[akpm@linux-foundation.org: tweak comment]
[akpm@linux-foundation.org: grab better comment from the v3 patch]
Signed-off-by: Robin Dong <sanbai@taobao.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Reviewed-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swap.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/swap.c b/mm/swap.c
index 77825883298..f76c76c7501 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -446,13 +446,22 @@ void mark_page_accessed(struct page *page)
 }
 EXPORT_SYMBOL(mark_page_accessed);
 
+/*
+ * Order of operations is important: flush the pagevec when it's already
+ * full, not when adding the last page, to make sure that last page is
+ * not added to the LRU directly when passed to this function. Because
+ * mark_page_accessed() (called after this when writing) only activates
+ * pages that are on the LRU, linear writes in subpage chunks would see
+ * every PAGEVEC_SIZE page activated, which is unexpected.
+ */
 void __lru_cache_add(struct page *page, enum lru_list lru)
 {
 	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
 
 	page_cache_get(page);
-	if (!pagevec_add(pvec, page))
+	if (!pagevec_space(pvec))
 		__pagevec_lru_add(pvec, lru);
+	pagevec_add(pvec, page);
 	put_cpu_var(lru_add_pvecs);
 }
 EXPORT_SYMBOL(__lru_cache_add);
-- 
cgit v1.2.3


From 6597d783397aebb793fb529474cce5089aa4c67f Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Mon, 8 Oct 2012 16:29:07 -0700
Subject: mm/mmap.c: replace find_vma_prepare() with clearer find_vma_links()

People get confused by find_vma_prepare(), because it doesn't care about
what it returns in its output args, when its callers won't be interested.

Clarify by passing in end-of-range address too, and returning failure if
any existing vma overlaps the new range: instead of returning an ambiguous
vma which most callers then must check.  find_vma_links() is a clearer
name.

This does revert 2.6.27's dfe195fb79e88 ("mm: fix uninitialized variables
for find_vma_prepare callers"), but it looks like gcc 4.3.0 was one of
those releases too eager to shout about uninitialized variables: only
copy_vma() warns with 4.5.1 and 4.7.1, which a BUG on error silences.

[hughd@google.com: fix warning, remove BUG()]
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Benny Halevy <bhalevy@tonian.com>
Acked-by: Hillf Danton <dhillf@gmail.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 45 +++++++++++++++++++++------------------------
 1 file changed, 21 insertions(+), 24 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index a76042dc806..2ba27434e84 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -353,17 +353,14 @@ void validate_mm(struct mm_struct *mm)
 #define validate_mm(mm) do { } while (0)
 #endif
 
-static struct vm_area_struct *
-find_vma_prepare(struct mm_struct *mm, unsigned long addr,
-		struct vm_area_struct **pprev, struct rb_node ***rb_link,
-		struct rb_node ** rb_parent)
+static int find_vma_links(struct mm_struct *mm, unsigned long addr,
+		unsigned long end, struct vm_area_struct **pprev,
+		struct rb_node ***rb_link, struct rb_node **rb_parent)
 {
-	struct vm_area_struct * vma;
-	struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
+	struct rb_node **__rb_link, *__rb_parent, *rb_prev;
 
 	__rb_link = &mm->mm_rb.rb_node;
 	rb_prev = __rb_parent = NULL;
-	vma = NULL;
 
 	while (*__rb_link) {
 		struct vm_area_struct *vma_tmp;
@@ -372,9 +369,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
 		vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
 
 		if (vma_tmp->vm_end > addr) {
-			vma = vma_tmp;
-			if (vma_tmp->vm_start <= addr)
-				break;
+			/* Fail if an existing vma overlaps the area */
+			if (vma_tmp->vm_start < end)
+				return -ENOMEM;
 			__rb_link = &__rb_parent->rb_left;
 		} else {
 			rb_prev = __rb_parent;
@@ -387,7 +384,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
 		*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
 	*rb_link = __rb_link;
 	*rb_parent = __rb_parent;
-	return vma;
+	return 0;
 }
 
 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -456,11 +453,12 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
  */
 static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 {
-	struct vm_area_struct *__vma, *prev;
+	struct vm_area_struct *prev;
 	struct rb_node **rb_link, *rb_parent;
 
-	__vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
-	BUG_ON(__vma && __vma->vm_start < vma->vm_end);
+	if (find_vma_links(mm, vma->vm_start, vma->vm_end,
+			   &prev, &rb_link, &rb_parent))
+		BUG();
 	__vma_link(mm, vma, prev, rb_link, rb_parent);
 	mm->map_count++;
 }
@@ -1221,8 +1219,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	/* Clear old maps */
 	error = -ENOMEM;
 munmap_back:
-	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
-	if (vma && vma->vm_start < addr + len) {
+	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
 		if (do_munmap(mm, addr, len))
 			return -ENOMEM;
 		goto munmap_back;
@@ -2183,8 +2180,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 	 * Clear old maps.  this also does some error checking for us
 	 */
  munmap_back:
-	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
-	if (vma && vma->vm_start < addr + len) {
+	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
 		if (do_munmap(mm, addr, len))
 			return -ENOMEM;
 		goto munmap_back;
@@ -2298,10 +2294,10 @@ void exit_mmap(struct mm_struct *mm)
  * and into the inode's i_mmap tree.  If vm_file is non-NULL
  * then i_mmap_mutex is taken here.
  */
-int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 {
-	struct vm_area_struct * __vma, * prev;
-	struct rb_node ** rb_link, * rb_parent;
+	struct vm_area_struct *prev;
+	struct rb_node **rb_link, *rb_parent;
 
 	/*
 	 * The vm_pgoff of a purely anonymous vma should be irrelevant
@@ -2319,8 +2315,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 		BUG_ON(vma->anon_vma);
 		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
 	}
-	__vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
-	if (__vma && __vma->vm_start < vma->vm_end)
+	if (find_vma_links(mm, vma->vm_start, vma->vm_end,
+			   &prev, &rb_link, &rb_parent))
 		return -ENOMEM;
 	if ((vma->vm_flags & VM_ACCOUNT) &&
 	     security_vm_enough_memory_mm(mm, vma_pages(vma)))
@@ -2354,7 +2350,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		faulted_in_anon_vma = false;
 	}
 
-	find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
+		return NULL;	/* should never get here */
 	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
 			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
 	if (new_vma) {
-- 
cgit v1.2.3


From 4ffb6335da87b51c17e7ff6495170785f21558dd Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 8 Oct 2012 16:29:09 -0700
Subject: mm: compaction: update comment in try_to_compact_pages

Allocation success rates have been far lower since 3.4 due to commit
fe2c2a106663 ("vmscan: reclaim at order 0 when compaction is enabled").
This commit was introduced for good reasons and it was known in advance
that the success rates would suffer but it was justified on the grounds
that the high allocation success rates were achieved by aggressive
reclaim.  Success rates are expected to suffer even more in 3.6 due to
commit 7db8889ab05b ("mm: have order > 0 compaction start off where it
left") which testing has shown to severely reduce allocation success
rates under load - to 0% in one case.

This series aims to improve the allocation success rates without
regressing the benefits of commit fe2c2a106663.  The series is based on
latest mmotm and takes into account the __GFP_NO_KSWAPD flag is going
away.

Patch 1 updates a stale comment seeing as I was in the general area.

Patch 2 updates reclaim/compaction to reclaim pages scaled on the number
	of recent failures.

Patch 3 captures suitable high-order pages freed by compaction to reduce
	races with parallel allocation requests.

Patch 4 fixes the upstream commit [7db8889a: mm: have order > 0 compaction
	start off where it left] to enable compaction again

Patch 5 identifies when compacion is taking too long due to contention
	and aborts.

STRESS-HIGHALLOC
		 3.6-rc1-akpm	  full-series
Pass 1          36.00 ( 0.00%)    51.00 (15.00%)
Pass 2          42.00 ( 0.00%)    63.00 (21.00%)
while Rested    86.00 ( 0.00%)    86.00 ( 0.00%)

From

  http://www.csn.ul.ie/~mel/postings/mmtests-20120424/global-dhp__stress-highalloc-performance-ext3/hydra/comparison.html

I know that the allocation success rates in 3.3.6 was 78% in comparison
to 36% in in the current akpm tree.  With the full series applied, the
success rates are up to around 51% with some variability in the results.
This is not as high a success rate but it does not reclaim excessively
which is a key point.

MMTests Statistics: vmstat
Page Ins                                     3050912     3078892
Page Outs                                    8033528     8039096
Swap Ins                                           0           0
Swap Outs                                          0           0

Note that swap in/out rates remain at 0. In 3.3.6 with 78% success rates
there were 71881 pages swapped out.

Direct pages scanned                           70942      122976
Kswapd pages scanned                         1366300     1520122
Kswapd pages reclaimed                       1366214     1484629
Direct pages reclaimed                         70936      105716
Kswapd efficiency                                99%         97%
Kswapd velocity                             1072.550    1182.615
Direct efficiency                                99%         85%
Direct velocity                               55.690      95.672

The kswapd velocity changes very little as expected.  kswapd velocity is
around the 1000 pages/sec mark where as in kernel 3.3.6 with the high
allocation success rates it was 8140 pages/second.  Direct velocity is
higher as a result of patch 2 of the series but this is expected and is
acceptable.  The direct reclaim and kswapd velocities change very little.

If these get accepted for merging then there is a difficulty in how they
should be handled.  7db8889a ("mm: have order > 0 compaction start off
where it left") is broken but it is already in 3.6-rc1 and needs to be
fixed.  However, if just patch 4 from this series is applied then Jim
Schutt's workload is known to break again as his workload also requires
patch 5.  While it would be preferred to have all these patches in 3.6 to
improve compaction in general, it would at least be acceptable if just
patches 4 and 5 were merged to 3.6 to fix a known problem without breaking
compaction completely.  On the face of it, that would force
__GFP_NO_KSWAPD patches to be merged at the same time but I can do a
version of this series with __GFP_NO_KSWAPD change reverted and then
rebase it on top of this series.  That might be best overall because I
note that the __GFP_NO_KSWAPD patch should have removed
deferred_compaction from page_alloc.c but it didn't but fixing that causes
collisions with this series.

This patch:

The comment about order applied when the check was order >
PAGE_ALLOC_COSTLY_ORDER which has not been the case since c5a73c3d ("thp:
use compaction for all allocation orders").  Fixing the comment while I'm
in the general area.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 7fcd3a52e68..7168edc7592 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -869,11 +869,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 	struct zone *zone;
 	int rc = COMPACT_SKIPPED;
 
-	/*
-	 * Check whether it is worth even starting compaction. The order check is
-	 * made because an assumption is made that the page allocator can satisfy
-	 * the "cheaper" orders without taking special steps
-	 */
+	/* Check if the GFP flags allow compaction */
 	if (!order || !may_enter_fs || !may_perform_io)
 		return rc;
 
-- 
cgit v1.2.3


From 83fde0f22872aa8c1d46f775cc7bdfa864499e65 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 8 Oct 2012 16:29:11 -0700
Subject: mm: vmscan: scale number of pages reclaimed by reclaim/compaction
 based on failures

If allocation fails after compaction then compaction may be deferred for
a number of allocation attempts.  If there are subsequent failures,
compact_defer_shift is increased to defer for longer periods.  This
patch uses that information to scale the number of pages reclaimed with
compact_defer_shift until allocations succeed again.  The rationale is
that reclaiming the normal number of pages still allowed compaction to
fail and its success depends on the number of pages.  If it's failing,
reclaim more pages until it succeeds again.

Note that this is not implying that VM reclaim is not reclaiming enough
pages or that its logic is broken.  try_to_free_pages() always asks for
SWAP_CLUSTER_MAX pages to be reclaimed regardless of order and that is
what it does.  Direct reclaim stops normally with this check.

	if (sc->nr_reclaimed >= sc->nr_to_reclaim)
		goto out;

should_continue_reclaim delays when that check is made until a minimum
number of pages for reclaim/compaction are reclaimed.  It is possible
that this patch could instead set nr_to_reclaim in try_to_free_pages()
and drive it from there but that's behaves differently and not
necessarily for the better.  If driven from do_try_to_free_pages(), it
is also possible that priorities will rise.

When they reach DEF_PRIORITY-2, it will also start stalling and setting
pages for immediate reclaim which is more disruptive than not desirable
in this case.  That is a more wide-reaching change that could cause
another regression related to THP requests causing interactive jitter.

[akpm@linux-foundation.org: fix build]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 99b434b674c..f6fdae7a8d4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1729,6 +1729,28 @@ static bool in_reclaim_compaction(struct scan_control *sc)
 	return false;
 }
 
+#ifdef CONFIG_COMPACTION
+/*
+ * If compaction is deferred for sc->order then scale the number of pages
+ * reclaimed based on the number of consecutive allocation failures
+ */
+static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
+			struct lruvec *lruvec, struct scan_control *sc)
+{
+	struct zone *zone = lruvec_zone(lruvec);
+
+	if (zone->compact_order_failed <= sc->order)
+		pages_for_compaction <<= zone->compact_defer_shift;
+	return pages_for_compaction;
+}
+#else
+static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
+			struct lruvec *lruvec, struct scan_control *sc)
+{
+	return pages_for_compaction;
+}
+#endif
+
 /*
  * Reclaim/compaction is used for high-order allocation requests. It reclaims
  * order-0 pages before compacting the zone. should_continue_reclaim() returns
@@ -1776,6 +1798,9 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
 	 * inactive lists are large enough, continue reclaiming
 	 */
 	pages_for_compaction = (2UL << sc->order);
+
+	pages_for_compaction = scale_for_compaction(pages_for_compaction,
+						    lruvec, sc);
 	inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
 	if (nr_swap_pages > 0)
 		inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
-- 
cgit v1.2.3


From 1fb3f8ca0e9222535a39b884cb67a34628411b9f Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 8 Oct 2012 16:29:12 -0700
Subject: mm: compaction: capture a suitable high-order page immediately when
 it is made available

While compaction is migrating pages to free up large contiguous blocks
for allocation it races with other allocation requests that may steal
these blocks or break them up.  This patch alters direct compaction to
capture a suitable free page as soon as it becomes available to reduce
this race.  It uses similar logic to split_free_page() to ensure that
watermarks are still obeyed.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
 mm/internal.h   |  1 +
 mm/page_alloc.c | 63 ++++++++++++++++++++++++++++++----------
 3 files changed, 127 insertions(+), 27 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 7168edc7592..0fbc6b73a52 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -91,6 +91,60 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
 	return compact_checklock_irqsave(lock, flags, false, cc);
 }
 
+static void compact_capture_page(struct compact_control *cc)
+{
+	unsigned long flags;
+	int mtype, mtype_low, mtype_high;
+
+	if (!cc->page || *cc->page)
+		return;
+
+	/*
+	 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
+	 * regardless of the migratetype of the freelist is is captured from.
+	 * This is fine because the order for a high-order MIGRATE_MOVABLE
+	 * allocation is typically at least a pageblock size and overall
+	 * fragmentation is not impaired. Other allocation types must
+	 * capture pages from their own migratelist because otherwise they
+	 * could pollute other pageblocks like MIGRATE_MOVABLE with
+	 * difficult to move pages and making fragmentation worse overall.
+	 */
+	if (cc->migratetype == MIGRATE_MOVABLE) {
+		mtype_low = 0;
+		mtype_high = MIGRATE_PCPTYPES;
+	} else {
+		mtype_low = cc->migratetype;
+		mtype_high = cc->migratetype + 1;
+	}
+
+	/* Speculatively examine the free lists without zone lock */
+	for (mtype = mtype_low; mtype < mtype_high; mtype++) {
+		int order;
+		for (order = cc->order; order < MAX_ORDER; order++) {
+			struct page *page;
+			struct free_area *area;
+			area = &(cc->zone->free_area[order]);
+			if (list_empty(&area->free_list[mtype]))
+				continue;
+
+			/* Take the lock and attempt capture of the page */
+			if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
+				return;
+			if (!list_empty(&area->free_list[mtype])) {
+				page = list_entry(area->free_list[mtype].next,
+							struct page, lru);
+				if (capture_free_page(page, cc->order, mtype)) {
+					spin_unlock_irqrestore(&cc->zone->lock,
+									flags);
+					*cc->page = page;
+					return;
+				}
+			}
+			spin_unlock_irqrestore(&cc->zone->lock, flags);
+		}
+	}
+}
+
 /*
  * Isolate free pages onto a private freelist. Caller must hold zone->lock.
  * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
@@ -645,7 +699,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 static int compact_finished(struct zone *zone,
 			    struct compact_control *cc)
 {
-	unsigned int order;
 	unsigned long watermark;
 
 	if (fatal_signal_pending(current))
@@ -688,14 +741,22 @@ static int compact_finished(struct zone *zone,
 		return COMPACT_CONTINUE;
 
 	/* Direct compactor: Is a suitable page free? */
-	for (order = cc->order; order < MAX_ORDER; order++) {
-		/* Job done if page is free of the right migratetype */
-		if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
-			return COMPACT_PARTIAL;
-
-		/* Job done if allocation would set block type */
-		if (order >= pageblock_order && zone->free_area[order].nr_free)
+	if (cc->page) {
+		/* Was a suitable page captured? */
+		if (*cc->page)
 			return COMPACT_PARTIAL;
+	} else {
+		unsigned int order;
+		for (order = cc->order; order < MAX_ORDER; order++) {
+			struct free_area *area = &zone->free_area[cc->order];
+			/* Job done if page is free of the right migratetype */
+			if (!list_empty(&area->free_list[cc->migratetype]))
+				return COMPACT_PARTIAL;
+
+			/* Job done if allocation would set block type */
+			if (cc->order >= pageblock_order && area->nr_free)
+				return COMPACT_PARTIAL;
+		}
 	}
 
 	return COMPACT_CONTINUE;
@@ -817,6 +878,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 				goto out;
 			}
 		}
+
+		/* Capture a page now if it is a suitable size */
+		compact_capture_page(cc);
 	}
 
 out:
@@ -829,7 +893,8 @@ out:
 
 static unsigned long compact_zone_order(struct zone *zone,
 				 int order, gfp_t gfp_mask,
-				 bool sync, bool *contended)
+				 bool sync, bool *contended,
+				 struct page **page)
 {
 	struct compact_control cc = {
 		.nr_freepages = 0,
@@ -839,6 +904,7 @@ static unsigned long compact_zone_order(struct zone *zone,
 		.zone = zone,
 		.sync = sync,
 		.contended = contended,
+		.page = page,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
@@ -860,7 +926,7 @@ int sysctl_extfrag_threshold = 500;
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended)
+			bool sync, bool *contended, struct page **page)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
@@ -881,7 +947,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 		int status;
 
 		status = compact_zone_order(zone, order, gfp_mask, sync,
-						contended);
+						contended, page);
 		rc = max(status, rc);
 
 		/* If a normal allocation would succeed, stop compacting */
@@ -936,6 +1002,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
 	struct compact_control cc = {
 		.order = order,
 		.sync = false,
+		.page = NULL,
 	};
 
 	return __compact_pgdat(pgdat, &cc);
@@ -946,6 +1013,7 @@ static int compact_node(int nid)
 	struct compact_control cc = {
 		.order = -1,
 		.sync = true,
+		.page = NULL,
 	};
 
 	return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/internal.h b/mm/internal.h
index b8c91b342e2..e549a7fbc29 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -131,6 +131,7 @@ struct compact_control {
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 	struct zone *zone;
 	bool *contended;		/* True if a lock was contended */
+	struct page **page;		/* Page captured of requested size */
 };
 
 unsigned long
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5e92698e539..cfd565dbe12 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1380,16 +1380,11 @@ void split_page(struct page *page, unsigned int order)
 }
 
 /*
- * Similar to split_page except the page is already free. As this is only
- * being used for migration, the migratetype of the block also changes.
- * As this is called with interrupts disabled, the caller is responsible
- * for calling arch_alloc_page() and kernel_map_page() after interrupts
- * are enabled.
- *
- * Note: this is probably too low level an operation for use in drivers.
- * Please consult with lkml before using this in your driver.
+ * Similar to the split_page family of functions except that the page
+ * required at the given order and being isolated now to prevent races
+ * with parallel allocators
  */
-int split_free_page(struct page *page)
+int capture_free_page(struct page *page, int alloc_order, int migratetype)
 {
 	unsigned int order;
 	unsigned long watermark;
@@ -1411,10 +1406,11 @@ int split_free_page(struct page *page)
 	rmv_page_order(page);
 	__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
 
-	/* Split into individual pages */
-	set_page_refcounted(page);
-	split_page(page, order);
+	if (alloc_order != order)
+		expand(zone, page, alloc_order, order,
+			&zone->free_area[order], migratetype);
 
+	/* Set the pageblock if the captured page is at least a pageblock */
 	if (order >= pageblock_order - 1) {
 		struct page *endpage = page + (1 << order) - 1;
 		for (; page < endpage; page += pageblock_nr_pages) {
@@ -1425,7 +1421,35 @@ int split_free_page(struct page *page)
 		}
 	}
 
-	return 1 << order;
+	return 1UL << order;
+}
+
+/*
+ * Similar to split_page except the page is already free. As this is only
+ * being used for migration, the migratetype of the block also changes.
+ * As this is called with interrupts disabled, the caller is responsible
+ * for calling arch_alloc_page() and kernel_map_page() after interrupts
+ * are enabled.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+int split_free_page(struct page *page)
+{
+	unsigned int order;
+	int nr_pages;
+
+	BUG_ON(!PageBuddy(page));
+	order = page_order(page);
+
+	nr_pages = capture_free_page(page, order, 0);
+	if (!nr_pages)
+		return 0;
+
+	/* Split into individual pages */
+	set_page_refcounted(page);
+	split_page(page, order);
+	return nr_pages;
 }
 
 /*
@@ -2105,7 +2129,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	bool *contended_compaction, bool *deferred_compaction,
 	unsigned long *did_some_progress)
 {
-	struct page *page;
+	struct page *page = NULL;
 
 	if (!order)
 		return NULL;
@@ -2118,10 +2142,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
 						nodemask, sync_migration,
-						contended_compaction);
+						contended_compaction, &page);
 	current->flags &= ~PF_MEMALLOC;
-	if (*did_some_progress != COMPACT_SKIPPED) {
 
+	/* If compaction captured a page, prep and use it */
+	if (page) {
+		prep_new_page(page, order, gfp_mask);
+		goto got_page;
+	}
+
+	if (*did_some_progress != COMPACT_SKIPPED) {
 		/* Page migration frees to the PCP lists but we want merging */
 		drain_pages(get_cpu());
 		put_cpu();
@@ -2131,6 +2161,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 				alloc_flags & ~ALLOC_NO_WATERMARKS,
 				preferred_zone, migratetype);
 		if (page) {
+got_page:
 			preferred_zone->compact_considered = 0;
 			preferred_zone->compact_defer_shift = 0;
 			if (order >= preferred_zone->compact_order_failed)
-- 
cgit v1.2.3


From 8d34694c1abf29df1f3c7317936b7e3e2e308d9b Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Date: Mon, 8 Oct 2012 16:29:14 -0700
Subject: revert "mm: mempolicy: Let vma_merge and vma_split handle
 vma->vm_policy linkages"

Commit 05f144a0d5c2 ("mm: mempolicy: Let vma_merge and vma_split handle
vma->vm_policy linkages") removed vma->vm_policy updates code but it is
the purpose of mbind_range().  Now, mbind_range() is virtually a no-op
and while it does not allow memory corruption it is not the right fix.
This patch is a revert.

[mgorman@suse.de: Edited changelog]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: Josh Boyer <jwboyer@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ada3be6e25..92daa267baf 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -607,6 +607,27 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	return first;
 }
 
+/* Apply policy to a single VMA */
+static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
+{
+	int err = 0;
+	struct mempolicy *old = vma->vm_policy;
+
+	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
+		 vma->vm_ops, vma->vm_file,
+		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
+
+	if (vma->vm_ops && vma->vm_ops->set_policy)
+		err = vma->vm_ops->set_policy(vma, new);
+	if (!err) {
+		mpol_get(new);
+		vma->vm_policy = new;
+		mpol_put(old);
+	}
+	return err;
+}
+
 /* Step 2: apply policy to a range and do splits. */
 static int mbind_range(struct mm_struct *mm, unsigned long start,
 		       unsigned long end, struct mempolicy *new_pol)
@@ -655,23 +676,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 			if (err)
 				goto out;
 		}
-
-		/*
-		 * Apply policy to a single VMA. The reference counting of
-		 * policy for vma_policy linkages has already been handled by
-		 * vma_merge and split_vma as necessary. If this is a shared
-		 * policy then ->set_policy will increment the reference count
-		 * for an sp node.
-		 */
-		pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
-			vma->vm_start, vma->vm_end, vma->vm_pgoff,
-			vma->vm_ops, vma->vm_file,
-			vma->vm_ops ? vma->vm_ops->set_policy : NULL);
-		if (vma->vm_ops && vma->vm_ops->set_policy) {
-			err = vma->vm_ops->set_policy(vma, new_pol);
-			if (err)
-				goto out;
-		}
+		err = policy_vma(vma, new_pol);
+		if (err)
+			goto out;
 	}
 
  out:
-- 
cgit v1.2.3


From 869833f2c5c6e4dd09a5378cfc665ffb4615e5d2 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Mon, 8 Oct 2012 16:29:16 -0700
Subject: mempolicy: remove mempolicy sharing

Dave Jones' system call fuzz testing tool "trinity" triggered the
following bug error with slab debugging enabled

    =============================================================================
    BUG numa_policy (Not tainted): Poison overwritten
    -----------------------------------------------------------------------------

    INFO: 0xffff880146498250-0xffff880146498250. First byte 0x6a instead of 0x6b
    INFO: Allocated in mpol_new+0xa3/0x140 age=46310 cpu=6 pid=32154
     __slab_alloc+0x3d3/0x445
     kmem_cache_alloc+0x29d/0x2b0
     mpol_new+0xa3/0x140
     sys_mbind+0x142/0x620
     system_call_fastpath+0x16/0x1b

    INFO: Freed in __mpol_put+0x27/0x30 age=46268 cpu=6 pid=32154
     __slab_free+0x2e/0x1de
     kmem_cache_free+0x25a/0x260
     __mpol_put+0x27/0x30
     remove_vma+0x68/0x90
     exit_mmap+0x118/0x140
     mmput+0x73/0x110
     exit_mm+0x108/0x130
     do_exit+0x162/0xb90
     do_group_exit+0x4f/0xc0
     sys_exit_group+0x17/0x20
     system_call_fastpath+0x16/0x1b

    INFO: Slab 0xffffea0005192600 objects=27 used=27 fp=0x          (null) flags=0x20000000004080
    INFO: Object 0xffff880146498250 @offset=592 fp=0xffff88014649b9d0

The problem is that the structure is being prematurely freed due to a
reference count imbalance. In the following case mbind(addr, len) should
replace the memory policies of both vma1 and vma2 and thus they will
become to share the same mempolicy and the new mempolicy will have the
MPOL_F_SHARED flag.

  +-------------------+-------------------+
  |     vma1          |     vma2(shmem)   |
  +-------------------+-------------------+
  |                                       |
 addr                                 addr+len

alloc_pages_vma() uses get_vma_policy() and mpol_cond_put() pair for
maintaining the mempolicy reference count.  The current rule is that
get_vma_policy() only increments refcount for shmem VMA and
mpol_conf_put() only decrements refcount if the policy has
MPOL_F_SHARED.

In above case, vma1 is not shmem vma and vma->policy has MPOL_F_SHARED!
The reference count will be decreased even though was not increased
whenever alloc_page_vma() is called.  This has been broken since commit
[52cd3b07: mempolicy: rework mempolicy Reference Counting] in 2008.

There is another serious bug with the sharing of memory policies.
Currently, mempolicy rebind logic (it is called from cpuset rebinding)
ignores a refcount of mempolicy and override it forcibly.  Thus, any
mempolicy sharing may cause mempolicy corruption.  The bug was
introduced by commit [68860ec1: cpusets: automatic numa mempolicy
rebinding].

Ideally, the shared policy handling would be rewritten to either
properly handle COW of the policy structures or at least reference count
MPOL_F_SHARED based exclusively on information within the policy.
However, this patch takes the easier approach of disabling any policy
sharing between VMAs.  Each new range allocated with sp_alloc will
allocate a new policy, set the reference count to 1 and drop the
reference count of the old policy.  This increases the memory footprint
but is not expected to be a major problem as mbind() is unlikely to be
used for fine-grained ranges.  It is also inefficient because it means
we allocate a new policy even in cases where mbind_range() could use the
new_policy passed to it.  However, it is more straight-forward and the
change should be invisible to the user.

[mgorman@suse.de: Edited changelog]
Reported-by: Dave Jones <davej@redhat.com>,
Cc: Christoph Lameter <cl@linux.com>,
Reviewed-by: Christoph Lameter <cl@linux.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Josh Boyer <jwboyer@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 52 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 38 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 92daa267baf..f0728ae7467 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -607,24 +607,39 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	return first;
 }
 
-/* Apply policy to a single VMA */
-static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
+/*
+ * Apply policy to a single VMA
+ * This must be called with the mmap_sem held for writing.
+ */
+static int vma_replace_policy(struct vm_area_struct *vma,
+						struct mempolicy *pol)
 {
-	int err = 0;
-	struct mempolicy *old = vma->vm_policy;
+	int err;
+	struct mempolicy *old;
+	struct mempolicy *new;
 
 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 		 vma->vm_ops, vma->vm_file,
 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 
-	if (vma->vm_ops && vma->vm_ops->set_policy)
+	new = mpol_dup(pol);
+	if (IS_ERR(new))
+		return PTR_ERR(new);
+
+	if (vma->vm_ops && vma->vm_ops->set_policy) {
 		err = vma->vm_ops->set_policy(vma, new);
-	if (!err) {
-		mpol_get(new);
-		vma->vm_policy = new;
-		mpol_put(old);
+		if (err)
+			goto err_out;
 	}
+
+	old = vma->vm_policy;
+	vma->vm_policy = new; /* protected by mmap_sem */
+	mpol_put(old);
+
+	return 0;
+ err_out:
+	mpol_put(new);
 	return err;
 }
 
@@ -676,7 +691,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 			if (err)
 				goto out;
 		}
-		err = policy_vma(vma, new_pol);
+		err = vma_replace_policy(vma, new_pol);
 		if (err)
 			goto out;
 	}
@@ -2153,15 +2168,24 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
 				struct mempolicy *pol)
 {
-	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+	struct sp_node *n;
+	struct mempolicy *newpol;
 
+	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
 	if (!n)
 		return NULL;
+
+	newpol = mpol_dup(pol);
+	if (IS_ERR(newpol)) {
+		kmem_cache_free(sn_cache, n);
+		return NULL;
+	}
+	newpol->flags |= MPOL_F_SHARED;
+
 	n->start = start;
 	n->end = end;
-	mpol_get(pol);
-	pol->flags |= MPOL_F_SHARED;	/* for unref */
-	n->policy = pol;
+	n->policy = newpol;
+
 	return n;
 }
 
-- 
cgit v1.2.3


From b22d127a39ddd10d93deee3d96e643657ad53a49 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 8 Oct 2012 16:29:17 -0700
Subject: mempolicy: fix a race in shared_policy_replace()

shared_policy_replace() use of sp_alloc() is unsafe.  1) sp_node cannot
be dereferenced if sp->lock is not held and 2) another thread can modify
sp_node between spin_unlock for allocating a new sp node and next
spin_lock.  The bug was introduced before 2.6.12-rc2.

Kosaki's original patch for this problem was to allocate an sp node and
policy within shared_policy_replace and initialise it when the lock is
reacquired.  I was not keen on this approach because it partially
duplicates sp_alloc().  As the paths were sp->lock is taken are not that
performance critical this patch converts sp->lock to sp->mutex so it can
sleep when calling sp_alloc().

[kosaki.motohiro@jp.fujitsu.com: Original patch]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Christoph Lameter <cl@linux.com>
Cc: Josh Boyer <jwboyer@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f0728ae7467..b2f12ecc1b3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2083,7 +2083,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
  */
 
 /* lookup first element intersecting start-end */
-/* Caller holds sp->lock */
+/* Caller holds sp->mutex */
 static struct sp_node *
 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
 {
@@ -2147,13 +2147,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 
 	if (!sp->root.rb_node)
 		return NULL;
-	spin_lock(&sp->lock);
+	mutex_lock(&sp->mutex);
 	sn = sp_lookup(sp, idx, idx+1);
 	if (sn) {
 		mpol_get(sn->policy);
 		pol = sn->policy;
 	}
-	spin_unlock(&sp->lock);
+	mutex_unlock(&sp->mutex);
 	return pol;
 }
 
@@ -2193,10 +2193,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
 				 unsigned long end, struct sp_node *new)
 {
-	struct sp_node *n, *new2 = NULL;
+	struct sp_node *n;
+	int ret = 0;
 
-restart:
-	spin_lock(&sp->lock);
+	mutex_lock(&sp->mutex);
 	n = sp_lookup(sp, start, end);
 	/* Take care of old policies in the same range. */
 	while (n && n->start < end) {
@@ -2209,16 +2209,14 @@ restart:
 		} else {
 			/* Old policy spanning whole new range. */
 			if (n->end > end) {
+				struct sp_node *new2;
+				new2 = sp_alloc(end, n->end, n->policy);
 				if (!new2) {
-					spin_unlock(&sp->lock);
-					new2 = sp_alloc(end, n->end, n->policy);
-					if (!new2)
-						return -ENOMEM;
-					goto restart;
+					ret = -ENOMEM;
+					goto out;
 				}
 				n->end = start;
 				sp_insert(sp, new2);
-				new2 = NULL;
 				break;
 			} else
 				n->end = start;
@@ -2229,12 +2227,9 @@ restart:
 	}
 	if (new)
 		sp_insert(sp, new);
-	spin_unlock(&sp->lock);
-	if (new2) {
-		mpol_put(new2->policy);
-		kmem_cache_free(sn_cache, new2);
-	}
-	return 0;
+out:
+	mutex_unlock(&sp->mutex);
+	return ret;
 }
 
 /**
@@ -2252,7 +2247,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 	int ret;
 
 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
-	spin_lock_init(&sp->lock);
+	mutex_init(&sp->mutex);
 
 	if (mpol) {
 		struct vm_area_struct pvma;
@@ -2318,7 +2313,7 @@ void mpol_free_shared_policy(struct shared_policy *p)
 
 	if (!p->root.rb_node)
 		return;
-	spin_lock(&p->lock);
+	mutex_lock(&p->mutex);
 	next = rb_first(&p->root);
 	while (next) {
 		n = rb_entry(next, struct sp_node, nd);
@@ -2327,7 +2322,7 @@ void mpol_free_shared_policy(struct shared_policy *p)
 		mpol_put(n->policy);
 		kmem_cache_free(sn_cache, n);
 	}
-	spin_unlock(&p->lock);
+	mutex_unlock(&p->mutex);
 }
 
 /* assumes fs == KERNEL_DS */
-- 
cgit v1.2.3


From 63f74ca21f1fad36d075e063f06dcc6d39fe86b2 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Mon, 8 Oct 2012 16:29:19 -0700
Subject: mempolicy: fix refcount leak in mpol_set_shared_policy()

When shared_policy_replace() fails to allocate new->policy is not freed
correctly by mpol_set_shared_policy().  The problem is that shared
mempolicy code directly call kmem_cache_free() in multiple places where
it is easy to make a mistake.

This patch creates an sp_free wrapper function and uses it. The bug was
introduced pre-git age (IOW, before 2.6.12-rc2).

[mgorman@suse.de: Editted changelog]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Christoph Lameter <cl@linux.com>
Cc: Josh Boyer <jwboyer@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b2f12ecc1b3..1763418f1ad 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2157,12 +2157,17 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 	return pol;
 }
 
+static void sp_free(struct sp_node *n)
+{
+	mpol_put(n->policy);
+	kmem_cache_free(sn_cache, n);
+}
+
 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 {
 	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
 	rb_erase(&n->nd, &sp->root);
-	mpol_put(n->policy);
-	kmem_cache_free(sn_cache, n);
+	sp_free(n);
 }
 
 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
@@ -2301,7 +2306,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
 	}
 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
 	if (err && new)
-		kmem_cache_free(sn_cache, new);
+		sp_free(new);
 	return err;
 }
 
@@ -2318,9 +2323,7 @@ void mpol_free_shared_policy(struct shared_policy *p)
 	while (next) {
 		n = rb_entry(next, struct sp_node, nd);
 		next = rb_next(&n->nd);
-		rb_erase(&n->nd, &p->root);
-		mpol_put(n->policy);
-		kmem_cache_free(sn_cache, n);
+		sp_delete(p, n);
 	}
 	mutex_unlock(&p->mutex);
 }
-- 
cgit v1.2.3


From 00442ad04a5eac08a98255697c510e708f6082e2 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 8 Oct 2012 16:29:20 -0700
Subject: mempolicy: fix a memory corruption by refcount imbalance in
 alloc_pages_vma()

Commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier
related damage v3") introduced a potential memory corruption.
shmem_alloc_page() uses a pseudo vma and it has one significant unique
combination, vma->vm_ops=NULL and vma->policy->flags & MPOL_F_SHARED.

get_vma_policy() does NOT increase a policy ref when vma->vm_ops=NULL
and mpol_cond_put() DOES decrease a policy ref when a policy has
MPOL_F_SHARED.  Therefore, when a cpuset update race occurs,
alloc_pages_vma() falls in 'goto retry_cpuset' path, decrements the
reference count and frees the policy prematurely.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Christoph Lameter <cl@linux.com>
Cc: Josh Boyer <jwboyer@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1763418f1ad..3d64b369180 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1552,8 +1552,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
 									addr);
 			if (vpol)
 				pol = vpol;
-		} else if (vma->vm_policy)
+		} else if (vma->vm_policy) {
 			pol = vma->vm_policy;
+
+			/*
+			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
+			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+			 * count on these policies which will be dropped by
+			 * mpol_cond_put() later
+			 */
+			if (mpol_needs_cond_ref(pol))
+				mpol_get(pol);
+		}
 	}
 	if (!pol)
 		pol = &default_policy;
-- 
cgit v1.2.3


From 21a92735f660eaecf69a6f2e777f18463760ec32 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.co.il>
Date: Mon, 8 Oct 2012 16:29:24 -0700
Subject: mm: mmu_notifier: have mmu_notifiers use a global SRCU so they may
 safely schedule

With an RCU based mmu_notifier implementation, any callout to
mmu_notifier_invalidate_range_{start,end}() or
mmu_notifier_invalidate_page() would not be allowed to call schedule()
as that could potentially allow a modification to the mmu_notifier
structure while it is currently being used.

Since srcu allocs 4 machine words per instance per cpu, we may end up
with memory exhaustion if we use srcu per mm.  So all mms share a global
srcu.  Note that during large mmu_notifier activity exit & unregister
paths might hang for longer periods, but it is tolerable for current
mmu_notifier clients.

Signed-off-by: Sagi Grimberg <sagig@mellanox.co.il>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Haggai Eran <haggaie@mellanox.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmu_notifier.c | 73 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 48 insertions(+), 25 deletions(-)

(limited to 'mm')

diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 862b60822d9..35ff447d8d1 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -14,10 +14,14 @@
 #include <linux/export.h>
 #include <linux/mm.h>
 #include <linux/err.h>
+#include <linux/srcu.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 
+/* global SRCU for all MMs */
+struct srcu_struct srcu;
+
 /*
  * This function can't run concurrently against mmu_notifier_register
  * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
@@ -25,8 +29,8 @@
  * in parallel despite there being no task using this mm any more,
  * through the vmas outside of the exit_mmap context, such as with
  * vmtruncate. This serializes against mmu_notifier_unregister with
- * the mmu_notifier_mm->lock in addition to RCU and it serializes
- * against the other mmu notifiers with RCU. struct mmu_notifier_mm
+ * the mmu_notifier_mm->lock in addition to SRCU and it serializes
+ * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
  * can't go away from under us as exit_mmap holds an mm_count pin
  * itself.
  */
@@ -34,12 +38,13 @@ void __mmu_notifier_release(struct mm_struct *mm)
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
+	int id;
 
 	/*
 	 * RCU here will block mmu_notifier_unregister until
 	 * ->release returns.
 	 */
-	rcu_read_lock();
+	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
 		/*
 		 * if ->release runs before mmu_notifier_unregister it
@@ -50,7 +55,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
 		 */
 		if (mn->ops->release)
 			mn->ops->release(mn, mm);
-	rcu_read_unlock();
+	srcu_read_unlock(&srcu, id);
 
 	spin_lock(&mm->mmu_notifier_mm->lock);
 	while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -68,7 +73,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
 	spin_unlock(&mm->mmu_notifier_mm->lock);
 
 	/*
-	 * synchronize_rcu here prevents mmu_notifier_release to
+	 * synchronize_srcu here prevents mmu_notifier_release to
 	 * return to exit_mmap (which would proceed freeing all pages
 	 * in the mm) until the ->release method returns, if it was
 	 * invoked by mmu_notifier_unregister.
@@ -76,7 +81,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
 	 * The mmu_notifier_mm can't go away from under us because one
 	 * mm_count is hold by exit_mmap.
 	 */
-	synchronize_rcu();
+	synchronize_srcu(&srcu);
 }
 
 /*
@@ -89,14 +94,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
-	int young = 0;
+	int young = 0, id;
 
-	rcu_read_lock();
+	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->clear_flush_young)
 			young |= mn->ops->clear_flush_young(mn, mm, address);
 	}
-	rcu_read_unlock();
+	srcu_read_unlock(&srcu, id);
 
 	return young;
 }
@@ -106,9 +111,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
-	int young = 0;
+	int young = 0, id;
 
-	rcu_read_lock();
+	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->test_young) {
 			young = mn->ops->test_young(mn, mm, address);
@@ -116,7 +121,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
 				break;
 		}
 	}
-	rcu_read_unlock();
+	srcu_read_unlock(&srcu, id);
 
 	return young;
 }
@@ -126,8 +131,9 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
+	int id;
 
-	rcu_read_lock();
+	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->change_pte)
 			mn->ops->change_pte(mn, mm, address, pte);
@@ -138,7 +144,7 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 		else if (mn->ops->invalidate_page)
 			mn->ops->invalidate_page(mn, mm, address);
 	}
-	rcu_read_unlock();
+	srcu_read_unlock(&srcu, id);
 }
 
 void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -146,13 +152,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm,
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
+	int id;
 
-	rcu_read_lock();
+	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->invalidate_page)
 			mn->ops->invalidate_page(mn, mm, address);
 	}
-	rcu_read_unlock();
+	srcu_read_unlock(&srcu, id);
 }
 
 void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -160,13 +167,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
+	int id;
 
-	rcu_read_lock();
+	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->invalidate_range_start)
 			mn->ops->invalidate_range_start(mn, mm, start, end);
 	}
-	rcu_read_unlock();
+	srcu_read_unlock(&srcu, id);
 }
 
 void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -174,13 +182,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
+	int id;
 
-	rcu_read_lock();
+	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->invalidate_range_end)
 			mn->ops->invalidate_range_end(mn, mm, start, end);
 	}
-	rcu_read_unlock();
+	srcu_read_unlock(&srcu, id);
 }
 
 static int do_mmu_notifier_register(struct mmu_notifier *mn,
@@ -192,6 +201,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
 
 	BUG_ON(atomic_read(&mm->mm_users) <= 0);
 
+	/*
+	* Verify that mmu_notifier_init() already run and the global srcu is
+	* initialized.
+	*/
+	BUG_ON(!srcu.per_cpu_ref);
+
 	ret = -ENOMEM;
 	mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
 	if (unlikely(!mmu_notifier_mm))
@@ -274,8 +289,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm)
 /*
  * This releases the mm_count pin automatically and frees the mm
  * structure if it was the last user of it. It serializes against
- * running mmu notifiers with RCU and against mmu_notifier_unregister
- * with the unregister lock + RCU. All sptes must be dropped before
+ * running mmu notifiers with SRCU and against mmu_notifier_unregister
+ * with the unregister lock + SRCU. All sptes must be dropped before
  * calling mmu_notifier_unregister. ->release or any other notifier
  * method may be invoked concurrently with mmu_notifier_unregister,
  * and only after mmu_notifier_unregister returned we're guaranteed
@@ -290,8 +305,9 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 		 * RCU here will force exit_mmap to wait ->release to finish
 		 * before freeing the pages.
 		 */
-		rcu_read_lock();
+		int id;
 
+		id = srcu_read_lock(&srcu);
 		/*
 		 * exit_mmap will block in mmu_notifier_release to
 		 * guarantee ->release is called before freeing the
@@ -299,7 +315,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 		 */
 		if (mn->ops->release)
 			mn->ops->release(mn, mm);
-		rcu_read_unlock();
+		srcu_read_unlock(&srcu, id);
 
 		spin_lock(&mm->mmu_notifier_mm->lock);
 		hlist_del_rcu(&mn->hlist);
@@ -310,10 +326,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 	 * Wait any running method to finish, of course including
 	 * ->release if it was run by mmu_notifier_relase instead of us.
 	 */
-	synchronize_rcu();
+	synchronize_srcu(&srcu);
 
 	BUG_ON(atomic_read(&mm->mm_count) <= 0);
 
 	mmdrop(mm);
 }
 EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
+
+static int __init mmu_notifier_init(void)
+{
+	return init_srcu_struct(&srcu);
+}
+
+module_init(mmu_notifier_init);
-- 
cgit v1.2.3


From e0f3c3f78da29b114e7c1c68019036559f715948 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Mon, 8 Oct 2012 16:29:26 -0700
Subject: mm/mmu_notifier: init notifier if necessary

While registering MMU notifier, new instance of MMU notifier_mm will be
allocated and later free'd if currrent mm_struct's MMU notifier_mm has
been initialized.  That causes some overhead.  The patch tries to
elominate that.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Sagi Grimberg <sagig@mellanox.co.il>
Cc: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmu_notifier.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 35ff447d8d1..947df83dccb 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -207,22 +207,23 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
 	*/
 	BUG_ON(!srcu.per_cpu_ref);
 
-	ret = -ENOMEM;
-	mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
-	if (unlikely(!mmu_notifier_mm))
-		goto out;
-
 	if (take_mmap_sem)
 		down_write(&mm->mmap_sem);
 	ret = mm_take_all_locks(mm);
 	if (unlikely(ret))
-		goto out_cleanup;
+		goto out;
 
 	if (!mm_has_notifiers(mm)) {
+		mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm),
+					GFP_KERNEL);
+		if (unlikely(!mmu_notifier_mm)) {
+			ret = -ENOMEM;
+			goto out_of_mem;
+		}
 		INIT_HLIST_HEAD(&mmu_notifier_mm->list);
 		spin_lock_init(&mmu_notifier_mm->lock);
+
 		mm->mmu_notifier_mm = mmu_notifier_mm;
-		mmu_notifier_mm = NULL;
 	}
 	atomic_inc(&mm->mm_count);
 
@@ -238,13 +239,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
 	hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
 	spin_unlock(&mm->mmu_notifier_mm->lock);
 
+out_of_mem:
 	mm_drop_all_locks(mm);
-out_cleanup:
+out:
 	if (take_mmap_sem)
 		up_write(&mm->mmap_sem);
-	/* kfree() does nothing if mmu_notifier_mm is NULL */
-	kfree(mmu_notifier_mm);
-out:
+
 	BUG_ON(atomic_read(&mm->mm_users) <= 0);
 	return ret;
 }
-- 
cgit v1.2.3


From d5dc0ad928fb9e972001e552597fd0b794863f34 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Mon, 8 Oct 2012 16:29:27 -0700
Subject: mm/vmscan: fix error number for failed kthread

Fix the return value while failing to create the kswapd kernel thread.
Also, the error message is prioritized as KERN_ERR.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index f6fdae7a8d4..d16bf5a5326 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3126,9 +3126,9 @@ int kswapd_run(int nid)
 	if (IS_ERR(pgdat->kswapd)) {
 		/* failure at boot is fatal */
 		BUG_ON(system_state == SYSTEM_BOOTING);
-		printk("Failed to start kswapd on node %d\n",nid);
 		pgdat->kswapd = NULL;
-		ret = -1;
+		pr_err("Failed to start kswapd on node %d\n", nid);
+		ret = PTR_ERR(pgdat->kswapd);
 	}
 	return ret;
 }
-- 
cgit v1.2.3


From 01dc52ebdf472f77cca623ca693ca24cfc0f1bbe Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@gnu.org>
Date: Mon, 8 Oct 2012 16:29:30 -0700
Subject: oom: remove deprecated oom_adj

The deprecated /proc/<pid>/oom_adj is scheduled for removal this month.

Signed-off-by: Davidlohr Bueso <dave@gnu.org>
Acked-by: David Rientjes <rientjes@google.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/oom_kill.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 19860086163..79e0f3e2483 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -428,8 +428,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 {
 	task_lock(current);
 	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
-		"oom_adj=%d, oom_score_adj=%d\n",
-		current->comm, gfp_mask, order, current->signal->oom_adj,
+		"oom_score_adj=%d\n",
+		current->comm, gfp_mask, order,
 		current->signal->oom_score_adj);
 	cpuset_print_task_mems_allowed(current);
 	task_unlock(current);
-- 
cgit v1.2.3


From 5d3a551c28c6669dc43be40d8fafafbc2ec8f42b Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Mon, 8 Oct 2012 16:29:32 -0700
Subject: mm: hugetlb: add arch hook for clearing page flags before entering
 pool

The core page allocator ensures that page flags are zeroed when freeing
pages via free_pages_check.  A number of architectures (ARM, PPC, MIPS)
rely on this property to treat new pages as dirty with respect to the data
cache and perform the appropriate flushing before mapping the pages into
userspace.

This can lead to cache synchronisation problems when using hugepages,
since the allocator keeps its own pool of pages above the usual page
allocator and does not reset the page flags when freeing a page into the
pool.

This patch adds a new architecture hook, arch_clear_hugepage_flags, so
that architectures which rely on the page flags being in a particular
state for fresh allocations can adjust the flags accordingly when a page
is freed into the pool.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Cc: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bc727122dd4..f1bb534254f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -637,6 +637,7 @@ static void free_huge_page(struct page *page)
 		h->surplus_huge_pages--;
 		h->surplus_huge_pages_node[nid]--;
 	} else {
+		arch_clear_hugepage_flags(page);
 		enqueue_huge_page(h, page);
 	}
 	spin_unlock(&hugetlb_lock);
-- 
cgit v1.2.3


From db971418824381d3583c73751181fec76c743bf6 Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Mon, 8 Oct 2012 16:29:34 -0700
Subject: mm: adjust final #endif position in mm/internal.h

Make sure the #endif that terminates the standard #ifndef / #define /
#endif construct gets labeled, and gets positioned at the end of the file
as is normally the case.

Signed-off-by: Michel Lespinasse <walken@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/internal.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/internal.h b/mm/internal.h
index e549a7fbc29..bbd7b34db4e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -341,7 +341,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 #define ZONE_RECLAIM_FULL	-1
 #define ZONE_RECLAIM_SOME	0
 #define ZONE_RECLAIM_SUCCESS	1
-#endif
 
 extern int hwpoison_filter(struct page *p);
 
@@ -357,3 +356,5 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
         unsigned long, unsigned long);
 
 extern void set_pageblock_order(void);
+
+#endif	/* __MM_INTERNAL_H */
-- 
cgit v1.2.3


From 65b3c07b43f7f8a5cbf8923011bd4e6650e3d1dc Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 8 Oct 2012 16:29:35 -0700
Subject: thp: fix the count of THP_COLLAPSE_ALLOC

THP_COLLAPSE_ALLOC is double counted if NUMA is disabled since it has
already been calculated in khugepaged_alloc_hugepage

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9b72d127051..860ea912785 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1874,9 +1874,9 @@ static void collapse_huge_page(struct mm_struct *mm,
 		*hpage = ERR_PTR(-ENOMEM);
 		return;
 	}
+	count_vm_event(THP_COLLAPSE_ALLOC);
 #endif
 
-	count_vm_event(THP_COLLAPSE_ALLOC);
 	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
 #ifdef CONFIG_NUMA
 		put_page(new_page);
-- 
cgit v1.2.3


From 637e3a27ec2c84f7ecd083fa6943da2f19eb5e9f Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 8 Oct 2012 16:29:38 -0700
Subject: thp: remove unnecessary check in start_khugepaged

The check is unnecessary since if mm_slot_cache or mm_slots_hash
initialize failed, no sysfs interface will be created

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 860ea912785..9833d8ecf38 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -140,10 +140,7 @@ static int start_khugepaged(void)
 	int err = 0;
 	if (khugepaged_enabled()) {
 		int wakeup;
-		if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
-			err = -ENOMEM;
-			goto out;
-		}
+
 		mutex_lock(&khugepaged_mutex);
 		if (!khugepaged_thread)
 			khugepaged_thread = kthread_run(khugepaged, NULL,
@@ -163,7 +160,7 @@ static int start_khugepaged(void)
 	} else
 		/* wakeup to exit */
 		wake_up_interruptible(&khugepaged_wait);
-out:
+
 	return err;
 }
 
-- 
cgit v1.2.3


From 911891afe1c3104adf0f802189909868239ebbfd Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 8 Oct 2012 16:29:41 -0700
Subject: thp: move khugepaged_mutex out of khugepaged

Currently, hugepaged_mutex is used really complexly and hard to
understand, actually, it is just used to serialize start_khugepaged and
khugepaged for these reasons:

- khugepaged_thread is shared between them
- the thp disable path (echo never > transparent_hugepage/enabled) is
  nonblocking, so we need to protect khugepaged_thread to get a stable
  running state

These can be avoided by:

- use the lock to serialize the thread creation and cancel
- thp disable path can not finised until the thread exits

Then khugepaged_thread is fully controlled by start_khugepaged, khugepaged
will be happy without the lock

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 36 +++++++++++++-----------------------
 1 file changed, 13 insertions(+), 23 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9833d8ecf38..0931b2b19c5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -139,9 +139,6 @@ static int start_khugepaged(void)
 {
 	int err = 0;
 	if (khugepaged_enabled()) {
-		int wakeup;
-
-		mutex_lock(&khugepaged_mutex);
 		if (!khugepaged_thread)
 			khugepaged_thread = kthread_run(khugepaged, NULL,
 							"khugepaged");
@@ -151,15 +148,17 @@ static int start_khugepaged(void)
 			err = PTR_ERR(khugepaged_thread);
 			khugepaged_thread = NULL;
 		}
-		wakeup = !list_empty(&khugepaged_scan.mm_head);
-		mutex_unlock(&khugepaged_mutex);
-		if (wakeup)
+
+		if (!list_empty(&khugepaged_scan.mm_head))
 			wake_up_interruptible(&khugepaged_wait);
 
 		set_recommended_min_free_kbytes();
-	} else
+	} else if (khugepaged_thread) {
 		/* wakeup to exit */
 		wake_up_interruptible(&khugepaged_wait);
+		kthread_stop(khugepaged_thread);
+		khugepaged_thread = NULL;
+	}
 
 	return err;
 }
@@ -221,7 +220,12 @@ static ssize_t enabled_store(struct kobject *kobj,
 				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
 
 	if (ret > 0) {
-		int err = start_khugepaged();
+		int err;
+
+		mutex_lock(&khugepaged_mutex);
+		err = start_khugepaged();
+		mutex_unlock(&khugepaged_mutex);
+
 		if (err)
 			ret = err;
 	}
@@ -2329,20 +2333,10 @@ static int khugepaged(void *none)
 	set_freezable();
 	set_user_nice(current, 19);
 
-	/* serialize with start_khugepaged() */
-	mutex_lock(&khugepaged_mutex);
-
-	for (;;) {
-		mutex_unlock(&khugepaged_mutex);
+	while (!kthread_should_stop()) {
 		VM_BUG_ON(khugepaged_thread != current);
 		khugepaged_loop();
 		VM_BUG_ON(khugepaged_thread != current);
-
-		mutex_lock(&khugepaged_mutex);
-		if (!khugepaged_enabled())
-			break;
-		if (unlikely(kthread_should_stop()))
-			break;
 	}
 
 	spin_lock(&khugepaged_mm_lock);
@@ -2351,10 +2345,6 @@ static int khugepaged(void *none)
 	if (mm_slot)
 		collect_mm_slot(mm_slot);
 	spin_unlock(&khugepaged_mm_lock);
-
-	khugepaged_thread = NULL;
-	mutex_unlock(&khugepaged_mutex);
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From e060f0e0139b83f05bb90fa05563d14179b9a7ff Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 8 Oct 2012 16:29:42 -0700
Subject: thp: remove unnecessary khugepaged_thread check

Now, khugepaged creation and cancel are completely serial under the
protection of khugepaged_mutex, it is impossible that many khugepaged
entities are running

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0931b2b19c5..46e3f150a6c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2333,11 +2333,8 @@ static int khugepaged(void *none)
 	set_freezable();
 	set_user_nice(current, 19);
 
-	while (!kthread_should_stop()) {
-		VM_BUG_ON(khugepaged_thread != current);
+	while (!kthread_should_stop())
 		khugepaged_loop();
-		VM_BUG_ON(khugepaged_thread != current);
-	}
 
 	spin_lock(&khugepaged_mm_lock);
 	mm_slot = khugepaged_scan.mm_slot;
-- 
cgit v1.2.3


From 2017c0bff8ba79ea527361adbe19471e174775d6 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 8 Oct 2012 16:29:44 -0700
Subject: thp: remove wake_up_interruptible in the exit path

Add the check of kthread_should_stop() to the conditions which are used to
wakeup on khugepaged_wait, then kthread_stop is enough to let the thread
exit

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 46e3f150a6c..a94c07a1b3c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -154,8 +154,6 @@ static int start_khugepaged(void)
 
 		set_recommended_min_free_kbytes();
 	} else if (khugepaged_thread) {
-		/* wakeup to exit */
-		wake_up_interruptible(&khugepaged_wait);
 		kthread_stop(khugepaged_thread);
 		khugepaged_thread = NULL;
 	}
@@ -2221,7 +2219,7 @@ static int khugepaged_has_work(void)
 static int khugepaged_wait_event(void)
 {
 	return !list_empty(&khugepaged_scan.mm_head) ||
-		!khugepaged_enabled();
+		kthread_should_stop();
 }
 
 static void khugepaged_do_scan(struct page **hpage)
@@ -2288,6 +2286,24 @@ static struct page *khugepaged_alloc_hugepage(void)
 }
 #endif
 
+static void khugepaged_wait_work(void)
+{
+	try_to_freeze();
+
+	if (khugepaged_has_work()) {
+		if (!khugepaged_scan_sleep_millisecs)
+			return;
+
+		wait_event_freezable_timeout(khugepaged_wait,
+					     kthread_should_stop(),
+			msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
+		return;
+	}
+
+	if (khugepaged_enabled())
+		wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
+}
+
 static void khugepaged_loop(void)
 {
 	struct page *hpage;
@@ -2312,17 +2328,8 @@ static void khugepaged_loop(void)
 		if (hpage)
 			put_page(hpage);
 #endif
-		try_to_freeze();
-		if (unlikely(kthread_should_stop()))
-			break;
-		if (khugepaged_has_work()) {
-			if (!khugepaged_scan_sleep_millisecs)
-				continue;
-			wait_event_freezable_timeout(khugepaged_wait, false,
-			    msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
-		} else if (khugepaged_enabled())
-			wait_event_freezable(khugepaged_wait,
-					     khugepaged_wait_event());
+
+		khugepaged_wait_work();
 	}
 }
 
-- 
cgit v1.2.3


From 9817626e722a5e5699cf38f5d3a4c9851e054436 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 8 Oct 2012 16:29:46 -0700
Subject: thp: remove some code depend on CONFIG_NUMA

If NUMA is disabled, hpage is used as page pre-alloc, so there are two
cases for hpage:

- it is !NULL, means the page is not consumed otherwise,
- the page has been consumed

If NUMA is enabled, hpage is just used as alloc-fail indicator which is
not a real page, NULL means not fail triggered.

So, we can release the page only if !IS_ERR_OR_NULL

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a94c07a1b3c..1e21b4cf4c7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2306,11 +2306,8 @@ static void khugepaged_wait_work(void)
 
 static void khugepaged_loop(void)
 {
-	struct page *hpage;
+	struct page *hpage = NULL;
 
-#ifdef CONFIG_NUMA
-	hpage = NULL;
-#endif
 	while (likely(khugepaged_enabled())) {
 #ifndef CONFIG_NUMA
 		hpage = khugepaged_alloc_hugepage();
@@ -2324,10 +2321,9 @@ static void khugepaged_loop(void)
 #endif
 
 		khugepaged_do_scan(&hpage);
-#ifndef CONFIG_NUMA
-		if (hpage)
+
+		if (!IS_ERR_OR_NULL(hpage))
 			put_page(hpage);
-#endif
 
 		khugepaged_wait_work();
 	}
-- 
cgit v1.2.3


From d516904bd239fe2c9f1bd46cf146bb4b8831321c Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 8 Oct 2012 16:29:48 -0700
Subject: thp: merge page pre-alloc in khugepaged_loop into khugepaged_do_scan

There are two pre-alloc operations in these two function, the different is:
- it allows to sleep if page alloc fail in khugepaged_loop
- it exits immediately if page alloc fail in khugepaged_do_scan

Actually, in khugepaged_do_scan, we can allow the pre-alloc to sleep on
the first failure, then the operation in khugepaged_loop can be removed

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 97 ++++++++++++++++++++++++++------------------------------
 1 file changed, 45 insertions(+), 52 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1e21b4cf4c7..d5b5fcc73c4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2222,10 +2222,40 @@ static int khugepaged_wait_event(void)
 		kthread_should_stop();
 }
 
-static void khugepaged_do_scan(struct page **hpage)
+static void khugepaged_alloc_sleep(void)
+{
+	wait_event_freezable_timeout(khugepaged_wait, false,
+			msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+}
+
+#ifndef CONFIG_NUMA
+static struct page *khugepaged_alloc_hugepage(bool *wait)
+{
+	struct page *hpage;
+
+	do {
+		hpage = alloc_hugepage(khugepaged_defrag());
+		if (!hpage) {
+			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+			if (!*wait)
+				return NULL;
+
+			*wait = false;
+			khugepaged_alloc_sleep();
+		} else
+			count_vm_event(THP_COLLAPSE_ALLOC);
+	} while (unlikely(!hpage) && likely(khugepaged_enabled()));
+
+	return hpage;
+}
+#endif
+
+static void khugepaged_do_scan(void)
 {
+	struct page *hpage = NULL;
 	unsigned int progress = 0, pass_through_head = 0;
 	unsigned int pages = khugepaged_pages_to_scan;
+	bool wait = true;
 
 	barrier(); /* write khugepaged_pages_to_scan to local stack */
 
@@ -2233,17 +2263,18 @@ static void khugepaged_do_scan(struct page **hpage)
 		cond_resched();
 
 #ifndef CONFIG_NUMA
-		if (!*hpage) {
-			*hpage = alloc_hugepage(khugepaged_defrag());
-			if (unlikely(!*hpage)) {
-				count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+		if (!hpage)
+			hpage = khugepaged_alloc_hugepage(&wait);
+
+		if (unlikely(!hpage))
+			break;
+#else
+		if (IS_ERR(hpage)) {
+			if (!wait)
 				break;
-			}
-			count_vm_event(THP_COLLAPSE_ALLOC);
+			wait = false;
+			khugepaged_alloc_sleep();
 		}
-#else
-		if (IS_ERR(*hpage))
-			break;
 #endif
 
 		if (unlikely(kthread_should_stop() || freezing(current)))
@@ -2255,37 +2286,16 @@ static void khugepaged_do_scan(struct page **hpage)
 		if (khugepaged_has_work() &&
 		    pass_through_head < 2)
 			progress += khugepaged_scan_mm_slot(pages - progress,
-							    hpage);
+							    &hpage);
 		else
 			progress = pages;
 		spin_unlock(&khugepaged_mm_lock);
 	}
-}
 
-static void khugepaged_alloc_sleep(void)
-{
-	wait_event_freezable_timeout(khugepaged_wait, false,
-			msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+	if (!IS_ERR_OR_NULL(hpage))
+		put_page(hpage);
 }
 
-#ifndef CONFIG_NUMA
-static struct page *khugepaged_alloc_hugepage(void)
-{
-	struct page *hpage;
-
-	do {
-		hpage = alloc_hugepage(khugepaged_defrag());
-		if (!hpage) {
-			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
-			khugepaged_alloc_sleep();
-		} else
-			count_vm_event(THP_COLLAPSE_ALLOC);
-	} while (unlikely(!hpage) &&
-		 likely(khugepaged_enabled()));
-	return hpage;
-}
-#endif
-
 static void khugepaged_wait_work(void)
 {
 	try_to_freeze();
@@ -2306,25 +2316,8 @@ static void khugepaged_wait_work(void)
 
 static void khugepaged_loop(void)
 {
-	struct page *hpage = NULL;
-
 	while (likely(khugepaged_enabled())) {
-#ifndef CONFIG_NUMA
-		hpage = khugepaged_alloc_hugepage();
-		if (unlikely(!hpage))
-			break;
-#else
-		if (IS_ERR(hpage)) {
-			khugepaged_alloc_sleep();
-			hpage = NULL;
-		}
-#endif
-
-		khugepaged_do_scan(&hpage);
-
-		if (!IS_ERR_OR_NULL(hpage))
-			put_page(hpage);
-
+		khugepaged_do_scan();
 		khugepaged_wait_work();
 	}
 }
-- 
cgit v1.2.3


From 420256ef02660af0acf28c12fe4b7d514ca88a4d Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 8 Oct 2012 16:29:49 -0700
Subject: thp: release page in page pre-alloc path

If NUMA is enabled, we can release the page in the page pre-alloc
operation, then the CONFIG_NUMA dependent code can be reduced

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d5b5fcc73c4..9c4390f60c3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1873,15 +1873,12 @@ static void collapse_huge_page(struct mm_struct *mm,
 		*hpage = ERR_PTR(-ENOMEM);
 		return;
 	}
+	*hpage = new_page;
 	count_vm_event(THP_COLLAPSE_ALLOC);
 #endif
 
-	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
-#ifdef CONFIG_NUMA
-		put_page(new_page);
-#endif
+	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
 		return;
-	}
 
 	/*
 	 * Prevent all access to pagetables with the exception of
@@ -1982,9 +1979,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	prepare_pmd_huge_pte(pgtable, mm);
 	spin_unlock(&mm->page_table_lock);
 
-#ifndef CONFIG_NUMA
 	*hpage = NULL;
-#endif
+
 	khugepaged_pages_collapsed++;
 out_up_write:
 	up_write(&mm->mmap_sem);
@@ -1992,9 +1988,6 @@ out_up_write:
 
 out:
 	mem_cgroup_uncharge_page(new_page);
-#ifdef CONFIG_NUMA
-	put_page(new_page);
-#endif
 	goto out_up_write;
 }
 
@@ -2260,8 +2253,6 @@ static void khugepaged_do_scan(void)
 	barrier(); /* write khugepaged_pages_to_scan to local stack */
 
 	while (progress < pages) {
-		cond_resched();
-
 #ifndef CONFIG_NUMA
 		if (!hpage)
 			hpage = khugepaged_alloc_hugepage(&wait);
@@ -2274,8 +2265,12 @@ static void khugepaged_do_scan(void)
 				break;
 			wait = false;
 			khugepaged_alloc_sleep();
+		} else if (hpage) {
+			put_page(hpage);
+			hpage = NULL;
 		}
 #endif
+		cond_resched();
 
 		if (unlikely(kthread_should_stop() || freezing(current)))
 			break;
-- 
cgit v1.2.3


From 26234f36ef3ec7efcfa9acb181427849c1f9db7c Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 8 Oct 2012 16:29:51 -0700
Subject: thp: introduce khugepaged_prealloc_page and khugepaged_alloc_page

They are used to abstract the difference between NUMA enabled and NUMA
disabled to make the code more readable

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 166 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 98 insertions(+), 68 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9c4390f60c3..f0e999379dd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1827,28 +1827,34 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 	}
 }
 
-static void collapse_huge_page(struct mm_struct *mm,
-			       unsigned long address,
-			       struct page **hpage,
-			       struct vm_area_struct *vma,
-			       int node)
+static void khugepaged_alloc_sleep(void)
 {
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd, _pmd;
-	pte_t *pte;
-	pgtable_t pgtable;
-	struct page *new_page;
-	spinlock_t *ptl;
-	int isolated;
-	unsigned long hstart, hend;
+	wait_event_freezable_timeout(khugepaged_wait, false,
+			msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+}
 
-	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-#ifndef CONFIG_NUMA
-	up_read(&mm->mmap_sem);
-	VM_BUG_ON(!*hpage);
-	new_page = *hpage;
-#else
+#ifdef CONFIG_NUMA
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+{
+	if (IS_ERR(*hpage)) {
+		if (!*wait)
+			return false;
+
+		*wait = false;
+		khugepaged_alloc_sleep();
+	} else if (*hpage) {
+		put_page(*hpage);
+		*hpage = NULL;
+	}
+
+	return true;
+}
+
+static struct page
+*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+		       struct vm_area_struct *vma, unsigned long address,
+		       int node)
+{
 	VM_BUG_ON(*hpage);
 	/*
 	 * Allocate the page while the vma is still valid and under
@@ -1860,7 +1866,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 * mmap_sem in read mode is good idea also to allow greater
 	 * scalability.
 	 */
-	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+	*hpage  = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
 				      node, __GFP_OTHER_NODE);
 
 	/*
@@ -1868,15 +1874,81 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 * preparation for taking it in write mode.
 	 */
 	up_read(&mm->mmap_sem);
-	if (unlikely(!new_page)) {
+	if (unlikely(!*hpage)) {
 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
 		*hpage = ERR_PTR(-ENOMEM);
-		return;
+		return NULL;
 	}
-	*hpage = new_page;
+
 	count_vm_event(THP_COLLAPSE_ALLOC);
+	return *hpage;
+}
+#else
+static struct page *khugepaged_alloc_hugepage(bool *wait)
+{
+	struct page *hpage;
+
+	do {
+		hpage = alloc_hugepage(khugepaged_defrag());
+		if (!hpage) {
+			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+			if (!*wait)
+				return NULL;
+
+			*wait = false;
+			khugepaged_alloc_sleep();
+		} else
+			count_vm_event(THP_COLLAPSE_ALLOC);
+	} while (unlikely(!hpage) && likely(khugepaged_enabled()));
+
+	return hpage;
+}
+
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+{
+	if (!*hpage)
+		*hpage = khugepaged_alloc_hugepage(wait);
+
+	if (unlikely(!*hpage))
+		return false;
+
+	return true;
+}
+
+static struct page
+*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+		       struct vm_area_struct *vma, unsigned long address,
+		       int node)
+{
+	up_read(&mm->mmap_sem);
+	VM_BUG_ON(!*hpage);
+	return  *hpage;
+}
 #endif
 
+static void collapse_huge_page(struct mm_struct *mm,
+				   unsigned long address,
+				   struct page **hpage,
+				   struct vm_area_struct *vma,
+				   int node)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd, _pmd;
+	pte_t *pte;
+	pgtable_t pgtable;
+	struct page *new_page;
+	spinlock_t *ptl;
+	int isolated;
+	unsigned long hstart, hend;
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+	/* release the mmap_sem read lock. */
+	new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
+	if (!new_page)
+		return;
+
 	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
 		return;
 
@@ -2215,34 +2287,6 @@ static int khugepaged_wait_event(void)
 		kthread_should_stop();
 }
 
-static void khugepaged_alloc_sleep(void)
-{
-	wait_event_freezable_timeout(khugepaged_wait, false,
-			msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
-}
-
-#ifndef CONFIG_NUMA
-static struct page *khugepaged_alloc_hugepage(bool *wait)
-{
-	struct page *hpage;
-
-	do {
-		hpage = alloc_hugepage(khugepaged_defrag());
-		if (!hpage) {
-			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
-			if (!*wait)
-				return NULL;
-
-			*wait = false;
-			khugepaged_alloc_sleep();
-		} else
-			count_vm_event(THP_COLLAPSE_ALLOC);
-	} while (unlikely(!hpage) && likely(khugepaged_enabled()));
-
-	return hpage;
-}
-#endif
-
 static void khugepaged_do_scan(void)
 {
 	struct page *hpage = NULL;
@@ -2253,23 +2297,9 @@ static void khugepaged_do_scan(void)
 	barrier(); /* write khugepaged_pages_to_scan to local stack */
 
 	while (progress < pages) {
-#ifndef CONFIG_NUMA
-		if (!hpage)
-			hpage = khugepaged_alloc_hugepage(&wait);
-
-		if (unlikely(!hpage))
+		if (!khugepaged_prealloc_page(&hpage, &wait))
 			break;
-#else
-		if (IS_ERR(hpage)) {
-			if (!wait)
-				break;
-			wait = false;
-			khugepaged_alloc_sleep();
-		} else if (hpage) {
-			put_page(hpage);
-			hpage = NULL;
-		}
-#endif
+
 		cond_resched();
 
 		if (unlikely(kthread_should_stop() || freezing(current)))
-- 
cgit v1.2.3


From b7231789b0224e73af4efc7973f8bcf17fc16edd Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 8 Oct 2012 16:29:54 -0700
Subject: thp: remove khugepaged_loop

Merge khugepaged_loop into khugepaged

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f0e999379dd..77b470b7e3c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2339,14 +2339,6 @@ static void khugepaged_wait_work(void)
 		wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
 }
 
-static void khugepaged_loop(void)
-{
-	while (likely(khugepaged_enabled())) {
-		khugepaged_do_scan();
-		khugepaged_wait_work();
-	}
-}
-
 static int khugepaged(void *none)
 {
 	struct mm_slot *mm_slot;
@@ -2354,8 +2346,10 @@ static int khugepaged(void *none)
 	set_freezable();
 	set_user_nice(current, 19);
 
-	while (!kthread_should_stop())
-		khugepaged_loop();
+	while (!kthread_should_stop()) {
+		khugepaged_do_scan();
+		khugepaged_wait_work();
+	}
 
 	spin_lock(&khugepaged_mm_lock);
 	mm_slot = khugepaged_scan.mm_slot;
-- 
cgit v1.2.3


From 17c230afa58a6d013a4949d5c04b823a281d40fa Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 8 Oct 2012 16:29:56 -0700
Subject: thp: use khugepaged_enabled to remove duplicate code

Use khugepaged_enabled to see whether thp is enabled

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 77b470b7e3c..a169c4dbcdd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -102,10 +102,7 @@ static int set_recommended_min_free_kbytes(void)
 	unsigned long recommended_min;
 	extern int min_free_kbytes;
 
-	if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
-		      &transparent_hugepage_flags) &&
-	    !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
-		      &transparent_hugepage_flags))
+	if (!khugepaged_enabled())
 		return 0;
 
 	for_each_populated_zone(zone)
@@ -228,11 +225,7 @@ static ssize_t enabled_store(struct kobject *kobj,
 			ret = err;
 	}
 
-	if (ret > 0 &&
-	    (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
-		      &transparent_hugepage_flags) ||
-	     test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
-		      &transparent_hugepage_flags)))
+	if (ret > 0 && khugepaged_enabled())
 		set_recommended_min_free_kbytes();
 
 	return ret;
-- 
cgit v1.2.3


From 227e4047488d3ee0173a914275a7fd207ad51e5b Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 8 Oct 2012 16:29:59 -0700
Subject: thp: remove unnecessary set_recommended_min_free_kbytes

Since it is called in start_khugepaged

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a169c4dbcdd..e19cc426c52 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -225,9 +225,6 @@ static ssize_t enabled_store(struct kobject *kobj,
 			ret = err;
 	}
 
-	if (ret > 0 && khugepaged_enabled())
-		set_recommended_min_free_kbytes();
-
 	return ret;
 }
 static struct kobj_attribute enabled_attr =
@@ -562,8 +559,6 @@ static int __init hugepage_init(void)
 
 	start_khugepaged();
 
-	set_recommended_min_free_kbytes();
-
 	return 0;
 out:
 	hugepage_exit_sysfs(hugepage_kobj);
-- 
cgit v1.2.3


From ca42b26ab285edc5ee3f9faa48379d258db53c35 Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Mon, 8 Oct 2012 16:30:01 -0700
Subject: mm: fix potential anon_vma locking issue in mprotect()

Fix an anon_vma locking issue in the following situation:

- vma has no anon_vma
- next has an anon_vma
- vma is being shrunk / next is being expanded, due to an mprotect call

We need to take next's anon_vma lock to avoid races with rmap users (such
as page migration) while next is being expanded.

Signed-off-by: Michel Lespinasse <walken@google.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index 2ba27434e84..e3c365ff1b6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -573,8 +573,12 @@ again:			remove_next = 1 + (end > next->vm_end);
 	 */
 	if (vma->anon_vma && (importer || start != vma->vm_start)) {
 		anon_vma = vma->anon_vma;
+		VM_BUG_ON(adjust_next && next->anon_vma &&
+			  anon_vma != next->anon_vma);
+	} else if (adjust_next && next->anon_vma)
+		anon_vma = next->anon_vma;
+	if (anon_vma)
 		anon_vma_lock(anon_vma);
-	}
 
 	if (root) {
 		flush_dcache_mmap_lock(mapping);
-- 
cgit v1.2.3


From 15626062f4a98279c59a2a5208c496cf65cbf8c0 Mon Sep 17 00:00:00 2001
From: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Date: Mon, 8 Oct 2012 16:30:04 -0700
Subject: thp, x86: introduce HAVE_ARCH_TRANSPARENT_HUGEPAGE

Cleanup patch in preparation for transparent hugepage support on s390.
Adding new architectures to the TRANSPARENT_HUGEPAGE config option can
make the "depends" line rather ugly, like "depends on (X86 || (S390 &&
64BIT)) && MMU".

This patch adds a HAVE_ARCH_TRANSPARENT_HUGEPAGE instead.  x86 already has
MMU "def_bool y", so the MMU check is superfluous there and
HAVE_ARCH_TRANSPARENT_HUGEPAGE can be selected in arch/x86/Kconfig.

Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/Kconfig b/mm/Kconfig
index d5c8019c662..3322342a1ff 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -318,7 +318,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
 
 config TRANSPARENT_HUGEPAGE
 	bool "Transparent Hugepage Support"
-	depends on X86 && MMU
+	depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select COMPACTION
 	help
 	  Transparent Hugepages allows the kernel to use huge pages and
-- 
cgit v1.2.3


From e3ebcf64381188a2744a9829a4eb5c2b60f1974c Mon Sep 17 00:00:00 2001
From: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Date: Mon, 8 Oct 2012 16:30:07 -0700
Subject: thp: remove assumptions on pgtable_t type

The thp page table pre-allocation code currently assumes that pgtable_t is
of type "struct page *".  This may not be true for all architectures, so
this patch removes that assumption by replacing the functions
prepare_pmd_huge_pte() and get_pmd_huge_pte() with two new functions that
can be defined architecture-specific.

It also removes two VM_BUG_ON checks for page_count() and page_mapcount()
operating on a pgtable_t.  Apart from the VM_BUG_ON removal, there will be
no functional change introduced by this patch.

Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c     | 50 ++++++++------------------------------------------
 mm/pgtable-generic.c | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 42 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e19cc426c52..9ea6d195376 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -598,19 +598,6 @@ out:
 }
 __setup("transparent_hugepage=", setup_transparent_hugepage);
 
-static void prepare_pmd_huge_pte(pgtable_t pgtable,
-				 struct mm_struct *mm)
-{
-	assert_spin_locked(&mm->page_table_lock);
-
-	/* FIFO */
-	if (!mm->pmd_huge_pte)
-		INIT_LIST_HEAD(&pgtable->lru);
-	else
-		list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
-	mm->pmd_huge_pte = pgtable;
-}
-
 static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 {
 	if (likely(vma->vm_flags & VM_WRITE))
@@ -652,7 +639,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		 */
 		page_add_new_anon_rmap(page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
-		prepare_pmd_huge_pte(pgtable, mm);
+		pgtable_trans_huge_deposit(mm, pgtable);
 		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
 		mm->nr_ptes++;
 		spin_unlock(&mm->page_table_lock);
@@ -778,7 +765,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
 	pmd = pmd_mkold(pmd_wrprotect(pmd));
 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-	prepare_pmd_huge_pte(pgtable, dst_mm);
+	pgtable_trans_huge_deposit(dst_mm, pgtable);
 	dst_mm->nr_ptes++;
 
 	ret = 0;
@@ -789,25 +776,6 @@ out:
 	return ret;
 }
 
-/* no "address" argument so destroys page coloring of some arch */
-pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
-{
-	pgtable_t pgtable;
-
-	assert_spin_locked(&mm->page_table_lock);
-
-	/* FIFO */
-	pgtable = mm->pmd_huge_pte;
-	if (list_empty(&pgtable->lru))
-		mm->pmd_huge_pte = NULL;
-	else {
-		mm->pmd_huge_pte = list_entry(pgtable->lru.next,
-					      struct page, lru);
-		list_del(&pgtable->lru);
-	}
-	return pgtable;
-}
-
 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long address,
@@ -863,7 +831,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 	pmdp_clear_flush_notify(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
-	pgtable = get_pmd_huge_pte(mm);
+	pgtable = pgtable_trans_huge_withdraw(mm);
 	pmd_populate(mm, &_pmd, pgtable);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1028,7 +996,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	if (__pmd_trans_huge_lock(pmd, vma) == 1) {
 		struct page *page;
 		pgtable_t pgtable;
-		pgtable = get_pmd_huge_pte(tlb->mm);
+		pgtable = pgtable_trans_huge_withdraw(tlb->mm);
 		page = pmd_page(*pmd);
 		pmd_clear(pmd);
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
@@ -1345,11 +1313,11 @@ static int __split_huge_page_map(struct page *page,
 	pmd = page_check_address_pmd(page, mm, address,
 				     PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
 	if (pmd) {
-		pgtable = get_pmd_huge_pte(mm);
+		pgtable = pgtable_trans_huge_withdraw(mm);
 		pmd_populate(mm, &_pmd, pgtable);
 
-		for (i = 0, haddr = address; i < HPAGE_PMD_NR;
-		     i++, haddr += PAGE_SIZE) {
+		haddr = address;
+		for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
 			pte_t *pte, entry;
 			BUG_ON(PageCompound(page+i));
 			entry = mk_pte(page + i, vma->vm_page_prot);
@@ -2017,8 +1985,6 @@ static void collapse_huge_page(struct mm_struct *mm,
 	pte_unmap(pte);
 	__SetPageUptodate(new_page);
 	pgtable = pmd_pgtable(_pmd);
-	VM_BUG_ON(page_count(pgtable) != 1);
-	VM_BUG_ON(page_mapcount(pgtable) != 0);
 
 	_pmd = mk_pmd(new_page, vma->vm_page_prot);
 	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
@@ -2036,7 +2002,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	page_add_new_anon_rmap(new_page, vma, address);
 	set_pmd_at(mm, address, pmd, _pmd);
 	update_mmu_cache(vma, address, _pmd);
-	prepare_pmd_huge_pte(pgtable, mm);
+	pgtable_trans_huge_deposit(mm, pgtable);
 	spin_unlock(&mm->page_table_lock);
 
 	*hpage = NULL;
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 74c0ddaa6fa..29867e083d3 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -120,3 +120,42 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
+
+#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
+{
+	assert_spin_locked(&mm->page_table_lock);
+
+	/* FIFO */
+	if (!mm->pmd_huge_pte)
+		INIT_LIST_HEAD(&pgtable->lru);
+	else
+		list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
+	mm->pmd_huge_pte = pgtable;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
+#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* no "address" argument so destroys page coloring of some arch */
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
+{
+	pgtable_t pgtable;
+
+	assert_spin_locked(&mm->page_table_lock);
+
+	/* FIFO */
+	pgtable = mm->pmd_huge_pte;
+	if (list_empty(&pgtable->lru))
+		mm->pmd_huge_pte = NULL;
+	else {
+		mm->pmd_huge_pte = list_entry(pgtable->lru.next,
+					      struct page, lru);
+		list_del(&pgtable->lru);
+	}
+	return pgtable;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
-- 
cgit v1.2.3


From 46dcde735c9d8953bbd8d105ca6779e5b5300c28 Mon Sep 17 00:00:00 2001
From: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Date: Mon, 8 Oct 2012 16:30:09 -0700
Subject: thp: introduce pmdp_invalidate()

On s390, a valid page table entry must not be changed while it is attached
to any CPU.  So instead of pmd_mknotpresent() and set_pmd_at(), an IDTE
operation would be necessary there.  This patch introduces the
pmdp_invalidate() function, to allow architecture-specific
implementations.

Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c     |  3 +--
 mm/pgtable-generic.c | 11 +++++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9ea6d195376..bec6243b696 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1361,8 +1361,7 @@ static int __split_huge_page_map(struct page *page,
 		 * SMP TLB and finally we write the non-huge version
 		 * of the pmd entry with pmd_populate.
 		 */
-		set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
-		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+		pmdp_invalidate(vma, address, pmd);
 		pmd_populate(mm, pmd, pgtable);
 		ret = 1;
 	}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 29867e083d3..e642627da6b 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -159,3 +159,14 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
+
+#ifndef __HAVE_ARCH_PMDP_INVALIDATE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+		     pmd_t *pmdp)
+{
+	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
+	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
-- 
cgit v1.2.3


From 8e72033f2a489b6c98c4e3c7cc281b1afd6cb85c Mon Sep 17 00:00:00 2001
From: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Date: Mon, 8 Oct 2012 16:30:12 -0700
Subject: thp: make MADV_HUGEPAGE check for mm->def_flags

This adds a check to hugepage_madvise(), to refuse MADV_HUGEPAGE if
VM_NOHUGEPAGE is set in mm->def_flags.  On s390, the VM_NOHUGEPAGE flag
will be set in mm->def_flags for kvm processes, to prevent any future thp
mappings.  In order to also prevent MADV_HUGEPAGE on such an mm,
hugepage_madvise() should check mm->def_flags.

Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bec6243b696..010d32944d1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1450,6 +1450,8 @@ out:
 int hugepage_madvise(struct vm_area_struct *vma,
 		     unsigned long *vm_flags, int advice)
 {
+	struct mm_struct *mm = vma->vm_mm;
+
 	switch (advice) {
 	case MADV_HUGEPAGE:
 		/*
@@ -1457,6 +1459,8 @@ int hugepage_madvise(struct vm_area_struct *vma,
 		 */
 		if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
 			return -EINVAL;
+		if (mm->def_flags & VM_NOHUGEPAGE)
+			return -EINVAL;
 		*vm_flags &= ~VM_NOHUGEPAGE;
 		*vm_flags |= VM_HUGEPAGE;
 		/*
-- 
cgit v1.2.3


From 6b2dbba8b6ac4df26f72eda1e5ea7bab9f950e08 Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Mon, 8 Oct 2012 16:31:25 -0700
Subject: mm: replace vma prio_tree with an interval tree

Implement an interval tree as a replacement for the VMA prio_tree.  The
algorithms are similar to lib/interval_tree.c; however that code can't be
directly reused as the interval endpoints are not explicitly stored in the
VMA.  So instead, the common algorithm is moved into a template and the
details (node type, how to get interval endpoints from the node, etc) are
filled in using the C preprocessor.

Once the interval tree functions are available, using them as a
replacement to the VMA prio tree is a relatively simple, mechanical job.

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/Makefile         |   4 +-
 mm/filemap_xip.c    |   3 +-
 mm/fremap.c         |   2 +-
 mm/hugetlb.c        |   3 +-
 mm/interval_tree.c  |  61 +++++++++++++++
 mm/memory-failure.c |   3 +-
 mm/memory.c         |   9 +--
 mm/mmap.c           |  22 +++---
 mm/nommu.c          |  12 ++-
 mm/prio_tree.c      | 208 ----------------------------------------------------
 mm/rmap.c           |  18 ++---
 11 files changed, 94 insertions(+), 251 deletions(-)
 create mode 100644 mm/interval_tree.c
 delete mode 100644 mm/prio_tree.c

(limited to 'mm')

diff --git a/mm/Makefile b/mm/Makefile
index 92753e2d82d..6b025f80af3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -14,9 +14,9 @@ endif
 obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
-			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
+			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
-			   compaction.o $(mmu-y)
+			   compaction.o interval_tree.o $(mmu-y)
 
 obj-y += init-mm.o
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 91750227a19..a52daee11d3 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping,
 {
 	struct vm_area_struct *vma;
 	struct mm_struct *mm;
-	struct prio_tree_iter iter;
 	unsigned long address;
 	pte_t *pte;
 	pte_t pteval;
@@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping,
 
 retry:
 	mutex_lock(&mapping->i_mmap_mutex);
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		mm = vma->vm_mm;
 		address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
diff --git a/mm/fremap.c b/mm/fremap.c
index 3d731a49878..3899a86851c 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -214,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 		mutex_lock(&mapping->i_mmap_mutex);
 		flush_dcache_mmap_lock(mapping);
 		vma->vm_flags |= VM_NONLINEAR;
-		vma_prio_tree_remove(vma, &mapping->i_mmap);
+		vma_interval_tree_remove(vma, &mapping->i_mmap);
 		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
 		flush_dcache_mmap_unlock(mapping);
 		mutex_unlock(&mapping->i_mmap_mutex);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f1bb534254f..c9b40e3a993 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2474,7 +2474,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct hstate *h = hstate_vma(vma);
 	struct vm_area_struct *iter_vma;
 	struct address_space *mapping;
-	struct prio_tree_iter iter;
 	pgoff_t pgoff;
 
 	/*
@@ -2491,7 +2490,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * __unmap_hugepage_range() is called as the lock is already held
 	 */
 	mutex_lock(&mapping->i_mmap_mutex);
-	vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
 		/* Do not unmap the current VMA */
 		if (iter_vma == vma)
 			continue;
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
new file mode 100644
index 00000000000..7dc565660e5
--- /dev/null
+++ b/mm/interval_tree.c
@@ -0,0 +1,61 @@
+/*
+ * mm/interval_tree.c - interval tree for mapping->i_mmap
+ *
+ * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
+ *
+ * This file is released under the GPL v2.
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+
+#define ITSTRUCT   struct vm_area_struct
+#define ITRB       shared.linear.rb
+#define ITTYPE     unsigned long
+#define ITSUBTREE  shared.linear.rb_subtree_last
+#define ITSTART(n) ((n)->vm_pgoff)
+#define ITLAST(n)  ((n)->vm_pgoff + \
+		    (((n)->vm_end - (n)->vm_start) >> PAGE_SHIFT) - 1)
+#define ITSTATIC
+#define ITPREFIX   vma_interval_tree
+
+#include <linux/interval_tree_tmpl.h>
+
+/* Insert old immediately after vma in the interval tree */
+void vma_interval_tree_add(struct vm_area_struct *vma,
+			   struct vm_area_struct *old,
+			   struct address_space *mapping)
+{
+	struct rb_node **link;
+	struct vm_area_struct *parent;
+	unsigned long last;
+
+	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
+		list_add(&vma->shared.nonlinear, &old->shared.nonlinear);
+		return;
+	}
+
+	last = ITLAST(vma);
+
+	if (!old->shared.linear.rb.rb_right) {
+		parent = old;
+		link = &old->shared.linear.rb.rb_right;
+	} else {
+		parent = rb_entry(old->shared.linear.rb.rb_right,
+				  struct vm_area_struct, shared.linear.rb);
+		if (parent->shared.linear.rb_subtree_last < last)
+			parent->shared.linear.rb_subtree_last = last;
+		while (parent->shared.linear.rb.rb_left) {
+			parent = rb_entry(parent->shared.linear.rb.rb_left,
+				struct vm_area_struct, shared.linear.rb);
+			if (parent->shared.linear.rb_subtree_last < last)
+				parent->shared.linear.rb_subtree_last = last;
+		}
+		link = &parent->shared.linear.rb.rb_left;
+	}
+
+	vma->shared.linear.rb_subtree_last = last;
+	rb_link_node(&vma->shared.linear.rb, &parent->shared.linear.rb, link);
+	rb_insert_augmented(&vma->shared.linear.rb, &mapping->i_mmap,
+			    &vma_interval_tree_augment_callbacks);
+}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a6e2141a661..c38a6257d08 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -431,7 +431,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 {
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
-	struct prio_tree_iter iter;
 	struct address_space *mapping = page->mapping;
 
 	mutex_lock(&mapping->i_mmap_mutex);
@@ -442,7 +441,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 		if (!task_early_kill(tsk))
 			continue;
 
-		vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
+		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
 				      pgoff) {
 			/*
 			 * Send early kill signal to tasks where a vma covers
diff --git a/mm/memory.c b/mm/memory.c
index e09c0481318..d205e4381a3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2801,14 +2801,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
 	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
 }
 
-static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
+static inline void unmap_mapping_range_tree(struct rb_root *root,
 					    struct zap_details *details)
 {
 	struct vm_area_struct *vma;
-	struct prio_tree_iter iter;
 	pgoff_t vba, vea, zba, zea;
 
-	vma_prio_tree_foreach(vma, &iter, root,
+	vma_interval_tree_foreach(vma, root,
 			details->first_index, details->last_index) {
 
 		vba = vma->vm_pgoff;
@@ -2839,7 +2838,7 @@ static inline void unmap_mapping_range_list(struct list_head *head,
 	 * across *all* the pages in each nonlinear VMA, not just the pages
 	 * whose virtual address lies outside the file truncation point.
 	 */
-	list_for_each_entry(vma, head, shared.vm_set.list) {
+	list_for_each_entry(vma, head, shared.nonlinear) {
 		details->nonlinear_vma = vma;
 		unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
 	}
@@ -2883,7 +2882,7 @@ void unmap_mapping_range(struct address_space *mapping,
 
 
 	mutex_lock(&mapping->i_mmap_mutex);
-	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
+	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
 		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
diff --git a/mm/mmap.c b/mm/mmap.c
index e3c365ff1b6..5ac533f88e9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -199,14 +199,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 
 	flush_dcache_mmap_lock(mapping);
 	if (unlikely(vma->vm_flags & VM_NONLINEAR))
-		list_del_init(&vma->shared.vm_set.list);
+		list_del_init(&vma->shared.nonlinear);
 	else
-		vma_prio_tree_remove(vma, &mapping->i_mmap);
+		vma_interval_tree_remove(vma, &mapping->i_mmap);
 	flush_dcache_mmap_unlock(mapping);
 }
 
 /*
- * Unlink a file-based vm structure from its prio_tree, to hide
+ * Unlink a file-based vm structure from its interval tree, to hide
  * vma from rmap and vmtruncate before freeing its page tables.
  */
 void unlink_file_vma(struct vm_area_struct *vma)
@@ -411,7 +411,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
 		if (unlikely(vma->vm_flags & VM_NONLINEAR))
 			vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
 		else
-			vma_prio_tree_insert(vma, &mapping->i_mmap);
+			vma_interval_tree_insert(vma, &mapping->i_mmap);
 		flush_dcache_mmap_unlock(mapping);
 	}
 }
@@ -449,7 +449,7 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
 
 /*
  * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
- * mm's list and rbtree.  It has already been inserted into the prio_tree.
+ * mm's list and rbtree.  It has already been inserted into the interval tree.
  */
 static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 {
@@ -491,7 +491,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	struct vm_area_struct *next = vma->vm_next;
 	struct vm_area_struct *importer = NULL;
 	struct address_space *mapping = NULL;
-	struct prio_tree_root *root = NULL;
+	struct rb_root *root = NULL;
 	struct anon_vma *anon_vma = NULL;
 	struct file *file = vma->vm_file;
 	long adjust_next = 0;
@@ -554,7 +554,7 @@ again:			remove_next = 1 + (end > next->vm_end);
 		mutex_lock(&mapping->i_mmap_mutex);
 		if (insert) {
 			/*
-			 * Put into prio_tree now, so instantiated pages
+			 * Put into interval tree now, so instantiated pages
 			 * are visible to arm/parisc __flush_dcache_page
 			 * throughout; but we cannot insert into address
 			 * space until vma start or end is updated.
@@ -582,9 +582,9 @@ again:			remove_next = 1 + (end > next->vm_end);
 
 	if (root) {
 		flush_dcache_mmap_lock(mapping);
-		vma_prio_tree_remove(vma, root);
+		vma_interval_tree_remove(vma, root);
 		if (adjust_next)
-			vma_prio_tree_remove(next, root);
+			vma_interval_tree_remove(next, root);
 	}
 
 	vma->vm_start = start;
@@ -597,8 +597,8 @@ again:			remove_next = 1 + (end > next->vm_end);
 
 	if (root) {
 		if (adjust_next)
-			vma_prio_tree_insert(next, root);
-		vma_prio_tree_insert(vma, root);
+			vma_interval_tree_insert(next, root);
+		vma_interval_tree_insert(vma, root);
 		flush_dcache_mmap_unlock(mapping);
 	}
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 12e84e69dd0..45131b41bcd 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
 
 		mutex_lock(&mapping->i_mmap_mutex);
 		flush_dcache_mmap_lock(mapping);
-		vma_prio_tree_insert(vma, &mapping->i_mmap);
+		vma_interval_tree_insert(vma, &mapping->i_mmap);
 		flush_dcache_mmap_unlock(mapping);
 		mutex_unlock(&mapping->i_mmap_mutex);
 	}
@@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
 
 		mutex_lock(&mapping->i_mmap_mutex);
 		flush_dcache_mmap_lock(mapping);
-		vma_prio_tree_remove(vma, &mapping->i_mmap);
+		vma_interval_tree_remove(vma, &mapping->i_mmap);
 		flush_dcache_mmap_unlock(mapping);
 		mutex_unlock(&mapping->i_mmap_mutex);
 	}
@@ -2044,7 +2044,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
 				size_t newsize)
 {
 	struct vm_area_struct *vma;
-	struct prio_tree_iter iter;
 	struct vm_region *region;
 	pgoff_t low, high;
 	size_t r_size, r_top;
@@ -2056,8 +2055,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
 	mutex_lock(&inode->i_mapping->i_mmap_mutex);
 
 	/* search for VMAs that fall within the dead zone */
-	vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
-			      low, high) {
+	vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
 		/* found one - only interested if it's shared out of the page
 		 * cache */
 		if (vma->vm_flags & VM_SHARED) {
@@ -2073,8 +2071,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
 	 * we don't check for any regions that start beyond the EOF as there
 	 * shouldn't be any
 	 */
-	vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
-			      0, ULONG_MAX) {
+	vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap,
+				  0, ULONG_MAX) {
 		if (!(vma->vm_flags & VM_SHARED))
 			continue;
 
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
deleted file mode 100644
index 799dcfd7cd8..00000000000
--- a/mm/prio_tree.c
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * mm/prio_tree.c - priority search tree for mapping->i_mmap
- *
- * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
- *
- * This file is released under the GPL v2.
- *
- * Based on the radix priority search tree proposed by Edward M. McCreight
- * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
- *
- * 02Feb2004	Initial version
- */
-
-#include <linux/mm.h>
-#include <linux/prio_tree.h>
-#include <linux/prefetch.h>
-
-/*
- * See lib/prio_tree.c for details on the general radix priority search tree
- * code.
- */
-
-/*
- * The following #defines are mirrored from lib/prio_tree.c. They're only used
- * for debugging, and should be removed (along with the debugging code using
- * them) when switching also VMAs to the regular prio_tree code.
- */
-
-#define RADIX_INDEX(vma)  ((vma)->vm_pgoff)
-#define VMA_SIZE(vma)	  (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
-/* avoid overflow */
-#define HEAP_INDEX(vma)   ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
-
-/*
- * Radix priority search tree for address_space->i_mmap
- *
- * For each vma that map a unique set of file pages i.e., unique [radix_index,
- * heap_index] value, we have a corresponding priority search tree node. If
- * multiple vmas have identical [radix_index, heap_index] value, then one of
- * them is used as a tree node and others are stored in a vm_set list. The tree
- * node points to the first vma (head) of the list using vm_set.head.
- *
- * prio_tree_root
- *      |
- *      A       vm_set.head
- *     / \      /
- *    L   R -> H-I-J-K-M-N-O-P-Q-S
- *    ^   ^    <-- vm_set.list -->
- *  tree nodes
- *
- * We need some way to identify whether a vma is a tree node, head of a vm_set
- * list, or just a member of a vm_set list. We cannot use vm_flags to store
- * such information. The reason is, in the above figure, it is possible that
- * vm_flags' of R and H are covered by the different mmap_sems. When R is
- * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
- * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
- * That's why some trick involving shared.vm_set.parent is used for identifying
- * tree nodes and list head nodes.
- *
- * vma radix priority search tree node rules:
- *
- * vma->shared.vm_set.parent != NULL    ==> a tree node
- *      vma->shared.vm_set.head != NULL ==> list of others mapping same range
- *      vma->shared.vm_set.head == NULL ==> no others map the same range
- *
- * vma->shared.vm_set.parent == NULL
- * 	vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
- * 	vma->shared.vm_set.head == NULL ==> a list node
- */
-
-/*
- * Add a new vma known to map the same set of pages as the old vma:
- * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
- * Note that it just happens to work correctly on i_mmap_nonlinear too.
- */
-void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
-{
-	/* Leave these BUG_ONs till prio_tree patch stabilizes */
-	BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
-	BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
-
-	vma->shared.vm_set.head = NULL;
-	vma->shared.vm_set.parent = NULL;
-
-	if (!old->shared.vm_set.parent)
-		list_add(&vma->shared.vm_set.list,
-				&old->shared.vm_set.list);
-	else if (old->shared.vm_set.head)
-		list_add_tail(&vma->shared.vm_set.list,
-				&old->shared.vm_set.head->shared.vm_set.list);
-	else {
-		INIT_LIST_HEAD(&vma->shared.vm_set.list);
-		vma->shared.vm_set.head = old;
-		old->shared.vm_set.head = vma;
-	}
-}
-
-void vma_prio_tree_insert(struct vm_area_struct *vma,
-			  struct prio_tree_root *root)
-{
-	struct prio_tree_node *ptr;
-	struct vm_area_struct *old;
-
-	vma->shared.vm_set.head = NULL;
-
-	ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node);
-	if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) {
-		old = prio_tree_entry(ptr, struct vm_area_struct,
-					shared.prio_tree_node);
-		vma_prio_tree_add(vma, old);
-	}
-}
-
-void vma_prio_tree_remove(struct vm_area_struct *vma,
-			  struct prio_tree_root *root)
-{
-	struct vm_area_struct *node, *head, *new_head;
-
-	if (!vma->shared.vm_set.head) {
-		if (!vma->shared.vm_set.parent)
-			list_del_init(&vma->shared.vm_set.list);
-		else
-			raw_prio_tree_remove(root, &vma->shared.prio_tree_node);
-	} else {
-		/* Leave this BUG_ON till prio_tree patch stabilizes */
-		BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
-		if (vma->shared.vm_set.parent) {
-			head = vma->shared.vm_set.head;
-			if (!list_empty(&head->shared.vm_set.list)) {
-				new_head = list_entry(
-					head->shared.vm_set.list.next,
-					struct vm_area_struct,
-					shared.vm_set.list);
-				list_del_init(&head->shared.vm_set.list);
-			} else
-				new_head = NULL;
-
-			raw_prio_tree_replace(root, &vma->shared.prio_tree_node,
-					&head->shared.prio_tree_node);
-			head->shared.vm_set.head = new_head;
-			if (new_head)
-				new_head->shared.vm_set.head = head;
-
-		} else {
-			node = vma->shared.vm_set.head;
-			if (!list_empty(&vma->shared.vm_set.list)) {
-				new_head = list_entry(
-					vma->shared.vm_set.list.next,
-					struct vm_area_struct,
-					shared.vm_set.list);
-				list_del_init(&vma->shared.vm_set.list);
-				node->shared.vm_set.head = new_head;
-				new_head->shared.vm_set.head = node;
-			} else
-				node->shared.vm_set.head = NULL;
-		}
-	}
-}
-
-/*
- * Helper function to enumerate vmas that map a given file page or a set of
- * contiguous file pages. The function returns vmas that at least map a single
- * page in the given range of contiguous file pages.
- */
-struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
-					struct prio_tree_iter *iter)
-{
-	struct prio_tree_node *ptr;
-	struct vm_area_struct *next;
-
-	if (!vma) {
-		/*
-		 * First call is with NULL vma
-		 */
-		ptr = prio_tree_next(iter);
-		if (ptr) {
-			next = prio_tree_entry(ptr, struct vm_area_struct,
-						shared.prio_tree_node);
-			prefetch(next->shared.vm_set.head);
-			return next;
-		} else
-			return NULL;
-	}
-
-	if (vma->shared.vm_set.parent) {
-		if (vma->shared.vm_set.head) {
-			next = vma->shared.vm_set.head;
-			prefetch(next->shared.vm_set.list.next);
-			return next;
-		}
-	} else {
-		next = list_entry(vma->shared.vm_set.list.next,
-				struct vm_area_struct, shared.vm_set.list);
-		if (!next->shared.vm_set.head) {
-			prefetch(next->shared.vm_set.list.next);
-			return next;
-		}
-	}
-
-	ptr = prio_tree_next(iter);
-	if (ptr) {
-		next = prio_tree_entry(ptr, struct vm_area_struct,
-					shared.prio_tree_node);
-		prefetch(next->shared.vm_set.head);
-		return next;
-	} else
-		return NULL;
-}
diff --git a/mm/rmap.c b/mm/rmap.c
index 0f3b7cda2a2..7b5b51d25fc 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -820,7 +820,6 @@ static int page_referenced_file(struct page *page,
 	struct address_space *mapping = page->mapping;
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	struct vm_area_struct *vma;
-	struct prio_tree_iter iter;
 	int referenced = 0;
 
 	/*
@@ -846,7 +845,7 @@ static int page_referenced_file(struct page *page,
 	 */
 	mapcount = page_mapcount(page);
 
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		unsigned long address = vma_address(page, vma);
 		if (address == -EFAULT)
 			continue;
@@ -945,13 +944,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
 {
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	struct vm_area_struct *vma;
-	struct prio_tree_iter iter;
 	int ret = 0;
 
 	BUG_ON(PageAnon(page));
 
 	mutex_lock(&mapping->i_mmap_mutex);
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		if (vma->vm_flags & VM_SHARED) {
 			unsigned long address = vma_address(page, vma);
 			if (address == -EFAULT)
@@ -1547,7 +1545,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
 	struct address_space *mapping = page->mapping;
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	struct vm_area_struct *vma;
-	struct prio_tree_iter iter;
 	int ret = SWAP_AGAIN;
 	unsigned long cursor;
 	unsigned long max_nl_cursor = 0;
@@ -1555,7 +1552,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
 	unsigned int mapcount;
 
 	mutex_lock(&mapping->i_mmap_mutex);
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		unsigned long address = vma_address(page, vma);
 		if (address == -EFAULT)
 			continue;
@@ -1576,7 +1573,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
 		goto out;
 
 	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
-						shared.vm_set.list) {
+							shared.nonlinear) {
 		cursor = (unsigned long) vma->vm_private_data;
 		if (cursor > max_nl_cursor)
 			max_nl_cursor = cursor;
@@ -1608,7 +1605,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
 
 	do {
 		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
-						shared.vm_set.list) {
+							shared.nonlinear) {
 			cursor = (unsigned long) vma->vm_private_data;
 			while ( cursor < max_nl_cursor &&
 				cursor < vma->vm_end - vma->vm_start) {
@@ -1631,7 +1628,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
 	 * in locked vmas).  Reset cursor on all unreserved nonlinear
 	 * vmas, now forgetting on which ones it had fallen behind.
 	 */
-	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
 		vma->vm_private_data = NULL;
 out:
 	mutex_unlock(&mapping->i_mmap_mutex);
@@ -1748,13 +1745,12 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
 	struct address_space *mapping = page->mapping;
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	struct vm_area_struct *vma;
-	struct prio_tree_iter iter;
 	int ret = SWAP_AGAIN;
 
 	if (!mapping)
 		return ret;
 	mutex_lock(&mapping->i_mmap_mutex);
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		unsigned long address = vma_address(page, vma);
 		if (address == -EFAULT)
 			continue;
-- 
cgit v1.2.3


From 85d3a316c714197f94e75c1e5b2d37607d66e5de Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Mon, 8 Oct 2012 16:31:27 -0700
Subject: kmemleak: use rbtree instead of prio tree

kmemleak uses a tree where each node represents an allocated memory object
in order to quickly find out what object a given address is part of.
However, the objects don't overlap, so rbtrees are a better choice than
prio tree for this use.  They are both faster and have lower memory
overhead.

Tested by booting a kernel with kmemleak enabled, loading the
kmemleak_test module, and looking for the expected messages.

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Tested-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/kmemleak.c | 100 ++++++++++++++++++++++++++++++----------------------------
 1 file changed, 51 insertions(+), 49 deletions(-)

(limited to 'mm')

diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 0de83b4541e..a217cc54406 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -29,7 +29,7 @@
  * - kmemleak_lock (rwlock): protects the object_list modifications and
  *   accesses to the object_tree_root. The object_list is the main list
  *   holding the metadata (struct kmemleak_object) for the allocated memory
- *   blocks. The object_tree_root is a priority search tree used to look-up
+ *   blocks. The object_tree_root is a red black tree used to look-up
  *   metadata based on a pointer to the corresponding memory block.  The
  *   kmemleak_object structures are added to the object_list and
  *   object_tree_root in the create_object() function called from the
@@ -71,7 +71,7 @@
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/kthread.h>
-#include <linux/prio_tree.h>
+#include <linux/rbtree.h>
 #include <linux/fs.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
@@ -132,7 +132,7 @@ struct kmemleak_scan_area {
  * Structure holding the metadata for each allocated memory block.
  * Modifications to such objects should be made while holding the
  * object->lock. Insertions or deletions from object_list, gray_list or
- * tree_node are already protected by the corresponding locks or mutex (see
+ * rb_node are already protected by the corresponding locks or mutex (see
  * the notes on locking above). These objects are reference-counted
  * (use_count) and freed using the RCU mechanism.
  */
@@ -141,7 +141,7 @@ struct kmemleak_object {
 	unsigned long flags;		/* object status flags */
 	struct list_head object_list;
 	struct list_head gray_list;
-	struct prio_tree_node tree_node;
+	struct rb_node rb_node;
 	struct rcu_head rcu;		/* object_list lockless traversal */
 	/* object usage count; object freed when use_count == 0 */
 	atomic_t use_count;
@@ -182,9 +182,9 @@ struct kmemleak_object {
 static LIST_HEAD(object_list);
 /* the list of gray-colored objects (see color_gray comment below) */
 static LIST_HEAD(gray_list);
-/* prio search tree for object boundaries */
-static struct prio_tree_root object_tree_root;
-/* rw_lock protecting the access to object_list and prio_tree_root */
+/* search tree for object boundaries */
+static struct rb_root object_tree_root = RB_ROOT;
+/* rw_lock protecting the access to object_list and object_tree_root */
 static DEFINE_RWLOCK(kmemleak_lock);
 
 /* allocation caches for kmemleak internal data */
@@ -380,7 +380,7 @@ static void dump_object_info(struct kmemleak_object *object)
 	trace.entries = object->trace;
 
 	pr_notice("Object 0x%08lx (size %zu):\n",
-		  object->tree_node.start, object->size);
+		  object->pointer, object->size);
 	pr_notice("  comm \"%s\", pid %d, jiffies %lu\n",
 		  object->comm, object->pid, object->jiffies);
 	pr_notice("  min_count = %d\n", object->min_count);
@@ -392,32 +392,32 @@ static void dump_object_info(struct kmemleak_object *object)
 }
 
 /*
- * Look-up a memory block metadata (kmemleak_object) in the priority search
+ * Look-up a memory block metadata (kmemleak_object) in the object search
  * tree based on a pointer value. If alias is 0, only values pointing to the
  * beginning of the memory block are allowed. The kmemleak_lock must be held
  * when calling this function.
  */
 static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
 {
-	struct prio_tree_node *node;
-	struct prio_tree_iter iter;
-	struct kmemleak_object *object;
-
-	prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr);
-	node = prio_tree_next(&iter);
-	if (node) {
-		object = prio_tree_entry(node, struct kmemleak_object,
-					 tree_node);
-		if (!alias && object->pointer != ptr) {
+	struct rb_node *rb = object_tree_root.rb_node;
+
+	while (rb) {
+		struct kmemleak_object *object =
+			rb_entry(rb, struct kmemleak_object, rb_node);
+		if (ptr < object->pointer)
+			rb = object->rb_node.rb_left;
+		else if (object->pointer + object->size <= ptr)
+			rb = object->rb_node.rb_right;
+		else if (object->pointer == ptr || alias)
+			return object;
+		else {
 			kmemleak_warn("Found object by alias at 0x%08lx\n",
 				      ptr);
 			dump_object_info(object);
-			object = NULL;
+			break;
 		}
-	} else
-		object = NULL;
-
-	return object;
+	}
+	return NULL;
 }
 
 /*
@@ -471,7 +471,7 @@ static void put_object(struct kmemleak_object *object)
 }
 
 /*
- * Look up an object in the prio search tree and increase its use_count.
+ * Look up an object in the object search tree and increase its use_count.
  */
 static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
 {
@@ -516,8 +516,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
 					     int min_count, gfp_t gfp)
 {
 	unsigned long flags;
-	struct kmemleak_object *object;
-	struct prio_tree_node *node;
+	struct kmemleak_object *object, *parent;
+	struct rb_node **link, *rb_parent;
 
 	object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
 	if (!object) {
@@ -560,31 +560,34 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
 	/* kernel backtrace */
 	object->trace_len = __save_stack_trace(object->trace);
 
-	INIT_PRIO_TREE_NODE(&object->tree_node);
-	object->tree_node.start = ptr;
-	object->tree_node.last = ptr + size - 1;
-
 	write_lock_irqsave(&kmemleak_lock, flags);
 
 	min_addr = min(min_addr, ptr);
 	max_addr = max(max_addr, ptr + size);
-	node = prio_tree_insert(&object_tree_root, &object->tree_node);
-	/*
-	 * The code calling the kernel does not yet have the pointer to the
-	 * memory block to be able to free it.  However, we still hold the
-	 * kmemleak_lock here in case parts of the kernel started freeing
-	 * random memory blocks.
-	 */
-	if (node != &object->tree_node) {
-		kmemleak_stop("Cannot insert 0x%lx into the object search tree "
-			      "(already existing)\n", ptr);
-		object = lookup_object(ptr, 1);
-		spin_lock(&object->lock);
-		dump_object_info(object);
-		spin_unlock(&object->lock);
-
-		goto out;
+	link = &object_tree_root.rb_node;
+	rb_parent = NULL;
+	while (*link) {
+		rb_parent = *link;
+		parent = rb_entry(rb_parent, struct kmemleak_object, rb_node);
+		if (ptr + size <= parent->pointer)
+			link = &parent->rb_node.rb_left;
+		else if (parent->pointer + parent->size <= ptr)
+			link = &parent->rb_node.rb_right;
+		else {
+			kmemleak_stop("Cannot insert 0x%lx into the object "
+				      "search tree (overlaps existing)\n",
+				      ptr);
+			kmem_cache_free(object_cache, object);
+			object = parent;
+			spin_lock(&object->lock);
+			dump_object_info(object);
+			spin_unlock(&object->lock);
+			goto out;
+		}
 	}
+	rb_link_node(&object->rb_node, rb_parent, link);
+	rb_insert_color(&object->rb_node, &object_tree_root);
+
 	list_add_tail_rcu(&object->object_list, &object_list);
 out:
 	write_unlock_irqrestore(&kmemleak_lock, flags);
@@ -600,7 +603,7 @@ static void __delete_object(struct kmemleak_object *object)
 	unsigned long flags;
 
 	write_lock_irqsave(&kmemleak_lock, flags);
-	prio_tree_remove(&object_tree_root, &object->tree_node);
+	rb_erase(&object->rb_node, &object_tree_root);
 	list_del_rcu(&object->object_list);
 	write_unlock_irqrestore(&kmemleak_lock, flags);
 
@@ -1766,7 +1769,6 @@ void __init kmemleak_init(void)
 
 	object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
 	scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
-	INIT_PRIO_TREE_ROOT(&object_tree_root);
 
 	if (crt_early_log >= ARRAY_SIZE(early_log))
 		pr_warning("Early log buffer exceeded (%d), please increase "
-- 
cgit v1.2.3


From 9826a516ff77c5820e591211e4f3e58ff36f46be Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Mon, 8 Oct 2012 16:31:35 -0700
Subject: mm: interval tree updates

Update the generic interval tree code that was introduced in "mm: replace
vma prio_tree with an interval tree".

Changes:

- fixed 'endpoing' typo noticed by Andrew Morton

- replaced include/linux/interval_tree_tmpl.h, which was used as a
  template (including it automatically defined the interval tree
  functions) with include/linux/interval_tree_generic.h, which only
  defines a preprocessor macro INTERVAL_TREE_DEFINE(), which itself
  defines the interval tree functions when invoked. Now that is a very
  long macro which is unfortunate, but it does make the usage sites
  (lib/interval_tree.c and mm/interval_tree.c) a bit nicer than previously.

- make use of RB_DECLARE_CALLBACKS() in the INTERVAL_TREE_DEFINE() macro,
  instead of duplicating that code in the interval tree template.

- replaced vma_interval_tree_add(), which was actually handling the
  nonlinear and interval tree cases, with vma_interval_tree_insert_after()
  which handles only the interval tree case and has an API that is more
  consistent with the other interval tree handling functions.
  The nonlinear case is now handled explicitly in kernel/fork.c dup_mmap().

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/interval_tree.c | 60 ++++++++++++++++++++++++++----------------------------
 1 file changed, 29 insertions(+), 31 deletions(-)

(limited to 'mm')

diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index 7dc565660e5..4ab7b9ec3a5 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -8,40 +8,38 @@
 
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/interval_tree_generic.h>
 
-#define ITSTRUCT   struct vm_area_struct
-#define ITRB       shared.linear.rb
-#define ITTYPE     unsigned long
-#define ITSUBTREE  shared.linear.rb_subtree_last
-#define ITSTART(n) ((n)->vm_pgoff)
-#define ITLAST(n)  ((n)->vm_pgoff + \
-		    (((n)->vm_end - (n)->vm_start) >> PAGE_SHIFT) - 1)
-#define ITSTATIC
-#define ITPREFIX   vma_interval_tree
-
-#include <linux/interval_tree_tmpl.h>
-
-/* Insert old immediately after vma in the interval tree */
-void vma_interval_tree_add(struct vm_area_struct *vma,
-			   struct vm_area_struct *old,
-			   struct address_space *mapping)
+static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
+{
+	return v->vm_pgoff;
+}
+
+static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
+{
+	return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
+}
+
+INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb,
+		     unsigned long, shared.linear.rb_subtree_last,
+		     vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
+
+/* Insert node immediately after prev in the interval tree */
+void vma_interval_tree_insert_after(struct vm_area_struct *node,
+				    struct vm_area_struct *prev,
+				    struct rb_root *root)
 {
 	struct rb_node **link;
 	struct vm_area_struct *parent;
-	unsigned long last;
-
-	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
-		list_add(&vma->shared.nonlinear, &old->shared.nonlinear);
-		return;
-	}
+	unsigned long last = vma_last_pgoff(node);
 
-	last = ITLAST(vma);
+	VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev));
 
-	if (!old->shared.linear.rb.rb_right) {
-		parent = old;
-		link = &old->shared.linear.rb.rb_right;
+	if (!prev->shared.linear.rb.rb_right) {
+		parent = prev;
+		link = &prev->shared.linear.rb.rb_right;
 	} else {
-		parent = rb_entry(old->shared.linear.rb.rb_right,
+		parent = rb_entry(prev->shared.linear.rb.rb_right,
 				  struct vm_area_struct, shared.linear.rb);
 		if (parent->shared.linear.rb_subtree_last < last)
 			parent->shared.linear.rb_subtree_last = last;
@@ -54,8 +52,8 @@ void vma_interval_tree_add(struct vm_area_struct *vma,
 		link = &parent->shared.linear.rb.rb_left;
 	}
 
-	vma->shared.linear.rb_subtree_last = last;
-	rb_link_node(&vma->shared.linear.rb, &parent->shared.linear.rb, link);
-	rb_insert_augmented(&vma->shared.linear.rb, &mapping->i_mmap,
-			    &vma_interval_tree_augment_callbacks);
+	node->shared.linear.rb_subtree_last = last;
+	rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link);
+	rb_insert_augmented(&node->shared.linear.rb, root,
+			    &vma_interval_tree_augment);
 }
-- 
cgit v1.2.3


From 108d6642ad81bb1d62b401490a334d2c12397517 Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Mon, 8 Oct 2012 16:31:36 -0700
Subject: mm anon rmap: remove anon_vma_moveto_tail

mremap() had a clever optimization where move_ptes() did not take the
anon_vma lock to avoid a race with anon rmap users such as page migration.
 Instead, the avc's were ordered in such a way that the origin vma was
always visited by rmap before the destination.  This ordering and the use
of page table locks rmap usage safe.  However, we want to replace the use
of linked lists in anon rmap with an interval tree, and this will make it
harder to impose such ordering as the interval tree will always be sorted
by the avc->vma->vm_pgoff value.  For now, let's replace the
anon_vma_moveto_tail() ordering function with proper anon_vma locking in
move_ptes().  Once we have the anon interval tree in place, we will
re-introduce an optimization to avoid taking these locks in the most
common cases.

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c   |  3 +--
 mm/mremap.c | 14 +++++---------
 mm/rmap.c   | 45 ---------------------------------------------
 3 files changed, 6 insertions(+), 56 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index 5ac533f88e9..66984aab791 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2378,8 +2378,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			 */
 			VM_BUG_ON(faulted_in_anon_vma);
 			*vmap = new_vma;
-		} else
-			anon_vma_moveto_tail(new_vma);
+		}
 	} else {
 		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (new_vma) {
diff --git a/mm/mremap.c b/mm/mremap.c
index cc06d0e48d0..5588bb6e929 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -74,6 +74,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		unsigned long new_addr)
 {
 	struct address_space *mapping = NULL;
+	struct anon_vma *anon_vma = vma->anon_vma;
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
@@ -88,6 +89,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		mapping = vma->vm_file->f_mapping;
 		mutex_lock(&mapping->i_mmap_mutex);
 	}
+	if (anon_vma)
+		anon_vma_lock(anon_vma);
 
 	/*
 	 * We don't have to worry about the ordering of src and dst
@@ -114,6 +117,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		spin_unlock(new_ptl);
 	pte_unmap(new_pte - 1);
 	pte_unmap_unlock(old_pte - 1, old_ptl);
+	if (anon_vma)
+		anon_vma_unlock(anon_vma);
 	if (mapping)
 		mutex_unlock(&mapping->i_mmap_mutex);
 }
@@ -220,15 +225,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 
 	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
 	if (moved_len < old_len) {
-		/*
-		 * Before moving the page tables from the new vma to
-		 * the old vma, we need to be sure the old vma is
-		 * queued after new vma in the same_anon_vma list to
-		 * prevent SMP races with rmap_walk (that could lead
-		 * rmap_walk to miss some page table).
-		 */
-		anon_vma_moveto_tail(vma);
-
 		/*
 		 * On error, move entries back from new area to old,
 		 * which will succeed since page tables still there,
diff --git a/mm/rmap.c b/mm/rmap.c
index 7b5b51d25fc..8cbd62fde0f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -268,51 +268,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 	return -ENOMEM;
 }
 
-/*
- * Some rmap walk that needs to find all ptes/hugepmds without false
- * negatives (like migrate and split_huge_page) running concurrent
- * with operations that copy or move pagetables (like mremap() and
- * fork()) to be safe. They depend on the anon_vma "same_anon_vma"
- * list to be in a certain order: the dst_vma must be placed after the
- * src_vma in the list. This is always guaranteed by fork() but
- * mremap() needs to call this function to enforce it in case the
- * dst_vma isn't newly allocated and chained with the anon_vma_clone()
- * function but just an extension of a pre-existing vma through
- * vma_merge.
- *
- * NOTE: the same_anon_vma list can still be changed by other
- * processes while mremap runs because mremap doesn't hold the
- * anon_vma mutex to prevent modifications to the list while it
- * runs. All we need to enforce is that the relative order of this
- * process vmas isn't changing (we don't care about other vmas
- * order). Each vma corresponds to an anon_vma_chain structure so
- * there's no risk that other processes calling anon_vma_moveto_tail()
- * and changing the same_anon_vma list under mremap() will screw with
- * the relative order of this process vmas in the list, because we
- * they can't alter the order of any vma that belongs to this
- * process. And there can't be another anon_vma_moveto_tail() running
- * concurrently with mremap() coming from this process because we hold
- * the mmap_sem for the whole mremap(). fork() ordering dependency
- * also shouldn't be affected because fork() only cares that the
- * parent vmas are placed in the list before the child vmas and
- * anon_vma_moveto_tail() won't reorder vmas from either the fork()
- * parent or child.
- */
-void anon_vma_moveto_tail(struct vm_area_struct *dst)
-{
-	struct anon_vma_chain *pavc;
-	struct anon_vma *root = NULL;
-
-	list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) {
-		struct anon_vma *anon_vma = pavc->anon_vma;
-		VM_BUG_ON(pavc->vma != dst);
-		root = lock_anon_vma_root(root, anon_vma);
-		list_del(&pavc->same_anon_vma);
-		list_add_tail(&pavc->same_anon_vma, &anon_vma->head);
-	}
-	unlock_anon_vma_root(root);
-}
-
 /*
  * Attach vma to its own anon_vma, as well as to the anon_vmas that
  * the corresponding VMA in the parent process is attached to.
-- 
cgit v1.2.3


From bf181b9f9d8dfbba58b23441ad60d0bc33806d64 Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Mon, 8 Oct 2012 16:31:39 -0700
Subject: mm anon rmap: replace same_anon_vma linked list with an interval
 tree.

When a large VMA (anon or private file mapping) is first touched, which
will populate its anon_vma field, and then split into many regions through
the use of mprotect(), the original anon_vma ends up linking all of the
vmas on a linked list.  This can cause rmap to become inefficient, as we
have to walk potentially thousands of irrelevent vmas before finding the
one a given anon page might fall into.

By replacing the same_anon_vma linked list with an interval tree (where
each avc's interval is determined by its vma's start and last pgoffs), we
can make rmap efficient for this use case again.

While the change is large, all of its pieces are fairly simple.

Most places that were walking the same_anon_vma list were looking for a
known pgoff, so they can just use the anon_vma_interval_tree_foreach()
interval tree iterator instead.  The exception here is ksm, where the
page's index is not known.  It would probably be possible to rework ksm so
that the index would be known, but for now I have decided to keep things
simple and just walk the entirety of the interval tree there.

When updating vma's that already have an anon_vma assigned, we must take
care to re-index the corresponding avc's on their interval tree.  This is
done through the use of anon_vma_interval_tree_pre_update_vma() and
anon_vma_interval_tree_post_update_vma(), which remove the avc's from
their interval tree before the update and re-insert them after the update.
 The anon_vma stays locked during the update, so there is no chance that
rmap would miss the vmas that are being updated.

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c    |  5 ++--
 mm/interval_tree.c  | 14 ++++++++++
 mm/ksm.c            |  9 ++++---
 mm/memory-failure.c |  5 +++-
 mm/mmap.c           | 73 ++++++++++++++++++++++++++++++++++++++++-------------
 mm/rmap.c           | 24 +++++++++---------
 6 files changed, 94 insertions(+), 36 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 010d32944d1..ce59ada0946 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1375,13 +1375,14 @@ static void __split_huge_page(struct page *page,
 			      struct anon_vma *anon_vma)
 {
 	int mapcount, mapcount2;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	struct anon_vma_chain *avc;
 
 	BUG_ON(!PageHead(page));
 	BUG_ON(PageTail(page));
 
 	mapcount = 0;
-	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long addr = vma_address(page, vma);
 		BUG_ON(is_vma_temporary_stack(vma));
@@ -1407,7 +1408,7 @@ static void __split_huge_page(struct page *page,
 	__split_huge_page_refcount(page);
 
 	mapcount2 = 0;
-	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long addr = vma_address(page, vma);
 		BUG_ON(is_vma_temporary_stack(vma));
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index 4ab7b9ec3a5..f7c72cd35e1 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -8,6 +8,7 @@
 
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/rmap.h>
 #include <linux/interval_tree_generic.h>
 
 static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
@@ -57,3 +58,16 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
 	rb_insert_augmented(&node->shared.linear.rb, root,
 			    &vma_interval_tree_augment);
 }
+
+static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
+{
+	return vma_start_pgoff(avc->vma);
+}
+
+static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
+{
+	return vma_last_pgoff(avc->vma);
+}
+
+INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
+		     avc_start_pgoff, avc_last_pgoff,, anon_vma_interval_tree)
diff --git a/mm/ksm.c b/mm/ksm.c
index 9638620a753..14ee5cf8a51 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1618,7 +1618,8 @@ again:
 		struct vm_area_struct *vma;
 
 		anon_vma_lock(anon_vma);
-		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+					       0, ULONG_MAX) {
 			vma = vmac->vma;
 			if (rmap_item->address < vma->vm_start ||
 			    rmap_item->address >= vma->vm_end)
@@ -1671,7 +1672,8 @@ again:
 		struct vm_area_struct *vma;
 
 		anon_vma_lock(anon_vma);
-		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+					       0, ULONG_MAX) {
 			vma = vmac->vma;
 			if (rmap_item->address < vma->vm_start ||
 			    rmap_item->address >= vma->vm_end)
@@ -1723,7 +1725,8 @@ again:
 		struct vm_area_struct *vma;
 
 		anon_vma_lock(anon_vma);
-		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+					       0, ULONG_MAX) {
 			vma = vmac->vma;
 			if (rmap_item->address < vma->vm_start ||
 			    rmap_item->address >= vma->vm_end)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c38a6257d08..6c5899b9034 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
 	struct anon_vma *av;
+	pgoff_t pgoff;
 
 	av = page_lock_anon_vma(page);
 	if (av == NULL)	/* Not actually mapped anymore */
 		return;
 
+	pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	read_lock(&tasklist_lock);
 	for_each_process (tsk) {
 		struct anon_vma_chain *vmac;
 
 		if (!task_early_kill(tsk))
 			continue;
-		list_for_each_entry(vmac, &av->head, same_anon_vma) {
+		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
+					       pgoff, pgoff) {
 			vma = vmac->vma;
 			if (!page_mapped_in_vma(page, vma))
 				continue;
diff --git a/mm/mmap.c b/mm/mmap.c
index 66984aab791..2e580ed7921 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -353,6 +353,38 @@ void validate_mm(struct mm_struct *mm)
 #define validate_mm(mm) do { } while (0)
 #endif
 
+/*
+ * vma has some anon_vma assigned, and is already inserted on that
+ * anon_vma's interval trees.
+ *
+ * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
+ * vma must be removed from the anon_vma's interval trees using
+ * anon_vma_interval_tree_pre_update_vma().
+ *
+ * After the update, the vma will be reinserted using
+ * anon_vma_interval_tree_post_update_vma().
+ *
+ * The entire update must be protected by exclusive mmap_sem and by
+ * the root anon_vma's mutex.
+ */
+static inline void
+anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
+{
+	struct anon_vma_chain *avc;
+
+	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+		anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
+}
+
+static inline void
+anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
+{
+	struct anon_vma_chain *avc;
+
+	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+		anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
+}
+
 static int find_vma_links(struct mm_struct *mm, unsigned long addr,
 		unsigned long end, struct vm_area_struct **pprev,
 		struct rb_node ***rb_link, struct rb_node **rb_parent)
@@ -565,20 +597,17 @@ again:			remove_next = 1 + (end > next->vm_end);
 
 	vma_adjust_trans_huge(vma, start, end, adjust_next);
 
-	/*
-	 * When changing only vma->vm_end, we don't really need anon_vma
-	 * lock. This is a fairly rare case by itself, but the anon_vma
-	 * lock may be shared between many sibling processes.  Skipping
-	 * the lock for brk adjustments makes a difference sometimes.
-	 */
-	if (vma->anon_vma && (importer || start != vma->vm_start)) {
-		anon_vma = vma->anon_vma;
+	anon_vma = vma->anon_vma;
+	if (!anon_vma && adjust_next)
+		anon_vma = next->anon_vma;
+	if (anon_vma) {
 		VM_BUG_ON(adjust_next && next->anon_vma &&
 			  anon_vma != next->anon_vma);
-	} else if (adjust_next && next->anon_vma)
-		anon_vma = next->anon_vma;
-	if (anon_vma)
 		anon_vma_lock(anon_vma);
+		anon_vma_interval_tree_pre_update_vma(vma);
+		if (adjust_next)
+			anon_vma_interval_tree_pre_update_vma(next);
+	}
 
 	if (root) {
 		flush_dcache_mmap_lock(mapping);
@@ -619,8 +648,12 @@ again:			remove_next = 1 + (end > next->vm_end);
 		__insert_vm_struct(mm, insert);
 	}
 
-	if (anon_vma)
+	if (anon_vma) {
+		anon_vma_interval_tree_post_update_vma(vma);
+		if (adjust_next)
+			anon_vma_interval_tree_post_update_vma(next);
 		anon_vma_unlock(anon_vma);
+	}
 	if (mapping)
 		mutex_unlock(&mapping->i_mmap_mutex);
 
@@ -1748,7 +1781,9 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 		if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
 			error = acct_stack_growth(vma, size, grow);
 			if (!error) {
+				anon_vma_interval_tree_pre_update_vma(vma);
 				vma->vm_end = address;
+				anon_vma_interval_tree_post_update_vma(vma);
 				perf_event_mmap(vma);
 			}
 		}
@@ -1798,8 +1833,10 @@ int expand_downwards(struct vm_area_struct *vma,
 		if (grow <= vma->vm_pgoff) {
 			error = acct_stack_growth(vma, size, grow);
 			if (!error) {
+				anon_vma_interval_tree_pre_update_vma(vma);
 				vma->vm_start = address;
 				vma->vm_pgoff -= grow;
+				anon_vma_interval_tree_post_update_vma(vma);
 				perf_event_mmap(vma);
 			}
 		}
@@ -2515,7 +2552,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
 
 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
 {
-	if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
+	if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
 		/*
 		 * The LSB of head.next can't change from under us
 		 * because we hold the mm_all_locks_mutex.
@@ -2531,7 +2568,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
 		 * anon_vma->root->mutex.
 		 */
 		if (__test_and_set_bit(0, (unsigned long *)
-				       &anon_vma->root->head.next))
+				       &anon_vma->root->rb_root.rb_node))
 			BUG();
 	}
 }
@@ -2572,7 +2609,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
  * A single task can't take more than one mm_take_all_locks() in a row
  * or it would deadlock.
  *
- * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
+ * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
  * mapping->flags avoid to take the same lock twice, if more than one
  * vma in this mm is backed by the same anon_vma or address_space.
  *
@@ -2619,13 +2656,13 @@ out_unlock:
 
 static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
 {
-	if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
+	if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
 		/*
 		 * The LSB of head.next can't change to 0 from under
 		 * us because we hold the mm_all_locks_mutex.
 		 *
 		 * We must however clear the bitflag before unlocking
-		 * the vma so the users using the anon_vma->head will
+		 * the vma so the users using the anon_vma->rb_root will
 		 * never see our bitflag.
 		 *
 		 * No need of atomic instructions here, head.next
@@ -2633,7 +2670,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
 		 * anon_vma->root->mutex.
 		 */
 		if (!__test_and_clear_bit(0, (unsigned long *)
-					  &anon_vma->root->head.next))
+					  &anon_vma->root->rb_root.rb_node))
 			BUG();
 		anon_vma_unlock(anon_vma);
 	}
diff --git a/mm/rmap.c b/mm/rmap.c
index 8cbd62fde0f..9c61bf387fd 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -127,12 +127,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
 	avc->vma = vma;
 	avc->anon_vma = anon_vma;
 	list_add(&avc->same_vma, &vma->anon_vma_chain);
-
-	/*
-	 * It's critical to add new vmas to the tail of the anon_vma,
-	 * see comment in huge_memory.c:__split_huge_page().
-	 */
-	list_add_tail(&avc->same_anon_vma, &anon_vma->head);
+	anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
 }
 
 /**
@@ -336,13 +331,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
 		struct anon_vma *anon_vma = avc->anon_vma;
 
 		root = lock_anon_vma_root(root, anon_vma);
-		list_del(&avc->same_anon_vma);
+		anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
 
 		/*
 		 * Leave empty anon_vmas on the list - we'll need
 		 * to free them outside the lock.
 		 */
-		if (list_empty(&anon_vma->head))
+		if (RB_EMPTY_ROOT(&anon_vma->rb_root))
 			continue;
 
 		list_del(&avc->same_vma);
@@ -371,7 +366,7 @@ static void anon_vma_ctor(void *data)
 
 	mutex_init(&anon_vma->mutex);
 	atomic_set(&anon_vma->refcount, 0);
-	INIT_LIST_HEAD(&anon_vma->head);
+	anon_vma->rb_root = RB_ROOT;
 }
 
 void __init anon_vma_init(void)
@@ -724,6 +719,7 @@ static int page_referenced_anon(struct page *page,
 {
 	unsigned int mapcount;
 	struct anon_vma *anon_vma;
+	pgoff_t pgoff;
 	struct anon_vma_chain *avc;
 	int referenced = 0;
 
@@ -732,7 +728,8 @@ static int page_referenced_anon(struct page *page,
 		return referenced;
 
 	mapcount = page_mapcount(page);
-	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+	pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);
 		if (address == -EFAULT)
@@ -1445,6 +1442,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma)
 static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 {
 	struct anon_vma *anon_vma;
+	pgoff_t pgoff;
 	struct anon_vma_chain *avc;
 	int ret = SWAP_AGAIN;
 
@@ -1452,7 +1450,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 	if (!anon_vma)
 		return ret;
 
-	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+	pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long address;
 
@@ -1668,6 +1667,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 		struct vm_area_struct *, unsigned long, void *), void *arg)
 {
 	struct anon_vma *anon_vma;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	struct anon_vma_chain *avc;
 	int ret = SWAP_AGAIN;
 
@@ -1681,7 +1681,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 	if (!anon_vma)
 		return ret;
 	anon_vma_lock(anon_vma);
-	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);
 		if (address == -EFAULT)
-- 
cgit v1.2.3


From 86c2ad19956f84f2191e062fcb979367b6365871 Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Mon, 8 Oct 2012 16:31:42 -0700
Subject: mm rmap: remove vma_address check for address inside vma

In file and anon rmap, we use interval trees to find potentially relevant
vmas and then call vma_address() to find the virtual address the given
page might be found at in these vmas.  vma_address() used to include a
check that the returned address falls within the limits of the vma, but
this check isn't necessary now that we always use interval trees in rmap:
the interval tree just doesn't return any vmas which this check would find
to be irrelevant.  As a result, we can replace the use of -EFAULT error
code (which then needed to be checked in every call site) with a
VM_BUG_ON().

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c |  4 ----
 mm/rmap.c        | 48 +++++++++++++++++++++---------------------------
 2 files changed, 21 insertions(+), 31 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ce59ada0946..7cf8b0ec11e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1386,8 +1386,6 @@ static void __split_huge_page(struct page *page,
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long addr = vma_address(page, vma);
 		BUG_ON(is_vma_temporary_stack(vma));
-		if (addr == -EFAULT)
-			continue;
 		mapcount += __split_huge_page_splitting(page, vma, addr);
 	}
 	/*
@@ -1412,8 +1410,6 @@ static void __split_huge_page(struct page *page,
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long addr = vma_address(page, vma);
 		BUG_ON(is_vma_temporary_stack(vma));
-		if (addr == -EFAULT)
-			continue;
 		mapcount2 += __split_huge_page_map(page, vma, addr);
 	}
 	if (mapcount != mapcount2)
diff --git a/mm/rmap.c b/mm/rmap.c
index 9c61bf387fd..28777412de6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -510,22 +510,26 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
 
 /*
  * At what user virtual address is page expected in @vma?
- * Returns virtual address or -EFAULT if page's index/offset is not
- * within the range mapped the @vma.
  */
-inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+static inline unsigned long
+__vma_address(struct page *page, struct vm_area_struct *vma)
 {
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-	unsigned long address;
 
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		pgoff = page->index << huge_page_order(page_hstate(page));
-	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
-		/* page should be within @vma mapping range */
-		return -EFAULT;
-	}
+
+	return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+}
+
+inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+	unsigned long address = __vma_address(page, vma);
+
+	/* page should be within @vma mapping range */
+	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+
 	return address;
 }
 
@@ -535,6 +539,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
  */
 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 {
+	unsigned long address;
 	if (PageAnon(page)) {
 		struct anon_vma *page__anon_vma = page_anon_vma(page);
 		/*
@@ -550,7 +555,10 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 			return -EFAULT;
 	} else
 		return -EFAULT;
-	return vma_address(page, vma);
+	address = __vma_address(page, vma);
+	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+		return -EFAULT;
+	return address;
 }
 
 /*
@@ -624,8 +632,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	address = vma_address(page, vma);
-	if (address == -EFAULT)		/* out of vma range */
+	address = __vma_address(page, vma);
+	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
 		return 0;
 	pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
 	if (!pte)			/* the page is not in this mm */
@@ -732,8 +740,6 @@ static int page_referenced_anon(struct page *page,
 	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);
-		if (address == -EFAULT)
-			continue;
 		/*
 		 * If we are reclaiming on behalf of a cgroup, skip
 		 * counting on behalf of references from different
@@ -799,8 +805,6 @@ static int page_referenced_file(struct page *page,
 
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		unsigned long address = vma_address(page, vma);
-		if (address == -EFAULT)
-			continue;
 		/*
 		 * If we are reclaiming on behalf of a cgroup, skip
 		 * counting on behalf of references from different
@@ -904,8 +908,6 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		if (vma->vm_flags & VM_SHARED) {
 			unsigned long address = vma_address(page, vma);
-			if (address == -EFAULT)
-				continue;
 			ret += page_mkclean_one(page, vma, address);
 		}
 	}
@@ -1468,8 +1470,6 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 			continue;
 
 		address = vma_address(page, vma);
-		if (address == -EFAULT)
-			continue;
 		ret = try_to_unmap_one(page, vma, address, flags);
 		if (ret != SWAP_AGAIN || !page_mapped(page))
 			break;
@@ -1508,8 +1508,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
 	mutex_lock(&mapping->i_mmap_mutex);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		unsigned long address = vma_address(page, vma);
-		if (address == -EFAULT)
-			continue;
 		ret = try_to_unmap_one(page, vma, address, flags);
 		if (ret != SWAP_AGAIN || !page_mapped(page))
 			goto out;
@@ -1684,8 +1682,6 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);
-		if (address == -EFAULT)
-			continue;
 		ret = rmap_one(page, vma, address, arg);
 		if (ret != SWAP_AGAIN)
 			break;
@@ -1707,8 +1703,6 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
 	mutex_lock(&mapping->i_mmap_mutex);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		unsigned long address = vma_address(page, vma);
-		if (address == -EFAULT)
-			continue;
 		ret = rmap_one(page, vma, address, arg);
 		if (ret != SWAP_AGAIN)
 			break;
-- 
cgit v1.2.3


From ed8ea8150182f8d715fceb3b175ef0a9ebacd872 Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Mon, 8 Oct 2012 16:31:45 -0700
Subject: mm: add CONFIG_DEBUG_VM_RB build option

Add a CONFIG_DEBUG_VM_RB build option for the previously existing
DEBUG_MM_RB code.  Now that Andi Kleen modified it to avoid using
recursive algorithms, we can expose it a bit more.

Also extend this code to validate_mm() after stack expansion, and to check
that the vma's start and last pgoffs have not changed since the nodes were
inserted on the anon vma interval tree (as it is important that the nodes
be reindexed after each such update).

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/interval_tree.c | 41 ++++++++++++++++++++++++++++++++++++++++-
 mm/mmap.c          | 19 +++++++++----------
 2 files changed, 49 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index f7c72cd35e1..4a5822a586e 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -70,4 +70,43 @@ static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
 }
 
 INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
-		     avc_start_pgoff, avc_last_pgoff,, anon_vma_interval_tree)
+		     avc_start_pgoff, avc_last_pgoff,
+		     static inline, __anon_vma_interval_tree)
+
+void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
+				   struct rb_root *root)
+{
+#ifdef CONFIG_DEBUG_VM_RB
+	node->cached_vma_start = avc_start_pgoff(node);
+	node->cached_vma_last = avc_last_pgoff(node);
+#endif
+	__anon_vma_interval_tree_insert(node, root);
+}
+
+void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
+				   struct rb_root *root)
+{
+	__anon_vma_interval_tree_remove(node, root);
+}
+
+struct anon_vma_chain *
+anon_vma_interval_tree_iter_first(struct rb_root *root,
+				  unsigned long first, unsigned long last)
+{
+	return __anon_vma_interval_tree_iter_first(root, first, last);
+}
+
+struct anon_vma_chain *
+anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
+				 unsigned long first, unsigned long last)
+{
+	return __anon_vma_interval_tree_iter_next(node, first, last);
+}
+
+#ifdef CONFIG_DEBUG_VM_RB
+void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
+{
+	WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
+	WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
+}
+#endif
diff --git a/mm/mmap.c b/mm/mmap.c
index 2e580ed7921..deb422c39e2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end);
 
-/*
- * WARNING: the debugging will use recursive algorithms so never enable this
- * unless you know what you are doing.
- */
-#undef DEBUG_MM_RB
-
 /* description of effects of mapping type and prot in current implementation.
  * this is due to the limited x86 page protection hardware.  The expected
  * behavior is in parens:
@@ -303,7 +297,7 @@ out:
 	return retval;
 }
 
-#ifdef DEBUG_MM_RB
+#ifdef CONFIG_DEBUG_VM_RB
 static int browse_rb(struct rb_root *root)
 {
 	int i = 0, j;
@@ -337,9 +331,12 @@ void validate_mm(struct mm_struct *mm)
 {
 	int bug = 0;
 	int i = 0;
-	struct vm_area_struct *tmp = mm->mmap;
-	while (tmp) {
-		tmp = tmp->vm_next;
+	struct vm_area_struct *vma = mm->mmap;
+	while (vma) {
+		struct anon_vma_chain *avc;
+		list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+			anon_vma_interval_tree_verify(avc);
+		vma = vma->vm_next;
 		i++;
 	}
 	if (i != mm->map_count)
@@ -1790,6 +1787,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 	}
 	vma_unlock_anon_vma(vma);
 	khugepaged_enter_vma_merge(vma);
+	validate_mm(vma->vm_mm);
 	return error;
 }
 #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1843,6 +1841,7 @@ int expand_downwards(struct vm_area_struct *vma,
 	}
 	vma_unlock_anon_vma(vma);
 	khugepaged_enter_vma_merge(vma);
+	validate_mm(vma->vm_mm);
 	return error;
 }
 
-- 
cgit v1.2.3


From 523d4e2008fd4a68b1a164e63e8c75b7b20f07e0 Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Mon, 8 Oct 2012 16:31:48 -0700
Subject: mm anon rmap: in mremap, set the new vma's position before
 anon_vma_clone()

anon_vma_clone() expects new_vma->vm_{start,end,pgoff} to be correctly set
so that the new vma can be indexed on the anon interval tree.

copy_vma() was failing to do that, which broke mremap().

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Tested-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index deb422c39e2..81248992120 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2419,16 +2419,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (new_vma) {
 			*new_vma = *vma;
+			new_vma->vm_start = addr;
+			new_vma->vm_end = addr + len;
+			new_vma->vm_pgoff = pgoff;
 			pol = mpol_dup(vma_policy(vma));
 			if (IS_ERR(pol))
 				goto out_free_vma;
+			vma_set_policy(new_vma, pol);
 			INIT_LIST_HEAD(&new_vma->anon_vma_chain);
 			if (anon_vma_clone(new_vma, vma))
 				goto out_free_mempol;
-			vma_set_policy(new_vma, pol);
-			new_vma->vm_start = addr;
-			new_vma->vm_end = addr + len;
-			new_vma->vm_pgoff = pgoff;
 			if (new_vma->vm_file)
 				get_file(new_vma->vm_file);
 			if (new_vma->vm_ops && new_vma->vm_ops->open)
-- 
cgit v1.2.3


From 38a76013ad809beb0b52f60d365c960d035bd83c Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Mon, 8 Oct 2012 16:31:50 -0700
Subject: mm: avoid taking rmap locks in move_ptes()

During mremap(), the destination VMA is generally placed after the
original vma in rmap traversal order: in move_vma(), we always have
new_pgoff >= vma->vm_pgoff, and as a result new_vma->vm_pgoff >=
vma->vm_pgoff unless vma_merge() merged the new vma with an adjacent one.

When the destination VMA is placed after the original in rmap traversal
order, we can avoid taking the rmap locks in move_ptes().

Essentially, this reintroduces the optimization that had been disabled in
"mm anon rmap: remove anon_vma_moveto_tail".  The difference is that we
don't try to impose the rmap traversal order; instead we just rely on
things being in the desired order in the common case and fall back to
taking locks in the uncommon case.  Also we skip the i_mmap_mutex in
addition to the anon_vma lock: in both cases, the vmas are traversed in
increasing vm_pgoff order with ties resolved in tree insertion order.

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c   |  7 +++++--
 mm/mremap.c | 57 +++++++++++++++++++++++++++++++++++++++------------------
 2 files changed, 44 insertions(+), 20 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index 81248992120..2d942353d68 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2371,7 +2371,8 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
  * prior to moving page table entries, to effect an mremap move.
  */
 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
-	unsigned long addr, unsigned long len, pgoff_t pgoff)
+	unsigned long addr, unsigned long len, pgoff_t pgoff,
+	bool *need_rmap_locks)
 {
 	struct vm_area_struct *vma = *vmap;
 	unsigned long vma_start = vma->vm_start;
@@ -2413,8 +2414,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			 * linear if there are no pages mapped yet.
 			 */
 			VM_BUG_ON(faulted_in_anon_vma);
-			*vmap = new_vma;
+			*vmap = vma = new_vma;
 		}
+		*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
 	} else {
 		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (new_vma) {
@@ -2434,6 +2436,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			if (new_vma->vm_ops && new_vma->vm_ops->open)
 				new_vma->vm_ops->open(new_vma);
 			vma_link(mm, new_vma, prev, rb_link, rb_parent);
+			*need_rmap_locks = false;
 		}
 	}
 	return new_vma;
diff --git a/mm/mremap.c b/mm/mremap.c
index 5588bb6e929..3b639a4b26b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -71,26 +71,42 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		unsigned long old_addr, unsigned long old_end,
 		struct vm_area_struct *new_vma, pmd_t *new_pmd,
-		unsigned long new_addr)
+		unsigned long new_addr, bool need_rmap_locks)
 {
 	struct address_space *mapping = NULL;
-	struct anon_vma *anon_vma = vma->anon_vma;
+	struct anon_vma *anon_vma = NULL;
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
 
-	if (vma->vm_file) {
-		/*
-		 * Subtle point from Rajesh Venkatasubramanian: before
-		 * moving file-based ptes, we must lock truncate_pagecache
-		 * out, since it might clean the dst vma before the src vma,
-		 * and we propagate stale pages into the dst afterward.
-		 */
-		mapping = vma->vm_file->f_mapping;
-		mutex_lock(&mapping->i_mmap_mutex);
+	/*
+	 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+	 * locks to ensure that rmap will always observe either the old or the
+	 * new ptes. This is the easiest way to avoid races with
+	 * truncate_pagecache(), page migration, etc...
+	 *
+	 * When need_rmap_locks is false, we use other ways to avoid
+	 * such races:
+	 *
+	 * - During exec() shift_arg_pages(), we use a specially tagged vma
+	 *   which rmap call sites look for using is_vma_temporary_stack().
+	 *
+	 * - During mremap(), new_vma is often known to be placed after vma
+	 *   in rmap traversal order. This ensures rmap will always observe
+	 *   either the old pte, or the new pte, or both (the page table locks
+	 *   serialize access to individual ptes, but only rmap traversal
+	 *   order guarantees that we won't miss both the old and new ptes).
+	 */
+	if (need_rmap_locks) {
+		if (vma->vm_file) {
+			mapping = vma->vm_file->f_mapping;
+			mutex_lock(&mapping->i_mmap_mutex);
+		}
+		if (vma->anon_vma) {
+			anon_vma = vma->anon_vma;
+			anon_vma_lock(anon_vma);
+		}
 	}
-	if (anon_vma)
-		anon_vma_lock(anon_vma);
 
 	/*
 	 * We don't have to worry about the ordering of src and dst
@@ -127,7 +143,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 
 unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
-		unsigned long new_addr, unsigned long len)
+		unsigned long new_addr, unsigned long len,
+		bool need_rmap_locks)
 {
 	unsigned long extent, next, old_end;
 	pmd_t *old_pmd, *new_pmd;
@@ -174,7 +191,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 		if (extent > LATENCY_LIMIT)
 			extent = LATENCY_LIMIT;
 		move_ptes(vma, old_pmd, old_addr, old_addr + extent,
-				new_vma, new_pmd, new_addr);
+			  new_vma, new_pmd, new_addr, need_rmap_locks);
 		need_flush = true;
 	}
 	if (likely(need_flush))
@@ -198,6 +215,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	unsigned long hiwater_vm;
 	int split = 0;
 	int err;
+	bool need_rmap_locks;
 
 	/*
 	 * We'd prefer to avoid failure later on in do_munmap:
@@ -219,18 +237,21 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		return err;
 
 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
-	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
+	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
+			   &need_rmap_locks);
 	if (!new_vma)
 		return -ENOMEM;
 
-	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
+	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
+				     need_rmap_locks);
 	if (moved_len < old_len) {
 		/*
 		 * On error, move entries back from new area to old,
 		 * which will succeed since page tables still there,
 		 * and then proceed to unmap new area instead of old.
 		 */
-		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
+		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
+				 true);
 		vma = new_vma;
 		old_len = new_len;
 		old_addr = new_addr;
-- 
cgit v1.2.3


From 1e8537baacd59e96bbe5f8d3d32feafd11f509fe Mon Sep 17 00:00:00 2001
From: Xishi Qiu <qiuxishi@huawei.com>
Date: Mon, 8 Oct 2012 16:31:51 -0700
Subject: memory-hotplug: build zonelists when offlining pages

online_pages() does build_all_zonelists() and zone_pcp_update(), I think
offline_pages() should do it too.

When the zone has no memory to allocate, remove it from other nodes'
zonelists.  zone_batchsize() depends on zone's present pages, if zone's
present pages are changed, zone's pcp should be updated.

Signed-off-by: Xishi Qiu <qiuxishi@huawei.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6a5b90d0cfd..b35016156c1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -970,8 +970,13 @@ repeat:
 
 	init_per_zone_wmark_min();
 
-	if (!populated_zone(zone))
+	if (!populated_zone(zone)) {
 		zone_pcp_reset(zone);
+		mutex_lock(&zonelists_mutex);
+		build_all_zonelists(NULL, NULL);
+		mutex_unlock(&zonelists_mutex);
+	} else
+		zone_pcp_update(zone);
 
 	if (!node_present_pages(node)) {
 		node_clear_state(node, N_HIGH_MEMORY);
-- 
cgit v1.2.3


From 70400303ce0c4ced3139499c676d5c79636b0c72 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Mon, 8 Oct 2012 16:31:52 -0700
Subject: mm: mmu_notifier: make the mmu_notifier srcu static

The variable must be static especially given the variable name.

s/RCU/SRCU/ over a few comments.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Sagi Grimberg <sagig@mellanox.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmu_notifier.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 947df83dccb..c297142f0fe 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -20,7 +20,7 @@
 #include <linux/slab.h>
 
 /* global SRCU for all MMs */
-struct srcu_struct srcu;
+static struct srcu_struct srcu;
 
 /*
  * This function can't run concurrently against mmu_notifier_register
@@ -41,7 +41,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
 	int id;
 
 	/*
-	 * RCU here will block mmu_notifier_unregister until
+	 * SRCU here will block mmu_notifier_unregister until
 	 * ->release returns.
 	 */
 	id = srcu_read_lock(&srcu);
@@ -302,7 +302,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 
 	if (!hlist_unhashed(&mn->hlist)) {
 		/*
-		 * RCU here will force exit_mmap to wait ->release to finish
+		 * SRCU here will force exit_mmap to wait ->release to finish
 		 * before freeing the pages.
 		 */
 		int id;
-- 
cgit v1.2.3


From 02c6de8d757cb32c0829a45d81c3dfcbcafd998b Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Mon, 8 Oct 2012 16:31:55 -0700
Subject: mm: cma: discard clean pages during contiguous allocation instead of
 migration

Drop clean cache pages instead of migration during alloc_contig_range() to
minimise allocation latency by reducing the amount of migration that is
necessary.  It's useful for CMA because latency of migration is more
important than evicting the background process's working set.  In
addition, as pages are reclaimed then fewer free pages for migration
targets are required so it avoids memory reclaiming to get free pages,
which is a contributory factor to increased latency.

I measured elapsed time of __alloc_contig_migrate_range() which migrates
10M in 40M movable zone in QEMU machine.

Before - 146ms, After - 7ms

[akpm@linux-foundation.org: fix nommu build]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Mel Gorman <mgorman@suse.de>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Acked-by: Michal Nazarewicz <mina86@mina86.com>
Cc: Rik van Riel <riel@redhat.com>
Tested-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/internal.h   |  3 ++-
 mm/page_alloc.c |  2 ++
 mm/vmscan.c     | 43 +++++++++++++++++++++++++++++++++++++------
 3 files changed, 41 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/mm/internal.h b/mm/internal.h
index bbd7b34db4e..8312d4fadf5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -356,5 +356,6 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
         unsigned long, unsigned long);
 
 extern void set_pageblock_order(void);
-
+unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+					    struct list_head *page_list);
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cfd565dbe12..cefd14e6dcf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5700,6 +5700,8 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
 			break;
 		}
 
+		reclaim_clean_pages_from_list(cc.zone, &cc.migratepages);
+
 		ret = migrate_pages(&cc.migratepages,
 				    __alloc_contig_migrate_alloc,
 				    0, false, MIGRATE_SYNC);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d16bf5a5326..1ee4b69a28a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -674,8 +674,10 @@ static enum page_references page_check_references(struct page *page,
 static unsigned long shrink_page_list(struct list_head *page_list,
 				      struct zone *zone,
 				      struct scan_control *sc,
+				      enum ttu_flags ttu_flags,
 				      unsigned long *ret_nr_dirty,
-				      unsigned long *ret_nr_writeback)
+				      unsigned long *ret_nr_writeback,
+				      bool force_reclaim)
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
@@ -689,10 +691,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 	mem_cgroup_uncharge_start();
 	while (!list_empty(page_list)) {
-		enum page_references references;
 		struct address_space *mapping;
 		struct page *page;
 		int may_enter_fs;
+		enum page_references references = PAGEREF_RECLAIM_CLEAN;
 
 		cond_resched();
 
@@ -758,7 +760,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			wait_on_page_writeback(page);
 		}
 
-		references = page_check_references(page, sc);
+		if (!force_reclaim)
+			references = page_check_references(page, sc);
+
 		switch (references) {
 		case PAGEREF_ACTIVATE:
 			goto activate_locked;
@@ -788,7 +792,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * processes. Try to unmap it here.
 		 */
 		if (page_mapped(page) && mapping) {
-			switch (try_to_unmap(page, TTU_UNMAP)) {
+			switch (try_to_unmap(page, ttu_flags)) {
 			case SWAP_FAIL:
 				goto activate_locked;
 			case SWAP_AGAIN:
@@ -960,6 +964,33 @@ keep:
 	return nr_reclaimed;
 }
 
+unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+					    struct list_head *page_list)
+{
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.priority = DEF_PRIORITY,
+		.may_unmap = 1,
+	};
+	unsigned long ret, dummy1, dummy2;
+	struct page *page, *next;
+	LIST_HEAD(clean_pages);
+
+	list_for_each_entry_safe(page, next, page_list, lru) {
+		if (page_is_file_cache(page) && !PageDirty(page)) {
+			ClearPageActive(page);
+			list_move(&page->lru, &clean_pages);
+		}
+	}
+
+	ret = shrink_page_list(&clean_pages, zone, &sc,
+				TTU_UNMAP|TTU_IGNORE_ACCESS,
+				&dummy1, &dummy2, true);
+	list_splice(&clean_pages, page_list);
+	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+	return ret;
+}
+
 /*
  * Attempt to remove the specified page from its LRU.  Only take this page
  * if it is of the appropriate PageActive status.  Pages which are being
@@ -1278,8 +1309,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	if (nr_taken == 0)
 		return 0;
 
-	nr_reclaimed = shrink_page_list(&page_list, zone, sc,
-						&nr_dirty, &nr_writeback);
+	nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
+					&nr_dirty, &nr_writeback, false);
 
 	spin_lock_irq(&zone->lru_lock);
 
-- 
cgit v1.2.3


From 770c8aaaf6f04a87e6765f24d497132de9152a46 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Mon, 8 Oct 2012 16:31:57 -0700
Subject: mm: fix tracing in free_pcppages_bulk()

page->private gets re-used in __free_one_page() to store page order
(so trace_mm_page_pcpu_drain() may print order instead of migratetype)
thus migratetype value must be cached locally.

Fixes regression introduced in commit a7016235a61d ("mm: fix migratetype
bug which slowed swapping").  This caused incorrect data to be attached
to the mm_page_pcpu_drain trace event.

[akpm@linux-foundation.org: add comment]
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cefd14e6dcf..3f18a14effb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -668,12 +668,15 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			batch_free = to_free;
 
 		do {
+			int mt;	/* migratetype of the to-be-freed page */
+
 			page = list_entry(list->prev, struct page, lru);
 			/* must delete as __free_one_page list manipulates */
 			list_del(&page->lru);
+			mt = page_private(page);
 			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
-			__free_one_page(page, zone, 0, page_private(page));
-			trace_mm_page_pcpu_drain(page, 0, page_private(page));
+			__free_one_page(page, zone, 0, mt);
+			trace_mm_page_pcpu_drain(page, 0, mt);
 		} while (--to_free && --batch_free && !list_empty(list));
 	}
 	__mod_zone_page_state(zone, NR_FREE_PAGES, count);
-- 
cgit v1.2.3


From 2139cbe627b8910ded55148f87ee10f7485408ed Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Mon, 8 Oct 2012 16:32:00 -0700
Subject: cma: fix counting of isolated pages

Isolated free pages shouldn't be accounted to NR_FREE_PAGES counter.  Fix
it by properly decreasing/increasing NR_FREE_PAGES counter in
set_migratetype_isolate()/unset_migratetype_isolate() and removing counter
adjustment for isolated pages from free_one_page() and split_free_page().

Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c     |  9 +++++++--
 mm/page_isolation.c | 12 +++++++++---
 2 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3f18a14effb..d259cc2b69c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -691,7 +691,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
 	zone->pages_scanned = 0;
 
 	__free_one_page(page, zone, order, migratetype);
-	__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+	if (unlikely(migratetype != MIGRATE_ISOLATE))
+		__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
 	spin_unlock(&zone->lock);
 }
 
@@ -1392,6 +1393,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 	unsigned int order;
 	unsigned long watermark;
 	struct zone *zone;
+	int mt;
 
 	BUG_ON(!PageBuddy(page));
 
@@ -1407,7 +1409,10 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 	list_del(&page->lru);
 	zone->free_area[order].nr_free--;
 	rmv_page_order(page);
-	__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
+
+	mt = get_pageblock_migratetype(page);
+	if (unlikely(mt != MIGRATE_ISOLATE))
+		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
 
 	if (alloc_order != order)
 		expand(zone, page, alloc_order, order,
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 247d1f17573..3ca1716471b 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -76,8 +76,12 @@ int set_migratetype_isolate(struct page *page)
 
 out:
 	if (!ret) {
+		unsigned long nr_pages;
+
 		set_pageblock_isolate(page);
-		move_freepages_block(zone, page, MIGRATE_ISOLATE);
+		nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
+
+		__mod_zone_page_state(zone, NR_FREE_PAGES, -nr_pages);
 	}
 
 	spin_unlock_irqrestore(&zone->lock, flags);
@@ -89,12 +93,14 @@ out:
 void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 {
 	struct zone *zone;
-	unsigned long flags;
+	unsigned long flags, nr_pages;
+
 	zone = page_zone(page);
 	spin_lock_irqsave(&zone->lock, flags);
 	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
 		goto out;
-	move_freepages_block(zone, page, migratetype);
+	nr_pages = move_freepages_block(zone, page, migratetype);
+	__mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
 	restore_pageblock_isolate(page, migratetype);
 out:
 	spin_unlock_irqrestore(&zone->lock, flags);
-- 
cgit v1.2.3


From d1ce749a0db12202b711d1aba1d29e823034648d Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Mon, 8 Oct 2012 16:32:02 -0700
Subject: cma: count free CMA pages

Add NR_FREE_CMA_PAGES counter to be later used for checking watermark in
__zone_watermark_ok().  For simplicity and to avoid #ifdef hell make this
counter always available (not only when CONFIG_CMA=y).

[akpm@linux-foundation.org: use conventional migratetype naming]
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c     | 26 +++++++++++++++++++-------
 mm/page_isolation.c |  5 +++--
 mm/vmstat.c         |  1 +
 3 files changed, 23 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d259cc2b69c..6969a8abdba 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -558,7 +558,8 @@ static inline void __free_one_page(struct page *page,
 		if (page_is_guard(buddy)) {
 			clear_page_guard_flag(buddy);
 			set_page_private(page, 0);
-			__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+			__mod_zone_freepage_state(zone, 1 << order,
+						  migratetype);
 		} else {
 			list_del(&buddy->lru);
 			zone->free_area[order].nr_free--;
@@ -677,6 +678,8 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
 			__free_one_page(page, zone, 0, mt);
 			trace_mm_page_pcpu_drain(page, 0, mt);
+			if (is_migrate_cma(mt))
+				__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
 		} while (--to_free && --batch_free && !list_empty(list));
 	}
 	__mod_zone_page_state(zone, NR_FREE_PAGES, count);
@@ -692,7 +695,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
 
 	__free_one_page(page, zone, order, migratetype);
 	if (unlikely(migratetype != MIGRATE_ISOLATE))
-		__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+		__mod_zone_freepage_state(zone, 1 << order, migratetype);
 	spin_unlock(&zone->lock);
 }
 
@@ -815,7 +818,8 @@ static inline void expand(struct zone *zone, struct page *page,
 			set_page_guard_flag(&page[size]);
 			set_page_private(&page[size], high);
 			/* Guard pages are not available for any usage */
-			__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
+			__mod_zone_freepage_state(zone, -(1 << high),
+						  migratetype);
 			continue;
 		}
 #endif
@@ -1141,6 +1145,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		}
 		set_page_private(page, mt);
 		list = &page->lru;
+		if (is_migrate_cma(mt))
+			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
+					      -(1 << order));
 	}
 	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
 	spin_unlock(&zone->lock);
@@ -1412,7 +1419,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 
 	mt = get_pageblock_migratetype(page);
 	if (unlikely(mt != MIGRATE_ISOLATE))
-		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
+		__mod_zone_freepage_state(zone, -(1UL << order), mt);
 
 	if (alloc_order != order)
 		expand(zone, page, alloc_order, order,
@@ -1516,7 +1523,8 @@ again:
 		spin_unlock(&zone->lock);
 		if (!page)
 			goto failed;
-		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
+		__mod_zone_freepage_state(zone, -(1 << order),
+					  get_pageblock_migratetype(page));
 	}
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
@@ -2890,7 +2898,8 @@ void show_free_areas(unsigned int filter)
 		" unevictable:%lu"
 		" dirty:%lu writeback:%lu unstable:%lu\n"
 		" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
-		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
+		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
+		" free_cma:%lu\n",
 		global_page_state(NR_ACTIVE_ANON),
 		global_page_state(NR_INACTIVE_ANON),
 		global_page_state(NR_ISOLATED_ANON),
@@ -2907,7 +2916,8 @@ void show_free_areas(unsigned int filter)
 		global_page_state(NR_FILE_MAPPED),
 		global_page_state(NR_SHMEM),
 		global_page_state(NR_PAGETABLE),
-		global_page_state(NR_BOUNCE));
+		global_page_state(NR_BOUNCE),
+		global_page_state(NR_FREE_CMA_PAGES));
 
 	for_each_populated_zone(zone) {
 		int i;
@@ -2939,6 +2949,7 @@ void show_free_areas(unsigned int filter)
 			" pagetables:%lukB"
 			" unstable:%lukB"
 			" bounce:%lukB"
+			" free_cma:%lukB"
 			" writeback_tmp:%lukB"
 			" pages_scanned:%lu"
 			" all_unreclaimable? %s"
@@ -2968,6 +2979,7 @@ void show_free_areas(unsigned int filter)
 			K(zone_page_state(zone, NR_PAGETABLE)),
 			K(zone_page_state(zone, NR_UNSTABLE_NFS)),
 			K(zone_page_state(zone, NR_BOUNCE)),
+			K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
 			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
 			zone->pages_scanned,
 			(zone->all_unreclaimable ? "yes" : "no")
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 3ca1716471b..345643b85bd 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -77,11 +77,12 @@ int set_migratetype_isolate(struct page *page)
 out:
 	if (!ret) {
 		unsigned long nr_pages;
+		int migratetype = get_pageblock_migratetype(page);
 
 		set_pageblock_isolate(page);
 		nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
 
-		__mod_zone_page_state(zone, NR_FREE_PAGES, -nr_pages);
+		__mod_zone_freepage_state(zone, -nr_pages, migratetype);
 	}
 
 	spin_unlock_irqrestore(&zone->lock, flags);
@@ -100,7 +101,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
 		goto out;
 	nr_pages = move_freepages_block(zone, page, migratetype);
-	__mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
+	__mod_zone_freepage_state(zone, nr_pages, migratetype);
 	restore_pageblock_isolate(page, migratetype);
 out:
 	spin_unlock_irqrestore(&zone->lock, flags);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b3e3b9d525d..acbd85c983e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -722,6 +722,7 @@ const char * const vmstat_text[] = {
 	"numa_other",
 #endif
 	"nr_anon_transparent_hugepages",
+	"nr_free_cma",
 	"nr_dirty_threshold",
 	"nr_dirty_background_threshold",
 
-- 
cgit v1.2.3


From d95ea5d18e699515468368415c93ed49b1a3221b Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Mon, 8 Oct 2012 16:32:05 -0700
Subject: cma: fix watermark checking

* Add ALLOC_CMA alloc flag and pass it to [__]zone_watermark_ok()
  (from Minchan Kim).

* During watermark check decrease available free pages number by
  free CMA pages number if necessary (unmovable allocations cannot
  use pages from CMA areas).

Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c |  8 +++++++-
 mm/internal.h   | 14 ++++++++++++++
 mm/page_alloc.c | 31 +++++++++++++++----------------
 3 files changed, 36 insertions(+), 17 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 0fbc6b73a52..1f61bcbd626 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -934,6 +934,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 	struct zoneref *z;
 	struct zone *zone;
 	int rc = COMPACT_SKIPPED;
+	int alloc_flags = 0;
 
 	/* Check if the GFP flags allow compaction */
 	if (!order || !may_enter_fs || !may_perform_io)
@@ -941,6 +942,10 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 
 	count_vm_event(COMPACTSTALL);
 
+#ifdef CONFIG_CMA
+	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+		alloc_flags |= ALLOC_CMA;
+#endif
 	/* Compact each zone in the list */
 	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
 								nodemask) {
@@ -951,7 +956,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 		rc = max(status, rc);
 
 		/* If a normal allocation would succeed, stop compacting */
-		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
+		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
+				      alloc_flags))
 			break;
 	}
 
diff --git a/mm/internal.h b/mm/internal.h
index 8312d4fadf5..96cda4c6ac5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -358,4 +358,18 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
 extern void set_pageblock_order(void);
 unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 					    struct list_head *page_list);
+/* The ALLOC_WMARK bits are used as an index to zone->watermark */
+#define ALLOC_WMARK_MIN		WMARK_MIN
+#define ALLOC_WMARK_LOW		WMARK_LOW
+#define ALLOC_WMARK_HIGH	WMARK_HIGH
+#define ALLOC_NO_WATERMARKS	0x04 /* don't check watermarks at all */
+
+/* Mask to get the watermark bits */
+#define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS-1)
+
+#define ALLOC_HARDER		0x10 /* try to alloc harder */
+#define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
+#define ALLOC_CPUSET		0x40 /* check for correct cpuset */
+#define ALLOC_CMA		0x80 /* allow allocations from CMA areas */
+
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6969a8abdba..f2c7cc6a303 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1541,19 +1541,6 @@ failed:
 	return NULL;
 }
 
-/* The ALLOC_WMARK bits are used as an index to zone->watermark */
-#define ALLOC_WMARK_MIN		WMARK_MIN
-#define ALLOC_WMARK_LOW		WMARK_LOW
-#define ALLOC_WMARK_HIGH	WMARK_HIGH
-#define ALLOC_NO_WATERMARKS	0x04 /* don't check watermarks at all */
-
-/* Mask to get the watermark bits */
-#define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS-1)
-
-#define ALLOC_HARDER		0x10 /* try to alloc harder */
-#define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
-#define ALLOC_CPUSET		0x40 /* check for correct cpuset */
-
 #ifdef CONFIG_FAIL_PAGE_ALLOC
 
 static struct {
@@ -1648,7 +1635,11 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		min -= min / 2;
 	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
-
+#ifdef CONFIG_CMA
+	/* If allocation can't use CMA areas don't use free CMA pages */
+	if (!(alloc_flags & ALLOC_CMA))
+		free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
 	if (free_pages <= min + lowmem_reserve)
 		return false;
 	for (o = 0; o < order; o++) {
@@ -2362,7 +2353,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 				 unlikely(test_thread_flag(TIF_MEMDIE))))
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 	}
-
+#ifdef CONFIG_CMA
+	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+		alloc_flags |= ALLOC_CMA;
+#endif
 	return alloc_flags;
 }
 
@@ -2587,6 +2581,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	struct page *page = NULL;
 	int migratetype = allocflags_to_migratetype(gfp_mask);
 	unsigned int cpuset_mems_cookie;
+	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
 
 	gfp_mask &= gfp_allowed_mask;
 
@@ -2615,9 +2610,13 @@ retry_cpuset:
 	if (!preferred_zone)
 		goto out;
 
+#ifdef CONFIG_CMA
+	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+		alloc_flags |= ALLOC_CMA;
+#endif
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
-			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
+			zonelist, high_zoneidx, alloc_flags,
 			preferred_zone, migratetype);
 	if (unlikely(!page))
 		page = __alloc_pages_slowpath(gfp_mask, order,
-- 
cgit v1.2.3


From b12c4ad14ee0232ad47c2bef404b6d42a3578332 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Mon, 8 Oct 2012 16:32:08 -0700
Subject: mm: page_alloc: use get_freepage_migratetype() instead of
 page_private()

The page allocator uses set_page_private and page_private for handling
migratetype when it frees page.  Let's replace them with [set|get]
_freepage_migratetype to make it more clear.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c     | 6 +++---
 mm/page_isolation.c | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f2c7cc6a303..6aa0a8e89c5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -674,7 +674,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			page = list_entry(list->prev, struct page, lru);
 			/* must delete as __free_one_page list manipulates */
 			list_del(&page->lru);
-			mt = page_private(page);
+			mt = get_freepage_migratetype(page);
 			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
 			__free_one_page(page, zone, 0, mt);
 			trace_mm_page_pcpu_drain(page, 0, mt);
@@ -1143,7 +1143,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
 				mt = migratetype;
 		}
-		set_page_private(page, mt);
+		set_freepage_migratetype(page, mt);
 		list = &page->lru;
 		if (is_migrate_cma(mt))
 			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
@@ -1313,7 +1313,7 @@ void free_hot_cold_page(struct page *page, int cold)
 		return;
 
 	migratetype = get_pageblock_migratetype(page);
-	set_page_private(page, migratetype);
+	set_freepage_migratetype(page, migratetype);
 	local_irq_save(flags);
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 345643b85bd..9c03dca8c2a 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -203,7 +203,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
 		if (PageBuddy(page))
 			pfn += 1 << page_order(page);
 		else if (page_count(page) == 0 &&
-				page_private(page) == MIGRATE_ISOLATE)
+			get_freepage_migratetype(page) == MIGRATE_ISOLATE)
 			pfn += 1;
 		else
 			break;
-- 
cgit v1.2.3


From 95e3441248053fc06bbb1dbbd34409a84211619e Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Mon, 8 Oct 2012 16:32:11 -0700
Subject: mm: remain migratetype in freed page

The page allocator caches the pageblock information in page->private while
it is in the PCP freelists but this is overwritten with the order of the
page when freed to the buddy allocator.  This patch stores the migratetype
of the page in the page->index field so that it is available at all times
when the page remain in free_list.

This patch adds a new call site in __free_pages_ok so it might be overhead
a bit but it's for high order allocation.  So I believe damage isn't hurt.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6aa0a8e89c5..94fd283dde9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -729,6 +729,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 {
 	unsigned long flags;
 	int wasMlocked = __TestClearPageMlocked(page);
+	int migratetype;
 
 	if (!free_pages_prepare(page, order))
 		return;
@@ -737,8 +738,9 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
 	__count_vm_events(PGFREE, 1 << order);
-	free_one_page(page_zone(page), page, order,
-					get_pageblock_migratetype(page));
+	migratetype = get_pageblock_migratetype(page);
+	set_freepage_migratetype(page, migratetype);
+	free_one_page(page_zone(page), page, order, migratetype);
 	local_irq_restore(flags);
 }
 
@@ -959,6 +961,7 @@ static int move_freepages(struct zone *zone,
 		order = page_order(page);
 		list_move(&page->lru,
 			  &zone->free_area[order].free_list[migratetype]);
+		set_freepage_migratetype(page, migratetype);
 		page += 1 << order;
 		pages_moved += 1 << order;
 	}
-- 
cgit v1.2.3


From 41d575ad4a511b71a4a41c8313004212f5c229b1 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Mon, 8 Oct 2012 16:32:14 -0700
Subject: memory-hotplug: bug fix race between isolation and allocation

Like below, memory-hotplug makes race between page-isolation
and page-allocation so it can hit BUG_ON in __offline_isolated_pages.

	CPU A					CPU B

start_isolate_page_range
set_migratetype_isolate
spin_lock_irqsave(zone->lock)

				free_hot_cold_page(Page A)
				/* without zone->lock */
				migratetype = get_pageblock_migratetype(Page A);
				/*
				 * Page could be moved into MIGRATE_MOVABLE
				 * of per_cpu_pages
				 */
				list_add_tail(&page->lru, &pcp->lists[migratetype]);

set_pageblock_isolate
move_freepages_block
drain_all_pages

				/* Page A could be in MIGRATE_MOVABLE of free_list. */

check_pages_isolated
__test_page_isolated_in_pageblock
/*
 * We can't catch freed page which
 * is free_list[MIGRATE_MOVABLE]
 */
if (PageBuddy(page A))
	pfn += 1 << page_order(page A);

				/* So, Page A could be allocated */

__offline_isolated_pages
/*
 * BUG_ON hit or offline page
 * which is used by someone
 */
BUG_ON(!PageBuddy(page A));

This patch checks page's migratetype in freelist in
__test_page_isolated_in_pageblock.  So now
__test_page_isolated_in_pageblock can check the page caused by above race
and can fail of memory offlining.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_isolation.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 9c03dca8c2a..6744235d2d0 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -200,8 +200,11 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
 			continue;
 		}
 		page = pfn_to_page(pfn);
-		if (PageBuddy(page))
+		if (PageBuddy(page)) {
+			if (get_freepage_migratetype(page) != MIGRATE_ISOLATE)
+				break;
 			pfn += 1 << page_order(page);
+		}
 		else if (page_count(page) == 0 &&
 			get_freepage_migratetype(page) == MIGRATE_ISOLATE)
 			pfn += 1;
-- 
cgit v1.2.3


From 435b405c06119d93333738172b8060b0ed12af41 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Mon, 8 Oct 2012 16:32:16 -0700
Subject: memory-hotplug: fix pages missed by race rather than failing

If race between allocation and isolation in memory-hotplug offline
happens, some pages could be in MIGRATE_MOVABLE of free_list although the
pageblock's migratetype of the page is MIGRATE_ISOLATE.

The race could be detected by get_freepage_migratetype in
__test_page_isolated_in_pageblock.  If it is detected, now EBUSY gets
bubbled all the way up and the hotplug operations fails.

But better idea is instead of returning and failing memory-hotremove, move
the free page to the correct list at the time it is detected.  It could
enhance memory-hotremove operation success ratio although the race is
really rare.

Suggested by Mel Gorman.

[akpm@linux-foundation.org: small cleanup]
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c     |  2 +-
 mm/page_isolation.c | 16 ++++++++++++++--
 2 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94fd283dde9..82f0b2f54f8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -925,7 +925,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {
  * Note that start_page and end_pages are not aligned on a pageblock
  * boundary. If alignment is required, use move_freepages_block()
  */
-static int move_freepages(struct zone *zone,
+int move_freepages(struct zone *zone,
 			  struct page *start_page, struct page *end_page,
 			  int migratetype)
 {
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 6744235d2d0..5f34a9053ce 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -201,8 +201,20 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
 		}
 		page = pfn_to_page(pfn);
 		if (PageBuddy(page)) {
-			if (get_freepage_migratetype(page) != MIGRATE_ISOLATE)
-				break;
+			/*
+			 * If race between isolatation and allocation happens,
+			 * some free pages could be in MIGRATE_MOVABLE list
+			 * although pageblock's migratation type of the page
+			 * is MIGRATE_ISOLATE. Catch it and move the page into
+			 * MIGRATE_ISOLATE list.
+			 */
+			if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
+				struct page *end_page;
+
+				end_page = page + (1 << page_order(page)) - 1;
+				move_freepages(page_zone(page), page, end_page,
+						MIGRATE_ISOLATE);
+			}
 			pfn += 1 << page_order(page);
 		}
 		else if (page_count(page) == 0 &&
-- 
cgit v1.2.3


From 45cac65b0fcd287ebb877b141d40ba9bbe8e5da7 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@kernel.org>
Date: Mon, 8 Oct 2012 16:32:19 -0700
Subject: readahead: fault retry breaks mmap file read random detection

.fault now can retry.  The retry can break state machine of .fault.  In
filemap_fault, if page is miss, ra->mmap_miss is increased.  In the second
try, since the page is in page cache now, ra->mmap_miss is decreased.  And
these are done in one fault, so we can't detect random mmap file access.

Add a new flag to indicate .fault is tried once.  In the second try, skip
ra->mmap_miss decreasing.  The filemap_fault state machine is ok with it.

I only tested x86, didn't test other archs, but looks the change for other
archs is obvious, but who knows :)

Signed-off-by: Shaohua Li <shaohua.li@fusionio.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index a9827b42556..83efee76a5c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1607,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * Do we have something in the page cache already?
 	 */
 	page = find_get_page(mapping, offset);
-	if (likely(page)) {
+	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
 		/*
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
 		do_async_mmap_readahead(vma, ra, file, page, offset);
-	} else {
+	} else if (!page) {
 		/* No page in the page cache at all */
 		do_sync_mmap_readahead(vma, ra, file, offset);
 		count_vm_event(PGMAJFAULT);
-- 
cgit v1.2.3


From e9d24ad30fc5c4c601824fb39712350b053ca812 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Date: Mon, 8 Oct 2012 16:32:21 -0700
Subject: mm/memblock: use existing interface to set nid

Use the existing interface function to set the NUMA node ID (NID) for the
regions, either memory or reserved region.

Signed-off-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Gavin Shan <shangw@linux.vnet.ibm.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memblock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memblock.c b/mm/memblock.c
index 82aa349d2f7..d809f17ef75 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -756,7 +756,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
 		return ret;
 
 	for (i = start_rgn; i < end_rgn; i++)
-		type->regions[i].nid = nid;
+		memblock_set_region_node(&type->regions[i], nid);
 
 	memblock_merge_regions(type);
 	return 0;
-- 
cgit v1.2.3


From f2d52fe51c8c0a18cf5fbe583bad51090d12c146 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Date: Mon, 8 Oct 2012 16:32:24 -0700
Subject: mm/memblock: cleanup early_node_map[] related comments

Commit 0ee332c14518 ("memblock: Kill early_node_map[]") removed
early_node_map[].  Clean up the comments to comply with that change.

Signed-off-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Gavin Shan <shangw@linux.vnet.ibm.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/nobootmem.c  | 2 --
 mm/page_alloc.c | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 405573010f9..bd82f6b3141 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -162,8 +162,6 @@ unsigned long __init free_all_bootmem(void)
 	 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
 	 *  because in some case like Node0 doesn't have RAM installed
 	 *  low ram will be on Node1
-	 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
-	 *  will be used instead of only Node0 related
 	 */
 	return free_low_memory_core_early(MAX_NUMNODES);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 82f0b2f54f8..ca002b39b9b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4931,7 +4931,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 			       zone_movable_pfn[i] << PAGE_SHIFT);
 	}
 
-	/* Print out the early_node_map[] */
+	/* Print out the early node map */
 	printk("Early memory node ranges\n");
 	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
 		printk("  node %3d: [mem %#010lx-%#010lx]\n", nid,
-- 
cgit v1.2.3


From e64c5237cf6ff474cb2f3f832f48f2b441dd9979 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@kernel.org>
Date: Mon, 8 Oct 2012 16:32:27 -0700
Subject: mm: compaction: abort compaction loop if lock is contended or run too
 long

isolate_migratepages_range() might isolate no pages if for example when
zone->lru_lock is contended and running asynchronous compaction. In this
case, we should abort compaction, otherwise, compact_zone will run a
useless loop and make zone->lru_lock is even contended.

An additional check is added to ensure that cc.migratepages and
cc.freepages get properly drained whan compaction is aborted.

[minchan@kernel.org: Putback pages isolated for migration if aborting]
[akpm@linux-foundation.org: compact_zone_order requires non-NULL arg contended]
[akpm@linux-foundation.org: make compact_zone_order() require non-NULL arg `contended']
[minchan@kernel.org: Putback pages isolated for migration if aborting]
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 17 ++++++++++++-----
 mm/internal.h   |  2 +-
 2 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 1f61bcbd626..0649cc1b347 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -70,8 +70,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
 
 		/* async aborts if taking too long or contended */
 		if (!cc->sync) {
-			if (cc->contended)
-				*cc->contended = true;
+			cc->contended = true;
 			return false;
 		}
 
@@ -688,7 +687,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 
 	/* Perform the isolation */
 	low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn);
-	if (!low_pfn)
+	if (!low_pfn || cc->contended)
 		return ISOLATE_ABORT;
 
 	cc->migrate_pfn = low_pfn;
@@ -848,6 +847,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		switch (isolate_migratepages(zone, cc)) {
 		case ISOLATE_ABORT:
 			ret = COMPACT_PARTIAL;
+			putback_lru_pages(&cc->migratepages);
+			cc->nr_migratepages = 0;
 			goto out;
 		case ISOLATE_NONE:
 			continue;
@@ -896,6 +897,7 @@ static unsigned long compact_zone_order(struct zone *zone,
 				 bool sync, bool *contended,
 				 struct page **page)
 {
+	unsigned long ret;
 	struct compact_control cc = {
 		.nr_freepages = 0,
 		.nr_migratepages = 0,
@@ -903,13 +905,18 @@ static unsigned long compact_zone_order(struct zone *zone,
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
 		.sync = sync,
-		.contended = contended,
 		.page = page,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
 
-	return compact_zone(zone, &cc);
+	ret = compact_zone(zone, &cc);
+
+	VM_BUG_ON(!list_empty(&cc.freepages));
+	VM_BUG_ON(!list_empty(&cc.migratepages));
+
+	*contended = cc.contended;
+	return ret;
 }
 
 int sysctl_extfrag_threshold = 500;
diff --git a/mm/internal.h b/mm/internal.h
index 96cda4c6ac5..97664be2ca3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -130,7 +130,7 @@ struct compact_control {
 	int order;			/* order a direct compactor needs */
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 	struct zone *zone;
-	bool *contended;		/* True if a lock was contended */
+	bool contended;			/* True if a lock was contended */
 	struct page **page;		/* Page captured of requested size */
 };
 
-- 
cgit v1.2.3


From 3cc668f4e30fbd97b3c0574d8cac7a83903c9bc7 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 8 Oct 2012 16:32:30 -0700
Subject: mm: compaction: move fatal signal check out of
 compact_checklock_irqsave

Commit c67fe3752abe ("mm: compaction: Abort async compaction if locks
are contended or taking too long") addressed a lock contention problem
in compaction by introducing compact_checklock_irqsave() that effecively
aborting async compaction in the event of compaction.

To preserve existing behaviour it also moved a fatal_signal_pending()
check into compact_checklock_irqsave() but that is very misleading.  It
"hides" the check within a locking function but has nothing to do with
locking as such.  It just happens to work in a desirable fashion.

This patch moves the fatal_signal_pending() check to
isolate_migratepages_range() where it belongs.  Arguably the same check
should also happen when isolating pages for freeing but it's overkill.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 0649cc1b347..78075a26839 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -75,8 +75,6 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
 		}
 
 		cond_resched();
-		if (fatal_signal_pending(current))
-			return false;
 	}
 
 	if (!locked)
@@ -363,7 +361,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		/* Check if it is ok to still hold the lock */
 		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
 								locked, cc);
-		if (!locked)
+		if (!locked || fatal_signal_pending(current))
 			break;
 
 		/*
-- 
cgit v1.2.3


From 661c4cb9b829110cb68c18ea05a56be39f75a4d2 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 8 Oct 2012 16:32:31 -0700
Subject: mm: compaction: Update try_to_compact_pages()kerneldoc comment

Parameters were added without documentation, tut tut.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 78075a26839..b16dd382299 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -926,6 +926,8 @@ int sysctl_extfrag_threshold = 500;
  * @gfp_mask: The GFP mask of the current allocation
  * @nodemask: The allowed nodes to allocate from
  * @sync: Whether migration is synchronous or not
+ * @contended: Return value that is true if compaction was aborted due to lock contention
+ * @page: Optionally capture a free page of the requested order during compaction
  *
  * This is the main entry point for direct page compaction.
  */
-- 
cgit v1.2.3


From 2a1402aa044b55c2d30ab0ed9405693ef06fb07c Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 8 Oct 2012 16:32:33 -0700
Subject: mm: compaction: acquire the zone->lru_lock as late as possible

Richard Davies and Shaohua Li have both reported lock contention problems
in compaction on the zone and LRU locks as well as significant amounts of
time being spent in compaction.  This series aims to reduce lock
contention and scanning rates to reduce that CPU usage.  Richard reported
at https://lkml.org/lkml/2012/9/21/91 that this series made a big
different to a problem he reported in August:

   http://marc.info/?l=kvm&m=134511507015614&w=2

Patch 1 defers acquiring the zone->lru_lock as long as possible.

Patch 2 defers acquiring the zone->lock as lock as possible.

Patch 3 reverts Rik's "skip-free" patches as the core concept gets
	reimplemented later and the remaining patches are easier to
	understand if this is reverted first.

Patch 4 adds a pageblock-skip bit to the pageblock flags to cache what
	pageblocks should be skipped by the migrate and free scanners.
	This drastically reduces the amount of scanning compaction has
	to do.

Patch 5 reimplements something similar to Rik's idea except it uses the
	pageblock-skip information to decide where the scanners should
	restart from and does not need to wrap around.

I tested this on 3.6-rc6 + linux-next/akpm. Kernels tested were

akpm-20120920	3.6-rc6 + linux-next/akpm as of Septeber 20th, 2012
lesslock	Patches 1-6
revert		Patches 1-7
cachefail	Patches 1-8
skipuseless	Patches 1-9

Stress high-order allocation tests looked ok.  Success rates are more or
less the same with the full series applied but there is an expectation
that there is less opportunity to race with other allocation requests if
there is less scanning.  The time to complete the tests did not vary that
much and are uninteresting as were the vmstat statistics so I will not
present them here.

Using ftrace I recorded how much scanning was done by compaction and got this

                            3.6.0-rc6     3.6.0-rc6   3.6.0-rc6  3.6.0-rc6 3.6.0-rc6
                            akpm-20120920 lockless  revert-v2r2  cachefail skipuseless

Total   free    scanned         360753976  515414028  565479007   17103281   18916589
Total   free    isolated          2852429    3597369    4048601     670493     727840
Total   free    efficiency        0.0079%    0.0070%    0.0072%    0.0392%    0.0385%
Total   migrate scanned         247728664  822729112 1004645830   17946827   14118903
Total   migrate isolated          2555324    3245937    3437501     616359     658616
Total   migrate efficiency        0.0103%    0.0039%    0.0034%    0.0343%    0.0466%

The efficiency is worthless because of the nature of the test and the
number of failures.  The really interesting point as far as this patch
series is concerned is the number of pages scanned.  Note that reverting
Rik's patches massively increases the number of pages scanned indicating
that those patches really did make a difference to CPU usage.

However, caching what pageblocks should be skipped has a much higher
impact.  With patches 1-8 applied, free page and migrate page scanning are
both reduced by 95% in comparison to the akpm kernel.  If the basic
concept of Rik's patches are implemened on top then scanning then the free
scanner barely changed but migrate scanning was further reduced.  That
said, tests on 3.6-rc5 indicated that the last patch had greater impact
than what was measured here so it is a bit variable.

One way or the other, this series has a large impact on the amount of
scanning compaction does when there is a storm of THP allocations.

This patch:

Compaction's migrate scanner acquires the zone->lru_lock when scanning a
range of pages looking for LRU pages to acquire.  It does this even if
there are no LRU pages in the range.  If multiple processes are compacting
then this can cause severe locking contention.  To make matters worse
commit b2eef8c0 ("mm: compaction: minimise the time IRQs are disabled
while isolating pages for migration") releases the lru_lock every
SWAP_CLUSTER_MAX pages that are scanned.

This patch makes two changes to how the migrate scanner acquires the LRU
lock.  First, it only releases the LRU lock every SWAP_CLUSTER_MAX pages
if the lock is contended.  This reduces the number of times it
unnecessarily disables and re-enables IRQs.  The second is that it defers
acquiring the LRU lock for as long as possible.  If there are no LRU pages
or the only LRU pages are transhuge then the LRU lock will not be acquired
at all which reduces contention on zone->lru_lock.

[minchan@kernel.org: augment comment]
[akpm@linux-foundation.org: tweak comment text]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Richard Davies <richard@arachsys.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Avi Kivity <avi@redhat.com>
Acked-by: Rafael Aquini <aquini@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 65 +++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 45 insertions(+), 20 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index b16dd382299..832c4183dcc 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -50,6 +50,11 @@ static inline bool migrate_async_suitable(int migratetype)
 	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
 }
 
+static inline bool should_release_lock(spinlock_t *lock)
+{
+	return need_resched() || spin_is_contended(lock);
+}
+
 /*
  * Compaction requires the taking of some coarse locks that are potentially
  * very heavily contended. Check if the process needs to be scheduled or
@@ -62,7 +67,7 @@ static inline bool migrate_async_suitable(int migratetype)
 static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
 				      bool locked, struct compact_control *cc)
 {
-	if (need_resched() || spin_is_contended(lock)) {
+	if (should_release_lock(lock)) {
 		if (locked) {
 			spin_unlock_irqrestore(lock, *flags);
 			locked = false;
@@ -327,7 +332,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	isolate_mode_t mode = 0;
 	struct lruvec *lruvec;
 	unsigned long flags;
-	bool locked;
+	bool locked = false;
 
 	/*
 	 * Ensure that there are not too many pages isolated from the LRU
@@ -347,23 +352,17 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 
 	/* Time to isolate some pages for migration */
 	cond_resched();
-	spin_lock_irqsave(&zone->lru_lock, flags);
-	locked = true;
 	for (; low_pfn < end_pfn; low_pfn++) {
 		struct page *page;
 
 		/* give a chance to irqs before checking need_resched() */
-		if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
-			spin_unlock_irqrestore(&zone->lru_lock, flags);
-			locked = false;
+		if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
+			if (should_release_lock(&zone->lru_lock)) {
+				spin_unlock_irqrestore(&zone->lru_lock, flags);
+				locked = false;
+			}
 		}
 
-		/* Check if it is ok to still hold the lock */
-		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
-								locked, cc);
-		if (!locked || fatal_signal_pending(current))
-			break;
-
 		/*
 		 * migrate_pfn does not necessarily start aligned to a
 		 * pageblock. Ensure that pfn_valid is called when moving
@@ -403,20 +402,39 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		pageblock_nr = low_pfn >> pageblock_order;
 		if (!cc->sync && last_pageblock_nr != pageblock_nr &&
 		    !migrate_async_suitable(get_pageblock_migratetype(page))) {
-			low_pfn += pageblock_nr_pages;
-			low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
-			last_pageblock_nr = pageblock_nr;
-			continue;
+			goto next_pageblock;
 		}
 
+		/* Check may be lockless but that's ok as we recheck later */
 		if (!PageLRU(page))
 			continue;
 
 		/*
-		 * PageLRU is set, and lru_lock excludes isolation,
-		 * splitting and collapsing (collapsing has already
-		 * happened if PageLRU is set).
+		 * PageLRU is set. lru_lock normally excludes isolation
+		 * splitting and collapsing (collapsing has already happened
+		 * if PageLRU is set) but the lock is not necessarily taken
+		 * here and it is wasteful to take it just to check transhuge.
+		 * Check TransHuge without lock and skip the whole pageblock if
+		 * it's either a transhuge or hugetlbfs page, as calling
+		 * compound_order() without preventing THP from splitting the
+		 * page underneath us may return surprising results.
 		 */
+		if (PageTransHuge(page)) {
+			if (!locked)
+				goto next_pageblock;
+			low_pfn += (1 << compound_order(page)) - 1;
+			continue;
+		}
+
+		/* Check if it is ok to still hold the lock */
+		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
+								locked, cc);
+		if (!locked || fatal_signal_pending(current))
+			break;
+
+		/* Recheck PageLRU and PageTransHuge under lock */
+		if (!PageLRU(page))
+			continue;
 		if (PageTransHuge(page)) {
 			low_pfn += (1 << compound_order(page)) - 1;
 			continue;
@@ -444,6 +462,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			++low_pfn;
 			break;
 		}
+
+		continue;
+
+next_pageblock:
+		low_pfn += pageblock_nr_pages;
+		low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
+		last_pageblock_nr = pageblock_nr;
 	}
 
 	acct_isolated(zone, locked, cc);
-- 
cgit v1.2.3


From f40d1e42bb988d2a26e8e111ea4c4c7bac819b7e Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 8 Oct 2012 16:32:36 -0700
Subject: mm: compaction: acquire the zone->lock as late as possible

Compaction's free scanner acquires the zone->lock when checking for
PageBuddy pages and isolating them.  It does this even if there are no
PageBuddy pages in the range.

This patch defers acquiring the zone lock for as long as possible.  In the
event there are no free pages in the pageblock then the lock will not be
acquired at all which reduces contention on zone->lock.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Richard Davies <richard@arachsys.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Avi Kivity <avi@redhat.com>
Acked-by: Rafael Aquini <aquini@redhat.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Tested-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 140 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 76 insertions(+), 64 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 832c4183dcc..bdf6e13045e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -93,6 +93,27 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
 	return compact_checklock_irqsave(lock, flags, false, cc);
 }
 
+/* Returns true if the page is within a block suitable for migration to */
+static bool suitable_migration_target(struct page *page)
+{
+	int migratetype = get_pageblock_migratetype(page);
+
+	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
+	if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
+		return false;
+
+	/* If the page is a large free page, then allow migration */
+	if (PageBuddy(page) && page_order(page) >= pageblock_order)
+		return true;
+
+	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
+	if (migrate_async_suitable(migratetype))
+		return true;
+
+	/* Otherwise skip the block */
+	return false;
+}
+
 static void compact_capture_page(struct compact_control *cc)
 {
 	unsigned long flags;
@@ -153,38 +174,56 @@ static void compact_capture_page(struct compact_control *cc)
  * pages inside of the pageblock (even though it may still end up isolating
  * some pages).
  */
-static unsigned long isolate_freepages_block(unsigned long blockpfn,
+static unsigned long isolate_freepages_block(struct compact_control *cc,
+				unsigned long blockpfn,
 				unsigned long end_pfn,
 				struct list_head *freelist,
 				bool strict)
 {
 	int nr_scanned = 0, total_isolated = 0;
 	struct page *cursor;
+	unsigned long nr_strict_required = end_pfn - blockpfn;
+	unsigned long flags;
+	bool locked = false;
 
 	cursor = pfn_to_page(blockpfn);
 
-	/* Isolate free pages. This assumes the block is valid */
+	/* Isolate free pages. */
 	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
 		int isolated, i;
 		struct page *page = cursor;
 
-		if (!pfn_valid_within(blockpfn)) {
-			if (strict)
-				return 0;
-			continue;
-		}
 		nr_scanned++;
+		if (!pfn_valid_within(blockpfn))
+			continue;
+		if (!PageBuddy(page))
+			continue;
+
+		/*
+		 * The zone lock must be held to isolate freepages.
+		 * Unfortunately this is a very coarse lock and can be
+		 * heavily contended if there are parallel allocations
+		 * or parallel compactions. For async compaction do not
+		 * spin on the lock and we acquire the lock as late as
+		 * possible.
+		 */
+		locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
+								locked, cc);
+		if (!locked)
+			break;
+
+		/* Recheck this is a suitable migration target under lock */
+		if (!strict && !suitable_migration_target(page))
+			break;
 
-		if (!PageBuddy(page)) {
-			if (strict)
-				return 0;
+		/* Recheck this is a buddy page under lock */
+		if (!PageBuddy(page))
 			continue;
-		}
 
 		/* Found a free page, break it into order-0 pages */
 		isolated = split_free_page(page);
 		if (!isolated && strict)
-			return 0;
+			break;
 		total_isolated += isolated;
 		for (i = 0; i < isolated; i++) {
 			list_add(&page->lru, freelist);
@@ -199,6 +238,18 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
 	}
 
 	trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
+
+	/*
+	 * If strict isolation is requested by CMA then check that all the
+	 * pages requested were isolated. If there were any failures, 0 is
+	 * returned and CMA will fail.
+	 */
+	if (strict && nr_strict_required != total_isolated)
+		total_isolated = 0;
+
+	if (locked)
+		spin_unlock_irqrestore(&cc->zone->lock, flags);
+
 	return total_isolated;
 }
 
@@ -218,12 +269,17 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
 unsigned long
 isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
 {
-	unsigned long isolated, pfn, block_end_pfn, flags;
+	unsigned long isolated, pfn, block_end_pfn;
 	struct zone *zone = NULL;
 	LIST_HEAD(freelist);
 
+	/* cc needed for isolate_freepages_block to acquire zone->lock */
+	struct compact_control cc = {
+		.sync = true,
+	};
+
 	if (pfn_valid(start_pfn))
-		zone = page_zone(pfn_to_page(start_pfn));
+		cc.zone = zone = page_zone(pfn_to_page(start_pfn));
 
 	for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
 		if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn)))
@@ -236,10 +292,8 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
 		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 		block_end_pfn = min(block_end_pfn, end_pfn);
 
-		spin_lock_irqsave(&zone->lock, flags);
-		isolated = isolate_freepages_block(pfn, block_end_pfn,
+		isolated = isolate_freepages_block(&cc, pfn, block_end_pfn,
 						   &freelist, true);
-		spin_unlock_irqrestore(&zone->lock, flags);
 
 		/*
 		 * In strict mode, isolate_freepages_block() returns 0 if
@@ -483,29 +537,6 @@ next_pageblock:
 
 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
 #ifdef CONFIG_COMPACTION
-
-/* Returns true if the page is within a block suitable for migration to */
-static bool suitable_migration_target(struct page *page)
-{
-
-	int migratetype = get_pageblock_migratetype(page);
-
-	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
-	if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
-		return false;
-
-	/* If the page is a large free page, then allow migration */
-	if (PageBuddy(page) && page_order(page) >= pageblock_order)
-		return true;
-
-	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
-	if (migrate_async_suitable(migratetype))
-		return true;
-
-	/* Otherwise skip the block */
-	return false;
-}
-
 /*
  * Returns the start pfn of the last page block in a zone.  This is the starting
  * point for full compaction of a zone.  Compaction searches for free pages from
@@ -529,7 +560,6 @@ static void isolate_freepages(struct zone *zone,
 {
 	struct page *page;
 	unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
-	unsigned long flags;
 	int nr_freepages = cc->nr_freepages;
 	struct list_head *freelist = &cc->freepages;
 
@@ -577,30 +607,12 @@ static void isolate_freepages(struct zone *zone,
 		if (!suitable_migration_target(page))
 			continue;
 
-		/*
-		 * Found a block suitable for isolating free pages from. Now
-		 * we disabled interrupts, double check things are ok and
-		 * isolate the pages. This is to minimise the time IRQs
-		 * are disabled
-		 */
+		/* Found a block suitable for isolating free pages from */
 		isolated = 0;
-
-		/*
-		 * The zone lock must be held to isolate freepages. This
-		 * unfortunately this is a very coarse lock and can be
-		 * heavily contended if there are parallel allocations
-		 * or parallel compactions. For async compaction do not
-		 * spin on the lock
-		 */
-		if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
-			break;
-		if (suitable_migration_target(page)) {
-			end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
-			isolated = isolate_freepages_block(pfn, end_pfn,
-							   freelist, false);
-			nr_freepages += isolated;
-		}
-		spin_unlock_irqrestore(&zone->lock, flags);
+		end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
+		isolated = isolate_freepages_block(cc, pfn, end_pfn,
+						   freelist, false);
+		nr_freepages += isolated;
 
 		/*
 		 * Record the highest PFN we isolated pages from. When next
-- 
cgit v1.2.3


From 753341a4b85ff337487b9959c71c529f522004f4 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 8 Oct 2012 16:32:40 -0700
Subject: revert "mm: have order > 0 compaction start off where it left"

This reverts commit 7db8889ab05b ("mm: have order > 0 compaction start
off where it left") and commit de74f1cc ("mm: have order > 0 compaction
start near a pageblock with free pages").  These patches were a good
idea and tests confirmed that they massively reduced the amount of
scanning but the implementation is complex and tricky to understand.  A
later patch will cache what pageblocks should be skipped and
reimplements the concept of compact_cached_free_pfn on top for both
migration and free scanners.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Richard Davies <richard@arachsys.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Avi Kivity <avi@redhat.com>
Acked-by: Rafael Aquini <aquini@redhat.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 65 +++++----------------------------------------------------
 mm/internal.h   |  6 ------
 mm/page_alloc.c |  5 -----
 3 files changed, 5 insertions(+), 71 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index bdf6e13045e..db76361a311 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -537,20 +537,6 @@ next_pageblock:
 
 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
 #ifdef CONFIG_COMPACTION
-/*
- * Returns the start pfn of the last page block in a zone.  This is the starting
- * point for full compaction of a zone.  Compaction searches for free pages from
- * the end of each zone, while isolate_freepages_block scans forward inside each
- * page block.
- */
-static unsigned long start_free_pfn(struct zone *zone)
-{
-	unsigned long free_pfn;
-	free_pfn = zone->zone_start_pfn + zone->spanned_pages;
-	free_pfn &= ~(pageblock_nr_pages-1);
-	return free_pfn;
-}
-
 /*
  * Based on information in the current compact_control, find blocks
  * suitable for isolating free pages from and then isolate them.
@@ -619,19 +605,8 @@ static void isolate_freepages(struct zone *zone,
 		 * looking for free pages, the search will restart here as
 		 * page migration may have returned some pages to the allocator
 		 */
-		if (isolated) {
+		if (isolated)
 			high_pfn = max(high_pfn, pfn);
-
-			/*
-			 * If the free scanner has wrapped, update
-			 * compact_cached_free_pfn to point to the highest
-			 * pageblock with free pages. This reduces excessive
-			 * scanning of full pageblocks near the end of the
-			 * zone
-			 */
-			if (cc->order > 0 && cc->wrapped)
-				zone->compact_cached_free_pfn = high_pfn;
-		}
 	}
 
 	/* split_free_page does not map the pages */
@@ -639,11 +614,6 @@ static void isolate_freepages(struct zone *zone,
 
 	cc->free_pfn = high_pfn;
 	cc->nr_freepages = nr_freepages;
-
-	/* If compact_cached_free_pfn is reset then set it now */
-	if (cc->order > 0 && !cc->wrapped &&
-			zone->compact_cached_free_pfn == start_free_pfn(zone))
-		zone->compact_cached_free_pfn = high_pfn;
 }
 
 /*
@@ -738,26 +708,8 @@ static int compact_finished(struct zone *zone,
 	if (fatal_signal_pending(current))
 		return COMPACT_PARTIAL;
 
-	/*
-	 * A full (order == -1) compaction run starts at the beginning and
-	 * end of a zone; it completes when the migrate and free scanner meet.
-	 * A partial (order > 0) compaction can start with the free scanner
-	 * at a random point in the zone, and may have to restart.
-	 */
-	if (cc->free_pfn <= cc->migrate_pfn) {
-		if (cc->order > 0 && !cc->wrapped) {
-			/* We started partway through; restart at the end. */
-			unsigned long free_pfn = start_free_pfn(zone);
-			zone->compact_cached_free_pfn = free_pfn;
-			cc->free_pfn = free_pfn;
-			cc->wrapped = 1;
-			return COMPACT_CONTINUE;
-		}
-		return COMPACT_COMPLETE;
-	}
-
-	/* We wrapped around and ended up where we started. */
-	if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
+	/* Compaction run completes if the migrate and free scanner meet */
+	if (cc->free_pfn <= cc->migrate_pfn)
 		return COMPACT_COMPLETE;
 
 	/*
@@ -863,15 +815,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 	/* Setup to move all movable pages to the end of the zone */
 	cc->migrate_pfn = zone->zone_start_pfn;
-
-	if (cc->order > 0) {
-		/* Incremental compaction. Start where the last one stopped. */
-		cc->free_pfn = zone->compact_cached_free_pfn;
-		cc->start_free_pfn = cc->free_pfn;
-	} else {
-		/* Order == -1 starts at the end of the zone. */
-		cc->free_pfn = start_free_pfn(zone);
-	}
+	cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
+	cc->free_pfn &= ~(pageblock_nr_pages-1);
 
 	migrate_prep_local();
 
diff --git a/mm/internal.h b/mm/internal.h
index 97664be2ca3..6f6bb9ab938 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -118,14 +118,8 @@ struct compact_control {
 	unsigned long nr_freepages;	/* Number of isolated free pages */
 	unsigned long nr_migratepages;	/* Number of pages to migrate */
 	unsigned long free_pfn;		/* isolate_freepages search base */
-	unsigned long start_free_pfn;	/* where we started the search */
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
 	bool sync;			/* Synchronous migration */
-	bool wrapped;			/* Order > 0 compactions are
-					   incremental, once free_pfn
-					   and migrate_pfn meet, we restart
-					   from the top of the zone;
-					   remember we wrapped around. */
 
 	int order;			/* order a direct compactor needs */
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ca002b39b9b..628968c1ccf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4490,11 +4490,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 
 		zone->spanned_pages = size;
 		zone->present_pages = realsize;
-#if defined CONFIG_COMPACTION || defined CONFIG_CMA
-		zone->compact_cached_free_pfn = zone->zone_start_pfn +
-						zone->spanned_pages;
-		zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
-#endif
 #ifdef CONFIG_NUMA
 		zone->node = nid;
 		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
-- 
cgit v1.2.3


From bb13ffeb9f6bfeb301443994dfbf29f91117dfb3 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 8 Oct 2012 16:32:41 -0700
Subject: mm: compaction: cache if a pageblock was scanned and no pages were
 isolated

When compaction was implemented it was known that scanning could
potentially be excessive.  The ideal was that a counter be maintained for
each pageblock but maintaining this information would incur a severe
penalty due to a shared writable cache line.  It has reached the point
where the scanning costs are a serious problem, particularly on
long-lived systems where a large process starts and allocates a large
number of THPs at the same time.

Instead of using a shared counter, this patch adds another bit to the
pageblock flags called PG_migrate_skip.  If a pageblock is scanned by
either migrate or free scanner and 0 pages were isolated, the pageblock is
marked to be skipped in the future.  When scanning, this bit is checked
before any scanning takes place and the block skipped if set.

The main difficulty with a patch like this is "when to ignore the cached
information?" If it's ignored too often, the scanning rates will still be
excessive.  If the information is too stale then allocations will fail
that might have otherwise succeeded.  In this patch

o CMA always ignores the information
o If the migrate and free scanner meet then the cached information will
  be discarded if it's at least 5 seconds since the last time the cache
  was discarded
o If there are a large number of allocation failures, discard the cache.

The time-based heuristic is very clumsy but there are few choices for a
better event.  Depending solely on multiple allocation failures still
allows excessive scanning when THP allocations are failing in quick
succession due to memory pressure.  Waiting until memory pressure is
relieved would cause compaction to continually fail instead of using
reclaim/compaction to try allocate the page.  The time-based mechanism is
clumsy but a better option is not obvious.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Richard Davies <richard@arachsys.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Avi Kivity <avi@redhat.com>
Acked-by: Rafael Aquini <aquini@redhat.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Kyungmin Park <kyungmin.park@samsung.com>
Cc: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 mm/internal.h   |   4 +-
 mm/page_alloc.c |  38 +++++++++--------
 3 files changed, 131 insertions(+), 36 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index db76361a311..d9dbb97e607 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -50,6 +50,79 @@ static inline bool migrate_async_suitable(int migratetype)
 	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
 }
 
+#ifdef CONFIG_COMPACTION
+/* Returns true if the pageblock should be scanned for pages to isolate. */
+static inline bool isolation_suitable(struct compact_control *cc,
+					struct page *page)
+{
+	if (cc->ignore_skip_hint)
+		return true;
+
+	return !get_pageblock_skip(page);
+}
+
+/*
+ * This function is called to clear all cached information on pageblocks that
+ * should be skipped for page isolation when the migrate and free page scanner
+ * meet.
+ */
+static void reset_isolation_suitable(struct zone *zone)
+{
+	unsigned long start_pfn = zone->zone_start_pfn;
+	unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	unsigned long pfn;
+
+	/*
+	 * Do not reset more than once every five seconds. If allocations are
+	 * failing sufficiently quickly to allow this to happen then continually
+	 * scanning for compaction is not going to help. The choice of five
+	 * seconds is arbitrary but will mitigate excessive scanning.
+	 */
+	if (time_before(jiffies, zone->compact_blockskip_expire))
+		return;
+	zone->compact_blockskip_expire = jiffies + (HZ * 5);
+
+	/* Walk the zone and mark every pageblock as suitable for isolation */
+	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+		struct page *page;
+
+		cond_resched();
+
+		if (!pfn_valid(pfn))
+			continue;
+
+		page = pfn_to_page(pfn);
+		if (zone != page_zone(page))
+			continue;
+
+		clear_pageblock_skip(page);
+	}
+}
+
+/*
+ * If no pages were isolated then mark this pageblock to be skipped in the
+ * future. The information is later cleared by reset_isolation_suitable().
+ */
+static void update_pageblock_skip(struct page *page, unsigned long nr_isolated)
+{
+	if (!page)
+		return;
+
+	if (!nr_isolated)
+		set_pageblock_skip(page);
+}
+#else
+static inline bool isolation_suitable(struct compact_control *cc,
+					struct page *page)
+{
+	return true;
+}
+
+static void update_pageblock_skip(struct page *page, unsigned long nr_isolated)
+{
+}
+#endif /* CONFIG_COMPACTION */
+
 static inline bool should_release_lock(spinlock_t *lock)
 {
 	return need_resched() || spin_is_contended(lock);
@@ -181,7 +254,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 				bool strict)
 {
 	int nr_scanned = 0, total_isolated = 0;
-	struct page *cursor;
+	struct page *cursor, *valid_page = NULL;
 	unsigned long nr_strict_required = end_pfn - blockpfn;
 	unsigned long flags;
 	bool locked = false;
@@ -196,6 +269,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 		nr_scanned++;
 		if (!pfn_valid_within(blockpfn))
 			continue;
+		if (!valid_page)
+			valid_page = page;
 		if (!PageBuddy(page))
 			continue;
 
@@ -250,6 +325,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 	if (locked)
 		spin_unlock_irqrestore(&cc->zone->lock, flags);
 
+	/* Update the pageblock-skip if the whole pageblock was scanned */
+	if (blockpfn == end_pfn)
+		update_pageblock_skip(valid_page, total_isolated);
+
 	return total_isolated;
 }
 
@@ -267,22 +346,14 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
  * a free page).
  */
 unsigned long
-isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
+isolate_freepages_range(struct compact_control *cc,
+			unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long isolated, pfn, block_end_pfn;
-	struct zone *zone = NULL;
 	LIST_HEAD(freelist);
 
-	/* cc needed for isolate_freepages_block to acquire zone->lock */
-	struct compact_control cc = {
-		.sync = true,
-	};
-
-	if (pfn_valid(start_pfn))
-		cc.zone = zone = page_zone(pfn_to_page(start_pfn));
-
 	for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
-		if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn)))
+		if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
 			break;
 
 		/*
@@ -292,7 +363,7 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
 		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 		block_end_pfn = min(block_end_pfn, end_pfn);
 
-		isolated = isolate_freepages_block(&cc, pfn, block_end_pfn,
+		isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
 						   &freelist, true);
 
 		/*
@@ -387,6 +458,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	struct lruvec *lruvec;
 	unsigned long flags;
 	bool locked = false;
+	struct page *page = NULL, *valid_page = NULL;
 
 	/*
 	 * Ensure that there are not too many pages isolated from the LRU
@@ -407,8 +479,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	/* Time to isolate some pages for migration */
 	cond_resched();
 	for (; low_pfn < end_pfn; low_pfn++) {
-		struct page *page;
-
 		/* give a chance to irqs before checking need_resched() */
 		if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
 			if (should_release_lock(&zone->lru_lock)) {
@@ -444,6 +514,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		if (page_zone(page) != zone)
 			continue;
 
+		if (!valid_page)
+			valid_page = page;
+
+		/* If isolation recently failed, do not retry */
+		pageblock_nr = low_pfn >> pageblock_order;
+		if (!isolation_suitable(cc, page))
+			goto next_pageblock;
+
 		/* Skip if free */
 		if (PageBuddy(page))
 			continue;
@@ -453,7 +531,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		 * migration is optimistic to see if the minimum amount of work
 		 * satisfies the allocation
 		 */
-		pageblock_nr = low_pfn >> pageblock_order;
 		if (!cc->sync && last_pageblock_nr != pageblock_nr &&
 		    !migrate_async_suitable(get_pageblock_migratetype(page))) {
 			goto next_pageblock;
@@ -530,6 +607,10 @@ next_pageblock:
 	if (locked)
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 
+	/* Update the pageblock-skip if the whole pageblock was scanned */
+	if (low_pfn == end_pfn)
+		update_pageblock_skip(valid_page, nr_isolated);
+
 	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
 
 	return low_pfn;
@@ -593,6 +674,10 @@ static void isolate_freepages(struct zone *zone,
 		if (!suitable_migration_target(page))
 			continue;
 
+		/* If isolation recently failed, do not retry */
+		if (!isolation_suitable(cc, page))
+			continue;
+
 		/* Found a block suitable for isolating free pages from */
 		isolated = 0;
 		end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
@@ -709,8 +794,10 @@ static int compact_finished(struct zone *zone,
 		return COMPACT_PARTIAL;
 
 	/* Compaction run completes if the migrate and free scanner meet */
-	if (cc->free_pfn <= cc->migrate_pfn)
+	if (cc->free_pfn <= cc->migrate_pfn) {
+		reset_isolation_suitable(cc->zone);
 		return COMPACT_COMPLETE;
+	}
 
 	/*
 	 * order == -1 is expected when compacting via
@@ -818,6 +905,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
 	cc->free_pfn &= ~(pageblock_nr_pages-1);
 
+	/* Clear pageblock skip if there are numerous alloc failures */
+	if (zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT)
+		reset_isolation_suitable(zone);
+
 	migrate_prep_local();
 
 	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
diff --git a/mm/internal.h b/mm/internal.h
index 6f6bb9ab938..7ba56ac360b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -120,6 +120,7 @@ struct compact_control {
 	unsigned long free_pfn;		/* isolate_freepages search base */
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
 	bool sync;			/* Synchronous migration */
+	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
 
 	int order;			/* order a direct compactor needs */
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
@@ -129,7 +130,8 @@ struct compact_control {
 };
 
 unsigned long
-isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn);
+isolate_freepages_range(struct compact_control *cc,
+			unsigned long start_pfn, unsigned long end_pfn);
 unsigned long
 isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			   unsigned long low_pfn, unsigned long end_pfn);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 628968c1ccf..44c56049edf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5679,7 +5679,8 @@ __alloc_contig_migrate_alloc(struct page *page, unsigned long private,
 }
 
 /* [start, end) must belong to a single zone. */
-static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
+static int __alloc_contig_migrate_range(struct compact_control *cc,
+					unsigned long start, unsigned long end)
 {
 	/* This function is based on compact_zone() from compaction.c. */
 
@@ -5687,25 +5688,17 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
 	unsigned int tries = 0;
 	int ret = 0;
 
-	struct compact_control cc = {
-		.nr_migratepages = 0,
-		.order = -1,
-		.zone = page_zone(pfn_to_page(start)),
-		.sync = true,
-	};
-	INIT_LIST_HEAD(&cc.migratepages);
-
 	migrate_prep_local();
 
-	while (pfn < end || !list_empty(&cc.migratepages)) {
+	while (pfn < end || !list_empty(&cc->migratepages)) {
 		if (fatal_signal_pending(current)) {
 			ret = -EINTR;
 			break;
 		}
 
-		if (list_empty(&cc.migratepages)) {
-			cc.nr_migratepages = 0;
-			pfn = isolate_migratepages_range(cc.zone, &cc,
+		if (list_empty(&cc->migratepages)) {
+			cc->nr_migratepages = 0;
+			pfn = isolate_migratepages_range(cc->zone, cc,
 							 pfn, end);
 			if (!pfn) {
 				ret = -EINTR;
@@ -5717,14 +5710,14 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
 			break;
 		}
 
-		reclaim_clean_pages_from_list(cc.zone, &cc.migratepages);
+		reclaim_clean_pages_from_list(cc->zone, &cc->migratepages);
 
-		ret = migrate_pages(&cc.migratepages,
+		ret = migrate_pages(&cc->migratepages,
 				    __alloc_contig_migrate_alloc,
 				    0, false, MIGRATE_SYNC);
 	}
 
-	putback_lru_pages(&cc.migratepages);
+	putback_lru_pages(&cc->migratepages);
 	return ret > 0 ? 0 : ret;
 }
 
@@ -5803,6 +5796,15 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	unsigned long outer_start, outer_end;
 	int ret = 0, order;
 
+	struct compact_control cc = {
+		.nr_migratepages = 0,
+		.order = -1,
+		.zone = page_zone(pfn_to_page(start)),
+		.sync = true,
+		.ignore_skip_hint = true,
+	};
+	INIT_LIST_HEAD(&cc.migratepages);
+
 	/*
 	 * What we do here is we mark all pageblocks in range as
 	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
@@ -5832,7 +5834,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	if (ret)
 		goto done;
 
-	ret = __alloc_contig_migrate_range(start, end);
+	ret = __alloc_contig_migrate_range(&cc, start, end);
 	if (ret)
 		goto done;
 
@@ -5881,7 +5883,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	__reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
 
 	/* Grab isolated pages from freelists. */
-	outer_end = isolate_freepages_range(outer_start, end);
+	outer_end = isolate_freepages_range(&cc, outer_start, end);
 	if (!outer_end) {
 		ret = -EBUSY;
 		goto done;
-- 
cgit v1.2.3


From c89511ab2f8fe2b47585e60da8af7fd213ec877e Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 8 Oct 2012 16:32:45 -0700
Subject: mm: compaction: Restart compaction from near where it left off

This is almost entirely based on Rik's previous patches and discussions
with him about how this might be implemented.

Order > 0 compaction stops when enough free pages of the correct page
order have been coalesced.  When doing subsequent higher order
allocations, it is possible for compaction to be invoked many times.

However, the compaction code always starts out looking for things to
compact at the start of the zone, and for free pages to compact things to
at the end of the zone.

This can cause quadratic behaviour, with isolate_freepages starting at the
end of the zone each time, even though previous invocations of the
compaction code already filled up all free memory on that end of the zone.
 This can cause isolate_freepages to take enormous amounts of CPU with
certain workloads on larger memory systems.

This patch caches where the migration and free scanner should start from
on subsequent compaction invocations using the pageblock-skip information.
 When compaction starts it begins from the cached restart points and will
update the cached restart points until a page is isolated or a pageblock
is skipped that would have been scanned by synchronous compaction.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Richard Davies <richard@arachsys.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Avi Kivity <avi@redhat.com>
Acked-by: Rafael Aquini <aquini@redhat.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++----------
 mm/internal.h   |  4 ++++
 2 files changed, 52 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index d9dbb97e607..f94cbc0b99a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -80,6 +80,9 @@ static void reset_isolation_suitable(struct zone *zone)
 	 */
 	if (time_before(jiffies, zone->compact_blockskip_expire))
 		return;
+
+	zone->compact_cached_migrate_pfn = start_pfn;
+	zone->compact_cached_free_pfn = end_pfn;
 	zone->compact_blockskip_expire = jiffies + (HZ * 5);
 
 	/* Walk the zone and mark every pageblock as suitable for isolation */
@@ -103,13 +106,29 @@ static void reset_isolation_suitable(struct zone *zone)
  * If no pages were isolated then mark this pageblock to be skipped in the
  * future. The information is later cleared by reset_isolation_suitable().
  */
-static void update_pageblock_skip(struct page *page, unsigned long nr_isolated)
+static void update_pageblock_skip(struct compact_control *cc,
+			struct page *page, unsigned long nr_isolated,
+			bool migrate_scanner)
 {
+	struct zone *zone = cc->zone;
 	if (!page)
 		return;
 
-	if (!nr_isolated)
+	if (!nr_isolated) {
+		unsigned long pfn = page_to_pfn(page);
 		set_pageblock_skip(page);
+
+		/* Update where compaction should restart */
+		if (migrate_scanner) {
+			if (!cc->finished_update_migrate &&
+			    pfn > zone->compact_cached_migrate_pfn)
+				zone->compact_cached_migrate_pfn = pfn;
+		} else {
+			if (!cc->finished_update_free &&
+			    pfn < zone->compact_cached_free_pfn)
+				zone->compact_cached_free_pfn = pfn;
+		}
+	}
 }
 #else
 static inline bool isolation_suitable(struct compact_control *cc,
@@ -118,7 +137,9 @@ static inline bool isolation_suitable(struct compact_control *cc,
 	return true;
 }
 
-static void update_pageblock_skip(struct page *page, unsigned long nr_isolated)
+static void update_pageblock_skip(struct compact_control *cc,
+			struct page *page, unsigned long nr_isolated,
+			bool migrate_scanner)
 {
 }
 #endif /* CONFIG_COMPACTION */
@@ -327,7 +348,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 
 	/* Update the pageblock-skip if the whole pageblock was scanned */
 	if (blockpfn == end_pfn)
-		update_pageblock_skip(valid_page, total_isolated);
+		update_pageblock_skip(cc, valid_page, total_isolated, false);
 
 	return total_isolated;
 }
@@ -533,6 +554,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		 */
 		if (!cc->sync && last_pageblock_nr != pageblock_nr &&
 		    !migrate_async_suitable(get_pageblock_migratetype(page))) {
+			cc->finished_update_migrate = true;
 			goto next_pageblock;
 		}
 
@@ -583,6 +605,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		VM_BUG_ON(PageTransCompound(page));
 
 		/* Successfully isolated */
+		cc->finished_update_migrate = true;
 		del_page_from_lru_list(page, lruvec, page_lru(page));
 		list_add(&page->lru, migratelist);
 		cc->nr_migratepages++;
@@ -609,7 +632,7 @@ next_pageblock:
 
 	/* Update the pageblock-skip if the whole pageblock was scanned */
 	if (low_pfn == end_pfn)
-		update_pageblock_skip(valid_page, nr_isolated);
+		update_pageblock_skip(cc, valid_page, nr_isolated, true);
 
 	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
 
@@ -690,8 +713,10 @@ static void isolate_freepages(struct zone *zone,
 		 * looking for free pages, the search will restart here as
 		 * page migration may have returned some pages to the allocator
 		 */
-		if (isolated)
+		if (isolated) {
+			cc->finished_update_free = true;
 			high_pfn = max(high_pfn, pfn);
+		}
 	}
 
 	/* split_free_page does not map the pages */
@@ -888,6 +913,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
 	int ret;
+	unsigned long start_pfn = zone->zone_start_pfn;
+	unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
 
 	ret = compaction_suitable(zone, cc->order);
 	switch (ret) {
@@ -900,10 +927,21 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		;
 	}
 
-	/* Setup to move all movable pages to the end of the zone */
-	cc->migrate_pfn = zone->zone_start_pfn;
-	cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
-	cc->free_pfn &= ~(pageblock_nr_pages-1);
+	/*
+	 * Setup to move all movable pages to the end of the zone. Used cached
+	 * information on where the scanners should start but check that it
+	 * is initialised by ensuring the values are within zone boundaries.
+	 */
+	cc->migrate_pfn = zone->compact_cached_migrate_pfn;
+	cc->free_pfn = zone->compact_cached_free_pfn;
+	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
+		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
+		zone->compact_cached_free_pfn = cc->free_pfn;
+	}
+	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
+		cc->migrate_pfn = start_pfn;
+		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
+	}
 
 	/* Clear pageblock skip if there are numerous alloc failures */
 	if (zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT)
diff --git a/mm/internal.h b/mm/internal.h
index 7ba56ac360b..7f72f249bc2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -121,6 +121,10 @@ struct compact_control {
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
 	bool sync;			/* Synchronous migration */
 	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
+	bool finished_update_free;	/* True when the zone cached pfns are
+					 * no longer being updated
+					 */
+	bool finished_update_migrate;
 
 	int order;			/* order a direct compactor needs */
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
-- 
cgit v1.2.3


From 62997027ca5b3d4618198ed8b1aba40b61b1137b Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 8 Oct 2012 16:32:47 -0700
Subject: mm: compaction: clear PG_migrate_skip based on compaction and reclaim
 activity

Compaction caches if a pageblock was scanned and no pages were isolated so
that the pageblocks can be skipped in the future to reduce scanning.  This
information is not cleared by the page allocator based on activity due to
the impact it would have to the page allocator fast paths.  Hence there is
a requirement that something clear the cache or pageblocks will be skipped
forever.  Currently the cache is cleared if there were a number of recent
allocation failures and it has not been cleared within the last 5 seconds.
Time-based decisions like this are terrible as they have no relationship
to VM activity and is basically a big hammer.

Unfortunately, accurate heuristics would add cost to some hot paths so
this patch implements a rough heuristic.  There are two cases where the
cache is cleared.

1. If a !kswapd process completes a compaction cycle (migrate and free
   scanner meet), the zone is marked compact_blockskip_flush. When kswapd
   goes to sleep, it will clear the cache. This is expected to be the
   common case where the cache is cleared. It does not really matter if
   kswapd happens to be asleep or going to sleep when the flag is set as
   it will be woken on the next allocation request.

2. If there have been multiple failures recently and compaction just
   finished being deferred then a process will clear the cache and start a
   full scan.  This situation happens if there are multiple high-order
   allocation requests under heavy memory pressure.

The clearing of the PG_migrate_skip bits and other scans is inherently
racy but the race is harmless.  For allocations that can fail such as THP,
they will simply fail.  For requests that cannot fail, they will retry the
allocation.  Tests indicated that scanning rates were roughly similar to
when the time-based heuristic was used and the allocation success rates
were similar.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Richard Davies <richard@arachsys.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Avi Kivity <avi@redhat.com>
Cc: Rafael Aquini <aquini@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 50 ++++++++++++++++++++++++++++++++++----------------
 mm/page_alloc.c |  1 +
 mm/vmscan.c     |  8 ++++++++
 3 files changed, 43 insertions(+), 16 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index f94cbc0b99a..d8187f9cabb 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -66,24 +66,15 @@ static inline bool isolation_suitable(struct compact_control *cc,
  * should be skipped for page isolation when the migrate and free page scanner
  * meet.
  */
-static void reset_isolation_suitable(struct zone *zone)
+static void __reset_isolation_suitable(struct zone *zone)
 {
 	unsigned long start_pfn = zone->zone_start_pfn;
 	unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
 	unsigned long pfn;
 
-	/*
-	 * Do not reset more than once every five seconds. If allocations are
-	 * failing sufficiently quickly to allow this to happen then continually
-	 * scanning for compaction is not going to help. The choice of five
-	 * seconds is arbitrary but will mitigate excessive scanning.
-	 */
-	if (time_before(jiffies, zone->compact_blockskip_expire))
-		return;
-
 	zone->compact_cached_migrate_pfn = start_pfn;
 	zone->compact_cached_free_pfn = end_pfn;
-	zone->compact_blockskip_expire = jiffies + (HZ * 5);
+	zone->compact_blockskip_flush = false;
 
 	/* Walk the zone and mark every pageblock as suitable for isolation */
 	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
@@ -102,9 +93,24 @@ static void reset_isolation_suitable(struct zone *zone)
 	}
 }
 
+void reset_isolation_suitable(pg_data_t *pgdat)
+{
+	int zoneid;
+
+	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+		struct zone *zone = &pgdat->node_zones[zoneid];
+		if (!populated_zone(zone))
+			continue;
+
+		/* Only flush if a full compaction finished recently */
+		if (zone->compact_blockskip_flush)
+			__reset_isolation_suitable(zone);
+	}
+}
+
 /*
  * If no pages were isolated then mark this pageblock to be skipped in the
- * future. The information is later cleared by reset_isolation_suitable().
+ * future. The information is later cleared by __reset_isolation_suitable().
  */
 static void update_pageblock_skip(struct compact_control *cc,
 			struct page *page, unsigned long nr_isolated,
@@ -820,7 +826,15 @@ static int compact_finished(struct zone *zone,
 
 	/* Compaction run completes if the migrate and free scanner meet */
 	if (cc->free_pfn <= cc->migrate_pfn) {
-		reset_isolation_suitable(cc->zone);
+		/*
+		 * Mark that the PG_migrate_skip information should be cleared
+		 * by kswapd when it goes to sleep. kswapd does not set the
+		 * flag itself as the decision to be clear should be directly
+		 * based on an allocation request.
+		 */
+		if (!current_is_kswapd())
+			zone->compact_blockskip_flush = true;
+
 		return COMPACT_COMPLETE;
 	}
 
@@ -943,9 +957,13 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
 	}
 
-	/* Clear pageblock skip if there are numerous alloc failures */
-	if (zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT)
-		reset_isolation_suitable(zone);
+	/*
+	 * Clear pageblock skip if there were failures recently and compaction
+	 * is about to be retried after being deferred. kswapd does not do
+	 * this reset as it'll reset the cached information when going to sleep.
+	 */
+	if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+		__reset_isolation_suitable(zone);
 
 	migrate_prep_local();
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 44c56049edf..b97cf12f07a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2172,6 +2172,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 				preferred_zone, migratetype);
 		if (page) {
 got_page:
+			preferred_zone->compact_blockskip_flush = false;
 			preferred_zone->compact_considered = 0;
 			preferred_zone->compact_defer_shift = 0;
 			if (order >= preferred_zone->compact_order_failed)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1ee4b69a28a..b010efc4389 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2895,6 +2895,14 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 		 */
 		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
 
+		/*
+		 * Compaction records what page blocks it recently failed to
+		 * isolate pages from and skips them in the future scanning.
+		 * When kswapd is going to sleep, it is reasonable to assume
+		 * that pages and compaction may succeed so reset the cache.
+		 */
+		reset_isolation_suitable(pgdat);
+
 		if (!kthread_should_stop())
 			schedule();
 
-- 
cgit v1.2.3


From 3f6d4caeb9a9d8f7e5bbf3f49f1fd71e1414ff64 Mon Sep 17 00:00:00 2001
From: Sachin Kamat <sachin.kamat@linaro.org>
Date: Mon, 8 Oct 2012 16:32:50 -0700
Subject: mm/hugetlb.c: remove duplicate inclusion of header file

Signed-off-by: Sachin Kamat <sachin.kamat@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c9b40e3a993..8536741f069 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -30,7 +30,6 @@
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
-#include <linux/hugetlb_cgroup.h>
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-- 
cgit v1.2.3


From 723a0644a7255f532575fd43245f9ef976491328 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Mon, 8 Oct 2012 16:32:52 -0700
Subject: mm/page_alloc: refactor out __alloc_contig_migrate_alloc()

__alloc_contig_migrate_alloc() can be used by memory-hotplug so refactor
it out (move + rename as a common name) into page_isolation.c.

[akpm@linux-foundation.org: checkpatch fixes]
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Acked-by: Michal Nazarewicz <mina86@mina86.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c     | 14 +-------------
 mm/page_isolation.c | 11 +++++++++++
 2 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b97cf12f07a..8ac593893e6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5667,18 +5667,6 @@ static unsigned long pfn_max_align_up(unsigned long pfn)
 				pageblock_nr_pages));
 }
 
-static struct page *
-__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
-			     int **resultp)
-{
-	gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
-
-	if (PageHighMem(page))
-		gfp_mask |= __GFP_HIGHMEM;
-
-	return alloc_page(gfp_mask);
-}
-
 /* [start, end) must belong to a single zone. */
 static int __alloc_contig_migrate_range(struct compact_control *cc,
 					unsigned long start, unsigned long end)
@@ -5714,7 +5702,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 		reclaim_clean_pages_from_list(cc->zone, &cc->migratepages);
 
 		ret = migrate_pages(&cc->migratepages,
-				    __alloc_contig_migrate_alloc,
+				    alloc_migrate_target,
 				    0, false, MIGRATE_SYNC);
 	}
 
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 5f34a9053ce..f2f5b4818e9 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -255,3 +255,14 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
 	spin_unlock_irqrestore(&zone->lock, flags);
 	return ret ? 0 : -EBUSY;
 }
+
+struct page *alloc_migrate_target(struct page *page, unsigned long private,
+				  int **resultp)
+{
+	gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
+
+	if (PageHighMem(page))
+		gfp_mask |= __GFP_HIGHMEM;
+
+	return alloc_page(gfp_mask);
+}
-- 
cgit v1.2.3


From 74c08f982674cfd5dfeb2702d631db9bcdabf788 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Mon, 8 Oct 2012 16:32:54 -0700
Subject: memory-hotplug: don't replace lowmem pages with highmem

The changelog for commit 6a6dccba2fdc ("mm: cma: don't replace lowmem
pages with highmem") mentioned that lowmem pages can be replaced by
highmem pages during CMA migration.  6a6dccba2fdc fixed that issue.

Quote from that changelog:

:   The filesystem layer expects pages in the block device's mapping to not
:   be in highmem (the mapping's gfp mask is set in bdget()), but CMA can
:   currently replace lowmem pages with highmem pages, leading to crashes in
:   filesystem code such as the one below:
:
:     Unable to handle kernel NULL pointer dereference at virtual address 00000400
:     pgd = c0c98000
:     [00000400] *pgd=00c91831, *pte=00000000, *ppte=00000000
:     Internal error: Oops: 817 [#1] PREEMPT SMP ARM
:     CPU: 0    Not tainted  (3.5.0-rc5+ #80)
:     PC is at __memzero+0x24/0x80
:     ...
:     Process fsstress (pid: 323, stack limit = 0xc0cbc2f0)
:     Backtrace:
:     [<c010e3f0>] (ext4_getblk+0x0/0x180) from [<c010e58c>] (ext4_bread+0x1c/0x98)
:     [<c010e570>] (ext4_bread+0x0/0x98) from [<c0117944>] (ext4_mkdir+0x160/0x3bc)
:      r4:c15337f0
:     [<c01177e4>] (ext4_mkdir+0x0/0x3bc) from [<c00c29e0>] (vfs_mkdir+0x8c/0x98)
:     [<c00c2954>] (vfs_mkdir+0x0/0x98) from [<c00c2a60>] (sys_mkdirat+0x74/0xac)
:      r6:00000000 r5:c152eb40 r4:000001ff r3:c14b43f0
:     [<c00c29ec>] (sys_mkdirat+0x0/0xac) from [<c00c2ab8>] (sys_mkdir+0x20/0x24)
:      r6:beccdcf0 r5:00074000 r4:beccdbbc
:     [<c00c2a98>] (sys_mkdir+0x0/0x24) from [<c000e3c0>] (ret_fast_syscall+0x0/0x30)

Memory-hotplug has same problem as CMA has so the same fix can be applied
to memory-hotplug as well.

Fix it by reusing.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Acked-by: Michal Nazarewicz <mina86@mina86.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b35016156c1..f9ac0955e10 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -756,13 +756,6 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
 	return 0;
 }
 
-static struct page *
-hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
-{
-	/* This should be improooooved!! */
-	return alloc_page(GFP_HIGHUSER_MOVABLE);
-}
-
 #define NR_OFFLINE_AT_ONCE_PAGES	(256)
 static int
 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
@@ -813,8 +806,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 			putback_lru_pages(&source);
 			goto out;
 		}
-		/* this function returns # of failed pages */
-		ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
+
+		/*
+		 * alloc_migrate_target should be improooooved!!
+		 * migrate_pages returns # of failed pages.
+		 */
+		ret = migrate_pages(&source, alloc_migrate_target, 0,
 							true, MIGRATE_SYNC);
 		if (ret)
 			putback_lru_pages(&source);
-- 
cgit v1.2.3


From e3b4126c556ca3a07699adf202d44bed3f453638 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 8 Oct 2012 16:32:57 -0700
Subject: thp: khugepaged_prealloc_page() forgot to reset the page alloc
 indicator

If NUMA is enabled, the indicator is not reset if the previous page
request failed, ausing us to trigger the BUG_ON() in
khugepaged_alloc_page().

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7cf8b0ec11e..7153e0d1867 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1797,6 +1797,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 			return false;
 
 		*wait = false;
+		*hpage = NULL;
 		khugepaged_alloc_sleep();
 	} else if (*hpage) {
 		put_page(*hpage);
-- 
cgit v1.2.3


From eab1eef9911c36966b5d5934e6970581b3316013 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Mon, 8 Oct 2012 16:33:01 -0700
Subject: mm: thp: fix the update_mmu_cache() last argument passing in
 mm/huge_memory.c

The update_mmu_cache() takes a pointer (to pte_t by default) as the last
argument but the huge_memory.c passes a pmd_t value.  The patch changes
the argument to the pmd_t * pointer.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Steve Capper <steve.capper@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7153e0d1867..0e7740923fb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -889,7 +889,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		entry = pmd_mkyoung(orig_pmd);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		if (pmdp_set_access_flags(vma, haddr, pmd, entry,  1))
-			update_mmu_cache(vma, address, entry);
+			update_mmu_cache(vma, address, pmd);
 		ret |= VM_FAULT_WRITE;
 		goto out_unlock;
 	}
@@ -941,7 +941,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		pmdp_clear_flush_notify(vma, haddr, pmd);
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
-		update_mmu_cache(vma, address, entry);
+		update_mmu_cache(vma, address, pmd);
 		page_remove_rmap(page);
 		put_page(page);
 		ret |= VM_FAULT_WRITE;
@@ -2002,7 +2002,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	BUG_ON(!pmd_none(*pmd));
 	page_add_new_anon_rmap(new_page, vma, address);
 	set_pmd_at(mm, address, pmd, _pmd);
-	update_mmu_cache(vma, address, _pmd);
+	update_mmu_cache(vma, address, pmd);
 	pgtable_trans_huge_deposit(mm, pgtable);
 	spin_unlock(&mm->page_table_lock);
 
-- 
cgit v1.2.3


From 05106e6a54aed321191b4bb5c9ee09538cbad3b1 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Mon, 8 Oct 2012 16:33:03 -0700
Subject: mm: enable CONFIG_COMPACTION by default

Now that lumpy reclaim has been removed, compaction is the only way left
to free up contiguous memory areas.  It is time to just enable
CONFIG_COMPACTION by default.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Acked-by: Rafael Aquini <aquini@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/Kconfig b/mm/Kconfig
index 3322342a1ff..a3f8dddaaab 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -191,6 +191,7 @@ config SPLIT_PTLOCK_CPUS
 # support for memory compaction
 config COMPACTION
 	bool "Allow for memory compaction"
+	def_bool y
 	select MIGRATION
 	depends on MMU
 	help
-- 
cgit v1.2.3


From 7f1290f2f2a4d2c3f1b7ce8e87256e052ca23125 Mon Sep 17 00:00:00 2001
From: Jianguo Wu <wujianguo@huawei.com>
Date: Mon, 8 Oct 2012 16:33:06 -0700
Subject: mm: fix-up zone present pages

I think zone->present_pages indicates pages that buddy system can management,
it should be:

	zone->present_pages = spanned pages - absent pages - bootmem pages,

but is now:
	zone->present_pages = spanned pages - absent pages - memmap pages.

spanned pages: total size, including holes.
absent pages: holes.
bootmem pages: pages used in system boot, managed by bootmem allocator.
memmap pages: pages used by page structs.

This may cause zone->present_pages less than it should be.  For example,
numa node 1 has ZONE_NORMAL and ZONE_MOVABLE, it's memmap and other
bootmem will be allocated from ZONE_MOVABLE, so ZONE_NORMAL's
present_pages should be spanned pages - absent pages, but now it also
minus memmap pages(free_area_init_core), which are actually allocated from
ZONE_MOVABLE.  When offlining all memory of a zone, this will cause
zone->present_pages less than 0, because present_pages is unsigned long
type, it is actually a very large integer, it indirectly caused
zone->watermark[WMARK_MIN] becomes a large
integer(setup_per_zone_wmarks()), than cause totalreserve_pages become a
large integer(calculate_totalreserve_pages()), and finally cause memory
allocating failure when fork process(__vm_enough_memory()).

[root@localhost ~]# dmesg
-bash: fork: Cannot allocate memory

I think the bug described in

  http://marc.info/?l=linux-mm&m=134502182714186&w=2

is also caused by wrong zone present pages.

This patch intends to fix-up zone->present_pages when memory are freed to
buddy system on x86_64 and IA64 platforms.

Signed-off-by: Jianguo Wu <wujianguo@huawei.com>
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Reported-by: Petr Tesarik <ptesarik@suse.cz>
Tested-by: Petr Tesarik <ptesarik@suse.cz>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/bootmem.c        | 10 +++++++++-
 mm/memory_hotplug.c |  7 +++++++
 mm/nobootmem.c      |  3 +++
 mm/page_alloc.c     | 34 ++++++++++++++++++++++++++++++++++
 4 files changed, 53 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/bootmem.c b/mm/bootmem.c
index f468185b3b2..434be4ae7a0 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -198,6 +198,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 			int order = ilog2(BITS_PER_LONG);
 
 			__free_pages_bootmem(pfn_to_page(start), order);
+			fixup_zone_present_pages(page_to_nid(pfn_to_page(start)),
+					start, start + BITS_PER_LONG);
 			count += BITS_PER_LONG;
 			start += BITS_PER_LONG;
 		} else {
@@ -208,6 +210,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 				if (vec & 1) {
 					page = pfn_to_page(start + off);
 					__free_pages_bootmem(page, 0);
+					fixup_zone_present_pages(
+						page_to_nid(page),
+						start + off, start + off + 1);
 					count++;
 				}
 				vec >>= 1;
@@ -221,8 +226,11 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 	pages = bdata->node_low_pfn - bdata->node_min_pfn;
 	pages = bootmem_bootmap_pages(pages);
 	count += pages;
-	while (pages--)
+	while (pages--) {
+		fixup_zone_present_pages(page_to_nid(page),
+				page_to_pfn(page), page_to_pfn(page) + 1);
 		__free_pages_bootmem(page++, 0);
+	}
 
 	bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f9ac0955e10..ce690a911f1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info,  struct page *page,
 void __ref put_page_bootmem(struct page *page)
 {
 	unsigned long type;
+	struct zone *zone;
 
 	type = (unsigned long) page->lru.next;
 	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -116,6 +117,12 @@ void __ref put_page_bootmem(struct page *page)
 		set_page_private(page, 0);
 		INIT_LIST_HEAD(&page->lru);
 		__free_pages_bootmem(page, 0);
+
+		zone = page_zone(page);
+		zone_span_writelock(zone);
+		zone->present_pages++;
+		zone_span_writeunlock(zone);
+		totalram_pages++;
 	}
 
 }
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index bd82f6b3141..714d5d65047 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -116,6 +116,8 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
 		return 0;
 
 	__free_pages_memory(start_pfn, end_pfn);
+	fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT),
+			start_pfn, end_pfn);
 
 	return end_pfn - start_pfn;
 }
@@ -126,6 +128,7 @@ unsigned long __init free_low_memory_core_early(int nodeid)
 	phys_addr_t start, end, size;
 	u64 i;
 
+	reset_zone_present_pages();
 	for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
 		count += __free_memory_core(start, end);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8ac593893e6..00750bc08a3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6087,3 +6087,37 @@ void dump_page(struct page *page)
 	dump_page_flags(page->flags);
 	mem_cgroup_print_bad_page(page);
 }
+
+/* reset zone->present_pages */
+void reset_zone_present_pages(void)
+{
+	struct zone *z;
+	int i, nid;
+
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		for (i = 0; i < MAX_NR_ZONES; i++) {
+			z = NODE_DATA(nid)->node_zones + i;
+			z->present_pages = 0;
+		}
+	}
+}
+
+/* calculate zone's present pages in buddy system */
+void fixup_zone_present_pages(int nid, unsigned long start_pfn,
+				unsigned long end_pfn)
+{
+	struct zone *z;
+	unsigned long zone_start_pfn, zone_end_pfn;
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		z = NODE_DATA(nid)->node_zones + i;
+		zone_start_pfn = z->zone_start_pfn;
+		zone_end_pfn = zone_start_pfn + z->spanned_pages;
+
+		/* if the two regions intersect */
+		if (!(zone_start_pfn >= end_pfn	|| zone_end_pfn <= start_pfn))
+			z->present_pages += min(end_pfn, zone_end_pfn) -
+					    max(start_pfn, zone_start_pfn);
+	}
+}
-- 
cgit v1.2.3


From 4bd2c1ee4b439d926e437a4841e8145230df98c9 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Mon, 8 Oct 2012 16:33:10 -0700
Subject: memcg: cleanup kmem tcp ifdefs

TCP kmem accounting is currently guarded by CONFIG_MEMCG_KMEM ifdefs but
the code is not used if !CONFIG_INET so we should rather test for both.
The same applies to net/sock.h, net/ip.h and net/tcp_memcontrol.h but
let's keep those outside of any ifdefs because it is considered safer wrt.
 future maintainability.

Tested with
- CONFIG_INET && CONFIG_MEMCG_KMEM
- !CONFIG_INET && CONFIG_MEMCG_KMEM
- CONFIG_INET && !CONFIG_MEMCG_KMEM
- !CONFIG_INET && !CONFIG_MEMCG_KMEM

Signed-off-by: Sachin Kamat <sachin.kamat@linaro.org>
Signed-off-by: Michal Hocko <mhocko@suse.cz>
Cc: Glauber Costa <glommer@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a72f2ffdc3d..4f5f9365724 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -51,6 +51,7 @@
 #include <linux/oom.h>
 #include "internal.h"
 #include <net/sock.h>
+#include <net/ip.h>
 #include <net/tcp_memcontrol.h>
 
 #include <asm/uaccess.h>
@@ -326,7 +327,7 @@ struct mem_cgroup {
 	struct mem_cgroup_stat_cpu nocpu_base;
 	spinlock_t pcp_counter_lock;
 
-#ifdef CONFIG_INET
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
 	struct tcp_memcontrol tcp_mem;
 #endif
 };
@@ -412,9 +413,7 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
 }
 
 /* Writing them here to avoid exposing memcg's inner layout */
-#ifdef CONFIG_MEMCG_KMEM
-#include <net/sock.h>
-#include <net/ip.h>
+#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 
 static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
 void sock_update_memcg(struct sock *sk)
@@ -461,7 +460,6 @@ void sock_release_memcg(struct sock *sk)
 	}
 }
 
-#ifdef CONFIG_INET
 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 {
 	if (!memcg || mem_cgroup_is_root(memcg))
@@ -470,10 +468,7 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 	return &memcg->tcp_mem.cg_proto;
 }
 EXPORT_SYMBOL(tcp_proto_cgroup);
-#endif /* CONFIG_INET */
-#endif /* CONFIG_MEMCG_KMEM */
 
-#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 static void disarm_sock_keys(struct mem_cgroup *memcg)
 {
 	if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
-- 
cgit v1.2.3


From 7ffc0edc49d0df5dac077c1830e2533b27d3a4ed Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Mon, 8 Oct 2012 16:33:13 -0700
Subject: memcg: move mem_cgroup_is_root upwards

kmem code uses this function and it is better to not use forward
declarations for static inline functions as some (older) compilers don't
like it:

gcc version 4.3.4 [gcc-4_3-branch revision 152973] (SUSE Linux)

  mm/memcontrol.c:421: warning: `mem_cgroup_is_root' declared inline after being called
  mm/memcontrol.c:421: warning: previous declaration of `mem_cgroup_is_root' was here

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Sachin Kamat <sachin.kamat@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4f5f9365724..7acf43bf04a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -412,10 +412,14 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
 	return container_of(s, struct mem_cgroup, css);
 }
 
+static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
+{
+	return (memcg == root_mem_cgroup);
+}
+
 /* Writing them here to avoid exposing memcg's inner layout */
 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 
-static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
 void sock_update_memcg(struct sock *sk)
 {
 	if (mem_cgroup_sockets_enabled) {
@@ -1011,11 +1015,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
 	     iter != NULL;				\
 	     iter = mem_cgroup_iter(NULL, iter, NULL))
 
-static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
-{
-	return (memcg == root_mem_cgroup);
-}
-
 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 {
 	struct mem_cgroup *memcg;
-- 
cgit v1.2.3


From ec4d9f626d5908b6052c2973f37992f1db52e967 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Mon, 8 Oct 2012 16:33:14 -0700
Subject: mm: fix invalidate_complete_page2() lock ordering

In fuzzing with trinity, lockdep protested "possible irq lock inversion
dependency detected" when isolate_lru_page() reenabled interrupts while
still holding the supposedly irq-safe tree_lock:

invalidate_inode_pages2
  invalidate_complete_page2
    spin_lock_irq(&mapping->tree_lock)
    clear_page_mlock
      isolate_lru_page
        spin_unlock_irq(&zone->lru_lock)

isolate_lru_page() is correct to enable interrupts unconditionally:
invalidate_complete_page2() is incorrect to call clear_page_mlock() while
holding tree_lock, which is supposed to nest inside lru_lock.

Both truncate_complete_page() and invalidate_complete_page() call
clear_page_mlock() before taking tree_lock to remove page from radix_tree.
 I guess invalidate_complete_page2() preferred to test PageDirty (again)
under tree_lock before committing to the munlock; but since the page has
already been unmapped, its state is already somewhat inconsistent, and no
worse if clear_page_mlock() moved up.

Reported-by: Sasha Levin <levinsasha928@gmail.com>
Deciphered-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michel Lespinasse <walken@google.com>
Cc: Ying Han <yinghan@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/truncate.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/truncate.c b/mm/truncate.c
index 75801acdaac..f38055cb8af 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -394,11 +394,12 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 	if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
 		return 0;
 
+	clear_page_mlock(page);
+
 	spin_lock_irq(&mapping->tree_lock);
 	if (PageDirty(page))
 		goto failed;
 
-	clear_page_mlock(page);
 	BUG_ON(page_has_private(page));
 	__delete_from_page_cache(page);
 	spin_unlock_irq(&mapping->tree_lock);
-- 
cgit v1.2.3


From 39b5f29ac1f988c1615fbc9c69f6651ab0d0c3c7 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Mon, 8 Oct 2012 16:33:18 -0700
Subject: mm: remove vma arg from page_evictable

page_evictable(page, vma) is an irritant: almost all its callers pass
NULL for vma.  Remove the vma arg and use mlocked_vma_newpage(vma, page)
explicitly in the couple of places it's needed.  But in those places we
don't even need page_evictable() itself!  They're dealing with a freshly
allocated anonymous page, which has no "mapping" and cannot be mlocked yet.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michel Lespinasse <walken@google.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/internal.h |  5 ++---
 mm/ksm.c      |  2 +-
 mm/rmap.c     |  2 +-
 mm/swap.c     |  2 +-
 mm/vmscan.c   | 27 +++++++++------------------
 5 files changed, 14 insertions(+), 24 deletions(-)

(limited to 'mm')

diff --git a/mm/internal.h b/mm/internal.h
index 7f72f249bc2..78f25d6cc6a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -168,9 +168,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
 }
 
 /*
- * Called only in fault path via page_evictable() for a new page
- * to determine if it's being mapped into a LOCKED vma.
- * If so, mark page as mlocked.
+ * Called only in fault path, to determine if a new page is being
+ * mapped into a LOCKED vma.  If it is, mark page as mlocked.
  */
 static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
 				    struct page *page)
diff --git a/mm/ksm.c b/mm/ksm.c
index 14ee5cf8a51..ecbc090cdaa 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1586,7 +1586,7 @@ struct page *ksm_does_need_to_copy(struct page *page,
 		SetPageSwapBacked(new_page);
 		__set_page_locked(new_page);
 
-		if (page_evictable(new_page, vma))
+		if (!mlocked_vma_newpage(vma, new_page))
 			lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
 		else
 			add_page_to_unevictable_list(new_page);
diff --git a/mm/rmap.c b/mm/rmap.c
index 28777412de6..0d86433e42d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1080,7 +1080,7 @@ void page_add_new_anon_rmap(struct page *page,
 	else
 		__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
 	__page_set_anon_rmap(page, vma, address, 1);
-	if (page_evictable(page, vma))
+	if (!mlocked_vma_newpage(vma, page))
 		lru_cache_add_lru(page, LRU_ACTIVE_ANON);
 	else
 		add_page_to_unevictable_list(page);
diff --git a/mm/swap.c b/mm/swap.c
index f76c76c7501..6310dc2008f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -751,7 +751,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
 
 	SetPageLRU(page_tail);
 
-	if (page_evictable(page_tail, NULL)) {
+	if (page_evictable(page_tail)) {
 		if (PageActive(page)) {
 			SetPageActive(page_tail);
 			active = 1;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b010efc4389..8b627309dd4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -553,7 +553,7 @@ void putback_lru_page(struct page *page)
 redo:
 	ClearPageUnevictable(page);
 
-	if (page_evictable(page, NULL)) {
+	if (page_evictable(page)) {
 		/*
 		 * For evictable pages, we can use the cache.
 		 * In event of a race, worst case is we end up with an
@@ -587,7 +587,7 @@ redo:
 	 * page is on unevictable list, it never be freed. To avoid that,
 	 * check after we added it to the list, again.
 	 */
-	if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
+	if (lru == LRU_UNEVICTABLE && page_evictable(page)) {
 		if (!isolate_lru_page(page)) {
 			put_page(page);
 			goto redo;
@@ -709,7 +709,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 		sc->nr_scanned++;
 
-		if (unlikely(!page_evictable(page, NULL)))
+		if (unlikely(!page_evictable(page)))
 			goto cull_mlocked;
 
 		if (!sc->may_unmap && page_mapped(page))
@@ -1217,7 +1217,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
 
 		VM_BUG_ON(PageLRU(page));
 		list_del(&page->lru);
-		if (unlikely(!page_evictable(page, NULL))) {
+		if (unlikely(!page_evictable(page))) {
 			spin_unlock_irq(&zone->lru_lock);
 			putback_lru_page(page);
 			spin_lock_irq(&zone->lru_lock);
@@ -1470,7 +1470,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
 		page = lru_to_page(&l_hold);
 		list_del(&page->lru);
 
-		if (unlikely(!page_evictable(page, NULL))) {
+		if (unlikely(!page_evictable(page))) {
 			putback_lru_page(page);
 			continue;
 		}
@@ -3414,27 +3414,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 /*
  * page_evictable - test whether a page is evictable
  * @page: the page to test
- * @vma: the VMA in which the page is or will be mapped, may be NULL
  *
  * Test whether page is evictable--i.e., should be placed on active/inactive
- * lists vs unevictable list.  The vma argument is !NULL when called from the
- * fault path to determine how to instantate a new page.
+ * lists vs unevictable list.
  *
  * Reasons page might not be evictable:
  * (1) page's mapping marked unevictable
  * (2) page is part of an mlocked VMA
  *
  */
-int page_evictable(struct page *page, struct vm_area_struct *vma)
+int page_evictable(struct page *page)
 {
-
-	if (mapping_unevictable(page_mapping(page)))
-		return 0;
-
-	if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page)))
-		return 0;
-
-	return 1;
+	return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
 }
 
 #ifdef CONFIG_SHMEM
@@ -3472,7 +3463,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
 		if (!PageLRU(page) || !PageUnevictable(page))
 			continue;
 
-		if (page_evictable(page, NULL)) {
+		if (page_evictable(page)) {
 			enum lru_list lru = page_lru_base_type(page);
 
 			VM_BUG_ON(PageActive(page));
-- 
cgit v1.2.3


From e6c509f85455041d3d7c4b863bf80bc294288cc1 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Mon, 8 Oct 2012 16:33:19 -0700
Subject: mm: use clear_page_mlock() in page_remove_rmap()

We had thought that pages could no longer get freed while still marked as
mlocked; but Johannes Weiner posted this program to demonstrate that
truncating an mlocked private file mapping containing COWed pages is still
mishandled:

#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
	char *map;
	int fd;

	system("grep mlockfreed /proc/vmstat");
	fd = open("chigurh", O_CREAT|O_EXCL|O_RDWR);
	unlink("chigurh");
	ftruncate(fd, 4096);
	map = mmap(NULL, 4096, PROT_WRITE, MAP_PRIVATE, fd, 0);
	map[0] = 11;
	mlock(map, sizeof(fd));
	ftruncate(fd, 0);
	close(fd);
	munlock(map, sizeof(fd));
	munmap(map, 4096);
	system("grep mlockfreed /proc/vmstat");
	return 0;
}

The anon COWed pages are not caught by truncation's clear_page_mlock() of
the pagecache pages; but unmap_mapping_range() unmaps them, so we ought to
look out for them there in page_remove_rmap().  Indeed, why should
truncation or invalidation be doing the clear_page_mlock() when removing
from pagecache?  mlock is a property of mapping in userspace, not a
property of pagecache: an mlocked unmapped page is nonsensical.

Reported-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Ying Han <yinghan@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/internal.h |  7 +------
 mm/memory.c   | 10 +++++-----
 mm/mlock.c    | 16 +++-------------
 mm/rmap.c     |  4 ++++
 mm/truncate.c |  4 ----
 5 files changed, 13 insertions(+), 28 deletions(-)

(limited to 'mm')

diff --git a/mm/internal.h b/mm/internal.h
index 78f25d6cc6a..4dc93e2fe69 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -201,12 +201,7 @@ extern void munlock_vma_page(struct page *page);
  * If called for a page that is still mapped by mlocked vmas, all we do
  * is revert to lazy LRU behaviour -- semantics are not broken.
  */
-extern void __clear_page_mlock(struct page *page);
-static inline void clear_page_mlock(struct page *page)
-{
-	if (unlikely(TestClearPageMlocked(page)))
-		__clear_page_mlock(page);
-}
+extern void clear_page_mlock(struct page *page);
 
 /*
  * mlock_migrate_page - called only from migrate_page_copy() to
diff --git a/mm/memory.c b/mm/memory.c
index d205e4381a3..5f5d1f039bf 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1577,12 +1577,12 @@ split_fallthrough:
 		if (page->mapping && trylock_page(page)) {
 			lru_add_drain();  /* push cached pages to LRU */
 			/*
-			 * Because we lock page here and migration is
-			 * blocked by the pte's page reference, we need
-			 * only check for file-cache page truncation.
+			 * Because we lock page here, and migration is
+			 * blocked by the pte's page reference, and we
+			 * know the page is still mapped, we don't even
+			 * need to check for file-cache page truncation.
 			 */
-			if (page->mapping)
-				mlock_vma_page(page);
+			mlock_vma_page(page);
 			unlock_page(page);
 		}
 	}
diff --git a/mm/mlock.c b/mm/mlock.c
index a948be4b7ba..de732159289 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -51,13 +51,10 @@ EXPORT_SYMBOL(can_do_mlock);
 /*
  *  LRU accounting for clear_page_mlock()
  */
-void __clear_page_mlock(struct page *page)
+void clear_page_mlock(struct page *page)
 {
-	VM_BUG_ON(!PageLocked(page));
-
-	if (!page->mapping) {	/* truncated ? */
+	if (!TestClearPageMlocked(page))
 		return;
-	}
 
 	dec_zone_page_state(page, NR_MLOCK);
 	count_vm_event(UNEVICTABLE_PGCLEARED);
@@ -290,14 +287,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
 		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
 		if (page && !IS_ERR(page)) {
 			lock_page(page);
-			/*
-			 * Like in __mlock_vma_pages_range(),
-			 * because we lock page here and migration is
-			 * blocked by the elevated reference, we need
-			 * only check for file-cache page truncation.
-			 */
-			if (page->mapping)
-				munlock_vma_page(page);
+			munlock_vma_page(page);
 			unlock_page(page);
 			put_page(page);
 		}
diff --git a/mm/rmap.c b/mm/rmap.c
index 0d86433e42d..bf03149f495 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1155,7 +1155,10 @@ void page_remove_rmap(struct page *page)
 	} else {
 		__dec_zone_page_state(page, NR_FILE_MAPPED);
 		mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
+		mem_cgroup_end_update_page_stat(page, &locked, &flags);
 	}
+	if (unlikely(PageMlocked(page)))
+		clear_page_mlock(page);
 	/*
 	 * It would be tidy to reset the PageAnon mapping here,
 	 * but that might overwrite a racing page_add_anon_rmap
@@ -1165,6 +1168,7 @@ void page_remove_rmap(struct page *page)
 	 * Leaving it set also helps swapoff to reinstate ptes
 	 * faster for those pages still in swapcache.
 	 */
+	return;
 out:
 	if (!anon)
 		mem_cgroup_end_update_page_stat(page, &locked, &flags);
diff --git a/mm/truncate.c b/mm/truncate.c
index f38055cb8af..d51ce92d6e8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -107,7 +107,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 
 	cancel_dirty_page(page, PAGE_CACHE_SIZE);
 
-	clear_page_mlock(page);
 	ClearPageMappedToDisk(page);
 	delete_from_page_cache(page);
 	return 0;
@@ -132,7 +131,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
 	if (page_has_private(page) && !try_to_release_page(page, 0))
 		return 0;
 
-	clear_page_mlock(page);
 	ret = remove_mapping(mapping, page);
 
 	return ret;
@@ -394,8 +392,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 	if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
 		return 0;
 
-	clear_page_mlock(page);
-
 	spin_lock_irq(&mapping->tree_lock);
 	if (PageDirty(page))
 		goto failed;
-- 
cgit v1.2.3


From a0c5e813f087dffc0d9b173d2e7d3328b1482fd5 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Mon, 8 Oct 2012 16:33:21 -0700
Subject: mm: remove free_page_mlock

We should not be seeing non-0 unevictable_pgs_mlockfreed any longer.  So
remove free_page_mlock() from the page freeing paths: __PG_MLOCKED is
already in PAGE_FLAGS_CHECK_AT_FREE, so free_pages_check() will now be
checking it, reporting "BUG: Bad page state" if it's ever found set.
Comment UNEVICTABLE_MLOCKFREED and unevictable_pgs_mlockfreed always 0.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michel Lespinasse <walken@google.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 17 -----------------
 mm/vmstat.c     |  2 +-
 2 files changed, 1 insertion(+), 18 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 00750bc08a3..dbb53866c3a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -598,17 +598,6 @@ out:
 	zone->free_area[order].nr_free++;
 }
 
-/*
- * free_page_mlock() -- clean up attempts to free and mlocked() page.
- * Page should not be on lru, so no need to fix that up.
- * free_pages_check() will verify...
- */
-static inline void free_page_mlock(struct page *page)
-{
-	__dec_zone_page_state(page, NR_MLOCK);
-	__count_vm_event(UNEVICTABLE_MLOCKFREED);
-}
-
 static inline int free_pages_check(struct page *page)
 {
 	if (unlikely(page_mapcount(page) |
@@ -728,15 +717,12 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
 static void __free_pages_ok(struct page *page, unsigned int order)
 {
 	unsigned long flags;
-	int wasMlocked = __TestClearPageMlocked(page);
 	int migratetype;
 
 	if (!free_pages_prepare(page, order))
 		return;
 
 	local_irq_save(flags);
-	if (unlikely(wasMlocked))
-		free_page_mlock(page);
 	__count_vm_events(PGFREE, 1 << order);
 	migratetype = get_pageblock_migratetype(page);
 	set_freepage_migratetype(page, migratetype);
@@ -1310,7 +1296,6 @@ void free_hot_cold_page(struct page *page, int cold)
 	struct per_cpu_pages *pcp;
 	unsigned long flags;
 	int migratetype;
-	int wasMlocked = __TestClearPageMlocked(page);
 
 	if (!free_pages_prepare(page, 0))
 		return;
@@ -1318,8 +1303,6 @@ void free_hot_cold_page(struct page *page, int cold)
 	migratetype = get_pageblock_migratetype(page);
 	set_freepage_migratetype(page, migratetype);
 	local_irq_save(flags);
-	if (unlikely(wasMlocked))
-		free_page_mlock(page);
 	__count_vm_event(PGFREE);
 
 	/*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index acbd85c983e..05e3a991374 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -782,7 +782,7 @@ const char * const vmstat_text[] = {
 	"unevictable_pgs_munlocked",
 	"unevictable_pgs_cleared",
 	"unevictable_pgs_stranded",
-	"unevictable_pgs_mlockfreed",
+	"unevictable_pgs_mlockfreed",	/* no longer useful: always zero */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	"thp_fault_alloc",
-- 
cgit v1.2.3


From 957f822a0ab95e88b146638bad6209bbc315bedd Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 8 Oct 2012 16:33:24 -0700
Subject: mm, numa: reclaim from all nodes within reclaim distance

RECLAIM_DISTANCE represents the distance between nodes at which it is
deemed too costly to allocate from; it's preferred to try to reclaim from
a local zone before falling back to allocating on a remote node with such
a distance.

To do this, zone_reclaim_mode is set if the distance between any two
nodes on the system is greather than this distance.  This, however, ends
up causing the page allocator to reclaim from every zone regardless of
its affinity.

What we really want is to reclaim only from zones that are closer than
RECLAIM_DISTANCE.  This patch adds a nodemask to each node that
represents the set of nodes that are within this distance.  During the
zone iteration, if the bit for a zone's node is set for the local node,
then reclaim is attempted; otherwise, the zone is skipped.

[akpm@linux-foundation.org: fix CONFIG_NUMA=n build]
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan@kernel.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 41 ++++++++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dbb53866c3a..9b8e6243a52 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1799,6 +1799,22 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
 	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
 }
 
+static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+{
+	return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
+}
+
+static void __paginginit init_zone_allows_reclaim(int nid)
+{
+	int i;
+
+	for_each_online_node(i)
+		if (node_distance(nid, i) <= RECLAIM_DISTANCE) {
+			node_set(i, NODE_DATA(nid)->reclaim_nodes);
+			zone_reclaim_mode = 1;
+		}
+}
+
 #else	/* CONFIG_NUMA */
 
 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1819,6 +1835,15 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
 static void zlc_clear_zones_full(struct zonelist *zonelist)
 {
 }
+
+static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+{
+	return true;
+}
+
+static inline void init_zone_allows_reclaim(int nid)
+{
+}
 #endif	/* CONFIG_NUMA */
 
 /*
@@ -1903,7 +1928,8 @@ zonelist_scan:
 				did_zlc_setup = 1;
 			}
 
-			if (zone_reclaim_mode == 0)
+			if (zone_reclaim_mode == 0 ||
+			    !zone_allows_reclaim(preferred_zone, zone))
 				goto this_zone_full;
 
 			/*
@@ -3364,21 +3390,13 @@ static void build_zonelists(pg_data_t *pgdat)
 	j = 0;
 
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
-		int distance = node_distance(local_node, node);
-
-		/*
-		 * If another node is sufficiently far away then it is better
-		 * to reclaim pages in a zone before going off node.
-		 */
-		if (distance > RECLAIM_DISTANCE)
-			zone_reclaim_mode = 1;
-
 		/*
 		 * We don't want to pressure a particular node.
 		 * So adding penalty to the first node in same
 		 * distance group to make it round-robin.
 		 */
-		if (distance != node_distance(local_node, prev_node))
+		if (node_distance(local_node, node) !=
+		    node_distance(local_node, prev_node))
 			node_load[node] = load;
 
 		prev_node = node;
@@ -4552,6 +4570,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 
 	pgdat->node_id = nid;
 	pgdat->node_start_pfn = node_start_pfn;
+	init_zone_allows_reclaim(nid);
 	calculate_node_totalpages(pgdat, zones_size, zholes_size);
 
 	alloc_node_mem_map(pgdat);
-- 
cgit v1.2.3


From 36e4f20af833d1ce196e6a4ade05dc26c44652d1 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Mon, 8 Oct 2012 16:33:31 -0700
Subject: hugetlb: do not use vma_hugecache_offset() for vma_prio_tree_foreach

Commit 0c176d52b0b2 ("mm: hugetlb: fix pgoff computation when unmapping
page from vma") fixed pgoff calculation but it has replaced it by
vma_hugecache_offset() which is not approapriate for offsets used for
vma_prio_tree_foreach() because that one expects index in page units
rather than in huge_page_shift.

Johannes said:

: The resulting index may not be too big, but it can be too small: assume
: hpage size of 2M and the address to unmap to be 0x200000.  This is regular
: page index 512 and hpage index 1.  If you have a VMA that maps the file
: only starting at the second huge page, that VMAs vm_pgoff will be 512 but
: you ask for offset 1 and miss it even though it does map the page of
: interest.  hugetlb_cow() will try to unmap, miss the vma, and retry the
: cow until the allocation succeeds or the skipped vma(s) go away.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Hillf Danton <dhillf@gmail.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8536741f069..de5d1dcf34f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2480,7 +2480,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * from page cache lookup which is in HPAGE_SIZE units.
 	 */
 	address = address & huge_page_mask(h);
-	pgoff = vma_hugecache_offset(h, vma, address);
+	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
+			vma->vm_pgoff;
 	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
 
 	/*
-- 
cgit v1.2.3


From 2ec74c3ef2d8c58d71e0e00336fb6b891192155a Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Mon, 8 Oct 2012 16:33:33 -0700
Subject: mm: move all mmu notifier invocations to be done outside the PT lock

In order to allow sleeping during mmu notifier calls, we need to avoid
invoking them under the page table spinlock.  This patch solves the
problem by calling invalidate_page notification after releasing the lock
(but before freeing the page itself), or by wrapping the page invalidation
with calls to invalidate_range_begin and invalidate_range_end.

To prevent accidental changes to the invalidate_range_end arguments after
the call to invalidate_range_begin, the patch introduces a convention of
saving the arguments in consistently named locals:

	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;	/* For mmu_notifiers */

	...

	mmun_start = ...
	mmun_end = ...
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	...

	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

The patch changes code to use this convention for all calls to
mmu_notifier_invalidate_range_start/end, except those where the calls are
close enough so that anyone who glances at the code can see the values
aren't changing.

This patchset is a preliminary step towards on-demand paging design to be
added to the RDMA stack.

Why do we want on-demand paging for Infiniband?

  Applications register memory with an RDMA adapter using system calls,
  and subsequently post IO operations that refer to the corresponding
  virtual addresses directly to HW.  Until now, this was achieved by
  pinning the memory during the registration calls.  The goal of on demand
  paging is to avoid pinning the pages of registered memory regions (MRs).
   This will allow users the same flexibility they get when swapping any
  other part of their processes address spaces.  Instead of requiring the
  entire MR to fit in physical memory, we can allow the MR to be larger,
  and only fit the current working set in physical memory.

Why should anyone care?  What problems are users currently experiencing?

  This can make programming with RDMA much simpler.  Today, developers
  that are working with more data than their RAM can hold need either to
  deregister and reregister memory regions throughout their process's
  life, or keep a single memory region and copy the data to it.  On demand
  paging will allow these developers to register a single MR at the
  beginning of their process's life, and let the operating system manage
  which pages needs to be fetched at a given time.  In the future, we
  might be able to provide a single memory access key for each process
  that would provide the entire process's address as one large memory
  region, and the developers wouldn't need to register memory regions at
  all.

Is there any prospect that any other subsystems will utilise these
infrastructural changes?  If so, which and how, etc?

  As for other subsystems, I understand that XPMEM wanted to sleep in
  MMU notifiers, as Christoph Lameter wrote at
  http://lkml.indiana.edu/hypermail/linux/kernel/0802.1/0460.html and
  perhaps Andrea knows about other use cases.

  Scheduling in mmu notifications is required since we need to sync the
  hardware with the secondary page tables change.  A TLB flush of an IO
  device is inherently slower than a CPU TLB flush, so our design works by
  sending the invalidation request to the device, and waiting for an
  interrupt before exiting the mmu notifier handler.

Avi said:

  kvm may be a buyer.  kvm::mmu_lock, which serializes guest page
  faults, also protects long operations such as destroying large ranges.
  It would be good to convert it into a spinlock, but as it is used inside
  mmu notifiers, this cannot be done.

  (there are alternatives, such as keeping the spinlock and using a
  generation counter to do the teardown in O(1), which is what the "may"
  is doing up there).

[akpm@linux-foundation.orgpossible speed tweak in hugetlb_cow(), cleanups]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Or Gerlitz <ogerlitz@mellanox.com>
Cc: Haggai Eran <haggaie@mellanox.com>
Cc: Shachar Raindel <raindel@mellanox.com>
Cc: Liran Liss <liranl@mellanox.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Avi Kivity <avi@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap_xip.c |  4 +++-
 mm/huge_memory.c | 42 ++++++++++++++++++++++++++++++++++++------
 mm/hugetlb.c     | 21 +++++++++++++--------
 mm/memory.c      | 28 +++++++++++++++++++---------
 mm/mremap.c      |  8 ++++++--
 mm/rmap.c        | 18 +++++++++++++++---
 6 files changed, 92 insertions(+), 29 deletions(-)

(limited to 'mm')

diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index a52daee11d3..a912da6ddfd 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -192,11 +192,13 @@ retry:
 		if (pte) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
-			pteval = ptep_clear_flush_notify(vma, address, pte);
+			pteval = ptep_clear_flush(vma, address, pte);
 			page_remove_rmap(page);
 			dec_mm_counter(mm, MM_FILEPAGES);
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap_unlock(pte, ptl);
+			/* must invalidate_page _before_ freeing the page */
+			mmu_notifier_invalidate_page(mm, address);
 			page_cache_release(page);
 		}
 	}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0e7740923fb..08a943b9cf9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -787,6 +787,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 	pmd_t _pmd;
 	int ret = 0, i;
 	struct page **pages;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
 
 	pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
 			GFP_KERNEL);
@@ -823,12 +825,16 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 		cond_resched();
 	}
 
+	mmun_start = haddr;
+	mmun_end   = haddr + HPAGE_PMD_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_same(*pmd, orig_pmd)))
 		goto out_free_pages;
 	VM_BUG_ON(!PageHead(page));
 
-	pmdp_clear_flush_notify(vma, haddr, pmd);
+	pmdp_clear_flush(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
 	pgtable = pgtable_trans_huge_withdraw(mm);
@@ -851,6 +857,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 	page_remove_rmap(page);
 	spin_unlock(&mm->page_table_lock);
 
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
 	ret |= VM_FAULT_WRITE;
 	put_page(page);
 
@@ -859,6 +867,7 @@ out:
 
 out_free_pages:
 	spin_unlock(&mm->page_table_lock);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	mem_cgroup_uncharge_start();
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
 		mem_cgroup_uncharge_page(pages[i]);
@@ -875,6 +884,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int ret = 0;
 	struct page *page, *new_page;
 	unsigned long haddr;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
 
 	VM_BUG_ON(!vma->anon_vma);
 	spin_lock(&mm->page_table_lock);
@@ -925,20 +936,24 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
 	__SetPageUptodate(new_page);
 
+	mmun_start = haddr;
+	mmun_end   = haddr + HPAGE_PMD_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
 	spin_lock(&mm->page_table_lock);
 	put_page(page);
 	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
 		spin_unlock(&mm->page_table_lock);
 		mem_cgroup_uncharge_page(new_page);
 		put_page(new_page);
-		goto out;
+		goto out_mn;
 	} else {
 		pmd_t entry;
 		VM_BUG_ON(!PageHead(page));
 		entry = mk_pmd(new_page, vma->vm_page_prot);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		entry = pmd_mkhuge(entry);
-		pmdp_clear_flush_notify(vma, haddr, pmd);
+		pmdp_clear_flush(vma, haddr, pmd);
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
 		update_mmu_cache(vma, address, pmd);
@@ -946,10 +961,14 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		put_page(page);
 		ret |= VM_FAULT_WRITE;
 	}
-out_unlock:
 	spin_unlock(&mm->page_table_lock);
+out_mn:
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 out:
 	return ret;
+out_unlock:
+	spin_unlock(&mm->page_table_lock);
+	return ret;
 }
 
 struct page *follow_trans_huge_pmd(struct mm_struct *mm,
@@ -1162,7 +1181,11 @@ static int __split_huge_page_splitting(struct page *page,
 	struct mm_struct *mm = vma->vm_mm;
 	pmd_t *pmd;
 	int ret = 0;
+	/* For mmu_notifiers */
+	const unsigned long mmun_start = address;
+	const unsigned long mmun_end   = address + HPAGE_PMD_SIZE;
 
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	spin_lock(&mm->page_table_lock);
 	pmd = page_check_address_pmd(page, mm, address,
 				     PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
@@ -1174,10 +1197,11 @@ static int __split_huge_page_splitting(struct page *page,
 		 * and it won't wait on the anon_vma->root->mutex to
 		 * serialize against split_huge_page*.
 		 */
-		pmdp_splitting_flush_notify(vma, address, pmd);
+		pmdp_splitting_flush(vma, address, pmd);
 		ret = 1;
 	}
 	spin_unlock(&mm->page_table_lock);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
 	return ret;
 }
@@ -1898,6 +1922,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	spinlock_t *ptl;
 	int isolated;
 	unsigned long hstart, hend;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -1952,6 +1978,9 @@ static void collapse_huge_page(struct mm_struct *mm,
 	pte = pte_offset_map(pmd, address);
 	ptl = pte_lockptr(mm, pmd);
 
+	mmun_start = address;
+	mmun_end   = address + HPAGE_PMD_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	spin_lock(&mm->page_table_lock); /* probably unnecessary */
 	/*
 	 * After this gup_fast can't run anymore. This also removes
@@ -1959,8 +1988,9 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 * huge and small TLB entries for the same virtual address
 	 * to avoid the risk of CPU bugs in that area.
 	 */
-	_pmd = pmdp_clear_flush_notify(vma, address, pmd);
+	_pmd = pmdp_clear_flush(vma, address, pmd);
 	spin_unlock(&mm->page_table_lock);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
 	spin_lock(ptl);
 	isolated = __collapse_huge_page_isolate(vma, address, pte);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index de5d1dcf34f..993f7c1820a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2355,13 +2355,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	struct page *page;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
+	const unsigned long mmun_start = start;	/* For mmu_notifiers */
+	const unsigned long mmun_end   = end;	/* For mmu_notifiers */
 
 	WARN_ON(!is_vm_hugetlb_page(vma));
 	BUG_ON(start & ~huge_page_mask(h));
 	BUG_ON(end & ~huge_page_mask(h));
 
 	tlb_start_vma(tlb, vma);
-	mmu_notifier_invalidate_range_start(mm, start, end);
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 again:
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += sz) {
@@ -2425,7 +2427,7 @@ again:
 		if (address < end && !ref_page)
 			goto again;
 	}
-	mmu_notifier_invalidate_range_end(mm, start, end);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	tlb_end_vma(tlb, vma);
 }
 
@@ -2525,6 +2527,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *old_page, *new_page;
 	int avoidcopy;
 	int outside_reserve = 0;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
 
 	old_page = pte_page(pte);
 
@@ -2611,6 +2615,9 @@ retry_avoidcopy:
 			    pages_per_huge_page(h));
 	__SetPageUptodate(new_page);
 
+	mmun_start = address & huge_page_mask(h);
+	mmun_end = mmun_start + huge_page_size(h);
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	/*
 	 * Retake the page_table_lock to check for racing updates
 	 * before the page tables are altered
@@ -2619,9 +2626,6 @@ retry_avoidcopy:
 	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
 	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
 		/* Break COW */
-		mmu_notifier_invalidate_range_start(mm,
-			address & huge_page_mask(h),
-			(address & huge_page_mask(h)) + huge_page_size(h));
 		huge_ptep_clear_flush(vma, address, ptep);
 		set_huge_pte_at(mm, address, ptep,
 				make_huge_pte(vma, new_page, 1));
@@ -2629,10 +2633,11 @@ retry_avoidcopy:
 		hugepage_add_new_anon_rmap(new_page, vma, address);
 		/* Make the old page be freed below */
 		new_page = old_page;
-		mmu_notifier_invalidate_range_end(mm,
-			address & huge_page_mask(h),
-			(address & huge_page_mask(h)) + huge_page_size(h));
 	}
+	spin_unlock(&mm->page_table_lock);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	/* Caller expects lock to be held */
+	spin_lock(&mm->page_table_lock);
 	page_cache_release(new_page);
 	page_cache_release(old_page);
 	return 0;
diff --git a/mm/memory.c b/mm/memory.c
index 5f5d1f039bf..b03a4a21c1d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -712,7 +712,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 	add_taint(TAINT_BAD_PAGE);
 }
 
-static inline int is_cow_mapping(vm_flags_t flags)
+static inline bool is_cow_mapping(vm_flags_t flags)
 {
 	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 }
@@ -1039,6 +1039,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	unsigned long next;
 	unsigned long addr = vma->vm_start;
 	unsigned long end = vma->vm_end;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+	bool is_cow;
 	int ret;
 
 	/*
@@ -1072,8 +1075,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * parent mm. And a permission downgrade will only happen if
 	 * is_cow_mapping() returns true.
 	 */
-	if (is_cow_mapping(vma->vm_flags))
-		mmu_notifier_invalidate_range_start(src_mm, addr, end);
+	is_cow = is_cow_mapping(vma->vm_flags);
+	mmun_start = addr;
+	mmun_end   = end;
+	if (is_cow)
+		mmu_notifier_invalidate_range_start(src_mm, mmun_start,
+						    mmun_end);
 
 	ret = 0;
 	dst_pgd = pgd_offset(dst_mm, addr);
@@ -1089,9 +1096,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		}
 	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
 
-	if (is_cow_mapping(vma->vm_flags))
-		mmu_notifier_invalidate_range_end(src_mm,
-						  vma->vm_start, end);
+	if (is_cow)
+		mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
 	return ret;
 }
 
@@ -2516,7 +2522,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		spinlock_t *ptl, pte_t orig_pte)
 	__releases(ptl)
 {
-	struct page *old_page, *new_page;
+	struct page *old_page, *new_page = NULL;
 	pte_t entry;
 	int ret = 0;
 	int page_mkwrite = 0;
@@ -2760,10 +2766,14 @@ gotten:
 	} else
 		mem_cgroup_uncharge_page(new_page);
 
-	if (new_page)
-		page_cache_release(new_page);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
+	if (new_page) {
+		if (new_page == old_page)
+			/* cow happened, notify before releasing old_page */
+			mmu_notifier_invalidate_page(mm, address);
+		page_cache_release(new_page);
+	}
 	if (old_page) {
 		/*
 		 * Don't let another task, with possibly unlocked vma,
diff --git a/mm/mremap.c b/mm/mremap.c
index 3b639a4b26b..1b61c2d3307 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -149,11 +149,15 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 	unsigned long extent, next, old_end;
 	pmd_t *old_pmd, *new_pmd;
 	bool need_flush = false;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
 
 	old_end = old_addr + len;
 	flush_cache_range(vma, old_addr, old_end);
 
-	mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end);
+	mmun_start = old_addr;
+	mmun_end   = old_end;
+	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
 
 	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
 		cond_resched();
@@ -197,7 +201,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 	if (likely(need_flush))
 		flush_tlb_range(vma, old_end-len, old_addr);
 
-	mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end);
+	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
 
 	return len + old_addr - old_end;	/* how much done */
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index bf03149f495..7df7984d476 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -884,7 +884,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
 		pte_t entry;
 
 		flush_cache_page(vma, address, pte_pfn(*pte));
-		entry = ptep_clear_flush_notify(vma, address, pte);
+		entry = ptep_clear_flush(vma, address, pte);
 		entry = pte_wrprotect(entry);
 		entry = pte_mkclean(entry);
 		set_pte_at(mm, address, pte, entry);
@@ -892,6 +892,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
 	}
 
 	pte_unmap_unlock(pte, ptl);
+
+	if (ret)
+		mmu_notifier_invalidate_page(mm, address);
 out:
 	return ret;
 }
@@ -1212,7 +1215,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address, page_to_pfn(page));
-	pteval = ptep_clear_flush_notify(vma, address, pte);
+	pteval = ptep_clear_flush(vma, address, pte);
 
 	/* Move the dirty bit to the physical page now the pte is gone. */
 	if (pte_dirty(pteval))
@@ -1274,6 +1277,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
+	if (ret != SWAP_FAIL)
+		mmu_notifier_invalidate_page(mm, address);
 out:
 	return ret;
 
@@ -1338,6 +1343,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
 	spinlock_t *ptl;
 	struct page *page;
 	unsigned long address;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
 	unsigned long end;
 	int ret = SWAP_AGAIN;
 	int locked_vma = 0;
@@ -1361,6 +1368,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
 	if (!pmd_present(*pmd))
 		return ret;
 
+	mmun_start = address;
+	mmun_end   = end;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
 	/*
 	 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
 	 * keep the sem while scanning the cluster for mlocking pages.
@@ -1394,7 +1405,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
 
 		/* Nuke the page table entry. */
 		flush_cache_page(vma, address, pte_pfn(*pte));
-		pteval = ptep_clear_flush_notify(vma, address, pte);
+		pteval = ptep_clear_flush(vma, address, pte);
 
 		/* If nonlinear, store the file page offset in the pte. */
 		if (page->index != linear_page_index(vma, address))
@@ -1410,6 +1421,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
 		(*mapcount)--;
 	}
 	pte_unmap_unlock(pte - 1, ptl);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	if (locked_vma)
 		up_read(&vma->vm_mm->mmap_sem);
 	return ret;
-- 
cgit v1.2.3


From 6bdb913f0a70a4dfb7f066fb15e2d6f960701d00 Mon Sep 17 00:00:00 2001
From: Haggai Eran <haggaie@mellanox.com>
Date: Mon, 8 Oct 2012 16:33:35 -0700
Subject: mm: wrap calls to set_pte_at_notify with invalidate_range_start and
 invalidate_range_end

In order to allow sleeping during invalidate_page mmu notifier calls, we
need to avoid calling when holding the PT lock.  In addition to its direct
calls, invalidate_page can also be called as a substitute for a change_pte
call, in case the notifier client hasn't implemented change_pte.

This patch drops the invalidate_page call from change_pte, and instead
wraps all calls to change_pte with invalidate_range_start and
invalidate_range_end calls.

Note that change_pte still cannot sleep after this patch, and that clients
implementing change_pte should not take action on it in case the number of
outstanding invalidate_range_start calls is larger than one, otherwise
they might miss a later invalidation.

Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Cc: Andrea Arcangeli <andrea@qumranet.com>
Cc: Sagi Grimberg <sagig@mellanox.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Or Gerlitz <ogerlitz@mellanox.com>
Cc: Haggai Eran <haggaie@mellanox.com>
Cc: Shachar Raindel <raindel@mellanox.com>
Cc: Liran Liss <liranl@mellanox.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Avi Kivity <avi@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/ksm.c          | 21 +++++++++++++++++++--
 mm/memory.c       | 18 ++++++++++++------
 mm/mmu_notifier.c |  6 ------
 3 files changed, 31 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/mm/ksm.c b/mm/ksm.c
index ecbc090cdaa..ae539f0b8aa 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -709,15 +709,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 	spinlock_t *ptl;
 	int swapped;
 	int err = -EFAULT;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
 
 	addr = page_address_in_vma(page, vma);
 	if (addr == -EFAULT)
 		goto out;
 
 	BUG_ON(PageTransCompound(page));
+
+	mmun_start = addr;
+	mmun_end   = addr + PAGE_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
 	ptep = page_check_address(page, mm, addr, &ptl, 0);
 	if (!ptep)
-		goto out;
+		goto out_mn;
 
 	if (pte_write(*ptep) || pte_dirty(*ptep)) {
 		pte_t entry;
@@ -752,6 +759,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 
 out_unlock:
 	pte_unmap_unlock(ptep, ptl);
+out_mn:
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 out:
 	return err;
 }
@@ -776,6 +785,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	spinlock_t *ptl;
 	unsigned long addr;
 	int err = -EFAULT;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
 
 	addr = page_address_in_vma(page, vma);
 	if (addr == -EFAULT)
@@ -794,10 +805,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	if (!pmd_present(*pmd))
 		goto out;
 
+	mmun_start = addr;
+	mmun_end   = addr + PAGE_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
 	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	if (!pte_same(*ptep, orig_pte)) {
 		pte_unmap_unlock(ptep, ptl);
-		goto out;
+		goto out_mn;
 	}
 
 	get_page(kpage);
@@ -814,6 +829,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 
 	pte_unmap_unlock(ptep, ptl);
 	err = 0;
+out_mn:
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 out:
 	return err;
 }
diff --git a/mm/memory.c b/mm/memory.c
index b03a4a21c1d..01ec048ece8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2527,6 +2527,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int ret = 0;
 	int page_mkwrite = 0;
 	struct page *dirty_page = NULL;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+	bool mmun_called = false;	/* For mmu_notifiers */
 
 	old_page = vm_normal_page(vma, address, orig_pte);
 	if (!old_page) {
@@ -2704,6 +2707,11 @@ gotten:
 	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
 		goto oom_free_new;
 
+	mmun_start  = address & PAGE_MASK;
+	mmun_end    = (address & PAGE_MASK) + PAGE_SIZE;
+	mmun_called = true;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
@@ -2766,14 +2774,12 @@ gotten:
 	} else
 		mem_cgroup_uncharge_page(new_page);
 
+	if (new_page)
+		page_cache_release(new_page);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
-	if (new_page) {
-		if (new_page == old_page)
-			/* cow happened, notify before releasing old_page */
-			mmu_notifier_invalidate_page(mm, address);
-		page_cache_release(new_page);
-	}
+	if (mmun_called)
+		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	if (old_page) {
 		/*
 		 * Don't let another task, with possibly unlocked vma,
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index c297142f0fe..479a1e751a7 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -137,12 +137,6 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->change_pte)
 			mn->ops->change_pte(mn, mm, address, pte);
-		/*
-		 * Some drivers don't have change_pte,
-		 * so we must call invalidate_page in that case.
-		 */
-		else if (mn->ops->invalidate_page)
-			mn->ops->invalidate_page(mn, mm, address);
 	}
 	srcu_read_unlock(&srcu, id);
 }
-- 
cgit v1.2.3


From 082708072a4250f5c4dbc62065e7af93f5e45646 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Mon, 8 Oct 2012 16:33:38 -0700
Subject: mm: revert 0def08e3 ("mm/mempolicy.c: check return code of
 check_range")

Revert commit 0def08e3acc2 because check_range can't fail in
migrate_to_node with considering current usecases.

Quote from Johannes

: I think it makes sense to revert.  Not because of the semantics, but I
: just don't see how check_range() could even fail for this callsite:
:
: 1. we pass mm->mmap->vm_start in there, so we should not fail due to
:    find_vma()
:
: 2. we pass MPOL_MF_DISCONTIG_OK, so the discontig checks do not apply
:    and so can not fail
:
: 3. we pass MPOL_MF_MOVE | MPOL_MF_MOVE_ALL, the page table loops will
:    continue until addr == end, so we never fail with -EIO

And I added a new VM_BUG_ON for checking migrate_to_node's future usecase
which might pass to MPOL_MF_STRICT.

Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Vasiliy Kulikov <segooon@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3d64b369180..0b78fb9ea65 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -946,15 +946,18 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 	nodemask_t nmask;
 	LIST_HEAD(pagelist);
 	int err = 0;
-	struct vm_area_struct *vma;
 
 	nodes_clear(nmask);
 	node_set(source, nmask);
 
-	vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
+	/*
+	 * This does not "check" the range but isolates all pages that
+	 * need migration.  Between passing in the full user address
+	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
+	 */
+	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
+	check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
-	if (IS_ERR(vma))
-		return PTR_ERR(vma);
 
 	if (!list_empty(&pagelist)) {
 		err = migrate_pages(&pagelist, new_node_page, dest,
-- 
cgit v1.2.3


From 5a883813845a2bb5ed2bd8c9240736c0740b156f Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Mon, 8 Oct 2012 16:33:39 -0700
Subject: memory-hotplug: fix zone stat mismatch

During memory-hotplug, I found NR_ISOLATED_[ANON|FILE] are increasing,
causing the kernel to hang.  When the system doesn't have enough free
pages, it enters reclaim but never reclaim any pages due to
too_many_isolated()==true and loops forever.

The cause is that when we do memory-hotadd after memory-remove,
__zone_pcp_update() clears a zone's ZONE_STAT_ITEMS in setup_pageset()
although the vm_stat_diff of all CPUs still have values.

In addtion, when we offline all pages of the zone, we reset them in
zone_pcp_reset without draining so we loss some zone stat item.

Reviewed-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c |  7 +++++++
 mm/vmstat.c     | 12 ++++++++++++
 2 files changed, 19 insertions(+)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9b8e6243a52..5485f0ef4ec 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5916,6 +5916,7 @@ static int __meminit __zone_pcp_update(void *data)
 		local_irq_save(flags);
 		if (pcp->count > 0)
 			free_pcppages_bulk(zone, pcp->count, pcp);
+		drain_zonestat(zone, pset);
 		setup_pageset(pset, batch);
 		local_irq_restore(flags);
 	}
@@ -5932,10 +5933,16 @@ void __meminit zone_pcp_update(struct zone *zone)
 void zone_pcp_reset(struct zone *zone)
 {
 	unsigned long flags;
+	int cpu;
+	struct per_cpu_pageset *pset;
 
 	/* avoid races with drain_pages()  */
 	local_irq_save(flags);
 	if (zone->pageset != &boot_pageset) {
+		for_each_online_cpu(cpu) {
+			pset = per_cpu_ptr(zone->pageset, cpu);
+			drain_zonestat(zone, pset);
+		}
 		free_percpu(zone->pageset);
 		zone->pageset = &boot_pageset;
 	}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 05e3a991374..2f11309955c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -495,6 +495,18 @@ void refresh_cpu_vm_stats(int cpu)
 			atomic_long_add(global_diff[i], &vm_stat[i]);
 }
 
+void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
+{
+	int i;
+
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+		if (pset->vm_stat_diff[i]) {
+			int v = pset->vm_stat_diff[i];
+			pset->vm_stat_diff[i] = 0;
+			atomic_long_add(v, &zone->vm_stat[i]);
+			atomic_long_add(v, &vm_stat[i]);
+		}
+}
 #endif
 
 #ifdef CONFIG_NUMA
-- 
cgit v1.2.3


From 8befedfe678ff84d5bc356be4f3cb1fd84959d02 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Mon, 8 Oct 2012 16:33:42 -0700
Subject: mm: remove unevictable_pgs_mlockfreed

Simply remove UNEVICTABLE_MLOCKFREED and unevictable_pgs_mlockfreed line
from /proc/vmstat: Johannes and Mel point out that it was very unlikely to
have been used by any tool, and of course we can restore it easily enough
if that turns out to be wrong.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michel Lespinasse <walken@google.com>
Cc: Ying Han <yinghan@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmstat.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'mm')

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 2f11309955c..c7370579111 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -794,7 +794,6 @@ const char * const vmstat_text[] = {
 	"unevictable_pgs_munlocked",
 	"unevictable_pgs_cleared",
 	"unevictable_pgs_stranded",
-	"unevictable_pgs_mlockfreed",	/* no longer useful: always zero */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	"thp_fault_alloc",
-- 
cgit v1.2.3


From c462f179e4d273f0da0e7cf302e29b0edf43844e Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@crashcourse.ca>
Date: Mon, 8 Oct 2012 16:33:43 -0700
Subject: mm/memory.c: fix typo in comment

Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 01ec048ece8..45bb6d296b6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2148,7 +2148,7 @@ out:
  * @addr: target user address of this page
  * @pfn: source kernel pfn
  *
- * Similar to vm_inert_page, this allows drivers to insert individual pages
+ * Similar to vm_insert_page, this allows drivers to insert individual pages
  * they've allocated into a user vma. Same comments apply.
  *
  * This function should only be called from a vm_ops->fault handler, and
-- 
cgit v1.2.3


From e46a28790e594c0876d1a84270926abf75460f61 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Mon, 8 Oct 2012 16:33:48 -0700
Subject: CMA: migrate mlocked pages

Presently CMA cannot migrate mlocked pages so it ends up failing to allocate
contiguous memory space.

This patch makes mlocked pages be migrated out.  Of course, it can affect
realtime processes but in CMA usecase, contiguous memory allocation failing
is far worse than access latency to an mlocked page being variable while
CMA is running.  If someone wants to make the system realtime, he shouldn't
enable CMA because stalls can still happen at random times.

[akpm@linux-foundation.org: tweak comment text, per Mel]
Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 8 ++++++--
 mm/internal.h   | 2 +-
 mm/page_alloc.c | 2 +-
 mm/vmscan.c     | 4 ++--
 4 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index d8187f9cabb..2c4ce17651d 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -461,6 +461,7 @@ static bool too_many_isolated(struct zone *zone)
  * @cc:		Compaction control structure.
  * @low_pfn:	The first PFN of the range.
  * @end_pfn:	The one-past-the-last PFN of the range.
+ * @unevictable: true if it allows to isolate unevictable pages
  *
  * Isolate all pages that can be migrated from the range specified by
  * [low_pfn, end_pfn).  Returns zero if there is a fatal signal
@@ -476,7 +477,7 @@ static bool too_many_isolated(struct zone *zone)
  */
 unsigned long
 isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
-			   unsigned long low_pfn, unsigned long end_pfn)
+		unsigned long low_pfn, unsigned long end_pfn, bool unevictable)
 {
 	unsigned long last_pageblock_nr = 0, pageblock_nr;
 	unsigned long nr_scanned = 0, nr_isolated = 0;
@@ -602,6 +603,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		if (!cc->sync)
 			mode |= ISOLATE_ASYNC_MIGRATE;
 
+		if (unevictable)
+			mode |= ISOLATE_UNEVICTABLE;
+
 		lruvec = mem_cgroup_page_lruvec(page, zone);
 
 		/* Try isolate the page */
@@ -807,7 +811,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	}
 
 	/* Perform the isolation */
-	low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn);
+	low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false);
 	if (!low_pfn || cc->contended)
 		return ISOLATE_ABORT;
 
diff --git a/mm/internal.h b/mm/internal.h
index 4dc93e2fe69..f5f295fe11e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -138,7 +138,7 @@ isolate_freepages_range(struct compact_control *cc,
 			unsigned long start_pfn, unsigned long end_pfn);
 unsigned long
 isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
-			   unsigned long low_pfn, unsigned long end_pfn);
+	unsigned long low_pfn, unsigned long end_pfn, bool unevictable);
 
 #endif
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5485f0ef4ec..fd86c47de86 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5690,7 +5690,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 		if (list_empty(&cc->migratepages)) {
 			cc->nr_migratepages = 0;
 			pfn = isolate_migratepages_range(cc->zone, cc,
-							 pfn, end);
+							 pfn, end, true);
 			if (!pfn) {
 				ret = -EINTR;
 				break;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8b627309dd4..2624edcfb42 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1009,8 +1009,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
 	if (!PageLRU(page))
 		return ret;
 
-	/* Do not give back unevictable pages for compaction */
-	if (PageUnevictable(page))
+	/* Compaction should not handle unevictable pages but CMA can do so */
+	if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
 		return ret;
 
 	ret = -EBUSY;
-- 
cgit v1.2.3


From beb51eaa88238daba698ad837222ad277d440b6d Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Mon, 8 Oct 2012 16:33:51 -0700
Subject: cma: decrease cc.nr_migratepages after reclaiming pagelist

reclaim_clean_pages_from_list() reclaims clean pages before migration so
cc.nr_migratepages should be updated.  Currently, there is no problem but
it can be wrong if we try to use the value in future.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fd86c47de86..bb90971182b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5674,7 +5674,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 					unsigned long start, unsigned long end)
 {
 	/* This function is based on compact_zone() from compaction.c. */
-
+	unsigned long nr_reclaimed;
 	unsigned long pfn = start;
 	unsigned int tries = 0;
 	int ret = 0;
@@ -5701,7 +5701,9 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 			break;
 		}
 
-		reclaim_clean_pages_from_list(cc->zone, &cc->migratepages);
+		nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
+							&cc->migratepages);
+		cc->nr_migratepages -= nr_reclaimed;
 
 		ret = migrate_pages(&cc->migratepages,
 				    alloc_migrate_target,
-- 
cgit v1.2.3


From c22331166b49b94b51424bb7fa1f83e09ad82734 Mon Sep 17 00:00:00 2001
From: Raghavendra D Prabhu <rprabhu@wnohang.net>
Date: Mon, 8 Oct 2012 16:33:55 -0700
Subject: mm: avoid section mismatch warning for memblock_type_name

Following section mismatch warning is thrown during build;

    WARNING: vmlinux.o(.text+0x32408f): Section mismatch in reference from the function memblock_type_name() to the variable .meminit.data:memblock
    The function memblock_type_name() references
    the variable __meminitdata memblock.
    This is often because memblock_type_name lacks a __meminitdata
    annotation or the annotation of memblock is wrong.

This is because memblock_type_name makes reference to memblock variable
with attribute __meminitdata.  Hence, the warning (even if the function is
inline).

[akpm@linux-foundation.org: remove inline]
Signed-off-by: Raghavendra D Prabhu <rprabhu@wnohang.net>
Cc: Tejun Heo <tj@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memblock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memblock.c b/mm/memblock.c
index d809f17ef75..931eef145af 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -41,7 +41,8 @@ static int memblock_memory_in_slab __initdata_memblock = 0;
 static int memblock_reserved_in_slab __initdata_memblock = 0;
 
 /* inline so we don't get a warning when pr_debug is compiled out */
-static inline const char *memblock_type_name(struct memblock_type *type)
+static __init_memblock const char *
+memblock_type_name(struct memblock_type *type)
 {
 	if (type == &memblock.memory)
 		return "memory";
-- 
cgit v1.2.3


From a16cee10c7ab994546ed98d9abfd4de74050124a Mon Sep 17 00:00:00 2001
From: Wen Congyang <wency@cn.fujitsu.com>
Date: Mon, 8 Oct 2012 16:33:58 -0700
Subject: memory-hotplug: preparation to notify memory block's state at memory
 hot remove

remove_memory() is called in two cases:
1. echo offline >/sys/devices/system/memory/memoryXX/state
2. hot remove a memory device

In the 1st case, the memory block's state is changed and the notification
that memory block's state changed is sent to userland after calling
remove_memory().  So user can notice memory block is changed.

But in the 2nd case, the memory block's state is not changed and the
notification is not also sent to userspcae even if calling
remove_memory().  So user cannot notice memory block is changed.

For adding the notification at memory hot remove, the patch just prepare
as follows:
1st case uses offline_pages() for offlining memory.
2nd case uses remove_memory() for offlining memory and changing memory block's
    state and notifing the information.

The patch does not implement notification to remove_memory().

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Jiang Liu <liuj97@gmail.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ce690a911f1..dfc0a6134c7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -874,7 +874,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
 	return offlined;
 }
 
-static int __ref offline_pages(unsigned long start_pfn,
+static int __ref __offline_pages(unsigned long start_pfn,
 		  unsigned long end_pfn, unsigned long timeout)
 {
 	unsigned long pfn, nr_pages, expire;
@@ -1007,15 +1007,24 @@ out:
 	return ret;
 }
 
+int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
+{
+	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
+}
+
 int remove_memory(u64 start, u64 size)
 {
 	unsigned long start_pfn, end_pfn;
 
 	start_pfn = PFN_DOWN(start);
 	end_pfn = start_pfn + PFN_DOWN(size);
-	return offline_pages(start_pfn, end_pfn, 120 * HZ);
+	return __offline_pages(start_pfn, end_pfn, 120 * HZ);
 }
 #else
+int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
+{
+	return -EINVAL;
+}
 int remove_memory(u64 start, u64 size)
 {
 	return -EINVAL;
-- 
cgit v1.2.3


From e90bdb7f52f94204c78fb40b0804645defdebd71 Mon Sep 17 00:00:00 2001
From: Wen Congyang <wency@cn.fujitsu.com>
Date: Mon, 8 Oct 2012 16:34:01 -0700
Subject: memory-hotplug: update memory block's state and notify userspace

remove_memory() will be called when hot removing a memory device.  But
even if offlining memory, we cannot notice it.  So the patch updates the
memory block's state and sends notification to userspace.

Additionally, the memory device may contain more than one memory block.
If the memory block has been offlined, __offline_pages() will fail.  So we
should try to offline one memory block at a time.

Thus remove_memory() also check each memory block's state.  So there is no
need to check the memory block's state before calling remove_memory().

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Jiang Liu <liuj97@gmail.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index dfc0a6134c7..7d0797475a4 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1014,11 +1014,42 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 
 int remove_memory(u64 start, u64 size)
 {
+	struct memory_block *mem = NULL;
+	struct mem_section *section;
 	unsigned long start_pfn, end_pfn;
+	unsigned long pfn, section_nr;
+	int ret;
 
 	start_pfn = PFN_DOWN(start);
 	end_pfn = start_pfn + PFN_DOWN(size);
-	return __offline_pages(start_pfn, end_pfn, 120 * HZ);
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		section_nr = pfn_to_section_nr(pfn);
+		if (!present_section_nr(section_nr))
+			continue;
+
+		section = __nr_to_section(section_nr);
+		/* same memblock? */
+		if (mem)
+			if ((section_nr >= mem->start_section_nr) &&
+			    (section_nr <= mem->end_section_nr))
+				continue;
+
+		mem = find_memory_block_hinted(section, mem);
+		if (!mem)
+			continue;
+
+		ret = offline_memory_block(mem);
+		if (ret) {
+			kobject_put(&mem->dev.kobj);
+			return ret;
+		}
+	}
+
+	if (mem)
+		kobject_put(&mem->dev.kobj);
+
+	return 0;
 }
 #else
 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
-- 
cgit v1.2.3


From b676b293fb48672904ee1b9828cb50b4eed01717 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 8 Oct 2012 16:34:03 -0700
Subject: mm, thp: fix mapped pages avoiding unevictable list on mlock

When a transparent hugepage is mapped and it is included in an mlock()
range, follow_page() incorrectly avoids setting the page's mlock bit and
moving it to the unevictable lru.

This is evident if you try to mlock(), munlock(), and then mlock() a
range again.  Currently:

	#define MAP_SIZE	(4 << 30)	/* 4GB */

	void *ptr = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
	mlock(ptr, MAP_SIZE);

		$ grep -E "Unevictable|Inactive\(anon" /proc/meminfo
		Inactive(anon):     6304 kB
		Unevictable:     4213924 kB

	munlock(ptr, MAP_SIZE);

		Inactive(anon):  4186252 kB
		Unevictable:       19652 kB

	mlock(ptr, MAP_SIZE);

		Inactive(anon):  4198556 kB
		Unevictable:       21684 kB

Notice that less than 2MB was added to the unevictable list; this is
because these pages in the range are not transparent hugepages since the
4GB range was allocated with mmap() and has no specific alignment.  If
posix_memalign() were used instead, unevictable would not have grown at
all on the second mlock().

The fix is to call mlock_vma_page() so that the mlock bit is set and the
page is added to the unevictable list.  With this patch:

	mlock(ptr, MAP_SIZE);

		Inactive(anon):     4056 kB
		Unevictable:     4213940 kB

	munlock(ptr, MAP_SIZE);

		Inactive(anon):  4198268 kB
		Unevictable:       19636 kB

	mlock(ptr, MAP_SIZE);

		Inactive(anon):     4008 kB
		Unevictable:     4213940 kB

Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michel Lespinasse <walken@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 11 ++++++++++-
 mm/memory.c      |  2 +-
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 08a943b9cf9..3a8d6b7d95d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -971,11 +971,12 @@ out_unlock:
 	return ret;
 }
 
-struct page *follow_trans_huge_pmd(struct mm_struct *mm,
+struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 				   unsigned long addr,
 				   pmd_t *pmd,
 				   unsigned int flags)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	struct page *page = NULL;
 
 	assert_spin_locked(&mm->page_table_lock);
@@ -998,6 +999,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
 		_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
 		set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
 	}
+	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+		if (page->mapping && trylock_page(page)) {
+			lru_add_drain();
+			if (page->mapping)
+				mlock_vma_page(page);
+			unlock_page(page);
+		}
+	}
 	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
 	VM_BUG_ON(!PageCompound(page));
 	if (flags & FOLL_GET)
diff --git a/mm/memory.c b/mm/memory.c
index 45bb6d296b6..fb135ba4aba 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1528,7 +1528,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 				spin_unlock(&mm->page_table_lock);
 				wait_split_huge_page(vma->anon_vma, pmd);
 			} else {
-				page = follow_trans_huge_pmd(mm, address,
+				page = follow_trans_huge_pmd(vma, address,
 							     pmd, flags);
 				spin_unlock(&mm->page_table_lock);
 				goto out;
-- 
cgit v1.2.3


From 8449d21fb49e9824e2736c5febd6b5d287cd2ba1 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 8 Oct 2012 16:34:06 -0700
Subject: mm, thp: fix mlock statistics

NR_MLOCK is only accounted in single page units: there's no logic to
handle transparent hugepages.  This patch checks the appropriate number of
pages to adjust the statistics by so that the correct amount of memory is
reflected.

Currently:

		$ grep Mlocked /proc/meminfo
		Mlocked:           19636 kB

	#define MAP_SIZE	(4 << 30)	/* 4GB */

	void *ptr = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
	mlock(ptr, MAP_SIZE);

		$ grep Mlocked /proc/meminfo
		Mlocked:           29844 kB

	munlock(ptr, MAP_SIZE);

		$ grep Mlocked /proc/meminfo
		Mlocked:           19636 kB

And with this patch:

		$ grep Mlock /proc/meminfo
		Mlocked:           19636 kB

	mlock(ptr, MAP_SIZE);

		$ grep Mlock /proc/meminfo
		Mlocked:         4213664 kB

	munlock(ptr, MAP_SIZE);

		$ grep Mlock /proc/meminfo
		Mlocked:           19636 kB

Signed-off-by: David Rientjes <rientjes@google.com>
Reported-by: Hugh Dickens <hughd@google.com>
Acked-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Michel Lespinasse <walken@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/internal.h | 3 ++-
 mm/mlock.c    | 9 ++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/internal.h b/mm/internal.h
index f5f295fe11e..a4fa284f6bc 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -180,7 +180,8 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
 		return 0;
 
 	if (!TestSetPageMlocked(page)) {
-		inc_zone_page_state(page, NR_MLOCK);
+		mod_zone_page_state(page_zone(page), NR_MLOCK,
+				    hpage_nr_pages(page));
 		count_vm_event(UNEVICTABLE_PGMLOCKED);
 	}
 	return 1;
diff --git a/mm/mlock.c b/mm/mlock.c
index de732159289..f0b9ce572fc 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -56,7 +56,8 @@ void clear_page_mlock(struct page *page)
 	if (!TestClearPageMlocked(page))
 		return;
 
-	dec_zone_page_state(page, NR_MLOCK);
+	mod_zone_page_state(page_zone(page), NR_MLOCK,
+			    -hpage_nr_pages(page));
 	count_vm_event(UNEVICTABLE_PGCLEARED);
 	if (!isolate_lru_page(page)) {
 		putback_lru_page(page);
@@ -78,7 +79,8 @@ void mlock_vma_page(struct page *page)
 	BUG_ON(!PageLocked(page));
 
 	if (!TestSetPageMlocked(page)) {
-		inc_zone_page_state(page, NR_MLOCK);
+		mod_zone_page_state(page_zone(page), NR_MLOCK,
+				    hpage_nr_pages(page));
 		count_vm_event(UNEVICTABLE_PGMLOCKED);
 		if (!isolate_lru_page(page))
 			putback_lru_page(page);
@@ -105,7 +107,8 @@ void munlock_vma_page(struct page *page)
 	BUG_ON(!PageLocked(page));
 
 	if (TestClearPageMlocked(page)) {
-		dec_zone_page_state(page, NR_MLOCK);
+		mod_zone_page_state(page_zone(page), NR_MLOCK,
+				    -hpage_nr_pages(page));
 		if (!isolate_lru_page(page)) {
 			int ret = SWAP_AGAIN;
 
-- 
cgit v1.2.3


From 45ec16908e84e00d847e9063231fb3a75e6366bc Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 8 Oct 2012 16:34:09 -0700
Subject: mm: use %pK for /proc/vmallocinfo

In the paranoid case of sysctl kernel.kptr_restrict=2, mask the kernel
virtual addresses in /proc/vmallocinfo too.

Signed-off-by: Kees Cook <keescook@chromium.org>
Reported-by: Brad Spengler <spender@grsecurity.net>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8de704679bf..78e08300db2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2571,7 +2571,7 @@ static int s_show(struct seq_file *m, void *p)
 {
 	struct vm_struct *v = p;
 
-	seq_printf(m, "0x%p-0x%p %7ld",
+	seq_printf(m, "0x%pK-0x%pK %7ld",
 		v->addr, v->addr + v->size, v->size);
 
 	if (v->caller)
-- 
cgit v1.2.3


From 7795912c257bc068445f1db429c94d6b4b6ee604 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 8 Oct 2012 16:34:11 -0700
Subject: mm: document PageHuge somewhat

Acked-by: David Rientjes <rientjes@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 993f7c1820a..59a0059b39e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -671,6 +671,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
 	}
 }
 
+/*
+ * PageHuge() only returns true for hugetlbfs pages, but not for normal or
+ * transparent huge pages.  See the PageTransHuge() documentation for more
+ * details.
+ */
 int PageHuge(struct page *page)
 {
 	compound_page_dtor *dtor;
-- 
cgit v1.2.3


From d760afd4d2570653891f94e13b848e97150dc5a6 Mon Sep 17 00:00:00 2001
From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Date: Mon, 8 Oct 2012 16:34:14 -0700
Subject: memory-hotplug: suppress "Trying to free nonexistent resource
 <XXXXXXXXXXXXXXXX-YYYYYYYYYYYYYYYY>" warning

When our x86 box calls __remove_pages(), release_mem_region() shows many
warnings.  And x86 box cannot unregister iomem_resource.

  "Trying to free nonexistent resource <XXXXXXXXXXXXXXXX-YYYYYYYYYYYYYYYY>"

release_mem_region() has been changed to be called in each
PAGES_PER_SECTION by commit de7f0cba9678 ("memory hotplug: release
memory regions in PAGES_PER_SECTION chunks").  Because powerpc registers
iomem_resource in each PAGES_PER_SECTION chunk.  But when I hot add
memory on x86 box, iomem_resource is register in each _CRS not
PAGES_PER_SECTION chunk.  So x86 box unregisters iomem_resource.

The patch fixes the problem.

Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Jiang Liu <liuj97@gmail.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Nathan Fontenot <nfont@austin.ibm.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7d0797475a4..56b758ae57d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -369,11 +369,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
 	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
 	BUG_ON(nr_pages % PAGES_PER_SECTION);
 
+	release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
+
 	sections_to_remove = nr_pages / PAGES_PER_SECTION;
 	for (i = 0; i < sections_to_remove; i++) {
 		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
-		release_mem_region(pfn << PAGE_SHIFT,
-				   PAGES_PER_SECTION << PAGE_SHIFT);
 		ret = __remove_section(zone, __pfn_to_section(pfn));
 		if (ret)
 			break;
-- 
cgit v1.2.3


From b113da65785d5f3f9ff1451ec0fe43d6d76da25b Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Mon, 8 Oct 2012 16:34:25 -0700
Subject: mm: Add and use update_mmu_cache_pmd() in transparent huge page code.

The transparent huge page code passes a PMD pointer in as the third
argument of update_mmu_cache(), which expects a PTE pointer.

This never got noticed because X86 implements update_mmu_cache() as a
macro and thus we don't get any type checking, and X86 is the only
architecture which supports transparent huge pages currently.

Before other architectures can support transparent huge pages properly we
need to add a new interface which will take a PMD pointer as the third
argument rather than a PTE pointer.

[akpm@linux-foundation.org: implement update_mm_cache_pmd() for s390]
Signed-off-by: David S. Miller <davem@davemloft.net>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3a8d6b7d95d..68a3c93036f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -900,7 +900,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		entry = pmd_mkyoung(orig_pmd);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		if (pmdp_set_access_flags(vma, haddr, pmd, entry,  1))
-			update_mmu_cache(vma, address, pmd);
+			update_mmu_cache_pmd(vma, address, pmd);
 		ret |= VM_FAULT_WRITE;
 		goto out_unlock;
 	}
@@ -956,7 +956,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		pmdp_clear_flush(vma, haddr, pmd);
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
-		update_mmu_cache(vma, address, pmd);
+		update_mmu_cache_pmd(vma, address, pmd);
 		page_remove_rmap(page);
 		put_page(page);
 		ret |= VM_FAULT_WRITE;
@@ -2041,7 +2041,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	BUG_ON(!pmd_none(*pmd));
 	page_add_new_anon_rmap(new_page, vma, address);
 	set_pmd_at(mm, address, pmd, _pmd);
-	update_mmu_cache(vma, address, pmd);
+	update_mmu_cache_pmd(vma, address, pmd);
 	pgtable_trans_huge_deposit(mm, pgtable);
 	spin_unlock(&mm->page_table_lock);
 
-- 
cgit v1.2.3


From f5c8ad47284ca01dafc37da5a72bb9644174d387 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Mon, 8 Oct 2012 16:34:26 -0700
Subject: mm: thp: Use more portable PMD clearing sequenece in zap_huge_pmd().

Invalidation sequences are handled in various ways on various
architectures.

One way, which sparc64 uses, is to let the set_*_at() functions accumulate
pending flushes into a per-cpu array.  Then the flush_tlb_range() et al.
calls process the pending TLB flushes.

In this regime, the __tlb_remove_*tlb_entry() implementations are
essentially NOPs.

The canonical PTE zap in mm/memory.c is:

			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);

With a subsequent tlb_flush_mmu() if needed.

Mirror this in the THP PMD zapping using:

		orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
		page = pmd_page(orig_pmd);
		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);

And we properly accomodate TLB flush mechanims like the one described
above.

Signed-off-by: David S. Miller <davem@davemloft.net>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 68a3c93036f..a863af26c79 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1024,9 +1024,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	if (__pmd_trans_huge_lock(pmd, vma) == 1) {
 		struct page *page;
 		pgtable_t pgtable;
+		pmd_t orig_pmd;
 		pgtable = pgtable_trans_huge_withdraw(tlb->mm);
-		page = pmd_page(*pmd);
-		pmd_clear(pmd);
+		orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
+		page = pmd_page(orig_pmd);
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
 		page_remove_rmap(page);
 		VM_BUG_ON(page_mapcount(page) < 0);
-- 
cgit v1.2.3


From baaf1dd491433a78826150aff7411015de7e9b65 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sun, 5 Aug 2012 15:00:58 +0000
Subject: mm/slob: use min_t() to compare ARCH_SLAB_MINALIGN

The definition of ARCH_SLAB_MINALIGN is architecture dependent
and can be either of type size_t or int. Comparing that value
with ARCH_KMALLOC_MINALIGN can cause harmless warnings on
platforms where they are different. Since both are always
small positive integer numbers, using the size_t type to compare
them is safe and gets rid of the warning.

Without this patch, building ARM collie_defconfig results in:

mm/slob.c: In function '__kmalloc_node':
mm/slob.c:431:152: warning: comparison of distinct pointer types lacks a cast [enabled by default]
mm/slob.c: In function 'kfree':
mm/slob.c:484:153: warning: comparison of distinct pointer types lacks a cast [enabled by default]
mm/slob.c: In function 'ksize':
mm/slob.c:503:153: warning: comparison of distinct pointer types lacks a cast [enabled by default]

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
---
 mm/slob.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/slob.c b/mm/slob.c
index 45d4ca79933..497c55e318a 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -428,7 +428,7 @@ out:
 void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 {
 	unsigned int *m;
-	int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+	int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
 	void *ret;
 
 	gfp &= gfp_allowed_mask;
@@ -481,7 +481,7 @@ void kfree(const void *block)
 
 	sp = virt_to_page(block);
 	if (PageSlab(sp)) {
-		int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+		int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
 		unsigned int *m = (unsigned int *)(block - align);
 		slob_free(m, *m + align);
 	} else
@@ -500,7 +500,7 @@ size_t ksize(const void *block)
 
 	sp = virt_to_page(block);
 	if (PageSlab(sp)) {
-		int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+		int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
 		unsigned int *m = (unsigned int *)(block - align);
 		return SLOB_UNITS(*m) * SLOB_UNIT;
 	} else
-- 
cgit v1.2.3


From 35c2a7f4908d404c9124c2efc6ada4640ca4d5d5 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sun, 7 Oct 2012 20:32:51 -0700
Subject: tmpfs,ceph,gfs2,isofs,reiserfs,xfs: fix fh_len checking

Fuzzing with trinity oopsed on the 1st instruction of shmem_fh_to_dentry(),
	u64 inum = fid->raw[2];
which is unhelpfully reported as at the end of shmem_alloc_inode():

BUG: unable to handle kernel paging request at ffff880061cd3000
IP: [<ffffffff812190d0>] shmem_alloc_inode+0x40/0x40
Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
Call Trace:
 [<ffffffff81488649>] ? exportfs_decode_fh+0x79/0x2d0
 [<ffffffff812d77c3>] do_handle_open+0x163/0x2c0
 [<ffffffff812d792c>] sys_open_by_handle_at+0xc/0x10
 [<ffffffff83a5f3f8>] tracesys+0xe1/0xe6

Right, tmpfs is being stupid to access fid->raw[2] before validating that
fh_len includes it: the buffer kmalloc'ed by do_sys_name_to_handle() may
fall at the end of a page, and the next page not be present.

But some other filesystems (ceph, gfs2, isofs, reiserfs, xfs) are being
careless about fh_len too, in fh_to_dentry() and/or fh_to_parent(), and
could oops in the same way: add the missing fh_len checks to those.

Reported-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Sage Weil <sage@inktank.com>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/shmem.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/shmem.c b/mm/shmem.c
index cc12072f878..67afba5117f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2220,12 +2220,14 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
 {
 	struct inode *inode;
 	struct dentry *dentry = NULL;
-	u64 inum = fid->raw[2];
-	inum = (inum << 32) | fid->raw[1];
+	u64 inum;
 
 	if (fh_len < 3)
 		return NULL;
 
+	inum = fid->raw[2];
+	inum = (inum << 32) | fid->raw[1];
+
 	inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
 			shmem_match, fid->raw);
 	if (inode) {
-- 
cgit v1.2.3


From 210ed9defffca13b909f040d7338d8062e5594a3 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Mon, 8 Oct 2012 09:26:01 +0200
Subject: mm, slab: release slab_mutex earlier in kmem_cache_destroy()

Commit 1331e7a1bbe1 ("rcu: Remove _rcu_barrier() dependency on
__stop_machine()") introduced slab_mutex -> cpu_hotplug.lock dependency
through kmem_cache_destroy() -> rcu_barrier() -> _rcu_barrier() ->
get_online_cpus().

Lockdep thinks that this might actually result in ABBA deadlock,
and reports it as below:

=== [ cut here ] ===
 ======================================================
 [ INFO: possible circular locking dependency detected ]
 3.6.0-rc5-00004-g0d8ee37 #143 Not tainted
 -------------------------------------------------------
 kworker/u:2/40 is trying to acquire lock:
  (rcu_sched_state.barrier_mutex){+.+...}, at: [<ffffffff810f2126>] _rcu_barrier+0x26/0x1e0

 but task is already holding lock:
  (slab_mutex){+.+.+.}, at: [<ffffffff81176e15>] kmem_cache_destroy+0x45/0xe0

 which lock already depends on the new lock.

 the existing dependency chain (in reverse order) is:

 -> #2 (slab_mutex){+.+.+.}:
        [<ffffffff810ae1e2>] validate_chain+0x632/0x720
        [<ffffffff810ae5d9>] __lock_acquire+0x309/0x530
        [<ffffffff810ae921>] lock_acquire+0x121/0x190
        [<ffffffff8155d4cc>] __mutex_lock_common+0x5c/0x450
        [<ffffffff8155d9ee>] mutex_lock_nested+0x3e/0x50
        [<ffffffff81558cb5>] cpuup_callback+0x2f/0xbe
        [<ffffffff81564b83>] notifier_call_chain+0x93/0x140
        [<ffffffff81076f89>] __raw_notifier_call_chain+0x9/0x10
        [<ffffffff8155719d>] _cpu_up+0xba/0x14e
        [<ffffffff815572ed>] cpu_up+0xbc/0x117
        [<ffffffff81ae05e3>] smp_init+0x6b/0x9f
        [<ffffffff81ac47d6>] kernel_init+0x147/0x1dc
        [<ffffffff8156ab44>] kernel_thread_helper+0x4/0x10

 -> #1 (cpu_hotplug.lock){+.+.+.}:
        [<ffffffff810ae1e2>] validate_chain+0x632/0x720
        [<ffffffff810ae5d9>] __lock_acquire+0x309/0x530
        [<ffffffff810ae921>] lock_acquire+0x121/0x190
        [<ffffffff8155d4cc>] __mutex_lock_common+0x5c/0x450
        [<ffffffff8155d9ee>] mutex_lock_nested+0x3e/0x50
        [<ffffffff81049197>] get_online_cpus+0x37/0x50
        [<ffffffff810f21bb>] _rcu_barrier+0xbb/0x1e0
        [<ffffffff810f22f0>] rcu_barrier_sched+0x10/0x20
        [<ffffffff810f2309>] rcu_barrier+0x9/0x10
        [<ffffffff8118c129>] deactivate_locked_super+0x49/0x90
        [<ffffffff8118cc01>] deactivate_super+0x61/0x70
        [<ffffffff811aaaa7>] mntput_no_expire+0x127/0x180
        [<ffffffff811ab49e>] sys_umount+0x6e/0xd0
        [<ffffffff81569979>] system_call_fastpath+0x16/0x1b

 -> #0 (rcu_sched_state.barrier_mutex){+.+...}:
        [<ffffffff810adb4e>] check_prev_add+0x3de/0x440
        [<ffffffff810ae1e2>] validate_chain+0x632/0x720
        [<ffffffff810ae5d9>] __lock_acquire+0x309/0x530
        [<ffffffff810ae921>] lock_acquire+0x121/0x190
        [<ffffffff8155d4cc>] __mutex_lock_common+0x5c/0x450
        [<ffffffff8155d9ee>] mutex_lock_nested+0x3e/0x50
        [<ffffffff810f2126>] _rcu_barrier+0x26/0x1e0
        [<ffffffff810f22f0>] rcu_barrier_sched+0x10/0x20
        [<ffffffff810f2309>] rcu_barrier+0x9/0x10
        [<ffffffff81176ea1>] kmem_cache_destroy+0xd1/0xe0
        [<ffffffffa04c3154>] nf_conntrack_cleanup_net+0xe4/0x110 [nf_conntrack]
        [<ffffffffa04c31aa>] nf_conntrack_cleanup+0x2a/0x70 [nf_conntrack]
        [<ffffffffa04c42ce>] nf_conntrack_net_exit+0x5e/0x80 [nf_conntrack]
        [<ffffffff81454b79>] ops_exit_list+0x39/0x60
        [<ffffffff814551ab>] cleanup_net+0xfb/0x1b0
        [<ffffffff8106917b>] process_one_work+0x26b/0x4c0
        [<ffffffff81069f3e>] worker_thread+0x12e/0x320
        [<ffffffff8106f73e>] kthread+0x9e/0xb0
        [<ffffffff8156ab44>] kernel_thread_helper+0x4/0x10

 other info that might help us debug this:

 Chain exists of:
   rcu_sched_state.barrier_mutex --> cpu_hotplug.lock --> slab_mutex

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(slab_mutex);
                                lock(cpu_hotplug.lock);
                                lock(slab_mutex);
   lock(rcu_sched_state.barrier_mutex);

  *** DEADLOCK ***
=== [ cut here ] ===

This is actually a false positive. Lockdep has no way of knowing the fact
that the ABBA can actually never happen, because of special semantics of
cpu_hotplug.refcount and its handling in cpu_hotplug_begin(); the mutual
exclusion there is not achieved through mutex, but through
cpu_hotplug.refcount.

The "neither cpu_up() nor cpu_down() will proceed past cpu_hotplug_begin()
until everyone who called get_online_cpus() will call put_online_cpus()"
semantics is totally invisible to lockdep.

This patch therefore moves the unlock of slab_mutex so that rcu_barrier()
is being called with it unlocked. It has two advantages:

- it slightly reduces hold time of slab_mutex; as it's used to protect
  the cachep list, it's not necessary to hold it over kmem_cache_free()
  call any more
- it silences the lockdep false positive warning, as it avoids lockdep ever
  learning about slab_mutex -> cpu_hotplug.lock dependency

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slab_common.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slab_common.c b/mm/slab_common.c
index 9c217255ac4..069a24e6440 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -168,6 +168,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
 		list_del(&s->list);
 
 		if (!__kmem_cache_shutdown(s)) {
+			mutex_unlock(&slab_mutex);
 			if (s->flags & SLAB_DESTROY_BY_RCU)
 				rcu_barrier();
 
@@ -175,12 +176,14 @@ void kmem_cache_destroy(struct kmem_cache *s)
 			kmem_cache_free(kmem_cache, s);
 		} else {
 			list_add(&s->list, &slab_caches);
+			mutex_unlock(&slab_mutex);
 			printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
 				s->name);
 			dump_stack();
 		}
+	} else {
+		mutex_unlock(&slab_mutex);
 	}
-	mutex_unlock(&slab_mutex);
 	put_online_cpus();
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
-- 
cgit v1.2.3


From 91a27b2a756784714e924e5e854b919273082d26 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 10 Oct 2012 15:25:28 -0400
Subject: vfs: define struct filename and have getname() return it

getname() is intended to copy pathname strings from userspace into a
kernel buffer. The result is just a string in kernel space. It would
however be quite helpful to be able to attach some ancillary info to
the string.

For instance, we could attach some audit-related info to reduce the
amount of audit-related processing needed. When auditing is enabled,
we could also call getname() on the string more than once and not
need to recopy it from userspace.

This patchset converts the getname()/putname() interfaces to return
a struct instead of a string. For now, the struct just tracks the
string in kernel space and the original userland pointer for it.

Later, we'll add other information to the struct as it becomes
convenient.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/swapfile.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 14e254c768f..90d2ed591de 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1483,7 +1483,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	struct file *swap_file, *victim;
 	struct address_space *mapping;
 	struct inode *inode;
-	char *pathname;
+	struct filename *pathname;
 	int oom_score_adj;
 	int i, type, prev;
 	int err;
@@ -1498,8 +1498,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	if (IS_ERR(pathname))
 		goto out;
 
-	victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
-	putname(pathname);
+	victim = filp_open(pathname->name, O_RDWR|O_LARGEFILE, 0);
 	err = PTR_ERR(victim);
 	if (IS_ERR(victim))
 		goto out;
@@ -1936,7 +1935,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 {
 	struct swap_info_struct *p;
-	char *name;
+	struct filename *name;
 	struct file *swap_file = NULL;
 	struct address_space *mapping;
 	int i;
@@ -1967,7 +1966,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		name = NULL;
 		goto bad_swap;
 	}
-	swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
+	swap_file = filp_open(name->name, O_RDWR|O_LARGEFILE, 0);
 	if (IS_ERR(swap_file)) {
 		error = PTR_ERR(swap_file);
 		swap_file = NULL;
@@ -2053,7 +2052,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 
 	printk(KERN_INFO "Adding %uk swap on %s.  "
 			"Priority:%d extents:%d across:%lluk %s%s%s\n",
-		p->pages<<(PAGE_SHIFT-10), name, p->prio,
+		p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
 		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
 		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
 		(p->flags & SWP_DISCARDABLE) ? "D" : "",
-- 
cgit v1.2.3


From 669abf4e5539c8aa48bf28c965be05c0a7b58a27 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 10 Oct 2012 16:43:10 -0400
Subject: vfs: make path_openat take a struct filename pointer

...and fix up the callers. For do_file_open_root, just declare a
struct filename on the stack and fill out the .name field. For
do_filp_open, make it also take a struct filename pointer, and fix up its
callers to call it appropriately.

For filp_open, add a variant that takes a struct filename pointer and turn
filp_open into a wrapper around it.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/swapfile.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 90d2ed591de..71cd288b200 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1498,7 +1498,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	if (IS_ERR(pathname))
 		goto out;
 
-	victim = filp_open(pathname->name, O_RDWR|O_LARGEFILE, 0);
+	victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
 	err = PTR_ERR(victim);
 	if (IS_ERR(victim))
 		goto out;
@@ -1966,7 +1966,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		name = NULL;
 		goto bad_swap;
 	}
-	swap_file = filp_open(name->name, O_RDWR|O_LARGEFILE, 0);
+	swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
 	if (IS_ERR(swap_file)) {
 		error = PTR_ERR(swap_file);
 		swap_file = NULL;
-- 
cgit v1.2.3


From 325adeb55e32c50055c654e5b06a49f0bd88420b Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Mon, 15 Oct 2012 13:44:56 +0200
Subject: mm: huge_memory: Fix build error.

Certain configurations won't implicitly pull in <linux/pagemap.h> resulting
in the following build error:

  mm/huge_memory.c: In function 'release_pte_page':
  mm/huge_memory.c:1697:2: error: implicit declaration of function 'unlock_page' [-Werror=implicit-function-declaration]
  mm/huge_memory.c: In function '__collapse_huge_page_isolate':
  mm/huge_memory.c:1757:3: error: implicit declaration of function 'trylock_page' [-Werror=implicit-function-declaration]
  cc1: some warnings being treated as errors

Reported-by: David Daney <david.daney@cavium.com>
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a863af26c79..40f17c34b41 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -17,6 +17,7 @@
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
 #include <linux/mman.h>
+#include <linux/pagemap.h>
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
-- 
cgit v1.2.3


From 32f8516a8c733d281faa9f6666b509035246505c Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 16 Oct 2012 17:31:23 -0700
Subject: mm, mempolicy: fix printing stack contents in numa_maps

When reading /proc/pid/numa_maps, it's possible to return the contents of
the stack where the mempolicy string should be printed if the policy gets
freed from beneath us.

This happens because mpol_to_str() may return an error the
stack-allocated buffer is then printed without ever being stored.

There are two possible error conditions in mpol_to_str():

 - if the buffer allocated is insufficient for the string to be stored,
   and

 - if the mempolicy has an invalid mode.

The first error condition is not triggered in any of the callers to
mpol_to_str(): at least 50 bytes is always allocated on the stack and this
is sufficient for the string to be written.  A future patch should convert
this into BUILD_BUG_ON() since we know the maximum strlen possible, but
that's not -rc material.

The second error condition is possible if a race occurs in dropping a
reference to a task's mempolicy causing it to be freed during the read().
The slab poison value is then used for the mode and mpol_to_str() returns
-EINVAL.

This race is only possible because get_vma_policy() believes that
mm->mmap_sem protects task->mempolicy, which isn't true.  The exit path
does not hold mm->mmap_sem when dropping the reference or setting
task->mempolicy to NULL: it uses task_lock(task) instead.

Thus, it's required for the caller of a task mempolicy to hold
task_lock(task) while grabbing the mempolicy and reading it.  Callers with
a vma policy store their mempolicy earlier and can simply increment the
reference count so it's guaranteed not to be freed.

Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0b78fb9ea65..d04a8a54c29 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1536,9 +1536,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
  *
  * Returns effective policy for a VMA at specified address.
  * Falls back to @task or system default policy, as necessary.
- * Current or other task's task mempolicy and non-shared vma policies
- * are protected by the task's mmap_sem, which must be held for read by
- * the caller.
+ * Current or other task's task mempolicy and non-shared vma policies must be
+ * protected by task_lock(task) by the caller.
  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
  * count--added by the get_policy() vm_op, as appropriate--to protect against
  * freeing by another task.  It is the caller's responsibility to free the
-- 
cgit v1.2.3


From deb521c44fa529b24cc78a64702757a683f82487 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 19 Oct 2012 13:37:57 -0700
Subject: remap_file_pages: correctly handle the case of a NULL vm_ops pointer

In commit 0b173bc4daa8 ("mm: kill vma flag VM_CAN_NONLINEAR") we
replaced the VM_CAN_NONLINEAR test with checking whether the mapping has
a '->remap_pages()' vm operation, but there is no guarantee that there
it even has a vm_ops pointer at all.

Add the appropriate test for NULL vm_ops.

Reported-by: Sasha Levin <levinsasha928@gmail.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/fremap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/fremap.c b/mm/fremap.c
index 3899a86851c..a0aaf0e5680 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -169,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
 		goto out;
 
-	if (!vma->vm_ops->remap_pages)
+	if (!vma->vm_ops || !vma->vm_ops->remap_pages)
 		goto out;
 
 	if (start < vma->vm_start || start + size > vma->vm_end)
-- 
cgit v1.2.3


From 0db63d7e25f96e2c6da925c002badf6f144ddf30 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Fri, 19 Oct 2012 13:56:57 -0700
Subject: mm: compaction: correct the nr_strict va isolated check for CMA

Thierry reported that the "iron out" patch for isolate_freepages_block()
had problems due to the strict check being too strict with "mm:
compaction: Iron out isolate_freepages_block() and
isolate_freepages_range() -fix1".  It's possible that more pages than
necessary are isolated but the check still fails and I missed that this
fix was not picked up before RC1.  This same problem has been identified
in 3.7-RC1 by Tony Prisk and should be addressed by the following patch.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Tested-by: Tony Prisk <linux@prisktech.co.nz>
Reported-by: Thierry Reding <thierry.reding@avionic-design.de>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: Richard Davies <richard@arachsys.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Avi Kivity <avi@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 2c4ce17651d..9eef55838fc 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -346,7 +346,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 	 * pages requested were isolated. If there were any failures, 0 is
 	 * returned and CMA will fail.
 	 */
-	if (strict && nr_strict_required != total_isolated)
+	if (strict && nr_strict_required > total_isolated)
 		total_isolated = 0;
 
 	if (locked)
-- 
cgit v1.2.3


From 6ede1fd3cb404c0016de6ac529df46d561bd558b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 22 Oct 2012 16:35:18 -0700
Subject: x86, mm: Trim memory in memblock to be page aligned

We will not map partial pages, so need to make sure memblock
allocation will not allocate those bytes out.

Also we will use for_each_mem_pfn_range() to loop to map memory
range to keep them consistent.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/CAE9FiQVZirvaBMFYRfXMmWEcHbKSicQEHz4VAwUv0xFCk51ZNw@mail.gmail.com
Acked-by: Jacob Shin <jacob.shin@amd.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: <stable@vger.kernel.org>
---
 mm/memblock.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'mm')

diff --git a/mm/memblock.c b/mm/memblock.c
index 931eef145af..625905523c2 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -930,6 +930,30 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si
 	return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
 }
 
+void __init_memblock memblock_trim_memory(phys_addr_t align)
+{
+	int i;
+	phys_addr_t start, end, orig_start, orig_end;
+	struct memblock_type *mem = &memblock.memory;
+
+	for (i = 0; i < mem->cnt; i++) {
+		orig_start = mem->regions[i].base;
+		orig_end = mem->regions[i].base + mem->regions[i].size;
+		start = round_up(orig_start, align);
+		end = round_down(orig_end, align);
+
+		if (start == orig_start && end == orig_end)
+			continue;
+
+		if (start < end) {
+			mem->regions[i].base = start;
+			mem->regions[i].size = end - start;
+		} else {
+			memblock_remove_region(mem, i);
+			i--;
+		}
+	}
+}
 
 void __init_memblock memblock_set_current_limit(phys_addr_t limit)
 {
-- 
cgit v1.2.3


From ef5d437f71afdf4afdbab99213add99f4b1318fd Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 25 Oct 2012 13:37:31 -0700
Subject: mm: fix XFS oops due to dirty pages without buffers on s390

On s390 any write to a page (even from kernel itself) sets architecture
specific page dirty bit.  Thus when a page is written to via buffered
write, HW dirty bit gets set and when we later map and unmap the page,
page_remove_rmap() finds the dirty bit and calls set_page_dirty().

Dirtying of a page which shouldn't be dirty can cause all sorts of
problems to filesystems.  The bug we observed in practice is that
buffers from the page get freed, so when the page gets later marked as
dirty and writeback writes it, XFS crashes due to an assertion
BUG_ON(!PagePrivate(page)) in page_buffers() called from
xfs_count_page_state().

Similar problem can also happen when zero_user_segment() call from
xfs_vm_writepage() (or block_write_full_page() for that matter) set the
hardware dirty bit during writeback, later buffers get freed, and then
page unmapped.

Fix the issue by ignoring s390 HW dirty bit for page cache pages of
mappings with mapping_cap_account_dirty().  This is safe because for
such mappings when a page gets marked as writeable in PTE it is also
marked dirty in do_wp_page() or do_page_fault().  When the dirty bit is
cleared by clear_page_dirty_for_io(), the page gets writeprotected in
page_mkclean().  So pagecache page is writeable if and only if it is
dirty.

Thanks to Hugh Dickins for pointing out mapping has to have
mapping_cap_account_dirty() for things to work and proposing a cleaned
up variant of the patch.

The patch has survived about two hours of running fsx-linux on tmpfs
while heavily swapping and several days of running on out build machines
where the original problem was triggered.

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: <stable@vger.kernel.org>		[3.0+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/rmap.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/rmap.c b/mm/rmap.c
index 7df7984d476..2ee1ef0f317 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
 #include <linux/hugetlb.h>
+#include <linux/backing-dev.h>
 
 #include <asm/tlbflush.h>
 
@@ -926,11 +927,8 @@ int page_mkclean(struct page *page)
 
 	if (page_mapped(page)) {
 		struct address_space *mapping = page_mapping(page);
-		if (mapping) {
+		if (mapping)
 			ret = page_mkclean_file(mapping, page);
-			if (page_test_and_clear_dirty(page_to_pfn(page), 1))
-				ret = 1;
-		}
 	}
 
 	return ret;
@@ -1116,6 +1114,7 @@ void page_add_file_rmap(struct page *page)
  */
 void page_remove_rmap(struct page *page)
 {
+	struct address_space *mapping = page_mapping(page);
 	bool anon = PageAnon(page);
 	bool locked;
 	unsigned long flags;
@@ -1138,8 +1137,19 @@ void page_remove_rmap(struct page *page)
 	 * this if the page is anon, so about to be freed; but perhaps
 	 * not if it's in swapcache - there might be another pte slot
 	 * containing the swap entry, but page not yet written to swap.
+	 *
+	 * And we can skip it on file pages, so long as the filesystem
+	 * participates in dirty tracking; but need to catch shm and tmpfs
+	 * and ramfs pages which have been modified since creation by read
+	 * fault.
+	 *
+	 * Note that mapping must be decided above, before decrementing
+	 * mapcount (which luckily provides a barrier): once page is unmapped,
+	 * it could be truncated and page->mapping reset to NULL at any moment.
+	 * Note also that we are relying on page_mapping(page) to set mapping
+	 * to &swapper_space when PageSwapCache(page).
 	 */
-	if ((!anon || PageSwapCache(page)) &&
+	if (mapping && !mapping_cap_account_dirty(mapping) &&
 	    page_test_and_clear_dirty(page_to_pfn(page), 1))
 		set_page_dirty(page);
 	/*
-- 
cgit v1.2.3


From 86a595f961360c9c31d00fb111dd09e5d5ba9a40 Mon Sep 17 00:00:00 2001
From: Bob Liu <lliubbo@gmail.com>
Date: Thu, 25 Oct 2012 13:37:56 -0700
Subject: mm/page_alloc.c:alloc_contig_range(): return early for err path

If start_isolate_page_range() failed, unset_migratetype_isolate() has been
done inside it.

Signed-off-by: Bob Liu <lliubbo@gmail.com>
Cc: Ni zhan Chen <nizhan.chen@gmail.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bb90971182b..b0012ab372a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5825,7 +5825,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	ret = start_isolate_page_range(pfn_max_align_down(start),
 				       pfn_max_align_up(end), migratetype);
 	if (ret)
-		goto done;
+		return ret;
 
 	ret = __alloc_contig_migrate_range(&cc, start, end);
 	if (ret)
-- 
cgit v1.2.3


From 35cfa2b0b491c37e23527822bf365610dbb188e5 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 25 Oct 2012 13:38:01 -0700
Subject: mm/mmu_notifier: allocate mmu_notifier in advance

While allocating mmu_notifier with parameter GFP_KERNEL, swap would start
to work in case of tight available memory.  Eventually, that would lead to
a deadlock while the swap deamon swaps anonymous pages.  It was caused by
commit e0f3c3f78da29b ("mm/mmu_notifier: init notifier if necessary").

  =================================
  [ INFO: inconsistent lock state ]
  3.7.0-rc1+ #518 Not tainted
  ---------------------------------
  inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-W} usage.
  kswapd0/35 [HC0[0]:SC0[0]:HE1:SE1] takes:
   (&mapping->i_mmap_mutex){+.+.?.}, at: page_referenced+0x9c/0x2e0
  {RECLAIM_FS-ON-W} state was registered at:
     mark_held_locks+0x86/0x150
     lockdep_trace_alloc+0x67/0xc0
     kmem_cache_alloc_trace+0x33/0x230
     do_mmu_notifier_register+0x87/0x180
     mmu_notifier_register+0x13/0x20
     kvm_dev_ioctl+0x428/0x510
     do_vfs_ioctl+0x98/0x570
     sys_ioctl+0x91/0xb0
     system_call_fastpath+0x16/0x1b
  irq event stamp: 825
  hardirqs last  enabled at (825): _raw_spin_unlock_irq+0x30/0x60
  hardirqs last disabled at (824): _raw_spin_lock_irq+0x19/0x80
  softirqs last  enabled at (0): copy_process+0x630/0x17c0
  softirqs last disabled at (0): (null)
  ...

Simply back out the above commit, which was a small performance
optimization.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Reported-by: Andrea Righi <andrea@betterlinux.com>
Tested-by: Andrea Righi <andrea@betterlinux.com>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Sagi Grimberg <sagig@mellanox.co.il>
Cc: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmu_notifier.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'mm')

diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 479a1e751a7..8a5ac8c686b 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -196,28 +196,28 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
 	BUG_ON(atomic_read(&mm->mm_users) <= 0);
 
 	/*
-	* Verify that mmu_notifier_init() already run and the global srcu is
-	* initialized.
-	*/
+	 * Verify that mmu_notifier_init() already run and the global srcu is
+	 * initialized.
+	 */
 	BUG_ON(!srcu.per_cpu_ref);
 
+	ret = -ENOMEM;
+	mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
+	if (unlikely(!mmu_notifier_mm))
+		goto out;
+
 	if (take_mmap_sem)
 		down_write(&mm->mmap_sem);
 	ret = mm_take_all_locks(mm);
 	if (unlikely(ret))
-		goto out;
+		goto out_clean;
 
 	if (!mm_has_notifiers(mm)) {
-		mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm),
-					GFP_KERNEL);
-		if (unlikely(!mmu_notifier_mm)) {
-			ret = -ENOMEM;
-			goto out_of_mem;
-		}
 		INIT_HLIST_HEAD(&mmu_notifier_mm->list);
 		spin_lock_init(&mmu_notifier_mm->lock);
 
 		mm->mmu_notifier_mm = mmu_notifier_mm;
+		mmu_notifier_mm = NULL;
 	}
 	atomic_inc(&mm->mm_count);
 
@@ -233,12 +233,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
 	hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
 	spin_unlock(&mm->mmu_notifier_mm->lock);
 
-out_of_mem:
 	mm_drop_all_locks(mm);
-out:
+out_clean:
 	if (take_mmap_sem)
 		up_write(&mm->mmap_sem);
-
+	kfree(mmu_notifier_mm);
+out:
 	BUG_ON(atomic_read(&mm->mm_users) <= 0);
 	return ret;
 }
-- 
cgit v1.2.3


From 6b187d0260b6cd1d0904309f32659b7ed5948af8 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 25 Oct 2012 13:38:08 -0700
Subject: mm, numa: avoid setting zone_reclaim_mode unless a node is
 sufficiently distant

Commit 957f822a0ab9 ("mm, numa: reclaim from all nodes within reclaim
distance") caused zone_reclaim_mode to be set for all systems where two
nodes are within RECLAIM_DISTANCE of each other.  This is the opposite
of what we actually want: zone_reclaim_mode should be set if two nodes
are sufficiently distant.

Signed-off-by: David Rientjes <rientjes@google.com>
Reported-by: Julian Wollrath <jwollrath@web.de>
Tested-by: Julian Wollrath <jwollrath@web.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Patrik Kullman <patrik.kullman@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b0012ab372a..5b74de6702e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1809,10 +1809,10 @@ static void __paginginit init_zone_allows_reclaim(int nid)
 	int i;
 
 	for_each_online_node(i)
-		if (node_distance(nid, i) <= RECLAIM_DISTANCE) {
+		if (node_distance(nid, i) <= RECLAIM_DISTANCE)
 			node_set(i, NODE_DATA(nid)->reclaim_nodes);
+		else
 			zone_reclaim_mode = 1;
-		}
 }
 
 #else	/* CONFIG_NUMA */
-- 
cgit v1.2.3