From 68a2d20b79b105f02dcbc52c211d7e62f98996b7 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Mon, 29 Apr 2013 15:05:42 -0700 Subject: drivers/video: add Hyper-V Synthetic Video Frame Buffer Driver This is the driver for the Hyper-V Synthetic Video, which supports screen resolution up to Full HD 1920x1080 on Windows Server 2012 host, and 1600x1200 on Windows Server 2008 R2 or earlier. It also solves the double mouse cursor issue of the emulated video mode. Signed-off-by: Haiyang Zhang Reviewed-by: K. Y. Srinivasan Cc: Greg Kroah-Hartman Cc: Olaf Hering Cc: Geert Uytterhoeven Cc: Florian Tobias Schandinat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hyperv.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 95d0850584d..c2559847d7e 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1317,6 +1317,17 @@ void vmbus_driver_unregister(struct hv_driver *hv_driver); 0x29, 0x2e, 0xfa, 0x35, 0x23, 0xea, 0x36, 0x42, \ 0x96, 0xae, 0x3a, 0x6e, 0xba, 0xcb, 0xa4, 0x40 \ } +/* + * Synthetic Video GUID + * {DA0A7802-E377-4aac-8E77-0558EB1073F8} + */ +#define HV_SYNTHVID_GUID \ + .guid = { \ + 0x02, 0x78, 0x0a, 0xda, 0x77, 0xe3, 0xac, 0x4a, \ + 0x8e, 0x77, 0x05, 0x58, 0xeb, 0x10, 0x73, 0xf8 \ + } + + /* * Common header for Hyper-V ICs */ -- cgit v1.2.3 From 2c2fea11957c8c45bf873d9bcd7cd9a342654e79 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 29 Apr 2013 15:06:06 -0700 Subject: debug_locks.h: make warning more verbose The WARN_ON(1) in DEBUG_LOCKS_WARN_ON is surprisingly awkward to track down when it's hit, as it's usually buried in macros, causing multiple instances to land on the same line number. This patch makes it more useful by switching to: WARN(1, "DEBUG_LOCKS_WARN_ON(%s)", #c); so that the particular DEBUG_LOCKS_WARN_ON is more easily identified and grep'd for. For example: WARNING: at kernel/mutex.c:198 _mutex_lock_nested+0x31c/0x380() DEBUG_LOCKS_WARN_ON(l->magic != l) Signed-off-by: James Hogan Cc: Paul Gortmaker Cc: David Howells Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/debug_locks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 3bd46f76675..21ca773f77b 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -27,7 +27,7 @@ extern int debug_locks_off(void); \ if (!oops_in_progress && unlikely(c)) { \ if (debug_locks_off() && !debug_locks_silent) \ - WARN_ON(1); \ + WARN(1, "DEBUG_LOCKS_WARN_ON(%s)", #c); \ __ret = 1; \ } \ __ret; \ -- cgit v1.2.3 From fe0bfaaff84429a35e4447d4ccd646aecf5999fb Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Mon, 29 Apr 2013 15:06:10 -0700 Subject: mm: trace filemap add and del Use the events API to trace filemap loading and unloading of file pieces into the page cache. This patch aims at tracing the eviction reload cycle of executable and shared libraries pages in a memory constrained environment. The typical usage is to spot a specific device and inode (for example /lib/libc.so) to see the eviction cycles, and find out if frequently used code is rather spread across many pages (bad) or coalesced (good).
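For orientation, each DEFINE_EVENT in the header added below generates a trace_<event>() function. A minimal sketch of how the mm/ side of this patch hooks the two events in mm/filemap.c (call-site placement here is approximate, not the literal diff):

#include <trace/events/filemap.h> /* one .c file defines CREATE_TRACE_POINTS first */

/* Deletion path: fire the event while page->mapping is still set,
 * because the event's TP_fast_assign below dereferences it.
 */
void __delete_from_page_cache(struct page *page)
{
        trace_mm_filemap_delete_from_page_cache(page);
        /* ... radix-tree removal and accounting ... */
}

/* Insertion path: fire the event once the page is in the page cache. */
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
                             pgoff_t offset, gfp_t gfp_mask)
{
        /* ... radix-tree insertion ... */
        trace_mm_filemap_add_to_page_cache(page);
        return 0;
}

The dev/ino fields recorded by the event class are what let a trace consumer filter for a single file, such as the libc mapping mentioned above.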
Signed-off-by: Robert Jarzmik Cc: Dave Chinner Cc: Hugh Dickins Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/filemap.h | 58 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 include/trace/events/filemap.h (limited to 'include') diff --git a/include/trace/events/filemap.h b/include/trace/events/filemap.h new file mode 100644 index 00000000000..0421f49a20f --- /dev/null +++ b/include/trace/events/filemap.h @@ -0,0 +1,58 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM filemap + +#if !defined(_TRACE_FILEMAP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FILEMAP_H + +#include +#include +#include +#include +#include +#include + +DECLARE_EVENT_CLASS(mm_filemap_op_page_cache, + + TP_PROTO(struct page *page), + + TP_ARGS(page), + + TP_STRUCT__entry( + __field(struct page *, page) + __field(unsigned long, i_ino) + __field(unsigned long, index) + __field(dev_t, s_dev) + ), + + TP_fast_assign( + __entry->page = page; + __entry->i_ino = page->mapping->host->i_ino; + __entry->index = page->index; + if (page->mapping->host->i_sb) + __entry->s_dev = page->mapping->host->i_sb->s_dev; + else + __entry->s_dev = page->mapping->host->i_rdev; + ), + + TP_printk("dev %d:%d ino %lx page=%p pfn=%lu ofs=%lu", + MAJOR(__entry->s_dev), MINOR(__entry->s_dev), + __entry->i_ino, + __entry->page, + page_to_pfn(__entry->page), + __entry->index << PAGE_SHIFT) +); + +DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_delete_from_page_cache, + TP_PROTO(struct page *page), + TP_ARGS(page) + ); + +DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_add_to_page_cache, + TP_PROTO(struct page *page), + TP_ARGS(page) + ); + +#endif /* _TRACE_FILEMAP_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 4b59e6c4730978679b414a8da61514a2518da512 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 29 Apr 2013 15:06:11 -0700 Subject: mm, show_mem: suppress page counts in non-blockable contexts On large systems with a lot of memory, walking all RAM to determine page types may take a half second or even more. In non-blockable contexts, the page allocator will emit a page allocation failure warning unless __GFP_NOWARN is specified. In such contexts, irqs are typically disabled and such a lengthy delay may even result in NMI watchdog timeouts. To fix this, suppress the page walk in such contexts when printing the page allocation failure warning. Signed-off-by: David Rientjes Cc: Mel Gorman Acked-by: Michal Hocko Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index e2091b88d24..f3c7b1f9d1d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -899,7 +899,8 @@ extern void pagefault_out_of_memory(void); * Flags passed to show_mem() and show_free_areas() to suppress output in * various contexts. 
*/ -#define SHOW_MEM_FILTER_NODES (0x0001u) /* filter disallowed nodes */ +#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ +#define SHOW_MEM_FILTER_PAGE_COUNT (0x0002u) /* page type count */ extern void show_free_areas(unsigned int flags); extern bool skip_free_areas_node(unsigned int flags, int nid); -- cgit v1.2.3 From 250297edf83292c831fbf4504df54953c2aacfe4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 29 Apr 2013 15:06:12 -0700 Subject: mm/shmem.c: remove an ifdef Create a CONFIG_MMU=y stub for ramfs_nommu_expand_for_mapping() in the usual fashion. Cc: Al Viro Cc: Wolfram Sang Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ramfs.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h index 5bf5500db83..69e37c2d1ea 100644 --- a/include/linux/ramfs.h +++ b/include/linux/ramfs.h @@ -6,7 +6,13 @@ struct inode *ramfs_get_inode(struct super_block *sb, const struct inode *dir, extern struct dentry *ramfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); -#ifndef CONFIG_MMU +#ifdef CONFIG_MMU +static inline int +ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) +{ + return 0; +} +#else extern int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize); extern unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, -- cgit v1.2.3 From 8375ad98cc1defc36adf4a77d9ea1e71db51a371 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Apr 2013 15:06:13 -0700 Subject: vm: adjust ifdef for TINY_RCU There is an ifdef in page_cache_get_speculative() that checks for !SMP and TREE_RCU, which has been an impossible combination since the advent of TINY_RCU. The ifdef enables a fastpath that is valid when preemption is disabled by rcu_read_lock() in UP systems, which is the case when TINY_RCU is enabled. This commit therefore adjusts the ifdef to generate the fastpath when TINY_RCU is enabled. Signed-off-by: Paul E. McKenney Reported-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0e38e13eb24..e3dea75a078 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -149,7 +149,7 @@ static inline int page_cache_get_speculative(struct page *page) { VM_BUG_ON(in_interrupt()); -#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) +#ifdef CONFIG_TINY_RCU # ifdef CONFIG_PREEMPT_COUNT VM_BUG_ON(!in_atomic()); # endif -- cgit v1.2.3 From 69afade72a3e13e96a065f757891d384d466123f Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 29 Apr 2013 15:06:21 -0700 Subject: mm: introduce common help functions to deal with reserved/managed pages The original goal of this patchset is to fix the bug reported by https://bugzilla.kernel.org/show_bug.cgi?id=53501 Now it has also been expanded to reduce common code used by memory initialization. This is the first part, which applies to v3.9-rc1. It introduces the following common helper functions to simplify free_initmem() and free_initrd_mem() on different architectures: adjust_managed_page_count(): will be used to adjust totalram_pages, totalhigh_pages, zone->managed_pages when reserving/unreserving a page.
__free_reserved_page(): free a reserved page into the buddy system without adjusting page statistics info free_reserved_page(): free a reserved page into the buddy system and adjust page statistics info mark_page_reserved(): mark a page as reserved and adjust page statistics info free_reserved_area(): free a contiguous range of pages by calling free_reserved_page() free_initmem_default(): default method to free __init pages. We have only tested this patchset on x86 platforms, and have done basic compilation tests using cross-compilers from ftp.kernel.org. That means some code may not pass compilation on some architectures. So any help to test this patchset is welcome! There are several other parts still under development: Part2: introduce free_highmem_page() to simplify freeing highmem pages Part3: refine code to manage totalram_pages, totalhigh_pages and zone->managed_pages Part4: introduce helper functions to simplify mem_init() and remove the global variable num_physpages. This patch: Code to deal with reserved/managed pages is duplicated by many architectures, so introduce common helper functions to reduce the duplicated code. These common helper functions will also be used to concentrate the code that modifies totalram_pages and zone->managed_pages, which makes the code much clearer. Signed-off-by: Jiang Liu Acked-by: Geert Uytterhoeven Cc: "H. Peter Anvin" Cc: "James E.J. Bottomley" Cc: Anatolij Gustschin Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Liqin Cc: Chris Zankel Cc: David Howells Cc: David S. Miller Cc: Eric Biederman Cc: Fenghua Yu Cc: Guan Xuetao Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Heiko Carstens Cc: Helge Deller Cc: Hirokazu Takata Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Hogan Cc: Jeff Dike Cc: Jiang Liu Cc: Jiang Liu Cc: Jonas Bonn Cc: Koichi Yasutake Cc: Lennox Wu Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Turner Cc: Max Filippov Cc: Michal Simek Cc: Mikael Starvik Cc: Mike Frysinger Cc: Paul Mackerras Cc: Paul Mundt Cc: Ralf Baechle Cc: Richard Henderson Cc: Russell King Cc: Sam Ravnborg Cc: Thomas Gleixner Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index f3c7b1f9d1d..d064c73c925 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1295,6 +1295,54 @@ extern void free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void free_initmem(void); +/* + * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK) + * into the buddy system. The freed pages will be poisoned with pattern + * "poison" if it's non-zero. + * Return pages freed into the buddy system. + */ +extern unsigned long free_reserved_area(unsigned long start, unsigned long end, + int poison, char *s); + +static inline void adjust_managed_page_count(struct page *page, long count) +{ + totalram_pages += count; +} + +/* Free the reserved page into the buddy system, so it gets managed.
*/ +static inline void __free_reserved_page(struct page *page) +{ + ClearPageReserved(page); + init_page_count(page); + __free_page(page); +} + +static inline void free_reserved_page(struct page *page) +{ + __free_reserved_page(page); + adjust_managed_page_count(page, 1); +} + +static inline void mark_page_reserved(struct page *page) +{ + SetPageReserved(page); + adjust_managed_page_count(page, -1); +} + +/* + * Default method to free all the __init memory into the buddy system. + * The freed pages will be poisoned with pattern "poison" if it is + * non-zero. Return pages freed into the buddy system. + */ +static inline unsigned long free_initmem_default(int poison) +{ + extern char __init_begin[], __init_end[]; + + return free_reserved_area(PAGE_ALIGN((unsigned long)&__init_begin) , + ((unsigned long)&__init_end) & PAGE_MASK, + poison, "unused kernel"); +} + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its -- cgit v1.2.3 From cfa11e08ed39eb28a9eff9a907b20913020c69b5 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 29 Apr 2013 15:07:00 -0700 Subject: mm: introduce free_highmem_page() helper to free highmem pages into buddy system The original goal of this patchset is to fix the bug reported by https://bugzilla.kernel.org/show_bug.cgi?id=53501 Now it has also been expanded to reduce common code used by memory initialization. This is the second part, which applies to the previous part at: http://marc.info/?l=linux-mm&m=136289696323825&w=2 It introduces a helper function free_highmem_page() to free highmem pages into the buddy system when initializing mm subsystem. Introduction of free_highmem_page() is one step forward to clean up accesses and modifications of totalhigh_pages, totalram_pages and zone->managed_pages etc. I hope we could remove all references to totalhigh_pages from the arch/ subdirectory. We have only tested this patchset on x86 platforms, and have done basic compilation tests using cross-compilers from ftp.kernel.org. That means some code may not pass compilation on some architectures. So any help to test this patchset is welcome! There are several other parts still under development: Part3: refine code to manage totalram_pages, totalhigh_pages and zone->managed_pages Part4: introduce helper functions to simplify mem_init() and remove the global variable num_physpages. This patch: Introduce helper function free_highmem_page(), which will be used by architectures with HIGHMEM enabled to free highmem pages into the buddy system. Signed-off-by: Jiang Liu Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "Suzuki K.
Poulose" Cc: Alexander Graf Cc: Arnd Bergmann Cc: Attilio Rao Cc: Benjamin Herrenschmidt Cc: Cong Wang Cc: David Daney Cc: David Howells Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: James Hogan Cc: Jeff Dike Cc: Jiang Liu Cc: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Konstantin Khlebnikov Cc: Linus Walleij Cc: Marek Szyprowski Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Michal Simek Cc: Michel Lespinasse Cc: Minchan Kim Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Weinberger Cc: Rik van Riel Cc: Russell King Cc: Sam Ravnborg Cc: Stephen Boyd Cc: Thomas Gleixner Cc: Yinghai Lu Reviewed-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index d064c73c925..43b70d5f820 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1303,6 +1303,13 @@ extern void free_initmem(void); */ extern unsigned long free_reserved_area(unsigned long start, unsigned long end, int poison, char *s); +#ifdef CONFIG_HIGHMEM +/* + * Free a highmem page into the buddy system, adjusting totalhigh_pages + * and totalram_pages. + */ +extern void free_highmem_page(struct page *page); +#endif static inline void adjust_managed_page_count(struct page *page, long count) { -- cgit v1.2.3 From 6d2488f64a240191f0733c1f32d73607916b01b7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 29 Apr 2013 15:07:21 -0700 Subject: cgroup: remove css_get_next Now that we have generic and well ordered cgroup tree walkers there is no need to keep css_get_next in the place. Signed-off-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Johannes Weiner Cc: Ying Han Cc: Tejun Heo Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 900af5964f5..470073bf93d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -687,13 +687,6 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); -/* - * Get a cgroup whose id is greater than or equal to id under tree of root. - * Returning a cgroup_subsys_state or NULL. - */ -struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, - struct cgroup_subsys_state *root, int *foundid); - /* Returns true if root is ancestor of cg */ bool css_is_ancestor(struct cgroup_subsys_state *cg, const struct cgroup_subsys_state *root); -- cgit v1.2.3 From 146732ce104ddfed3d4d82722c0b336074016b92 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 29 Apr 2013 15:07:22 -0700 Subject: fs: don't compile in drop_caches.c when CONFIG_SYSCTL=n drop_caches.c provides code only invokable via sysctl, so don't compile it in when CONFIG_SYSCTL=n. 
Signed-off-by: Josh Triplett Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 43b70d5f820..98a44376e4f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1731,8 +1731,12 @@ int in_gate_area_no_mm(unsigned long addr); #define in_gate_area(mm, addr) ({(void)mm; in_gate_area_no_mm(addr);}) #endif /* __HAVE_ARCH_GATE_AREA */ +#ifdef CONFIG_SYSCTL +extern int sysctl_drop_caches; int drop_caches_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +#endif + unsigned long shrink_slab(struct shrink_control *shrink, unsigned long nr_pages_scanned, unsigned long lru_pages); -- cgit v1.2.3 From 106c992a5ebef28193cf5958e49ceff5e4aebb04 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 29 Apr 2013 15:07:23 -0700 Subject: mm/hugetlb: add more arch-defined huge_pte functions Commit abf09bed3cce ("s390/mm: implement software dirty bits") introduced another difference in the pte layout vs. the pmd layout on s390, thoroughly breaking the s390 support for hugetlbfs. This requires replacing some more pte_xxx functions in mm/hugetlbfs.c with a huge_pte_xxx version. This patch introduces those huge_pte_xxx functions and their generic implementation in asm-generic/hugetlb.h, which will now be included on all architectures supporting hugetlbfs apart from s390. This change will be a no-op for those architectures. [akpm@linux-foundation.org: fix warning] Signed-off-by: Gerald Schaefer Cc: Mel Gorman Cc: Hugh Dickins Cc: Hillf Danton Acked-by: Michal Hocko [for !s390 parts] Cc: Tony Luck Cc: Fenghua Yu Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/hugetlb.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 include/asm-generic/hugetlb.h (limited to 'include') diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h new file mode 100644 index 00000000000..d06079c774a --- /dev/null +++ b/include/asm-generic/hugetlb.h @@ -0,0 +1,40 @@ +#ifndef _ASM_GENERIC_HUGETLB_H +#define _ASM_GENERIC_HUGETLB_H + +static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot) +{ + return mk_pte(page, pgprot); +} + +static inline int huge_pte_write(pte_t pte) +{ + return pte_write(pte); +} + +static inline int huge_pte_dirty(pte_t pte) +{ + return pte_dirty(pte); +} + +static inline pte_t huge_pte_mkwrite(pte_t pte) +{ + return pte_mkwrite(pte); +} + +static inline pte_t huge_pte_mkdirty(pte_t pte) +{ + return pte_mkdirty(pte); +} + +static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) +{ + return pte_modify(pte, newprot); +} + +static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pte_clear(mm, addr, ptep); +} + +#endif /* _ASM_GENERIC_HUGETLB_H */ -- cgit v1.2.3 From 7136851117744f1d291bed6d307432699d405109 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 29 Apr 2013 15:07:25 -0700 Subject: mm: make snapshotting pages for stable writes a per-bio operation Walking a bio's page mappings has proved problematic, so create a new bio flag to indicate that a bio's data needs to be snapshotted in order to guarantee stable pages during writeback. 
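As a sketch of the caller side (the _submit_bh() variant and the flag appear in the hunks below; the exact ext3/jbd call sites are in the fs/ part of this patch):

/* Illustrative: a journal write that needs stable pages passes the new
 * flag down so the block layer can snapshot the data before issuing I/O.
 */
static int submit_journal_bh(int write_op, struct buffer_head *bh)
{
        return _submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE);
}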
Next, for the one user (ext3/jbd) of snapshotting, hook all the places where writes can be initiated without PG_writeback set, and set BIO_SNAP_STABLE there. We must also flag journal "metadata" bios for stable writeout, since file data can be written through the journal. Finally, the MS_SNAP_STABLE mount flag (only used by ext3) is now superfluous, so get rid of it. [akpm@linux-foundation.org: rename _submit_bh()'s `flags' to `bio_flags', delobotomize the _submit_bh declaration] [akpm@linux-foundation.org: teeny cleanup] Signed-off-by: Darrick J. Wong Cc: Andy Lutomirski Cc: Adrian Hunter Cc: Artem Bityutskiy Reviewed-by: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/blk_types.h | 3 ++- include/linux/buffer_head.h | 1 + include/uapi/linux/fs.h | 1 - 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index cdf11191e64..22990cf4439 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -111,12 +111,13 @@ struct bio { #define BIO_FS_INTEGRITY 9 /* fs owns integrity data, not block layer */ #define BIO_QUIET 10 /* Make BIO Quiet */ #define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped */ +#define BIO_SNAP_STABLE 12 /* bio data must be snapshotted during write */ /* * Flags starting here get preserved by bio_reset() - this includes * BIO_POOL_IDX() */ -#define BIO_RESET_BITS 12 +#define BIO_RESET_BITS 13 #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 5afc4f94d11..4c16c4a88d4 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -181,6 +181,7 @@ void ll_rw_block(int, int, struct buffer_head * bh[]); int sync_dirty_buffer(struct buffer_head *bh); int __sync_dirty_buffer(struct buffer_head *bh, int rw); void write_dirty_buffer(struct buffer_head *bh, int rw); +int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags); int submit_bh(int, struct buffer_head *); void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index c7fc1e6517c..a4ed56cf0ea 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -88,7 +88,6 @@ struct inodes_stat_t { #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ /* These sb flags are internal to the kernel */ -#define MS_SNAP_STABLE (1<<27) /* Snapshot pages during writeback, if needed */ #define MS_NOSEC (1<<28) #define MS_BORN (1<<29) #define MS_ACTIVE (1<<30) -- cgit v1.2.3 From db3808c1bac64740b9d830fda92801ae65f1c851 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 29 Apr 2013 15:07:28 -0700 Subject: mm, vmalloc: move get_vmalloc_info() to vmalloc.c Now get_vmalloc_info() is in fs/proc/mmu.c. There is no reason that this code must be here and it's implementation needs vmlist_lock and it iterate a vmlist which may be internal data structure for vmalloc. It is preferable that vmlist_lock and vmlist is only used in vmalloc.c for maintainability. So move the code to vmalloc.c Signed-off-by: Joonsoo Kim Signed-off-by: Joonsoo Kim Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Atsushi Kumagai Cc: Chris Metcalf Cc: Dave Anderson Cc: Eric Biederman Cc: Guan Xuetao Cc: Ingo Molnar Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 6071e911c7f..698b1e50d3a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -158,4 +158,22 @@ pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) # endif #endif +struct vmalloc_info { + unsigned long used; + unsigned long largest_chunk; +}; + +#ifdef CONFIG_MMU +#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) +extern void get_vmalloc_info(struct vmalloc_info *vmi); +#else + +#define VMALLOC_TOTAL 0UL +#define get_vmalloc_info(vmi) \ +do { \ + (vmi)->used = 0; \ + (vmi)->largest_chunk = 0; \ +} while (0) +#endif + #endif /* _LINUX_VMALLOC_H */ -- cgit v1.2.3 From f1c4069e1dc128dc8a851174cba2e273652e9216 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 29 Apr 2013 15:07:37 -0700 Subject: mm, vmalloc: export vmap_area_list, instead of vmlist Although our intention is to unexport internal structure entirely, but there is one exception for kexec. kexec dumps address of vmlist and makedumpfile uses this information. We are about to remove vmlist, then another way to retrieve information of vmalloc layer is needed for makedumpfile. For this purpose, we export vmap_area_list, instead of vmlist. Signed-off-by: Joonsoo Kim Signed-off-by: Joonsoo Kim Cc: Eric Biederman Cc: Dave Anderson Cc: Vivek Goyal Cc: Atsushi Kumagai Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Chris Metcalf Cc: Guan Xuetao Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 698b1e50d3a..8a25f9081ed 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -130,8 +130,7 @@ extern long vwrite(char *buf, char *addr, unsigned long count); /* * Internals. Dont't use.. */ -extern rwlock_t vmlist_lock; -extern struct vm_struct *vmlist; +extern struct list_head vmap_area_list; extern __init void vm_area_add_early(struct vm_struct *vm); extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); -- cgit v1.2.3 From 13ba3fcbbe31068b1ee7c39a0b58ecbed03c4d72 Mon Sep 17 00:00:00 2001 From: Atsushi Kumagai Date: Mon, 29 Apr 2013 15:07:40 -0700 Subject: kexec, vmalloc: export additional vmalloc layer information Now, vmap_area_list is exported as VMCOREINFO for makedumpfile to get the start address of vmalloc region (vmalloc_start). The address which contains vmalloc_start value is represented as below: vmap_area_list.next - OFFSET(vmap_area.list) + OFFSET(vmap_area.va_start) However, both OFFSET(vmap_area.va_start) and OFFSET(vmap_area.list) aren't exported as VMCOREINFO. So this patch exports them externally with small cleanup. [akpm@linux-foundation.org: vmalloc.h should include list.h for list_head] Signed-off-by: Atsushi Kumagai Cc: Joonsoo Kim Cc: Joonsoo Kim Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Atsushi Kumagai Cc: Chris Metcalf Cc: Dave Anderson Cc: Eric Biederman Cc: Guan Xuetao Cc: Ingo Molnar Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 8a25f9081ed..7d5773a99f2 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -3,7 +3,9 @@ #include #include +#include #include /* pgprot_t */ +#include struct vm_area_struct; /* vma defining user mapping in mm_types.h */ @@ -35,6 +37,17 @@ struct vm_struct { const void *caller; }; +struct vmap_area { + unsigned long va_start; + unsigned long va_end; + unsigned long flags; + struct rb_node rb_node; /* address sorted rbtree */ + struct list_head list; /* address sorted list */ + struct list_head purge_list; /* "lazy purge" list */ + struct vm_struct *vm; + struct rcu_head rcu_head; +}; + /* * Highlevel APIs for driver use */ -- cgit v1.2.3 From 6ee8630e02be6dd89926ca0fbc21af68b23dc087 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 29 Apr 2013 15:07:44 -0700 Subject: mm: allow arch code to control the user page table ceiling On architectures where a pgd entry may be shared between user and kernel (e.g. ARM+LPAE), freeing page tables needs a ceiling other than 0. This patch introduces a generic USER_PGTABLES_CEILING that arch code can override. It is the responsibility of the arch code setting the ceiling to ensure the complete freeing of the page tables (usually in pgd_free()). [catalin.marinas@arm.com: commit log; shift_arg_pages(), asm-generic/pgtables.h changes] Signed-off-by: Hugh Dickins Signed-off-by: Catalin Marinas Cc: Russell King Cc: [3.3+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index bfd87685fc1..a59ff51b016 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -7,6 +7,16 @@ #include #include +/* + * On almost all architectures and configurations, 0 can be used as the + * upper ceiling to free_pgtables(): on many architectures it has the same + * effect as using TASK_SIZE. However, there is one configuration which + * must impose a more careful limit, to avoid freeing kernel pgtables. + */ +#ifndef USER_PGTABLES_CEILING +#define USER_PGTABLES_CEILING 0UL +#endif + #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, -- cgit v1.2.3 From 949f7ec5760b021da3cccc1eaeb0671270e4238f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 29 Apr 2013 15:07:48 -0700 Subject: mm, hugetlb: include hugepages in meminfo Particularly in oom conditions, it's troublesome that hugetlb memory is not displayed. All other meminfo that is emitted will not add up to what is expected, and there is no artifact left in the kernel log to show that a potentially significant amount of memory is actually allocated as hugepages which are not available to be reclaimed. Booting with hugepages=8192 on the command line, this memory is now shown in oom conditions. 
For example, with echo m > /proc/sysrq-trigger: Node 0 hugepages_total=2048 hugepages_free=2048 hugepages_surp=0 hugepages_size=2048kB Node 1 hugepages_total=2048 hugepages_free=2048 hugepages_surp=0 hugepages_size=2048kB Node 2 hugepages_total=2048 hugepages_free=2048 hugepages_surp=0 hugepages_size=2048kB Node 3 hugepages_total=2048 hugepages_free=2048 hugepages_surp=0 hugepages_size=2048kB [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: David Rientjes Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 16e4e9a643f..3a62df310f2 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -58,6 +58,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, int hugetlb_prefault(struct address_space *, struct vm_area_struct *); void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(int, char *); +void hugetlb_show_meminfo(void); unsigned long hugetlb_total_pages(void); int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); @@ -114,6 +115,9 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) { } #define hugetlb_report_node_meminfo(n, buf) 0 +static inline void hugetlb_show_meminfo(void) +{ +} #define follow_huge_pmd(mm, addr, pmd, write) NULL #define follow_huge_pud(mm, addr, pud, write) NULL #define prepare_hugepage_range(file, addr, len) (-EINVAL) -- cgit v1.2.3 From 0aad818b2de455f1bfd7ef87c28cdbbaaed9a699 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Apr 2013 15:07:50 -0700 Subject: sparse-vmemmap: specify vmemmap population range in bytes The sparse code, when asking the architecture to populate the vmemmap, specifies the section range as a starting page and a number of pages. This is an awkward interface, because none of the arch-specific code actually thinks of the range in terms of 'struct page' units and always translates it to bytes first. In addition, later patches mix huge page and regular page backing for the vmemmap. For this, they need to call vmemmap_populate_basepages() on sub-section ranges with PAGE_SIZE and PMD_SIZE in mind. But these are not necessarily multiples of the 'struct page' size and so this unit is too coarse. Just translate the section range into bytes once in the generic sparse code, then pass byte ranges down the stack. Signed-off-by: Johannes Weiner Cc: Ben Hutchings Cc: Bernhard Schmidt Cc: Johannes Weiner Cc: Russell King Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Benjamin Herrenschmidt Cc: "Luck, Tony" Cc: Heiko Carstens Acked-by: David S. Miller Tested-by: David S. 
Miller Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 98a44376e4f..6d7266842ab 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1764,12 +1764,12 @@ pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); void *vmemmap_alloc_block(unsigned long size, int node); void *vmemmap_alloc_block_buf(unsigned long size, int node); void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); -int vmemmap_populate_basepages(struct page *start_page, - unsigned long pages, int node); -int vmemmap_populate(struct page *start_page, unsigned long pages, int node); +int vmemmap_populate_basepages(unsigned long start, unsigned long end, + int node); +int vmemmap_populate(unsigned long start, unsigned long end, int node); void vmemmap_populate_print_last(void); #ifdef CONFIG_MEMORY_HOTPLUG -void vmemmap_free(struct page *memmap, unsigned long nr_pages); +void vmemmap_free(unsigned long start, unsigned long end); #endif void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, unsigned long size); -- cgit v1.2.3 From f9872caf07c1c774034b8bddde7d4a3a7f4a6484 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Mon, 29 Apr 2013 15:08:01 -0700 Subject: page_alloc: make setup_nr_node_ids() usable for arch init code powerpc and x86 were opencoding copies of setup_nr_node_ids(), which page_alloc provides but makes static. Make it avaliable to the archs in linux/mm.h. Signed-off-by: Cody P Schafer Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6d7266842ab..7aa11a6736e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1816,5 +1816,11 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool page_is_guard(struct page *page) { return false; } #endif /* CONFIG_DEBUG_PAGEALLOC */ +#if MAX_NUMNODES > 1 +void __init setup_nr_node_ids(void); +#else +static inline void setup_nr_node_ids(void) {} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ -- cgit v1.2.3 From f02c696800886382198df897b30bb796b46a8dae Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 29 Apr 2013 15:08:04 -0700 Subject: include/linux/memory.h: implement register_hotmemory_notifier() When CONFIG_MEMORY_HOTPLUG=n, we don't want the memory-hotplug notifier handlers to be included in the .o files, for space reasons. The existing hotplug_memory_notifier() tries to handle this but testing with gcc-4.4.4 shows that it doesn't work - the hotplug functions are still present in the .o files. So implement a new register_hotmemory_notifier() which is a copy of register_hotcpu_notifier(), and which actually works as desired. hotplug_memory_notifier() and register_memory_notifier() callsites should be converted to use this new register_hotmemory_notifier(). While we're there, let's repair the existing hotplug_memory_notifier(): it simply stomps on the register_memory_notifier() return value, so well-behaved code cannot check for errors. 
Apparently non of the existing callers were well-behaved :( Cc: Andrew Shewmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 14 ++++++++++---- include/linux/notifier.h | 5 ++++- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/memory.h b/include/linux/memory.h index 45e93b46887..0ff6598ee62 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -18,6 +18,7 @@ #include #include #include +#include #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) @@ -127,13 +128,18 @@ enum mem_add_context { BOOT, HOTPLUG }; #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ #ifdef CONFIG_MEMORY_HOTPLUG -#define hotplug_memory_notifier(fn, pri) { \ +#define hotplug_memory_notifier(fn, pri) ({ \ static __meminitdata struct notifier_block fn##_mem_nb =\ - { .notifier_call = fn, .priority = pri }; \ + { .notifier_call = fn, .priority = pri };\ register_memory_notifier(&fn##_mem_nb); \ -} +}) +#define register_hotmemory_notifier(nb) register_memory_notifier(nb) +#define unregister_hotmemory_notifier(nb) unregister_memory_notifier(nb) #else -#define hotplug_memory_notifier(fn, pri) do { } while (0) +#define hotplug_memory_notifier(fn, pri) (0) +/* These aren't inline functions due to a GCC bug. */ +#define register_hotmemory_notifier(nb) ({ (void)(nb); 0; }) +#define unregister_hotmemory_notifier(nb) ({ (void)(nb); }) #endif /* diff --git a/include/linux/notifier.h b/include/linux/notifier.h index d65746efc95..d14a4c36246 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -47,8 +47,11 @@ * runtime initialization. */ +typedef int (*notifier_fn_t)(struct notifier_block *nb, + unsigned long action, void *data); + struct notifier_block { - int (*notifier_call)(struct notifier_block *, unsigned long, void *); + notifier_fn_t notifier_call; struct notifier_block __rcu *next; int priority; }; -- cgit v1.2.3 From c9b1d0981fcce3d9976d7b7a56e4e0503bc610dd Mon Sep 17 00:00:00 2001 From: Andrew Shewmaker Date: Mon, 29 Apr 2013 15:08:10 -0700 Subject: mm: limit growth of 3% hardcoded other user reserve Add user_reserve_kbytes knob. Limit the growth of the memory reserved for other user processes to min(3% current process size, user_reserve_pages). Only about 8MB is necessary to enable recovery in the default mode, and only a few hundred MB are required even when overcommit is disabled. user_reserve_pages defaults to min(3% free pages, 128MB) I arrived at 128MB by taking the max VSZ of sshd, login, bash, and top ... then adding the RSS of each. This only affects OVERCOMMIT_NEVER mode. Background 1. user reserve __vm_enough_memory reserves a hardcoded 3% of the current process size for other applications when overcommit is disabled. This was done so that a user could recover if they launched a memory hogging process. Without the reserve, a user would easily run into a message such as: bash: fork: Cannot allocate memory 2. admin reserve Additionally, a hardcoded 3% of free memory is reserved for root in both overcommit 'guess' and 'never' modes. This was intended to prevent a scenario where root-cant-log-in and perform recovery operations. Note that this reserve shrinks, and doesn't guarantee a useful reserve. Motivation The two hardcoded memory reserves should be updated to account for current memory sizes. Also, the admin reserve would be more useful if it didn't shrink too much. When the current code was originally written, 1GB was considered "enterprise". 
Now the 3% reserve can grow to multiple GB on large memory systems, and it only needs to be a few hundred MB at most to enable a user or admin to recover a system with an unwanted memory hogging process. I've found that reducing these reserves is especially beneficial for a specific type of application load:

* single application system
* one or few processes (e.g. one per core)
* allocating all available memory
* not initializing every page immediately
* long running

I've run scientific clusters with this sort of load. A long running job sometimes failed many hours (weeks of CPU time) into a calculation. They weren't initializing all of their memory immediately, and they weren't using calloc, so I put systems into overcommit 'never' mode. These clusters run diskless and have no swap. However, with the current reserves, a user wishing to allocate as much memory as possible to one process may be prevented from using, for example, almost 2GB out of 32GB. The effect is less, but still significant when a user starts a job with one process per core. I have repeatedly seen a set of processes requesting the same amount of memory fail because one of them could not allocate the amount of memory a user would expect to be able to allocate. For example, Message Passing Interface (MPI) processes, one per core. And it is similar for other parallel programming frameworks. Changing this reserve code will make the overcommit never mode more useful by allowing applications to allocate nearly all of the available memory. Also, the new admin_reserve_kbytes will be safer than the current behavior since the hardcoded 3% of available memory reserve can shrink to something useless in the case where applications have grabbed all available memory.

Risks

* "bash: fork: Cannot allocate memory" The downside of the first patch--which creates a tunable user reserve that is only used in overcommit 'never' mode--is that an admin can set it so low that a user may not be able to kill their process, even if they already have a shell prompt. Of course, a user can get in the same predicament with the current 3% reserve--they just have to launch processes until 3% becomes negligible.

* root-cant-log-in problem The second patch, adding the tunable rootuser_reserve_pages, allows the admin to shoot themselves in the foot by setting it too small. They can easily get the system into a state where root-can't-log-in. However, the new admin_reserve_kbytes will be safer than the current behavior since the hardcoded 3% of available memory reserve can shrink to something useless in the case where applications have grabbed all available memory.

Alternatives

* Memory cgroups provide a more flexible way to limit application memory. Not everyone wants to set up cgroups or deal with their overhead.
* We could create a fourth overcommit mode which provides smaller reserves. The size of useful reserves may be drastically different depending on whether the system is embedded or enterprise.
* Force users to initialize all of their memory or use calloc. Some users don't want/expect the system to overcommit when they malloc. Overcommit 'never' mode is for this scenario, and it should work well.

The new user and admin reserve tunables are simple to use, with low overhead compared to cgroups. The patches preserve current behavior where 3% of memory is less than 128MB, except that the admin reserve doesn't shrink to an unusable size under pressure. The code allows admins to tune for embedded and enterprise usage.
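A sketch of the resulting accounting in OVERCOMMIT_NEVER mode (modeled on __vm_enough_memory(); variable names and exact placement are approximate):

/* Reserve for other user processes: the smaller of 3% of this
 * process's size (total_vm is in pages; 3% ~= 1/32) and the new
 * user_reserve_kbytes knob, converted from KiB to pages.
 */
if (mm) {
        unsigned long reserve;

        reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
        allowed -= min(mm->total_vm / 32, reserve);
}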
FAQ

* How is the root-cant-login problem addressed? What happens if admin_reserve_pages is set to 0?

Root is free to shoot themselves in the foot by setting admin_reserve_kbytes too low. On x86_64, the minimum useful reserve is:

8MB for overcommit 'guess'
128MB for overcommit 'never'

admin_reserve_pages defaults to min(3% free memory, 8MB). So, anyone switching to 'never' mode needs to adjust admin_reserve_pages.

* How do you calculate a minimum useful reserve?

A user or the admin needs enough memory to login and perform recovery operations, which includes, at a minimum: sshd or login + bash (or some other shell) + top (or ps, kill, etc.) For overcommit 'guess', we can sum resident set sizes (RSS) because we only need enough memory to handle what the recovery programs will typically use. On x86_64 this is about 8MB. For overcommit 'never', we can take the max of their virtual sizes (VSZ) and add the sum of their RSS. We use VSZ instead of RSS because this mode forces us to ensure we can fulfill all of the requested memory allocations--even if the programs only use a fraction of what they ask for. On x86_64 this is about 128MB. When swap is enabled, reserves are useful even when they are as small as 10MB, regardless of overcommit mode. When both swap and overcommit are disabled, then the admin should tune the reserves higher to be absolutely safe. Over 230MB each was safest in my testing.

* What happens if user_reserve_pages is set to 0?

Note, this only affects overcommit 'never' mode. Then a user will be able to allocate all available memory minus admin_reserve_kbytes. However, they will easily see a message such as: "bash: fork: Cannot allocate memory" And they won't be able to recover/kill their application. The admin should be able to recover the system if admin_reserve_kbytes is set appropriately.

* What's the difference between overcommit 'guess' and 'never'?

"Guess" allows an allocation if there are enough free + reclaimable pages. It has a hardcoded 3% of free pages reserved for root. "Never" allows an allocation if there is enough swap + a configurable percentage (default is 50) of physical RAM. It has a hardcoded 3% of free pages reserved for root, like "Guess" mode. It also has a hardcoded 3% of the current process size reserved for additional applications.

* Why is overcommit 'guess' not suitable even when an app eventually writes to every page? It takes free pages, file pages, available swap pages, reclaimable slab pages into consideration. In other words, these are all pages available, then why isn't overcommit suitable?

Because it only looks at the present state of the system. It does not take into account the memory that other applications have malloced, but haven't initialized yet. It overcommits the system.

Test Summary

There was little change in behavior in the default overcommit 'guess' mode with swap enabled before and after the patch. This was expected. Systems run most predictably (i.e. no oom kills) in overcommit 'never' mode with swap enabled. This also allowed the most memory to be allocated to a user application. Overcommit 'guess' mode without swap is a bad idea. It is easy to crash the system. None of the other tested combinations crashed. This matches my experience on the Roadrunner supercomputer. Without the tunable user reserve, a system in overcommit 'never' mode and without swap does not allow the user to recover, although the admin can. With the new tunable reserves, a system in overcommit 'never' mode and without swap can be configured to:
1. maximize user-allocatable memory, running close to the edge of recoverability
2. maximize recoverability, sacrificing allocatable memory to ensure that a user cannot take down a system

Test Description

Fedora 18 VM - 4 x86_64 cores, 5725MB RAM, 4GB Swap

System is booted into multiuser console mode, with unnecessary services turned off. Caches were dropped before each test. Hogs are user memtester processes that attempt to allocate all free memory as reported by /proc/meminfo. In overcommit 'never' mode, memory_ratio=100.

Test Results

3.9.0-rc1-mm1

Overcommit | Swap | Hogs | MB Got/Wanted | OOMs | User Recovery | Admin Recovery
---------- ---- ---- ------------- ---- ------------- --------------
guess      yes    1    5432/5432     no    yes           yes
guess      yes    4    5444/5444     1     yes           yes
guess      no     1    5302/5449     no    yes           yes
guess      no     4    -             crash no            no
never      yes    1    5460/5460     1     yes           yes
never      yes    4    5460/5460     1     yes           yes
never      no     1    5218/5432     no    no            yes
never      no     4    5203/5448     no    no            yes

3.9.0-rc1-mm1-tunablereserves

User and Admin Recovery show their respective reserves, if applicable.

Overcommit | Swap | Hogs | MB Got/Wanted | OOMs | User Recovery | Admin Recovery
---------- ---- ---- ------------- ---- ------------- --------------
guess      yes    1    5419/5419     no    - yes         8MB yes
guess      yes    4    5436/5436     1     - yes         8MB yes
guess      no     1    5440/5440     *     - yes         8MB yes
guess      no     4    -             crash - no          8MB no

* process would successfully mlock, then the oom killer would pick it

never      yes    1    5446/5446     no    10MB yes      20MB yes
never      yes    4    5456/5456     no    10MB yes      20MB yes
never      no     1    5387/5429     no    128MB no      8MB barely
never      no     1    5323/5428     no    226MB barely  8MB barely
never      no     1    5323/5428     no    226MB barely  8MB barely
never      no     1    5359/5448     no    10MB no       10MB barely
never      no     1    5323/5428     no    0MB no        10MB barely
never      no     1    5332/5428     no    0MB no        50MB yes
never      no     1    5293/5429     no    0MB no        90MB yes
never      no     1    5001/5427     no    230MB yes     338MB yes
never      no     4*   4998/5424     no    230MB yes     338MB yes

* more memtesters were launched, able to allocate approximately another 100MB

Future Work

- Test larger memory systems.
- Test an embedded image.
- Test other architectures.
- Time malloc microbenchmarks.
- Would it be useful to be able to set overcommit policy for each memory cgroup?
- Some lines are slightly above 80 chars. Perhaps define a macro to convert between pages and kb? Other places in the kernel do this.

[akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: make init_user_reserve() static] Signed-off-by: Andrew Shewmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7aa11a6736e..43cfaabbde4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -44,6 +44,8 @@ extern int sysctl_legacy_va_layout; #include #include +extern unsigned long sysctl_user_reserve_kbytes; + #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) /* to align the pointer to the (next) page boundary */ -- cgit v1.2.3 From 4eeab4f5580d11bffedc697684b91b0bca0d5009 Mon Sep 17 00:00:00 2001 From: Andrew Shewmaker Date: Mon, 29 Apr 2013 15:08:11 -0700 Subject: mm: replace hardcoded 3% with admin_reserve_pages knob Add an admin_reserve_kbytes knob to allow admins to change the hardcoded memory reserve to something other than 3%, which may be multiple gigabytes on large memory systems. Only about 8MB is necessary to enable recovery in the default mode, and only a few hundred MB are required even when overcommit is disabled.
This affects OVERCOMMIT_GUESS and OVERCOMMIT_NEVER. admin_reserve_kbytes is initialized to min(3% free pages, 8MB). I arrived at 8MB by summing the RSS of sshd or login, bash, and top. Please see the first patch in this series for full background, motivation, testing, and full changelog. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: make init_admin_reserve() static] Signed-off-by: Andrew Shewmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 43cfaabbde4..c05d7cfbb6b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -45,6 +45,7 @@ extern int sysctl_legacy_va_layout; #include extern unsigned long sysctl_user_reserve_kbytes; +extern unsigned long sysctl_admin_reserve_kbytes; #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) -- cgit v1.2.3 From f1cb08798e2497238b28f377bd131426f0b9835d Mon Sep 17 00:00:00 2001 From: Yijing Wang Date: Mon, 29 Apr 2013 15:08:14 -0700 Subject: mm: remove CONFIG_HOTPLUG ifdefs CONFIG_HOTPLUG is going away as an option, so clean up the CONFIG_HOTPLUG ifdefs in mm files. Signed-off-by: Yijing Wang Acked-by: Greg Kroah-Hartman Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 5fd71a7d0df..c586679b6fe 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -48,13 +48,8 @@ static inline void count_vm_events(enum vm_event_item item, long delta) } extern void all_vm_events(unsigned long *); -#ifdef CONFIG_HOTPLUG + extern void vm_events_fold_cpu(int cpu); -#else -static inline void vm_events_fold_cpu(int cpu) -{ -} -#endif #else -- cgit v1.2.3 From 825f787bb49676083b97c1de1f8f2f8f26b5c908 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 29 Apr 2013 15:08:19 -0700 Subject: resource: add release_mem_region_adjustable() Add release_mem_region_adjustable(), which releases a requested region from a currently busy memory resource. This interface adjusts the matched memory resource accordingly even if the requested region does not match exactly but still fits within it. This new interface is intended for memory hot-delete. During bootup, memory resources are inserted from the boot descriptor table, such as EFI Memory Table and e820. Each memory resource entry usually covers the whole contiguous memory range. A memory hot-delete request, on the other hand, may target a particular range of a memory resource, and its size can be much smaller than the whole contiguous memory. Since the existing release interfaces like __release_region() require a requested region to be exactly matched to a resource entry, they do not allow a partial resource to be released. This new interface is restrictive (i.e. release under certain conditions), which is consistent with other release interfaces, __release_region() and __release_resource(). Additional release conditions, such as an overlapping region to a resource entry, can be supported after they are confirmed as valid cases. There is no change to the existing interfaces since their restriction is valid for I/O resources.
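A sketch of the intended caller in a memory hot-delete path (illustrative only; the prototype it uses is the one added by this patch):

#ifdef CONFIG_MEMORY_HOTREMOVE
static void hot_delete_release(u64 start, u64 size)
{
        /* Release just [start, start + size) from the busy boot-time
         * entry in iomem_resource; the entry is shrunk or split to
         * cover whatever remains.
         */
        if (release_mem_region_adjustable(&iomem_resource, start, size))
                pr_warn("unable to release mem region [%llx, %llx)\n",
                        start, start + size);
}
#endif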
[akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()] [akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily] [akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi] Signed-off-by: Toshi Kani Reviewed-by : Yasuaki Ishimatsu Cc: David Rientjes Reviewed-by: Ram Pai Cc: T Makphaibulchoke Cc: Wen Congyang Cc: Tang Chen Cc: Jiang Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 85ac9b9b72a..89b7c24a36e 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -192,6 +192,10 @@ extern struct resource * __request_region(struct resource *, extern int __check_region(struct resource *, resource_size_t, resource_size_t); extern void __release_region(struct resource *, resource_size_t, resource_size_t); +#ifdef CONFIG_MEMORY_HOTREMOVE +extern int release_mem_region_adjustable(struct resource *, resource_size_t, + resource_size_t); +#endif static inline int __deprecated check_region(resource_size_t s, resource_size_t n) -- cgit v1.2.3 From 4edd7ceff0662afde195da6f6c43e7cbe1ed2dc4 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 29 Apr 2013 15:08:22 -0700 Subject: mm, hotplug: avoid compiling memory hotremove functions when disabled __remove_pages() is only necessary for CONFIG_MEMORY_HOTREMOVE. PowerPC pseries will return -EOPNOTSUPP if unsupported. Adding an #ifdef causes several other functions it depends on to also become unnecessary, which saves in .text when disabled (it's disabled in most defconfigs besides powerpc, including x86). remove_memory_block() becomes static since it is not referenced outside of drivers/base/memory.c. Build tested on x86 and powerpc with CONFIG_MEMORY_HOTREMOVE both enabled and disabled. 
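On the arch side, the guarded prototype pairs with an implementation like the following (sketch modeled on the x86 version of the era; per-arch details differ):

#ifdef CONFIG_MEMORY_HOTREMOVE
int arch_remove_memory(u64 start, u64 size)
{
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        struct zone *zone = page_zone(pfn_to_page(start_pfn));

        /* Compiled out entirely when hot-remove is not configured. */
        return __remove_pages(zone, start_pfn, nr_pages);
}
#endif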
From 4edd7ceff0662afde195da6f6c43e7cbe1ed2dc4 Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Mon, 29 Apr 2013 15:08:22 -0700
Subject: mm, hotplug: avoid compiling memory hotremove functions when disabled

__remove_pages() is only necessary for CONFIG_MEMORY_HOTREMOVE.
PowerPC pseries will return -EOPNOTSUPP if unsupported.

Adding an #ifdef causes several other functions it depends on to also
become unnecessary, which saves .text space when disabled (it's
disabled in most defconfigs besides powerpc, including x86).

remove_memory_block() becomes static since it is not referenced
outside of drivers/base/memory.c.

Build tested on x86 and powerpc with CONFIG_MEMORY_HOTREMOVE both
enabled and disabled.

Signed-off-by: David Rientjes
Acked-by: Toshi Kani
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Greg Kroah-Hartman
Cc: Wen Congyang
Cc: Tang Chen
Cc: Yasuaki Ishimatsu
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memory.h         | 3 ++-
 include/linux/memory_hotplug.h | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/memory.h b/include/linux/memory.h
index 0ff6598ee62..73817af8b48 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -115,9 +115,10 @@ extern void unregister_memory_notifier(struct notifier_block *nb);
 extern int register_memory_isolate_notifier(struct notifier_block *nb);
 extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
 extern int register_new_memory(int, struct mem_section *);
+#ifdef CONFIG_MEMORY_HOTREMOVE
 extern int unregister_memory_section(struct mem_section *);
+#endif
 extern int memory_dev_init(void);
-extern int remove_memory_block(unsigned long, struct mem_section *, int);
 extern int memory_notify(unsigned long val, void *v);
 extern int memory_isolate_notify(unsigned long val, void *v);
 extern struct memory_block *find_memory_block_hinted(struct mem_section *,
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index b6a3be7d47b..3e622c61092 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -97,13 +97,13 @@ extern void __online_page_free(struct page *page);
 #ifdef CONFIG_MEMORY_HOTREMOVE
 extern bool is_pageblock_removable_nolock(struct page *page);
 extern int arch_remove_memory(u64 start, u64 size);
+extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
+	unsigned long nr_pages);
 #endif /* CONFIG_MEMORY_HOTREMOVE */

 /* reasonably generic interface to expand the physical pages in a zone */
 extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn,
 	unsigned long nr_pages);
-extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
-	unsigned long nr_pages);

 #ifdef CONFIG_NUMA
 extern int memory_add_physaddr_to_nid(u64 start);
--
cgit v1.2.3
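The flip side of hiding declarations behind CONFIG_MEMORY_HOTREMOVE is
that any remaining caller must compile in both configurations. A
common pattern, shown here as an illustrative sketch rather than as
part of this patch, pairs the guarded extern with a static inline
stub; the -EOPNOTSUPP return mirrors the pseries behaviour noted
above:

#ifdef CONFIG_MEMORY_HOTREMOVE
extern int unregister_memory_section(struct mem_section *);
#else
/* Hypothetical stub so CONFIG_MEMORY_HOTREMOVE=n callers still build. */
static inline int unregister_memory_section(struct mem_section *ms)
{
	return -EOPNOTSUPP;
}
#endif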
From 70ddf637eebe47e61fb2be08a59315581b6d2f38 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov
Date: Mon, 29 Apr 2013 15:08:31 -0700
Subject: memcg: add memory.pressure_level events

With this patch userland applications that want to maintain the
interactivity/memory allocation cost can use the pressure level
notifications. The levels are defined like this:

The "low" level means that the system is reclaiming memory for new
allocations. Monitoring this reclaiming activity might be useful for
maintaining cache levels. Upon notification, the program (typically
"Activity Manager") might analyze vmstat and act in advance (i.e.
prematurely shut down unimportant services).

The "medium" level means that the system is experiencing medium memory
pressure: the system might be swapping, paging out active file caches,
etc. Upon this event applications may decide to further analyze
vmstat/zoneinfo/memcg or internal memory usage statistics and free any
resources that can be easily reconstructed or re-read from a disk.

The "critical" level means that the system is actively thrashing; it
is about to run out of memory (OOM), or the in-kernel OOM killer is
even on its way to trigger. Applications should do whatever they can
to help the system. It might be too late to consult vmstat or any
other statistics, so it's advisable to take an immediate action.

The events are propagated upward until the event is handled, i.e. the
events are not pass-through. Here is what this means: for example you
have three cgroups: A->B->C. Now you set up an event listener on
cgroups A, B and C, and suppose group C experiences some pressure. In
this situation, only group C will receive the notification, i.e.
groups A and B will not receive it. This is done to avoid excessive
"broadcasting" of messages, which disturbs the system and which is
especially bad if we are low on memory or thrashing. So, organize the
cgroups wisely, or propagate the events manually (or, ask us to
implement the pass-through events, explaining why you would need
them).

Performance-wise, the memory pressure notifications feature itself is
lightweight and does not require much bookkeeping, in contrast to the
rest of memcg features. Unfortunately, as of the current memcg
implementation, pages accounting is an inseparable part and cannot be
turned off. The good news is that there are some efforts[1] to improve
the situation; plus, implementing the same, fully API-compatible[2]
interface for the CONFIG_MEMCG=n case (e.g. embedded) is also a viable
option, so it will not require any changes on the userland side.

[1] http://permalink.gmane.org/gmane.linux.kernel.cgroups/6291
[2] http://lkml.org/lkml/2013/2/21/454

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix CONFIG_CGROUPS=n warnings]
Signed-off-by: Anton Vorontsov
Acked-by: Kirill A. Shutemov
Acked-by: KAMEZAWA Hiroyuki
Cc: Tejun Heo
Cc: David Rientjes
Cc: Pekka Enberg
Cc: Mel Gorman
Cc: Glauber Costa
Cc: Michal Hocko
Cc: Luiz Capitulino
Cc: Greg Thelen
Cc: Leonid Moiseichuk
Cc: KOSAKI Motohiro
Cc: Minchan Kim
Cc: Bartlomiej Zolnierkiewicz
Cc: John Stultz
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/vmpressure.h | 47 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 include/linux/vmpressure.h

(limited to 'include')

diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
new file mode 100644
index 00000000000..76be077340e
--- /dev/null
+++ b/include/linux/vmpressure.h
@@ -0,0 +1,47 @@
+#ifndef __LINUX_VMPRESSURE_H
+#define __LINUX_VMPRESSURE_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct vmpressure {
+	unsigned long scanned;
+	unsigned long reclaimed;
+	/* The lock is used to keep the scanned/reclaimed above in sync. */
+	struct mutex sr_lock;
+
+	/* The list of vmpressure_event structs. */
+	struct list_head events;
+	/* Have to grab the lock on events traversal or modifications. */
+	struct mutex events_lock;
+
+	struct work_struct work;
+};
+
+struct mem_cgroup;
+
+#ifdef CONFIG_MEMCG
+extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+		       unsigned long scanned, unsigned long reclaimed);
+extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
+
+extern void vmpressure_init(struct vmpressure *vmpr);
+extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
+extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
+extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css);
+extern int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
+				     struct eventfd_ctx *eventfd,
+				     const char *args);
+extern void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
+					struct eventfd_ctx *eventfd);
+#else
+static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+			      unsigned long scanned, unsigned long reclaimed) {}
+static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
+				   int prio) {}
+#endif /* CONFIG_MEMCG */
+#endif /* __LINUX_VMPRESSURE_H */
--
cgit v1.2.3
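The levels are delivered through the cgroup eventfd mechanism. Below
is a userland sketch of listening for "low" events; the mount path and
the "<event_fd> <pressure_fd> <level>" control syntax are assumptions
based on the existing memcg eventfd conventions, not guaranteed by
this patch:

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	char line[64];
	uint64_t count;
	int efd = eventfd(0, 0);
	int pfd = open("/sys/fs/cgroup/memory/memory.pressure_level", O_RDONLY);
	int cfd = open("/sys/fs/cgroup/memory/cgroup.event_control", O_WRONLY);

	if (efd < 0 || pfd < 0 || cfd < 0)
		return 1;

	/* "<event_fd> <pressure_fd> <level>" registers the listener. */
	snprintf(line, sizeof(line), "%d %d low", efd, pfd);
	if (write(cfd, line, strlen(line) + 1) < 0)
		return 1;

	read(efd, &count, sizeof(count));	/* blocks until a "low" event */
	printf("memory pressure: low\n");
	return 0;
}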
From 2f772e6cadf8ad8fca38927b17e6be028be669f5 Mon Sep 17 00:00:00 2001
From: Seth Jennings
Date: Mon, 29 Apr 2013 15:08:34 -0700
Subject: mm: break up swap_writepage() for frontswap backends

swap_writepage() is currently where frontswap hooks into the swap
write path to capture pages with the frontswap_store() function.
However, if a frontswap backend wants to "resume" the writeback of a
page to the swap device, it can't call swap_writepage() as the page
will simply reenter the backend.

This patch separates swap_writepage() into top and bottom halves; the
bottom half, named __swap_writepage(), allows a frontswap backend,
like zswap, to resume writeback beyond the frontswap_store() hook.

__add_to_swap_cache() is also made non-static so that the page for
which writeback is to be resumed can be added to the swap cache.

Signed-off-by: Seth Jennings
Signed-off-by: Bob Liu
Acked-by: Minchan Kim
Reviewed-by: Dan Magenheimer
Cc: Konrad Rzeszutek Wilk
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/swap.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2818a123f3e..76f6c3b3123 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -330,6 +330,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t ent)
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct page *);
 extern int swap_writepage(struct page *page, struct writeback_control *wbc);
+extern int __swap_writepage(struct page *page, struct writeback_control *wbc);
 extern int swap_set_page_dirty(struct page *page);
 extern void end_swap_bio_read(struct bio *bio, int err);

@@ -345,6 +346,7 @@ extern unsigned long total_swapcache_pages(void);
 extern void show_swap_cache_info(void);
 extern int add_to_swap(struct page *);
 extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
+extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
 extern void __delete_from_swap_cache(struct page *);
 extern void delete_from_swap_cache(struct page *);
 extern void free_page_and_swap_cache(struct page *);
--
cgit v1.2.3
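A backend might use the new entry points roughly as in the sketch
below. This is a hedged sketch only: the store/load plumbing is
elided, the helper name is invented, and the ordering shown (re-add to
the swap cache, then write out) is an assumption; only
__swap_writepage() and __add_to_swap_cache() come from this patch:

/* Hypothetical backend path resuming writeback for entry's page. */
static int backend_resume_writeback(struct page *page, swp_entry_t entry)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,	/* asynchronous writeback */
	};
	int ret;

	/* Make the page visible to the swap cache again... */
	ret = __add_to_swap_cache(page, entry);
	if (ret)
		return ret;

	/* ...then write it out, bypassing the frontswap_store() hook. */
	return __swap_writepage(page, &wbc);
}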
From 1eec6702a80e04416d528846a5ff2122484d95ec Mon Sep 17 00:00:00 2001
From: Seth Jennings
Date: Mon, 29 Apr 2013 15:08:35 -0700
Subject: mm: allow for outstanding swap writeback accounting

To prevent flooding the swap device with writebacks, frontswap
backends need to count and limit the number of outstanding writebacks.
The incrementing of the counter can be done before the call to
__swap_writepage(). However, the caller must receive a notification
when the writeback completes in order to decrement the counter.

To achieve this functionality, this patch modifies __swap_writepage()
to take the bio completion callback function as an argument.

end_swap_bio_write(), the normal bio completion function, is also made
non-static so that code doing the accounting can call it after the
accounting is done.

There should be no behavioural change to existing code.

Signed-off-by: Seth Jennings
Signed-off-by: Bob Liu
Acked-by: Minchan Kim
Reviewed-by: Dan Magenheimer
Cc: Konrad Rzeszutek Wilk
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/swap.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 76f6c3b3123..b5b12c71a2a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -330,7 +330,9 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t ent)
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct page *);
 extern int swap_writepage(struct page *page, struct writeback_control *wbc);
-extern int __swap_writepage(struct page *page, struct writeback_control *wbc);
+extern void end_swap_bio_write(struct bio *bio, int err);
+extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
+	void (*end_write_func)(struct bio *, int));
 extern int swap_set_page_dirty(struct page *page);
 extern void end_swap_bio_read(struct bio *bio, int err);
--
cgit v1.2.3
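The accounting pattern the changelog describes might look like the
following sketch. The counter and function names are hypothetical;
only the callback signature, __swap_writepage(), and
end_swap_bio_write() come from the patch:

static atomic_t outstanding_writebacks = ATOMIC_INIT(0);

/* Custom completion: drop the count, then finish normally. */
static void backend_end_swap_write(struct bio *bio, int err)
{
	atomic_dec(&outstanding_writebacks);
	end_swap_bio_write(bio, err);
}

static int backend_writepage(struct page *page,
			     struct writeback_control *wbc)
{
	/* Count before submission; the completion decrements. */
	atomic_inc(&outstanding_writebacks);
	return __swap_writepage(page, wbc, backend_end_swap_write);
}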
From 5bc7b8aca942d03bf2716ddcfcb4e0b57e43a1b8 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Mon, 29 Apr 2013 15:08:36 -0700
Subject: mm: thp: add split tail pages to shrink page list in page reclaim

In page reclaim, a huge page is split and split_huge_page() adds the
tail pages to the LRU list. Since we are reclaiming a huge page, it's
better to reclaim all subpages of the huge page instead of just the
head page. This patch adds the split tail pages to the shrink page
list so they can be reclaimed soon.

Before this patch, running a swap workload:

  thp_fault_alloc             3492
  thp_fault_fallback          608
  thp_collapse_alloc          6
  thp_collapse_alloc_failed   0
  thp_split                   916

With this patch:

  thp_fault_alloc             4085
  thp_fault_fallback          16
  thp_collapse_alloc          90
  thp_collapse_alloc_failed   0
  thp_split                   1272

Fallback allocation is reduced a lot.

[akpm@linux-foundation.org: fix CONFIG_SWAP=n build]
Signed-off-by: Shaohua Li
Acked-by: Rik van Riel
Acked-by: Minchan Kim
Acked-by: Johannes Weiner
Reviewed-by: Wanpeng Li
Cc: Andrea Arcangeli
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/huge_mm.h | 11 ++++++++++-
 include/linux/swap.h    |  6 +++---
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ee1c244a62a..528454c2caa 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -99,7 +99,11 @@ extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 extern int handle_pte_fault(struct mm_struct *mm,
 			struct vm_area_struct *vma, unsigned long address,
 			pte_t *pte, pmd_t *pmd, unsigned int flags);
-extern int split_huge_page(struct page *page);
+extern int split_huge_page_to_list(struct page *page, struct list_head *list);
+static inline int split_huge_page(struct page *page)
+{
+	return split_huge_page_to_list(page, NULL);
+}
 extern void __split_huge_page_pmd(struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd);
 #define split_huge_page_pmd(__vma, __address, __pmd)	\
@@ -186,6 +190,11 @@ extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vm
 #define transparent_hugepage_enabled(__vma) 0

 #define transparent_hugepage_flags 0UL
+static inline int
+split_huge_page_to_list(struct page *page, struct list_head *list)
+{
+	return 0;
+}
 static inline int split_huge_page(struct page *page)
 {
 	return 0;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b5b12c71a2a..1701ce4be74 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -236,7 +236,7 @@ extern unsigned long nr_free_pagecache_pages(void);
 extern void __lru_cache_add(struct page *, enum lru_list lru);
 extern void lru_cache_add_lru(struct page *, enum lru_list lru);
 extern void lru_add_page_tail(struct page *page, struct page *page_tail,
-			 struct lruvec *lruvec);
+			 struct lruvec *lruvec, struct list_head *head);
 extern void activate_page(struct page *);
 extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
@@ -346,7 +346,7 @@ extern struct address_space swapper_spaces[];
 #define swap_address_space(entry) (&swapper_spaces[swp_type(entry)])
 extern unsigned long total_swapcache_pages(void);
 extern void show_swap_cache_info(void);
-extern int add_to_swap(struct page *);
+extern int add_to_swap(struct page *, struct list_head *list);
 extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
 extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
 extern void __delete_from_swap_cache(struct page *);
@@ -465,7 +465,7 @@ static inline struct page *lookup_swap_cache(swp_entry_t swp)
 	return NULL;
 }

-static inline int add_to_swap(struct page *page)
+static inline int add_to_swap(struct page *page, struct list_head *list)
 {
 	return 0;
 }
--
cgit v1.2.3
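In the reclaim path, the new interface can be used roughly as in this
illustrative sketch; the surrounding shrink loop is elided and the
helper name is invented, with only split_huge_page_to_list() and the
add_to_swap() list argument taken from the patch:

/* Hypothetical fragment of a shrink loop working on page_list. */
static void try_reclaim_page(struct page *page, struct list_head *page_list)
{
	if (PageTransHuge(page) &&
	    split_huge_page_to_list(page, page_list))
		return;	/* split failed; skip this page for now */

	/*
	 * The head page continues here; the tail pages were queued on
	 * page_list and will be reclaimed on later iterations.
	 */
	add_to_swap(page, page_list);
}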
From b4def3509d18c1db9198f92d4c35065e029a09a1 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim
Date: Mon, 29 Apr 2013 15:08:52 -0700
Subject: mm, nobootmem: clean-up of free_low_memory_core_early()

Remove the unused argument and make the function static, because there
are no users outside of nobootmem.c.

Signed-off-by: Joonsoo Kim
Cc: Yinghai Lu
Acked-by: Johannes Weiner
Cc: Jiang Liu
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/bootmem.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index cdc3bab0183..5f0b0e1f7c0 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -44,7 +44,6 @@ extern unsigned long init_bootmem_node(pg_data_t *pgdat,
 				       unsigned long endpfn);
 extern unsigned long init_bootmem(unsigned long addr, unsigned long memend);

-extern unsigned long free_low_memory_core_early(int nodeid);
 extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
 extern unsigned long free_all_bootmem(void);
--
cgit v1.2.3