From 68a2d20b79b105f02dcbc52c211d7e62f98996b7 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Mon, 29 Apr 2013 15:05:42 -0700 Subject: drivers/video: add Hyper-V Synthetic Video Frame Buffer Driver This is the driver for the Hyper-V Synthetic Video, which supports screen resolution up to Full HD 1920x1080 on Windows Server 2012 host, and 1600x1200 on Windows Server 2008 R2 or earlier. It also solves the double mouse cursor issue of the emulated video mode. Signed-off-by: Haiyang Zhang Reviewed-by: K. Y. Srinivasan Cc: Greg Kroah-Hartman Cc: Olaf Hering Cc: Geert Uytterhoeven Cc: Florian Tobias Schandinat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hyperv.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 95d0850584d..c2559847d7e 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1317,6 +1317,17 @@ void vmbus_driver_unregister(struct hv_driver *hv_driver); 0x29, 0x2e, 0xfa, 0x35, 0x23, 0xea, 0x36, 0x42, \ 0x96, 0xae, 0x3a, 0x6e, 0xba, 0xcb, 0xa4, 0x40 \ } +/* + * Synthetic Video GUID + * {DA0A7802-E377-4aac-8E77-0558EB1073F8} + */ +#define HV_SYNTHVID_GUID \ + .guid = { \ + 0x02, 0x78, 0x0a, 0xda, 0x77, 0xe3, 0xac, 0x4a, \ + 0x8e, 0x77, 0x05, 0x58, 0xeb, 0x10, 0x73, 0xf8 \ + } + + /* * Common header for Hyper-V ICs */ -- cgit v1.2.3 From 2c2fea11957c8c45bf873d9bcd7cd9a342654e79 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 29 Apr 2013 15:06:06 -0700 Subject: debug_locks.h: make warning more verbose The WARN_ON(1) in DEBUG_LOCKS_WARN_ON is surprisingly awkward to track down when it's hit, as it's usually buried in macros, causing multiple instances to land on the same line number. This patch makes it more useful by switching to: WARN(1, "DEBUG_LOCKS_WARN_ON(%s)", #c); so that the particular DEBUG_LOCKS_WARN_ON is more easily identified and grep'd for. For example: WARNING: at kernel/mutex.c:198 _mutex_lock_nested+0x31c/0x380() DEBUG_LOCKS_WARN_ON(l->magic != l) Signed-off-by: James Hogan Cc: Paul Gortmaker Cc: David Howells Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/debug_locks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 3bd46f76675..21ca773f77b 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -27,7 +27,7 @@ extern int debug_locks_off(void); \ if (!oops_in_progress && unlikely(c)) { \ if (debug_locks_off() && !debug_locks_silent) \ - WARN_ON(1); \ + WARN(1, "DEBUG_LOCKS_WARN_ON(%s)", #c); \ __ret = 1; \ } \ __ret; \ -- cgit v1.2.3 From fe0bfaaff84429a35e4447d4ccd646aecf5999fb Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Mon, 29 Apr 2013 15:06:10 -0700 Subject: mm: trace filemap add and del Use the events API to trace filemap loading and unloading of file pieces into the page cache. This patch aims at tracing the eviction reload cycle of executable and shared libraries pages in a memory constrained environment. The typical usage is to spot a specific device and inode (for example /lib/libc.so) to see the eviction cycles, and find out if frequently used code is rather spread across many pages (bad) or coalesced (good).
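For orientation, each DEFINE_EVENT in the header added below generates a trace_<event>() function. A minimal sketch of how the mm/ side of this patch hooks the two events in mm/filemap.c (call-site placement here is approximate, not the literal diff):

#include <trace/events/filemap.h> /* one .c file defines CREATE_TRACE_POINTS first */

/* Deletion path: fire the event while page->mapping is still set,
 * because the event's TP_fast_assign below dereferences it.
 */
void __delete_from_page_cache(struct page *page)
{
        trace_mm_filemap_delete_from_page_cache(page);
        /* ... radix-tree removal and accounting ... */
}

/* Insertion path: fire the event once the page is in the page cache. */
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
                             pgoff_t offset, gfp_t gfp_mask)
{
        /* ... radix-tree insertion ... */
        trace_mm_filemap_add_to_page_cache(page);
        return 0;
}

The dev/ino fields recorded by the event class are what let a trace consumer filter for a single file, such as the libc mapping mentioned above.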
Signed-off-by: Robert Jarzmik Cc: Dave Chinner Cc: Hugh Dickins Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/filemap.h | 58 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 include/trace/events/filemap.h (limited to 'include') diff --git a/include/trace/events/filemap.h b/include/trace/events/filemap.h new file mode 100644 index 00000000000..0421f49a20f --- /dev/null +++ b/include/trace/events/filemap.h @@ -0,0 +1,58 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM filemap + +#if !defined(_TRACE_FILEMAP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FILEMAP_H + +#include +#include +#include +#include +#include +#include + +DECLARE_EVENT_CLASS(mm_filemap_op_page_cache, + + TP_PROTO(struct page *page), + + TP_ARGS(page), + + TP_STRUCT__entry( + __field(struct page *, page) + __field(unsigned long, i_ino) + __field(unsigned long, index) + __field(dev_t, s_dev) + ), + + TP_fast_assign( + __entry->page = page; + __entry->i_ino = page->mapping->host->i_ino; + __entry->index = page->index; + if (page->mapping->host->i_sb) + __entry->s_dev = page->mapping->host->i_sb->s_dev; + else + __entry->s_dev = page->mapping->host->i_rdev; + ), + + TP_printk("dev %d:%d ino %lx page=%p pfn=%lu ofs=%lu", + MAJOR(__entry->s_dev), MINOR(__entry->s_dev), + __entry->i_ino, + __entry->page, + page_to_pfn(__entry->page), + __entry->index << PAGE_SHIFT) +); + +DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_delete_from_page_cache, + TP_PROTO(struct page *page), + TP_ARGS(page) + ); + +DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_add_to_page_cache, + TP_PROTO(struct page *page), + TP_ARGS(page) + ); + +#endif /* _TRACE_FILEMAP_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 4b59e6c4730978679b414a8da61514a2518da512 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 29 Apr 2013 15:06:11 -0700 Subject: mm, show_mem: suppress page counts in non-blockable contexts On large systems with a lot of memory, walking all RAM to determine page types may take a half second or even more. In non-blockable contexts, the page allocator will emit a page allocation failure warning unless __GFP_NOWARN is specified. In such contexts, irqs are typically disabled and such a lengthy delay may even result in NMI watchdog timeouts. To fix this, suppress the page walk in such contexts when printing the page allocation failure warning. Signed-off-by: David Rientjes Cc: Mel Gorman Acked-by: Michal Hocko Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index e2091b88d24..f3c7b1f9d1d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -899,7 +899,8 @@ extern void pagefault_out_of_memory(void); * Flags passed to show_mem() and show_free_areas() to suppress output in * various contexts. 
*/ -#define SHOW_MEM_FILTER_NODES (0x0001u) /* filter disallowed nodes */ +#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ +#define SHOW_MEM_FILTER_PAGE_COUNT (0x0002u) /* page type count */ extern void show_free_areas(unsigned int flags); extern bool skip_free_areas_node(unsigned int flags, int nid); -- cgit v1.2.3 From 250297edf83292c831fbf4504df54953c2aacfe4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 29 Apr 2013 15:06:12 -0700 Subject: mm/shmem.c: remove an ifdef Create a CONFIG_MMU=y stub for ramfs_nommu_expand_for_mapping() in the usual fashion. Cc: Al Viro Cc: Wolfram Sang Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ramfs.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h index 5bf5500db83..69e37c2d1ea 100644 --- a/include/linux/ramfs.h +++ b/include/linux/ramfs.h @@ -6,7 +6,13 @@ struct inode *ramfs_get_inode(struct super_block *sb, const struct inode *dir, extern struct dentry *ramfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); -#ifndef CONFIG_MMU +#ifdef CONFIG_MMU +static inline int +ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) +{ + return 0; +} +#else extern int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize); extern unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, -- cgit v1.2.3 From 8375ad98cc1defc36adf4a77d9ea1e71db51a371 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Apr 2013 15:06:13 -0700 Subject: vm: adjust ifdef for TINY_RCU There is an ifdef in page_cache_get_speculative() that checks for !SMP and TREE_RCU, which has been an impossible combination since the advent of TINY_RCU. The ifdef enables a fastpath that is valid when preemption is disabled by rcu_read_lock() in UP systems, which is the case when TINY_RCU is enabled. This commit therefore adjusts the ifdef to generate the fastpath when TINY_RCU is enabled. Signed-off-by: Paul E. McKenney Reported-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0e38e13eb24..e3dea75a078 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -149,7 +149,7 @@ static inline int page_cache_get_speculative(struct page *page) { VM_BUG_ON(in_interrupt()); -#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) +#ifdef CONFIG_TINY_RCU # ifdef CONFIG_PREEMPT_COUNT VM_BUG_ON(!in_atomic()); # endif -- cgit v1.2.3 From 69afade72a3e13e96a065f757891d384d466123f Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 29 Apr 2013 15:06:21 -0700 Subject: mm: introduce common help functions to deal with reserved/managed pages The original goal of this patchset is to fix the bug reported by https://bugzilla.kernel.org/show_bug.cgi?id=53501 Now it has also been expanded to reduce common code used by memory initialization. This is the first part, which applies to v3.9-rc1. It introduces the following common helper functions to simplify free_initmem() and free_initrd_mem() on different architectures: adjust_managed_page_count(): will be used to adjust totalram_pages, totalhigh_pages, zone->managed_pages when reserving/unreserving a page.
__free_reserved_page(): free a reserved page into the buddy system without adjusting page statistics info free_reserved_page(): free a reserved page into the buddy system and adjust page statistics info mark_page_reserved(): mark a page as reserved and adjust page statistics info free_reserved_area(): free a contiguous range of pages by calling free_reserved_page() free_initmem_default(): default method to free __init pages. We have only tested this patchset on x86 platforms, and have done basic compilation tests using cross-compilers from ftp.kernel.org. That means some code may not pass compilation on some architectures. So any help to test this patchset is welcome! There are several other parts still under development: Part2: introduce free_highmem_page() to simplify freeing highmem pages Part3: refine code to manage totalram_pages, totalhigh_pages and zone->managed_pages Part4: introduce helper functions to simplify mem_init() and remove the global variable num_physpages. This patch: Code to deal with reserved/managed pages is duplicated by many architectures, so introduce common helper functions to reduce the duplicated code. These common helper functions will also be used to concentrate the code that modifies totalram_pages and zone->managed_pages, which makes the code much clearer. Signed-off-by: Jiang Liu Acked-by: Geert Uytterhoeven Cc: "H. Peter Anvin" Cc: "James E.J. Bottomley" Cc: Anatolij Gustschin Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Liqin Cc: Chris Zankel Cc: David Howells Cc: David S. Miller Cc: Eric Biederman Cc: Fenghua Yu Cc: Guan Xuetao Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Heiko Carstens Cc: Helge Deller Cc: Hirokazu Takata Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Hogan Cc: Jeff Dike Cc: Jiang Liu Cc: Jiang Liu Cc: Jonas Bonn Cc: Koichi Yasutake Cc: Lennox Wu Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Turner Cc: Max Filippov Cc: Michal Simek Cc: Mikael Starvik Cc: Mike Frysinger Cc: Paul Mackerras Cc: Paul Mundt Cc: Ralf Baechle Cc: Richard Henderson Cc: Russell King Cc: Sam Ravnborg Cc: Thomas Gleixner Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index f3c7b1f9d1d..d064c73c925 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1295,6 +1295,54 @@ extern void free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void free_initmem(void); +/* + * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK) + * into the buddy system. The freed pages will be poisoned with pattern + * "poison" if it's non-zero. + * Return pages freed into the buddy system. + */ +extern unsigned long free_reserved_area(unsigned long start, unsigned long end, + int poison, char *s); + +static inline void adjust_managed_page_count(struct page *page, long count) +{ + totalram_pages += count; +} + +/* Free the reserved page into the buddy system, so it gets managed.
*/ +static inline void __free_reserved_page(struct page *page) +{ + ClearPageReserved(page); + init_page_count(page); + __free_page(page); +} + +static inline void free_reserved_page(struct page *page) +{ + __free_reserved_page(page); + adjust_managed_page_count(page, 1); +} + +static inline void mark_page_reserved(struct page *page) +{ + SetPageReserved(page); + adjust_managed_page_count(page, -1); +} + +/* + * Default method to free all the __init memory into the buddy system. + * The freed pages will be poisoned with pattern "poison" if it is + * non-zero. Return pages freed into the buddy system. + */ +static inline unsigned long free_initmem_default(int poison) +{ + extern char __init_begin[], __init_end[]; + + return free_reserved_area(PAGE_ALIGN((unsigned long)&__init_begin) , + ((unsigned long)&__init_end) & PAGE_MASK, + poison, "unused kernel"); +} + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its -- cgit v1.2.3 From cfa11e08ed39eb28a9eff9a907b20913020c69b5 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 29 Apr 2013 15:07:00 -0700 Subject: mm: introduce free_highmem_page() helper to free highmem pages into buddy system The original goal of this patchset is to fix the bug reported by https://bugzilla.kernel.org/show_bug.cgi?id=53501 Now it has also been expanded to reduce common code used by memory initialization. This is the second part, which applies to the previous part at: http://marc.info/?l=linux-mm&m=136289696323825&w=2 It introduces a helper function free_highmem_page() to free highmem pages into the buddy system when initializing mm subsystem. Introduction of free_highmem_page() is one step forward to clean up accesses and modifications of totalhigh_pages, totalram_pages and zone->managed_pages etc. I hope we could remove all references to totalhigh_pages from the arch/ subdirectory. We have only tested this patchset on x86 platforms, and have done basic compilation tests using cross-compilers from ftp.kernel.org. That means some code may not pass compilation on some architectures. So any help to test this patchset is welcome! There are several other parts still under development: Part3: refine code to manage totalram_pages, totalhigh_pages and zone->managed_pages Part4: introduce helper functions to simplify mem_init() and remove the global variable num_physpages. This patch: Introduce helper function free_highmem_page(), which will be used by architectures with HIGHMEM enabled to free highmem pages into the buddy system. Signed-off-by: Jiang Liu Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "Suzuki K.
Poulose" Cc: Alexander Graf Cc: Arnd Bergmann Cc: Attilio Rao Cc: Benjamin Herrenschmidt Cc: Cong Wang Cc: David Daney Cc: David Howells Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: James Hogan Cc: Jeff Dike Cc: Jiang Liu Cc: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Konstantin Khlebnikov Cc: Linus Walleij Cc: Marek Szyprowski Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Michal Simek Cc: Michel Lespinasse Cc: Minchan Kim Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Weinberger Cc: Rik van Riel Cc: Russell King Cc: Sam Ravnborg Cc: Stephen Boyd Cc: Thomas Gleixner Cc: Yinghai Lu Reviewed-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index d064c73c925..43b70d5f820 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1303,6 +1303,13 @@ extern void free_initmem(void); */ extern unsigned long free_reserved_area(unsigned long start, unsigned long end, int poison, char *s); +#ifdef CONFIG_HIGHMEM +/* + * Free a highmem page into the buddy system, adjusting totalhigh_pages + * and totalram_pages. + */ +extern void free_highmem_page(struct page *page); +#endif static inline void adjust_managed_page_count(struct page *page, long count) { -- cgit v1.2.3 From 6d2488f64a240191f0733c1f32d73607916b01b7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 29 Apr 2013 15:07:21 -0700 Subject: cgroup: remove css_get_next Now that we have generic and well ordered cgroup tree walkers there is no need to keep css_get_next in the place. Signed-off-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Johannes Weiner Cc: Ying Han Cc: Tejun Heo Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 900af5964f5..470073bf93d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -687,13 +687,6 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); -/* - * Get a cgroup whose id is greater than or equal to id under tree of root. - * Returning a cgroup_subsys_state or NULL. - */ -struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, - struct cgroup_subsys_state *root, int *foundid); - /* Returns true if root is ancestor of cg */ bool css_is_ancestor(struct cgroup_subsys_state *cg, const struct cgroup_subsys_state *root); -- cgit v1.2.3 From 146732ce104ddfed3d4d82722c0b336074016b92 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 29 Apr 2013 15:07:22 -0700 Subject: fs: don't compile in drop_caches.c when CONFIG_SYSCTL=n drop_caches.c provides code only invokable via sysctl, so don't compile it in when CONFIG_SYSCTL=n. 
Signed-off-by: Josh Triplett Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 43b70d5f820..98a44376e4f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1731,8 +1731,12 @@ int in_gate_area_no_mm(unsigned long addr); #define in_gate_area(mm, addr) ({(void)mm; in_gate_area_no_mm(addr);}) #endif /* __HAVE_ARCH_GATE_AREA */ +#ifdef CONFIG_SYSCTL +extern int sysctl_drop_caches; int drop_caches_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +#endif + unsigned long shrink_slab(struct shrink_control *shrink, unsigned long nr_pages_scanned, unsigned long lru_pages); -- cgit v1.2.3 From 106c992a5ebef28193cf5958e49ceff5e4aebb04 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 29 Apr 2013 15:07:23 -0700 Subject: mm/hugetlb: add more arch-defined huge_pte functions Commit abf09bed3cce ("s390/mm: implement software dirty bits") introduced another difference in the pte layout vs. the pmd layout on s390, thoroughly breaking the s390 support for hugetlbfs. This requires replacing some more pte_xxx functions in mm/hugetlbfs.c with a huge_pte_xxx version. This patch introduces those huge_pte_xxx functions and their generic implementation in asm-generic/hugetlb.h, which will now be included on all architectures supporting hugetlbfs apart from s390. This change will be a no-op for those architectures. [akpm@linux-foundation.org: fix warning] Signed-off-by: Gerald Schaefer Cc: Mel Gorman Cc: Hugh Dickins Cc: Hillf Danton Acked-by: Michal Hocko [for !s390 parts] Cc: Tony Luck Cc: Fenghua Yu Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/hugetlb.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 include/asm-generic/hugetlb.h (limited to 'include') diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h new file mode 100644 index 00000000000..d06079c774a --- /dev/null +++ b/include/asm-generic/hugetlb.h @@ -0,0 +1,40 @@ +#ifndef _ASM_GENERIC_HUGETLB_H +#define _ASM_GENERIC_HUGETLB_H + +static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot) +{ + return mk_pte(page, pgprot); +} + +static inline int huge_pte_write(pte_t pte) +{ + return pte_write(pte); +} + +static inline int huge_pte_dirty(pte_t pte) +{ + return pte_dirty(pte); +} + +static inline pte_t huge_pte_mkwrite(pte_t pte) +{ + return pte_mkwrite(pte); +} + +static inline pte_t huge_pte_mkdirty(pte_t pte) +{ + return pte_mkdirty(pte); +} + +static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) +{ + return pte_modify(pte, newprot); +} + +static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pte_clear(mm, addr, ptep); +} + +#endif /* _ASM_GENERIC_HUGETLB_H */ -- cgit v1.2.3 From 7136851117744f1d291bed6d307432699d405109 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 29 Apr 2013 15:07:25 -0700 Subject: mm: make snapshotting pages for stable writes a per-bio operation Walking a bio's page mappings has proved problematic, so create a new bio flag to indicate that a bio's data needs to be snapshotted in order to guarantee stable pages during writeback. 
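As a sketch of the caller side (the _submit_bh() variant and the flag appear in the hunks below; the exact ext3/jbd call sites are in the fs/ part of this patch):

/* Illustrative: a journal write that needs stable pages passes the new
 * flag down so the block layer can snapshot the data before issuing I/O.
 */
static int submit_journal_bh(int write_op, struct buffer_head *bh)
{
        return _submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE);
}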
Next, for the one user (ext3/jbd) of snapshotting, hook all the places where writes can be initiated without PG_writeback set, and set BIO_SNAP_STABLE there. We must also flag journal "metadata" bios for stable writeout, since file data can be written through the journal. Finally, the MS_SNAP_STABLE mount flag (only used by ext3) is now superfluous, so get rid of it. [akpm@linux-foundation.org: rename _submit_bh()'s `flags' to `bio_flags', delobotomize the _submit_bh declaration] [akpm@linux-foundation.org: teeny cleanup] Signed-off-by: Darrick J. Wong Cc: Andy Lutomirski Cc: Adrian Hunter Cc: Artem Bityutskiy Reviewed-by: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/blk_types.h | 3 ++- include/linux/buffer_head.h | 1 + include/uapi/linux/fs.h | 1 - 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index cdf11191e64..22990cf4439 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -111,12 +111,13 @@ struct bio { #define BIO_FS_INTEGRITY 9 /* fs owns integrity data, not block layer */ #define BIO_QUIET 10 /* Make BIO Quiet */ #define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped */ +#define BIO_SNAP_STABLE 12 /* bio data must be snapshotted during write */ /* * Flags starting here get preserved by bio_reset() - this includes * BIO_POOL_IDX() */ -#define BIO_RESET_BITS 12 +#define BIO_RESET_BITS 13 #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 5afc4f94d11..4c16c4a88d4 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -181,6 +181,7 @@ void ll_rw_block(int, int, struct buffer_head * bh[]); int sync_dirty_buffer(struct buffer_head *bh); int __sync_dirty_buffer(struct buffer_head *bh, int rw); void write_dirty_buffer(struct buffer_head *bh, int rw); +int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags); int submit_bh(int, struct buffer_head *); void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index c7fc1e6517c..a4ed56cf0ea 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -88,7 +88,6 @@ struct inodes_stat_t { #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ /* These sb flags are internal to the kernel */ -#define MS_SNAP_STABLE (1<<27) /* Snapshot pages during writeback, if needed */ #define MS_NOSEC (1<<28) #define MS_BORN (1<<29) #define MS_ACTIVE (1<<30) -- cgit v1.2.3 From db3808c1bac64740b9d830fda92801ae65f1c851 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 29 Apr 2013 15:07:28 -0700 Subject: mm, vmalloc: move get_vmalloc_info() to vmalloc.c Now get_vmalloc_info() is in fs/proc/mmu.c. There is no reason that this code must be here and it's implementation needs vmlist_lock and it iterate a vmlist which may be internal data structure for vmalloc. It is preferable that vmlist_lock and vmlist is only used in vmalloc.c for maintainability. So move the code to vmalloc.c Signed-off-by: Joonsoo Kim Signed-off-by: Joonsoo Kim Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Atsushi Kumagai Cc: Chris Metcalf Cc: Dave Anderson Cc: Eric Biederman Cc: Guan Xuetao Cc: Ingo Molnar Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 6071e911c7f..698b1e50d3a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -158,4 +158,22 @@ pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) # endif #endif +struct vmalloc_info { + unsigned long used; + unsigned long largest_chunk; +}; + +#ifdef CONFIG_MMU +#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) +extern void get_vmalloc_info(struct vmalloc_info *vmi); +#else + +#define VMALLOC_TOTAL 0UL +#define get_vmalloc_info(vmi) \ +do { \ + (vmi)->used = 0; \ + (vmi)->largest_chunk = 0; \ +} while (0) +#endif + #endif /* _LINUX_VMALLOC_H */ -- cgit v1.2.3 From f1c4069e1dc128dc8a851174cba2e273652e9216 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 29 Apr 2013 15:07:37 -0700 Subject: mm, vmalloc: export vmap_area_list, instead of vmlist Although our intention is to unexport internal structure entirely, but there is one exception for kexec. kexec dumps address of vmlist and makedumpfile uses this information. We are about to remove vmlist, then another way to retrieve information of vmalloc layer is needed for makedumpfile. For this purpose, we export vmap_area_list, instead of vmlist. Signed-off-by: Joonsoo Kim Signed-off-by: Joonsoo Kim Cc: Eric Biederman Cc: Dave Anderson Cc: Vivek Goyal Cc: Atsushi Kumagai Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Chris Metcalf Cc: Guan Xuetao Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 698b1e50d3a..8a25f9081ed 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -130,8 +130,7 @@ extern long vwrite(char *buf, char *addr, unsigned long count); /* * Internals. Dont't use.. */ -extern rwlock_t vmlist_lock; -extern struct vm_struct *vmlist; +extern struct list_head vmap_area_list; extern __init void vm_area_add_early(struct vm_struct *vm); extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); -- cgit v1.2.3 From 13ba3fcbbe31068b1ee7c39a0b58ecbed03c4d72 Mon Sep 17 00:00:00 2001 From: Atsushi Kumagai Date: Mon, 29 Apr 2013 15:07:40 -0700 Subject: kexec, vmalloc: export additional vmalloc layer information Now, vmap_area_list is exported as VMCOREINFO for makedumpfile to get the start address of vmalloc region (vmalloc_start). The address which contains vmalloc_start value is represented as below: vmap_area_list.next - OFFSET(vmap_area.list) + OFFSET(vmap_area.va_start) However, both OFFSET(vmap_area.va_start) and OFFSET(vmap_area.list) aren't exported as VMCOREINFO. So this patch exports them externally with small cleanup. [akpm@linux-foundation.org: vmalloc.h should include list.h for list_head] Signed-off-by: Atsushi Kumagai Cc: Joonsoo Kim Cc: Joonsoo Kim Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Atsushi Kumagai Cc: Chris Metcalf Cc: Dave Anderson Cc: Eric Biederman Cc: Guan Xuetao Cc: Ingo Molnar Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 8a25f9081ed..7d5773a99f2 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -3,7 +3,9 @@ #include #include +#include #include /* pgprot_t */ +#include struct vm_area_struct; /* vma defining user mapping in mm_types.h */ @@ -35,6 +37,17 @@ struct vm_struct { const void *caller; }; +struct vmap_area { + unsigned long va_start; + unsigned long va_end; + unsigned long flags; + struct rb_node rb_node; /* address sorted rbtree */ + struct list_head list; /* address sorted list */ + struct list_head purge_list; /* "lazy purge" list */ + struct vm_struct *vm; + struct rcu_head rcu_head; +}; + /* * Highlevel APIs for driver use */ -- cgit v1.2.3 From 6ee8630e02be6dd89926ca0fbc21af68b23dc087 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 29 Apr 2013 15:07:44 -0700 Subject: mm: allow arch code to control the user page table ceiling On architectures where a pgd entry may be shared between user and kernel (e.g. ARM+LPAE), freeing page tables needs a ceiling other than 0. This patch introduces a generic USER_PGTABLES_CEILING that arch code can override. It is the responsibility of the arch code setting the ceiling to ensure the complete freeing of the page tables (usually in pgd_free()). [catalin.marinas@arm.com: commit log; shift_arg_pages(), asm-generic/pgtables.h changes] Signed-off-by: Hugh Dickins Signed-off-by: Catalin Marinas Cc: Russell King Cc: [3.3+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index bfd87685fc1..a59ff51b016 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -7,6 +7,16 @@ #include #include +/* + * On almost all architectures and configurations, 0 can be used as the + * upper ceiling to free_pgtables(): on many architectures it has the same + * effect as using TASK_SIZE. However, there is one configuration which + * must impose a more careful limit, to avoid freeing kernel pgtables. + */ +#ifndef USER_PGTABLES_CEILING +#define USER_PGTABLES_CEILING 0UL +#endif + #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, -- cgit v1.2.3 From 949f7ec5760b021da3cccc1eaeb0671270e4238f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 29 Apr 2013 15:07:48 -0700 Subject: mm, hugetlb: include hugepages in meminfo Particularly in oom conditions, it's troublesome that hugetlb memory is not displayed. All other meminfo that is emitted will not add up to what is expected, and there is no artifact left in the kernel log to show that a potentially significant amount of memory is actually allocated as hugepages which are not available to be reclaimed. Booting with hugepages=8192 on the command line, this memory is now shown in oom conditions. 
For example, with echo m > /proc/sysrq-trigger: Node 0 hugepages_total=2048 hugepages_free=2048 hugepages_surp=0 hugepages_size=2048kB Node 1 hugepages_total=2048 hugepages_free=2048 hugepages_surp=0 hugepages_size=2048kB Node 2 hugepages_total=2048 hugepages_free=2048 hugepages_surp=0 hugepages_size=2048kB Node 3 hugepages_total=2048 hugepages_free=2048 hugepages_surp=0 hugepages_size=2048kB [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: David Rientjes Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 16e4e9a643f..3a62df310f2 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -58,6 +58,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, int hugetlb_prefault(struct address_space *, struct vm_area_struct *); void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(int, char *); +void hugetlb_show_meminfo(void); unsigned long hugetlb_total_pages(void); int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); @@ -114,6 +115,9 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) { } #define hugetlb_report_node_meminfo(n, buf) 0 +static inline void hugetlb_show_meminfo(void) +{ +} #define follow_huge_pmd(mm, addr, pmd, write) NULL #define follow_huge_pud(mm, addr, pud, write) NULL #define prepare_hugepage_range(file, addr, len) (-EINVAL) -- cgit v1.2.3 From 0aad818b2de455f1bfd7ef87c28cdbbaaed9a699 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Apr 2013 15:07:50 -0700 Subject: sparse-vmemmap: specify vmemmap population range in bytes The sparse code, when asking the architecture to populate the vmemmap, specifies the section range as a starting page and a number of pages. This is an awkward interface, because none of the arch-specific code actually thinks of the range in terms of 'struct page' units and always translates it to bytes first. In addition, later patches mix huge page and regular page backing for the vmemmap. For this, they need to call vmemmap_populate_basepages() on sub-section ranges with PAGE_SIZE and PMD_SIZE in mind. But these are not necessarily multiples of the 'struct page' size and so this unit is too coarse. Just translate the section range into bytes once in the generic sparse code, then pass byte ranges down the stack. Signed-off-by: Johannes Weiner Cc: Ben Hutchings Cc: Bernhard Schmidt Cc: Johannes Weiner Cc: Russell King Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Benjamin Herrenschmidt Cc: "Luck, Tony" Cc: Heiko Carstens Acked-by: David S. Miller Tested-by: David S. 
Miller Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 98a44376e4f..6d7266842ab 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1764,12 +1764,12 @@ pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); void *vmemmap_alloc_block(unsigned long size, int node); void *vmemmap_alloc_block_buf(unsigned long size, int node); void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); -int vmemmap_populate_basepages(struct page *start_page, - unsigned long pages, int node); -int vmemmap_populate(struct page *start_page, unsigned long pages, int node); +int vmemmap_populate_basepages(unsigned long start, unsigned long end, + int node); +int vmemmap_populate(unsigned long start, unsigned long end, int node); void vmemmap_populate_print_last(void); #ifdef CONFIG_MEMORY_HOTPLUG -void vmemmap_free(struct page *memmap, unsigned long nr_pages); +void vmemmap_free(unsigned long start, unsigned long end); #endif void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, unsigned long size); -- cgit v1.2.3 From f9872caf07c1c774034b8bddde7d4a3a7f4a6484 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Mon, 29 Apr 2013 15:08:01 -0700 Subject: page_alloc: make setup_nr_node_ids() usable for arch init code powerpc and x86 were opencoding copies of setup_nr_node_ids(), which page_alloc provides but makes static. Make it avaliable to the archs in linux/mm.h. Signed-off-by: Cody P Schafer Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6d7266842ab..7aa11a6736e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1816,5 +1816,11 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool page_is_guard(struct page *page) { return false; } #endif /* CONFIG_DEBUG_PAGEALLOC */ +#if MAX_NUMNODES > 1 +void __init setup_nr_node_ids(void); +#else +static inline void setup_nr_node_ids(void) {} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ -- cgit v1.2.3 From f02c696800886382198df897b30bb796b46a8dae Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 29 Apr 2013 15:08:04 -0700 Subject: include/linux/memory.h: implement register_hotmemory_notifier() When CONFIG_MEMORY_HOTPLUG=n, we don't want the memory-hotplug notifier handlers to be included in the .o files, for space reasons. The existing hotplug_memory_notifier() tries to handle this but testing with gcc-4.4.4 shows that it doesn't work - the hotplug functions are still present in the .o files. So implement a new register_hotmemory_notifier() which is a copy of register_hotcpu_notifier(), and which actually works as desired. hotplug_memory_notifier() and register_memory_notifier() callsites should be converted to use this new register_hotmemory_notifier(). While we're there, let's repair the existing hotplug_memory_notifier(): it simply stomps on the register_memory_notifier() return value, so well-behaved code cannot check for errors. 
Apparently non of the existing callers were well-behaved :( Cc: Andrew Shewmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 14 ++++++++++---- include/linux/notifier.h | 5 ++++- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/memory.h b/include/linux/memory.h index 45e93b46887..0ff6598ee62 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -18,6 +18,7 @@ #include #include #include +#include #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) @@ -127,13 +128,18 @@ enum mem_add_context { BOOT, HOTPLUG }; #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ #ifdef CONFIG_MEMORY_HOTPLUG -#define hotplug_memory_notifier(fn, pri) { \ +#define hotplug_memory_notifier(fn, pri) ({ \ static __meminitdata struct notifier_block fn##_mem_nb =\ - { .notifier_call = fn, .priority = pri }; \ + { .notifier_call = fn, .priority = pri };\ register_memory_notifier(&fn##_mem_nb); \ -} +}) +#define register_hotmemory_notifier(nb) register_memory_notifier(nb) +#define unregister_hotmemory_notifier(nb) unregister_memory_notifier(nb) #else -#define hotplug_memory_notifier(fn, pri) do { } while (0) +#define hotplug_memory_notifier(fn, pri) (0) +/* These aren't inline functions due to a GCC bug. */ +#define register_hotmemory_notifier(nb) ({ (void)(nb); 0; }) +#define unregister_hotmemory_notifier(nb) ({ (void)(nb); }) #endif /* diff --git a/include/linux/notifier.h b/include/linux/notifier.h index d65746efc95..d14a4c36246 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -47,8 +47,11 @@ * runtime initialization. */ +typedef int (*notifier_fn_t)(struct notifier_block *nb, + unsigned long action, void *data); + struct notifier_block { - int (*notifier_call)(struct notifier_block *, unsigned long, void *); + notifier_fn_t notifier_call; struct notifier_block __rcu *next; int priority; }; -- cgit v1.2.3 From c9b1d0981fcce3d9976d7b7a56e4e0503bc610dd Mon Sep 17 00:00:00 2001 From: Andrew Shewmaker Date: Mon, 29 Apr 2013 15:08:10 -0700 Subject: mm: limit growth of 3% hardcoded other user reserve Add user_reserve_kbytes knob. Limit the growth of the memory reserved for other user processes to min(3% current process size, user_reserve_pages). Only about 8MB is necessary to enable recovery in the default mode, and only a few hundred MB are required even when overcommit is disabled. user_reserve_pages defaults to min(3% free pages, 128MB) I arrived at 128MB by taking the max VSZ of sshd, login, bash, and top ... then adding the RSS of each. This only affects OVERCOMMIT_NEVER mode. Background 1. user reserve __vm_enough_memory reserves a hardcoded 3% of the current process size for other applications when overcommit is disabled. This was done so that a user could recover if they launched a memory hogging process. Without the reserve, a user would easily run into a message such as: bash: fork: Cannot allocate memory 2. admin reserve Additionally, a hardcoded 3% of free memory is reserved for root in both overcommit 'guess' and 'never' modes. This was intended to prevent a scenario where root-cant-log-in and perform recovery operations. Note that this reserve shrinks, and doesn't guarantee a useful reserve. Motivation The two hardcoded memory reserves should be updated to account for current memory sizes. Also, the admin reserve would be more useful if it didn't shrink too much. When the current code was originally written, 1GB was considered "enterprise". 
Now the 3% reserve can grow to multiple GB on large memory systems, and it only needs to be a few hundred MB at most to enable a user or admin to recover a system with an unwanted memory hogging process. I've found that reducing these reserves is especially beneficial for a specific type of application load:

* single application system
* one or few processes (e.g. one per core)
* allocating all available memory
* not initializing every page immediately
* long running

I've run scientific clusters with this sort of load. A long running job sometimes failed many hours (weeks of CPU time) into a calculation. They weren't initializing all of their memory immediately, and they weren't using calloc, so I put systems into overcommit 'never' mode. These clusters run diskless and have no swap. However, with the current reserves, a user wishing to allocate as much memory as possible to one process may be prevented from using, for example, almost 2GB out of 32GB. The effect is less, but still significant when a user starts a job with one process per core. I have repeatedly seen a set of processes requesting the same amount of memory fail because one of them could not allocate the amount of memory a user would expect to be able to allocate. For example, Message Passing Interface (MPI) processes, one per core. And it is similar for other parallel programming frameworks. Changing this reserve code will make the overcommit never mode more useful by allowing applications to allocate nearly all of the available memory. Also, the new admin_reserve_kbytes will be safer than the current behavior since the hardcoded 3% of available memory reserve can shrink to something useless in the case where applications have grabbed all available memory.

Risks

* "bash: fork: Cannot allocate memory" The downside of the first patch--which creates a tunable user reserve that is only used in overcommit 'never' mode--is that an admin can set it so low that a user may not be able to kill their process, even if they already have a shell prompt. Of course, a user can get in the same predicament with the current 3% reserve--they just have to launch processes until 3% becomes negligible.

* root-cant-log-in problem The second patch, adding the tunable rootuser_reserve_pages, allows the admin to shoot themselves in the foot by setting it too small. They can easily get the system into a state where root-can't-log-in. However, the new admin_reserve_kbytes will be safer than the current behavior since the hardcoded 3% of available memory reserve can shrink to something useless in the case where applications have grabbed all available memory.

Alternatives

* Memory cgroups provide a more flexible way to limit application memory. Not everyone wants to set up cgroups or deal with their overhead.
* We could create a fourth overcommit mode which provides smaller reserves. The size of useful reserves may be drastically different depending on whether the system is embedded or enterprise.
* Force users to initialize all of their memory or use calloc. Some users don't want/expect the system to overcommit when they malloc. Overcommit 'never' mode is for this scenario, and it should work well.

The new user and admin reserve tunables are simple to use, with low overhead compared to cgroups. The patches preserve current behavior where 3% of memory is less than 128MB, except that the admin reserve doesn't shrink to an unusable size under pressure. The code allows admins to tune for embedded and enterprise usage.
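A sketch of the resulting accounting in OVERCOMMIT_NEVER mode (modeled on __vm_enough_memory(); variable names and exact placement are approximate):

/* Reserve for other user processes: the smaller of 3% of this
 * process's size (total_vm is in pages; 3% ~= 1/32) and the new
 * user_reserve_kbytes knob, converted from KiB to pages.
 */
if (mm) {
        unsigned long reserve;

        reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
        allowed -= min(mm->total_vm / 32, reserve);
}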
FAQ

* How is the root-cant-login problem addressed? What happens if admin_reserve_pages is set to 0?

Root is free to shoot themselves in the foot by setting admin_reserve_kbytes too low. On x86_64, the minimum useful reserve is:

8MB for overcommit 'guess'
128MB for overcommit 'never'

admin_reserve_pages defaults to min(3% free memory, 8MB). So, anyone switching to 'never' mode needs to adjust admin_reserve_pages.

* How do you calculate a minimum useful reserve?

A user or the admin needs enough memory to login and perform recovery operations, which includes, at a minimum: sshd or login + bash (or some other shell) + top (or ps, kill, etc.) For overcommit 'guess', we can sum resident set sizes (RSS) because we only need enough memory to handle what the recovery programs will typically use. On x86_64 this is about 8MB. For overcommit 'never', we can take the max of their virtual sizes (VSZ) and add the sum of their RSS. We use VSZ instead of RSS because this mode forces us to ensure we can fulfill all of the requested memory allocations--even if the programs only use a fraction of what they ask for. On x86_64 this is about 128MB. When swap is enabled, reserves are useful even when they are as small as 10MB, regardless of overcommit mode. When both swap and overcommit are disabled, then the admin should tune the reserves higher to be absolutely safe. Over 230MB each was safest in my testing.

* What happens if user_reserve_pages is set to 0?

Note, this only affects overcommit 'never' mode. Then a user will be able to allocate all available memory minus admin_reserve_kbytes. However, they will easily see a message such as: "bash: fork: Cannot allocate memory" And they won't be able to recover/kill their application. The admin should be able to recover the system if admin_reserve_kbytes is set appropriately.

* What's the difference between overcommit 'guess' and 'never'?

"Guess" allows an allocation if there are enough free + reclaimable pages. It has a hardcoded 3% of free pages reserved for root. "Never" allows an allocation if there is enough swap + a configurable percentage (default is 50) of physical RAM. It has a hardcoded 3% of free pages reserved for root, like "Guess" mode. It also has a hardcoded 3% of the current process size reserved for additional applications.

* Why is overcommit 'guess' not suitable even when an app eventually writes to every page? It takes free pages, file pages, available swap pages, reclaimable slab pages into consideration. In other words, these are all pages available, then why isn't overcommit suitable?

Because it only looks at the present state of the system. It does not take into account the memory that other applications have malloced, but haven't initialized yet. It overcommits the system.

Test Summary

There was little change in behavior in the default overcommit 'guess' mode with swap enabled before and after the patch. This was expected. Systems run most predictably (i.e. no oom kills) in overcommit 'never' mode with swap enabled. This also allowed the most memory to be allocated to a user application. Overcommit 'guess' mode without swap is a bad idea. It is easy to crash the system. None of the other tested combinations crashed. This matches my experience on the Roadrunner supercomputer. Without the tunable user reserve, a system in overcommit 'never' mode and without swap does not allow the user to recover, although the admin can. With the new tunable reserves, a system in overcommit 'never' mode and without swap can be configured to:
1. maximize user-allocatable memory, running close to the edge of recoverability
2. maximize recoverability, sacrificing allocatable memory to ensure that a user cannot take down a system

Test Description

Fedora 18 VM - 4 x86_64 cores, 5725MB RAM, 4GB Swap

System is booted into multiuser console mode, with unnecessary services turned off. Caches were dropped before each test. Hogs are user memtester processes that attempt to allocate all free memory as reported by /proc/meminfo. In overcommit 'never' mode, memory_ratio=100.

Test Results

3.9.0-rc1-mm1

Overcommit | Swap | Hogs | MB Got/Wanted | OOMs | User Recovery | Admin Recovery
---------- ---- ---- ------------- ---- ------------- --------------
guess      yes    1    5432/5432     no    yes           yes
guess      yes    4    5444/5444     1     yes           yes
guess      no     1    5302/5449     no    yes           yes
guess      no     4    -             crash no            no
never      yes    1    5460/5460     1     yes           yes
never      yes    4    5460/5460     1     yes           yes
never      no     1    5218/5432     no    no            yes
never      no     4    5203/5448     no    no            yes

3.9.0-rc1-mm1-tunablereserves

User and Admin Recovery show their respective reserves, if applicable.

Overcommit | Swap | Hogs | MB Got/Wanted | OOMs | User Recovery | Admin Recovery
---------- ---- ---- ------------- ---- ------------- --------------
guess      yes    1    5419/5419     no    - yes         8MB yes
guess      yes    4    5436/5436     1     - yes         8MB yes
guess      no     1    5440/5440     *     - yes         8MB yes
guess      no     4    -             crash - no          8MB no

* process would successfully mlock, then the oom killer would pick it

never      yes    1    5446/5446     no    10MB yes      20MB yes
never      yes    4    5456/5456     no    10MB yes      20MB yes
never      no     1    5387/5429     no    128MB no      8MB barely
never      no     1    5323/5428     no    226MB barely  8MB barely
never      no     1    5323/5428     no    226MB barely  8MB barely
never      no     1    5359/5448     no    10MB no       10MB barely
never      no     1    5323/5428     no    0MB no        10MB barely
never      no     1    5332/5428     no    0MB no        50MB yes
never      no     1    5293/5429     no    0MB no        90MB yes
never      no     1    5001/5427     no    230MB yes     338MB yes
never      no     4*   4998/5424     no    230MB yes     338MB yes

* more memtesters were launched, able to allocate approximately another 100MB

Future Work

- Test larger memory systems.
- Test an embedded image.
- Test other architectures.
- Time malloc microbenchmarks.
- Would it be useful to be able to set overcommit policy for each memory cgroup?
- Some lines are slightly above 80 chars. Perhaps define a macro to convert between pages and kb? Other places in the kernel do this.

[akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: make init_user_reserve() static] Signed-off-by: Andrew Shewmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7aa11a6736e..43cfaabbde4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -44,6 +44,8 @@ extern int sysctl_legacy_va_layout; #include #include +extern unsigned long sysctl_user_reserve_kbytes; + #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) /* to align the pointer to the (next) page boundary */ -- cgit v1.2.3 From 4eeab4f5580d11bffedc697684b91b0bca0d5009 Mon Sep 17 00:00:00 2001 From: Andrew Shewmaker Date: Mon, 29 Apr 2013 15:08:11 -0700 Subject: mm: replace hardcoded 3% with admin_reserve_pages knob Add an admin_reserve_kbytes knob to allow admins to change the hardcoded memory reserve to something other than 3%, which may be multiple gigabytes on large memory systems. Only about 8MB is necessary to enable recovery in the default mode, and only a few hundred MB are required even when overcommit is disabled.
This affects OVERCOMMIT_GUESS and OVERCOMMIT_NEVER. admin_reserve_kbytes is initialized to min(3% free pages, 8MB). I arrived at 8MB by summing the RSS of sshd or login, bash, and top. Please see the first patch in this series for full background, motivation, testing, and full changelog. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: make init_admin_reserve() static] Signed-off-by: Andrew Shewmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 43cfaabbde4..c05d7cfbb6b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -45,6 +45,7 @@ extern int sysctl_legacy_va_layout; #include extern unsigned long sysctl_user_reserve_kbytes; +extern unsigned long sysctl_admin_reserve_kbytes; #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) -- cgit v1.2.3 From f1cb08798e2497238b28f377bd131426f0b9835d Mon Sep 17 00:00:00 2001 From: Yijing Wang Date: Mon, 29 Apr 2013 15:08:14 -0700 Subject: mm: remove CONFIG_HOTPLUG ifdefs CONFIG_HOTPLUG is going away as an option, so clean up the CONFIG_HOTPLUG ifdefs in mm files. Signed-off-by: Yijing Wang Acked-by: Greg Kroah-Hartman Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 5fd71a7d0df..c586679b6fe 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -48,13 +48,8 @@ static inline void count_vm_events(enum vm_event_item item, long delta) } extern void all_vm_events(unsigned long *); -#ifdef CONFIG_HOTPLUG + extern void vm_events_fold_cpu(int cpu); -#else -static inline void vm_events_fold_cpu(int cpu) -{ -} -#endif #else -- cgit v1.2.3 From 825f787bb49676083b97c1de1f8f2f8f26b5c908 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 29 Apr 2013 15:08:19 -0700 Subject: resource: add release_mem_region_adjustable() Add release_mem_region_adjustable(), which releases a requested region from a currently busy memory resource. This interface adjusts the matched memory resource accordingly even if the requested region does not match exactly but still fits within it. This new interface is intended for memory hot-delete. During bootup, memory resources are inserted from the boot descriptor table, such as EFI Memory Table and e820. Each memory resource entry usually covers the whole contiguous memory range. A memory hot-delete request, on the other hand, may target a particular range of a memory resource, and its size can be much smaller than the whole contiguous memory. Since the existing release interfaces like __release_region() require a requested region to be exactly matched to a resource entry, they do not allow a partial resource to be released. This new interface is restrictive (i.e. release under certain conditions), which is consistent with other release interfaces, __release_region() and __release_resource(). Additional release conditions, such as an overlapping region to a resource entry, can be supported after they are confirmed as valid cases. There is no change to the existing interfaces since their restriction is valid for I/O resources.
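A sketch of the intended caller in a memory hot-delete path (illustrative only; the prototype it uses is the one added by this patch):

#ifdef CONFIG_MEMORY_HOTREMOVE
static void hot_delete_release(u64 start, u64 size)
{
        /* Release just [start, start + size) from the busy boot-time
         * entry in iomem_resource; the entry is shrunk or split to
         * cover whatever remains.
         */
        if (release_mem_region_adjustable(&iomem_resource, start, size))
                pr_warn("unable to release mem region [%llx, %llx)\n",
                        start, start + size);
}
#endif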
[akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()] [akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily] [akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi] Signed-off-by: Toshi Kani Reviewed-by : Yasuaki Ishimatsu Cc: David Rientjes Reviewed-by: Ram Pai Cc: T Makphaibulchoke Cc: Wen Congyang Cc: Tang Chen Cc: Jiang Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 85ac9b9b72a..89b7c24a36e 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -192,6 +192,10 @@ extern struct resource * __request_region(struct resource *, extern int __check_region(struct resource *, resource_size_t, resource_size_t); extern void __release_region(struct resource *, resource_size_t, resource_size_t); +#ifdef CONFIG_MEMORY_HOTREMOVE +extern int release_mem_region_adjustable(struct resource *, resource_size_t, + resource_size_t); +#endif static inline int __deprecated check_region(resource_size_t s, resource_size_t n) -- cgit v1.2.3 From 4edd7ceff0662afde195da6f6c43e7cbe1ed2dc4 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 29 Apr 2013 15:08:22 -0700 Subject: mm, hotplug: avoid compiling memory hotremove functions when disabled __remove_pages() is only necessary for CONFIG_MEMORY_HOTREMOVE. PowerPC pseries will return -EOPNOTSUPP if unsupported. Adding an #ifdef causes several other functions it depends on to also become unnecessary, which saves in .text when disabled (it's disabled in most defconfigs besides powerpc, including x86). remove_memory_block() becomes static since it is not referenced outside of drivers/base/memory.c. Build tested on x86 and powerpc with CONFIG_MEMORY_HOTREMOVE both enabled and disabled. 
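On the arch side, the guarded prototype pairs with an implementation like the following (sketch modeled on the x86 version of the era; per-arch details differ):

#ifdef CONFIG_MEMORY_HOTREMOVE
int arch_remove_memory(u64 start, u64 size)
{
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        struct zone *zone = page_zone(pfn_to_page(start_pfn));

        /* Compiled out entirely when hot-remove is not configured. */
        return __remove_pages(zone, start_pfn, nr_pages);
}
#endif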
From 4edd7ceff0662afde195da6f6c43e7cbe1ed2dc4 Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Mon, 29 Apr 2013 15:08:22 -0700
Subject: mm, hotplug: avoid compiling memory hotremove functions when disabled

__remove_pages() is only necessary for CONFIG_MEMORY_HOTREMOVE.
PowerPC pseries will return -EOPNOTSUPP if unsupported.

Adding an #ifdef causes several other functions it depends on to also
become unnecessary, which saves .text space when disabled (it's
disabled in most defconfigs besides powerpc, including x86).

remove_memory_block() becomes static since it is not referenced
outside of drivers/base/memory.c.

Build tested on x86 and powerpc with CONFIG_MEMORY_HOTREMOVE both
enabled and disabled.

Signed-off-by: David Rientjes
Acked-by: Toshi Kani
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Greg Kroah-Hartman
Cc: Wen Congyang
Cc: Tang Chen
Cc: Yasuaki Ishimatsu
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memory.h         | 3 ++-
 include/linux/memory_hotplug.h | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/memory.h b/include/linux/memory.h
index 0ff6598ee62..73817af8b48 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -115,9 +115,10 @@ extern void unregister_memory_notifier(struct notifier_block *nb);
 extern int register_memory_isolate_notifier(struct notifier_block *nb);
 extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
 extern int register_new_memory(int, struct mem_section *);
+#ifdef CONFIG_MEMORY_HOTREMOVE
 extern int unregister_memory_section(struct mem_section *);
+#endif
 extern int memory_dev_init(void);
-extern int remove_memory_block(unsigned long, struct mem_section *, int);
 extern int memory_notify(unsigned long val, void *v);
 extern int memory_isolate_notify(unsigned long val, void *v);
 extern struct memory_block *find_memory_block_hinted(struct mem_section *,
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index b6a3be7d47b..3e622c61092 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -97,13 +97,13 @@ extern void __online_page_free(struct page *page);
 #ifdef CONFIG_MEMORY_HOTREMOVE
 extern bool is_pageblock_removable_nolock(struct page *page);
 extern int arch_remove_memory(u64 start, u64 size);
+extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
+	unsigned long nr_pages);
 #endif /* CONFIG_MEMORY_HOTREMOVE */

 /* reasonably generic interface to expand the physical pages in a zone */
 extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn,
 	unsigned long nr_pages);
-extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
-	unsigned long nr_pages);

 #ifdef CONFIG_NUMA
 extern int memory_add_physaddr_to_nid(u64 start);
--
cgit v1.2.3
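The flip side of hiding declarations behind CONFIG_MEMORY_HOTREMOVE is
that any remaining caller must compile in both configurations. A
common pattern, shown here as an illustrative sketch rather than as
part of this patch, pairs the guarded extern with a static inline
stub; the -EOPNOTSUPP return mirrors the pseries behaviour noted
above:

#ifdef CONFIG_MEMORY_HOTREMOVE
extern int unregister_memory_section(struct mem_section *);
#else
/* Hypothetical stub so CONFIG_MEMORY_HOTREMOVE=n callers still build. */
static inline int unregister_memory_section(struct mem_section *ms)
{
	return -EOPNOTSUPP;
}
#endif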
From 70ddf637eebe47e61fb2be08a59315581b6d2f38 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov
Date: Mon, 29 Apr 2013 15:08:31 -0700
Subject: memcg: add memory.pressure_level events

With this patch userland applications that want to maintain the
interactivity/memory allocation cost can use the pressure level
notifications. The levels are defined like this:

The "low" level means that the system is reclaiming memory for new
allocations. Monitoring this reclaiming activity might be useful for
maintaining cache levels. Upon notification, the program (typically
"Activity Manager") might analyze vmstat and act in advance (i.e.
prematurely shut down unimportant services).

The "medium" level means that the system is experiencing medium memory
pressure: the system might be swapping, paging out active file caches,
etc. Upon this event applications may decide to further analyze
vmstat/zoneinfo/memcg or internal memory usage statistics and free any
resources that can be easily reconstructed or re-read from a disk.

The "critical" level means that the system is actively thrashing; it
is about to run out of memory (OOM), or the in-kernel OOM killer is
even on its way to trigger. Applications should do whatever they can
to help the system. It might be too late to consult vmstat or any
other statistics, so it's advisable to take an immediate action.

The events are propagated upward until the event is handled, i.e. the
events are not pass-through. Here is what this means: for example you
have three cgroups: A->B->C. Now you set up an event listener on
cgroups A, B and C, and suppose group C experiences some pressure. In
this situation, only group C will receive the notification, i.e.
groups A and B will not receive it. This is done to avoid excessive
"broadcasting" of messages, which disturbs the system and which is
especially bad if we are low on memory or thrashing. So, organize the
cgroups wisely, or propagate the events manually (or, ask us to
implement the pass-through events, explaining why you would need
them).

Performance-wise, the memory pressure notifications feature itself is
lightweight and does not require much bookkeeping, in contrast to the
rest of memcg features. Unfortunately, as of the current memcg
implementation, pages accounting is an inseparable part and cannot be
turned off. The good news is that there are some efforts[1] to improve
the situation; plus, implementing the same, fully API-compatible[2]
interface for the CONFIG_MEMCG=n case (e.g. embedded) is also a viable
option, so it will not require any changes on the userland side.

[1] http://permalink.gmane.org/gmane.linux.kernel.cgroups/6291
[2] http://lkml.org/lkml/2013/2/21/454

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix CONFIG_CGROUPS=n warnings]
Signed-off-by: Anton Vorontsov
Acked-by: Kirill A. Shutemov
Acked-by: KAMEZAWA Hiroyuki
Cc: Tejun Heo
Cc: David Rientjes
Cc: Pekka Enberg
Cc: Mel Gorman
Cc: Glauber Costa
Cc: Michal Hocko
Cc: Luiz Capitulino
Cc: Greg Thelen
Cc: Leonid Moiseichuk
Cc: KOSAKI Motohiro
Cc: Minchan Kim
Cc: Bartlomiej Zolnierkiewicz
Cc: John Stultz
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/vmpressure.h | 47 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 include/linux/vmpressure.h

(limited to 'include')

diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
new file mode 100644
index 00000000000..76be077340e
--- /dev/null
+++ b/include/linux/vmpressure.h
@@ -0,0 +1,47 @@
+#ifndef __LINUX_VMPRESSURE_H
+#define __LINUX_VMPRESSURE_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct vmpressure {
+	unsigned long scanned;
+	unsigned long reclaimed;
+	/* The lock is used to keep the scanned/reclaimed above in sync. */
+	struct mutex sr_lock;
+
+	/* The list of vmpressure_event structs. */
+	struct list_head events;
+	/* Have to grab the lock on events traversal or modifications. */
+	struct mutex events_lock;
+
+	struct work_struct work;
+};
+
+struct mem_cgroup;
+
+#ifdef CONFIG_MEMCG
+extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+		       unsigned long scanned, unsigned long reclaimed);
+extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
+
+extern void vmpressure_init(struct vmpressure *vmpr);
+extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
+extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
+extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css);
+extern int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
+				     struct eventfd_ctx *eventfd,
+				     const char *args);
+extern void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
+					struct eventfd_ctx *eventfd);
+#else
+static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+			      unsigned long scanned, unsigned long reclaimed) {}
+static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
+				   int prio) {}
+#endif /* CONFIG_MEMCG */
+#endif /* __LINUX_VMPRESSURE_H */
--
cgit v1.2.3
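The levels are delivered through the cgroup eventfd mechanism. Below
is a userland sketch of listening for "low" events; the mount path and
the "<event_fd> <pressure_fd> <level>" control syntax are assumptions
based on the existing memcg eventfd conventions, not guaranteed by
this patch:

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	char line[64];
	uint64_t count;
	int efd = eventfd(0, 0);
	int pfd = open("/sys/fs/cgroup/memory/memory.pressure_level", O_RDONLY);
	int cfd = open("/sys/fs/cgroup/memory/cgroup.event_control", O_WRONLY);

	if (efd < 0 || pfd < 0 || cfd < 0)
		return 1;

	/* "<event_fd> <pressure_fd> <level>" registers the listener. */
	snprintf(line, sizeof(line), "%d %d low", efd, pfd);
	if (write(cfd, line, strlen(line) + 1) < 0)
		return 1;

	read(efd, &count, sizeof(count));	/* blocks until a "low" event */
	printf("memory pressure: low\n");
	return 0;
}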
From 2f772e6cadf8ad8fca38927b17e6be028be669f5 Mon Sep 17 00:00:00 2001
From: Seth Jennings
Date: Mon, 29 Apr 2013 15:08:34 -0700
Subject: mm: break up swap_writepage() for frontswap backends

swap_writepage() is currently where frontswap hooks into the swap
write path to capture pages with the frontswap_store() function.
However, if a frontswap backend wants to "resume" the writeback of a
page to the swap device, it can't call swap_writepage() as the page
will simply reenter the backend.

This patch separates swap_writepage() into top and bottom halves; the
bottom half, named __swap_writepage(), allows a frontswap backend,
like zswap, to resume writeback beyond the frontswap_store() hook.

__add_to_swap_cache() is also made non-static so that the page for
which writeback is to be resumed can be added to the swap cache.

Signed-off-by: Seth Jennings
Signed-off-by: Bob Liu
Acked-by: Minchan Kim
Reviewed-by: Dan Magenheimer
Cc: Konrad Rzeszutek Wilk
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/swap.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2818a123f3e..76f6c3b3123 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -330,6 +330,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t ent)
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct page *);
 extern int swap_writepage(struct page *page, struct writeback_control *wbc);
+extern int __swap_writepage(struct page *page, struct writeback_control *wbc);
 extern int swap_set_page_dirty(struct page *page);
 extern void end_swap_bio_read(struct bio *bio, int err);

@@ -345,6 +346,7 @@ extern unsigned long total_swapcache_pages(void);
 extern void show_swap_cache_info(void);
 extern int add_to_swap(struct page *);
 extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
+extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
 extern void __delete_from_swap_cache(struct page *);
 extern void delete_from_swap_cache(struct page *);
 extern void free_page_and_swap_cache(struct page *);
--
cgit v1.2.3
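A backend might use the new entry points roughly as in the sketch
below. This is a hedged sketch only: the store/load plumbing is
elided, the helper name is invented, and the ordering shown (re-add to
the swap cache, then write out) is an assumption; only
__swap_writepage() and __add_to_swap_cache() come from this patch:

/* Hypothetical backend path resuming writeback for entry's page. */
static int backend_resume_writeback(struct page *page, swp_entry_t entry)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,	/* asynchronous writeback */
	};
	int ret;

	/* Make the page visible to the swap cache again... */
	ret = __add_to_swap_cache(page, entry);
	if (ret)
		return ret;

	/* ...then write it out, bypassing the frontswap_store() hook. */
	return __swap_writepage(page, &wbc);
}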
From 1eec6702a80e04416d528846a5ff2122484d95ec Mon Sep 17 00:00:00 2001
From: Seth Jennings
Date: Mon, 29 Apr 2013 15:08:35 -0700
Subject: mm: allow for outstanding swap writeback accounting

To prevent flooding the swap device with writebacks, frontswap
backends need to count and limit the number of outstanding writebacks.
The incrementing of the counter can be done before the call to
__swap_writepage(). However, the caller must receive a notification
when the writeback completes in order to decrement the counter.

To achieve this functionality, this patch modifies __swap_writepage()
to take the bio completion callback function as an argument.

end_swap_bio_write(), the normal bio completion function, is also made
non-static so that code doing the accounting can call it after the
accounting is done.

There should be no behavioural change to existing code.

Signed-off-by: Seth Jennings
Signed-off-by: Bob Liu
Acked-by: Minchan Kim
Reviewed-by: Dan Magenheimer
Cc: Konrad Rzeszutek Wilk
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/swap.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 76f6c3b3123..b5b12c71a2a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -330,7 +330,9 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t ent)
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct page *);
 extern int swap_writepage(struct page *page, struct writeback_control *wbc);
-extern int __swap_writepage(struct page *page, struct writeback_control *wbc);
+extern void end_swap_bio_write(struct bio *bio, int err);
+extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
+	void (*end_write_func)(struct bio *, int));
 extern int swap_set_page_dirty(struct page *page);
 extern void end_swap_bio_read(struct bio *bio, int err);
--
cgit v1.2.3
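The accounting pattern the changelog describes might look like the
following sketch. The counter and function names are hypothetical;
only the callback signature, __swap_writepage(), and
end_swap_bio_write() come from the patch:

static atomic_t outstanding_writebacks = ATOMIC_INIT(0);

/* Custom completion: drop the count, then finish normally. */
static void backend_end_swap_write(struct bio *bio, int err)
{
	atomic_dec(&outstanding_writebacks);
	end_swap_bio_write(bio, err);
}

static int backend_writepage(struct page *page,
			     struct writeback_control *wbc)
{
	/* Count before submission; the completion decrements. */
	atomic_inc(&outstanding_writebacks);
	return __swap_writepage(page, wbc, backend_end_swap_write);
}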
From 5bc7b8aca942d03bf2716ddcfcb4e0b57e43a1b8 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Mon, 29 Apr 2013 15:08:36 -0700
Subject: mm: thp: add split tail pages to shrink page list in page reclaim

In page reclaim, a huge page is split and split_huge_page() adds the
tail pages to the LRU list. Since we are reclaiming a huge page, it's
better to reclaim all subpages of the huge page instead of just the
head page. This patch adds the split tail pages to the shrink page
list so they can be reclaimed soon.

Before this patch, running a swap workload:

  thp_fault_alloc             3492
  thp_fault_fallback          608
  thp_collapse_alloc          6
  thp_collapse_alloc_failed   0
  thp_split                   916

With this patch:

  thp_fault_alloc             4085
  thp_fault_fallback          16
  thp_collapse_alloc          90
  thp_collapse_alloc_failed   0
  thp_split                   1272

Fallback allocation is reduced a lot.

[akpm@linux-foundation.org: fix CONFIG_SWAP=n build]
Signed-off-by: Shaohua Li
Acked-by: Rik van Riel
Acked-by: Minchan Kim
Acked-by: Johannes Weiner
Reviewed-by: Wanpeng Li
Cc: Andrea Arcangeli
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/huge_mm.h | 11 ++++++++++-
 include/linux/swap.h    |  6 +++---
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ee1c244a62a..528454c2caa 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -99,7 +99,11 @@ extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 extern int handle_pte_fault(struct mm_struct *mm,
 			struct vm_area_struct *vma, unsigned long address,
 			pte_t *pte, pmd_t *pmd, unsigned int flags);
-extern int split_huge_page(struct page *page);
+extern int split_huge_page_to_list(struct page *page, struct list_head *list);
+static inline int split_huge_page(struct page *page)
+{
+	return split_huge_page_to_list(page, NULL);
+}
 extern void __split_huge_page_pmd(struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd);
 #define split_huge_page_pmd(__vma, __address, __pmd)	\
@@ -186,6 +190,11 @@ extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vm
 #define transparent_hugepage_enabled(__vma) 0

 #define transparent_hugepage_flags 0UL
+static inline int
+split_huge_page_to_list(struct page *page, struct list_head *list)
+{
+	return 0;
+}
 static inline int split_huge_page(struct page *page)
 {
 	return 0;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b5b12c71a2a..1701ce4be74 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -236,7 +236,7 @@ extern unsigned long nr_free_pagecache_pages(void);
 extern void __lru_cache_add(struct page *, enum lru_list lru);
 extern void lru_cache_add_lru(struct page *, enum lru_list lru);
 extern void lru_add_page_tail(struct page *page, struct page *page_tail,
-			 struct lruvec *lruvec);
+			 struct lruvec *lruvec, struct list_head *head);
 extern void activate_page(struct page *);
 extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
@@ -346,7 +346,7 @@ extern struct address_space swapper_spaces[];
 #define swap_address_space(entry) (&swapper_spaces[swp_type(entry)])
 extern unsigned long total_swapcache_pages(void);
 extern void show_swap_cache_info(void);
-extern int add_to_swap(struct page *);
+extern int add_to_swap(struct page *, struct list_head *list);
 extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
 extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
 extern void __delete_from_swap_cache(struct page *);
@@ -465,7 +465,7 @@ static inline struct page *lookup_swap_cache(swp_entry_t swp)
 	return NULL;
 }

-static inline int add_to_swap(struct page *page)
+static inline int add_to_swap(struct page *page, struct list_head *list)
 {
 	return 0;
 }
--
cgit v1.2.3
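In the reclaim path, the new interface can be used roughly as in this
illustrative sketch; the surrounding shrink loop is elided and the
helper name is invented, with only split_huge_page_to_list() and the
add_to_swap() list argument taken from the patch:

/* Hypothetical fragment of a shrink loop working on page_list. */
static void try_reclaim_page(struct page *page, struct list_head *page_list)
{
	if (PageTransHuge(page) &&
	    split_huge_page_to_list(page, page_list))
		return;	/* split failed; skip this page for now */

	/*
	 * The head page continues here; the tail pages were queued on
	 * page_list and will be reclaimed on later iterations.
	 */
	add_to_swap(page, page_list);
}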
From b4def3509d18c1db9198f92d4c35065e029a09a1 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim
Date: Mon, 29 Apr 2013 15:08:52 -0700
Subject: mm, nobootmem: clean-up of free_low_memory_core_early()

Remove the unused argument and make the function static, because there
are no users outside of nobootmem.c.

Signed-off-by: Joonsoo Kim
Cc: Yinghai Lu
Acked-by: Johannes Weiner
Cc: Jiang Liu
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/bootmem.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index cdc3bab0183..5f0b0e1f7c0 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -44,7 +44,6 @@ extern unsigned long init_bootmem_node(pg_data_t *pgdat,
 				       unsigned long endpfn);
 extern unsigned long init_bootmem(unsigned long addr, unsigned long memend);

-extern unsigned long free_low_memory_core_early(int nodeid);
 extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
 extern unsigned long free_all_bootmem(void);
--
cgit v1.2.3