author     Linus Torvalds <torvalds@linux-foundation.org>  2013-11-13 15:45:43 +0900
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-11-13 15:45:43 +0900
commit     5cbb3d216e2041700231bcfc383ee5f8b7fc8b74 (patch)
tree       a738fa82dbcefa9bd283c08bc67f38827be63937 /mm
parent     9bc9ccd7db1c9f043f75380b5a5b94912046a60e (diff)
parent     4e9b45a19241354daec281d7a785739829b52359 (diff)
download   kernel-common-5cbb3d216e2041700231bcfc383ee5f8b7fc8b74.tar.gz
           kernel-common-5cbb3d216e2041700231bcfc383ee5f8b7fc8b74.tar.bz2
           kernel-common-5cbb3d216e2041700231bcfc383ee5f8b7fc8b74.zip
Merge branch 'akpm' (patches from Andrew Morton)
Merge first patch-bomb from Andrew Morton:
 "Quite a lot of other stuff is banked up awaiting further next->mainline
  merging, but this batch contains:

  - Lots of random misc patches
  - OCFS2
  - Most of MM
  - backlight updates
  - lib/ updates
  - printk updates
  - checkpatch updates
  - epoll tweaking
  - rtc updates
  - hfs
  - hfsplus
  - documentation
  - procfs
  - update gcov to gcc-4.7 format
  - IPC"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (269 commits)
  ipc, msg: fix message length check for negative values
  ipc/util.c: remove unnecessary work pending test
  devpts: plug the memory leak in kill_sb
  ./Makefile: export initial ramdisk compression config option
  init/Kconfig: add option to disable kernel compression
  drivers: w1: make w1_slave::flags long to avoid memory corruption
  drivers/w1/masters/ds1wm.c: use dev_get_platdata()
  drivers/memstick/core/ms_block.c: fix unreachable state in h_msb_read_page()
  drivers/memstick/core/mspro_block.c: fix attributes array allocation
  drivers/pps/clients/pps-gpio.c: remove redundant of_match_ptr
  kernel/panic.c: reduce 1 byte usage for print tainted buffer
  gcov: reuse kbasename helper
  kernel/gcov/fs.c: use pr_warn()
  kernel/module.c: use pr_foo()
  gcov: compile specific gcov implementation based on gcc version
  gcov: add support for gcc 4.7 gcov format
  gcov: move gcov structs definitions to a gcc version specific file
  kernel/taskstats.c: return -ENOMEM when alloc memory fails in add_del_listener()
  kernel/taskstats.c: add nla_nest_cancel() for failure processing between nla_nest_start() and nla_nest_end()
  kernel/sysctl_binary.c: use scnprintf() instead of snprintf()
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |  17
-rw-r--r--  mm/bootmem.c         |   8
-rw-r--r--  mm/compaction.c      |   7
-rw-r--r--  mm/huge_memory.c     |  78
-rw-r--r--  mm/kmemleak.c        |   4
-rw-r--r--  mm/ksm.c             |   4
-rw-r--r--  mm/memblock.c        | 124
-rw-r--r--  mm/memcontrol.c      |  97
-rw-r--r--  mm/memory-failure.c  |  36
-rw-r--r--  mm/memory.c          |   2
-rw-r--r--  mm/memory_hotplug.c  |  65
-rw-r--r--  mm/mempolicy.c       |  62
-rw-r--r--  mm/mmap.c            |  16
-rw-r--r--  mm/mprotect.c        |  10
-rw-r--r--  mm/nobootmem.c       |  25
-rw-r--r--  mm/nommu.c           |   3
-rw-r--r--  mm/page_alloc.c      |  34
-rw-r--r--  mm/readahead.c       |   8
-rw-r--r--  mm/slab.c            |   2
-rw-r--r--  mm/slab.h            |   6
-rw-r--r--  mm/slab_common.c     |   2
-rw-r--r--  mm/slub.c            |   2
-rw-r--r--  mm/sparse.c          |  53
-rw-r--r--  mm/swapfile.c        |  16
-rw-r--r--  mm/util.c            |  13
-rw-r--r--  mm/vmalloc.c         |  48
-rw-r--r--  mm/vmstat.c          |  22
-rw-r--r--  mm/zswap.c           | 195
28 files changed, 569 insertions, 390 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 394838f489eb..3f4ffda152bb 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -153,11 +153,18 @@ config MOVABLE_NODE
help
Allow a node to have only movable memory. Pages used by the kernel,
such as direct mapping pages cannot be migrated. So the corresponding
- memory device cannot be hotplugged. This option allows users to
- online all the memory of a node as movable memory so that the whole
- node can be hotplugged. Users who don't use the memory hotplug
- feature are fine with this option on since they don't online memory
- as movable.
+ memory device cannot be hotplugged. This option allows the following
+ two things:
+ - When the system is booting, a node full of hotpluggable memory can
+ be arranged to have only movable memory so that the whole node can
+ be hot-removed (requires the movable_node boot option to be
+ specified).
+ - After the system is up, the option allows users to online all the
+ memory of a node as movable memory so that the whole node can be
+ hot-removed.
+
+ Users who don't use the memory hotplug feature are fine with this
+ option on, since they neither specify the movable_node boot option
+ nor online memory as movable.
Say Y here if you want to hotplug a whole node.
Say N here if you want kernel to use memory on all nodes evenly.
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 6ab7744e692e..90bd3507b413 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -172,11 +172,12 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
struct page *page;
- unsigned long start, end, pages, count = 0;
+ unsigned long *map, start, end, pages, count = 0;
if (!bdata->node_bootmem_map)
return 0;
+ map = bdata->node_bootmem_map;
start = bdata->node_min_pfn;
end = bdata->node_low_pfn;
@@ -184,10 +185,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
bdata - bootmem_node_data, start, end);
while (start < end) {
- unsigned long *map, idx, vec;
+ unsigned long idx, vec;
unsigned shift;
- map = bdata->node_bootmem_map;
idx = start - bdata->node_min_pfn;
shift = idx & (BITS_PER_LONG - 1);
/*
@@ -784,7 +784,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
/* update goal according ...MAX_DMA32_PFN */
- end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+ end_pfn = pgdat_end_pfn(pgdat);
if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
(goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
diff --git a/mm/compaction.c b/mm/compaction.c
index b5326b141a25..805165bcd3dd 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -235,10 +235,9 @@ static bool suitable_migration_target(struct page *page)
}
/*
- * Isolate free pages onto a private freelist. Caller must hold zone->lock.
- * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
- * pages inside of the pageblock (even though it may still end up isolating
- * some pages).
+ * Isolate free pages onto a private freelist. If @strict is true, will abort
+ * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
+ * (even though it may still end up isolating some pages).
*/
static unsigned long isolate_freepages_block(struct compact_control *cc,
unsigned long blockpfn,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2612f60f53ee..0556c6a44959 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -27,11 +27,12 @@
#include "internal.h"
/*
- * By default transparent hugepage support is enabled for all mappings
- * and khugepaged scans all mappings. Defrag is only invoked by
- * khugepaged hugepage allocations and by page faults inside
- * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
- * allocations.
+ * By default, transparent hugepage support is disabled in order to avoid
+ * risking an increased memory footprint for applications without a
+ * guaranteed benefit. When transparent hugepage support is enabled, it is
+ * enabled for all mappings, and khugepaged scans all mappings.
+ * Defrag is invoked by khugepaged hugepage allocations and by page faults
+ * for all hugepage allocations.
*/
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
@@ -758,14 +759,6 @@ static inline struct page *alloc_hugepage_vma(int defrag,
HPAGE_PMD_ORDER, vma, haddr, nd);
}
-#ifndef CONFIG_NUMA
-static inline struct page *alloc_hugepage(int defrag)
-{
- return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
- HPAGE_PMD_ORDER);
-}
-#endif
-
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
@@ -2198,7 +2191,34 @@ static void khugepaged_alloc_sleep(void)
msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
}
+static int khugepaged_node_load[MAX_NUMNODES];
+
#ifdef CONFIG_NUMA
+static int khugepaged_find_target_node(void)
+{
+ static int last_khugepaged_target_node = NUMA_NO_NODE;
+ int nid, target_node = 0, max_value = 0;
+
+ /* find first node with max normal pages hit */
+ for (nid = 0; nid < MAX_NUMNODES; nid++)
+ if (khugepaged_node_load[nid] > max_value) {
+ max_value = khugepaged_node_load[nid];
+ target_node = nid;
+ }
+
+ /* do some balance if several nodes have the same hit record */
+ if (target_node <= last_khugepaged_target_node)
+ for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
+ nid++)
+ if (max_value == khugepaged_node_load[nid]) {
+ target_node = nid;
+ break;
+ }
+
+ last_khugepaged_target_node = target_node;
+ return target_node;
+}
+
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
if (IS_ERR(*hpage)) {
@@ -2232,9 +2252,8 @@ static struct page
* mmap_sem in read mode is good idea also to allow greater
* scalability.
*/
- *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
- node, __GFP_OTHER_NODE);
-
+ *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
+ khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
/*
* After allocating the hugepage, release the mmap_sem read lock in
* preparation for taking it in write mode.
@@ -2250,6 +2269,17 @@ static struct page
return *hpage;
}
#else
+static int khugepaged_find_target_node(void)
+{
+ return 0;
+}
+
+static inline struct page *alloc_hugepage(int defrag)
+{
+ return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
+ HPAGE_PMD_ORDER);
+}
+
static struct page *khugepaged_alloc_hugepage(bool *wait)
{
struct page *hpage;
@@ -2456,6 +2486,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
if (pmd_trans_huge(*pmd))
goto out;
+ memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
@@ -2472,12 +2503,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
if (unlikely(!page))
goto out_unmap;
/*
- * Chose the node of the first page. This could
- * be more sophisticated and look at more pages,
- * but isn't for now.
+ * Record which node the original page is from and save this
+ * information to khugepaged_node_load[].
+ * Khugepaged will allocate a hugepage from the node that has the
+ * max hit record.
*/
- if (node == NUMA_NO_NODE)
- node = page_to_nid(page);
+ node = page_to_nid(page);
+ khugepaged_node_load[node]++;
VM_BUG_ON(PageCompound(page));
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
goto out_unmap;
@@ -2492,9 +2524,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
ret = 1;
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret)
+ if (ret) {
+ node = khugepaged_find_target_node();
/* collapse_huge_page will return with the mmap_sem released */
collapse_huge_page(mm, address, hpage, vma, node);
+ }
out:
return ret;
}
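
The khugepaged_find_target_node() helper added above picks the first node with the maximum hit count, then round-robins among tied nodes across calls via last_khugepaged_target_node. A minimal user-space sketch of that tie-breaking policy (illustrative only, not kernel code; MAX_NUMNODES shrunk for the demo):

    #include <stdio.h>

    #define MAX_NUMNODES 4
    #define NUMA_NO_NODE (-1)

    static int node_load[MAX_NUMNODES];

    /* mirrors khugepaged_find_target_node(): first max wins, ties rotate */
    static int find_target_node(void)
    {
        static int last_target = NUMA_NO_NODE;
        int nid, target = 0, max_value = 0;

        for (nid = 0; nid < MAX_NUMNODES; nid++)
            if (node_load[nid] > max_value) {
                max_value = node_load[nid];
                target = nid;
            }

        /* balance between nodes sharing the same hit count */
        if (target <= last_target)
            for (nid = last_target + 1; nid < MAX_NUMNODES; nid++)
                if (node_load[nid] == max_value) {
                    target = nid;
                    break;
                }

        last_target = target;
        return target;
    }

    int main(void)
    {
        node_load[0] = 5; node_load[2] = 5;  /* two nodes tie */
        printf("%d\n", find_target_node());  /* 0 */
        printf("%d\n", find_target_node());  /* 2: rotated on the tie */
        return 0;
    }

With nodes 0 and 2 tied, the second call rotates to node 2, which is the balancing the in-code comment refers to.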
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index e126b0ef9ad2..31f01c5011e5 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -753,7 +753,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
}
spin_lock_irqsave(&object->lock, flags);
- if (ptr + size > object->pointer + object->size) {
+ if (size == SIZE_MAX) {
+ size = object->pointer + object->size - ptr;
+ } else if (ptr + size > object->pointer + object->size) {
kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
dump_object_info(object);
kmem_cache_free(scan_area_cache, area);
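
With the hunk above, a size of SIZE_MAX now means "scan from ptr to the end of the object" instead of tripping the too-large warning; the vmalloc.c hunk later in this diff relies on that via kmemleak_scan_area(&va->rb_node, SIZE_MAX, ...). A small user-space model of the new size resolution (illustrative; names and layout are made up):

    #include <stdint.h>
    #include <stdio.h>

    /* model: the tracked object spans [obj_ptr, obj_ptr + obj_size) */
    static size_t resolve_scan_size(uintptr_t ptr, size_t size,
                                    uintptr_t obj_ptr, size_t obj_size)
    {
        if (size == SIZE_MAX)                 /* "to the end of the object" */
            return obj_ptr + obj_size - ptr;
        if (ptr + size > obj_ptr + obj_size)  /* would overrun: reject */
            return 0;
        return size;
    }

    int main(void)
    {
        /* scan area starting 16 bytes into a 64-byte object */
        printf("%zu\n", resolve_scan_size(0x1010, SIZE_MAX, 0x1000, 64)); /* 48 */
        return 0;
    }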
diff --git a/mm/ksm.c b/mm/ksm.c
index 0bea2b262a47..175fff79dc95 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2309,8 +2309,8 @@ static ssize_t merge_across_nodes_store(struct kobject *kobj,
* Allocate stable and unstable together:
* MAXSMP NODES_SHIFT 10 will use 16kB.
*/
- buf = kcalloc(nr_node_ids + nr_node_ids,
- sizeof(*buf), GFP_KERNEL | __GFP_ZERO);
+ buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
+ GFP_KERNEL);
/* Let us assume that RB_ROOT is NULL is zero */
if (!buf)
err = -ENOMEM;
diff --git a/mm/memblock.c b/mm/memblock.c
index 0ac412a0a7ee..53e477bb5558 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -20,6 +20,8 @@
#include <linux/seq_file.h>
#include <linux/memblock.h>
+#include <asm-generic/sections.h>
+
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
@@ -32,6 +34,7 @@ struct memblock memblock __initdata_memblock = {
.reserved.cnt = 1, /* empty dummy entry */
.reserved.max = INIT_MEMBLOCK_REGIONS,
+ .bottom_up = false,
.current_limit = MEMBLOCK_ALLOC_ANYWHERE,
};
@@ -82,6 +85,73 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
return (i < type->cnt) ? i : -1;
}
+/*
+ * __memblock_find_range_bottom_up - find free area utility in bottom-up
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ *
+ * Utility called from memblock_find_in_range_node(), find free area bottom-up.
+ *
+ * RETURNS:
+ * Found address on success, 0 on failure.
+ */
+static phys_addr_t __init_memblock
+__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
+ phys_addr_t size, phys_addr_t align, int nid)
+{
+ phys_addr_t this_start, this_end, cand;
+ u64 i;
+
+ for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
+ this_start = clamp(this_start, start, end);
+ this_end = clamp(this_end, start, end);
+
+ cand = round_up(this_start, align);
+ if (cand < this_end && this_end - cand >= size)
+ return cand;
+ }
+
+ return 0;
+}
+
+/**
+ * __memblock_find_range_top_down - find free area utility, in top-down
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ *
+ * Utility called from memblock_find_in_range_node(), find free area top-down.
+ *
+ * RETURNS:
+ * Found address on success, 0 on failure.
+ */
+static phys_addr_t __init_memblock
+__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
+ phys_addr_t size, phys_addr_t align, int nid)
+{
+ phys_addr_t this_start, this_end, cand;
+ u64 i;
+
+ for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+ this_start = clamp(this_start, start, end);
+ this_end = clamp(this_end, start, end);
+
+ if (this_end < size)
+ continue;
+
+ cand = round_down(this_end - size, align);
+ if (cand >= this_start)
+ return cand;
+ }
+
+ return 0;
+}
+
/**
* memblock_find_in_range_node - find free area in given range and node
* @start: start of candidate range
@@ -92,15 +162,23 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
*
* Find @size free area aligned to @align in the specified range and node.
*
+ * When the allocation direction is bottom-up, @start should be greater
+ * than the end of the kernel image; otherwise it will be trimmed. The
+ * reason is that we want bottom-up allocations to land just above the
+ * kernel image, so it is highly likely that the allocated memory and
+ * the kernel will reside in the same node.
+ *
+ * If bottom-up allocation fails, we will try to allocate memory top-down.
+ *
* RETURNS:
- * Found address on success, %0 on failure.
+ * Found address on success, 0 on failure.
*/
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
phys_addr_t align, int nid)
{
- phys_addr_t this_start, this_end, cand;
- u64 i;
+ int ret;
+ phys_addr_t kernel_end;
/* pump up @end */
if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
@@ -109,19 +187,39 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
/* avoid allocating the first page */
start = max_t(phys_addr_t, start, PAGE_SIZE);
end = max(start, end);
+ kernel_end = __pa_symbol(_end);
- for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
- this_start = clamp(this_start, start, end);
- this_end = clamp(this_end, start, end);
+ /*
+ * try bottom-up allocation only when bottom-up mode
+ * is set and @end is above the kernel image.
+ */
+ if (memblock_bottom_up() && end > kernel_end) {
+ phys_addr_t bottom_up_start;
- if (this_end < size)
- continue;
+ /* make sure we will allocate above the kernel */
+ bottom_up_start = max(start, kernel_end);
- cand = round_down(this_end - size, align);
- if (cand >= this_start)
- return cand;
+ /* ok, try bottom-up allocation first */
+ ret = __memblock_find_range_bottom_up(bottom_up_start, end,
+ size, align, nid);
+ if (ret)
+ return ret;
+
+ /*
+ * we always limit bottom-up allocation above the kernel,
+ * but top-down allocation doesn't have the limit, so
+ * retrying top-down allocation may succeed when bottom-up
+ * allocation failed.
+ *
+ * bottom-up allocation is expected to fail very rarely,
+ * so we use WARN_ONCE() here to see the stack trace if
+ * a failure happens.
+ */
+ WARN_ONCE(1, "memblock: bottom-up allocation failed, "
+ "memory hotunplug may be affected\n");
}
- return 0;
+
+ return __memblock_find_range_top_down(start, end, size, align, nid);
}
/**
@@ -134,7 +232,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
* Find @size free area aligned to @align in the specified range.
*
* RETURNS:
- * Found address on success, %0 on failure.
+ * Found address on success, 0 on failure.
*/
phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
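
Both new helpers share the same shape: clamp each candidate free range to [@start, @end), then round the candidate address to @align, upward when scanning bottom-up and downward when scanning top-down. A self-contained sketch of that arithmetic over a toy free-range array (illustrative; the kernel iterates memblock regions via for_each_free_mem_range*() instead):

    #include <stdio.h>

    struct range { unsigned long start, end; };

    #define clamp(v, lo, hi) ((v) < (lo) ? (lo) : (v) > (hi) ? (hi) : (v))
    #define round_up(x, a)   (((x) + (a) - 1) & ~((a) - 1))
    #define round_down(x, a) ((x) & ~((a) - 1))

    /* bottom-up: first fit scanning free ranges from low to high */
    static unsigned long find_bottom_up(const struct range *free, int n,
                                        unsigned long start, unsigned long end,
                                        unsigned long size, unsigned long align)
    {
        for (int i = 0; i < n; i++) {
            unsigned long lo = clamp(free[i].start, start, end);
            unsigned long hi = clamp(free[i].end, start, end);
            unsigned long cand = round_up(lo, align);
            if (cand < hi && hi - cand >= size)
                return cand;
        }
        return 0;
    }

    /* top-down: first fit scanning free ranges from high to low */
    static unsigned long find_top_down(const struct range *free, int n,
                                       unsigned long start, unsigned long end,
                                       unsigned long size, unsigned long align)
    {
        for (int i = n - 1; i >= 0; i--) {
            unsigned long lo = clamp(free[i].start, start, end);
            unsigned long hi = clamp(free[i].end, start, end);
            if (hi < size)
                continue;
            unsigned long cand = round_down(hi - size, align);
            if (cand >= lo)
                return cand;
        }
        return 0;
    }

    int main(void)
    {
        struct range free[] = { { 0x1000, 0x3000 }, { 0x8000, 0x9000 } };
        printf("%#lx\n", find_bottom_up(free, 2, 0x0, 0x10000, 0x800, 0x1000));
        printf("%#lx\n", find_top_down(free, 2, 0x0, 0x10000, 0x800, 0x1000));
        return 0;
    }

The same request yields 0x1000 bottom-up and 0x8000 top-down, which is exactly the directional difference memblock_find_in_range_node() now arbitrates.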
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 796820925de0..f20a57b7faf2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -59,6 +59,7 @@
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>
+#include "slab.h"
#include <asm/uaccess.h>
@@ -2968,7 +2969,7 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
VM_BUG_ON(p->is_root_cache);
cachep = p->root_cache;
- return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
+ return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
}
#ifdef CONFIG_SLABINFO
@@ -2997,21 +2998,14 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
struct res_counter *fail_res;
struct mem_cgroup *_memcg;
int ret = 0;
- bool may_oom;
ret = res_counter_charge(&memcg->kmem, size, &fail_res);
if (ret)
return ret;
- /*
- * Conditions under which we can wait for the oom_killer. Those are
- * the same conditions tested by the core page allocator
- */
- may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
-
_memcg = memcg;
ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
- &_memcg, may_oom);
+ &_memcg, oom_gfp_allowed(gfp));
if (ret == -EINTR) {
/*
@@ -3151,7 +3145,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
{
struct memcg_cache_params *cur_params = s->memcg_params;
- VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
+ VM_BUG_ON(!is_root_cache(s));
if (num_groups > memcg_limited_groups_array_size) {
int i;
@@ -3412,7 +3406,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
idx = memcg_cache_id(memcg);
mutex_lock(&memcg_cache_mutex);
- new_cachep = cachep->memcg_params->memcg_caches[idx];
+ new_cachep = cache_from_memcg_idx(cachep, idx);
if (new_cachep) {
css_put(&memcg->css);
goto out;
@@ -3458,8 +3452,8 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
* we'll take the set_limit_mutex to protect ourselves against this.
*/
mutex_lock(&set_limit_mutex);
- for (i = 0; i < memcg_limited_groups_array_size; i++) {
- c = s->memcg_params->memcg_caches[i];
+ for_each_memcg_cache_index(i) {
+ c = cache_from_memcg_idx(s, i);
if (!c)
continue;
@@ -3592,8 +3586,8 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
* code updating memcg_caches will issue a write barrier to match this.
*/
read_barrier_depends();
- if (likely(cachep->memcg_params->memcg_caches[idx])) {
- cachep = cachep->memcg_params->memcg_caches[idx];
+ if (likely(cache_from_memcg_idx(cachep, idx))) {
+ cachep = cache_from_memcg_idx(cachep, idx);
goto out;
}
@@ -5389,45 +5383,50 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *m)
{
+ struct numa_stat {
+ const char *name;
+ unsigned int lru_mask;
+ };
+
+ static const struct numa_stat stats[] = {
+ { "total", LRU_ALL },
+ { "file", LRU_ALL_FILE },
+ { "anon", LRU_ALL_ANON },
+ { "unevictable", BIT(LRU_UNEVICTABLE) },
+ };
+ const struct numa_stat *stat;
int nid;
- unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
- unsigned long node_nr;
+ unsigned long nr;
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
- seq_printf(m, "total=%lu", total_nr);
- for_each_node_state(nid, N_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
- seq_printf(m, " N%d=%lu", nid, node_nr);
- }
- seq_putc(m, '\n');
-
- file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
- seq_printf(m, "file=%lu", file_nr);
- for_each_node_state(nid, N_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- LRU_ALL_FILE);
- seq_printf(m, " N%d=%lu", nid, node_nr);
- }
- seq_putc(m, '\n');
-
- anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
- seq_printf(m, "anon=%lu", anon_nr);
- for_each_node_state(nid, N_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- LRU_ALL_ANON);
- seq_printf(m, " N%d=%lu", nid, node_nr);
+ for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
+ nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
+ seq_printf(m, "%s=%lu", stat->name, nr);
+ for_each_node_state(nid, N_MEMORY) {
+ nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
+ stat->lru_mask);
+ seq_printf(m, " N%d=%lu", nid, nr);
+ }
+ seq_putc(m, '\n');
+ }
+
+ for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
+ struct mem_cgroup *iter;
+
+ nr = 0;
+ for_each_mem_cgroup_tree(iter, memcg)
+ nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
+ seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
+ for_each_node_state(nid, N_MEMORY) {
+ nr = 0;
+ for_each_mem_cgroup_tree(iter, memcg)
+ nr += mem_cgroup_node_nr_lru_pages(
+ iter, nid, stat->lru_mask);
+ seq_printf(m, " N%d=%lu", nid, nr);
+ }
+ seq_putc(m, '\n');
}
- seq_putc(m, '\n');
- unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
- seq_printf(m, "unevictable=%lu", unevictable_nr);
- for_each_node_state(nid, N_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- BIT(LRU_UNEVICTABLE));
- seq_printf(m, " N%d=%lu", nid, node_nr);
- }
- seq_putc(m, '\n');
return 0;
}
#endif /* CONFIG_NUMA */
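
The rewrite collapses four nearly identical print blocks into a {name, lru_mask} table walked by one loop, which is also what makes the new hierarchical_* rows cheap to add. The pattern in miniature (user-space sketch with fake counters):

    #include <stdio.h>

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    struct numa_stat {
        const char  *name;
        unsigned int lru_mask;
    };

    /* stand-in for mem_cgroup_nr_lru_pages(): count pages matching a mask */
    static unsigned long nr_lru_pages(unsigned int lru_mask)
    {
        return lru_mask * 10UL;  /* fake numbers, just for the demo */
    }

    int main(void)
    {
        static const struct numa_stat stats[] = {
            { "total",       0xf  },
            { "file",        0x3  },
            { "anon",        0xc  },
            { "unevictable", 0x10 },
        };
        const struct numa_stat *stat;

        for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++)
            printf("%s=%lu\n", stat->name, nr_lru_pages(stat->lru_mask));
        return 0;
    }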
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index bf3351b5115e..f9d78ec7831f 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1423,19 +1423,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
return 1;
/*
- * The lock_memory_hotplug prevents a race with memory hotplug.
- * This is a big hammer, a better would be nicer.
- */
- lock_memory_hotplug();
-
- /*
- * Isolate the page, so that it doesn't get reallocated if it
- * was free. This flag should be kept set until the source page
- * is freed and PG_hwpoison on it is set.
- */
- if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE)
- set_migratetype_isolate(p, true);
- /*
* When the target page is a free hugepage, just remove it
* from free hugepage list.
*/
@@ -1455,7 +1442,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
/* Not a free page */
ret = 1;
}
- unlock_memory_hotplug();
return ret;
}
@@ -1654,15 +1640,28 @@ int soft_offline_page(struct page *page, int flags)
}
}
+ /*
+ * The lock_memory_hotplug prevents a race with memory hotplug.
+ * This is a big hammer; a better fix would be nicer.
+ */
+ lock_memory_hotplug();
+
+ /*
+ * Isolate the page, so that it doesn't get reallocated if it
+ * was free. This flag should be kept set until the source page
+ * is freed and PG_hwpoison on it is set.
+ */
+ if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+ set_migratetype_isolate(page, true);
+
ret = get_any_page(page, pfn, flags);
- if (ret < 0)
- goto unset;
- if (ret) { /* for in-use pages */
+ unlock_memory_hotplug();
+ if (ret > 0) { /* for in-use pages */
if (PageHuge(page))
ret = soft_offline_huge_page(page, flags);
else
ret = __soft_offline_page(page, flags);
- } else { /* for free pages */
+ } else if (ret == 0) { /* for free pages */
if (PageHuge(page)) {
set_page_hwpoison_huge_page(hpage);
dequeue_hwpoisoned_huge_page(hpage);
@@ -1673,7 +1672,6 @@ int soft_offline_page(struct page *page, int flags)
atomic_long_inc(&num_poisoned_pages);
}
}
-unset:
unset_migratetype_isolate(page, MIGRATE_MOVABLE);
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index 33a3dbec3cc8..bf8665849a5f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -453,8 +453,6 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
/*
* This function frees user-level page tables of a process.
- *
- * Must be called with pagetable lock held.
*/
void free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ed85fe3870e2..489f235502db 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -31,6 +31,7 @@
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
+#include <linux/memblock.h>
#include <asm/tlbflush.h>
@@ -365,8 +366,7 @@ out_fail:
static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long old_pgdat_end_pfn =
- pgdat->node_start_pfn + pgdat->node_spanned_pages;
+ unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
pgdat->node_start_pfn = start_pfn;
@@ -402,13 +402,12 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
static int __meminit __add_section(int nid, struct zone *zone,
unsigned long phys_start_pfn)
{
- int nr_pages = PAGES_PER_SECTION;
int ret;
if (pfn_valid(phys_start_pfn))
return -EEXIST;
- ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
+ ret = sparse_add_one_section(zone, phys_start_pfn);
if (ret < 0)
return ret;
@@ -579,9 +578,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
static void shrink_pgdat_span(struct pglist_data *pgdat,
unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
- unsigned long pgdat_end_pfn =
- pgdat->node_start_pfn + pgdat->node_spanned_pages;
+ unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
+ unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
+ unsigned long pgdat_end_pfn = p;
unsigned long pfn;
struct mem_section *ms;
int nid = pgdat->node_id;
@@ -935,7 +934,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
arg.nr_pages = nr_pages;
node_states_check_changes_online(nr_pages, zone, &arg);
- nid = page_to_nid(pfn_to_page(pfn));
+ nid = pfn_to_nid(pfn);
ret = memory_notify(MEM_GOING_ONLINE, &arg);
ret = notifier_to_errno(ret);
@@ -1044,17 +1043,23 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
}
-/*
+/**
+ * try_online_node - online a node if offlined
+ *
* called by cpu_up() to online a node without onlined memory.
*/
-int mem_online_node(int nid)
+int try_online_node(int nid)
{
pg_data_t *pgdat;
int ret;
+ if (node_online(nid))
+ return 0;
+
lock_memory_hotplug();
pgdat = hotadd_new_pgdat(nid, 0);
if (!pgdat) {
+ pr_err("Cannot online node %d due to NULL pgdat\n", nid);
ret = -ENOMEM;
goto out;
}
@@ -1062,6 +1067,12 @@ int mem_online_node(int nid)
ret = register_one_node(nid);
BUG_ON(ret);
+ if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
+ mutex_lock(&zonelists_mutex);
+ build_all_zonelists(NULL, NULL);
+ mutex_unlock(&zonelists_mutex);
+ }
+
out:
unlock_memory_hotplug();
return ret;
@@ -1412,6 +1423,36 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
}
#endif /* CONFIG_MOVABLE_NODE */
+static int __init cmdline_parse_movable_node(char *p)
+{
+#ifdef CONFIG_MOVABLE_NODE
+ /*
+ * Memory used by the kernel cannot be hot-removed because Linux
+ * cannot migrate the kernel pages. When memory hotplug is
+ * enabled, we should prevent memblock from allocating memory
+ * for the kernel.
+ *
+ * ACPI SRAT records all hotpluggable memory ranges. But before
+ * SRAT is parsed, we don't know about it.
+ *
+ * The kernel image is loaded into memory at very early time. We
+ * cannot prevent this anyway. So on NUMA system, we set any
+ * node the kernel resides in as un-hotpluggable.
+ *
+ * Since on modern servers, one node could have double-digit
+ * gigabytes memory, we can assume the memory around the kernel
+ * image is also un-hotpluggable. So before SRAT is parsed, just
+ * allocate memory near the kernel image to try the best to keep
+ * the kernel away from hotpluggable memory.
+ */
+ memblock_set_bottom_up(true);
+#else
+ pr_warn("movable_node option not supported\n");
+#endif
+ return 0;
+}
+early_param("movable_node", cmdline_parse_movable_node);
+
/* check which state of node_states will be changed when offline memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
struct zone *zone, struct memory_notify *arg)
@@ -1702,7 +1743,7 @@ int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
+static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
int ret = !is_memblock_offlined(mem);
@@ -1854,7 +1895,7 @@ void __ref remove_memory(int nid, u64 start, u64 size)
* if this is not the case.
*/
ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
- is_memblock_offlined_cb);
+ check_memblock_offlined_cb);
if (ret) {
unlock_memory_hotplug();
BUG();
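
cmdline_parse_movable_node() above is registered with early_param(), so memblock is switched to bottom-up mode before SRAT parsing. A toy user-space model of that flow, matching the option and flipping a flag (illustrative; this is not the kernel's early_param machinery):

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    static bool memblock_bottom_up_flag;

    static void parse_early_options(const char *cmdline)
    {
        if (strstr(cmdline, "movable_node"))
            memblock_bottom_up_flag = true;  /* memblock_set_bottom_up(true) */
    }

    int main(void)
    {
        parse_early_options("root=/dev/sda1 movable_node quiet");
        printf("bottom-up allocation: %s\n",
               memblock_bottom_up_flag ? "on" : "off");
        return 0;
    }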
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 71cb253368cb..4cc19f6ab6c6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1125,7 +1125,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
tmp = *from;
while (!nodes_empty(tmp)) {
int s,d;
- int source = -1;
+ int source = NUMA_NO_NODE;
int dest = 0;
for_each_node_mask(s, tmp) {
@@ -1160,7 +1160,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
if (!node_isset(dest, tmp))
break;
}
- if (source == -1)
+ if (source == NUMA_NO_NODE)
break;
node_clear(source, tmp);
@@ -1835,7 +1835,7 @@ static unsigned offset_il_node(struct mempolicy *pol,
unsigned nnodes = nodes_weight(pol->v.nodes);
unsigned target;
int c;
- int nid = -1;
+ int nid = NUMA_NO_NODE;
if (!nnodes)
return numa_node_id();
@@ -1872,11 +1872,11 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
/*
* Return the bit number of a random bit set in the nodemask.
- * (returns -1 if nodemask is empty)
+ * (returns NUMA_NO_NODE if nodemask is empty)
*/
int node_random(const nodemask_t *maskp)
{
- int w, bit = -1;
+ int w, bit = NUMA_NO_NODE;
w = nodes_weight(*maskp);
if (w)
@@ -2914,62 +2914,45 @@ out:
* @maxlen: length of @buffer
* @pol: pointer to mempolicy to be formatted
*
- * Convert a mempolicy into a string.
- * Returns the number of characters in buffer (if positive)
- * or an error (negative)
+ * Convert @pol into a string. If @buffer is too short, truncate the string.
+ * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
+ * longest flag, "relative", and to display at least a few node ids.
*/
-int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
char *p = buffer;
- int l;
- nodemask_t nodes;
- unsigned short mode;
- unsigned short flags = pol ? pol->flags : 0;
-
- /*
- * Sanity check: room for longest mode, flag and some nodes
- */
- VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
+ nodemask_t nodes = NODE_MASK_NONE;
+ unsigned short mode = MPOL_DEFAULT;
+ unsigned short flags = 0;
- if (!pol || pol == &default_policy)
- mode = MPOL_DEFAULT;
- else
+ if (pol && pol != &default_policy) {
mode = pol->mode;
+ flags = pol->flags;
+ }
switch (mode) {
case MPOL_DEFAULT:
- nodes_clear(nodes);
break;
-
case MPOL_PREFERRED:
- nodes_clear(nodes);
if (flags & MPOL_F_LOCAL)
mode = MPOL_LOCAL;
else
node_set(pol->v.preferred_node, nodes);
break;
-
case MPOL_BIND:
- /* Fall through */
case MPOL_INTERLEAVE:
nodes = pol->v.nodes;
break;
-
default:
- return -EINVAL;
+ WARN_ON_ONCE(1);
+ snprintf(p, maxlen, "unknown");
+ return;
}
- l = strlen(policy_modes[mode]);
- if (buffer + maxlen < p + l + 1)
- return -ENOSPC;
-
- strcpy(p, policy_modes[mode]);
- p += l;
+ p += snprintf(p, maxlen, policy_modes[mode]);
if (flags & MPOL_MODE_FLAGS) {
- if (buffer + maxlen < p + 2)
- return -ENOSPC;
- *p++ = '=';
+ p += snprintf(p, buffer + maxlen - p, "=");
/*
* Currently, the only defined flags are mutually exclusive
@@ -2981,10 +2964,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
}
if (!nodes_empty(nodes)) {
- if (buffer + maxlen < p + 2)
- return -ENOSPC;
- *p++ = ':';
+ p += snprintf(p, buffer + maxlen - p, ":");
p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
}
- return p - buffer;
}
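
mpol_to_str() now truncates via a chain of p += snprintf(p, buffer + maxlen - p, ...) instead of returning -ENOSPC. When reproducing this chaining in user space it is safest to clamp the return value, as the kernel's scnprintf() does, because plain snprintf() returns the would-be length and can push the cursor past the end of the buffer. A hedged sketch:

    #include <stdarg.h>
    #include <stdio.h>

    /* clamped snprintf: returns how much was actually written */
    static int scn(char *buf, size_t size, const char *fmt, ...)
    {
        va_list ap;
        int n;

        if (size == 0)
            return 0;
        va_start(ap, fmt);
        n = vsnprintf(buf, size, fmt, ap);
        va_end(ap);
        return n < 0 ? 0 : (n >= (int)size ? (int)size - 1 : n);
    }

    int main(void)
    {
        char buffer[32];
        char *p = buffer, *end = buffer + sizeof(buffer);

        p += scn(p, end - p, "%s", "interleave");
        p += scn(p, end - p, "=%s", "relative");
        p += scn(p, end - p, ":%s", "0-3");
        printf("%s\n", buffer);  /* "interleave=relative:0-3", truncated if needed */
        return 0;
    }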
diff --git a/mm/mmap.c b/mm/mmap.c
index ab199dfc9e26..5a6baddde15d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -179,14 +179,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
goto error;
}
- allowed = (totalram_pages - hugetlb_total_pages())
- * sysctl_overcommit_ratio / 100;
+ allowed = vm_commit_limit();
/*
* Reserve some for root
*/
if (!cap_sys_admin)
allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed += total_swap_pages;
/*
* Don't let a single process grow so big a user can't recover
@@ -1856,7 +1854,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct vm_area_struct *vma;
struct vm_unmapped_area_info info;
- if (len > TASK_SIZE)
+ if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -1865,14 +1863,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
info.flags = 0;
info.length = len;
- info.low_limit = TASK_UNMAPPED_BASE;
+ info.low_limit = mm->mmap_base;
info.high_limit = TASK_SIZE;
info.align_mask = 0;
return vm_unmapped_area(&info);
@@ -1895,7 +1893,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
struct vm_unmapped_area_info info;
/* requested length too big for entire address space */
- if (len > TASK_SIZE)
+ if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -1905,14 +1903,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
- info.low_limit = PAGE_SIZE;
+ info.low_limit = max(PAGE_SIZE, mmap_min_addr);
info.high_limit = mm->mmap_base;
info.align_mask = 0;
addr = vm_unmapped_area(&info);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a597f2ffcd6f..26667971c824 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -112,6 +112,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pmd_t *pmd;
unsigned long next;
unsigned long pages = 0;
+ unsigned long nr_huge_updates = 0;
pmd = pmd_offset(pud, addr);
do {
@@ -126,9 +127,10 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
newprot, prot_numa);
if (nr_ptes) {
- if (nr_ptes == HPAGE_PMD_NR)
- pages++;
-
+ if (nr_ptes == HPAGE_PMD_NR) {
+ pages += HPAGE_PMD_NR;
+ nr_huge_updates++;
+ }
continue;
}
}
@@ -141,6 +143,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pages += this_pages;
} while (pmd++, addr = next, addr != end);
+ if (nr_huge_updates)
+ count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
return pages;
}
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 61107cf55bb3..2c254d374655 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -82,27 +82,18 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
static void __init __free_pages_memory(unsigned long start, unsigned long end)
{
- unsigned long i, start_aligned, end_aligned;
- int order = ilog2(BITS_PER_LONG);
+ int order;
- start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
- end_aligned = end & ~(BITS_PER_LONG - 1);
+ while (start < end) {
+ order = min(MAX_ORDER - 1UL, __ffs(start));
- if (end_aligned <= start_aligned) {
- for (i = start; i < end; i++)
- __free_pages_bootmem(pfn_to_page(i), 0);
+ while (start + (1UL << order) > end)
+ order--;
- return;
- }
-
- for (i = start; i < start_aligned; i++)
- __free_pages_bootmem(pfn_to_page(i), 0);
+ __free_pages_bootmem(pfn_to_page(start), order);
- for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
- __free_pages_bootmem(pfn_to_page(i), order);
-
- for (i = end_aligned; i < end; i++)
- __free_pages_bootmem(pfn_to_page(i), 0);
+ start += (1UL << order);
+ }
}
static unsigned long __init __free_memory_core(phys_addr_t start,
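
The new __free_pages_memory() loop frees [start, end) in the largest buddy blocks that are both naturally aligned at start and still fit below end: __ffs(start) yields the alignment order of start, capped at MAX_ORDER - 1 and shrunk until the block fits. A standalone sketch of the decomposition (illustrative constants; start must be nonzero for the ffs step):

    #include <stdio.h>

    #define MAX_ORDER 11UL  /* kernel default */

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
        return a < b ? a : b;
    }

    /* largest power-of-two alignment of x, as an order (x must be nonzero) */
    static unsigned long ffs_order(unsigned long x)
    {
        return (unsigned long)__builtin_ctzl(x);
    }

    int main(void)
    {
        unsigned long start = 5, end = 100, order;

        while (start < end) {
            order = min_ul(MAX_ORDER - 1, ffs_order(start));
            while (start + (1UL << order) > end)
                order--;
            printf("free pfn %lu, order %lu (%lu pages)\n",
                   start, order, 1UL << order);
            start += 1UL << order;
        }
        return 0;
    }

Running it shows the range split as 1, 2, 8, 16, 32, 32, 4 pages: far fewer calls than the old one-page-at-a-time head and tail loops.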
diff --git a/mm/nommu.c b/mm/nommu.c
index 9e6cb02cba64..fec093adad9c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1948,13 +1948,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
goto error;
}
- allowed = totalram_pages * sysctl_overcommit_ratio / 100;
+ allowed = vm_commit_limit();
/*
* Reserve some 3% for root
*/
if (!cap_sys_admin)
allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed += total_swap_pages;
/*
* Don't let a single process grow so big a user can't recover
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 73d812f16dde..580a5f075ed0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -234,8 +234,8 @@ int page_group_by_mobility_disabled __read_mostly;
void set_pageblock_migratetype(struct page *page, int migratetype)
{
-
- if (unlikely(page_group_by_mobility_disabled))
+ if (unlikely(page_group_by_mobility_disabled &&
+ migratetype < MIGRATE_PCPTYPES))
migratetype = MIGRATE_UNMOVABLE;
set_pageblock_flags_group(page, (unsigned long)migratetype,
@@ -1027,6 +1027,10 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
{
int current_order = page_order(page);
+ /*
+ * When borrowing from MIGRATE_CMA, we need to release the excess
+ * buddy pages to CMA itself.
+ */
if (is_migrate_cma(fallback_type))
return fallback_type;
@@ -1091,21 +1095,11 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
list_del(&page->lru);
rmv_page_order(page);
- /*
- * Borrow the excess buddy pages as well, irrespective
- * of whether we stole freepages, or took ownership of
- * the pageblock or not.
- *
- * Exception: When borrowing from MIGRATE_CMA, release
- * the excess buddy pages to CMA itself.
- */
expand(zone, page, order, current_order, area,
- is_migrate_cma(migratetype)
- ? migratetype : start_migratetype);
+ new_type);
- trace_mm_page_alloc_extfrag(page, order,
- current_order, start_migratetype, migratetype,
- new_type == start_migratetype);
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, migratetype, new_type);
return page;
}
@@ -1711,7 +1705,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
* comments in mmzone.h. Reduces cache footprint of zonelist scans
* that have to skip over a lot of full or unallowed zones.
*
- * If the zonelist cache is present in the passed in zonelist, then
+ * If the zonelist cache is present in the passed zonelist, then
* returns a pointer to the allowed node mask (either the current
* tasks mems_allowed, or node_states[N_MEMORY].)
*
@@ -2593,7 +2587,7 @@ rebalance:
* running out of options and have to consider going OOM
*/
if (!did_some_progress) {
- if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+ if (oom_gfp_allowed(gfp_mask)) {
if (oom_killer_disabled)
goto nopage;
/* Coredumps can quickly deplete all memory reserves */
@@ -3881,8 +3875,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
return ffz(~size);
}
-#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
-
/*
* Check if a pageblock contains reserved pages
*/
@@ -4266,7 +4258,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
*/
zone->pageset = &boot_pageset;
- if (zone->present_pages)
+ if (populated_zone(zone))
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
zone->name, zone->present_pages,
zone_batchsize(zone));
@@ -5160,7 +5152,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
- if (zone->present_pages) {
+ if (populated_zone(zone)) {
node_set_state(nid, N_HIGH_MEMORY);
if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
zone_type <= ZONE_NORMAL)
diff --git a/mm/readahead.c b/mm/readahead.c
index e4ed04149785..7cdbb44aa90b 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -401,6 +401,7 @@ ondemand_readahead(struct address_space *mapping,
unsigned long req_size)
{
unsigned long max = max_sane_readahead(ra->ra_pages);
+ pgoff_t prev_offset;
/*
* start of file
@@ -452,8 +453,11 @@ ondemand_readahead(struct address_space *mapping,
/*
* sequential cache miss
+ * trivial case: (offset - prev_offset) == 1
+ * unaligned reads: (offset - prev_offset) == 0
*/
- if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
+ prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
+ if (offset - prev_offset <= 1UL)
goto initial_readahead;
/*
@@ -569,7 +573,7 @@ static ssize_t
do_readahead(struct address_space *mapping, struct file *filp,
pgoff_t index, unsigned long nr)
{
- if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
+ if (!mapping || !mapping->a_ops)
return -EINVAL;
force_page_cache_readahead(mapping, filp, index, nr);
diff --git a/mm/slab.c b/mm/slab.c
index 2580db062df9..0c8967bb2018 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3982,7 +3982,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
VM_BUG_ON(!mutex_is_locked(&slab_mutex));
for_each_memcg_cache_index(i) {
- c = cache_from_memcg(cachep, i);
+ c = cache_from_memcg_idx(cachep, i);
if (c)
/* return value determined by the parent cache only */
__do_tune_cpucache(c, limit, batchcount, shared, gfp);
diff --git a/mm/slab.h b/mm/slab.h
index a535033f7e9a..0859c4241ba1 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -160,7 +160,8 @@ static inline const char *cache_name(struct kmem_cache *s)
return s->name;
}
-static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
+static inline struct kmem_cache *
+cache_from_memcg_idx(struct kmem_cache *s, int idx)
{
if (!s->memcg_params)
return NULL;
@@ -204,7 +205,8 @@ static inline const char *cache_name(struct kmem_cache *s)
return s->name;
}
-static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
+static inline struct kmem_cache *
+cache_from_memcg_idx(struct kmem_cache *s, int idx)
{
return NULL;
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e2e98af703ea..0b7bb399b0e4 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -571,7 +571,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
return;
for_each_memcg_cache_index(i) {
- c = cache_from_memcg(s, i);
+ c = cache_from_memcg_idx(s, i);
if (!c)
continue;
diff --git a/mm/slub.c b/mm/slub.c
index c3eb3d3ca835..92737a0b787b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4983,7 +4983,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
* through the descendants with best-effort propagation.
*/
for_each_memcg_cache_index(i) {
- struct kmem_cache *c = cache_from_memcg(s, i);
+ struct kmem_cache *c = cache_from_memcg_idx(s, i);
if (c)
attribute->store(c, buf, len);
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 4ac1d7ef548f..8cc7be0e9590 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -590,33 +590,32 @@ void __init sparse_init(void)
#ifdef CONFIG_MEMORY_HOTPLUG
#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
- unsigned long nr_pages)
+static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
{
/* This will make the necessary allocations eventually. */
return sparse_mem_map_populate(pnum, nid);
}
-static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
+static void __kfree_section_memmap(struct page *memmap)
{
unsigned long start = (unsigned long)memmap;
- unsigned long end = (unsigned long)(memmap + nr_pages);
+ unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
vmemmap_free(start, end);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
+static void free_map_bootmem(struct page *memmap)
{
unsigned long start = (unsigned long)memmap;
- unsigned long end = (unsigned long)(memmap + nr_pages);
+ unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
vmemmap_free(start, end);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
#else
-static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
+static struct page *__kmalloc_section_memmap(void)
{
struct page *page, *ret;
- unsigned long memmap_size = sizeof(struct page) * nr_pages;
+ unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
if (page)
@@ -634,28 +633,30 @@ got_map_ptr:
return ret;
}
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
- unsigned long nr_pages)
+static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
{
- return __kmalloc_section_memmap(nr_pages);
+ return __kmalloc_section_memmap();
}
-static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
+static void __kfree_section_memmap(struct page *memmap)
{
if (is_vmalloc_addr(memmap))
vfree(memmap);
else
free_pages((unsigned long)memmap,
- get_order(sizeof(struct page) * nr_pages));
+ get_order(sizeof(struct page) * PAGES_PER_SECTION));
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
+static void free_map_bootmem(struct page *memmap)
{
unsigned long maps_section_nr, removing_section_nr, i;
- unsigned long magic;
+ unsigned long magic, nr_pages;
struct page *page = virt_to_page(memmap);
+ nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
+ >> PAGE_SHIFT;
+
for (i = 0; i < nr_pages; i++, page++) {
magic = (unsigned long) page->lru.next;
@@ -684,8 +685,7 @@ static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
* set. If this is <=0, then that means that the passed-in
* map was not consumed and must be freed.
*/
-int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
- int nr_pages)
+int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
{
unsigned long section_nr = pfn_to_section_nr(start_pfn);
struct pglist_data *pgdat = zone->zone_pgdat;
@@ -702,12 +702,12 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
ret = sparse_index_init(section_nr, pgdat->node_id);
if (ret < 0 && ret != -EEXIST)
return ret;
- memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages);
+ memmap = kmalloc_section_memmap(section_nr, pgdat->node_id);
if (!memmap)
return -ENOMEM;
usemap = __kmalloc_section_usemap();
if (!usemap) {
- __kfree_section_memmap(memmap, nr_pages);
+ __kfree_section_memmap(memmap);
return -ENOMEM;
}
@@ -719,7 +719,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
goto out;
}
- memset(memmap, 0, sizeof(struct page) * nr_pages);
+ memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION);
ms->section_mem_map |= SECTION_MARKED_PRESENT;
@@ -729,7 +729,7 @@ out:
pgdat_resize_unlock(pgdat, &flags);
if (ret <= 0) {
kfree(usemap);
- __kfree_section_memmap(memmap, nr_pages);
+ __kfree_section_memmap(memmap);
}
return ret;
}
@@ -759,7 +759,6 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
static void free_section_usemap(struct page *memmap, unsigned long *usemap)
{
struct page *usemap_page;
- unsigned long nr_pages;
if (!usemap)
return;
@@ -771,7 +770,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
kfree(usemap);
if (memmap)
- __kfree_section_memmap(memmap, PAGES_PER_SECTION);
+ __kfree_section_memmap(memmap);
return;
}
@@ -780,12 +779,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
* on the section which has pgdat at boot time. Just keep it as is now.
*/
- if (memmap) {
- nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
- >> PAGE_SHIFT;
-
- free_map_bootmem(memmap, nr_pages);
- }
+ if (memmap)
+ free_map_bootmem(memmap);
}
void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index de7c904e52e5..612a7c9795f6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -707,7 +707,7 @@ noswap:
return (swp_entry_t) {0};
}
-/* The only caller of this function is now susupend routine */
+/* The only caller of this function is now suspend routine */
swp_entry_t get_swap_page_of_type(int type)
{
struct swap_info_struct *si;
@@ -845,7 +845,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
}
/*
- * Caller has made sure that the swapdevice corresponding to entry
+ * Caller has made sure that the swap device corresponding to entry
* is still around or has not been recycled.
*/
void swap_free(swp_entry_t entry)
@@ -947,7 +947,7 @@ int try_to_free_swap(struct page *page)
* original page might be freed under memory pressure, then
* later read back in from swap, now with the wrong data.
*
- * Hibration suspends storage while it is writing the image
+ * Hibernation suspends storage while it is writing the image
* to disk so check that here.
*/
if (pm_suspended_storage())
@@ -1179,7 +1179,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
* some architectures (e.g. x86_32 with PAE) we might catch a glimpse
* of unmatched parts which look like swp_pte, so unuse_pte must
* recheck under pte lock. Scanning without pte lock lets it be
- * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
+ * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
*/
pte = pte_offset_map(pmd, addr);
do {
@@ -1924,17 +1924,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->cluster_info = NULL;
p->flags = 0;
frontswap_map = frontswap_map_get(p);
- frontswap_map_set(p, NULL);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
frontswap_invalidate_area(type);
+ frontswap_map_set(p, NULL);
mutex_unlock(&swapon_mutex);
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
vfree(swap_map);
vfree(cluster_info);
vfree(frontswap_map);
- /* Destroy swap account informatin */
+ /* Destroy swap account information */
swap_cgroup_swapoff(type);
inode = mapping->host;
@@ -2786,8 +2786,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
/*
* We are fortunate that although vmalloc_to_page uses pte_offset_map,
- * no architecture is using highmem pages for kernel pagetables: so it
- * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
+ * no architecture is using highmem pages for kernel page tables: so it
+ * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
*/
head = vmalloc_to_page(si->swap_map + offset);
offset &= ~PAGE_MASK;
diff --git a/mm/util.c b/mm/util.c
index eaf63fc2c92f..f7bc2096071c 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -7,6 +7,9 @@
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/mman.h>
+#include <linux/hugetlb.h>
+
#include <asm/uaccess.h>
#include "internal.h"
@@ -398,6 +401,16 @@ struct address_space *page_mapping(struct page *page)
return mapping;
}
+/*
+ * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
+ */
+unsigned long vm_commit_limit(void)
+{
+ return ((totalram_pages - hugetlb_total_pages())
+ * sysctl_overcommit_ratio / 100) + total_swap_pages;
+}
+
+
/* Tracepoints definitions. */
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
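
vm_commit_limit() centralizes the OVERCOMMIT_NEVER limit that mmap.c and nommu.c previously computed by hand; as a side effect, nommu now also subtracts hugetlb pages, which its open-coded version above did not. The arithmetic with illustrative numbers:

    #include <stdio.h>

    int main(void)
    {
        /* illustrative values, in pages */
        unsigned long totalram_pages   = 1UL << 20;  /* 4 GB of 4 KB pages */
        unsigned long hugetlb_pages    = 1UL << 16;
        unsigned long total_swap_pages = 1UL << 18;
        unsigned long overcommit_ratio = 50;         /* sysctl vm.overcommit_ratio */

        unsigned long limit = (totalram_pages - hugetlb_pages)
                              * overcommit_ratio / 100 + total_swap_pages;

        printf("commit limit: %lu pages\n", limit);  /* 753664 */
        return 0;
    }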
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 107454312d5e..0fdf96803c5b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -359,6 +359,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
+ /*
+ * Only scan the relevant parts containing pointers to other objects
+ * to avoid false negatives.
+ */
+ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
+
retry:
spin_lock(&vmap_area_lock);
/*
@@ -1546,7 +1552,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
int node, const void *caller);
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
- pgprot_t prot, int node, const void *caller)
+ pgprot_t prot, int node)
{
const int order = 0;
struct page **pages;
@@ -1560,13 +1566,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
- PAGE_KERNEL, node, caller);
+ PAGE_KERNEL, node, area->caller);
area->flags |= VM_VPAGES;
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
area->pages = pages;
- area->caller = caller;
if (!area->pages) {
remove_vm_area(area->addr);
kfree(area);
@@ -1577,7 +1582,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
struct page *page;
gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
- if (node < 0)
+ if (node == NUMA_NO_NODE)
page = alloc_page(tmp_mask);
else
page = alloc_pages_node(node, tmp_mask, order);
@@ -1634,9 +1639,9 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
if (!area)
goto fail;
- addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
+ addr = __vmalloc_area_node(area, gfp_mask, prot, node);
if (!addr)
- goto fail;
+ return NULL;
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -1646,11 +1651,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
clear_vm_uninitialized_flag(area);
/*
- * A ref_count = 3 is needed because the vm_struct and vmap_area
- * structures allocated in the __get_vm_area_node() function contain
- * references to the virtual address of the vmalloc'ed block.
+ * A ref_count = 2 is needed because vm_struct allocated in
+ * __get_vm_area_node() contains a reference to the virtual address of
+ * the vmalloc'ed block.
*/
- kmemleak_alloc(addr, real_size, 3, gfp_mask);
+ kmemleak_alloc(addr, real_size, 2, gfp_mask);
return addr;
@@ -2563,6 +2568,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
if (!counters)
return;
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
+ if (v->flags & VM_UNINITIALIZED)
+ return;
+
memset(counters, 0, nr_node_ids * sizeof(unsigned int));
for (nr = 0; nr < v->nr_pages; nr++)
@@ -2579,23 +2589,15 @@ static int s_show(struct seq_file *m, void *p)
struct vmap_area *va = p;
struct vm_struct *v;
- if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING))
+ /*
+ * s_show can encounter a race with remove_vm_area(): !VM_VM_AREA means
+ * the vmap area is being torn down, or belongs to a vm_map_ram allocation.
+ */
+ if (!(va->flags & VM_VM_AREA))
return 0;
- if (!(va->flags & VM_VM_AREA)) {
- seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
- (void *)va->va_start, (void *)va->va_end,
- va->va_end - va->va_start);
- return 0;
- }
-
v = va->vm;
- /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
- smp_rmb();
- if (v->flags & VM_UNINITIALIZED)
- return 0;
-
seq_printf(m, "0x%pK-0x%pK %7ld",
v->addr, v->addr + v->size, v->size);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9bb314577911..72496140ac08 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -812,6 +812,7 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_NUMA_BALANCING
"numa_pte_updates",
+ "numa_huge_pte_updates",
"numa_hint_faults",
"numa_hint_faults_local",
"numa_pages_migrated",
@@ -1229,6 +1230,20 @@ static void start_cpu_timer(int cpu)
schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
}
+static void vmstat_cpu_dead(int node)
+{
+ int cpu;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ if (cpu_to_node(cpu) == node)
+ goto end;
+
+ node_clear_state(node, N_CPU);
+end:
+ put_online_cpus();
+}
+
/*
 * Use the cpu notifier to ensure that the thresholds are recalculated
* when necessary.
@@ -1258,6 +1273,7 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
case CPU_DEAD:
case CPU_DEAD_FROZEN:
refresh_zone_stat_thresholds();
+ vmstat_cpu_dead(cpu_to_node(cpu));
break;
default:
break;
@@ -1276,8 +1292,12 @@ static int __init setup_vmstat(void)
register_cpu_notifier(&vmstat_notifier);
- for_each_online_cpu(cpu)
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
start_cpu_timer(cpu);
+ node_set_state(cpu_to_node(cpu), N_CPU);
+ }
+ put_online_cpus();
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
diff --git a/mm/zswap.c b/mm/zswap.c
index d93510c6aa2d..5a63f78a5601 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -217,6 +217,7 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
if (!entry)
return NULL;
entry->refcount = 1;
+ RB_CLEAR_NODE(&entry->rbnode);
return entry;
}
@@ -225,19 +226,6 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
kmem_cache_free(zswap_entry_cache, entry);
}
-/* caller must hold the tree lock */
-static void zswap_entry_get(struct zswap_entry *entry)
-{
- entry->refcount++;
-}
-
-/* caller must hold the tree lock */
-static int zswap_entry_put(struct zswap_entry *entry)
-{
- entry->refcount--;
- return entry->refcount;
-}
-
/*********************************
* rbtree functions
**********************************/
@@ -285,6 +273,61 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
return 0;
}
+static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
+{
+ if (!RB_EMPTY_NODE(&entry->rbnode)) {
+ rb_erase(&entry->rbnode, root);
+ RB_CLEAR_NODE(&entry->rbnode);
+ }
+}
+
+/*
+ * Carries out the common pattern of freeing an entry's zbud allocation,
+ * freeing the entry itself, and decrementing the number of stored pages.
+ */
+static void zswap_free_entry(struct zswap_tree *tree,
+ struct zswap_entry *entry)
+{
+ zbud_free(tree->pool, entry->handle);
+ zswap_entry_cache_free(entry);
+ atomic_dec(&zswap_stored_pages);
+ zswap_pool_pages = zbud_get_pool_size(tree->pool);
+}
+
+/* caller must hold the tree lock */
+static void zswap_entry_get(struct zswap_entry *entry)
+{
+ entry->refcount++;
+}
+
+/*
+ * Caller must hold the tree lock: removes the entry from the tree and
+ * frees it if nobody references it any longer.
+ */
+static void zswap_entry_put(struct zswap_tree *tree,
+ struct zswap_entry *entry)
+{
+ int refcount = --entry->refcount;
+
+ BUG_ON(refcount < 0);
+ if (refcount == 0) {
+ zswap_rb_erase(&tree->rbroot, entry);
+ zswap_free_entry(tree, entry);
+ }
+}
+
+/* caller must hold the tree lock */
+static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
+ pgoff_t offset)
+{
+ struct zswap_entry *entry = NULL;
+
+ entry = zswap_rb_search(root, offset);
+ if (entry)
+ zswap_entry_get(entry);
+
+ return entry;
+}
+
/*********************************
* per-cpu code
**********************************/
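
The new helpers centralize the lifetime rule: the rbtree holds the entry's
initial reference, each lookup takes one more, and the final put both unlinks
the entry and frees it. A self-contained userspace model of the discipline;
model_erase() and free() stand in for zswap_rb_erase() and
zswap_free_entry():

	#include <assert.h>
	#include <stdlib.h>

	struct model_entry {
		int refcount;			/* starts at 1: the tree's reference */
	};

	static void model_erase(struct model_entry *e)
	{
		(void)e;			/* stand-in: unlink from the rbtree */
	}

	static void model_entry_get(struct model_entry *e)
	{
		e->refcount++;			/* caller holds the tree lock */
	}

	static void model_entry_put(struct model_entry *e)
	{
		assert(e->refcount > 0);	/* BUG_ON(refcount < 0) analogue */
		if (--e->refcount == 0) {
			model_erase(e);		/* drop it from the tree */
			free(e);		/* and release the memory */
		}
	}
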
@@ -368,18 +411,6 @@ static bool zswap_is_full(void)
zswap_pool_pages);
}
-/*
- * Carries out the common pattern of freeing and entry's zsmalloc allocation,
- * freeing the entry itself, and decrementing the number of stored pages.
- */
-static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
-{
- zbud_free(tree->pool, entry->handle);
- zswap_entry_cache_free(entry);
- atomic_dec(&zswap_stored_pages);
- zswap_pool_pages = zbud_get_pool_size(tree->pool);
-}
-
/*********************************
* writeback code
**********************************/
@@ -387,7 +418,7 @@ static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
enum zswap_get_swap_ret {
ZSWAP_SWAPCACHE_NEW,
ZSWAP_SWAPCACHE_EXIST,
- ZSWAP_SWAPCACHE_NOMEM
+ ZSWAP_SWAPCACHE_FAIL,
};
/*
@@ -401,9 +432,10 @@ enum zswap_get_swap_ret {
* added to the swap cache, and returned in retpage.
*
 * On success, the swap cache page is returned in retpage
- * Returns 0 if page was already in the swap cache, page is not locked
- * Returns 1 if the new page needs to be populated, page is locked
- * Returns <0 on error
+ * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
+ * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
+ * the new page is added to swapcache and locked
+ * Returns ZSWAP_SWAPCACHE_FAIL on error
*/
static int zswap_get_swap_cache_page(swp_entry_t entry,
struct page **retpage)
@@ -475,7 +507,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
if (new_page)
page_cache_release(new_page);
if (!found_page)
- return ZSWAP_SWAPCACHE_NOMEM;
+ return ZSWAP_SWAPCACHE_FAIL;
*retpage = found_page;
return ZSWAP_SWAPCACHE_EXIST;
}
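
The renamed enum values spell out the caller's obligations for each outcome;
a sketch of the expected dispatch, mirroring the switch in
zswap_writeback_entry() in the hunk below (page, swpentry, ret and the fail
label belong to that caller):

	switch (zswap_get_swap_cache_page(swpentry, &page)) {
	case ZSWAP_SWAPCACHE_FAIL:	/* allocation failed or a race hit */
		ret = -ENOMEM;
		goto fail;
	case ZSWAP_SWAPCACHE_EXIST:	/* already populated; drop our ref */
		page_cache_release(page);
		ret = -EEXIST;
		goto fail;
	case ZSWAP_SWAPCACHE_NEW:	/* we own the locked, empty page */
		break;			/* decompress into it, write it back */
	}
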
@@ -502,7 +534,7 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
struct page *page;
u8 *src, *dst;
unsigned int dlen;
- int ret, refcount;
+ int ret;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
};
@@ -517,23 +549,22 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
/* find and ref zswap entry */
spin_lock(&tree->lock);
- entry = zswap_rb_search(&tree->rbroot, offset);
+ entry = zswap_entry_find_get(&tree->rbroot, offset);
if (!entry) {
/* entry was invalidated */
spin_unlock(&tree->lock);
return 0;
}
- zswap_entry_get(entry);
spin_unlock(&tree->lock);
BUG_ON(offset != entry->offset);
/* try to allocate swap cache page */
switch (zswap_get_swap_cache_page(swpentry, &page)) {
- case ZSWAP_SWAPCACHE_NOMEM: /* no memory */
+ case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
ret = -ENOMEM;
goto fail;
- case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */
+ case ZSWAP_SWAPCACHE_EXIST:
/* page is already in the swap cache, ignore for now */
page_cache_release(page);
ret = -EEXIST;
@@ -556,43 +587,44 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
SetPageUptodate(page);
}
+ /* move it to the tail of the inactive list after end_writeback */
+ SetPageReclaim(page);
+
/* start writeback */
__swap_writepage(page, &wbc, end_swap_bio_write);
page_cache_release(page);
zswap_written_back_pages++;
spin_lock(&tree->lock);
-
/* drop local reference */
- zswap_entry_put(entry);
- /* drop the initial reference from entry creation */
- refcount = zswap_entry_put(entry);
+ zswap_entry_put(tree, entry);
/*
- * There are three possible values for refcount here:
- * (1) refcount is 1, load is in progress, unlink from rbtree,
- * load will free
- * (2) refcount is 0, (normal case) entry is valid,
- * remove from rbtree and free entry
- * (3) refcount is -1, invalidate happened during writeback,
- * free entry
- */
- if (refcount >= 0) {
- /* no invalidate yet, remove from rbtree */
- rb_erase(&entry->rbnode, &tree->rbroot);
- }
+	 * There are two possible situations for the entry here:
+	 * (1) refcount is 1 (normal case): the entry is valid and on the tree
+	 * (2) refcount is 0: an invalidate raced with the writeback, so the
+	 *     entry has already been freed and is no longer on the tree.
+	 * Search the tree and drop the initial reference only if the entry
+	 * is still found there.
+ */
+ if (entry == zswap_rb_search(&tree->rbroot, offset))
+ zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
- if (refcount <= 0) {
- /* free the entry */
- zswap_free_entry(tree, entry);
- return 0;
- }
- return -EAGAIN;
+ goto end;
+
+ /*
+	 * If we get here due to ZSWAP_SWAPCACHE_EXIST, a load may be
+	 * happening concurrently; it is safe and okay not to free the
+	 * entry here, and even if the following put does free it, it is
+	 * still okay to return a non-zero value.
+ */
fail:
spin_lock(&tree->lock);
- zswap_entry_put(entry);
+ zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
+
+end:
return ret;
}
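
The tail of the writeback path drops two references under a single lock hold:
the local lookup reference unconditionally, then the tree's initial reference
only if the entry is still published (a racing invalidate would already have
dropped it). A sketch of the sequence in the userspace model introduced
earlier; model_search() is a hypothetical stand-in for zswap_rb_search():

	#include <pthread.h>

	static pthread_mutex_t model_tree_lock = PTHREAD_MUTEX_INITIALIZER;

	/* hypothetical: find the entry currently published at offset */
	static struct model_entry *model_search(long offset);

	static void model_writeback_done(struct model_entry *entry, long offset)
	{
		pthread_mutex_lock(&model_tree_lock);
		model_entry_put(entry);			/* local reference */
		if (model_search(offset) == entry)	/* still in the tree? */
			model_entry_put(entry);		/* initial reference */
		pthread_mutex_unlock(&model_tree_lock);
	}
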
@@ -676,11 +708,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
if (ret == -EEXIST) {
zswap_duplicate_entry++;
/* remove from rbtree */
- rb_erase(&dupentry->rbnode, &tree->rbroot);
- if (!zswap_entry_put(dupentry)) {
- /* free */
- zswap_free_entry(tree, dupentry);
- }
+ zswap_rb_erase(&tree->rbroot, dupentry);
+ zswap_entry_put(tree, dupentry);
}
} while (ret == -EEXIST);
spin_unlock(&tree->lock);
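
The store path retries its insert until it stops colliding, erasing each
duplicate and dropping that duplicate's initial reference. A sketch of the
loop in the same userspace model; model_insert() is a hypothetical insert
that fails with -EEXIST and reports the colliding entry:

	#include <errno.h>

	/* hypothetical: insert e, or fail with -EEXIST and report the dup */
	static int model_insert(struct model_entry *e, struct model_entry **dup);

	static void model_store(struct model_entry *entry)
	{
		struct model_entry *dupentry;
		int ret;

		do {
			ret = model_insert(entry, &dupentry);
			if (ret == -EEXIST) {
				model_erase(dupentry);		/* unlink the stale copy */
				model_entry_put(dupentry);	/* drop its initial reference */
			}
		} while (ret == -EEXIST);
	}
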
@@ -709,17 +738,16 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
struct zswap_entry *entry;
u8 *src, *dst;
unsigned int dlen;
- int refcount, ret;
+ int ret;
/* find */
spin_lock(&tree->lock);
- entry = zswap_rb_search(&tree->rbroot, offset);
+ entry = zswap_entry_find_get(&tree->rbroot, offset);
if (!entry) {
/* entry was written back */
spin_unlock(&tree->lock);
return -1;
}
- zswap_entry_get(entry);
spin_unlock(&tree->lock);
/* decompress */
@@ -734,22 +762,9 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
BUG_ON(ret);
spin_lock(&tree->lock);
- refcount = zswap_entry_put(entry);
- if (likely(refcount)) {
- spin_unlock(&tree->lock);
- return 0;
- }
+ zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
- /*
- * We don't have to unlink from the rbtree because
- * zswap_writeback_entry() or zswap_frontswap_invalidate page()
- * has already done this for us if we are the last reference.
- */
- /* free */
-
- zswap_free_entry(tree, entry);
-
return 0;
}
@@ -758,7 +773,6 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry;
- int refcount;
/* find */
spin_lock(&tree->lock);
@@ -770,20 +784,12 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
}
/* remove from rbtree */
- rb_erase(&entry->rbnode, &tree->rbroot);
+ zswap_rb_erase(&tree->rbroot, entry);
/* drop the initial reference from entry creation */
- refcount = zswap_entry_put(entry);
+ zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
-
- if (refcount) {
- /* writeback in progress, writeback will free */
- return;
- }
-
- /* free */
- zswap_free_entry(tree, entry);
}
/* frees all zswap entries for the given swap type */
@@ -797,11 +803,8 @@ static void zswap_frontswap_invalidate_area(unsigned type)
/* walk the tree and free everything */
spin_lock(&tree->lock);
- rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) {
- zbud_free(tree->pool, entry->handle);
- zswap_entry_cache_free(entry);
- atomic_dec(&zswap_stored_pages);
- }
+ rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
+ zswap_free_entry(tree, entry);
tree->rbroot = RB_ROOT;
spin_unlock(&tree->lock);
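
invalidate_area can free every entry in one walk because a postorder
traversal visits a node's children before the node itself, so freeing the
current entry never touches memory the walk still needs; that is the property
rbtree_postorder_for_each_entry_safe() relies on. A self-contained
illustration on a plain binary tree:

	#include <stdlib.h>

	struct node {
		struct node *left, *right;
	};

	static void postorder_free(struct node *n)
	{
		if (!n)
			return;
		postorder_free(n->left);	/* children first ... */
		postorder_free(n->right);
		free(n);			/* ... parent last */
	}
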