summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/mm_types.h9
-rw-r--r--include/linux/page-flags.h29
-rw-r--r--mm/internal.h3
-rw-r--r--mm/page_alloc.c27
-rw-r--r--mm/slab.c192
-rw-r--r--mm/slub.c29
6 files changed, 264 insertions, 25 deletions
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 074eb98fe15..375e79eb009 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -54,6 +54,15 @@ struct page {
union {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* slub/slob first free object */
+ bool pfmemalloc; /* If set by the page allocator,
+ * ALLOC_PFMEMALLOC was set
+ * and the low watermark was not
+ * met implying that the system
+ * is under some pressure. The
+ * caller should try to ensure
+ * this page is only used to
+ * free other pages.
+ */
};
union {
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index c88d2a9451a..b5d13841604 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -7,6 +7,7 @@
#include <linux/types.h>
#include <linux/bug.h>
+#include <linux/mmdebug.h>
#ifndef __GENERATING_BOUNDS_H
#include <linux/mm_types.h>
#include <generated/bounds.h>
@@ -453,6 +454,34 @@ static inline int PageTransTail(struct page *page)
}
#endif
+/*
+ * If network-based swap is enabled, sl*b must keep track of whether pages
+ * were allocated from pfmemalloc reserves.
+ */
+/*
+ * Report whether a slab page carries the pfmemalloc mark.
+ *
+ * The mark is stored in the page's Active flag, so the result is whatever
+ * PageActive() returns. The page must already be flagged as a slab page;
+ * VM_BUG_ON() checks that first.
+ */
+static inline int PageSlabPfmemalloc(struct page *page)
+{
+ VM_BUG_ON(!PageSlab(page));
+ return PageActive(page);
+}
+
+/*
+ * Mark a slab page as pfmemalloc by setting its Active flag.
+ * The page must already be flagged as a slab page (checked by VM_BUG_ON).
+ */
+static inline void SetPageSlabPfmemalloc(struct page *page)
+{
+ VM_BUG_ON(!PageSlab(page));
+ SetPageActive(page);
+}
+
+/*
+ * Remove the pfmemalloc mark from a slab page via __ClearPageActive().
+ * The page must still be flagged as a slab page (checked by VM_BUG_ON), so
+ * callers clear this mark before they clear the slab flag.
+ *
+ * NOTE(review): presumably the double-underscore variant is the non-atomic
+ * form meant for pages that are being torn down - confirm against the
+ * page-flags definitions.
+ */
+static inline void __ClearPageSlabPfmemalloc(struct page *page)
+{
+ VM_BUG_ON(!PageSlab(page));
+ __ClearPageActive(page);
+}
+
+/*
+ * Remove the pfmemalloc mark from a slab page via ClearPageActive().
+ * The page must be flagged as a slab page (checked by VM_BUG_ON).
+ */
+static inline void ClearPageSlabPfmemalloc(struct page *page)
+{
+ VM_BUG_ON(!PageSlab(page));
+ ClearPageActive(page);
+}
+
#ifdef CONFIG_MMU
#define __PG_MLOCKED (1 << PG_mlocked)
#else
diff --git a/mm/internal.h b/mm/internal.h
index 3314f79d775..eb76b67890d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -279,6 +279,9 @@ static inline struct page *mem_map_next(struct page *iter,
#define __paginginit __init
#endif
+/* Returns true if the allocation context is flagged ALLOC_PFMEMALLOC */
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
+
/* Memory initialisation debug and verification */
enum mminit_level {
MMINIT_WARNING,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6a29ed8e6e6..38e5be65f24 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1513,6 +1513,7 @@ failed:
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#define ALLOC_PFMEMALLOC 0x80 /* Caller has PF_MEMALLOC set */
#ifdef CONFIG_FAIL_PAGE_ALLOC
@@ -2293,16 +2294,22 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
- if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
- if (!in_interrupt() &&
- ((current->flags & PF_MEMALLOC) ||
- unlikely(test_thread_flag(TIF_MEMDIE))))
+ if ((current->flags & PF_MEMALLOC) ||
+ unlikely(test_thread_flag(TIF_MEMDIE))) {
+ alloc_flags |= ALLOC_PFMEMALLOC;
+
+ if (likely(!(gfp_mask & __GFP_NOMEMALLOC)) && !in_interrupt())
alloc_flags |= ALLOC_NO_WATERMARKS;
}
return alloc_flags;
}
+/*
+ * Report whether an allocation made now with @gfp_mask is entitled to use
+ * pfmemalloc objects.
+ *
+ * The answer is the ALLOC_PFMEMALLOC bit computed by gfp_to_alloc_flags(),
+ * which is raised when current has PF_MEMALLOC set or the thread has
+ * TIF_MEMDIE set.
+ */
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+	return (gfp_to_alloc_flags(gfp_mask) & ALLOC_PFMEMALLOC) != 0;
+}
+
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -2490,10 +2497,18 @@ nopage:
warn_alloc_failed(gfp_mask, order, NULL);
return page;
got_pg:
+ /*
+ * page->pfmemalloc is set when the caller had PF_MEMALLOC set or has
+ * been OOM killed. The expectation is that the caller is taking
+ * steps that will free more memory. The caller should avoid the
+ * page being used for !PFMEMALLOC purposes.
+ */
+ page->pfmemalloc = !!(alloc_flags & ALLOC_PFMEMALLOC);
+
if (kmemcheck_enabled)
kmemcheck_pagealloc_alloc(page, order, gfp_mask);
- return page;
+ return page;
}
/*
@@ -2544,6 +2559,8 @@ retry_cpuset:
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
+ else
+ page->pfmemalloc = false;
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
diff --git a/mm/slab.c b/mm/slab.c
index 1fcf3ac94b6..55d84a22ad9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -124,6 +124,8 @@
#include <trace/events/kmem.h>
+#include "internal.h"
+
/*
* DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
* 0 for faster, smaller code (especially in the critical paths).
@@ -152,6 +154,12 @@
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif
+/*
+ * true if a page was allocated from pfmemalloc reserves for network-based
+ * swap
+ */
+static bool pfmemalloc_active __read_mostly;
+
/* Legal flag mask for kmem_cache_create(). */
#if DEBUG
# define CREATE_MASK (SLAB_RED_ZONE | \
@@ -257,9 +265,30 @@ struct array_cache {
* Must have this definition in here for the proper
* alignment of array_cache. Also simplifies accessing
* the entries.
+ *
+ * Entries should not be directly dereferenced as
+ * entries belonging to slabs marked pfmemalloc will
+ * have the low bit SLAB_OBJ_PFMEMALLOC set.
*/
};
+#define SLAB_OBJ_PFMEMALLOC 1
+/* True when the SLAB_OBJ_PFMEMALLOC tag bit is set in the pointer @objp. */
+static inline bool is_obj_pfmemalloc(void *objp)
+{
+	unsigned long tagged = (unsigned long)objp;
+
+	return (tagged & SLAB_OBJ_PFMEMALLOC) != 0;
+}
+
+/* Set the SLAB_OBJ_PFMEMALLOC tag bit in the pointer stored at @objp. */
+static inline void set_obj_pfmemalloc(void **objp)
+{
+	unsigned long tagged = (unsigned long)*objp | SLAB_OBJ_PFMEMALLOC;
+
+	*objp = (void *)tagged;
+}
+
+/* Strip the SLAB_OBJ_PFMEMALLOC tag bit from the pointer stored at @objp. */
+static inline void clear_obj_pfmemalloc(void **objp)
+{
+	unsigned long untagged = (unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC;
+
+	*objp = (void *)untagged;
+}
+
/*
* bootstrap: The caches do not work without cpuarrays anymore, but the
* cpuarrays are allocated from the generic caches...
@@ -900,6 +929,102 @@ static struct array_cache *alloc_arraycache(int node, int entries,
return nc;
}
+/*
+ * True when the page holding the slab's first object (slabp->s_mem)
+ * carries the pfmemalloc mark.
+ */
+static inline bool is_slab_pfmemalloc(struct slab *slabp)
+{
+	return PageSlabPfmemalloc(virt_to_page(slabp->s_mem));
+}
+
+/* True if any slab linked on @slabs is marked pfmemalloc. */
+static bool slab_list_has_pfmemalloc(struct list_head *slabs)
+{
+	struct slab *slabp;
+
+	list_for_each_entry(slabp, slabs, list)
+		if (is_slab_pfmemalloc(slabp))
+			return true;
+
+	return false;
+}
+
+/*
+ * Clears pfmemalloc_active if no slab on the local node's lists of @cachep
+ * is marked pfmemalloc.
+ *
+ * The full, partial and free lists are walked in that order under the
+ * node's list_lock with interrupts saved; the walk stops at the first
+ * pfmemalloc slab found. @ac is accepted but not used.
+ */
+static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
+						struct array_cache *ac)
+{
+	struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()];
+	unsigned long flags;
+
+	/* Nothing to re-evaluate when the flag is already clear */
+	if (!pfmemalloc_active)
+		return;
+
+	spin_lock_irqsave(&l3->list_lock, flags);
+	if (!slab_list_has_pfmemalloc(&l3->slabs_full) &&
+	    !slab_list_has_pfmemalloc(&l3->slabs_partial) &&
+	    !slab_list_has_pfmemalloc(&l3->slabs_free))
+		pfmemalloc_active = false;
+	spin_unlock_irqrestore(&l3->list_lock, flags);
+}
+
+/*
+ * Take an object from the array cache @ac on behalf of an allocation made
+ * with @flags.
+ *
+ * Entries tagged SLAB_OBJ_PFMEMALLOC are handed out only when
+ * gfp_pfmemalloc_allowed(@flags) is true. Any other caller receives an
+ * untagged entry if the array holds one. Failing that, when @force_refill
+ * is set and the local node has slabs on slabs_free, the pfmemalloc mark is
+ * cleared from the slab and the object is returned anyway.
+ *
+ * Returns the object pointer with the tag bit cleared, or NULL with
+ * ac->avail restored when no suitable object exists. ac->avail is
+ * decremented unconditionally on entry, so the caller must make sure the
+ * array is not empty.
+ */
+static void *ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
+						gfp_t flags, bool force_refill)
+{
+	int i;
+	void *objp = ac->entry[--ac->avail];
+
+	/* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
+	if (unlikely(is_obj_pfmemalloc(objp))) {
+		struct kmem_list3 *l3;
+
+		if (gfp_pfmemalloc_allowed(flags)) {
+			clear_obj_pfmemalloc(&objp);
+			return objp;
+		}
+
+		/*
+		 * The caller cannot use PFMEMALLOC objects, find another one.
+		 * Every slot below ac->avail is a candidate, so the scan has
+		 * to start at index 0; starting at 1 would overlook an
+		 * untagged object sitting in the first slot.
+		 */
+		for (i = 0; i < ac->avail; i++) {
+			/* If a !PFMEMALLOC object is found, swap them */
+			if (!is_obj_pfmemalloc(ac->entry[i])) {
+				objp = ac->entry[i];
+				ac->entry[i] = ac->entry[ac->avail];
+				ac->entry[ac->avail] = objp;
+				return objp;
+			}
+		}
+
+		/*
+		 * If there are empty slabs on the slabs_free list and we are
+		 * being forced to refill the cache, mark this one !pfmemalloc.
+		 */
+		l3 = cachep->nodelists[numa_mem_id()];
+		if (!list_empty(&l3->slabs_free) && force_refill) {
+			struct slab *slabp = virt_to_slab(objp);
+			ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem));
+			clear_obj_pfmemalloc(&objp);
+			recheck_pfmemalloc_active(cachep, ac);
+			return objp;
+		}
+
+		/* No !PFMEMALLOC objects available; put the entry back */
+		ac->avail++;
+		objp = NULL;
+	}
+
+	return objp;
+}
+
+/*
+ * Append @objp to the array cache @ac.
+ *
+ * While pfmemalloc_active is set, the page backing the object is checked
+ * and the stored pointer is tagged with SLAB_OBJ_PFMEMALLOC if that page
+ * carries the pfmemalloc mark. @cachep is accepted but not used.
+ */
+static void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
+								void *objp)
+{
+	/* Only pay for the page lookup when some pfmemalloc slab exists */
+	if (unlikely(pfmemalloc_active) &&
+	    PageSlabPfmemalloc(virt_to_page(objp)))
+		set_obj_pfmemalloc(&objp);
+
+	ac->entry[ac->avail++] = objp;
+}
+
/*
* Transfer objects in one arraycache to another.
* Locking must be handled by the caller.
@@ -1076,7 +1201,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
STATS_INC_ACOVERFLOW(cachep);
__drain_alien_cache(cachep, alien, nodeid);
}
- alien->entry[alien->avail++] = objp;
+ ac_put_obj(cachep, alien, objp);
spin_unlock(&alien->lock);
} else {
spin_lock(&(cachep->nodelists[nodeid])->list_lock);
@@ -1759,6 +1884,10 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
return NULL;
}
+ /* Record if ALLOC_PFMEMALLOC was set when allocating the slab */
+ if (unlikely(page->pfmemalloc))
+ pfmemalloc_active = true;
+
nr_pages = (1 << cachep->gfporder);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
add_zone_page_state(page_zone(page),
@@ -1766,9 +1895,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
else
add_zone_page_state(page_zone(page),
NR_SLAB_UNRECLAIMABLE, nr_pages);
- for (i = 0; i < nr_pages; i++)
+ for (i = 0; i < nr_pages; i++) {
__SetPageSlab(page + i);
+ if (page->pfmemalloc)
+ SetPageSlabPfmemalloc(page + i);
+ }
+
if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
@@ -1800,6 +1933,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
NR_SLAB_UNRECLAIMABLE, nr_freed);
while (i--) {
BUG_ON(!PageSlab(page));
+ __ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
page++;
}
@@ -3015,16 +3149,19 @@ bad:
#define check_slabp(x,y) do { } while(0)
#endif
-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
+static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
+ bool force_refill)
{
int batchcount;
struct kmem_list3 *l3;
struct array_cache *ac;
int node;
-retry:
check_irq_off();
node = numa_mem_id();
+ if (unlikely(force_refill))
+ goto force_grow;
+retry:
ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3074,8 +3211,8 @@ retry:
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
- ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
- node);
+ ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
+ node));
}
check_slabp(cachep, slabp);
@@ -3094,18 +3231,22 @@ alloc_done:
if (unlikely(!ac->avail)) {
int x;
+force_grow:
x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
/* cache_grow can reenable interrupts, then ac could change. */
ac = cpu_cache_get(cachep);
- if (!x && ac->avail == 0) /* no objects in sight? abort */
+
+ /* no objects in sight? abort */
+ if (!x && (ac->avail == 0 || force_refill))
return NULL;
if (!ac->avail) /* objects refilled by interrupt? */
goto retry;
}
ac->touched = 1;
- return ac->entry[--ac->avail];
+
+ return ac_get_obj(cachep, ac, flags, force_refill);
}
static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -3187,23 +3328,35 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac;
+ bool force_refill = false;
check_irq_off();
ac = cpu_cache_get(cachep);
if (likely(ac->avail)) {
- STATS_INC_ALLOCHIT(cachep);
ac->touched = 1;
- objp = ac->entry[--ac->avail];
- } else {
- STATS_INC_ALLOCMISS(cachep);
- objp = cache_alloc_refill(cachep, flags);
+ objp = ac_get_obj(cachep, ac, flags, false);
+
/*
- * the 'ac' may be updated by cache_alloc_refill(),
- * and kmemleak_erase() requires its correct value.
+ * Allow for the possibility all avail objects are not allowed
+ * by the current flags
*/
- ac = cpu_cache_get(cachep);
+ if (objp) {
+ STATS_INC_ALLOCHIT(cachep);
+ goto out;
+ }
+ force_refill = true;
}
+
+ STATS_INC_ALLOCMISS(cachep);
+ objp = cache_alloc_refill(cachep, flags, force_refill);
+ /*
+ * the 'ac' may be updated by cache_alloc_refill(),
+ * and kmemleak_erase() requires its correct value.
+ */
+ ac = cpu_cache_get(cachep);
+
+out:
/*
* To avoid a false negative, if an object that is in one of the
* per-CPU caches is leaked, we need to make sure kmemleak doesn't
@@ -3525,9 +3678,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
struct kmem_list3 *l3;
for (i = 0; i < nr_objects; i++) {
- void *objp = objpp[i];
+ void *objp;
struct slab *slabp;
+ clear_obj_pfmemalloc(&objpp[i]);
+ objp = objpp[i];
+
slabp = virt_to_slab(objp);
l3 = cachep->nodelists[node];
list_del(&slabp->list);
@@ -3645,7 +3801,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
cache_flusharray(cachep, ac);
}
- ac->entry[ac->avail++] = objp;
+ ac_put_obj(cachep, ac, objp);
}
/**
diff --git a/mm/slub.c b/mm/slub.c
index e517d435e5d..c3f05e1599c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -34,6 +34,8 @@
#include <trace/events/kmem.h>
+#include "internal.h"
+
/*
* Lock order:
* 1. slab_mutex (Global Mutex)
@@ -1354,6 +1356,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
inc_slabs_node(s, page_to_nid(page), page->objects);
page->slab = s;
__SetPageSlab(page);
+ if (page->pfmemalloc)
+ SetPageSlabPfmemalloc(page);
start = page_address(page);
@@ -1397,6 +1401,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
-pages);
+ __ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
reset_page_mapcount(page);
if (current->reclaim_state)
@@ -2126,6 +2131,14 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
return freelist;
}
+/*
+ * Decide whether an allocation with @gfpflags may be served from @page.
+ * Pages without the pfmemalloc mark suit everyone; marked pages only suit
+ * callers for which gfp_pfmemalloc_allowed() is true.
+ */
+static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
+{
+	if (likely(!PageSlabPfmemalloc(page)))
+		return true;
+
+	return gfp_pfmemalloc_allowed(gfpflags);
+}
+
/*
* Check the page->freelist of a page and either transfer the freelist to the per cpu freelist
* or deactivate the page.
@@ -2206,6 +2219,18 @@ redo:
goto new_slab;
}
+ /*
+ * By rights, we should be searching for a slab page that was
+ * PFMEMALLOC but right now, we are losing the pfmemalloc
+ * information when the page leaves the per-cpu allocator
+ */
+ if (unlikely(!pfmemalloc_match(page, gfpflags))) {
+ deactivate_slab(s, page, c->freelist);
+ c->page = NULL;
+ c->freelist = NULL;
+ goto new_slab;
+ }
+
/* must check again c->freelist in case of cpu migration or IRQ */
freelist = c->freelist;
if (freelist)
@@ -2312,8 +2337,8 @@ redo:
object = c->freelist;
page = c->page;
- if (unlikely(!object || !node_match(page, node)))
-
+ if (unlikely(!object || !node_match(page, node) ||
+ !pfmemalloc_match(page, gfpflags)))
object = __slab_alloc(s, gfpflags, node, addr, c);
else {