summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig.debug10
-rw-r--r--mm/Makefile6
-rw-r--r--mm/bootmem.c9
-rw-r--r--mm/cma.c108
-rw-r--r--mm/compaction.c157
-rw-r--r--mm/debug-pagealloc.c45
-rw-r--r--mm/debug.c5
-rw-r--r--mm/fadvise.c6
-rw-r--r--mm/filemap.c12
-rw-r--r--mm/filemap_xip.c23
-rw-r--r--mm/fremap.c6
-rw-r--r--mm/frontswap.c6
-rw-r--r--mm/gup.c81
-rw-r--r--mm/huge_memory.c15
-rw-r--r--mm/hugetlb.c39
-rw-r--r--mm/hugetlb_cgroup.c103
-rw-r--r--mm/internal.h32
-rw-r--r--mm/iov_iter.c1062
-rw-r--r--mm/ksm.c4
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memblock.c43
-rw-r--r--mm/memcontrol.c1884
-rw-r--r--mm/memory-failure.c21
-rw-r--r--mm/memory.c74
-rw-r--r--mm/memory_hotplug.c30
-rw-r--r--mm/mempolicy.c15
-rw-r--r--mm/migrate.c52
-rw-r--r--mm/mincore.c7
-rw-r--r--mm/mmap.c36
-rw-r--r--mm/mmu_notifier.c25
-rw-r--r--mm/mremap.c9
-rw-r--r--mm/nobootmem.c8
-rw-r--r--mm/nommu.c50
-rw-r--r--mm/oom_kill.c21
-rw-r--r--mm/page-writeback.c4
-rw-r--r--mm/page_alloc.c344
-rw-r--r--mm/page_cgroup.c530
-rw-r--r--mm/page_counter.c192
-rw-r--r--mm/page_ext.c403
-rw-r--r--mm/page_isolation.c45
-rw-r--r--mm/page_owner.c311
-rw-r--r--mm/percpu.c2
-rw-r--r--mm/rmap.c30
-rw-r--r--mm/shmem.c2
-rw-r--r--mm/slab.c29
-rw-r--r--mm/slab.h8
-rw-r--r--mm/slab_common.c44
-rw-r--r--mm/slub.c37
-rw-r--r--mm/swap_cgroup.c208
-rw-r--r--mm/swap_state.c1
-rw-r--r--mm/swapfile.c2
-rw-r--r--mm/truncate.c6
-rw-r--r--mm/vmacache.c2
-rw-r--r--mm/vmalloc.c7
-rw-r--r--mm/vmpressure.c8
-rw-r--r--mm/vmscan.c239
-rw-r--r--mm/vmstat.c102
-rw-r--r--mm/zbud.c4
-rw-r--r--mm/zsmalloc.c390
-rw-r--r--mm/zswap.c11
60 files changed, 3557 insertions, 3410 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 4b2443254de2..56badfc4810a 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,8 +1,18 @@
+config PAGE_EXTENSION
+ bool "Extend memmap on extra space for more information on page"
+ ---help---
+ Extend memmap on extra space for more information on page. This
+ could be used for debugging features that need to insert extra
+ field for every page. This extension enables us to save memory
+ by not allocating this extra memory according to boottime
+ configuration.
+
config DEBUG_PAGEALLOC
bool "Debug page memory allocations"
depends on DEBUG_KERNEL
depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
depends on !KMEMCHECK
+ select PAGE_EXTENSION
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
diff --git a/mm/Makefile b/mm/Makefile
index 8405eb0023a9..4bf586e66378 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -55,12 +55,15 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
-obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
+obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
+obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
+obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
+obj-$(CONFIG_PAGE_OWNER) += page_owner.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZPOOL) += zpool.o
@@ -69,3 +72,4 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
+obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 8a000cebb0d7..477be696511d 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -243,13 +243,10 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
static int reset_managed_pages_done __initdata;
-static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
+void reset_node_managed_pages(pg_data_t *pgdat)
{
struct zone *z;
- if (reset_managed_pages_done)
- return;
-
for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
z->managed_pages = 0;
}
@@ -258,8 +255,12 @@ void __init reset_all_zones_managed_pages(void)
{
struct pglist_data *pgdat;
+ if (reset_managed_pages_done)
+ return;
+
for_each_online_pgdat(pgdat)
reset_node_managed_pages(pgdat);
+
reset_managed_pages_done = 1;
}
diff --git a/mm/cma.c b/mm/cma.c
index 963bc4add9af..a85ae28709a3 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -33,6 +33,7 @@
#include <linux/log2.h>
#include <linux/cma.h>
#include <linux/highmem.h>
+#include <linux/io.h>
struct cma {
unsigned long base_pfn;
@@ -63,6 +64,17 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
return (1UL << (align_order - cma->order_per_bit)) - 1;
}
+static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order)
+{
+ unsigned int alignment;
+
+ if (align_order <= cma->order_per_bit)
+ return 0;
+ alignment = 1UL << (align_order - cma->order_per_bit);
+ return ALIGN(cma->base_pfn, alignment) -
+ (cma->base_pfn >> cma->order_per_bit);
+}
+
static unsigned long cma_bitmap_maxno(struct cma *cma)
{
return cma->count >> cma->order_per_bit;
@@ -124,6 +136,7 @@ static int __init cma_activate_area(struct cma *cma)
err:
kfree(cma->bitmap);
+ cma->count = 0;
return -EINVAL;
}
@@ -214,12 +227,23 @@ int __init cma_declare_contiguous(phys_addr_t base,
bool fixed, struct cma **res_cma)
{
phys_addr_t memblock_end = memblock_end_of_DRAM();
- phys_addr_t highmem_start = __pa(high_memory);
+ phys_addr_t highmem_start;
int ret = 0;
- pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n",
- __func__, (unsigned long)size, (unsigned long)base,
- (unsigned long)limit, (unsigned long)alignment);
+#ifdef CONFIG_X86
+ /*
+ * high_memory isn't direct mapped memory so retrieving its physical
+ * address isn't appropriate. But it would be useful to check the
+ * physical address of the highmem boundary so it's justfiable to get
+ * the physical address from it. On x86 there is a validation check for
+ * this case, so the following workaround is needed to avoid it.
+ */
+ highmem_start = __pa_nodebug(high_memory);
+#else
+ highmem_start = __pa(high_memory);
+#endif
+ pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
+ __func__, &size, &base, &limit, &alignment);
if (cma_area_count == ARRAY_SIZE(cma_areas)) {
pr_err("Not enough slots for CMA reserved regions!\n");
@@ -244,52 +268,78 @@ int __init cma_declare_contiguous(phys_addr_t base,
size = ALIGN(size, alignment);
limit &= ~(alignment - 1);
+ if (!base)
+ fixed = false;
+
/* size should be aligned with order_per_bit */
if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
return -EINVAL;
/*
- * adjust limit to avoid crossing low/high memory boundary for
- * automatically allocated regions
+ * If allocating at a fixed base the request region must not cross the
+ * low/high memory boundary.
*/
- if (((limit == 0 || limit > memblock_end) &&
- (memblock_end - size < highmem_start &&
- memblock_end > highmem_start)) ||
- (!fixed && limit > highmem_start && limit - size < highmem_start)) {
- limit = highmem_start;
- }
-
- if (fixed && base < highmem_start && base+size > highmem_start) {
+ if (fixed && base < highmem_start && base + size > highmem_start) {
ret = -EINVAL;
- pr_err("Region at %08lx defined on low/high memory boundary (%08lx)\n",
- (unsigned long)base, (unsigned long)highmem_start);
+ pr_err("Region at %pa defined on low/high memory boundary (%pa)\n",
+ &base, &highmem_start);
goto err;
}
+ /*
+ * If the limit is unspecified or above the memblock end, its effective
+ * value will be the memblock end. Set it explicitly to simplify further
+ * checks.
+ */
+ if (limit == 0 || limit > memblock_end)
+ limit = memblock_end;
+
/* Reserve memory */
- if (base && fixed) {
+ if (fixed) {
if (memblock_is_region_reserved(base, size) ||
memblock_reserve(base, size) < 0) {
ret = -EBUSY;
goto err;
}
} else {
- phys_addr_t addr = memblock_alloc_range(size, alignment, base,
- limit);
+ phys_addr_t addr = 0;
+
+ /*
+ * All pages in the reserved area must come from the same zone.
+ * If the requested region crosses the low/high memory boundary,
+ * try allocating from high memory first and fall back to low
+ * memory in case of failure.
+ */
+ if (base < highmem_start && limit > highmem_start) {
+ addr = memblock_alloc_range(size, alignment,
+ highmem_start, limit);
+ limit = highmem_start;
+ }
+
if (!addr) {
- ret = -ENOMEM;
- goto err;
- } else {
- base = addr;
+ addr = memblock_alloc_range(size, alignment, base,
+ limit);
+ if (!addr) {
+ ret = -ENOMEM;
+ goto err;
+ }
}
+
+ /*
+ * kmemleak scans/reads tracked objects for pointers to other
+ * objects but this address isn't mapped and accessible
+ */
+ kmemleak_ignore(phys_to_virt(addr));
+ base = addr;
}
ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma);
if (ret)
goto err;
- pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M,
- (unsigned long)base);
+ totalcma_pages += (size / PAGE_SIZE);
+ pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M,
+ &base);
return 0;
err:
@@ -308,7 +358,7 @@ err:
*/
struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
{
- unsigned long mask, pfn, start = 0;
+ unsigned long mask, offset, pfn, start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
struct page *page = NULL;
int ret;
@@ -323,13 +373,15 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
return NULL;
mask = cma_bitmap_aligned_mask(cma, align);
+ offset = cma_bitmap_aligned_offset(cma, align);
bitmap_maxno = cma_bitmap_maxno(cma);
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
for (;;) {
mutex_lock(&cma->lock);
- bitmap_no = bitmap_find_next_zero_area(cma->bitmap,
- bitmap_maxno, start, bitmap_count, mask);
+ bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
+ bitmap_maxno, start, bitmap_count, mask,
+ offset);
if (bitmap_no >= bitmap_maxno) {
mutex_unlock(&cma->lock);
break;
diff --git a/mm/compaction.c b/mm/compaction.c
index ec74cf0123ef..546e571e9d60 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -41,15 +41,17 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
static unsigned long release_freepages(struct list_head *freelist)
{
struct page *page, *next;
- unsigned long count = 0;
+ unsigned long high_pfn = 0;
list_for_each_entry_safe(page, next, freelist, lru) {
+ unsigned long pfn = page_to_pfn(page);
list_del(&page->lru);
__free_page(page);
- count++;
+ if (pfn > high_pfn)
+ high_pfn = pfn;
}
- return count;
+ return high_pfn;
}
static void map_pages(struct list_head *list)
@@ -195,16 +197,12 @@ static void update_pageblock_skip(struct compact_control *cc,
/* Update where async and sync compaction should restart */
if (migrate_scanner) {
- if (cc->finished_update_migrate)
- return;
if (pfn > zone->compact_cached_migrate_pfn[0])
zone->compact_cached_migrate_pfn[0] = pfn;
if (cc->mode != MIGRATE_ASYNC &&
pfn > zone->compact_cached_migrate_pfn[1])
zone->compact_cached_migrate_pfn[1] = pfn;
} else {
- if (cc->finished_update_free)
- return;
if (pfn < zone->compact_cached_free_pfn)
zone->compact_cached_free_pfn = pfn;
}
@@ -479,6 +477,16 @@ isolate_freepages_range(struct compact_control *cc,
block_end_pfn = min(block_end_pfn, end_pfn);
+ /*
+ * pfn could pass the block_end_pfn if isolated freepage
+ * is more than pageblock order. In this case, we adjust
+ * scanning range to right one.
+ */
+ if (pfn >= block_end_pfn) {
+ block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = min(block_end_pfn, end_pfn);
+ }
+
if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
break;
@@ -705,7 +713,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
del_page_from_lru_list(page, lruvec, page_lru(page));
isolate_success:
- cc->finished_update_migrate = true;
list_add(&page->lru, migratelist);
cc->nr_migratepages++;
nr_isolated++;
@@ -879,15 +886,6 @@ static void isolate_freepages(struct compact_control *cc)
block_start_pfn - pageblock_nr_pages;
/*
- * Set a flag that we successfully isolated in this pageblock.
- * In the next loop iteration, zone->compact_cached_free_pfn
- * will not be updated and thus it will effectively contain the
- * highest pageblock we isolated pages from.
- */
- if (isolated)
- cc->finished_update_free = true;
-
- /*
* isolate_freepages_block() might have aborted due to async
* compaction being contended
*/
@@ -1029,8 +1027,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
}
acct_isolated(zone, cc);
- /* Record where migration scanner will be restarted */
- cc->migrate_pfn = low_pfn;
+ /*
+ * Record where migration scanner will be restarted. If we end up in
+ * the same pageblock as the free scanner, make the scanners fully
+ * meet so that compact_finished() terminates compaction.
+ */
+ cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
@@ -1072,9 +1074,9 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
/* Compaction run is not finished if the watermark is not met */
watermark = low_wmark_pages(zone);
- watermark += (1 << cc->order);
- if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
+ if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
+ cc->alloc_flags))
return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
@@ -1100,7 +1102,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
* COMPACT_PARTIAL - If the allocation would succeed without compaction
* COMPACT_CONTINUE - If compaction should run now
*/
-unsigned long compaction_suitable(struct zone *zone, int order)
+unsigned long compaction_suitable(struct zone *zone, int order,
+ int alloc_flags, int classzone_idx)
{
int fragindex;
unsigned long watermark;
@@ -1112,21 +1115,30 @@ unsigned long compaction_suitable(struct zone *zone, int order)
if (order == -1)
return COMPACT_CONTINUE;
+ watermark = low_wmark_pages(zone);
+ /*
+ * If watermarks for high-order allocation are already met, there
+ * should be no need for compaction at all.
+ */
+ if (zone_watermark_ok(zone, order, watermark, classzone_idx,
+ alloc_flags))
+ return COMPACT_PARTIAL;
+
/*
* Watermarks for order-0 must be met for compaction. Note the 2UL.
* This is because during migration, copies of pages need to be
* allocated and for a short time, the footprint is higher
*/
- watermark = low_wmark_pages(zone) + (2UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ watermark += (2UL << order);
+ if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags))
return COMPACT_SKIPPED;
/*
* fragmentation index determines if allocation failures are due to
* low memory or external fragmentation
*
- * index of -1000 implies allocations might succeed depending on
- * watermarks
+ * index of -1000 would imply allocations might succeed depending on
+ * watermarks, but we already failed the high-order watermark check
* index towards 0 implies failure is due to lack of memory
* index towards 1000 implies failure is due to fragmentation
*
@@ -1136,10 +1148,6 @@ unsigned long compaction_suitable(struct zone *zone, int order)
if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
return COMPACT_SKIPPED;
- if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
- 0, 0))
- return COMPACT_PARTIAL;
-
return COMPACT_CONTINUE;
}
@@ -1150,8 +1158,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
unsigned long end_pfn = zone_end_pfn(zone);
const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
const bool sync = cc->mode != MIGRATE_ASYNC;
+ unsigned long last_migrated_pfn = 0;
- ret = compaction_suitable(zone, cc->order);
+ ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
+ cc->classzone_idx);
switch (ret) {
case COMPACT_PARTIAL:
case COMPACT_SKIPPED:
@@ -1194,6 +1204,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
while ((ret = compact_finished(zone, cc, migratetype)) ==
COMPACT_CONTINUE) {
int err;
+ unsigned long isolate_start_pfn = cc->migrate_pfn;
switch (isolate_migratepages(zone, cc)) {
case ISOLATE_ABORT:
@@ -1202,7 +1213,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
cc->nr_migratepages = 0;
goto out;
case ISOLATE_NONE:
- continue;
+ /*
+ * We haven't isolated and migrated anything, but
+ * there might still be unflushed migrations from
+ * previous cc->order aligned block.
+ */
+ goto check_drain;
case ISOLATE_SUCCESS:
;
}
@@ -1227,12 +1243,61 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
goto out;
}
}
+
+ /*
+ * Record where we could have freed pages by migration and not
+ * yet flushed them to buddy allocator. We use the pfn that
+ * isolate_migratepages() started from in this loop iteration
+ * - this is the lowest page that could have been isolated and
+ * then freed by migration.
+ */
+ if (!last_migrated_pfn)
+ last_migrated_pfn = isolate_start_pfn;
+
+check_drain:
+ /*
+ * Has the migration scanner moved away from the previous
+ * cc->order aligned block where we migrated from? If yes,
+ * flush the pages that were freed, so that they can merge and
+ * compact_finished() can detect immediately if allocation
+ * would succeed.
+ */
+ if (cc->order > 0 && last_migrated_pfn) {
+ int cpu;
+ unsigned long current_block_start =
+ cc->migrate_pfn & ~((1UL << cc->order) - 1);
+
+ if (last_migrated_pfn < current_block_start) {
+ cpu = get_cpu();
+ lru_add_drain_cpu(cpu);
+ drain_local_pages(zone);
+ put_cpu();
+ /* No more flushing until we migrate again */
+ last_migrated_pfn = 0;
+ }
+ }
+
}
out:
- /* Release free pages and check accounting */
- cc->nr_freepages -= release_freepages(&cc->freepages);
- VM_BUG_ON(cc->nr_freepages != 0);
+ /*
+ * Release free pages and update where the free scanner should restart,
+ * so we don't leave any returned pages behind in the next attempt.
+ */
+ if (cc->nr_freepages > 0) {
+ unsigned long free_pfn = release_freepages(&cc->freepages);
+
+ cc->nr_freepages = 0;
+ VM_BUG_ON(free_pfn == 0);
+ /* The cached pfn is always the first in a pageblock */
+ free_pfn &= ~(pageblock_nr_pages-1);
+ /*
+ * Only go back, not forward. The cached pfn might have been
+ * already reset to zone end in compact_finished()
+ */
+ if (free_pfn > zone->compact_cached_free_pfn)
+ zone->compact_cached_free_pfn = free_pfn;
+ }
trace_mm_compaction_end(ret);
@@ -1240,7 +1305,8 @@ out:
}
static unsigned long compact_zone_order(struct zone *zone, int order,
- gfp_t gfp_mask, enum migrate_mode mode, int *contended)
+ gfp_t gfp_mask, enum migrate_mode mode, int *contended,
+ int alloc_flags, int classzone_idx)
{
unsigned long ret;
struct compact_control cc = {
@@ -1250,6 +1316,8 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
.gfp_mask = gfp_mask,
.zone = zone,
.mode = mode,
+ .alloc_flags = alloc_flags,
+ .classzone_idx = classzone_idx,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -1274,14 +1342,13 @@ int sysctl_extfrag_threshold = 500;
* @mode: The migration mode for async, sync light, or sync migration
* @contended: Return value that determines if compaction was aborted due to
* need_resched() or lock contention
- * @candidate_zone: Return the zone where we think allocation should succeed
*
* This is the main entry point for direct page compaction.
*/
unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *nodemask,
enum migrate_mode mode, int *contended,
- struct zone **candidate_zone)
+ int alloc_flags, int classzone_idx)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1289,7 +1356,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
struct zoneref *z;
struct zone *zone;
int rc = COMPACT_DEFERRED;
- int alloc_flags = 0;
int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
*contended = COMPACT_CONTENDED_NONE;
@@ -1298,10 +1364,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
if (!order || !may_enter_fs || !may_perform_io)
return COMPACT_SKIPPED;
-#ifdef CONFIG_CMA
- if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
- alloc_flags |= ALLOC_CMA;
-#endif
/* Compact each zone in the list */
for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
nodemask) {
@@ -1312,7 +1374,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
continue;
status = compact_zone_order(zone, order, gfp_mask, mode,
- &zone_contended);
+ &zone_contended, alloc_flags, classzone_idx);
rc = max(status, rc);
/*
* It takes at least one zone that wasn't lock contended
@@ -1321,9 +1383,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
all_zones_contended &= zone_contended;
/* If a normal allocation would succeed, stop compacting */
- if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
- alloc_flags)) {
- *candidate_zone = zone;
+ if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
+ classzone_idx, alloc_flags)) {
/*
* We think the allocation will succeed in this zone,
* but it is not certain, hence the false. The caller
@@ -1345,7 +1406,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
goto break_loop;
}
- if (mode != MIGRATE_ASYNC) {
+ if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) {
/*
* We think that allocation won't succeed in this zone
* so we defer compaction there. If it ends up
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
index 789ff70c8a4a..5bf5906ce13b 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/debug-pagealloc.c
@@ -2,23 +2,55 @@
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
-#include <linux/page-debug-flags.h>
+#include <linux/page_ext.h>
#include <linux/poison.h>
#include <linux/ratelimit.h>
+static bool page_poisoning_enabled __read_mostly;
+
+static bool need_page_poisoning(void)
+{
+ if (!debug_pagealloc_enabled())
+ return false;
+
+ return true;
+}
+
+static void init_page_poisoning(void)
+{
+ if (!debug_pagealloc_enabled())
+ return;
+
+ page_poisoning_enabled = true;
+}
+
+struct page_ext_operations page_poisoning_ops = {
+ .need = need_page_poisoning,
+ .init = init_page_poisoning,
+};
+
static inline void set_page_poison(struct page *page)
{
- __set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+ struct page_ext *page_ext;
+
+ page_ext = lookup_page_ext(page);
+ __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
}
static inline void clear_page_poison(struct page *page)
{
- __clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+ struct page_ext *page_ext;
+
+ page_ext = lookup_page_ext(page);
+ __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
}
static inline bool page_poison(struct page *page)
{
- return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+ struct page_ext *page_ext;
+
+ page_ext = lookup_page_ext(page);
+ return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
}
static void poison_page(struct page *page)
@@ -93,8 +125,11 @@ static void unpoison_pages(struct page *page, int n)
unpoison_page(page + i);
}
-void kernel_map_pages(struct page *page, int numpages, int enable)
+void __kernel_map_pages(struct page *page, int numpages, int enable)
{
+ if (!page_poisoning_enabled)
+ return;
+
if (enable)
unpoison_pages(page, numpages);
else
diff --git a/mm/debug.c b/mm/debug.c
index 5ce45c9a29b5..0e58f3211f89 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -95,7 +95,10 @@ void dump_page_badflags(struct page *page, const char *reason,
dump_flags(page->flags & badflags,
pageflag_names, ARRAY_SIZE(pageflag_names));
}
- mem_cgroup_print_bad_page(page);
+#ifdef CONFIG_MEMCG
+ if (page->mem_cgroup)
+ pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
+#endif
}
void dump_page(struct page *page, const char *reason)
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 3bcfd81db45e..2ad7adf4f0a4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -117,7 +117,11 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
__filemap_fdatawrite_range(mapping, offset, endbyte,
WB_SYNC_NONE);
- /* First and last FULL page! */
+ /*
+ * First and last FULL page! Partial pages are deliberately
+ * preserved on the expectation that it is better to preserve
+ * needed memory than to discard unneeded memory.
+ */
start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
end_index = (endbyte >> PAGE_CACHE_SHIFT);
diff --git a/mm/filemap.c b/mm/filemap.c
index 14b4642279f1..bd8543c6508f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -62,16 +62,16 @@
/*
* Lock ordering:
*
- * ->i_mmap_mutex (truncate_pagecache)
+ * ->i_mmap_rwsem (truncate_pagecache)
* ->private_lock (__free_pte->__set_page_dirty_buffers)
* ->swap_lock (exclusive_swap_page, others)
* ->mapping->tree_lock
*
* ->i_mutex
- * ->i_mmap_mutex (truncate->unmap_mapping_range)
+ * ->i_mmap_rwsem (truncate->unmap_mapping_range)
*
* ->mmap_sem
- * ->i_mmap_mutex
+ * ->i_mmap_rwsem
* ->page_table_lock or pte_lock (various, mainly in memory.c)
* ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
*
@@ -85,7 +85,7 @@
* sb_lock (fs/fs-writeback.c)
* ->mapping->tree_lock (__sync_single_inode)
*
- * ->i_mmap_mutex
+ * ->i_mmap_rwsem
* ->anon_vma.lock (vma_adjust)
*
* ->anon_vma.lock
@@ -105,7 +105,7 @@
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
- * ->i_mmap_mutex
+ * ->i_mmap_rwsem
* ->tasklist_lock (memory_failure, collect_procs_ao)
*/
@@ -2464,7 +2464,7 @@ ssize_t generic_perform_write(struct file *file,
/*
* Copies from kernel address space cannot fail (NFSD is a big user).
*/
- if (segment_eq(get_fs(), KERNEL_DS))
+ if (!iter_is_iovec(i))
flags |= AOP_FLAG_UNINTERRUPTIBLE;
do {
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index d8d9fe3f685c..0d105aeff82f 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -155,22 +155,14 @@ xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
EXPORT_SYMBOL_GPL(xip_file_read);
/*
- * __xip_unmap is invoked from xip_unmap and
- * xip_write
+ * __xip_unmap is invoked from xip_unmap and xip_write
*
* This function walks all vmas of the address_space and unmaps the
* __xip_sparse_page when found at pgoff.
*/
-static void
-__xip_unmap (struct address_space * mapping,
- unsigned long pgoff)
+static void __xip_unmap(struct address_space * mapping, unsigned long pgoff)
{
struct vm_area_struct *vma;
- struct mm_struct *mm;
- unsigned long address;
- pte_t *pte;
- pte_t pteval;
- spinlock_t *ptl;
struct page *page;
unsigned count;
int locked = 0;
@@ -182,11 +174,14 @@ __xip_unmap (struct address_space * mapping,
return;
retry:
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- mm = vma->vm_mm;
- address = vma->vm_start +
+ pte_t *pte, pteval;
+ spinlock_t *ptl;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
pte = page_check_address(page, mm, address, &ptl, 1);
if (pte) {
@@ -202,7 +197,7 @@ retry:
page_cache_release(page);
}
}
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_read(mapping);
if (locked) {
mutex_unlock(&xip_sparse_mutex);
diff --git a/mm/fremap.c b/mm/fremap.c
index 72b8fa361433..2805d71cf476 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
if (pte_present(pte)) {
flush_cache_page(vma, addr, pte_pfn(pte));
- pte = ptep_clear_flush(vma, addr, ptep);
+ pte = ptep_clear_flush_notify(vma, addr, ptep);
page = vm_normal_page(vma, addr, pte);
if (page) {
if (pte_dirty(pte))
@@ -238,13 +238,13 @@ get_write_lock:
}
goto out_freed;
}
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
flush_dcache_mmap_lock(mapping);
vma->vm_flags |= VM_NONLINEAR;
vma_interval_tree_remove(vma, &mapping->i_mmap);
vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
flush_dcache_mmap_unlock(mapping);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
}
if (vma->vm_flags & VM_LOCKED) {
diff --git a/mm/frontswap.c b/mm/frontswap.c
index c30eec536f03..8d82809eb085 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -182,7 +182,7 @@ void __frontswap_init(unsigned type, unsigned long *map)
if (frontswap_ops)
frontswap_ops->init(type);
else {
- BUG_ON(type > MAX_SWAPFILES);
+ BUG_ON(type >= MAX_SWAPFILES);
set_bit(type, need_init);
}
}
@@ -244,8 +244,10 @@ int __frontswap_store(struct page *page)
the (older) page from frontswap
*/
inc_frontswap_failed_stores();
- if (dup)
+ if (dup) {
__frontswap_clear(sis, offset);
+ frontswap_ops->invalidate_page(type, offset);
+ }
}
if (frontswap_writethrough_enabled)
/* report failure so swap also writes to swap device */
diff --git a/mm/gup.c b/mm/gup.c
index f2305deaef50..a900759cc807 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -3,7 +3,6 @@
#include <linux/err.h>
#include <linux/spinlock.h>
-#include <linux/hugetlb.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
@@ -12,6 +11,7 @@
#include <linux/sched.h>
#include <linux/rwsem.h>
+#include <linux/hugetlb.h>
#include <asm/pgtable.h>
#include "internal.h"
@@ -875,6 +875,49 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 1;
}
+static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
+ unsigned long end, int write,
+ struct page **pages, int *nr)
+{
+ int refs;
+ struct page *head, *page, *tail;
+
+ if (write && !pgd_write(orig))
+ return 0;
+
+ refs = 0;
+ head = pgd_page(orig);
+ page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
+ tail = page;
+ do {
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+
+ if (!page_cache_add_speculative(head, refs)) {
+ *nr -= refs;
+ return 0;
+ }
+
+ if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
+ *nr -= refs;
+ while (refs--)
+ put_page(head);
+ return 0;
+ }
+
+ while (refs--) {
+ if (PageTail(tail))
+ get_huge_page_tail(tail);
+ tail++;
+ }
+
+ return 1;
+}
+
static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
@@ -902,6 +945,14 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pages, nr))
return 0;
+ } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
+ /*
+ * architecture have different format for hugetlbfs
+ * pmd format and THP pmd format
+ */
+ if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
+ PMD_SHIFT, next, write, pages, nr))
+ return 0;
} else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
@@ -909,22 +960,26 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
return 1;
}
-static int gup_pud_range(pgd_t *pgdp, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;
- pudp = pud_offset(pgdp, addr);
+ pudp = pud_offset(&pgd, addr);
do {
pud_t pud = READ_ONCE(*pudp);
next = pud_addr_end(addr, end);
if (pud_none(pud))
return 0;
- if (pud_huge(pud)) {
+ if (unlikely(pud_huge(pud))) {
if (!gup_huge_pud(pud, pudp, addr, next, write,
- pages, nr))
+ pages, nr))
+ return 0;
+ } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
+ if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
+ PUD_SHIFT, next, write, pages, nr))
return 0;
} else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
return 0;
@@ -970,10 +1025,20 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
local_irq_save(flags);
pgdp = pgd_offset(mm, addr);
do {
+ pgd_t pgd = ACCESS_ONCE(*pgdp);
+
next = pgd_addr_end(addr, end);
- if (pgd_none(*pgdp))
+ if (pgd_none(pgd))
break;
- else if (!gup_pud_range(pgdp, addr, next, write, pages, &nr))
+ if (unlikely(pgd_huge(pgd))) {
+ if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
+ pages, &nr))
+ break;
+ } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
+ if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
+ PGDIR_SHIFT, next, write, pages, &nr))
+ break;
+ } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
break;
} while (pgdp++, addr = next, addr != end);
local_irq_restore(flags);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index de984159cf0b..817a875f2b8c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -784,7 +784,6 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
if (!pmd_none(*pmd))
return false;
entry = mk_pmd(zero_page, vma->vm_page_prot);
- entry = pmd_wrprotect(entry);
entry = pmd_mkhuge(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
@@ -805,7 +804,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
return VM_FAULT_OOM;
- if (!(flags & FAULT_FLAG_WRITE) &&
+ if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) &&
transparent_hugepage_use_zero_page()) {
spinlock_t *ptl;
pgtable_t pgtable;
@@ -1036,7 +1035,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page);
- pmdp_clear_flush(vma, haddr, pmd);
+ pmdp_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
@@ -1179,7 +1178,7 @@ alloc:
pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_clear_flush(vma, haddr, pmd);
+ pmdp_clear_flush_notify(vma, haddr, pmd);
page_add_new_anon_rmap(new_page, vma, haddr);
mem_cgroup_commit_charge(new_page, memcg, false);
lru_cache_add_active_or_unevictable(new_page, vma);
@@ -1400,7 +1399,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* pgtable_trans_huge_withdraw after finishing pmdp related
* operations.
*/
- orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
+ orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd,
+ tlb->fullmm);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
if (is_huge_zero_pmd(orig_pmd)) {
@@ -1512,7 +1512,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
pmd_t entry;
ret = 1;
if (!prot_numa) {
- entry = pmdp_get_and_clear(mm, addr, pmd);
+ entry = pmdp_get_and_clear_notify(mm, addr, pmd);
if (pmd_numa(entry))
entry = pmd_mknonnuma(entry);
entry = pmd_modify(entry, newprot);
@@ -1644,6 +1644,7 @@ static int __split_huge_page_splitting(struct page *page,
* serialize against split_huge_page*.
*/
pmdp_splitting_flush(vma, address, pmd);
+
ret = 1;
spin_unlock(ptl);
}
@@ -2834,7 +2835,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pmd_t _pmd;
int i;
- pmdp_clear_flush(vma, haddr, pmd);
+ pmdp_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9fd722769927..85032de5e20f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -582,7 +582,7 @@ retry_cpuset:
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
- if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) {
+ if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) {
page = dequeue_huge_page_node(h, zone_to_nid(zone));
if (page) {
if (avoid_reserve)
@@ -1457,7 +1457,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
return 0;
found:
- BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
+ BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
/* Put them into a private list first because mem_map is not up yet */
list_add(&m->list, &huge_boot_pages);
m->hstate = h;
@@ -2083,7 +2083,7 @@ static void hugetlb_register_node(struct node *node)
* devices of nodes that have memory. All on-line nodes should have
* registered their associated device by this time.
*/
-static void hugetlb_register_all_nodes(void)
+static void __init hugetlb_register_all_nodes(void)
{
int nid;
@@ -2598,8 +2598,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
}
set_huge_pte_at(dst, addr, dst_pte, entry);
} else {
- if (cow)
+ if (cow) {
huge_ptep_set_wrprotect(src, addr, src_pte);
+ mmu_notifier_invalidate_range(src, mmun_start,
+ mmun_end);
+ }
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
@@ -2638,8 +2641,9 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb_start_vma(tlb, vma);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ address = start;
again:
- for (address = start; address < end; address += sz) {
+ for (; address < end; address += sz) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
@@ -2686,6 +2690,7 @@ again:
page_remove_rmap(page);
force_flush = !__tlb_remove_page(tlb, page);
if (force_flush) {
+ address += sz;
spin_unlock(ptl);
break;
}
@@ -2724,9 +2729,9 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
* on its way out. We're lucky that the flag has such an appropriate
* name, and can in fact be safely cleared here. We could clear it
* before the __unmap_hugepage_range above, but all that's necessary
- * is to clear it before releasing the i_mmap_mutex. This works
+ * is to clear it before releasing the i_mmap_rwsem. This works
* because in the context this is called, the VMA is about to be
- * destroyed and the i_mmap_mutex is held.
+ * destroyed and the i_mmap_rwsem is held.
*/
vma->vm_flags &= ~VM_MAYSHARE;
}
@@ -2772,7 +2777,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* this mapping should be shared between all the VMAs,
* __unmap_hugepage_range() is called as the lock is already held
*/
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
/* Do not unmap the current VMA */
if (iter_vma == vma)
@@ -2789,7 +2794,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
unmap_hugepage_range(iter_vma, address,
address + huge_page_size(h), page);
}
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
}
/*
@@ -2899,6 +2904,7 @@ retry_avoidcopy:
/* Break COW */
huge_ptep_clear_flush(vma, address, ptep);
+ mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page);
@@ -3346,7 +3352,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
flush_cache_range(vma, address, end);
mmu_notifier_invalidate_range_start(mm, start, end);
- mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
for (; address < end; address += huge_page_size(h)) {
spinlock_t *ptl;
ptep = huge_pte_offset(mm, address);
@@ -3368,13 +3374,14 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
spin_unlock(ptl);
}
/*
- * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+ * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
* may have cleared our pud entry and done put_page on the page table:
- * once we release i_mmap_mutex, another task can do the final put_page
+ * once we release i_mmap_rwsem, another task can do the final put_page
* and that page table be reused and filled with junk.
*/
flush_tlb_range(vma, start, end);
- mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ mmu_notifier_invalidate_range(mm, start, end);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
mmu_notifier_invalidate_range_end(mm, start, end);
return pages << h->order;
@@ -3523,7 +3530,7 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
* code much cleaner. pmd allocation is essential for the shared case because
- * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
* racing tasks could either miss the sharing (see huge_pte_offset) or select a
* bad pmd for sharing.
*/
@@ -3542,7 +3549,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!vma_shareable(vma, addr))
return (pte_t *)pmd_alloc(mm, pud, addr);
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -3570,7 +3577,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
return pte;
}
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index a67c26e0f360..037e1c00a5b7 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -14,6 +14,7 @@
*/
#include <linux/cgroup.h>
+#include <linux/page_counter.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
@@ -23,7 +24,7 @@ struct hugetlb_cgroup {
/*
* the counter to account for hugepages from hugetlb.
*/
- struct res_counter hugepage[HUGE_MAX_HSTATE];
+ struct page_counter hugepage[HUGE_MAX_HSTATE];
};
#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
@@ -60,7 +61,7 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
int idx;
for (idx = 0; idx < hugetlb_max_hstate; idx++) {
- if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0)
+ if (page_counter_read(&h_cg->hugepage[idx]))
return true;
}
return false;
@@ -79,12 +80,12 @@ hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent_h_cgroup) {
for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
- res_counter_init(&h_cgroup->hugepage[idx],
- &parent_h_cgroup->hugepage[idx]);
+ page_counter_init(&h_cgroup->hugepage[idx],
+ &parent_h_cgroup->hugepage[idx]);
} else {
root_h_cgroup = h_cgroup;
for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
- res_counter_init(&h_cgroup->hugepage[idx], NULL);
+ page_counter_init(&h_cgroup->hugepage[idx], NULL);
}
return &h_cgroup->css;
}
@@ -108,9 +109,8 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
struct page *page)
{
- int csize;
- struct res_counter *counter;
- struct res_counter *fail_res;
+ unsigned int nr_pages;
+ struct page_counter *counter;
struct hugetlb_cgroup *page_hcg;
struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
@@ -123,15 +123,15 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
if (!page_hcg || page_hcg != h_cg)
goto out;
- csize = PAGE_SIZE << compound_order(page);
+ nr_pages = 1 << compound_order(page);
if (!parent) {
parent = root_h_cgroup;
/* root has no limit */
- res_counter_charge_nofail(&parent->hugepage[idx],
- csize, &fail_res);
+ page_counter_charge(&parent->hugepage[idx], nr_pages);
}
counter = &h_cg->hugepage[idx];
- res_counter_uncharge_until(counter, counter->parent, csize);
+ /* Take the pages off the local counter */
+ page_counter_cancel(counter, nr_pages);
set_hugetlb_cgroup(page, parent);
out:
@@ -166,9 +166,8 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
struct hugetlb_cgroup **ptr)
{
int ret = 0;
- struct res_counter *fail_res;
+ struct page_counter *counter;
struct hugetlb_cgroup *h_cg = NULL;
- unsigned long csize = nr_pages * PAGE_SIZE;
if (hugetlb_cgroup_disabled())
goto done;
@@ -187,7 +186,7 @@ again:
}
rcu_read_unlock();
- ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res);
+ ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter);
css_put(&h_cg->css);
done:
*ptr = h_cg;
@@ -213,7 +212,6 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
struct page *page)
{
struct hugetlb_cgroup *h_cg;
- unsigned long csize = nr_pages * PAGE_SIZE;
if (hugetlb_cgroup_disabled())
return;
@@ -222,61 +220,76 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
if (unlikely(!h_cg))
return;
set_hugetlb_cgroup(page, NULL);
- res_counter_uncharge(&h_cg->hugepage[idx], csize);
+ page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
return;
}
void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg)
{
- unsigned long csize = nr_pages * PAGE_SIZE;
-
if (hugetlb_cgroup_disabled() || !h_cg)
return;
if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
return;
- res_counter_uncharge(&h_cg->hugepage[idx], csize);
+ page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
return;
}
+enum {
+ RES_USAGE,
+ RES_LIMIT,
+ RES_MAX_USAGE,
+ RES_FAILCNT,
+};
+
static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- int idx, name;
+ struct page_counter *counter;
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
- idx = MEMFILE_IDX(cft->private);
- name = MEMFILE_ATTR(cft->private);
+ counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
- return res_counter_read_u64(&h_cg->hugepage[idx], name);
+ switch (MEMFILE_ATTR(cft->private)) {
+ case RES_USAGE:
+ return (u64)page_counter_read(counter) * PAGE_SIZE;
+ case RES_LIMIT:
+ return (u64)counter->limit * PAGE_SIZE;
+ case RES_MAX_USAGE:
+ return (u64)counter->watermark * PAGE_SIZE;
+ case RES_FAILCNT:
+ return counter->failcnt;
+ default:
+ BUG();
+ }
}
+static DEFINE_MUTEX(hugetlb_limit_mutex);
+
static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- int idx, name, ret;
- unsigned long long val;
+ int ret, idx;
+ unsigned long nr_pages;
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
+ if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
+ return -EINVAL;
+
buf = strstrip(buf);
+ ret = page_counter_memparse(buf, &nr_pages);
+ if (ret)
+ return ret;
+
idx = MEMFILE_IDX(of_cft(of)->private);
- name = MEMFILE_ATTR(of_cft(of)->private);
- switch (name) {
+ switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_LIMIT:
- if (hugetlb_cgroup_is_root(h_cg)) {
- /* Can't set limit on root */
- ret = -EINVAL;
- break;
- }
- /* This function does all necessary parse...reuse it */
- ret = res_counter_memparse_write_strategy(buf, &val);
- if (ret)
- break;
- val = ALIGN(val, 1ULL << huge_page_shift(&hstates[idx]));
- ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
+ mutex_lock(&hugetlb_limit_mutex);
+ ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages);
+ mutex_unlock(&hugetlb_limit_mutex);
break;
default:
ret = -EINVAL;
@@ -288,18 +301,18 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- int idx, name, ret = 0;
+ int ret = 0;
+ struct page_counter *counter;
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
- idx = MEMFILE_IDX(of_cft(of)->private);
- name = MEMFILE_ATTR(of_cft(of)->private);
+ counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
- switch (name) {
+ switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_MAX_USAGE:
- res_counter_reset_max(&h_cg->hugepage[idx]);
+ page_counter_reset_watermark(counter);
break;
case RES_FAILCNT:
- res_counter_reset_failcnt(&h_cg->hugepage[idx]);
+ counter->failcnt = 0;
break;
default:
ret = -EINVAL;
diff --git a/mm/internal.h b/mm/internal.h
index 829304090b90..efad241f7014 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -108,6 +108,31 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
/*
* in mm/page_alloc.c
*/
+
+/*
+ * Locate the struct page for both the matching buddy in our
+ * pair (buddy1) and the combined O(n+1) page they form (page).
+ *
+ * 1) Any buddy B1 will have an order O twin B2 which satisfies
+ * the following equation:
+ * B2 = B1 ^ (1 << O)
+ * For example, if the starting buddy (buddy2) is #8 its order
+ * 1 buddy is #10:
+ * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
+ *
+ * 2) Any buddy B will have an order O+1 parent P which
+ * satisfies the following equation:
+ * P = B & ~(1 << O)
+ *
+ * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
+ */
+static inline unsigned long
+__find_buddy_index(unsigned long page_idx, unsigned int order)
+{
+ return page_idx ^ (1 << order);
+}
+
+extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __free_pages_bootmem(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned long order);
#ifdef CONFIG_MEMORY_FAILURE
@@ -136,13 +161,10 @@ struct compact_control {
unsigned long migrate_pfn; /* isolate_migratepages search base */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
- bool finished_update_free; /* True when the zone cached pfns are
- * no longer being updated
- */
- bool finished_update_migrate;
-
int order; /* order a direct compactor needs */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
+ const int alloc_flags; /* alloc flags of a direct compactor */
+ const int classzone_idx; /* zone index of a direct compactor */
struct zone *zone;
int contended; /* Signal need_sched() or lock
* contention detected during
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index eafcf60f6b83..a1599ca4ab0e 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -3,95 +3,136 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
-
-static size_t copy_to_iter_iovec(void *from, size_t bytes, struct iov_iter *i)
-{
- size_t skip, copy, left, wanted;
- const struct iovec *iov;
- char __user *buf;
-
- if (unlikely(bytes > i->count))
- bytes = i->count;
-
- if (unlikely(!bytes))
- return 0;
-
- wanted = bytes;
- iov = i->iov;
- skip = i->iov_offset;
- buf = iov->iov_base + skip;
- copy = min(bytes, iov->iov_len - skip);
-
- left = __copy_to_user(buf, from, copy);
- copy -= left;
- skip += copy;
- from += copy;
- bytes -= copy;
- while (unlikely(!left && bytes)) {
- iov++;
- buf = iov->iov_base;
- copy = min(bytes, iov->iov_len);
- left = __copy_to_user(buf, from, copy);
- copy -= left;
- skip = copy;
- from += copy;
- bytes -= copy;
- }
-
- if (skip == iov->iov_len) {
- iov++;
- skip = 0;
- }
- i->count -= wanted - bytes;
- i->nr_segs -= iov - i->iov;
- i->iov = iov;
- i->iov_offset = skip;
- return wanted - bytes;
-}
-
-static size_t copy_from_iter_iovec(void *to, size_t bytes, struct iov_iter *i)
-{
- size_t skip, copy, left, wanted;
- const struct iovec *iov;
- char __user *buf;
-
- if (unlikely(bytes > i->count))
- bytes = i->count;
-
- if (unlikely(!bytes))
- return 0;
-
- wanted = bytes;
- iov = i->iov;
- skip = i->iov_offset;
- buf = iov->iov_base + skip;
- copy = min(bytes, iov->iov_len - skip);
-
- left = __copy_from_user(to, buf, copy);
- copy -= left;
- skip += copy;
- to += copy;
- bytes -= copy;
- while (unlikely(!left && bytes)) {
- iov++;
- buf = iov->iov_base;
- copy = min(bytes, iov->iov_len);
- left = __copy_from_user(to, buf, copy);
- copy -= left;
- skip = copy;
- to += copy;
- bytes -= copy;
- }
-
- if (skip == iov->iov_len) {
- iov++;
- skip = 0;
- }
- i->count -= wanted - bytes;
- i->nr_segs -= iov - i->iov;
- i->iov = iov;
- i->iov_offset = skip;
- return wanted - bytes;
+#include <net/checksum.h>
+
+#define iterate_iovec(i, n, __v, __p, skip, STEP) { \
+ size_t left; \
+ size_t wanted = n; \
+ __p = i->iov; \
+ __v.iov_len = min(n, __p->iov_len - skip); \
+ if (likely(__v.iov_len)) { \
+ __v.iov_base = __p->iov_base + skip; \
+ left = (STEP); \
+ __v.iov_len -= left; \
+ skip += __v.iov_len; \
+ n -= __v.iov_len; \
+ } else { \
+ left = 0; \
+ } \
+ while (unlikely(!left && n)) { \
+ __p++; \
+ __v.iov_len = min(n, __p->iov_len); \
+ if (unlikely(!__v.iov_len)) \
+ continue; \
+ __v.iov_base = __p->iov_base; \
+ left = (STEP); \
+ __v.iov_len -= left; \
+ skip = __v.iov_len; \
+ n -= __v.iov_len; \
+ } \
+ n = wanted - n; \
+}
+
+#define iterate_kvec(i, n, __v, __p, skip, STEP) { \
+ size_t wanted = n; \
+ __p = i->kvec; \
+ __v.iov_len = min(n, __p->iov_len - skip); \
+ if (likely(__v.iov_len)) { \
+ __v.iov_base = __p->iov_base + skip; \
+ (void)(STEP); \
+ skip += __v.iov_len; \
+ n -= __v.iov_len; \
+ } \
+ while (unlikely(n)) { \
+ __p++; \
+ __v.iov_len = min(n, __p->iov_len); \
+ if (unlikely(!__v.iov_len)) \
+ continue; \
+ __v.iov_base = __p->iov_base; \
+ (void)(STEP); \
+ skip = __v.iov_len; \
+ n -= __v.iov_len; \
+ } \
+ n = wanted; \
+}
+
+#define iterate_bvec(i, n, __v, __p, skip, STEP) { \
+ size_t wanted = n; \
+ __p = i->bvec; \
+ __v.bv_len = min_t(size_t, n, __p->bv_len - skip); \
+ if (likely(__v.bv_len)) { \
+ __v.bv_page = __p->bv_page; \
+ __v.bv_offset = __p->bv_offset + skip; \
+ (void)(STEP); \
+ skip += __v.bv_len; \
+ n -= __v.bv_len; \
+ } \
+ while (unlikely(n)) { \
+ __p++; \
+ __v.bv_len = min_t(size_t, n, __p->bv_len); \
+ if (unlikely(!__v.bv_len)) \
+ continue; \
+ __v.bv_page = __p->bv_page; \
+ __v.bv_offset = __p->bv_offset; \
+ (void)(STEP); \
+ skip = __v.bv_len; \
+ n -= __v.bv_len; \
+ } \
+ n = wanted; \
+}
+
+#define iterate_all_kinds(i, n, v, I, B, K) { \
+ size_t skip = i->iov_offset; \
+ if (unlikely(i->type & ITER_BVEC)) { \
+ const struct bio_vec *bvec; \
+ struct bio_vec v; \
+ iterate_bvec(i, n, v, bvec, skip, (B)) \
+ } else if (unlikely(i->type & ITER_KVEC)) { \
+ const struct kvec *kvec; \
+ struct kvec v; \
+ iterate_kvec(i, n, v, kvec, skip, (K)) \
+ } else { \
+ const struct iovec *iov; \
+ struct iovec v; \
+ iterate_iovec(i, n, v, iov, skip, (I)) \
+ } \
+}
+
+#define iterate_and_advance(i, n, v, I, B, K) { \
+ size_t skip = i->iov_offset; \
+ if (unlikely(i->type & ITER_BVEC)) { \
+ const struct bio_vec *bvec; \
+ struct bio_vec v; \
+ iterate_bvec(i, n, v, bvec, skip, (B)) \
+ if (skip == bvec->bv_len) { \
+ bvec++; \
+ skip = 0; \
+ } \
+ i->nr_segs -= bvec - i->bvec; \
+ i->bvec = bvec; \
+ } else if (unlikely(i->type & ITER_KVEC)) { \
+ const struct kvec *kvec; \
+ struct kvec v; \
+ iterate_kvec(i, n, v, kvec, skip, (K)) \
+ if (skip == kvec->iov_len) { \
+ kvec++; \
+ skip = 0; \
+ } \
+ i->nr_segs -= kvec - i->kvec; \
+ i->kvec = kvec; \
+ } else { \
+ const struct iovec *iov; \
+ struct iovec v; \
+ iterate_iovec(i, n, v, iov, skip, (I)) \
+ if (skip == iov->iov_len) { \
+ iov++; \
+ skip = 0; \
+ } \
+ i->nr_segs -= iov - i->iov; \
+ i->iov = iov; \
+ } \
+ i->count -= n; \
+ i->iov_offset = skip; \
}
static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
@@ -256,134 +297,6 @@ done:
return wanted - bytes;
}
-static size_t zero_iovec(size_t bytes, struct iov_iter *i)
-{
- size_t skip, copy, left, wanted;
- const struct iovec *iov;
- char __user *buf;
-
- if (unlikely(bytes > i->count))
- bytes = i->count;
-
- if (unlikely(!bytes))
- return 0;
-
- wanted = bytes;
- iov = i->iov;
- skip = i->iov_offset;
- buf = iov->iov_base + skip;
- copy = min(bytes, iov->iov_len - skip);
-
- left = __clear_user(buf, copy);
- copy -= left;
- skip += copy;
- bytes -= copy;
-
- while (unlikely(!left && bytes)) {
- iov++;
- buf = iov->iov_base;
- copy = min(bytes, iov->iov_len);
- left = __clear_user(buf, copy);
- copy -= left;
- skip = copy;
- bytes -= copy;
- }
-
- if (skip == iov->iov_len) {
- iov++;
- skip = 0;
- }
- i->count -= wanted - bytes;
- i->nr_segs -= iov - i->iov;
- i->iov = iov;
- i->iov_offset = skip;
- return wanted - bytes;
-}
-
-static size_t __iovec_copy_from_user_inatomic(char *vaddr,
- const struct iovec *iov, size_t base, size_t bytes)
-{
- size_t copied = 0, left = 0;
-
- while (bytes) {
- char __user *buf = iov->iov_base + base;
- int copy = min(bytes, iov->iov_len - base);
-
- base = 0;
- left = __copy_from_user_inatomic(vaddr, buf, copy);
- copied += copy;
- bytes -= copy;
- vaddr += copy;
- iov++;
-
- if (unlikely(left))
- break;
- }
- return copied - left;
-}
-
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were successfully copied. If a fault is encountered then return the number of
- * bytes which were copied.
- */
-static size_t copy_from_user_atomic_iovec(struct page *page,
- struct iov_iter *i, unsigned long offset, size_t bytes)
-{
- char *kaddr;
- size_t copied;
-
- kaddr = kmap_atomic(page);
- if (likely(i->nr_segs == 1)) {
- int left;
- char __user *buf = i->iov->iov_base + i->iov_offset;
- left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
- copied = bytes - left;
- } else {
- copied = __iovec_copy_from_user_inatomic(kaddr + offset,
- i->iov, i->iov_offset, bytes);
- }
- kunmap_atomic(kaddr);
-
- return copied;
-}
-
-static void advance_iovec(struct iov_iter *i, size_t bytes)
-{
- BUG_ON(i->count < bytes);
-
- if (likely(i->nr_segs == 1)) {
- i->iov_offset += bytes;
- i->count -= bytes;
- } else {
- const struct iovec *iov = i->iov;
- size_t base = i->iov_offset;
- unsigned long nr_segs = i->nr_segs;
-
- /*
- * The !iov->iov_len check ensures we skip over unlikely
- * zero-length segments (without overruning the iovec).
- */
- while (bytes || unlikely(i->count && !iov->iov_len)) {
- int copy;
-
- copy = min(bytes, iov->iov_len - base);
- BUG_ON(!i->count || i->count < copy);
- i->count -= copy;
- bytes -= copy;
- base += copy;
- if (iov->iov_len == base) {
- iov++;
- nr_segs--;
- base = 0;
- }
- }
- i->iov = iov;
- i->iov_offset = base;
- i->nr_segs = nr_segs;
- }
-}
-
/*
* Fault in the first iovec of the given iov_iter, to a maximum length
* of bytes. Returns 0 on success, or non-zero if the memory could not be
@@ -395,7 +308,7 @@ static void advance_iovec(struct iov_iter *i, size_t bytes)
*/
int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
{
- if (!(i->type & ITER_BVEC)) {
+ if (!(i->type & (ITER_BVEC|ITER_KVEC))) {
char __user *buf = i->iov->iov_base + i->iov_offset;
bytes = min(bytes, i->iov->iov_len - i->iov_offset);
return fault_in_pages_readable(buf, bytes);
@@ -404,136 +317,25 @@ int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
}
EXPORT_SYMBOL(iov_iter_fault_in_readable);
-static unsigned long alignment_iovec(const struct iov_iter *i)
-{
- const struct iovec *iov = i->iov;
- unsigned long res;
- size_t size = i->count;
- size_t n;
-
- if (!size)
- return 0;
-
- res = (unsigned long)iov->iov_base + i->iov_offset;
- n = iov->iov_len - i->iov_offset;
- if (n >= size)
- return res | size;
- size -= n;
- res |= n;
- while (size > (++iov)->iov_len) {
- res |= (unsigned long)iov->iov_base | iov->iov_len;
- size -= iov->iov_len;
- }
- res |= (unsigned long)iov->iov_base | size;
- return res;
-}
-
void iov_iter_init(struct iov_iter *i, int direction,
const struct iovec *iov, unsigned long nr_segs,
size_t count)
{
/* It will get better. Eventually... */
- if (segment_eq(get_fs(), KERNEL_DS))
+ if (segment_eq(get_fs(), KERNEL_DS)) {
direction |= ITER_KVEC;
- i->type = direction;
- i->iov = iov;
+ i->type = direction;
+ i->kvec = (struct kvec *)iov;
+ } else {
+ i->type = direction;
+ i->iov = iov;
+ }
i->nr_segs = nr_segs;
i->iov_offset = 0;
i->count = count;
}
EXPORT_SYMBOL(iov_iter_init);
-static ssize_t get_pages_iovec(struct iov_iter *i,
- struct page **pages, size_t maxsize, unsigned maxpages,
- size_t *start)
-{
- size_t offset = i->iov_offset;
- const struct iovec *iov = i->iov;
- size_t len;
- unsigned long addr;
- int n;
- int res;
-
- len = iov->iov_len - offset;
- if (len > i->count)
- len = i->count;
- if (len > maxsize)
- len = maxsize;
- addr = (unsigned long)iov->iov_base + offset;
- len += *start = addr & (PAGE_SIZE - 1);
- if (len > maxpages * PAGE_SIZE)
- len = maxpages * PAGE_SIZE;
- addr &= ~(PAGE_SIZE - 1);
- n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
- res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
- if (unlikely(res < 0))
- return res;
- return (res == n ? len : res * PAGE_SIZE) - *start;
-}
-
-static ssize_t get_pages_alloc_iovec(struct iov_iter *i,
- struct page ***pages, size_t maxsize,
- size_t *start)
-{
- size_t offset = i->iov_offset;
- const struct iovec *iov = i->iov;
- size_t len;
- unsigned long addr;
- void *p;
- int n;
- int res;
-
- len = iov->iov_len - offset;
- if (len > i->count)
- len = i->count;
- if (len > maxsize)
- len = maxsize;
- addr = (unsigned long)iov->iov_base + offset;
- len += *start = addr & (PAGE_SIZE - 1);
- addr &= ~(PAGE_SIZE - 1);
- n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
-
- p = kmalloc(n * sizeof(struct page *), GFP_KERNEL);
- if (!p)
- p = vmalloc(n * sizeof(struct page *));
- if (!p)
- return -ENOMEM;
-
- res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
- if (unlikely(res < 0)) {
- kvfree(p);
- return res;
- }
- *pages = p;
- return (res == n ? len : res * PAGE_SIZE) - *start;
-}
-
-static int iov_iter_npages_iovec(const struct iov_iter *i, int maxpages)
-{
- size_t offset = i->iov_offset;
- size_t size = i->count;
- const struct iovec *iov = i->iov;
- int npages = 0;
- int n;
-
- for (n = 0; size && n < i->nr_segs; n++, iov++) {
- unsigned long addr = (unsigned long)iov->iov_base + offset;
- size_t len = iov->iov_len - offset;
- offset = 0;
- if (unlikely(!len)) /* empty segment */
- continue;
- if (len > size)
- len = size;
- npages += (addr + len + PAGE_SIZE - 1) / PAGE_SIZE
- - addr / PAGE_SIZE;
- if (npages >= maxpages) /* don't bother going further */
- return maxpages;
- size -= len;
- offset = 0;
- }
- return min(npages, maxpages);
-}
-
static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len)
{
char *from = kmap_atomic(page);
@@ -555,293 +357,78 @@ static void memzero_page(struct page *page, size_t offset, size_t len)
kunmap_atomic(addr);
}
-static size_t copy_to_iter_bvec(void *from, size_t bytes, struct iov_iter *i)
+size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i)
{
- size_t skip, copy, wanted;
- const struct bio_vec *bvec;
-
+ char *from = addr;
if (unlikely(bytes > i->count))
bytes = i->count;
if (unlikely(!bytes))
return 0;
- wanted = bytes;
- bvec = i->bvec;
- skip = i->iov_offset;
- copy = min_t(size_t, bytes, bvec->bv_len - skip);
+ iterate_and_advance(i, bytes, v,
+ __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len,
+ v.iov_len),
+ memcpy_to_page(v.bv_page, v.bv_offset,
+ (from += v.bv_len) - v.bv_len, v.bv_len),
+ memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len)
+ )
- memcpy_to_page(bvec->bv_page, skip + bvec->bv_offset, from, copy);
- skip += copy;
- from += copy;
- bytes -= copy;
- while (bytes) {
- bvec++;
- copy = min(bytes, (size_t)bvec->bv_len);
- memcpy_to_page(bvec->bv_page, bvec->bv_offset, from, copy);
- skip = copy;
- from += copy;
- bytes -= copy;
- }
- if (skip == bvec->bv_len) {
- bvec++;
- skip = 0;
- }
- i->count -= wanted - bytes;
- i->nr_segs -= bvec - i->bvec;
- i->bvec = bvec;
- i->iov_offset = skip;
- return wanted - bytes;
+ return bytes;
}
+EXPORT_SYMBOL(copy_to_iter);
-static size_t copy_from_iter_bvec(void *to, size_t bytes, struct iov_iter *i)
+size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
- size_t skip, copy, wanted;
- const struct bio_vec *bvec;
-
+ char *to = addr;
if (unlikely(bytes > i->count))
bytes = i->count;
if (unlikely(!bytes))
return 0;
- wanted = bytes;
- bvec = i->bvec;
- skip = i->iov_offset;
-
- copy = min(bytes, bvec->bv_len - skip);
-
- memcpy_from_page(to, bvec->bv_page, bvec->bv_offset + skip, copy);
-
- to += copy;
- skip += copy;
- bytes -= copy;
-
- while (bytes) {
- bvec++;
- copy = min(bytes, (size_t)bvec->bv_len);
- memcpy_from_page(to, bvec->bv_page, bvec->bv_offset, copy);
- skip = copy;
- to += copy;
- bytes -= copy;
- }
- if (skip == bvec->bv_len) {
- bvec++;
- skip = 0;
- }
- i->count -= wanted;
- i->nr_segs -= bvec - i->bvec;
- i->bvec = bvec;
- i->iov_offset = skip;
- return wanted;
-}
-
-static size_t copy_page_to_iter_bvec(struct page *page, size_t offset,
- size_t bytes, struct iov_iter *i)
-{
- void *kaddr = kmap_atomic(page);
- size_t wanted = copy_to_iter_bvec(kaddr + offset, bytes, i);
- kunmap_atomic(kaddr);
- return wanted;
-}
+ iterate_and_advance(i, bytes, v,
+ __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base,
+ v.iov_len),
+ memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
+ v.bv_offset, v.bv_len),
+ memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
+ )
-static size_t copy_page_from_iter_bvec(struct page *page, size_t offset,
- size_t bytes, struct iov_iter *i)
-{
- void *kaddr = kmap_atomic(page);
- size_t wanted = copy_from_iter_bvec(kaddr + offset, bytes, i);
- kunmap_atomic(kaddr);
- return wanted;
+ return bytes;
}
+EXPORT_SYMBOL(copy_from_iter);
-static size_t zero_bvec(size_t bytes, struct iov_iter *i)
+size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
- size_t skip, copy, wanted;
- const struct bio_vec *bvec;
-
+ char *to = addr;
if (unlikely(bytes > i->count))
bytes = i->count;
if (unlikely(!bytes))
return 0;
- wanted = bytes;
- bvec = i->bvec;
- skip = i->iov_offset;
- copy = min_t(size_t, bytes, bvec->bv_len - skip);
+ iterate_and_advance(i, bytes, v,
+ __copy_from_user_nocache((to += v.iov_len) - v.iov_len,
+ v.iov_base, v.iov_len),
+ memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
+ v.bv_offset, v.bv_len),
+ memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
+ )
- memzero_page(bvec->bv_page, skip + bvec->bv_offset, copy);
- skip += copy;
- bytes -= copy;
- while (bytes) {
- bvec++;
- copy = min(bytes, (size_t)bvec->bv_len);
- memzero_page(bvec->bv_page, bvec->bv_offset, copy);
- skip = copy;
- bytes -= copy;
- }
- if (skip == bvec->bv_len) {
- bvec++;
- skip = 0;
- }
- i->count -= wanted - bytes;
- i->nr_segs -= bvec - i->bvec;
- i->bvec = bvec;
- i->iov_offset = skip;
- return wanted - bytes;
-}
-
-static size_t copy_from_user_bvec(struct page *page,
- struct iov_iter *i, unsigned long offset, size_t bytes)
-{
- char *kaddr;
- size_t left;
- const struct bio_vec *bvec;
- size_t base = i->iov_offset;
-
- kaddr = kmap_atomic(page);
- for (left = bytes, bvec = i->bvec; left; bvec++, base = 0) {
- size_t copy = min(left, bvec->bv_len - base);
- if (!bvec->bv_len)
- continue;
- memcpy_from_page(kaddr + offset, bvec->bv_page,
- bvec->bv_offset + base, copy);
- offset += copy;
- left -= copy;
- }
- kunmap_atomic(kaddr);
return bytes;
}
-
-static void advance_bvec(struct iov_iter *i, size_t bytes)
-{
- BUG_ON(i->count < bytes);
-
- if (likely(i->nr_segs == 1)) {
- i->iov_offset += bytes;
- i->count -= bytes;
- } else {
- const struct bio_vec *bvec = i->bvec;
- size_t base = i->iov_offset;
- unsigned long nr_segs = i->nr_segs;
-
- /*
- * The !iov->iov_len check ensures we skip over unlikely
- * zero-length segments (without overruning the iovec).
- */
- while (bytes || unlikely(i->count && !bvec->bv_len)) {
- int copy;
-
- copy = min(bytes, bvec->bv_len - base);
- BUG_ON(!i->count || i->count < copy);
- i->count -= copy;
- bytes -= copy;
- base += copy;
- if (bvec->bv_len == base) {
- bvec++;
- nr_segs--;
- base = 0;
- }
- }
- i->bvec = bvec;
- i->iov_offset = base;
- i->nr_segs = nr_segs;
- }
-}
-
-static unsigned long alignment_bvec(const struct iov_iter *i)
-{
- const struct bio_vec *bvec = i->bvec;
- unsigned long res;
- size_t size = i->count;
- size_t n;
-
- if (!size)
- return 0;
-
- res = bvec->bv_offset + i->iov_offset;
- n = bvec->bv_len - i->iov_offset;
- if (n >= size)
- return res | size;
- size -= n;
- res |= n;
- while (size > (++bvec)->bv_len) {
- res |= bvec->bv_offset | bvec->bv_len;
- size -= bvec->bv_len;
- }
- res |= bvec->bv_offset | size;
- return res;
-}
-
-static ssize_t get_pages_bvec(struct iov_iter *i,
- struct page **pages, size_t maxsize, unsigned maxpages,
- size_t *start)
-{
- const struct bio_vec *bvec = i->bvec;
- size_t len = bvec->bv_len - i->iov_offset;
- if (len > i->count)
- len = i->count;
- if (len > maxsize)
- len = maxsize;
- /* can't be more than PAGE_SIZE */
- *start = bvec->bv_offset + i->iov_offset;
-
- get_page(*pages = bvec->bv_page);
-
- return len;
-}
-
-static ssize_t get_pages_alloc_bvec(struct iov_iter *i,
- struct page ***pages, size_t maxsize,
- size_t *start)
-{
- const struct bio_vec *bvec = i->bvec;
- size_t len = bvec->bv_len - i->iov_offset;
- if (len > i->count)
- len = i->count;
- if (len > maxsize)
- len = maxsize;
- *start = bvec->bv_offset + i->iov_offset;
-
- *pages = kmalloc(sizeof(struct page *), GFP_KERNEL);
- if (!*pages)
- return -ENOMEM;
-
- get_page(**pages = bvec->bv_page);
-
- return len;
-}
-
-static int iov_iter_npages_bvec(const struct iov_iter *i, int maxpages)
-{
- size_t offset = i->iov_offset;
- size_t size = i->count;
- const struct bio_vec *bvec = i->bvec;
- int npages = 0;
- int n;
-
- for (n = 0; size && n < i->nr_segs; n++, bvec++) {
- size_t len = bvec->bv_len - offset;
- offset = 0;
- if (unlikely(!len)) /* empty segment */
- continue;
- if (len > size)
- len = size;
- npages++;
- if (npages >= maxpages) /* don't bother going further */
- return maxpages;
- size -= len;
- offset = 0;
- }
- return min(npages, maxpages);
-}
+EXPORT_SYMBOL(copy_from_iter_nocache);
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
- if (i->type & ITER_BVEC)
- return copy_page_to_iter_bvec(page, offset, bytes, i);
- else
+ if (i->type & (ITER_BVEC|ITER_KVEC)) {
+ void *kaddr = kmap_atomic(page);
+ size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
+ kunmap_atomic(kaddr);
+ return wanted;
+ } else
return copy_page_to_iter_iovec(page, offset, bytes, i);
}
EXPORT_SYMBOL(copy_page_to_iter);
@@ -849,57 +436,53 @@ EXPORT_SYMBOL(copy_page_to_iter);
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
- if (i->type & ITER_BVEC)
- return copy_page_from_iter_bvec(page, offset, bytes, i);
- else
+ if (i->type & (ITER_BVEC|ITER_KVEC)) {
+ void *kaddr = kmap_atomic(page);
+ size_t wanted = copy_from_iter(kaddr + offset, bytes, i);
+ kunmap_atomic(kaddr);
+ return wanted;
+ } else
return copy_page_from_iter_iovec(page, offset, bytes, i);
}
EXPORT_SYMBOL(copy_page_from_iter);
-size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i)
+size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
- if (i->type & ITER_BVEC)
- return copy_to_iter_bvec(addr, bytes, i);
- else
- return copy_to_iter_iovec(addr, bytes, i);
-}
-EXPORT_SYMBOL(copy_to_iter);
+ if (unlikely(bytes > i->count))
+ bytes = i->count;
-size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
-{
- if (i->type & ITER_BVEC)
- return copy_from_iter_bvec(addr, bytes, i);
- else
- return copy_from_iter_iovec(addr, bytes, i);
-}
-EXPORT_SYMBOL(copy_from_iter);
+ if (unlikely(!bytes))
+ return 0;
-size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
-{
- if (i->type & ITER_BVEC) {
- return zero_bvec(bytes, i);
- } else {
- return zero_iovec(bytes, i);
- }
+ iterate_and_advance(i, bytes, v,
+ __clear_user(v.iov_base, v.iov_len),
+ memzero_page(v.bv_page, v.bv_offset, v.bv_len),
+ memset(v.iov_base, 0, v.iov_len)
+ )
+
+ return bytes;
}
EXPORT_SYMBOL(iov_iter_zero);
size_t iov_iter_copy_from_user_atomic(struct page *page,
struct iov_iter *i, unsigned long offset, size_t bytes)
{
- if (i->type & ITER_BVEC)
- return copy_from_user_bvec(page, i, offset, bytes);
- else
- return copy_from_user_atomic_iovec(page, i, offset, bytes);
+ char *kaddr = kmap_atomic(page), *p = kaddr + offset;
+ iterate_all_kinds(i, bytes, v,
+ __copy_from_user_inatomic((p += v.iov_len) - v.iov_len,
+ v.iov_base, v.iov_len),
+ memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
+ v.bv_offset, v.bv_len),
+ memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
+ )
+ kunmap_atomic(kaddr);
+ return bytes;
}
EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
void iov_iter_advance(struct iov_iter *i, size_t size)
{
- if (i->type & ITER_BVEC)
- advance_bvec(i, size);
- else
- advance_iovec(i, size);
+ iterate_and_advance(i, size, v, 0, 0, 0)
}
EXPORT_SYMBOL(iov_iter_advance);
@@ -911,18 +494,39 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i)
if (i->nr_segs == 1)
return i->count;
else if (i->type & ITER_BVEC)
- return min(i->count, i->iov->iov_len - i->iov_offset);
- else
return min(i->count, i->bvec->bv_len - i->iov_offset);
+ else
+ return min(i->count, i->iov->iov_len - i->iov_offset);
}
EXPORT_SYMBOL(iov_iter_single_seg_count);
+void iov_iter_kvec(struct iov_iter *i, int direction,
+ const struct kvec *iov, unsigned long nr_segs,
+ size_t count)
+{
+ BUG_ON(!(direction & ITER_KVEC));
+ i->type = direction;
+ i->kvec = (struct kvec *)iov;
+ i->nr_segs = nr_segs;
+ i->iov_offset = 0;
+ i->count = count;
+}
+EXPORT_SYMBOL(iov_iter_kvec);
+
unsigned long iov_iter_alignment(const struct iov_iter *i)
{
- if (i->type & ITER_BVEC)
- return alignment_bvec(i);
- else
- return alignment_iovec(i);
+ unsigned long res = 0;
+ size_t size = i->count;
+
+ if (!size)
+ return 0;
+
+ iterate_all_kinds(i, size, v,
+ (res |= (unsigned long)v.iov_base | v.iov_len, 0),
+ res |= v.bv_offset | v.bv_len,
+ res |= (unsigned long)v.iov_base | v.iov_len
+ )
+ return res;
}
EXPORT_SYMBOL(iov_iter_alignment);
@@ -930,29 +534,207 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
struct page **pages, size_t maxsize, unsigned maxpages,
size_t *start)
{
- if (i->type & ITER_BVEC)
- return get_pages_bvec(i, pages, maxsize, maxpages, start);
- else
- return get_pages_iovec(i, pages, maxsize, maxpages, start);
+ if (maxsize > i->count)
+ maxsize = i->count;
+
+ if (!maxsize)
+ return 0;
+
+ iterate_all_kinds(i, maxsize, v, ({
+ unsigned long addr = (unsigned long)v.iov_base;
+ size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
+ int n;
+ int res;
+
+ if (len > maxpages * PAGE_SIZE)
+ len = maxpages * PAGE_SIZE;
+ addr &= ~(PAGE_SIZE - 1);
+ n = DIV_ROUND_UP(len, PAGE_SIZE);
+ res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
+ if (unlikely(res < 0))
+ return res;
+ return (res == n ? len : res * PAGE_SIZE) - *start;
+ 0;}),({
+ /* can't be more than PAGE_SIZE */
+ *start = v.bv_offset;
+ get_page(*pages = v.bv_page);
+ return v.bv_len;
+ }),({
+ return -EFAULT;
+ })
+ )
+ return 0;
}
EXPORT_SYMBOL(iov_iter_get_pages);
+static struct page **get_pages_array(size_t n)
+{
+ struct page **p = kmalloc(n * sizeof(struct page *), GFP_KERNEL);
+ if (!p)
+ p = vmalloc(n * sizeof(struct page *));
+ return p;
+}
+
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
struct page ***pages, size_t maxsize,
size_t *start)
{
- if (i->type & ITER_BVEC)
- return get_pages_alloc_bvec(i, pages, maxsize, start);
- else
- return get_pages_alloc_iovec(i, pages, maxsize, start);
+ struct page **p;
+
+ if (maxsize > i->count)
+ maxsize = i->count;
+
+ if (!maxsize)
+ return 0;
+
+ iterate_all_kinds(i, maxsize, v, ({
+ unsigned long addr = (unsigned long)v.iov_base;
+ size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
+ int n;
+ int res;
+
+ addr &= ~(PAGE_SIZE - 1);
+ n = DIV_ROUND_UP(len, PAGE_SIZE);
+ p = get_pages_array(n);
+ if (!p)
+ return -ENOMEM;
+ res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
+ if (unlikely(res < 0)) {
+ kvfree(p);
+ return res;
+ }
+ *pages = p;
+ return (res == n ? len : res * PAGE_SIZE) - *start;
+ 0;}),({
+ /* can't be more than PAGE_SIZE */
+ *start = v.bv_offset;
+ *pages = p = get_pages_array(1);
+ if (!p)
+ return -ENOMEM;
+ get_page(*p = v.bv_page);
+ return v.bv_len;
+ }),({
+ return -EFAULT;
+ })
+ )
+ return 0;
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc);
+size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
+ struct iov_iter *i)
+{
+ char *to = addr;
+ __wsum sum, next;
+ size_t off = 0;
+ if (unlikely(bytes > i->count))
+ bytes = i->count;
+
+ if (unlikely(!bytes))
+ return 0;
+
+ sum = *csum;
+ iterate_and_advance(i, bytes, v, ({
+ int err = 0;
+ next = csum_and_copy_from_user(v.iov_base,
+ (to += v.iov_len) - v.iov_len,
+ v.iov_len, 0, &err);
+ if (!err) {
+ sum = csum_block_add(sum, next, off);
+ off += v.iov_len;
+ }
+ err ? v.iov_len : 0;
+ }), ({
+ char *p = kmap_atomic(v.bv_page);
+ next = csum_partial_copy_nocheck(p + v.bv_offset,
+ (to += v.bv_len) - v.bv_len,
+ v.bv_len, 0);
+ kunmap_atomic(p);
+ sum = csum_block_add(sum, next, off);
+ off += v.bv_len;
+ }),({
+ next = csum_partial_copy_nocheck(v.iov_base,
+ (to += v.iov_len) - v.iov_len,
+ v.iov_len, 0);
+ sum = csum_block_add(sum, next, off);
+ off += v.iov_len;
+ })
+ )
+ *csum = sum;
+ return bytes;
+}
+EXPORT_SYMBOL(csum_and_copy_from_iter);
+
+size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum,
+ struct iov_iter *i)
+{
+ char *from = addr;
+ __wsum sum, next;
+ size_t off = 0;
+ if (unlikely(bytes > i->count))
+ bytes = i->count;
+
+ if (unlikely(!bytes))
+ return 0;
+
+ sum = *csum;
+ iterate_and_advance(i, bytes, v, ({
+ int err = 0;
+ next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len,
+ v.iov_base,
+ v.iov_len, 0, &err);
+ if (!err) {
+ sum = csum_block_add(sum, next, off);
+ off += v.iov_len;
+ }
+ err ? v.iov_len : 0;
+ }), ({
+ char *p = kmap_atomic(v.bv_page);
+ next = csum_partial_copy_nocheck((from += v.bv_len) - v.bv_len,
+ p + v.bv_offset,
+ v.bv_len, 0);
+ kunmap_atomic(p);
+ sum = csum_block_add(sum, next, off);
+ off += v.bv_len;
+ }),({
+ next = csum_partial_copy_nocheck((from += v.iov_len) - v.iov_len,
+ v.iov_base,
+ v.iov_len, 0);
+ sum = csum_block_add(sum, next, off);
+ off += v.iov_len;
+ })
+ )
+ *csum = sum;
+ return bytes;
+}
+EXPORT_SYMBOL(csum_and_copy_to_iter);
+
int iov_iter_npages(const struct iov_iter *i, int maxpages)
{
- if (i->type & ITER_BVEC)
- return iov_iter_npages_bvec(i, maxpages);
- else
- return iov_iter_npages_iovec(i, maxpages);
+ size_t size = i->count;
+ int npages = 0;
+
+ if (!size)
+ return 0;
+
+ iterate_all_kinds(i, size, v, ({
+ unsigned long p = (unsigned long)v.iov_base;
+ npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
+ - p / PAGE_SIZE;
+ if (npages >= maxpages)
+ return maxpages;
+ 0;}),({
+ npages++;
+ if (npages >= maxpages)
+ return maxpages;
+ }),({
+ unsigned long p = (unsigned long)v.iov_base;
+ npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
+ - p / PAGE_SIZE;
+ if (npages >= maxpages)
+ return maxpages;
+ })
+ )
+ return npages;
}
EXPORT_SYMBOL(iov_iter_npages);
diff --git a/mm/ksm.c b/mm/ksm.c
index 6b2e337bc03c..d247efab5073 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -892,7 +892,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
* this assure us that no O_DIRECT can happen after the check
* or in the middle of the check.
*/
- entry = ptep_clear_flush(vma, addr, ptep);
+ entry = ptep_clear_flush_notify(vma, addr, ptep);
/*
* Check that no O_DIRECT or similar I/O is in progress on the
* page
@@ -960,7 +960,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
page_add_anon_rmap(kpage, vma, addr);
flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush(vma, addr, ptep);
+ ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
page_remove_rmap(page);
diff --git a/mm/madvise.c b/mm/madvise.c
index 0938b30da4ab..a271adc93289 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -326,7 +326,7 @@ static long madvise_remove(struct vm_area_struct *vma,
*/
get_file(f);
up_read(&current->mm->mmap_sem);
- error = do_fallocate(f,
+ error = vfs_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
fput(f);
diff --git a/mm/memblock.c b/mm/memblock.c
index 6ecb0d937fb5..252b77bdf65e 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -715,16 +715,13 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
}
/**
- * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
- * @base: the base phys addr of the region
- * @size: the size of the region
*
- * This function isolates region [@base, @base + @size), and mark it with flag
- * MEMBLOCK_HOTPLUG.
+ * This function isolates region [@base, @base + @size), and sets/clears flag
*
* Return 0 on succees, -errno on failure.
*/
-int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
+static int __init_memblock memblock_setclr_flag(phys_addr_t base,
+ phys_addr_t size, int set, int flag)
{
struct memblock_type *type = &memblock.memory;
int i, ret, start_rgn, end_rgn;
@@ -734,37 +731,37 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
return ret;
for (i = start_rgn; i < end_rgn; i++)
- memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG);
+ if (set)
+ memblock_set_region_flags(&type->regions[i], flag);
+ else
+ memblock_clear_region_flags(&type->regions[i], flag);
memblock_merge_regions(type);
return 0;
}
/**
- * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
+ * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
* @base: the base phys addr of the region
* @size: the size of the region
*
- * This function isolates region [@base, @base + @size), and clear flag
- * MEMBLOCK_HOTPLUG for the isolated regions.
+ * Return 0 on succees, -errno on failure.
+ */
+int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG);
+}
+
+/**
+ * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
*
* Return 0 on succees, -errno on failure.
*/
int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
{
- struct memblock_type *type = &memblock.memory;
- int i, ret, start_rgn, end_rgn;
-
- ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
- if (ret)
- return ret;
-
- for (i = start_rgn; i < end_rgn; i++)
- memblock_clear_region_flags(&type->regions[i],
- MEMBLOCK_HOTPLUG);
-
- memblock_merge_regions(type);
- return 0;
+ return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG);
}
/**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d6ac0e33e150..ef91e856c7e4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -25,7 +25,7 @@
* GNU General Public License for more details.
*/
-#include <linux/res_counter.h>
+#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
@@ -51,7 +51,7 @@
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
-#include <linux/page_cgroup.h>
+#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
@@ -143,14 +143,8 @@ struct mem_cgroup_stat_cpu {
unsigned long targets[MEM_CGROUP_NTARGETS];
};
-struct mem_cgroup_reclaim_iter {
- /*
- * last scanned hierarchy member. Valid only if last_dead_count
- * matches memcg->dead_count of the hierarchy root group.
- */
- struct mem_cgroup *last_visited;
- int last_dead_count;
-
+struct reclaim_iter {
+ struct mem_cgroup *position;
/* scan generation, increased every round-trip */
unsigned int generation;
};
@@ -162,10 +156,10 @@ struct mem_cgroup_per_zone {
struct lruvec lruvec;
unsigned long lru_size[NR_LRU_LISTS];
- struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
+ struct reclaim_iter iter[DEF_PRIORITY + 1];
struct rb_node tree_node; /* RB tree node */
- unsigned long long usage_in_excess;/* Set to the value by which */
+ unsigned long usage_in_excess;/* Set to the value by which */
/* the soft limit is exceeded*/
bool on_tree;
struct mem_cgroup *memcg; /* Back pointer, we cannot */
@@ -198,7 +192,7 @@ static struct mem_cgroup_tree soft_limit_tree __read_mostly;
struct mem_cgroup_threshold {
struct eventfd_ctx *eventfd;
- u64 threshold;
+ unsigned long threshold;
};
/* For threshold */
@@ -284,10 +278,13 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
*/
struct mem_cgroup {
struct cgroup_subsys_state css;
- /*
- * the counter to account for memory usage
- */
- struct res_counter res;
+
+ /* Accounted resources */
+ struct page_counter memory;
+ struct page_counter memsw;
+ struct page_counter kmem;
+
+ unsigned long soft_limit;
/* vmpressure notifications */
struct vmpressure vmpressure;
@@ -296,19 +293,9 @@ struct mem_cgroup {
int initialized;
/*
- * the counter to account for mem+swap usage.
- */
- struct res_counter memsw;
-
- /*
- * the counter to account for kernel memory usage.
- */
- struct res_counter kmem;
- /*
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
- unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
bool oom_lock;
atomic_t under_oom;
@@ -352,7 +339,6 @@ struct mem_cgroup {
struct mem_cgroup_stat_cpu nocpu_base;
spinlock_t pcp_counter_lock;
- atomic_t dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
struct cg_proto tcp_mem;
#endif
@@ -379,38 +365,10 @@ struct mem_cgroup {
/* WARNING: nodeinfo must be the last member here */
};
-/* internal only representation about the status of kmem accounting. */
-enum {
- KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
- KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
-};
-
#ifdef CONFIG_MEMCG_KMEM
-static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
-{
- set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
-}
-
static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
{
- return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
-}
-
-static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
-{
- /*
- * Our caller must use css_get() first, because memcg_uncharge_kmem()
- * will call css_put() if it sees the memcg is dead.
- */
- smp_wmb();
- if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
- set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
-}
-
-static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
-{
- return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
- &memcg->kmem_account_flags);
+ return memcg->kmemcg_id >= 0;
}
#endif
@@ -650,7 +608,7 @@ static void disarm_kmem_keys(struct mem_cgroup *memcg)
* This check can't live in kmem destruction function,
* since the charges will outlive the cgroup
*/
- WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
+ WARN_ON(page_counter_read(&memcg->kmem));
}
#else
static void disarm_kmem_keys(struct mem_cgroup *memcg)
@@ -664,8 +622,6 @@ static void disarm_static_keys(struct mem_cgroup *memcg)
disarm_kmem_keys(memcg);
}
-static void drain_all_stock_async(struct mem_cgroup *memcg);
-
static struct mem_cgroup_per_zone *
mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
{
@@ -706,7 +662,7 @@ soft_limit_tree_from_page(struct page *page)
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
struct mem_cgroup_tree_per_zone *mctz,
- unsigned long long new_usage_in_excess)
+ unsigned long new_usage_in_excess)
{
struct rb_node **p = &mctz->rb_root.rb_node;
struct rb_node *parent = NULL;
@@ -755,10 +711,21 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
spin_unlock_irqrestore(&mctz->lock, flags);
}
+static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
+{
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
+ unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit);
+ unsigned long excess = 0;
+
+ if (nr_pages > soft_limit)
+ excess = nr_pages - soft_limit;
+
+ return excess;
+}
static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
- unsigned long long excess;
+ unsigned long excess;
struct mem_cgroup_per_zone *mz;
struct mem_cgroup_tree_per_zone *mctz;
@@ -769,7 +736,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
*/
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
mz = mem_cgroup_page_zoneinfo(memcg, page);
- excess = res_counter_soft_limit_excess(&memcg->res);
+ excess = soft_limit_excess(memcg);
/*
* We have to update the tree if mz is on RB-tree or
* mem is over its softlimit.
@@ -825,7 +792,7 @@ retry:
* position in the tree.
*/
__mem_cgroup_remove_exceeded(mz, mctz);
- if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+ if (!soft_limit_excess(mz->memcg) ||
!css_tryget_online(&mz->memcg->css))
goto retry;
done:
@@ -1062,122 +1029,6 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
return memcg;
}
-/*
- * Returns a next (in a pre-order walk) alive memcg (with elevated css
- * ref. count) or NULL if the whole root's subtree has been visited.
- *
- * helper function to be used by mem_cgroup_iter
- */
-static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
- struct mem_cgroup *last_visited)
-{
- struct cgroup_subsys_state *prev_css, *next_css;
-
- prev_css = last_visited ? &last_visited->css : NULL;
-skip_node:
- next_css = css_next_descendant_pre(prev_css, &root->css);
-
- /*
- * Even if we found a group we have to make sure it is
- * alive. css && !memcg means that the groups should be
- * skipped and we should continue the tree walk.
- * last_visited css is safe to use because it is
- * protected by css_get and the tree walk is rcu safe.
- *
- * We do not take a reference on the root of the tree walk
- * because we might race with the root removal when it would
- * be the only node in the iterated hierarchy and mem_cgroup_iter
- * would end up in an endless loop because it expects that at
- * least one valid node will be returned. Root cannot disappear
- * because caller of the iterator should hold it already so
- * skipping css reference should be safe.
- */
- if (next_css) {
- struct mem_cgroup *memcg = mem_cgroup_from_css(next_css);
-
- if (next_css == &root->css)
- return memcg;
-
- if (css_tryget_online(next_css)) {
- /*
- * Make sure the memcg is initialized:
- * mem_cgroup_css_online() orders the the
- * initialization against setting the flag.
- */
- if (smp_load_acquire(&memcg->initialized))
- return memcg;
- css_put(next_css);
- }
-
- prev_css = next_css;
- goto skip_node;
- }
-
- return NULL;
-}
-
-static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
-{
- /*
- * When a group in the hierarchy below root is destroyed, the
- * hierarchy iterator can no longer be trusted since it might
- * have pointed to the destroyed group. Invalidate it.
- */
- atomic_inc(&root->dead_count);
-}
-
-static struct mem_cgroup *
-mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
- struct mem_cgroup *root,
- int *sequence)
-{
- struct mem_cgroup *position = NULL;
- /*
- * A cgroup destruction happens in two stages: offlining and
- * release. They are separated by a RCU grace period.
- *
- * If the iterator is valid, we may still race with an
- * offlining. The RCU lock ensures the object won't be
- * released, tryget will fail if we lost the race.
- */
- *sequence = atomic_read(&root->dead_count);
- if (iter->last_dead_count == *sequence) {
- smp_rmb();
- position = iter->last_visited;
-
- /*
- * We cannot take a reference to root because we might race
- * with root removal and returning NULL would end up in
- * an endless loop on the iterator user level when root
- * would be returned all the time.
- */
- if (position && position != root &&
- !css_tryget_online(&position->css))
- position = NULL;
- }
- return position;
-}
-
-static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
- struct mem_cgroup *last_visited,
- struct mem_cgroup *new_position,
- struct mem_cgroup *root,
- int sequence)
-{
- /* root reference counting symmetric to mem_cgroup_iter_load */
- if (last_visited && last_visited != root)
- css_put(&last_visited->css);
- /*
- * We store the sequence count from the time @last_visited was
- * loaded successfully instead of rereading it here so that we
- * don't lose destruction events in between. We could have
- * raced with the destruction of @new_position after all.
- */
- iter->last_visited = new_position;
- smp_wmb();
- iter->last_dead_count = sequence;
-}
-
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
@@ -1199,8 +1050,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim)
{
+ struct reclaim_iter *uninitialized_var(iter);
+ struct cgroup_subsys_state *css = NULL;
struct mem_cgroup *memcg = NULL;
- struct mem_cgroup *last_visited = NULL;
+ struct mem_cgroup *pos = NULL;
if (mem_cgroup_disabled())
return NULL;
@@ -1209,50 +1062,101 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
root = root_mem_cgroup;
if (prev && !reclaim)
- last_visited = prev;
+ pos = prev;
if (!root->use_hierarchy && root != root_mem_cgroup) {
if (prev)
- goto out_css_put;
+ goto out;
return root;
}
rcu_read_lock();
- while (!memcg) {
- struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
- int uninitialized_var(seq);
-
- if (reclaim) {
- struct mem_cgroup_per_zone *mz;
-
- mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
- iter = &mz->reclaim_iter[reclaim->priority];
- if (prev && reclaim->generation != iter->generation) {
- iter->last_visited = NULL;
- goto out_unlock;
- }
- last_visited = mem_cgroup_iter_load(iter, root, &seq);
+ if (reclaim) {
+ struct mem_cgroup_per_zone *mz;
+
+ mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
+ iter = &mz->iter[reclaim->priority];
+
+ if (prev && reclaim->generation != iter->generation)
+ goto out_unlock;
+
+ do {
+ pos = ACCESS_ONCE(iter->position);
+ /*
+ * A racing update may change the position and
+ * put the last reference, hence css_tryget(),
+ * or retry to see the updated position.
+ */
+ } while (pos && !css_tryget(&pos->css));
+ }
+
+ if (pos)
+ css = &pos->css;
+
+ for (;;) {
+ css = css_next_descendant_pre(css, &root->css);
+ if (!css) {
+ /*
+ * Reclaimers share the hierarchy walk, and a
+ * new one might jump in right at the end of
+ * the hierarchy - make sure they see at least
+ * one group and restart from the beginning.
+ */
+ if (!prev)
+ continue;
+ break;
}
- memcg = __mem_cgroup_iter_next(root, last_visited);
+ /*
+ * Verify the css and acquire a reference. The root
+ * is provided by the caller, so we know it's alive
+ * and kicking, and don't take an extra reference.
+ */
+ memcg = mem_cgroup_from_css(css);
+
+ if (css == &root->css)
+ break;
+
+ if (css_tryget(css)) {
+ /*
+ * Make sure the memcg is initialized:
+ * mem_cgroup_css_online() orders the the
+ * initialization against setting the flag.
+ */
+ if (smp_load_acquire(&memcg->initialized))
+ break;
+
+ css_put(css);
+ }
- if (reclaim) {
- mem_cgroup_iter_update(iter, last_visited, memcg, root,
- seq);
+ memcg = NULL;
+ }
- if (!memcg)
- iter->generation++;
- else if (!prev && memcg)
- reclaim->generation = iter->generation;
+ if (reclaim) {
+ if (cmpxchg(&iter->position, pos, memcg) == pos) {
+ if (memcg)
+ css_get(&memcg->css);
+ if (pos)
+ css_put(&pos->css);
}
- if (prev && !memcg)
- goto out_unlock;
+ /*
+ * pairs with css_tryget when dereferencing iter->position
+ * above.
+ */
+ if (pos)
+ css_put(&pos->css);
+
+ if (!memcg)
+ iter->generation++;
+ else if (!prev)
+ reclaim->generation = iter->generation;
}
+
out_unlock:
rcu_read_unlock();
-out_css_put:
+out:
if (prev && prev != root)
css_put(&prev->css);
@@ -1346,15 +1250,18 @@ out:
}
/**
- * mem_cgroup_page_lruvec - return lruvec for adding an lru page
+ * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
* @page: the page
* @zone: zone of the page
+ *
+ * This function is only safe when following the LRU page isolation
+ * and putback protocol: the LRU lock must be held, and the page must
+ * either be PageLRU() or the caller must have isolated/allocated it.
*/
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
{
struct mem_cgroup_per_zone *mz;
struct mem_cgroup *memcg;
- struct page_cgroup *pc;
struct lruvec *lruvec;
if (mem_cgroup_disabled()) {
@@ -1362,20 +1269,13 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
goto out;
}
- pc = lookup_page_cgroup(page);
- memcg = pc->mem_cgroup;
-
+ memcg = page->mem_cgroup;
/*
- * Surreptitiously switch any uncharged offlist page to root:
- * an uncharged page off lru does nothing to secure
- * its former mem_cgroup from sudden removal.
- *
- * Our caller holds lru_lock, and PageCgroupUsed is updated
- * under page_cgroup lock: between them, they make all uses
- * of pc->mem_cgroup safe.
+ * Swapcache readahead pages are added to the LRU - and
+ * possibly migrated - before they are charged.
*/
- if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
- pc->mem_cgroup = memcg = root_mem_cgroup;
+ if (!memcg)
+ memcg = root_mem_cgroup;
mz = mem_cgroup_page_zoneinfo(memcg, page);
lruvec = &mz->lruvec;
@@ -1414,41 +1314,24 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
VM_BUG_ON((long)(*lru_size) < 0);
}
-/*
- * Checks whether given mem is same or in the root_mem_cgroup's
- * hierarchy subtree
- */
-bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
- struct mem_cgroup *memcg)
+bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
{
- if (root_memcg == memcg)
+ if (root == memcg)
return true;
- if (!root_memcg->use_hierarchy || !memcg)
+ if (!root->use_hierarchy)
return false;
- return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
-}
-
-static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
- struct mem_cgroup *memcg)
-{
- bool ret;
-
- rcu_read_lock();
- ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
- rcu_read_unlock();
- return ret;
+ return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
}
-bool task_in_mem_cgroup(struct task_struct *task,
- const struct mem_cgroup *memcg)
+bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
{
- struct mem_cgroup *curr = NULL;
+ struct mem_cgroup *task_memcg;
struct task_struct *p;
bool ret;
p = find_lock_task_mm(task);
if (p) {
- curr = get_mem_cgroup_from_mm(p->mm);
+ task_memcg = get_mem_cgroup_from_mm(p->mm);
task_unlock(p);
} else {
/*
@@ -1457,19 +1340,12 @@ bool task_in_mem_cgroup(struct task_struct *task,
* killed to prevent needlessly killing additional tasks.
*/
rcu_read_lock();
- curr = mem_cgroup_from_task(task);
- if (curr)
- css_get(&curr->css);
+ task_memcg = mem_cgroup_from_task(task);
+ css_get(&task_memcg->css);
rcu_read_unlock();
}
- /*
- * We should check use_hierarchy of "memcg" not "curr". Because checking
- * use_hierarchy of "curr" here make this function true if hierarchy is
- * enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
- * hierarchy(even if use_hierarchy is disabled in "memcg").
- */
- ret = mem_cgroup_same_or_subtree(memcg, curr);
- css_put(&curr->css);
+ ret = mem_cgroup_is_descendant(task_memcg, memcg);
+ css_put(&task_memcg->css);
return ret;
}
@@ -1492,7 +1368,7 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
return inactive * inactive_ratio < active;
}
-#define mem_cgroup_from_res_counter(counter, member) \
+#define mem_cgroup_from_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
/**
@@ -1504,12 +1380,23 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
*/
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
- unsigned long long margin;
+ unsigned long margin = 0;
+ unsigned long count;
+ unsigned long limit;
- margin = res_counter_margin(&memcg->res);
- if (do_swap_account)
- margin = min(margin, res_counter_margin(&memcg->memsw));
- return margin >> PAGE_SHIFT;
+ count = page_counter_read(&memcg->memory);
+ limit = ACCESS_ONCE(memcg->memory.limit);
+ if (count < limit)
+ margin = limit - count;
+
+ if (do_swap_account) {
+ count = page_counter_read(&memcg->memsw);
+ limit = ACCESS_ONCE(memcg->memsw.limit);
+ if (count <= limit)
+ margin = min(margin, limit - count);
+ }
+
+ return margin;
}
int mem_cgroup_swappiness(struct mem_cgroup *memcg)
@@ -1522,37 +1409,6 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
}
/*
- * memcg->moving_account is used for checking possibility that some thread is
- * calling move_account(). When a thread on CPU-A starts moving pages under
- * a memcg, other threads should check memcg->moving_account under
- * rcu_read_lock(), like this:
- *
- * CPU-A CPU-B
- * rcu_read_lock()
- * memcg->moving_account+1 if (memcg->mocing_account)
- * take heavy locks.
- * synchronize_rcu() update something.
- * rcu_read_unlock()
- * start move here.
- */
-
-static void mem_cgroup_start_move(struct mem_cgroup *memcg)
-{
- atomic_inc(&memcg->moving_account);
- synchronize_rcu();
-}
-
-static void mem_cgroup_end_move(struct mem_cgroup *memcg)
-{
- /*
- * Now, mem_cgroup_clear_mc() may call this function with NULL.
- * We check NULL in callee rather than caller.
- */
- if (memcg)
- atomic_dec(&memcg->moving_account);
-}
-
-/*
* A routine for checking "mem" is under move_account() or not.
*
* Checking a cgroup is mc.from or mc.to or under hierarchy of
@@ -1574,8 +1430,8 @@ static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
if (!from)
goto unlock;
- ret = mem_cgroup_same_or_subtree(memcg, from)
- || mem_cgroup_same_or_subtree(memcg, to);
+ ret = mem_cgroup_is_descendant(from, memcg) ||
+ mem_cgroup_is_descendant(to, memcg);
unlock:
spin_unlock(&mc.lock);
return ret;
@@ -1597,23 +1453,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
return false;
}
-/*
- * Take this lock when
- * - a code tries to modify page's memcg while it's USED.
- * - a code tries to modify page state accounting in a memcg.
- */
-static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
- unsigned long *flags)
-{
- spin_lock_irqsave(&memcg->move_lock, *flags);
-}
-
-static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
- unsigned long *flags)
-{
- spin_unlock_irqrestore(&memcg->move_lock, *flags);
-}
-
#define K(x) ((x) << (PAGE_SHIFT-10))
/**
* mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
@@ -1644,18 +1483,15 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
rcu_read_unlock();
- pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
- res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
- res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
- res_counter_read_u64(&memcg->res, RES_FAILCNT));
- pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
- res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
- res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
- res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
- pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
- res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
- res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
- res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
+ pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->memory)),
+ K((u64)memcg->memory.limit), memcg->memory.failcnt);
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->memsw)),
+ K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->kmem)),
+ K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
for_each_mem_cgroup_tree(iter, memcg) {
pr_info("Memory cgroup stats for ");
@@ -1695,28 +1531,17 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
-static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
- u64 limit;
-
- limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+ unsigned long limit;
- /*
- * Do not consider swap space if we cannot swap due to swappiness
- */
+ limit = memcg->memory.limit;
if (mem_cgroup_swappiness(memcg)) {
- u64 memsw;
-
- limit += total_swap_pages << PAGE_SHIFT;
- memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+ unsigned long memsw_limit;
- /*
- * If memsw is finite and limits the amount of swap space
- * available to this memcg, return that limit.
- */
- limit = min(limit, memsw);
+ memsw_limit = memcg->memsw.limit;
+ limit = min(limit + total_swap_pages, memsw_limit);
}
-
return limit;
}
@@ -1734,13 +1559,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
* select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory.
*/
- if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
+ if (fatal_signal_pending(current) || task_will_free_mem(current)) {
set_thread_flag(TIF_MEMDIE);
return;
}
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
- totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
+ totalpages = mem_cgroup_get_limit(memcg) ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
struct task_struct *task;
@@ -1791,6 +1616,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
NULL, "Memory cgroup out of memory");
}
+#if MAX_NUMNODES > 1
+
/**
* test_mem_cgroup_node_reclaimable
* @memcg: the target memcg
@@ -1813,7 +1640,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
return false;
}
-#if MAX_NUMNODES > 1
/*
* Always updating the nodemask is not very good - even if we have an empty
@@ -1880,52 +1706,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
memcg->last_scanned_node = node;
return node;
}
-
-/*
- * Check all nodes whether it contains reclaimable pages or not.
- * For quick scan, we make use of scan_nodes. This will allow us to skip
- * unused nodes. But scan_nodes is lazily updated and may not cotain
- * enough new information. We need to do double check.
- */
-static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
-{
- int nid;
-
- /*
- * quick check...making use of scan_node.
- * We can skip unused nodes.
- */
- if (!nodes_empty(memcg->scan_nodes)) {
- for (nid = first_node(memcg->scan_nodes);
- nid < MAX_NUMNODES;
- nid = next_node(nid, memcg->scan_nodes)) {
-
- if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
- return true;
- }
- }
- /*
- * Check rest of nodes.
- */
- for_each_node_state(nid, N_MEMORY) {
- if (node_isset(nid, memcg->scan_nodes))
- continue;
- if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
- return true;
- }
- return false;
-}
-
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
return 0;
}
-
-static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
-{
- return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
-}
#endif
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
@@ -1943,7 +1728,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
.priority = 0,
};
- excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
+ excess = soft_limit_excess(root_memcg);
while (1) {
victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
@@ -1969,12 +1754,10 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
}
continue;
}
- if (!mem_cgroup_reclaimable(victim, false))
- continue;
total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
zone, &nr_scanned);
*total_scanned += nr_scanned;
- if (!res_counter_soft_limit_excess(&root_memcg->res))
+ if (!soft_limit_excess(root_memcg))
break;
}
mem_cgroup_iter_break(root_memcg, victim);
@@ -2081,12 +1864,8 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
oom_wait_info = container_of(wait, struct oom_wait_info, wait);
oom_wait_memcg = oom_wait_info->memcg;
- /*
- * Both of oom_wait_info->memcg and wake_memcg are stable under us.
- * Then we can use css_is_ancestor without taking care of RCU.
- */
- if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
- && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
+ if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
+ !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
return 0;
return autoremove_wake_function(wait, mode, sync, arg);
}
@@ -2228,26 +2007,23 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
unsigned long *flags)
{
struct mem_cgroup *memcg;
- struct page_cgroup *pc;
rcu_read_lock();
if (mem_cgroup_disabled())
return NULL;
-
- pc = lookup_page_cgroup(page);
again:
- memcg = pc->mem_cgroup;
- if (unlikely(!memcg || !PageCgroupUsed(pc)))
+ memcg = page->mem_cgroup;
+ if (unlikely(!memcg))
return NULL;
*locked = false;
if (atomic_read(&memcg->moving_account) <= 0)
return memcg;
- move_lock_mem_cgroup(memcg, flags);
- if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
- move_unlock_mem_cgroup(memcg, flags);
+ spin_lock_irqsave(&memcg->move_lock, *flags);
+ if (memcg != page->mem_cgroup) {
+ spin_unlock_irqrestore(&memcg->move_lock, *flags);
goto again;
}
*locked = true;
@@ -2261,11 +2037,11 @@ again:
* @locked: value received from mem_cgroup_begin_page_stat()
* @flags: value received from mem_cgroup_begin_page_stat()
*/
-void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
- unsigned long flags)
+void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked,
+ unsigned long *flags)
{
- if (memcg && locked)
- move_unlock_mem_cgroup(memcg, &flags);
+ if (memcg && *locked)
+ spin_unlock_irqrestore(&memcg->move_lock, *flags);
rcu_read_unlock();
}
@@ -2316,33 +2092,32 @@ static DEFINE_MUTEX(percpu_charge_mutex);
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
- bool ret = true;
+ bool ret = false;
if (nr_pages > CHARGE_BATCH)
- return false;
+ return ret;
stock = &get_cpu_var(memcg_stock);
- if (memcg == stock->cached && stock->nr_pages >= nr_pages)
+ if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
stock->nr_pages -= nr_pages;
- else /* need to call res_counter_charge */
- ret = false;
+ ret = true;
+ }
put_cpu_var(memcg_stock);
return ret;
}
/*
- * Returns stocks cached in percpu to res_counter and reset cached information.
+ * Returns stocks cached in percpu and reset cached information.
*/
static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;
if (stock->nr_pages) {
- unsigned long bytes = stock->nr_pages * PAGE_SIZE;
-
- res_counter_uncharge(&old->res, bytes);
+ page_counter_uncharge(&old->memory, stock->nr_pages);
if (do_swap_account)
- res_counter_uncharge(&old->memsw, bytes);
+ page_counter_uncharge(&old->memsw, stock->nr_pages);
+ css_put_many(&old->css, stock->nr_pages);
stock->nr_pages = 0;
}
stock->cached = NULL;
@@ -2371,7 +2146,7 @@ static void __init memcg_stock_init(void)
}
/*
- * Cache charges(val) which is from res_counter, to local per_cpu area.
+ * Cache charges(val) to local per_cpu area.
* This will be consumed by consume_stock() function, later.
*/
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
@@ -2388,13 +2163,15 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
/*
* Drains all per-CPU charge caches for given root_memcg resp. subtree
- * of the hierarchy under it. sync flag says whether we should block
- * until the work is done.
+ * of the hierarchy under it.
*/
-static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
+static void drain_all_stock(struct mem_cgroup *root_memcg)
{
int cpu, curcpu;
+ /* If someone's already draining, avoid adding running more workers. */
+ if (!mutex_trylock(&percpu_charge_mutex))
+ return;
/* Notify other cpus that system-wide "drain" is running */
get_online_cpus();
curcpu = get_cpu();
@@ -2405,7 +2182,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
memcg = stock->cached;
if (!memcg || !stock->nr_pages)
continue;
- if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
+ if (!mem_cgroup_is_descendant(memcg, root_memcg))
continue;
if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
if (cpu == curcpu)
@@ -2415,42 +2192,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
}
}
put_cpu();
-
- if (!sync)
- goto out;
-
- for_each_online_cpu(cpu) {
- struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
- if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
- flush_work(&stock->work);
- }
-out:
put_online_cpus();
-}
-
-/*
- * Tries to drain stocked charges in other cpus. This function is asynchronous
- * and just put a work per cpu for draining localy on each cpu. Caller can
- * expects some charges will be back to res_counter later but cannot wait for
- * it.
- */
-static void drain_all_stock_async(struct mem_cgroup *root_memcg)
-{
- /*
- * If someone calls draining, avoid adding more kworker runs.
- */
- if (!mutex_trylock(&percpu_charge_mutex))
- return;
- drain_all_stock(root_memcg, false);
- mutex_unlock(&percpu_charge_mutex);
-}
-
-/* This is a synchronous drain interface. */
-static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
-{
- /* called when force_empty is called */
- mutex_lock(&percpu_charge_mutex);
- drain_all_stock(root_memcg, true);
mutex_unlock(&percpu_charge_mutex);
}
@@ -2506,9 +2248,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int batch = max(CHARGE_BATCH, nr_pages);
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
- struct res_counter *fail_res;
+ struct page_counter *counter;
unsigned long nr_reclaimed;
- unsigned long long size;
bool may_swap = true;
bool drained = false;
int ret = 0;
@@ -2519,16 +2260,15 @@ retry:
if (consume_stock(memcg, nr_pages))
goto done;
- size = batch * PAGE_SIZE;
if (!do_swap_account ||
- !res_counter_charge(&memcg->memsw, size, &fail_res)) {
- if (!res_counter_charge(&memcg->res, size, &fail_res))
+ !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
+ if (!page_counter_try_charge(&memcg->memory, batch, &counter))
goto done_restock;
if (do_swap_account)
- res_counter_uncharge(&memcg->memsw, size);
- mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+ page_counter_uncharge(&memcg->memsw, batch);
+ mem_over_limit = mem_cgroup_from_counter(counter, memory);
} else {
- mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+ mem_over_limit = mem_cgroup_from_counter(counter, memsw);
may_swap = false;
}
@@ -2561,7 +2301,7 @@ retry:
goto retry;
if (!drained) {
- drain_all_stock_async(mem_over_limit);
+ drain_all_stock(mem_over_limit);
drained = true;
goto retry;
}
@@ -2603,6 +2343,7 @@ bypass:
return -EINTR;
done_restock:
+ css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
done:
@@ -2611,32 +2352,14 @@ done:
static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
- unsigned long bytes = nr_pages * PAGE_SIZE;
-
if (mem_cgroup_is_root(memcg))
return;
- res_counter_uncharge(&memcg->res, bytes);
+ page_counter_uncharge(&memcg->memory, nr_pages);
if (do_swap_account)
- res_counter_uncharge(&memcg->memsw, bytes);
-}
-
-/*
- * Cancel chrages in this cgroup....doesn't propagate to parent cgroup.
- * This is useful when moving usage to parent cgroup.
- */
-static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
- unsigned int nr_pages)
-{
- unsigned long bytes = nr_pages * PAGE_SIZE;
+ page_counter_uncharge(&memcg->memsw, nr_pages);
- if (mem_cgroup_is_root(memcg))
- return;
-
- res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
- if (do_swap_account)
- res_counter_uncharge_until(&memcg->memsw,
- memcg->memsw.parent, bytes);
+ css_put_many(&memcg->css, nr_pages);
}
/*
@@ -2665,17 +2388,15 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
*/
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
- struct mem_cgroup *memcg = NULL;
- struct page_cgroup *pc;
+ struct mem_cgroup *memcg;
unsigned short id;
swp_entry_t ent;
VM_BUG_ON_PAGE(!PageLocked(page), page);
- pc = lookup_page_cgroup(page);
- if (PageCgroupUsed(pc)) {
- memcg = pc->mem_cgroup;
- if (memcg && !css_tryget_online(&memcg->css))
+ memcg = page->mem_cgroup;
+ if (memcg) {
+ if (!css_tryget_online(&memcg->css))
memcg = NULL;
} else if (PageSwapCache(page)) {
ent.val = page_private(page);
@@ -2723,14 +2444,9 @@ static void unlock_page_lru(struct page *page, int isolated)
static void commit_charge(struct page *page, struct mem_cgroup *memcg,
bool lrucare)
{
- struct page_cgroup *pc = lookup_page_cgroup(page);
int isolated;
- VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
- /*
- * we don't need page_cgroup_lock about tail pages, becase they are not
- * accessed by any other context at this point.
- */
+ VM_BUG_ON_PAGE(page->mem_cgroup, page);
/*
* In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
@@ -2741,7 +2457,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
/*
* Nobody should be changing or seriously looking at
- * pc->mem_cgroup and pc->flags at this point:
+ * page->mem_cgroup at this point:
*
* - the page is uncharged
*
@@ -2753,15 +2469,12 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
* - a page cache insertion, a swapin fault, or a migration
* have the page locked
*/
- pc->mem_cgroup = memcg;
- pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
+ page->mem_cgroup = memcg;
if (lrucare)
unlock_page_lru(page, isolated);
}
-static DEFINE_MUTEX(set_limit_mutex);
-
#ifdef CONFIG_MEMCG_KMEM
/*
* The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
@@ -2769,8 +2482,6 @@ static DEFINE_MUTEX(set_limit_mutex);
*/
static DEFINE_MUTEX(memcg_slab_mutex);
-static DEFINE_MUTEX(activate_kmem_mutex);
-
/*
* This is a bit cumbersome, but it is rarely used and avoids a backpointer
* in the memcg_cache_params struct.
@@ -2784,36 +2495,17 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
}
-#ifdef CONFIG_SLABINFO
-static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
-{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- struct memcg_cache_params *params;
-
- if (!memcg_kmem_is_active(memcg))
- return -EIO;
-
- print_slabinfo_header(m);
-
- mutex_lock(&memcg_slab_mutex);
- list_for_each_entry(params, &memcg->memcg_slab_caches, list)
- cache_show(memcg_params_to_cache(params), m);
- mutex_unlock(&memcg_slab_mutex);
-
- return 0;
-}
-#endif
-
-static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
+static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
+ unsigned long nr_pages)
{
- struct res_counter *fail_res;
+ struct page_counter *counter;
int ret = 0;
- ret = res_counter_charge(&memcg->kmem, size, &fail_res);
- if (ret)
+ ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
+ if (ret < 0)
return ret;
- ret = try_charge(memcg, gfp, size >> PAGE_SHIFT);
+ ret = try_charge(memcg, gfp, nr_pages);
if (ret == -EINTR) {
/*
* try_charge() chose to bypass to root due to OOM kill or
@@ -2830,37 +2522,27 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
* when the allocation triggers should have been already
* directed to the root cgroup in memcontrol.h
*/
- res_counter_charge_nofail(&memcg->res, size, &fail_res);
+ page_counter_charge(&memcg->memory, nr_pages);
if (do_swap_account)
- res_counter_charge_nofail(&memcg->memsw, size,
- &fail_res);
+ page_counter_charge(&memcg->memsw, nr_pages);
+ css_get_many(&memcg->css, nr_pages);
ret = 0;
} else if (ret)
- res_counter_uncharge(&memcg->kmem, size);
+ page_counter_uncharge(&memcg->kmem, nr_pages);
return ret;
}
-static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
+static void memcg_uncharge_kmem(struct mem_cgroup *memcg,
+ unsigned long nr_pages)
{
- res_counter_uncharge(&memcg->res, size);
+ page_counter_uncharge(&memcg->memory, nr_pages);
if (do_swap_account)
- res_counter_uncharge(&memcg->memsw, size);
+ page_counter_uncharge(&memcg->memsw, nr_pages);
- /* Not down to 0 */
- if (res_counter_uncharge(&memcg->kmem, size))
- return;
+ page_counter_uncharge(&memcg->kmem, nr_pages);
- /*
- * Releases a reference taken in kmem_cgroup_css_offline in case
- * this last uncharge is racing with the offlining code or it is
- * outliving the memcg existence.
- *
- * The memory barrier imposed by test&clear is paired with the
- * explicit one in memcg_kmem_mark_dead().
- */
- if (memcg_kmem_test_and_clear_dead(memcg))
- css_put(&memcg->css);
+ css_put_many(&memcg->css, nr_pages);
}
/*
@@ -2953,7 +2635,6 @@ static void memcg_register_cache(struct mem_cgroup *memcg,
if (!cachep)
return;
- css_get(&memcg->css);
list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
/*
@@ -2987,40 +2668,6 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
list_del(&cachep->memcg_params->list);
kmem_cache_destroy(cachep);
-
- /* drop the reference taken in memcg_register_cache */
- css_put(&memcg->css);
-}
-
-/*
- * During the creation a new cache, we need to disable our accounting mechanism
- * altogether. This is true even if we are not creating, but rather just
- * enqueing new caches to be created.
- *
- * This is because that process will trigger allocations; some visible, like
- * explicit kmallocs to auxiliary data structures, name strings and internal
- * cache structures; some well concealed, like INIT_WORK() that can allocate
- * objects during debug.
- *
- * If any allocation happens during memcg_kmem_get_cache, we will recurse back
- * to it. This may not be a bounded recursion: since the first cache creation
- * failed to complete (waiting on the allocation), we'll just try to create the
- * cache again, failing at the same point.
- *
- * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
- * memcg_kmem_skip_account. So we enclose anything that might allocate memory
- * inside the following two functions.
- */
-static inline void memcg_stop_kmem_account(void)
-{
- VM_BUG_ON(!current->mm);
- current->memcg_kmem_skip_account++;
-}
-
-static inline void memcg_resume_kmem_account(void)
-{
- VM_BUG_ON(!current->mm);
- current->memcg_kmem_skip_account--;
}
int __memcg_cleanup_cache_params(struct kmem_cache *s)
@@ -3054,9 +2701,7 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
mutex_lock(&memcg_slab_mutex);
list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
cachep = memcg_params_to_cache(params);
- kmem_cache_shrink(cachep);
- if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
- memcg_unregister_cache(cachep);
+ memcg_unregister_cache(cachep);
}
mutex_unlock(&memcg_slab_mutex);
}
@@ -3091,10 +2736,10 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
struct memcg_register_cache_work *cw;
cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
- if (cw == NULL) {
- css_put(&memcg->css);
+ if (!cw)
return;
- }
+
+ css_get(&memcg->css);
cw->memcg = memcg;
cw->cachep = cachep;
@@ -3117,26 +2762,23 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
* this point we can't allow ourselves back into memcg_kmem_get_cache,
* the safest choice is to do it like this, wrapping the whole function.
*/
- memcg_stop_kmem_account();
+ current->memcg_kmem_skip_account = 1;
__memcg_schedule_register_cache(memcg, cachep);
- memcg_resume_kmem_account();
+ current->memcg_kmem_skip_account = 0;
}
int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
{
- int res;
+ unsigned int nr_pages = 1 << order;
- res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp,
- PAGE_SIZE << order);
- if (!res)
- atomic_add(1 << order, &cachep->memcg_params->nr_pages);
- return res;
+ return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
}
void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
{
- memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order);
- atomic_sub(1 << order, &cachep->memcg_params->nr_pages);
+ unsigned int nr_pages = 1 << order;
+
+ memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
}
/*
@@ -3152,8 +2794,7 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
* Can't be called in interrupt context or from kernel threads.
* This function needs to be called with rcu_read_lock() held.
*/
-struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
- gfp_t gfp)
+struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
{
struct mem_cgroup *memcg;
struct kmem_cache *memcg_cachep;
@@ -3161,25 +2802,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
VM_BUG_ON(!cachep->memcg_params);
VM_BUG_ON(!cachep->memcg_params->is_root_cache);
- if (!current->mm || current->memcg_kmem_skip_account)
+ if (current->memcg_kmem_skip_account)
return cachep;
- rcu_read_lock();
- memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
-
+ memcg = get_mem_cgroup_from_mm(current->mm);
if (!memcg_kmem_is_active(memcg))
goto out;
memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
- if (likely(memcg_cachep)) {
- cachep = memcg_cachep;
- goto out;
- }
-
- /* The corresponding put will be done in the workqueue. */
- if (!css_tryget_online(&memcg->css))
- goto out;
- rcu_read_unlock();
+ if (likely(memcg_cachep))
+ return memcg_cachep;
/*
* If we are in a safe context (can wait, and not in interrupt
@@ -3194,12 +2826,17 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
* defer everything.
*/
memcg_schedule_register_cache(memcg, cachep);
- return cachep;
out:
- rcu_read_unlock();
+ css_put(&memcg->css);
return cachep;
}
+void __memcg_kmem_put_cache(struct kmem_cache *cachep)
+{
+ if (!is_root_cache(cachep))
+ css_put(&cachep->memcg_params->memcg->css);
+}
+
/*
* We need to verify if the allocation against current->mm->owner's memcg is
* possible for the given order. But the page is not allocated yet, so we'll
@@ -3222,34 +2859,6 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
*_memcg = NULL;
- /*
- * Disabling accounting is only relevant for some specific memcg
- * internal allocations. Therefore we would initially not have such
- * check here, since direct calls to the page allocator that are
- * accounted to kmemcg (alloc_kmem_pages and friends) only happen
- * outside memcg core. We are mostly concerned with cache allocations,
- * and by having this test at memcg_kmem_get_cache, we are already able
- * to relay the allocation to the root cache and bypass the memcg cache
- * altogether.
- *
- * There is one exception, though: the SLUB allocator does not create
- * large order caches, but rather service large kmallocs directly from
- * the page allocator. Therefore, the following sequence when backed by
- * the SLUB allocator:
- *
- * memcg_stop_kmem_account();
- * kmalloc(<large_number>)
- * memcg_resume_kmem_account();
- *
- * would effectively ignore the fact that we should skip accounting,
- * since it will drive us directly to this function without passing
- * through the cache selector memcg_kmem_get_cache. Such large
- * allocations are extremely rare but can happen, for instance, for the
- * cache arrays. We bring this test here.
- */
- if (!current->mm || current->memcg_kmem_skip_account)
- return true;
-
memcg = get_mem_cgroup_from_mm(current->mm);
if (!memcg_kmem_is_active(memcg)) {
@@ -3257,7 +2866,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
return true;
}
- ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
+ ret = memcg_charge_kmem(memcg, gfp, 1 << order);
if (!ret)
*_memcg = memcg;
@@ -3268,50 +2877,27 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
int order)
{
- struct page_cgroup *pc;
-
VM_BUG_ON(mem_cgroup_is_root(memcg));
/* The page allocation failed. Revert */
if (!page) {
- memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
+ memcg_uncharge_kmem(memcg, 1 << order);
return;
}
- /*
- * The page is freshly allocated and not visible to any
- * outside callers yet. Set up pc non-atomically.
- */
- pc = lookup_page_cgroup(page);
- pc->mem_cgroup = memcg;
- pc->flags = PCG_USED;
+ page->mem_cgroup = memcg;
}
void __memcg_kmem_uncharge_pages(struct page *page, int order)
{
- struct mem_cgroup *memcg = NULL;
- struct page_cgroup *pc;
-
-
- pc = lookup_page_cgroup(page);
- if (!PageCgroupUsed(pc))
- return;
-
- memcg = pc->mem_cgroup;
- pc->flags = 0;
+ struct mem_cgroup *memcg = page->mem_cgroup;
- /*
- * We trust that only if there is a memcg associated with the page, it
- * is a valid allocation
- */
if (!memcg)
return;
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
- memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
-}
-#else
-static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
-{
+
+ memcg_uncharge_kmem(memcg, 1 << order);
+ page->mem_cgroup = NULL;
}
#endif /* CONFIG_MEMCG_KMEM */
@@ -3325,21 +2911,15 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
- struct page_cgroup *head_pc = lookup_page_cgroup(head);
- struct page_cgroup *pc;
- struct mem_cgroup *memcg;
int i;
if (mem_cgroup_disabled())
return;
- memcg = head_pc->mem_cgroup;
- for (i = 1; i < HPAGE_PMD_NR; i++) {
- pc = head_pc + i;
- pc->mem_cgroup = memcg;
- pc->flags = head_pc->flags;
- }
- __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
+ for (i = 1; i < HPAGE_PMD_NR; i++)
+ head[i].mem_cgroup = head->mem_cgroup;
+
+ __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
HPAGE_PMD_NR);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -3348,7 +2928,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
* mem_cgroup_move_account - move account of the page
* @page: the page
* @nr_pages: number of regular pages (>1 for huge pages)
- * @pc: page_cgroup of the page.
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
*
@@ -3361,7 +2940,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
*/
static int mem_cgroup_move_account(struct page *page,
unsigned int nr_pages,
- struct page_cgroup *pc,
struct mem_cgroup *from,
struct mem_cgroup *to)
{
@@ -3381,7 +2959,7 @@ static int mem_cgroup_move_account(struct page *page,
goto out;
/*
- * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup
+ * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
* of its source page while we change it: page migration takes
* both pages off the LRU, but page cache replacement doesn't.
*/
@@ -3389,10 +2967,10 @@ static int mem_cgroup_move_account(struct page *page,
goto out;
ret = -EINVAL;
- if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
+ if (page->mem_cgroup != from)
goto out_unlock;
- move_lock_mem_cgroup(from, &flags);
+ spin_lock_irqsave(&from->move_lock, flags);
if (!PageAnon(page) && page_mapped(page)) {
__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
@@ -3409,14 +2987,15 @@ static int mem_cgroup_move_account(struct page *page,
}
/*
- * It is safe to change pc->mem_cgroup here because the page
+ * It is safe to change page->mem_cgroup here because the page
* is referenced, charged, and isolated - we can't race with
* uncharging, charging, migration, or LRU putback.
*/
/* caller should have done css_get */
- pc->mem_cgroup = to;
- move_unlock_mem_cgroup(from, &flags);
+ page->mem_cgroup = to;
+ spin_unlock_irqrestore(&from->move_lock, flags);
+
ret = 0;
local_irq_disable();
@@ -3431,72 +3010,6 @@ out:
return ret;
}
-/**
- * mem_cgroup_move_parent - moves page to the parent group
- * @page: the page to move
- * @pc: page_cgroup of the page
- * @child: page's cgroup
- *
- * move charges to its parent or the root cgroup if the group has no
- * parent (aka use_hierarchy==0).
- * Although this might fail (get_page_unless_zero, isolate_lru_page or
- * mem_cgroup_move_account fails) the failure is always temporary and
- * it signals a race with a page removal/uncharge or migration. In the
- * first case the page is on the way out and it will vanish from the LRU
- * on the next attempt and the call should be retried later.
- * Isolation from the LRU fails only if page has been isolated from
- * the LRU since we looked at it and that usually means either global
- * reclaim or migration going on. The page will either get back to the
- * LRU or vanish.
- * Finaly mem_cgroup_move_account fails only if the page got uncharged
- * (!PageCgroupUsed) or moved to a different group. The page will
- * disappear in the next attempt.
- */
-static int mem_cgroup_move_parent(struct page *page,
- struct page_cgroup *pc,
- struct mem_cgroup *child)
-{
- struct mem_cgroup *parent;
- unsigned int nr_pages;
- unsigned long uninitialized_var(flags);
- int ret;
-
- VM_BUG_ON(mem_cgroup_is_root(child));
-
- ret = -EBUSY;
- if (!get_page_unless_zero(page))
- goto out;
- if (isolate_lru_page(page))
- goto put;
-
- nr_pages = hpage_nr_pages(page);
-
- parent = parent_mem_cgroup(child);
- /*
- * If no parent, move charges to root cgroup.
- */
- if (!parent)
- parent = root_mem_cgroup;
-
- if (nr_pages > 1) {
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- flags = compound_lock_irqsave(page);
- }
-
- ret = mem_cgroup_move_account(page, nr_pages,
- pc, child, parent);
- if (!ret)
- __mem_cgroup_cancel_local_charge(child, nr_pages);
-
- if (nr_pages > 1)
- compound_unlock_irqrestore(page, flags);
- putback_lru_page(page);
-put:
- put_page(page);
-out:
- return ret;
-}
-
#ifdef CONFIG_MEMCG_SWAP
static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
bool charge)
@@ -3516,7 +3029,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
*
* Returns 0 on success, -EINVAL on failure.
*
- * The caller must have charged to @to, IOW, called res_counter_charge() about
+ * The caller must have charged to @to, IOW, called page_counter_charge() about
* both res and memsw, and called css_get().
*/
static int mem_cgroup_move_swap_account(swp_entry_t entry,
@@ -3532,7 +3045,7 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
mem_cgroup_swap_statistics(to, true);
/*
* This function is only called from task migration context now.
- * It postpones res_counter and refcount handling till the end
+ * It postpones page_counter and refcount handling till the end
* of task migration(mem_cgroup_clear_mc()) for performance
* improvement. But we cannot postpone css_get(to) because if
* the process that has been moved to @to does swap-in, the
@@ -3554,96 +3067,57 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
}
#endif
-#ifdef CONFIG_DEBUG_VM
-static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
-{
- struct page_cgroup *pc;
-
- pc = lookup_page_cgroup(page);
- /*
- * Can be NULL while feeding pages into the page allocator for
- * the first time, i.e. during boot or memory hotplug;
- * or when mem_cgroup_disabled().
- */
- if (likely(pc) && PageCgroupUsed(pc))
- return pc;
- return NULL;
-}
-
-bool mem_cgroup_bad_page_check(struct page *page)
-{
- if (mem_cgroup_disabled())
- return false;
-
- return lookup_page_cgroup_used(page) != NULL;
-}
-
-void mem_cgroup_print_bad_page(struct page *page)
-{
- struct page_cgroup *pc;
-
- pc = lookup_page_cgroup_used(page);
- if (pc) {
- pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
- pc, pc->flags, pc->mem_cgroup);
- }
-}
-#endif
+static DEFINE_MUTEX(memcg_limit_mutex);
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
- unsigned long long val)
+ unsigned long limit)
{
+ unsigned long curusage;
+ unsigned long oldusage;
+ bool enlarge = false;
int retry_count;
- int ret = 0;
- int children = mem_cgroup_count_children(memcg);
- u64 curusage, oldusage;
- int enlarge;
+ int ret;
/*
* For keeping hierarchical_reclaim simple, how long we should retry
* is depends on callers. We set our retry-count to be function
* of # of children which we should visit in this loop.
*/
- retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
+ retry_count = MEM_CGROUP_RECLAIM_RETRIES *
+ mem_cgroup_count_children(memcg);
- oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
+ oldusage = page_counter_read(&memcg->memory);
- enlarge = 0;
- while (retry_count) {
+ do {
if (signal_pending(current)) {
ret = -EINTR;
break;
}
- /*
- * Rather than hide all in some function, I do this in
- * open coded manner. You see what this really does.
- * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
- */
- mutex_lock(&set_limit_mutex);
- if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) {
+
+ mutex_lock(&memcg_limit_mutex);
+ if (limit > memcg->memsw.limit) {
+ mutex_unlock(&memcg_limit_mutex);
ret = -EINVAL;
- mutex_unlock(&set_limit_mutex);
break;
}
-
- if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val)
- enlarge = 1;
-
- ret = res_counter_set_limit(&memcg->res, val);
- mutex_unlock(&set_limit_mutex);
+ if (limit > memcg->memory.limit)
+ enlarge = true;
+ ret = page_counter_limit(&memcg->memory, limit);
+ mutex_unlock(&memcg_limit_mutex);
if (!ret)
break;
try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
- curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
+ curusage = page_counter_read(&memcg->memory);
/* Usage is reduced ? */
if (curusage >= oldusage)
retry_count--;
else
oldusage = curusage;
- }
+ } while (retry_count);
+
if (!ret && enlarge)
memcg_oom_recover(memcg);
@@ -3651,52 +3125,53 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
}
static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
- unsigned long long val)
+ unsigned long limit)
{
+ unsigned long curusage;
+ unsigned long oldusage;
+ bool enlarge = false;
int retry_count;
- u64 oldusage, curusage;
- int children = mem_cgroup_count_children(memcg);
- int ret = -EBUSY;
- int enlarge = 0;
+ int ret;
/* see mem_cgroup_resize_res_limit */
- retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
- oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
- while (retry_count) {
+ retry_count = MEM_CGROUP_RECLAIM_RETRIES *
+ mem_cgroup_count_children(memcg);
+
+ oldusage = page_counter_read(&memcg->memsw);
+
+ do {
if (signal_pending(current)) {
ret = -EINTR;
break;
}
- /*
- * Rather than hide all in some function, I do this in
- * open coded manner. You see what this really does.
- * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
- */
- mutex_lock(&set_limit_mutex);
- if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) {
+
+ mutex_lock(&memcg_limit_mutex);
+ if (limit < memcg->memory.limit) {
+ mutex_unlock(&memcg_limit_mutex);
ret = -EINVAL;
- mutex_unlock(&set_limit_mutex);
break;
}
- if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val)
- enlarge = 1;
- ret = res_counter_set_limit(&memcg->memsw, val);
- mutex_unlock(&set_limit_mutex);
+ if (limit > memcg->memsw.limit)
+ enlarge = true;
+ ret = page_counter_limit(&memcg->memsw, limit);
+ mutex_unlock(&memcg_limit_mutex);
if (!ret)
break;
try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
- curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+ curusage = page_counter_read(&memcg->memsw);
/* Usage is reduced ? */
if (curusage >= oldusage)
retry_count--;
else
oldusage = curusage;
- }
+ } while (retry_count);
+
if (!ret && enlarge)
memcg_oom_recover(memcg);
+
return ret;
}
@@ -3709,7 +3184,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
unsigned long reclaimed;
int loop = 0;
struct mem_cgroup_tree_per_zone *mctz;
- unsigned long long excess;
+ unsigned long excess;
unsigned long nr_scanned;
if (order > 0)
@@ -3735,35 +3210,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
nr_reclaimed += reclaimed;
*total_scanned += nr_scanned;
spin_lock_irq(&mctz->lock);
+ __mem_cgroup_remove_exceeded(mz, mctz);
/*
* If we failed to reclaim anything from this memory cgroup
* it is time to move on to the next cgroup
*/
next_mz = NULL;
- if (!reclaimed) {
- do {
- /*
- * Loop until we find yet another one.
- *
- * By the time we get the soft_limit lock
- * again, someone might have aded the
- * group back on the RB tree. Iterate to
- * make sure we get a different mem.
- * mem_cgroup_largest_soft_limit_node returns
- * NULL if no other cgroup is present on
- * the tree
- */
- next_mz =
- __mem_cgroup_largest_soft_limit_node(mctz);
- if (next_mz == mz)
- css_put(&next_mz->memcg->css);
- else /* next_mz == NULL or other memcg */
- break;
- } while (1);
- }
- __mem_cgroup_remove_exceeded(mz, mctz);
- excess = res_counter_soft_limit_excess(&mz->memcg->res);
+ if (!reclaimed)
+ next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
+
+ excess = soft_limit_excess(mz->memcg);
/*
* One school of thought says that we should not add
* back the node to the tree if reclaim returns 0.
@@ -3792,107 +3249,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
return nr_reclaimed;
}
-/**
- * mem_cgroup_force_empty_list - clears LRU of a group
- * @memcg: group to clear
- * @node: NUMA node
- * @zid: zone id
- * @lru: lru to to clear
- *
- * Traverse a specified page_cgroup list and try to drop them all. This doesn't
- * reclaim the pages page themselves - pages are moved to the parent (or root)
- * group.
- */
-static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
- int node, int zid, enum lru_list lru)
-{
- struct lruvec *lruvec;
- unsigned long flags;
- struct list_head *list;
- struct page *busy;
- struct zone *zone;
-
- zone = &NODE_DATA(node)->node_zones[zid];
- lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- list = &lruvec->lists[lru];
-
- busy = NULL;
- do {
- struct page_cgroup *pc;
- struct page *page;
-
- spin_lock_irqsave(&zone->lru_lock, flags);
- if (list_empty(list)) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- break;
- }
- page = list_entry(list->prev, struct page, lru);
- if (busy == page) {
- list_move(&page->lru, list);
- busy = NULL;
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- continue;
- }
- spin_unlock_irqrestore(&zone->lru_lock, flags);
-
- pc = lookup_page_cgroup(page);
-
- if (mem_cgroup_move_parent(page, pc, memcg)) {
- /* found lock contention or "pc" is obsolete. */
- busy = page;
- } else
- busy = NULL;
- cond_resched();
- } while (!list_empty(list));
-}
-
-/*
- * make mem_cgroup's charge to be 0 if there is no task by moving
- * all the charges and pages to the parent.
- * This enables deleting this mem_cgroup.
- *
- * Caller is responsible for holding css reference on the memcg.
- */
-static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
-{
- int node, zid;
- u64 usage;
-
- do {
- /* This is for making all *used* pages to be on LRU. */
- lru_add_drain_all();
- drain_all_stock_sync(memcg);
- mem_cgroup_start_move(memcg);
- for_each_node_state(node, N_MEMORY) {
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- enum lru_list lru;
- for_each_lru(lru) {
- mem_cgroup_force_empty_list(memcg,
- node, zid, lru);
- }
- }
- }
- mem_cgroup_end_move(memcg);
- memcg_oom_recover(memcg);
- cond_resched();
-
- /*
- * Kernel memory may not necessarily be trackable to a specific
- * process. So they are not migrated, and therefore we can't
- * expect their value to drop to 0 here.
- * Having res filled up with kmem only is enough.
- *
- * This is a safety check because mem_cgroup_force_empty_list
- * could have raced with mem_cgroup_replace_page_cache callers
- * so the lru seemed empty but the page could have been added
- * right after the check. RES_USAGE should be safe as we always
- * charge before adding to the LRU.
- */
- usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
- res_counter_read_u64(&memcg->kmem, RES_USAGE);
- } while (usage > 0);
-}
-
/*
* Test whether @memcg has children, dead or alive. Note that this
* function doesn't care whether @memcg has use_hierarchy enabled and
@@ -3930,7 +3286,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
/* try to free all pages in this cgroup */
- while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
+ while (nr_retries && page_counter_read(&memcg->memory)) {
int progress;
if (signal_pending(current))
@@ -4001,8 +3357,8 @@ out:
return retval;
}
-static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
- enum mem_cgroup_stat_index idx)
+static unsigned long tree_stat(struct mem_cgroup *memcg,
+ enum mem_cgroup_stat_index idx)
{
struct mem_cgroup *iter;
long val = 0;
@@ -4020,55 +3376,71 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
u64 val;
- if (!mem_cgroup_is_root(memcg)) {
+ if (mem_cgroup_is_root(memcg)) {
+ val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
+ val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
+ if (swap)
+ val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
+ } else {
if (!swap)
- return res_counter_read_u64(&memcg->res, RES_USAGE);
+ val = page_counter_read(&memcg->memory);
else
- return res_counter_read_u64(&memcg->memsw, RES_USAGE);
+ val = page_counter_read(&memcg->memsw);
}
-
- /*
- * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
- * as well as in MEM_CGROUP_STAT_RSS_HUGE.
- */
- val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
- val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
-
- if (swap)
- val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
-
return val << PAGE_SHIFT;
}
+enum {
+ RES_USAGE,
+ RES_LIMIT,
+ RES_MAX_USAGE,
+ RES_FAILCNT,
+ RES_SOFT_LIMIT,
+};
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- enum res_type type = MEMFILE_TYPE(cft->private);
- int name = MEMFILE_ATTR(cft->private);
+ struct page_counter *counter;
- switch (type) {
+ switch (MEMFILE_TYPE(cft->private)) {
case _MEM:
- if (name == RES_USAGE)
- return mem_cgroup_usage(memcg, false);
- return res_counter_read_u64(&memcg->res, name);
+ counter = &memcg->memory;
+ break;
case _MEMSWAP:
- if (name == RES_USAGE)
- return mem_cgroup_usage(memcg, true);
- return res_counter_read_u64(&memcg->memsw, name);
+ counter = &memcg->memsw;
+ break;
case _KMEM:
- return res_counter_read_u64(&memcg->kmem, name);
+ counter = &memcg->kmem;
break;
default:
BUG();
}
+
+ switch (MEMFILE_ATTR(cft->private)) {
+ case RES_USAGE:
+ if (counter == &memcg->memory)
+ return mem_cgroup_usage(memcg, false);
+ if (counter == &memcg->memsw)
+ return mem_cgroup_usage(memcg, true);
+ return (u64)page_counter_read(counter) * PAGE_SIZE;
+ case RES_LIMIT:
+ return (u64)counter->limit * PAGE_SIZE;
+ case RES_MAX_USAGE:
+ return (u64)counter->watermark * PAGE_SIZE;
+ case RES_FAILCNT:
+ return counter->failcnt;
+ case RES_SOFT_LIMIT:
+ return (u64)memcg->soft_limit * PAGE_SIZE;
+ default:
+ BUG();
+ }
}
#ifdef CONFIG_MEMCG_KMEM
-/* should be called with activate_kmem_mutex held */
-static int __memcg_activate_kmem(struct mem_cgroup *memcg,
- unsigned long long limit)
+static int memcg_activate_kmem(struct mem_cgroup *memcg,
+ unsigned long nr_pages)
{
int err = 0;
int memcg_id;
@@ -4077,12 +3449,6 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
return 0;
/*
- * We are going to allocate memory for data shared by all memory
- * cgroups so let's stop accounting here.
- */
- memcg_stop_kmem_account();
-
- /*
* For simplicity, we won't allow this to be disabled. It also can't
* be changed if the cgroup has children already, or if tasks had
* already joined.
@@ -4108,48 +3474,36 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
goto out;
}
- memcg->kmemcg_id = memcg_id;
- INIT_LIST_HEAD(&memcg->memcg_slab_caches);
-
/*
- * We couldn't have accounted to this cgroup, because it hasn't got the
- * active bit set yet, so this should succeed.
+ * We couldn't have accounted to this cgroup, because it hasn't got
+ * activated yet, so this should succeed.
*/
- err = res_counter_set_limit(&memcg->kmem, limit);
+ err = page_counter_limit(&memcg->kmem, nr_pages);
VM_BUG_ON(err);
static_key_slow_inc(&memcg_kmem_enabled_key);
/*
- * Setting the active bit after enabling static branching will
+ * A memory cgroup is considered kmem-active as soon as it gets
+ * kmemcg_id. Setting the id after enabling static branching will
* guarantee no one starts accounting before all call sites are
* patched.
*/
- memcg_kmem_set_active(memcg);
+ memcg->kmemcg_id = memcg_id;
out:
- memcg_resume_kmem_account();
return err;
}
-static int memcg_activate_kmem(struct mem_cgroup *memcg,
- unsigned long long limit)
-{
- int ret;
-
- mutex_lock(&activate_kmem_mutex);
- ret = __memcg_activate_kmem(memcg, limit);
- mutex_unlock(&activate_kmem_mutex);
- return ret;
-}
-
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
- unsigned long long val)
+ unsigned long limit)
{
int ret;
+ mutex_lock(&memcg_limit_mutex);
if (!memcg_kmem_is_active(memcg))
- ret = memcg_activate_kmem(memcg, val);
+ ret = memcg_activate_kmem(memcg, limit);
else
- ret = res_counter_set_limit(&memcg->kmem, val);
+ ret = page_counter_limit(&memcg->kmem, limit);
+ mutex_unlock(&memcg_limit_mutex);
return ret;
}
@@ -4161,19 +3515,19 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
if (!parent)
return 0;
- mutex_lock(&activate_kmem_mutex);
+ mutex_lock(&memcg_limit_mutex);
/*
* If the parent cgroup is not kmem-active now, it cannot be activated
* after this point, because it has at least one child already.
*/
if (memcg_kmem_is_active(parent))
- ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
- mutex_unlock(&activate_kmem_mutex);
+ ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
+ mutex_unlock(&memcg_limit_mutex);
return ret;
}
#else
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
- unsigned long long val)
+ unsigned long limit)
{
return -EINVAL;
}
@@ -4187,110 +3541,69 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- enum res_type type;
- int name;
- unsigned long long val;
+ unsigned long nr_pages;
int ret;
buf = strstrip(buf);
- type = MEMFILE_TYPE(of_cft(of)->private);
- name = MEMFILE_ATTR(of_cft(of)->private);
+ ret = page_counter_memparse(buf, &nr_pages);
+ if (ret)
+ return ret;
- switch (name) {
+ switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_LIMIT:
if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
ret = -EINVAL;
break;
}
- /* This function does all necessary parse...reuse it */
- ret = res_counter_memparse_write_strategy(buf, &val);
- if (ret)
+ switch (MEMFILE_TYPE(of_cft(of)->private)) {
+ case _MEM:
+ ret = mem_cgroup_resize_limit(memcg, nr_pages);
break;
- if (type == _MEM)
- ret = mem_cgroup_resize_limit(memcg, val);
- else if (type == _MEMSWAP)
- ret = mem_cgroup_resize_memsw_limit(memcg, val);
- else if (type == _KMEM)
- ret = memcg_update_kmem_limit(memcg, val);
- else
- return -EINVAL;
- break;
- case RES_SOFT_LIMIT:
- ret = res_counter_memparse_write_strategy(buf, &val);
- if (ret)
+ case _MEMSWAP:
+ ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
break;
- /*
- * For memsw, soft limits are hard to implement in terms
- * of semantics, for now, we support soft limits for
- * control without swap
- */
- if (type == _MEM)
- ret = res_counter_set_soft_limit(&memcg->res, val);
- else
- ret = -EINVAL;
+ case _KMEM:
+ ret = memcg_update_kmem_limit(memcg, nr_pages);
+ break;
+ }
break;
- default:
- ret = -EINVAL; /* should be BUG() ? */
+ case RES_SOFT_LIMIT:
+ memcg->soft_limit = nr_pages;
+ ret = 0;
break;
}
return ret ?: nbytes;
}
-static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
- unsigned long long *mem_limit, unsigned long long *memsw_limit)
-{
- unsigned long long min_limit, min_memsw_limit, tmp;
-
- min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
- min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
- if (!memcg->use_hierarchy)
- goto out;
-
- while (memcg->css.parent) {
- memcg = mem_cgroup_from_css(memcg->css.parent);
- if (!memcg->use_hierarchy)
- break;
- tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
- min_limit = min(min_limit, tmp);
- tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
- min_memsw_limit = min(min_memsw_limit, tmp);
- }
-out:
- *mem_limit = min_limit;
- *memsw_limit = min_memsw_limit;
-}
-
static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- int name;
- enum res_type type;
+ struct page_counter *counter;
- type = MEMFILE_TYPE(of_cft(of)->private);
- name = MEMFILE_ATTR(of_cft(of)->private);
+ switch (MEMFILE_TYPE(of_cft(of)->private)) {
+ case _MEM:
+ counter = &memcg->memory;
+ break;
+ case _MEMSWAP:
+ counter = &memcg->memsw;
+ break;
+ case _KMEM:
+ counter = &memcg->kmem;
+ break;
+ default:
+ BUG();
+ }
- switch (name) {
+ switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_MAX_USAGE:
- if (type == _MEM)
- res_counter_reset_max(&memcg->res);
- else if (type == _MEMSWAP)
- res_counter_reset_max(&memcg->memsw);
- else if (type == _KMEM)
- res_counter_reset_max(&memcg->kmem);
- else
- return -EINVAL;
+ page_counter_reset_watermark(counter);
break;
case RES_FAILCNT:
- if (type == _MEM)
- res_counter_reset_failcnt(&memcg->res);
- else if (type == _MEMSWAP)
- res_counter_reset_failcnt(&memcg->memsw);
- else if (type == _KMEM)
- res_counter_reset_failcnt(&memcg->kmem);
- else
- return -EINVAL;
+ counter->failcnt = 0;
break;
+ default:
+ BUG();
}
return nbytes;
@@ -4379,17 +3692,15 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
}
#endif /* CONFIG_NUMA */
-static inline void mem_cgroup_lru_names_not_uptodate(void)
-{
- BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
-}
-
static int memcg_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long memory, memsw;
struct mem_cgroup *mi;
unsigned int i;
+ BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
+
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
continue;
@@ -4406,14 +3717,16 @@ static int memcg_stat_show(struct seq_file *m, void *v)
mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
/* Hierarchical information */
- {
- unsigned long long limit, memsw_limit;
- memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
- seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
- if (do_swap_account)
- seq_printf(m, "hierarchical_memsw_limit %llu\n",
- memsw_limit);
+ memory = memsw = PAGE_COUNTER_MAX;
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
+ memory = min(memory, mi->memory.limit);
+ memsw = min(memsw, mi->memsw.limit);
}
+ seq_printf(m, "hierarchical_memory_limit %llu\n",
+ (u64)memory * PAGE_SIZE);
+ if (do_swap_account)
+ seq_printf(m, "hierarchical_memsw_limit %llu\n",
+ (u64)memsw * PAGE_SIZE);
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
long long val = 0;
@@ -4497,7 +3810,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
struct mem_cgroup_threshold_ary *t;
- u64 usage;
+ unsigned long usage;
int i;
rcu_read_lock();
@@ -4596,10 +3909,11 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
{
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
- u64 threshold, usage;
+ unsigned long threshold;
+ unsigned long usage;
int i, size, ret;
- ret = res_counter_memparse_write_strategy(args, &threshold);
+ ret = page_counter_memparse(args, &threshold);
if (ret)
return ret;
@@ -4689,7 +4003,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
{
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
- u64 usage;
+ unsigned long usage;
int i, j, size;
mutex_lock(&memcg->thresholds_lock);
@@ -4843,7 +4157,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
int ret;
- memcg->kmemcg_id = -1;
ret = memcg_propagate_kmem(memcg);
if (ret)
return ret;
@@ -4853,42 +4166,9 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
+ memcg_unregister_all_caches(memcg);
mem_cgroup_sockets_destroy(memcg);
}
-
-static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
-{
- if (!memcg_kmem_is_active(memcg))
- return;
-
- /*
- * kmem charges can outlive the cgroup. In the case of slab
- * pages, for instance, a page contain objects from various
- * processes. As we prevent from taking a reference for every
- * such allocation we have to be careful when doing uncharge
- * (see memcg_uncharge_kmem) and here during offlining.
- *
- * The idea is that that only the _last_ uncharge which sees
- * the dead memcg will drop the last reference. An additional
- * reference is taken here before the group is marked dead
- * which is then paired with css_put during uncharge resp. here.
- *
- * Although this might sound strange as this path is called from
- * css_offline() when the referencemight have dropped down to 0 and
- * shouldn't be incremented anymore (css_tryget_online() would
- * fail) we do not have other options because of the kmem
- * allocations lifetime.
- */
- css_get(&memcg->css);
-
- memcg_kmem_mark_dead(memcg);
-
- if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
- return;
-
- if (memcg_kmem_test_and_clear_dead(memcg))
- css_put(&memcg->css);
-}
#else
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
@@ -4898,10 +4178,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
}
-
-static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
-{
-}
#endif
/*
@@ -5064,7 +4340,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
*
* DO NOT ADD NEW FILES.
*/
- name = cfile.file->f_dentry->d_name.name;
+ name = cfile.file->f_path.dentry->d_name.name;
if (!strcmp(name, "memory.usage_in_bytes")) {
event->register_event = mem_cgroup_usage_register_event;
@@ -5088,7 +4364,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
* automatically removed on cgroup destruction but the removal is
* asynchronous, so take an extra ref on @css.
*/
- cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent,
+ cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
&memory_cgrp_subsys);
ret = -EINVAL;
if (IS_ERR(cfile_css))
@@ -5228,7 +4504,10 @@ static struct cftype mem_cgroup_files[] = {
#ifdef CONFIG_SLABINFO
{
.name = "kmem.slabinfo",
- .seq_show = mem_cgroup_slabinfo_read,
+ .seq_start = slab_start,
+ .seq_next = slab_next,
+ .seq_stop = slab_stop,
+ .seq_show = memcg_slab_show,
},
#endif
#endif
@@ -5343,17 +4622,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
free_percpu(memcg->stat);
- /*
- * We need to make sure that (at least for now), the jump label
- * destruction code runs outside of the cgroup lock. This is because
- * get_online_cpus(), which is called from the static_branch update,
- * can't be called inside the cgroup_lock. cpusets are the ones
- * enforcing this dependency, so if they ever change, we might as well.
- *
- * schedule_work() will guarantee this happens. Be careful if you need
- * to move this code around, and make sure it is outside
- * the cgroup_lock.
- */
disarm_static_keys(memcg);
kfree(memcg);
}
@@ -5363,9 +4631,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
*/
struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
- if (!memcg->res.parent)
+ if (!memcg->memory.parent)
return NULL;
- return mem_cgroup_from_res_counter(memcg->res.parent, res);
+ return mem_cgroup_from_counter(memcg->memory.parent, memory);
}
EXPORT_SYMBOL(parent_mem_cgroup);
@@ -5410,9 +4678,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
/* root ? */
if (parent_css == NULL) {
root_mem_cgroup = memcg;
- res_counter_init(&memcg->res, NULL);
- res_counter_init(&memcg->memsw, NULL);
- res_counter_init(&memcg->kmem, NULL);
+ page_counter_init(&memcg->memory, NULL);
+ page_counter_init(&memcg->memsw, NULL);
+ page_counter_init(&memcg->kmem, NULL);
}
memcg->last_scanned_node = MAX_NUMNODES;
@@ -5423,6 +4691,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
vmpressure_init(&memcg->vmpressure);
INIT_LIST_HEAD(&memcg->event_list);
spin_lock_init(&memcg->event_list_lock);
+#ifdef CONFIG_MEMCG_KMEM
+ memcg->kmemcg_id = -1;
+ INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+#endif
return &memcg->css;
@@ -5451,18 +4723,18 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
memcg->swappiness = mem_cgroup_swappiness(parent);
if (parent->use_hierarchy) {
- res_counter_init(&memcg->res, &parent->res);
- res_counter_init(&memcg->memsw, &parent->memsw);
- res_counter_init(&memcg->kmem, &parent->kmem);
+ page_counter_init(&memcg->memory, &parent->memory);
+ page_counter_init(&memcg->memsw, &parent->memsw);
+ page_counter_init(&memcg->kmem, &parent->kmem);
/*
* No need to take a reference to the parent because cgroup
* core guarantees its existence.
*/
} else {
- res_counter_init(&memcg->res, NULL);
- res_counter_init(&memcg->memsw, NULL);
- res_counter_init(&memcg->kmem, NULL);
+ page_counter_init(&memcg->memory, NULL);
+ page_counter_init(&memcg->memsw, NULL);
+ page_counter_init(&memcg->kmem, NULL);
/*
* Deeper hierachy with use_hierarchy == false doesn't make
* much sense so let cgroup subsystem know about this
@@ -5487,29 +4759,10 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
return 0;
}
-/*
- * Announce all parents that a group from their hierarchy is gone.
- */
-static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
-{
- struct mem_cgroup *parent = memcg;
-
- while ((parent = parent_mem_cgroup(parent)))
- mem_cgroup_iter_invalidate(parent);
-
- /*
- * if the root memcg is not hierarchical we have to check it
- * explicitely.
- */
- if (!root_mem_cgroup->use_hierarchy)
- mem_cgroup_iter_invalidate(root_mem_cgroup);
-}
-
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_event *event, *tmp;
- struct cgroup_subsys_state *iter;
/*
* Unregister events and notify userspace.
@@ -5523,60 +4776,12 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock(&memcg->event_list_lock);
- kmem_cgroup_css_offline(memcg);
-
- mem_cgroup_invalidate_reclaim_iterators(memcg);
-
- /*
- * This requires that offlining is serialized. Right now that is
- * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
- */
- css_for_each_descendant_post(iter, css)
- mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
-
- memcg_unregister_all_caches(memcg);
vmpressure_cleanup(&memcg->vmpressure);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- /*
- * XXX: css_offline() would be where we should reparent all
- * memory to prepare the cgroup for destruction. However,
- * memcg does not do css_tryget_online() and res_counter charging
- * under the same RCU lock region, which means that charging
- * could race with offlining. Offlining only happens to
- * cgroups with no tasks in them but charges can show up
- * without any tasks from the swapin path when the target
- * memcg is looked up from the swapout record and not from the
- * current task as it usually is. A race like this can leak
- * charges and put pages with stale cgroup pointers into
- * circulation:
- *
- * #0 #1
- * lookup_swap_cgroup_id()
- * rcu_read_lock()
- * mem_cgroup_lookup()
- * css_tryget_online()
- * rcu_read_unlock()
- * disable css_tryget_online()
- * call_rcu()
- * offline_css()
- * reparent_charges()
- * res_counter_charge()
- * css_put()
- * css_free()
- * pc->mem_cgroup = dead memcg
- * add page to lru
- *
- * The bulk of the charges are still moved in offline_css() to
- * avoid pinning a lot of pages in case a long-term reference
- * like a swapout record is deferring the css_free() to long
- * after offlining. But this makes sure we catch any charges
- * made after offlining:
- */
- mem_cgroup_reparent_charges(memcg);
memcg_destroy_kmem(memcg);
__mem_cgroup_free(memcg);
@@ -5599,10 +4804,10 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- mem_cgroup_resize_limit(memcg, ULLONG_MAX);
- mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX);
- memcg_update_kmem_limit(memcg, ULLONG_MAX);
- res_counter_set_soft_limit(&memcg->res, ULLONG_MAX);
+ mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
+ mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
+ memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
+ memcg->soft_limit = 0;
}
#ifdef CONFIG_MMU
@@ -5758,7 +4963,6 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
struct page *page = NULL;
- struct page_cgroup *pc;
enum mc_target_type ret = MC_TARGET_NONE;
swp_entry_t ent = { .val = 0 };
@@ -5772,13 +4976,12 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
if (!page && !ent.val)
return ret;
if (page) {
- pc = lookup_page_cgroup(page);
/*
* Do only loose check w/o serialization.
- * mem_cgroup_move_account() checks the pc is valid or
+ * mem_cgroup_move_account() checks the page is valid or
* not under LRU exclusion.
*/
- if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
+ if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
if (target)
target->page = page;
@@ -5806,15 +5009,13 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, union mc_target *target)
{
struct page *page = NULL;
- struct page_cgroup *pc;
enum mc_target_type ret = MC_TARGET_NONE;
page = pmd_page(pmd);
VM_BUG_ON_PAGE(!page || !PageHead(page), page);
if (!move_anon())
return ret;
- pc = lookup_page_cgroup(page);
- if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
+ if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
if (target) {
get_page(page);
@@ -5897,7 +5098,6 @@ static void __mem_cgroup_clear_mc(void)
{
struct mem_cgroup *from = mc.from;
struct mem_cgroup *to = mc.to;
- int i;
/* we must uncharge all the leftover precharges from mc.to */
if (mc.precharge) {
@@ -5916,19 +5116,17 @@ static void __mem_cgroup_clear_mc(void)
if (mc.moved_swap) {
/* uncharge swap account from the old cgroup */
if (!mem_cgroup_is_root(mc.from))
- res_counter_uncharge(&mc.from->memsw,
- PAGE_SIZE * mc.moved_swap);
-
- for (i = 0; i < mc.moved_swap; i++)
- css_put(&mc.from->css);
+ page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
/*
- * we charged both to->res and to->memsw, so we should
- * uncharge to->res.
+ * we charged both to->memory and to->memsw, so we
+ * should uncharge to->memory.
*/
if (!mem_cgroup_is_root(mc.to))
- res_counter_uncharge(&mc.to->res,
- PAGE_SIZE * mc.moved_swap);
+ page_counter_uncharge(&mc.to->memory, mc.moved_swap);
+
+ css_put_many(&mc.from->css, mc.moved_swap);
+
/* we've already done css_get(mc.to) */
mc.moved_swap = 0;
}
@@ -5939,8 +5137,6 @@ static void __mem_cgroup_clear_mc(void)
static void mem_cgroup_clear_mc(void)
{
- struct mem_cgroup *from = mc.from;
-
/*
* we must clear moving_task before waking up waiters at the end of
* task migration.
@@ -5951,7 +5147,6 @@ static void mem_cgroup_clear_mc(void)
mc.from = NULL;
mc.to = NULL;
spin_unlock(&mc.lock);
- mem_cgroup_end_move(from);
}
static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
@@ -5984,7 +5179,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
VM_BUG_ON(mc.precharge);
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);
- mem_cgroup_start_move(from);
+
spin_lock(&mc.lock);
mc.from = from;
mc.to = memcg;
@@ -6004,7 +5199,8 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
- mem_cgroup_clear_mc();
+ if (mc.to)
+ mem_cgroup_clear_mc();
}
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
@@ -6018,7 +5214,6 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
enum mc_target_type target_type;
union mc_target target;
struct page *page;
- struct page_cgroup *pc;
/*
* We don't take compound_lock() here but no race with splitting thp
@@ -6039,9 +5234,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
if (target_type == MC_TARGET_PAGE) {
page = target.page;
if (!isolate_lru_page(page)) {
- pc = lookup_page_cgroup(page);
if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
- pc, mc.from, mc.to)) {
+ mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
}
@@ -6069,9 +5263,7 @@ retry:
page = target.page;
if (isolate_lru_page(page))
goto put;
- pc = lookup_page_cgroup(page);
- if (!mem_cgroup_move_account(page, 1, pc,
- mc.from, mc.to)) {
+ if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
@@ -6115,6 +5307,13 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
struct vm_area_struct *vma;
lru_add_drain_all();
+ /*
+ * Signal mem_cgroup_begin_page_stat() to take the memcg's
+ * move_lock while we're moving its pages to another memcg.
+ * Then wait for already started RCU-only updates to finish.
+ */
+ atomic_inc(&mc.from->moving_account);
+ synchronize_rcu();
retry:
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
/*
@@ -6147,6 +5346,7 @@ retry:
break;
}
up_read(&mm->mmap_sem);
+ atomic_dec(&mc.from->moving_account);
}
static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
@@ -6250,7 +5450,7 @@ static void __init enable_swap_cgroup(void)
*/
void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
- struct page_cgroup *pc;
+ struct mem_cgroup *memcg;
unsigned short oldid;
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -6259,20 +5459,26 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
if (!do_swap_account)
return;
- pc = lookup_page_cgroup(page);
+ memcg = page->mem_cgroup;
/* Readahead page, never charged */
- if (!PageCgroupUsed(pc))
+ if (!memcg)
return;
- VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page);
-
- oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup));
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
VM_BUG_ON_PAGE(oldid, page);
+ mem_cgroup_swap_statistics(memcg, true);
- pc->flags &= ~PCG_MEMSW;
- css_get(&pc->mem_cgroup->css);
- mem_cgroup_swap_statistics(pc->mem_cgroup, true);
+ page->mem_cgroup = NULL;
+
+ if (!mem_cgroup_is_root(memcg))
+ page_counter_uncharge(&memcg->memory, 1);
+
+ /* XXX: caller holds IRQ-safe mapping->tree_lock */
+ VM_BUG_ON(!irqs_disabled());
+
+ mem_cgroup_charge_statistics(memcg, page, -1);
+ memcg_check_events(memcg, page);
}
/**
@@ -6294,7 +5500,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
memcg = mem_cgroup_lookup(id);
if (memcg) {
if (!mem_cgroup_is_root(memcg))
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+ page_counter_uncharge(&memcg->memsw, 1);
mem_cgroup_swap_statistics(memcg, false);
css_put(&memcg->css);
}
@@ -6330,7 +5536,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
goto out;
if (PageSwapCache(page)) {
- struct page_cgroup *pc = lookup_page_cgroup(page);
/*
* Every swap fault against a single page tries to charge the
* page, bail as early as possible. shmem_unuse() encounters
@@ -6338,7 +5543,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
* the page lock, which serializes swap cache removal, which
* in turn serializes uncharging.
*/
- if (PageCgroupUsed(pc))
+ if (page->mem_cgroup)
goto out;
}
@@ -6452,19 +5657,16 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
}
static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
- unsigned long nr_mem, unsigned long nr_memsw,
unsigned long nr_anon, unsigned long nr_file,
unsigned long nr_huge, struct page *dummy_page)
{
+ unsigned long nr_pages = nr_anon + nr_file;
unsigned long flags;
if (!mem_cgroup_is_root(memcg)) {
- if (nr_mem)
- res_counter_uncharge(&memcg->res,
- nr_mem * PAGE_SIZE);
- if (nr_memsw)
- res_counter_uncharge(&memcg->memsw,
- nr_memsw * PAGE_SIZE);
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_swap_account)
+ page_counter_uncharge(&memcg->memsw, nr_pages);
memcg_oom_recover(memcg);
}
@@ -6473,27 +5675,27 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
- __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file);
+ __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
memcg_check_events(memcg, dummy_page);
local_irq_restore(flags);
+
+ if (!mem_cgroup_is_root(memcg))
+ css_put_many(&memcg->css, nr_pages);
}
static void uncharge_list(struct list_head *page_list)
{
struct mem_cgroup *memcg = NULL;
- unsigned long nr_memsw = 0;
unsigned long nr_anon = 0;
unsigned long nr_file = 0;
unsigned long nr_huge = 0;
unsigned long pgpgout = 0;
- unsigned long nr_mem = 0;
struct list_head *next;
struct page *page;
next = page_list->next;
do {
unsigned int nr_pages = 1;
- struct page_cgroup *pc;
page = list_entry(next, struct page, lru);
next = page->lru.next;
@@ -6501,24 +5703,22 @@ static void uncharge_list(struct list_head *page_list)
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
- pc = lookup_page_cgroup(page);
- if (!PageCgroupUsed(pc))
+ if (!page->mem_cgroup)
continue;
/*
* Nobody should be changing or seriously looking at
- * pc->mem_cgroup and pc->flags at this point, we have
- * fully exclusive access to the page.
+ * page->mem_cgroup at this point, we have fully
+ * exclusive access to the page.
*/
- if (memcg != pc->mem_cgroup) {
+ if (memcg != page->mem_cgroup) {
if (memcg) {
- uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
- nr_anon, nr_file, nr_huge, page);
- pgpgout = nr_mem = nr_memsw = 0;
- nr_anon = nr_file = nr_huge = 0;
+ uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
+ nr_huge, page);
+ pgpgout = nr_anon = nr_file = nr_huge = 0;
}
- memcg = pc->mem_cgroup;
+ memcg = page->mem_cgroup;
}
if (PageTransHuge(page)) {
@@ -6532,18 +5732,14 @@ static void uncharge_list(struct list_head *page_list)
else
nr_file += nr_pages;
- if (pc->flags & PCG_MEM)
- nr_mem += nr_pages;
- if (pc->flags & PCG_MEMSW)
- nr_memsw += nr_pages;
- pc->flags = 0;
+ page->mem_cgroup = NULL;
pgpgout++;
} while (next != page_list);
if (memcg)
- uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
- nr_anon, nr_file, nr_huge, page);
+ uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
+ nr_huge, page);
}
/**
@@ -6555,14 +5751,11 @@ static void uncharge_list(struct list_head *page_list)
*/
void mem_cgroup_uncharge(struct page *page)
{
- struct page_cgroup *pc;
-
if (mem_cgroup_disabled())
return;
/* Don't touch page->lru of any random page, pre-check: */
- pc = lookup_page_cgroup(page);
- if (!PageCgroupUsed(pc))
+ if (!page->mem_cgroup)
return;
INIT_LIST_HEAD(&page->lru);
@@ -6598,7 +5791,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
bool lrucare)
{
- struct page_cgroup *pc;
+ struct mem_cgroup *memcg;
int isolated;
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
@@ -6613,27 +5806,28 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
return;
/* Page cache replacement: new page already charged? */
- pc = lookup_page_cgroup(newpage);
- if (PageCgroupUsed(pc))
+ if (newpage->mem_cgroup)
return;
- /* Re-entrant migration: old page already uncharged? */
- pc = lookup_page_cgroup(oldpage);
- if (!PageCgroupUsed(pc))
+ /*
+ * Swapcache readahead pages can get migrated before being
+ * charged, and migration from compaction can happen to an
+ * uncharged page when the PFN walker finds a page that
+ * reclaim just put back on the LRU but has not released yet.
+ */
+ memcg = oldpage->mem_cgroup;
+ if (!memcg)
return;
- VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage);
- VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage);
-
if (lrucare)
lock_page_lru(oldpage, &isolated);
- pc->flags = 0;
+ oldpage->mem_cgroup = NULL;
if (lrucare)
unlock_page_lru(oldpage, isolated);
- commit_charge(newpage, pc->mem_cgroup, lrucare);
+ commit_charge(newpage, memcg, lrucare);
}
/*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 8639f6b28746..feb803bf3443 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -233,25 +233,20 @@ void shake_page(struct page *p, int access)
lru_add_drain_all();
if (PageLRU(p))
return;
- drain_all_pages();
+ drain_all_pages(page_zone(p));
if (PageLRU(p) || is_free_buddy_page(p))
return;
}
/*
- * Only call shrink_slab here (which would also shrink other caches) if
- * access is not potentially fatal.
+ * Only call shrink_node_slabs here (which would also shrink
+ * other caches) if access is not potentially fatal.
*/
if (access) {
int nr;
int nid = page_to_nid(p);
do {
- struct shrink_control shrink = {
- .gfp_mask = GFP_KERNEL,
- };
- node_set(nid, shrink.nodes_to_scan);
-
- nr = shrink_slab(&shrink, 1000, 1000);
+ nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000);
if (page_count(p) == 1)
break;
} while (nr > 10);
@@ -466,7 +461,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
struct task_struct *tsk;
struct address_space *mapping = page->mapping;
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_read(mapping);
read_lock(&tasklist_lock);
for_each_process(tsk) {
pgoff_t pgoff = page_to_pgoff(page);
@@ -488,7 +483,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
}
}
read_unlock(&tasklist_lock);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_read(mapping);
}
/*
@@ -860,7 +855,6 @@ static int page_action(struct page_state *ps, struct page *p,
int count;
result = ps->action(p, pfn);
- action_result(pfn, ps->msg, result);
count = page_count(p) - 1;
if (ps->action == me_swapcache_dirty && result == DELAYED)
@@ -871,6 +865,7 @@ static int page_action(struct page_state *ps, struct page *p,
pfn, ps->msg, count);
result = FAILED;
}
+ action_result(pfn, ps->msg, result);
/* Could do more checks here if page looks ok */
/*
@@ -1661,7 +1656,7 @@ static int __soft_offline_page(struct page *page, int flags)
if (!is_free_buddy_page(page))
lru_add_drain_all();
if (!is_free_buddy_page(page))
- drain_all_pages();
+ drain_all_pages(page_zone(page));
SetPageHWPoison(page);
if (!is_free_buddy_page(page))
pr_info("soft offline: %#lx: page leaked\n",
diff --git a/mm/memory.c b/mm/memory.c
index d86aa88902a0..649e7d440bd7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -220,9 +220,6 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long
/* Is it from 0 to ~0? */
tlb->fullmm = !(start | (end+1));
tlb->need_flush_all = 0;
- tlb->start = start;
- tlb->end = end;
- tlb->need_flush = 0;
tlb->local.next = NULL;
tlb->local.nr = 0;
tlb->local.max = ARRAY_SIZE(tlb->__pages);
@@ -232,15 +229,18 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb->batch = NULL;
#endif
+
+ __tlb_reset_range(tlb);
}
static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
- tlb->need_flush = 0;
tlb_flush(tlb);
+ mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb_table_flush(tlb);
#endif
+ __tlb_reset_range(tlb);
}
static void tlb_flush_mmu_free(struct mmu_gather *tlb)
@@ -256,8 +256,9 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb)
void tlb_flush_mmu(struct mmu_gather *tlb)
{
- if (!tlb->need_flush)
+ if (!tlb->end)
return;
+
tlb_flush_mmu_tlbonly(tlb);
tlb_flush_mmu_free(tlb);
}
@@ -292,7 +293,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
struct mmu_gather_batch *batch;
- VM_BUG_ON(!tlb->need_flush);
+ VM_BUG_ON(!tlb->end);
batch = tlb->active;
batch->pages[batch->nr++] = page;
@@ -359,8 +360,6 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
struct mmu_table_batch **batch = &tlb->batch;
- tlb->need_flush = 1;
-
/*
* When there's less then two users of this mm there cannot be a
* concurrent page-table walk.
@@ -815,20 +814,20 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (!pte_file(pte)) {
swp_entry_t entry = pte_to_swp_entry(pte);
- if (swap_duplicate(entry) < 0)
- return entry.val;
-
- /* make sure dst_mm is on swapoff's mmlist. */
- if (unlikely(list_empty(&dst_mm->mmlist))) {
- spin_lock(&mmlist_lock);
- if (list_empty(&dst_mm->mmlist))
- list_add(&dst_mm->mmlist,
- &src_mm->mmlist);
- spin_unlock(&mmlist_lock);
- }
- if (likely(!non_swap_entry(entry)))
+ if (likely(!non_swap_entry(entry))) {
+ if (swap_duplicate(entry) < 0)
+ return entry.val;
+
+ /* make sure dst_mm is on swapoff's mmlist. */
+ if (unlikely(list_empty(&dst_mm->mmlist))) {
+ spin_lock(&mmlist_lock);
+ if (list_empty(&dst_mm->mmlist))
+ list_add(&dst_mm->mmlist,
+ &src_mm->mmlist);
+ spin_unlock(&mmlist_lock);
+ }
rss[MM_SWAPENTS]++;
- else if (is_migration_entry(entry)) {
+ } else if (is_migration_entry(entry)) {
page = migration_entry_to_page(entry);
if (PageAnon(page))
@@ -1186,20 +1185,8 @@ again:
arch_leave_lazy_mmu_mode();
/* Do the actual TLB flush before dropping ptl */
- if (force_flush) {
- unsigned long old_end;
-
- /*
- * Flush the TLB just for the previous segment,
- * then update the range to be the remaining
- * TLB range.
- */
- old_end = tlb->end;
- tlb->end = addr;
+ if (force_flush)
tlb_flush_mmu_tlbonly(tlb);
- tlb->start = addr;
- tlb->end = old_end;
- }
pte_unmap_unlock(start_pte, ptl);
/*
@@ -1340,9 +1327,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* safe to do nothing in this case.
*/
if (vma->vm_file) {
- mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
- mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
}
} else
unmap_page_range(tlb, vma, start, end, details);
@@ -2234,7 +2221,7 @@ gotten:
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
- ptep_clear_flush(vma, address, page_table);
+ ptep_clear_flush_notify(vma, address, page_table);
page_add_new_anon_rmap(new_page, vma, address);
mem_cgroup_commit_charge(new_page, memcg, false);
lru_cache_add_active_or_unevictable(new_page, vma);
@@ -2391,12 +2378,12 @@ void unmap_mapping_range(struct address_space *mapping,
details.last_index = ULONG_MAX;
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_read(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_read(mapping);
}
EXPORT_SYMBOL(unmap_mapping_range);
@@ -2641,7 +2628,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_SIGBUS;
/* Use the zero-page for reads */
- if (!(flags & FAULT_FLAG_WRITE)) {
+ if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
vma->vm_page_prot));
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -3009,6 +2996,12 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (set_page_dirty(fault_page))
dirtied = 1;
+ /*
+ * Take a local copy of the address_space - page.mapping may be zeroed
+ * by truncate after unlock_page(). The address_space itself remains
+ * pinned by vma->vm_file's reference. We rely on unlock_page()'s
+ * release semantics to prevent the compiler from undoing this copying.
+ */
mapping = fault_page->mapping;
unlock_page(fault_page);
if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
@@ -3388,6 +3381,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return ret;
}
+EXPORT_SYMBOL_GPL(handle_mm_fault);
#ifndef __PAGETABLE_PUD_FOLDED
/*
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 252e1dbbed86..9fab10795bea 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -31,6 +31,7 @@
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>
+#include <linux/bootmem.h>
#include <asm/tlbflush.h>
@@ -1066,6 +1067,16 @@ out:
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
+static void reset_node_present_pages(pg_data_t *pgdat)
+{
+ struct zone *z;
+
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+ z->present_pages = 0;
+
+ pgdat->node_present_pages = 0;
+}
+
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
@@ -1096,6 +1107,21 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
build_all_zonelists(pgdat, NULL);
mutex_unlock(&zonelists_mutex);
+ /*
+ * zone->managed_pages is set to an approximate value in
+ * free_area_init_core(), which will cause
+ * /sys/device/system/node/nodeX/meminfo has wrong data.
+ * So reset it to 0 before any memory is onlined.
+ */
+ reset_node_managed_pages(pgdat);
+
+ /*
+ * When memory is hot-added, all the memory is in offline state. So
+ * clear all zones' present_pages because they will be updated in
+ * online_pages() and offline_pages().
+ */
+ reset_node_present_pages(pgdat);
+
return pgdat;
}
@@ -1699,7 +1725,7 @@ repeat:
if (drain) {
lru_add_drain_all();
cond_resched();
- drain_all_pages();
+ drain_all_pages(zone);
}
pfn = scan_movable_pages(start_pfn, end_pfn);
@@ -1721,7 +1747,7 @@ repeat:
lru_add_drain_all();
yield();
/* drain pcp pages, this is synchronous. */
- drain_all_pages();
+ drain_all_pages(zone);
/*
* dissolve free hugepages in the memory block before doing offlining
* actually in order to make hugetlbfs's object counting consistent.
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e58725aff7e9..0e0961b8c39c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -162,12 +162,6 @@ static const struct mempolicy_operations {
enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];
-/* Check that the nodemask contains at least one populated zone */
-static int is_valid_nodemask(const nodemask_t *nodemask)
-{
- return nodes_intersects(*nodemask, node_states[N_MEMORY]);
-}
-
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
return pol->flags & MPOL_MODE_FLAGS;
@@ -202,7 +196,7 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
- if (!is_valid_nodemask(nodes))
+ if (nodes_empty(*nodes))
return -EINVAL;
pol->v.nodes = *nodes;
return 0;
@@ -234,7 +228,7 @@ static int mpol_set_nodemask(struct mempolicy *pol,
nodes = NULL; /* explicit local allocation */
else {
if (pol->flags & MPOL_F_RELATIVE_NODES)
- mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
+ mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
else
nodes_and(nsc->mask2, *nodes, nsc->mask1);
@@ -1047,10 +1041,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
down_read(&mm->mmap_sem);
- err = migrate_vmas(mm, from, to, flags);
- if (err)
- goto out;
-
/*
* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
* bit in 'to' is not also set in 'tmp'. Clear the found 'source'
@@ -1130,7 +1120,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
if (err < 0)
break;
}
-out:
up_read(&mm->mmap_sem);
if (err < 0)
return err;
diff --git a/mm/migrate.c b/mm/migrate.c
index 01439953abf5..344cdf692fc8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -746,7 +746,7 @@ static int fallback_migrate_page(struct address_space *mapping,
* MIGRATEPAGE_SUCCESS - success
*/
static int move_to_new_page(struct page *newpage, struct page *page,
- int remap_swapcache, enum migrate_mode mode)
+ int page_was_mapped, enum migrate_mode mode)
{
struct address_space *mapping;
int rc;
@@ -784,7 +784,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
newpage->mapping = NULL;
} else {
mem_cgroup_migrate(page, newpage, false);
- if (remap_swapcache)
+ if (page_was_mapped)
remove_migration_ptes(page, newpage);
page->mapping = NULL;
}
@@ -798,7 +798,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
int force, enum migrate_mode mode)
{
int rc = -EAGAIN;
- int remap_swapcache = 1;
+ int page_was_mapped = 0;
struct anon_vma *anon_vma = NULL;
if (!trylock_page(page)) {
@@ -870,7 +870,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* migrated but are not remapped when migration
* completes
*/
- remap_swapcache = 0;
} else {
goto out_unlock;
}
@@ -910,13 +909,17 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
}
/* Establish migration ptes or remove ptes */
- try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ if (page_mapped(page)) {
+ try_to_unmap(page,
+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ page_was_mapped = 1;
+ }
skip_unmap:
if (!page_mapped(page))
- rc = move_to_new_page(newpage, page, remap_swapcache, mode);
+ rc = move_to_new_page(newpage, page, page_was_mapped, mode);
- if (rc && remap_swapcache)
+ if (rc && page_was_mapped)
remove_migration_ptes(page, page);
/* Drop an anon_vma reference if we took one */
@@ -1017,6 +1020,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
{
int rc = 0;
int *result = NULL;
+ int page_was_mapped = 0;
struct page *new_hpage;
struct anon_vma *anon_vma = NULL;
@@ -1047,12 +1051,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (PageAnon(hpage))
anon_vma = page_get_anon_vma(hpage);
- try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ if (page_mapped(hpage)) {
+ try_to_unmap(hpage,
+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ page_was_mapped = 1;
+ }
if (!page_mapped(hpage))
- rc = move_to_new_page(new_hpage, hpage, 1, mode);
+ rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode);
- if (rc != MIGRATEPAGE_SUCCESS)
+ if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped)
remove_migration_ptes(hpage, hpage);
if (anon_vma)
@@ -1528,27 +1536,6 @@ out:
return err;
}
-/*
- * Call migration functions in the vma_ops that may prepare
- * memory in a vm for migration. migration functions may perform
- * the migration for vmas that do not have an underlying page struct.
- */
-int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
- const nodemask_t *from, unsigned long flags)
-{
- struct vm_area_struct *vma;
- int err = 0;
-
- for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
- if (vma->vm_ops && vma->vm_ops->migrate) {
- err = vma->vm_ops->migrate(vma, to, from, flags);
- if (err)
- break;
- }
- }
- return err;
-}
-
#ifdef CONFIG_NUMA_BALANCING
/*
* Returns true if this is a safe migration target node for misplaced NUMA
@@ -1854,7 +1841,7 @@ fail_putback:
*/
flush_cache_range(vma, mmun_start, mmun_end);
page_add_anon_rmap(new_page, vma, mmun_start);
- pmdp_clear_flush(vma, mmun_start, pmd);
+ pmdp_clear_flush_notify(vma, mmun_start, pmd);
set_pmd_at(mm, mmun_start, pmd, entry);
flush_tlb_range(vma, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
@@ -1862,6 +1849,7 @@ fail_putback:
if (page_count(page) != 2) {
set_pmd_at(mm, mmun_start, pmd, orig_entry);
flush_tlb_range(vma, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
page_remove_rmap(new_page);
goto fail_putback;
diff --git a/mm/mincore.c b/mm/mincore.c
index 725c80961048..c8c528b36641 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -137,8 +137,11 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
} else { /* pte is a swap entry */
swp_entry_t entry = pte_to_swp_entry(pte);
- if (is_migration_entry(entry)) {
- /* migration entries are always uptodate */
+ if (non_swap_entry(entry)) {
+ /*
+ * migration or hwpoison entries are always
+ * uptodate
+ */
*vec = 1;
} else {
#ifdef CONFIG_SWAP
diff --git a/mm/mmap.c b/mm/mmap.c
index 87e82b38453c..7b36aa7cc89a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -232,7 +232,7 @@ error:
}
/*
- * Requires inode->i_mapping->i_mmap_mutex
+ * Requires inode->i_mapping->i_mmap_rwsem
*/
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
@@ -260,9 +260,9 @@ void unlink_file_vma(struct vm_area_struct *vma)
if (file) {
struct address_space *mapping = file->f_mapping;
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
__remove_shared_vm_struct(vma, file, mapping);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
}
}
@@ -674,14 +674,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
}
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
if (mapping)
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
mm->map_count++;
validate_mm(mm);
@@ -776,8 +776,11 @@ again: remove_next = 1 + (end > next->vm_end);
* shrinking vma had, to cover any anon pages imported.
*/
if (exporter && exporter->anon_vma && !importer->anon_vma) {
- if (anon_vma_clone(importer, exporter))
- return -ENOMEM;
+ int error;
+
+ error = anon_vma_clone(importer, exporter);
+ if (error)
+ return error;
importer->anon_vma = exporter->anon_vma;
}
}
@@ -793,7 +796,7 @@ again: remove_next = 1 + (end > next->vm_end);
next->vm_end);
}
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
if (insert) {
/*
* Put into interval tree now, so instantiated pages
@@ -880,7 +883,7 @@ again: remove_next = 1 + (end > next->vm_end);
anon_vma_unlock_write(anon_vma);
}
if (mapping)
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
if (root) {
uprobe_mmap(vma);
@@ -2359,6 +2362,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
}
#endif
+EXPORT_SYMBOL_GPL(find_extend_vma);
+
/*
* Ok - we have the memory areas we should free on the vma list,
* so release them, and do the vma updates.
@@ -2469,7 +2474,8 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (err)
goto out_free_vma;
- if (anon_vma_clone(new, vma))
+ err = anon_vma_clone(new, vma);
+ if (err)
goto out_free_mpol;
if (new->vm_file)
@@ -2597,6 +2603,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
detach_vmas_to_be_unmapped(mm, vma, prev, end);
unmap_region(mm, vma, prev, start, end);
+ arch_unmap(mm, vma, start, end);
+
/* Fix up all other VM information */
remove_vma_list(mm, vma);
@@ -2785,7 +2793,7 @@ void exit_mmap(struct mm_struct *mm)
/* Insert vm structure into process list sorted by address
* and into the inode's i_mmap tree. If vm_file is non-NULL
- * then i_mmap_mutex is taken here.
+ * then i_mmap_rwsem is taken here.
*/
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
@@ -3080,7 +3088,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
*/
if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
BUG();
- mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
+ down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
}
}
@@ -3107,7 +3115,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
* vma in this mm is backed by the same anon_vma or address_space.
*
* We can take all the locks in random order because the VM code
- * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never
+ * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never
* takes more than one of them in a row. Secondly we're protected
* against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
*
@@ -3176,7 +3184,7 @@ static void vm_unlock_mapping(struct address_space *mapping)
* AS_MM_ALL_LOCKS can't change to 0 from under us
* because we hold the mm_all_locks_mutex.
*/
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
&mapping->flags))
BUG();
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 2c8da9825fe3..3b9b3d0741b2 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -193,6 +193,16 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ /*
+ * Call invalidate_range here too to avoid the need for the
+ * subsystem of having to register an invalidate_range_end
+ * call-back when there is invalidate_range already. Usually a
+ * subsystem registers either invalidate_range_start()/end() or
+ * invalidate_range(), so this will be no additional overhead
+ * (besides the pointer check).
+ */
+ if (mn->ops->invalidate_range)
+ mn->ops->invalidate_range(mn, mm, start, end);
if (mn->ops->invalidate_range_end)
mn->ops->invalidate_range_end(mn, mm, start, end);
}
@@ -200,6 +210,21 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
+void __mmu_notifier_invalidate_range(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct mmu_notifier *mn;
+ int id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->invalidate_range)
+ mn->ops->invalidate_range(mn, mm, start, end);
+ }
+ srcu_read_unlock(&srcu, id);
+}
+EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
+
static int do_mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm,
int take_mmap_sem)
diff --git a/mm/mremap.c b/mm/mremap.c
index b147f66f4c40..17fa018f5f39 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -99,7 +99,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
spinlock_t *old_ptl, *new_ptl;
/*
- * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+ * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
* locks to ensure that rmap will always observe either the old or the
* new ptes. This is the easiest way to avoid races with
* truncate_pagecache(), page migration, etc...
@@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
if (need_rmap_locks) {
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
}
if (vma->anon_vma) {
anon_vma = vma->anon_vma;
@@ -156,7 +156,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
if (anon_vma)
anon_vma_unlock_write(anon_vma);
if (mapping)
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
}
#define LATENCY_LIMIT (64 * PAGE_SIZE)
@@ -288,7 +288,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
old_len = new_len;
old_addr = new_addr;
new_addr = -ENOMEM;
- }
+ } else if (vma->vm_file && vma->vm_file->f_op->mremap)
+ vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT) {
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 7c7ab32ee503..90b50468333e 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -145,12 +145,10 @@ static unsigned long __init free_low_memory_core_early(void)
static int reset_managed_pages_done __initdata;
-static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
+void reset_node_managed_pages(pg_data_t *pgdat)
{
struct zone *z;
- if (reset_managed_pages_done)
- return;
for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
z->managed_pages = 0;
}
@@ -159,8 +157,12 @@ void __init reset_all_zones_managed_pages(void)
{
struct pglist_data *pgdat;
+ if (reset_managed_pages_done)
+ return;
+
for_each_online_pgdat(pgdat)
reset_node_managed_pages(pgdat);
+
reset_managed_pages_done = 1;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index bd1808e194a7..b51eadf6d952 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -722,11 +722,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
flush_dcache_mmap_lock(mapping);
vma_interval_tree_insert(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
}
/* add the VMA to the tree */
@@ -795,11 +795,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
flush_dcache_mmap_lock(mapping);
vma_interval_tree_remove(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
}
/* remove from the MM's tree and list */
@@ -1149,8 +1149,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
unsigned long len,
unsigned long capabilities)
{
- struct page *pages;
- unsigned long total, point, n;
+ unsigned long total, point;
void *base;
int ret, order;
@@ -1182,33 +1181,23 @@ static int do_mmap_private(struct vm_area_struct *vma,
order = get_order(len);
kdebug("alloc order %d for %lx", order, len);
- pages = alloc_pages(GFP_KERNEL, order);
- if (!pages)
- goto enomem;
-
total = 1 << order;
- atomic_long_add(total, &mmap_pages_allocated);
-
point = len >> PAGE_SHIFT;
- /* we allocated a power-of-2 sized page set, so we may want to trim off
- * the excess */
+ /* we don't want to allocate a power-of-2 sized page set */
if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
- while (total > point) {
- order = ilog2(total - point);
- n = 1 << order;
- kdebug("shave %lu/%lu @%lu", n, total - point, total);
- atomic_long_sub(n, &mmap_pages_allocated);
- total -= n;
- set_page_refcounted(pages + total);
- __free_pages(pages + total, order);
- }
+ total = point;
+ kdebug("try to alloc exact %lu pages", total);
+ base = alloc_pages_exact(len, GFP_KERNEL);
+ } else {
+ base = (void *)__get_free_pages(GFP_KERNEL, order);
}
- for (point = 1; point < total; point++)
- set_page_refcounted(&pages[point]);
+ if (!base)
+ goto enomem;
+
+ atomic_long_add(total, &mmap_pages_allocated);
- base = page_address(pages);
region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
region->vm_start = (unsigned long) base;
region->vm_end = region->vm_start + len;
@@ -2094,14 +2083,14 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
down_write(&nommu_region_sem);
- mutex_lock(&inode->i_mapping->i_mmap_mutex);
+ i_mmap_lock_read(inode->i_mapping);
/* search for VMAs that fall within the dead zone */
vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
/* found one - only interested if it's shared out of the page
* cache */
if (vma->vm_flags & VM_SHARED) {
- mutex_unlock(&inode->i_mapping->i_mmap_mutex);
+ i_mmap_unlock_read(inode->i_mapping);
up_write(&nommu_region_sem);
return -ETXTBSY; /* not quite true, but near enough */
}
@@ -2113,8 +2102,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
* we don't check for any regions that start beyond the EOF as there
* shouldn't be any
*/
- vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap,
- 0, ULONG_MAX) {
+ vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) {
if (!(vma->vm_flags & VM_SHARED))
continue;
@@ -2129,7 +2117,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
}
}
- mutex_unlock(&inode->i_mapping->i_mmap_mutex);
+ i_mmap_unlock_read(inode->i_mapping);
up_write(&nommu_region_sem);
return 0;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5340f6b91312..d503e9ce1c7b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -119,7 +119,7 @@ found:
/* return true if the task is not adequate as candidate victim task. */
static bool oom_unkillable_task(struct task_struct *p,
- const struct mem_cgroup *memcg, const nodemask_t *nodemask)
+ struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
if (is_global_init(p))
return true;
@@ -233,7 +233,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
/* Check this allocation failure is caused by cpuset's wall function */
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask)
- if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
+ if (!cpuset_zone_allowed(zone, gfp_mask))
cpuset_limited = true;
if (cpuset_limited) {
@@ -281,14 +281,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
if (oom_task_origin(task))
return OOM_SCAN_SELECT;
- if (task->flags & PF_EXITING && !force_kill) {
- /*
- * If this task is not being ptraced on exit, then wait for it
- * to finish before killing some other task unnecessarily.
- */
- if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
- return OOM_SCAN_ABORT;
- }
+ if (task_will_free_mem(task) && !force_kill)
+ return OOM_SCAN_ABORT;
+
return OOM_SCAN_OK;
}
@@ -353,7 +348,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
* State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
* swapents, oom_score_adj value, and name.
*/
-static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
struct task_struct *p;
struct task_struct *task;
@@ -443,7 +438,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* If the task is already exiting, don't alarm the sysadmin or kill
* its children or threads, just set TIF_MEMDIE so it can die quickly
*/
- if (p->flags & PF_EXITING) {
+ if (task_will_free_mem(p)) {
set_tsk_thread_flag(p, TIF_MEMDIE);
put_task_struct(p);
return;
@@ -649,7 +644,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
* select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory.
*/
- if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
+ if (fatal_signal_pending(current) || task_will_free_mem(current)) {
set_thread_flag(TIF_MEMDIE);
return;
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 19ceae87522d..d5d81f5384d1 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2357,7 +2357,7 @@ int test_clear_page_writeback(struct page *page)
dec_zone_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_WRITTEN);
}
- mem_cgroup_end_page_stat(memcg, locked, memcg_flags);
+ mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags);
return ret;
}
@@ -2399,7 +2399,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
inc_zone_page_state(page, NR_WRITEBACK);
}
- mem_cgroup_end_page_stat(memcg, locked, memcg_flags);
+ mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9cd36b822444..7633c503a116 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,7 +48,7 @@
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
-#include <linux/page_cgroup.h>
+#include <linux/page_ext.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
@@ -56,9 +56,10 @@
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/migrate.h>
-#include <linux/page-debug-flags.h>
+#include <linux/page_ext.h>
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
+#include <linux/page_owner.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -110,6 +111,7 @@ static DEFINE_SPINLOCK(managed_page_count_lock);
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
+unsigned long totalcma_pages __read_mostly;
/*
* When calculating the number of globally allowed dirty pages, there
* is a certain number of per-zone reserves that should not be
@@ -425,6 +427,42 @@ static inline void prep_zero_page(struct page *page, unsigned int order,
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
+bool _debug_pagealloc_enabled __read_mostly;
+bool _debug_guardpage_enabled __read_mostly;
+
+static int __init early_debug_pagealloc(char *buf)
+{
+ if (!buf)
+ return -EINVAL;
+
+ if (strcmp(buf, "on") == 0)
+ _debug_pagealloc_enabled = true;
+
+ return 0;
+}
+early_param("debug_pagealloc", early_debug_pagealloc);
+
+static bool need_debug_guardpage(void)
+{
+ /* If we don't use debug_pagealloc, we don't need guard page */
+ if (!debug_pagealloc_enabled())
+ return false;
+
+ return true;
+}
+
+static void init_debug_guardpage(void)
+{
+ if (!debug_pagealloc_enabled())
+ return;
+
+ _debug_guardpage_enabled = true;
+}
+
+struct page_ext_operations debug_guardpage_ops = {
+ .need = need_debug_guardpage,
+ .init = init_debug_guardpage,
+};
static int __init debug_guardpage_minorder_setup(char *buf)
{
@@ -440,18 +478,44 @@ static int __init debug_guardpage_minorder_setup(char *buf)
}
__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
-static inline void set_page_guard_flag(struct page *page)
+static inline void set_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype)
{
- __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+ struct page_ext *page_ext;
+
+ if (!debug_guardpage_enabled())
+ return;
+
+ page_ext = lookup_page_ext(page);
+ __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
+
+ INIT_LIST_HEAD(&page->lru);
+ set_page_private(page, order);
+ /* Guard pages are not available for any usage */
+ __mod_zone_freepage_state(zone, -(1 << order), migratetype);
}
-static inline void clear_page_guard_flag(struct page *page)
+static inline void clear_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype)
{
- __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+ struct page_ext *page_ext;
+
+ if (!debug_guardpage_enabled())
+ return;
+
+ page_ext = lookup_page_ext(page);
+ __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
+
+ set_page_private(page, 0);
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
-static inline void set_page_guard_flag(struct page *page) { }
-static inline void clear_page_guard_flag(struct page *page) { }
+struct page_ext_operations debug_guardpage_ops = { NULL, };
+static inline void set_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype) {}
+static inline void clear_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype) {}
#endif
static inline void set_page_order(struct page *page, unsigned int order)
@@ -467,29 +531,6 @@ static inline void rmv_page_order(struct page *page)
}
/*
- * Locate the struct page for both the matching buddy in our
- * pair (buddy1) and the combined O(n+1) page they form (page).
- *
- * 1) Any buddy B1 will have an order O twin B2 which satisfies
- * the following equation:
- * B2 = B1 ^ (1 << O)
- * For example, if the starting buddy (buddy2) is #8 its order
- * 1 buddy is #10:
- * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
- *
- * 2) Any buddy B will have an order O+1 parent P which
- * satisfies the following equation:
- * P = B & ~(1 << O)
- *
- * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
- */
-static inline unsigned long
-__find_buddy_index(unsigned long page_idx, unsigned int order)
-{
- return page_idx ^ (1 << order);
-}
-
-/*
* This function checks whether a page is free && is the buddy
* we can do coalesce a page and its buddy if
* (a) the buddy is not in a hole &&
@@ -569,6 +610,7 @@ static inline void __free_one_page(struct page *page,
unsigned long combined_idx;
unsigned long uninitialized_var(buddy_idx);
struct page *buddy;
+ int max_order = MAX_ORDER;
VM_BUG_ON(!zone_is_initialized(zone));
@@ -577,13 +619,24 @@ static inline void __free_one_page(struct page *page,
return;
VM_BUG_ON(migratetype == -1);
+ if (is_migrate_isolate(migratetype)) {
+ /*
+ * We restrict max order of merging to prevent merge
+ * between freepages on isolate pageblock and normal
+ * pageblock. Without this, pageblock isolation
+ * could cause incorrect freepage accounting.
+ */
+ max_order = min(MAX_ORDER, pageblock_order + 1);
+ } else {
+ __mod_zone_freepage_state(zone, 1 << order, migratetype);
+ }
- page_idx = pfn & ((1 << MAX_ORDER) - 1);
+ page_idx = pfn & ((1 << max_order) - 1);
VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
- while (order < MAX_ORDER-1) {
+ while (order < max_order - 1) {
buddy_idx = __find_buddy_index(page_idx, order);
buddy = page + (buddy_idx - page_idx);
if (!page_is_buddy(page, buddy, order))
@@ -593,10 +646,7 @@ static inline void __free_one_page(struct page *page,
* merge with it and move up one order.
*/
if (page_is_guard(buddy)) {
- clear_page_guard_flag(buddy);
- set_page_private(page, 0);
- __mod_zone_freepage_state(zone, 1 << order,
- migratetype);
+ clear_page_guard(zone, buddy, order, migratetype);
} else {
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
@@ -650,8 +700,10 @@ static inline int free_pages_check(struct page *page)
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
}
- if (unlikely(mem_cgroup_bad_page_check(page)))
- bad_reason = "cgroup check failed";
+#ifdef CONFIG_MEMCG
+ if (unlikely(page->mem_cgroup))
+ bad_reason = "page still charged to cgroup";
+#endif
if (unlikely(bad_reason)) {
bad_page(page, bad_reason, bad_flags);
return 1;
@@ -715,14 +767,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* must delete as __free_one_page list manipulates */
list_del(&page->lru);
mt = get_freepage_migratetype(page);
+ if (unlikely(has_isolate_pageblock(zone)))
+ mt = get_pageblock_migratetype(page);
+
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
__free_one_page(page, page_to_pfn(page), zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
- if (likely(!is_migrate_isolate_page(page))) {
- __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
- if (is_migrate_cma(mt))
- __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
- }
} while (--to_free && --batch_free && !list_empty(list));
}
spin_unlock(&zone->lock);
@@ -739,9 +789,11 @@ static void free_one_page(struct zone *zone,
if (nr_scanned)
__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+ if (unlikely(has_isolate_pageblock(zone) ||
+ is_migrate_isolate(migratetype))) {
+ migratetype = get_pfnblock_migratetype(page, pfn);
+ }
__free_one_page(page, pfn, zone, order, migratetype);
- if (unlikely(!is_migrate_isolate(migratetype)))
- __mod_zone_freepage_state(zone, 1 << order, migratetype);
spin_unlock(&zone->lock);
}
@@ -750,6 +802,9 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
int i;
int bad = 0;
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page);
+
trace_mm_page_free(page, order);
kmemcheck_free_shadow(page, order);
@@ -760,6 +815,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
if (bad)
return false;
+ reset_page_owner(page, order);
+
if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),
PAGE_SIZE << order);
@@ -866,23 +923,18 @@ static inline void expand(struct zone *zone, struct page *page,
size >>= 1;
VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (high < debug_guardpage_minorder()) {
+ if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
+ debug_guardpage_enabled() &&
+ high < debug_guardpage_minorder()) {
/*
* Mark as guard pages (or page), that will allow to
* merge back to allocator when buddy will be freed.
* Corresponding page table entries will not be touched,
* pages will stay not present in virtual address space
*/
- INIT_LIST_HEAD(&page[size].lru);
- set_page_guard_flag(&page[size]);
- set_page_private(&page[size], high);
- /* Guard pages are not available for any usage */
- __mod_zone_freepage_state(zone, -(1 << high),
- migratetype);
+ set_page_guard(zone, &page[size], high, migratetype);
continue;
}
-#endif
list_add(&page[size].lru, &area->free_list[migratetype]);
area->nr_free++;
set_page_order(&page[size], high);
@@ -907,8 +959,10 @@ static inline int check_new_page(struct page *page)
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
}
- if (unlikely(mem_cgroup_bad_page_check(page)))
- bad_reason = "cgroup check failed";
+#ifdef CONFIG_MEMCG
+ if (unlikely(page->mem_cgroup))
+ bad_reason = "page still charged to cgroup";
+#endif
if (unlikely(bad_reason)) {
bad_page(page, bad_reason, bad_flags);
return 1;
@@ -938,6 +992,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
+ set_page_owner(page, order, gfp_flags);
+
return 0;
}
@@ -1276,55 +1332,75 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
#endif
/*
- * Drain pages of the indicated processor.
+ * Drain pcplists of the indicated processor and zone.
*
* The processor must either be the current processor and the
* thread pinned to the current processor or a processor that
* is not online.
*/
-static void drain_pages(unsigned int cpu)
+static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
unsigned long flags;
- struct zone *zone;
+ struct per_cpu_pageset *pset;
+ struct per_cpu_pages *pcp;
- for_each_populated_zone(zone) {
- struct per_cpu_pageset *pset;
- struct per_cpu_pages *pcp;
+ local_irq_save(flags);
+ pset = per_cpu_ptr(zone->pageset, cpu);
- local_irq_save(flags);
- pset = per_cpu_ptr(zone->pageset, cpu);
+ pcp = &pset->pcp;
+ if (pcp->count) {
+ free_pcppages_bulk(zone, pcp->count, pcp);
+ pcp->count = 0;
+ }
+ local_irq_restore(flags);
+}
- pcp = &pset->pcp;
- if (pcp->count) {
- free_pcppages_bulk(zone, pcp->count, pcp);
- pcp->count = 0;
- }
- local_irq_restore(flags);
+/*
+ * Drain pcplists of all zones on the indicated processor.
+ *
+ * The processor must either be the current processor and the
+ * thread pinned to the current processor or a processor that
+ * is not online.
+ */
+static void drain_pages(unsigned int cpu)
+{
+ struct zone *zone;
+
+ for_each_populated_zone(zone) {
+ drain_pages_zone(cpu, zone);
}
}
/*
* Spill all of this CPU's per-cpu pages back into the buddy allocator.
+ *
+ * The CPU has to be pinned. When zone parameter is non-NULL, spill just
+ * the single zone's pages.
*/
-void drain_local_pages(void *arg)
+void drain_local_pages(struct zone *zone)
{
- drain_pages(smp_processor_id());
+ int cpu = smp_processor_id();
+
+ if (zone)
+ drain_pages_zone(cpu, zone);
+ else
+ drain_pages(cpu);
}
/*
* Spill all the per-cpu pages from all CPUs back into the buddy allocator.
*
+ * When zone parameter is non-NULL, spill just the single zone's pages.
+ *
* Note that this code is protected against sending an IPI to an offline
* CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
* on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
* nothing keeps CPUs from showing up after we populated the cpumask and
* before the call to on_each_cpu_mask().
*/
-void drain_all_pages(void)
+void drain_all_pages(struct zone *zone)
{
int cpu;
- struct per_cpu_pageset *pcp;
- struct zone *zone;
/*
* Allocate in the BSS so we wont require allocation in
@@ -1339,20 +1415,31 @@ void drain_all_pages(void)
* disables preemption as part of its processing
*/
for_each_online_cpu(cpu) {
+ struct per_cpu_pageset *pcp;
+ struct zone *z;
bool has_pcps = false;
- for_each_populated_zone(zone) {
+
+ if (zone) {
pcp = per_cpu_ptr(zone->pageset, cpu);
- if (pcp->pcp.count) {
+ if (pcp->pcp.count)
has_pcps = true;
- break;
+ } else {
+ for_each_populated_zone(z) {
+ pcp = per_cpu_ptr(z->pageset, cpu);
+ if (pcp->pcp.count) {
+ has_pcps = true;
+ break;
+ }
}
}
+
if (has_pcps)
cpumask_set_cpu(cpu, &cpus_with_pcps);
else
cpumask_clear_cpu(cpu, &cpus_with_pcps);
}
- on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
+ on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
+ zone, 1);
}
#ifdef CONFIG_HIBERNATION
@@ -1479,12 +1566,15 @@ void split_page(struct page *page, unsigned int order)
split_page(virt_to_page(page[0].shadow), order);
#endif
- for (i = 1; i < (1 << order); i++)
+ set_page_owner(page, 0, 0);
+ for (i = 1; i < (1 << order); i++) {
set_page_refcounted(page + i);
+ set_page_owner(page + i, 0, 0);
+ }
}
EXPORT_SYMBOL_GPL(split_page);
-static int __isolate_free_page(struct page *page, unsigned int order)
+int __isolate_free_page(struct page *page, unsigned int order)
{
unsigned long watermark;
struct zone *zone;
@@ -1520,6 +1610,7 @@ static int __isolate_free_page(struct page *page, unsigned int order)
}
}
+ set_page_owner(page, order, 0);
return 1UL << order;
}
@@ -1714,7 +1805,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
unsigned long mark, int classzone_idx, int alloc_flags,
long free_pages)
{
- /* free_pages my go negative - that's OK */
+ /* free_pages may go negative - that's OK */
long min = mark;
int o;
long free_cma = 0;
@@ -1962,7 +2053,7 @@ zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
- * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
+ * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
@@ -1973,7 +2064,7 @@ zonelist_scan:
continue;
if (cpusets_enabled() &&
(alloc_flags & ALLOC_CPUSET) &&
- !cpuset_zone_allowed_softwall(zone, gfp_mask))
+ !cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
* Distribute pages in proportion to the individual
@@ -2305,7 +2396,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
int classzone_idx, int migratetype, enum migrate_mode mode,
int *contended_compaction, bool *deferred_compaction)
{
- struct zone *last_compact_zone = NULL;
unsigned long compact_result;
struct page *page;
@@ -2316,7 +2406,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
compact_result = try_to_compact_pages(zonelist, order, gfp_mask,
nodemask, mode,
contended_compaction,
- &last_compact_zone);
+ alloc_flags, classzone_idx);
current->flags &= ~PF_MEMALLOC;
switch (compact_result) {
@@ -2335,10 +2425,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
*/
count_vm_event(COMPACTSTALL);
- /* Page migration frees to the PCP lists but we want merging */
- drain_pages(get_cpu());
- put_cpu();
-
page = get_page_from_freelist(gfp_mask, nodemask,
order, zonelist, high_zoneidx,
alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2354,14 +2440,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
}
/*
- * last_compact_zone is where try_to_compact_pages thought allocation
- * should succeed, so it did not defer compaction. But here we know
- * that it didn't succeed, so we do the defer.
- */
- if (last_compact_zone && mode != MIGRATE_ASYNC)
- defer_compaction(last_compact_zone, order);
-
- /*
* It's bad if compaction run occurs and fails. The most likely reason
* is that pages exist, but not enough to satisfy watermarks.
*/
@@ -2442,7 +2520,7 @@ retry:
* pages are pinned on the per-cpu lists. Drain them and try again
*/
if (!page && !drained) {
- drain_all_pages();
+ drain_all_pages(NULL);
drained = true;
goto retry;
}
@@ -2514,7 +2592,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
alloc_flags |= ALLOC_HARDER;
/*
* Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
- * comment for __cpuset_node_allowed_softwall().
+ * comment for __cpuset_node_allowed().
*/
alloc_flags &= ~ALLOC_CPUSET;
} else if (unlikely(rt_task(current)) && !in_interrupt())
@@ -3902,14 +3980,14 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
else
page_group_by_mobility_disabled = 0;
- printk("Built %i zonelists in %s order, mobility grouping %s. "
+ pr_info("Built %i zonelists in %s order, mobility grouping %s. "
"Total pages: %ld\n",
nr_online_nodes,
zonelist_order_name[current_zonelist_order],
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
#ifdef CONFIG_NUMA
- printk("Policy zone: %s\n", zone_names[policy_zone]);
+ pr_info("Policy zone: %s\n", zone_names[policy_zone]);
#endif
}
@@ -4841,7 +4919,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
- pgdat_page_cgroup_init(pgdat);
+ pgdat_page_ext_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
@@ -4860,16 +4938,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
* and per-cpu initialisations
*/
memmap_pages = calc_memmap_size(size, realsize);
- if (freesize >= memmap_pages) {
- freesize -= memmap_pages;
- if (memmap_pages)
- printk(KERN_DEBUG
- " %s zone: %lu pages used for memmap\n",
- zone_names[j], memmap_pages);
- } else
- printk(KERN_WARNING
- " %s zone: %lu pages exceeds freesize %lu\n",
- zone_names[j], memmap_pages, freesize);
+ if (!is_highmem_idx(j)) {
+ if (freesize >= memmap_pages) {
+ freesize -= memmap_pages;
+ if (memmap_pages)
+ printk(KERN_DEBUG
+ " %s zone: %lu pages used for memmap\n",
+ zone_names[j], memmap_pages);
+ } else
+ printk(KERN_WARNING
+ " %s zone: %lu pages exceeds freesize %lu\n",
+ zone_names[j], memmap_pages, freesize);
+ }
/* Account for reserved pages */
if (j == 0 && freesize > dma_reserve) {
@@ -5343,33 +5423,33 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
find_zone_movable_pfns_for_nodes();
/* Print out the zone ranges */
- printk("Zone ranges:\n");
+ pr_info("Zone ranges:\n");
for (i = 0; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
- printk(KERN_CONT " %-8s ", zone_names[i]);
+ pr_info(" %-8s ", zone_names[i]);
if (arch_zone_lowest_possible_pfn[i] ==
arch_zone_highest_possible_pfn[i])
- printk(KERN_CONT "empty\n");
+ pr_cont("empty\n");
else
- printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
+ pr_cont("[mem %0#10lx-%0#10lx]\n",
arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
(arch_zone_highest_possible_pfn[i]
<< PAGE_SHIFT) - 1);
}
/* Print out the PFNs ZONE_MOVABLE begins at in each node */
- printk("Movable zone start for each node\n");
+ pr_info("Movable zone start for each node\n");
for (i = 0; i < MAX_NUMNODES; i++) {
if (zone_movable_pfn[i])
- printk(" Node %d: %#010lx\n", i,
+ pr_info(" Node %d: %#010lx\n", i,
zone_movable_pfn[i] << PAGE_SHIFT);
}
/* Print out the early node map */
- printk("Early memory node ranges\n");
+ pr_info("Early memory node ranges\n");
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
- printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
+ pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid,
start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
/* Initialise every node */
@@ -5505,9 +5585,9 @@ void __init mem_init_print_info(const char *str)
#undef adj_init_size
- printk("Memory: %luK/%luK available "
+ pr_info("Memory: %luK/%luK available "
"(%luK kernel code, %luK rwdata, %luK rodata, "
- "%luK init, %luK bss, %luK reserved"
+ "%luK init, %luK bss, %luK reserved, %luK cma-reserved"
#ifdef CONFIG_HIGHMEM
", %luK highmem"
#endif
@@ -5515,7 +5595,8 @@ void __init mem_init_print_info(const char *str)
nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
codesize >> 10, datasize >> 10, rosize >> 10,
(init_data_size + init_code_size) >> 10, bss_size >> 10,
- (physpages - totalram_pages) << (PAGE_SHIFT-10),
+ (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10),
+ totalcma_pages << (PAGE_SHIFT-10),
#ifdef CONFIG_HIGHMEM
totalhigh_pages << (PAGE_SHIFT-10),
#endif
@@ -6207,9 +6288,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
if (!PageLRU(page))
found++;
/*
- * If there are RECLAIMABLE pages, we need to check it.
- * But now, memory offline itself doesn't call shrink_slab()
- * and it still to be fixed.
+ * If there are RECLAIMABLE pages, we need to check
+ * it. But now, memory offline itself doesn't call
+ * shrink_node_slabs() and it still to be fixed.
*/
/*
* If the page is not RAM, page_count()should be 0.
@@ -6394,7 +6475,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
*/
lru_add_drain_all();
- drain_all_pages();
+ drain_all_pages(cc.zone);
order = 0;
outer_start = start;
@@ -6408,13 +6489,12 @@ int alloc_contig_range(unsigned long start, unsigned long end,
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, false)) {
- pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
- outer_start, end);
+ pr_info("%s: [%lx, %lx) PFNs busy\n",
+ __func__, outer_start, end);
ret = -EBUSY;
goto done;
}
-
/* Grab isolated pages from freelists. */
outer_end = isolate_freepages_range(&cc, outer_start, end);
if (!outer_end) {
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
deleted file mode 100644
index 5331c2bd85a2..000000000000
--- a/mm/page_cgroup.c
+++ /dev/null
@@ -1,530 +0,0 @@
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/bootmem.h>
-#include <linux/bit_spinlock.h>
-#include <linux/page_cgroup.h>
-#include <linux/hash.h>
-#include <linux/slab.h>
-#include <linux/memory.h>
-#include <linux/vmalloc.h>
-#include <linux/cgroup.h>
-#include <linux/swapops.h>
-#include <linux/kmemleak.h>
-
-static unsigned long total_usage;
-
-#if !defined(CONFIG_SPARSEMEM)
-
-
-void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
-{
- pgdat->node_page_cgroup = NULL;
-}
-
-struct page_cgroup *lookup_page_cgroup(struct page *page)
-{
- unsigned long pfn = page_to_pfn(page);
- unsigned long offset;
- struct page_cgroup *base;
-
- base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
-#ifdef CONFIG_DEBUG_VM
- /*
- * The sanity checks the page allocator does upon freeing a
- * page can reach here before the page_cgroup arrays are
- * allocated when feeding a range of pages to the allocator
- * for the first time during bootup or memory hotplug.
- */
- if (unlikely(!base))
- return NULL;
-#endif
- offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
- return base + offset;
-}
-
-static int __init alloc_node_page_cgroup(int nid)
-{
- struct page_cgroup *base;
- unsigned long table_size;
- unsigned long nr_pages;
-
- nr_pages = NODE_DATA(nid)->node_spanned_pages;
- if (!nr_pages)
- return 0;
-
- table_size = sizeof(struct page_cgroup) * nr_pages;
-
- base = memblock_virt_alloc_try_nid_nopanic(
- table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
- BOOTMEM_ALLOC_ACCESSIBLE, nid);
- if (!base)
- return -ENOMEM;
- NODE_DATA(nid)->node_page_cgroup = base;
- total_usage += table_size;
- return 0;
-}
-
-void __init page_cgroup_init_flatmem(void)
-{
-
- int nid, fail;
-
- if (mem_cgroup_disabled())
- return;
-
- for_each_online_node(nid) {
- fail = alloc_node_page_cgroup(nid);
- if (fail)
- goto fail;
- }
- printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
- printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
- " don't want memory cgroups\n");
- return;
-fail:
- printk(KERN_CRIT "allocation of page_cgroup failed.\n");
- printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
- panic("Out of memory");
-}
-
-#else /* CONFIG_FLAT_NODE_MEM_MAP */
-
-struct page_cgroup *lookup_page_cgroup(struct page *page)
-{
- unsigned long pfn = page_to_pfn(page);
- struct mem_section *section = __pfn_to_section(pfn);
-#ifdef CONFIG_DEBUG_VM
- /*
- * The sanity checks the page allocator does upon freeing a
- * page can reach here before the page_cgroup arrays are
- * allocated when feeding a range of pages to the allocator
- * for the first time during bootup or memory hotplug.
- */
- if (!section->page_cgroup)
- return NULL;
-#endif
- return section->page_cgroup + pfn;
-}
-
-static void *__meminit alloc_page_cgroup(size_t size, int nid)
-{
- gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
- void *addr = NULL;
-
- addr = alloc_pages_exact_nid(nid, size, flags);
- if (addr) {
- kmemleak_alloc(addr, size, 1, flags);
- return addr;
- }
-
- if (node_state(nid, N_HIGH_MEMORY))
- addr = vzalloc_node(size, nid);
- else
- addr = vzalloc(size);
-
- return addr;
-}
-
-static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
-{
- struct mem_section *section;
- struct page_cgroup *base;
- unsigned long table_size;
-
- section = __pfn_to_section(pfn);
-
- if (section->page_cgroup)
- return 0;
-
- table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
- base = alloc_page_cgroup(table_size, nid);
-
- /*
- * The value stored in section->page_cgroup is (base - pfn)
- * and it does not point to the memory block allocated above,
- * causing kmemleak false positives.
- */
- kmemleak_not_leak(base);
-
- if (!base) {
- printk(KERN_ERR "page cgroup allocation failure\n");
- return -ENOMEM;
- }
-
- /*
- * The passed "pfn" may not be aligned to SECTION. For the calculation
- * we need to apply a mask.
- */
- pfn &= PAGE_SECTION_MASK;
- section->page_cgroup = base - pfn;
- total_usage += table_size;
- return 0;
-}
-#ifdef CONFIG_MEMORY_HOTPLUG
-static void free_page_cgroup(void *addr)
-{
- if (is_vmalloc_addr(addr)) {
- vfree(addr);
- } else {
- struct page *page = virt_to_page(addr);
- size_t table_size =
- sizeof(struct page_cgroup) * PAGES_PER_SECTION;
-
- BUG_ON(PageReserved(page));
- kmemleak_free(addr);
- free_pages_exact(addr, table_size);
- }
-}
-
-static void __free_page_cgroup(unsigned long pfn)
-{
- struct mem_section *ms;
- struct page_cgroup *base;
-
- ms = __pfn_to_section(pfn);
- if (!ms || !ms->page_cgroup)
- return;
- base = ms->page_cgroup + pfn;
- free_page_cgroup(base);
- ms->page_cgroup = NULL;
-}
-
-static int __meminit online_page_cgroup(unsigned long start_pfn,
- unsigned long nr_pages,
- int nid)
-{
- unsigned long start, end, pfn;
- int fail = 0;
-
- start = SECTION_ALIGN_DOWN(start_pfn);
- end = SECTION_ALIGN_UP(start_pfn + nr_pages);
-
- if (nid == -1) {
- /*
- * In this case, "nid" already exists and contains valid memory.
- * "start_pfn" passed to us is a pfn which is an arg for
- * online__pages(), and start_pfn should exist.
- */
- nid = pfn_to_nid(start_pfn);
- VM_BUG_ON(!node_state(nid, N_ONLINE));
- }
-
- for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
- if (!pfn_present(pfn))
- continue;
- fail = init_section_page_cgroup(pfn, nid);
- }
- if (!fail)
- return 0;
-
- /* rollback */
- for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
- __free_page_cgroup(pfn);
-
- return -ENOMEM;
-}
-
-static int __meminit offline_page_cgroup(unsigned long start_pfn,
- unsigned long nr_pages, int nid)
-{
- unsigned long start, end, pfn;
-
- start = SECTION_ALIGN_DOWN(start_pfn);
- end = SECTION_ALIGN_UP(start_pfn + nr_pages);
-
- for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
- __free_page_cgroup(pfn);
- return 0;
-
-}
-
-static int __meminit page_cgroup_callback(struct notifier_block *self,
- unsigned long action, void *arg)
-{
- struct memory_notify *mn = arg;
- int ret = 0;
- switch (action) {
- case MEM_GOING_ONLINE:
- ret = online_page_cgroup(mn->start_pfn,
- mn->nr_pages, mn->status_change_nid);
- break;
- case MEM_OFFLINE:
- offline_page_cgroup(mn->start_pfn,
- mn->nr_pages, mn->status_change_nid);
- break;
- case MEM_CANCEL_ONLINE:
- offline_page_cgroup(mn->start_pfn,
- mn->nr_pages, mn->status_change_nid);
- break;
- case MEM_GOING_OFFLINE:
- break;
- case MEM_ONLINE:
- case MEM_CANCEL_OFFLINE:
- break;
- }
-
- return notifier_from_errno(ret);
-}
-
-#endif
-
-void __init page_cgroup_init(void)
-{
- unsigned long pfn;
- int nid;
-
- if (mem_cgroup_disabled())
- return;
-
- for_each_node_state(nid, N_MEMORY) {
- unsigned long start_pfn, end_pfn;
-
- start_pfn = node_start_pfn(nid);
- end_pfn = node_end_pfn(nid);
- /*
- * start_pfn and end_pfn may not be aligned to SECTION and the
- * page->flags of out of node pages are not initialized. So we
- * scan [start_pfn, the biggest section's pfn < end_pfn) here.
- */
- for (pfn = start_pfn;
- pfn < end_pfn;
- pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
-
- if (!pfn_valid(pfn))
- continue;
- /*
- * Nodes's pfns can be overlapping.
- * We know some arch can have a nodes layout such as
- * -------------pfn-------------->
- * N0 | N1 | N2 | N0 | N1 | N2|....
- */
- if (pfn_to_nid(pfn) != nid)
- continue;
- if (init_section_page_cgroup(pfn, nid))
- goto oom;
- }
- }
- hotplug_memory_notifier(page_cgroup_callback, 0);
- printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
- printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
- "don't want memory cgroups\n");
- return;
-oom:
- printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
- panic("Out of memory");
-}
-
-void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
-{
- return;
-}
-
-#endif
-
-
-#ifdef CONFIG_MEMCG_SWAP
-
-static DEFINE_MUTEX(swap_cgroup_mutex);
-struct swap_cgroup_ctrl {
- struct page **map;
- unsigned long length;
- spinlock_t lock;
-};
-
-static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
-
-struct swap_cgroup {
- unsigned short id;
-};
-#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
-
-/*
- * SwapCgroup implements "lookup" and "exchange" operations.
- * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
- * against SwapCache. At swap_free(), this is accessed directly from swap.
- *
- * This means,
- * - we have no race in "exchange" when we're accessed via SwapCache because
- * SwapCache(and its swp_entry) is under lock.
- * - When called via swap_free(), there is no user of this entry and no race.
- * Then, we don't need lock around "exchange".
- *
- * TODO: we can push these buffers out to HIGHMEM.
- */
-
-/*
- * allocate buffer for swap_cgroup.
- */
-static int swap_cgroup_prepare(int type)
-{
- struct page *page;
- struct swap_cgroup_ctrl *ctrl;
- unsigned long idx, max;
-
- ctrl = &swap_cgroup_ctrl[type];
-
- for (idx = 0; idx < ctrl->length; idx++) {
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page)
- goto not_enough_page;
- ctrl->map[idx] = page;
- }
- return 0;
-not_enough_page:
- max = idx;
- for (idx = 0; idx < max; idx++)
- __free_page(ctrl->map[idx]);
-
- return -ENOMEM;
-}
-
-static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
- struct swap_cgroup_ctrl **ctrlp)
-{
- pgoff_t offset = swp_offset(ent);
- struct swap_cgroup_ctrl *ctrl;
- struct page *mappage;
- struct swap_cgroup *sc;
-
- ctrl = &swap_cgroup_ctrl[swp_type(ent)];
- if (ctrlp)
- *ctrlp = ctrl;
-
- mappage = ctrl->map[offset / SC_PER_PAGE];
- sc = page_address(mappage);
- return sc + offset % SC_PER_PAGE;
-}
-
-/**
- * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
- * @ent: swap entry to be cmpxchged
- * @old: old id
- * @new: new id
- *
- * Returns old id at success, 0 at failure.
- * (There is no mem_cgroup using 0 as its id)
- */
-unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
- unsigned short old, unsigned short new)
-{
- struct swap_cgroup_ctrl *ctrl;
- struct swap_cgroup *sc;
- unsigned long flags;
- unsigned short retval;
-
- sc = lookup_swap_cgroup(ent, &ctrl);
-
- spin_lock_irqsave(&ctrl->lock, flags);
- retval = sc->id;
- if (retval == old)
- sc->id = new;
- else
- retval = 0;
- spin_unlock_irqrestore(&ctrl->lock, flags);
- return retval;
-}
-
-/**
- * swap_cgroup_record - record mem_cgroup for this swp_entry.
- * @ent: swap entry to be recorded into
- * @id: mem_cgroup to be recorded
- *
- * Returns old value at success, 0 at failure.
- * (Of course, old value can be 0.)
- */
-unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
-{
- struct swap_cgroup_ctrl *ctrl;
- struct swap_cgroup *sc;
- unsigned short old;
- unsigned long flags;
-
- sc = lookup_swap_cgroup(ent, &ctrl);
-
- spin_lock_irqsave(&ctrl->lock, flags);
- old = sc->id;
- sc->id = id;
- spin_unlock_irqrestore(&ctrl->lock, flags);
-
- return old;
-}
-
-/**
- * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
- * @ent: swap entry to be looked up.
- *
- * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
- */
-unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
-{
- return lookup_swap_cgroup(ent, NULL)->id;
-}
-
-int swap_cgroup_swapon(int type, unsigned long max_pages)
-{
- void *array;
- unsigned long array_size;
- unsigned long length;
- struct swap_cgroup_ctrl *ctrl;
-
- if (!do_swap_account)
- return 0;
-
- length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
- array_size = length * sizeof(void *);
-
- array = vzalloc(array_size);
- if (!array)
- goto nomem;
-
- ctrl = &swap_cgroup_ctrl[type];
- mutex_lock(&swap_cgroup_mutex);
- ctrl->length = length;
- ctrl->map = array;
- spin_lock_init(&ctrl->lock);
- if (swap_cgroup_prepare(type)) {
- /* memory shortage */
- ctrl->map = NULL;
- ctrl->length = 0;
- mutex_unlock(&swap_cgroup_mutex);
- vfree(array);
- goto nomem;
- }
- mutex_unlock(&swap_cgroup_mutex);
-
- return 0;
-nomem:
- printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
- printk(KERN_INFO
- "swap_cgroup can be disabled by swapaccount=0 boot option\n");
- return -ENOMEM;
-}
-
-void swap_cgroup_swapoff(int type)
-{
- struct page **map;
- unsigned long i, length;
- struct swap_cgroup_ctrl *ctrl;
-
- if (!do_swap_account)
- return;
-
- mutex_lock(&swap_cgroup_mutex);
- ctrl = &swap_cgroup_ctrl[type];
- map = ctrl->map;
- length = ctrl->length;
- ctrl->map = NULL;
- ctrl->length = 0;
- mutex_unlock(&swap_cgroup_mutex);
-
- if (map) {
- for (i = 0; i < length; i++) {
- struct page *page = map[i];
- if (page)
- __free_page(page);
- }
- vfree(map);
- }
-}
-
-#endif
diff --git a/mm/page_counter.c b/mm/page_counter.c
new file mode 100644
index 000000000000..a009574fbba9
--- /dev/null
+++ b/mm/page_counter.c
@@ -0,0 +1,192 @@
+/*
+ * Lockless hierarchical page accounting & limiting
+ *
+ * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
+ */
+
+#include <linux/page_counter.h>
+#include <linux/atomic.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/bug.h>
+#include <asm/page.h>
+
+/**
+ * page_counter_cancel - take pages out of the local counter
+ * @counter: counter
+ * @nr_pages: number of pages to cancel
+ */
+void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
+{
+ long new;
+
+ new = atomic_long_sub_return(nr_pages, &counter->count);
+ /* More uncharges than charges? */
+ WARN_ON_ONCE(new < 0);
+}
+
+/**
+ * page_counter_charge - hierarchically charge pages
+ * @counter: counter
+ * @nr_pages: number of pages to charge
+ *
+ * NOTE: This does not consider any configured counter limits.
+ */
+void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
+{
+ struct page_counter *c;
+
+ for (c = counter; c; c = c->parent) {
+ long new;
+
+ new = atomic_long_add_return(nr_pages, &c->count);
+ /*
+ * This is indeed racy, but we can live with some
+ * inaccuracy in the watermark.
+ */
+ if (new > c->watermark)
+ c->watermark = new;
+ }
+}
+
+/**
+ * page_counter_try_charge - try to hierarchically charge pages
+ * @counter: counter
+ * @nr_pages: number of pages to charge
+ * @fail: points first counter to hit its limit, if any
+ *
+ * Returns 0 on success, or -ENOMEM and @fail if the counter or one of
+ * its ancestors has hit its configured limit.
+ */
+int page_counter_try_charge(struct page_counter *counter,
+ unsigned long nr_pages,
+ struct page_counter **fail)
+{
+ struct page_counter *c;
+
+ for (c = counter; c; c = c->parent) {
+ long new;
+ /*
+ * Charge speculatively to avoid an expensive CAS. If
+ * a bigger charge fails, it might falsely lock out a
+ * racing smaller charge and send it into reclaim
+ * early, but the error is limited to the difference
+ * between the two sizes, which is less than 2M/4M in
+ * case of a THP locking out a regular page charge.
+ *
+ * The atomic_long_add_return() implies a full memory
+ * barrier between incrementing the count and reading
+ * the limit. When racing with page_counter_limit(),
+ * we either see the new limit or the setter sees the
+ * counter has changed and retries.
+ */
+ new = atomic_long_add_return(nr_pages, &c->count);
+ if (new > c->limit) {
+ atomic_long_sub(nr_pages, &c->count);
+ /*
+ * This is racy, but we can live with some
+ * inaccuracy in the failcnt.
+ */
+ c->failcnt++;
+ *fail = c;
+ goto failed;
+ }
+ /*
+ * Just like with failcnt, we can live with some
+ * inaccuracy in the watermark.
+ */
+ if (new > c->watermark)
+ c->watermark = new;
+ }
+ return 0;
+
+failed:
+ for (c = counter; c != *fail; c = c->parent)
+ page_counter_cancel(c, nr_pages);
+
+ return -ENOMEM;
+}
+
+/**
+ * page_counter_uncharge - hierarchically uncharge pages
+ * @counter: counter
+ * @nr_pages: number of pages to uncharge
+ */
+void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
+{
+ struct page_counter *c;
+
+ for (c = counter; c; c = c->parent)
+ page_counter_cancel(c, nr_pages);
+}
+
+/**
+ * page_counter_limit - limit the number of pages allowed
+ * @counter: counter
+ * @limit: limit to set
+ *
+ * Returns 0 on success, -EBUSY if the current number of pages on the
+ * counter already exceeds the specified limit.
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+int page_counter_limit(struct page_counter *counter, unsigned long limit)
+{
+ for (;;) {
+ unsigned long old;
+ long count;
+
+ /*
+ * Update the limit while making sure that it's not
+ * below the concurrently-changing counter value.
+ *
+ * The xchg implies two full memory barriers before
+ * and after, so the read-swap-read is ordered and
+ * ensures coherency with page_counter_try_charge():
+ * that function modifies the count before checking
+ * the limit, so if it sees the old limit, we see the
+ * modified counter and retry.
+ */
+ count = atomic_long_read(&counter->count);
+
+ if (count > limit)
+ return -EBUSY;
+
+ old = xchg(&counter->limit, limit);
+
+ if (atomic_long_read(&counter->count) <= count)
+ return 0;
+
+ counter->limit = old;
+ cond_resched();
+ }
+}
+
+/**
+ * page_counter_memparse - memparse() for page counter limits
+ * @buf: string to parse
+ * @nr_pages: returns the result in number of pages
+ *
+ * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be
+ * limited to %PAGE_COUNTER_MAX.
+ */
+int page_counter_memparse(const char *buf, unsigned long *nr_pages)
+{
+ char unlimited[] = "-1";
+ char *end;
+ u64 bytes;
+
+ if (!strncmp(buf, unlimited, sizeof(unlimited))) {
+ *nr_pages = PAGE_COUNTER_MAX;
+ return 0;
+ }
+
+ bytes = memparse(buf, &end);
+ if (*end != '\0')
+ return -EINVAL;
+
+ *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
+
+ return 0;
+}
diff --git a/mm/page_ext.c b/mm/page_ext.c
new file mode 100644
index 000000000000..d86fd2f5353f
--- /dev/null
+++ b/mm/page_ext.c
@@ -0,0 +1,403 @@
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/bootmem.h>
+#include <linux/page_ext.h>
+#include <linux/memory.h>
+#include <linux/vmalloc.h>
+#include <linux/kmemleak.h>
+#include <linux/page_owner.h>
+
+/*
+ * struct page extension
+ *
+ * This is the feature to manage memory for extended data per page.
+ *
+ * Until now, we must modify struct page itself to store extra data per page.
+ * This requires rebuilding the kernel and it is really time consuming process.
+ * And, sometimes, rebuild is impossible due to third party module dependency.
+ * At last, enlarging struct page could cause un-wanted system behaviour change.
+ *
+ * This feature is intended to overcome above mentioned problems. This feature
+ * allocates memory for extended data per page in certain place rather than
+ * the struct page itself. This memory can be accessed by the accessor
+ * functions provided by this code. During the boot process, it checks whether
+ * allocation of huge chunk of memory is needed or not. If not, it avoids
+ * allocating memory at all. With this advantage, we can include this feature
+ * into the kernel in default and can avoid rebuild and solve related problems.
+ *
+ * To help these things to work well, there are two callbacks for clients. One
+ * is the need callback which is mandatory if user wants to avoid useless
+ * memory allocation at boot-time. The other is optional, init callback, which
+ * is used to do proper initialization after memory is allocated.
+ *
+ * The need callback is used to decide whether extended memory allocation is
+ * needed or not. Sometimes users want to deactivate some features in this
+ * boot and extra memory would be unneccessary. In this case, to avoid
+ * allocating huge chunk of memory, each clients represent their need of
+ * extra memory through the need callback. If one of the need callbacks
+ * returns true, it means that someone needs extra memory so that
+ * page extension core should allocates memory for page extension. If
+ * none of need callbacks return true, memory isn't needed at all in this boot
+ * and page extension core can skip to allocate memory. As result,
+ * none of memory is wasted.
+ *
+ * The init callback is used to do proper initialization after page extension
+ * is completely initialized. In sparse memory system, extra memory is
+ * allocated some time later than memmap is allocated. In other words, lifetime
+ * of memory for page extension isn't same with memmap for struct page.
+ * Therefore, clients can't store extra data until page extension is
+ * initialized, even if pages are allocated and used freely. This could
+ * cause inadequate state of extra data per page, so, to prevent it, client
+ * can utilize this callback to initialize the state of it correctly.
+ */
+
+static struct page_ext_operations *page_ext_ops[] = {
+ &debug_guardpage_ops,
+#ifdef CONFIG_PAGE_POISONING
+ &page_poisoning_ops,
+#endif
+#ifdef CONFIG_PAGE_OWNER
+ &page_owner_ops,
+#endif
+};
+
+static unsigned long total_usage;
+
+static bool __init invoke_need_callbacks(void)
+{
+ int i;
+ int entries = ARRAY_SIZE(page_ext_ops);
+
+ for (i = 0; i < entries; i++) {
+ if (page_ext_ops[i]->need && page_ext_ops[i]->need())
+ return true;
+ }
+
+ return false;
+}
+
+static void __init invoke_init_callbacks(void)
+{
+ int i;
+ int entries = ARRAY_SIZE(page_ext_ops);
+
+ for (i = 0; i < entries; i++) {
+ if (page_ext_ops[i]->init)
+ page_ext_ops[i]->init();
+ }
+}
+
+#if !defined(CONFIG_SPARSEMEM)
+
+
+void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
+{
+ pgdat->node_page_ext = NULL;
+}
+
+struct page_ext *lookup_page_ext(struct page *page)
+{
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long offset;
+ struct page_ext *base;
+
+ base = NODE_DATA(page_to_nid(page))->node_page_ext;
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * The sanity checks the page allocator does upon freeing a
+ * page can reach here before the page_ext arrays are
+ * allocated when feeding a range of pages to the allocator
+ * for the first time during bootup or memory hotplug.
+ */
+ if (unlikely(!base))
+ return NULL;
+#endif
+ offset = pfn - round_down(node_start_pfn(page_to_nid(page)),
+ MAX_ORDER_NR_PAGES);
+ return base + offset;
+}
+
+static int __init alloc_node_page_ext(int nid)
+{
+ struct page_ext *base;
+ unsigned long table_size;
+ unsigned long nr_pages;
+
+ nr_pages = NODE_DATA(nid)->node_spanned_pages;
+ if (!nr_pages)
+ return 0;
+
+ /*
+ * Need extra space if node range is not aligned with
+ * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm
+ * checks buddy's status, range could be out of exact node range.
+ */
+ if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) ||
+ !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
+ nr_pages += MAX_ORDER_NR_PAGES;
+
+ table_size = sizeof(struct page_ext) * nr_pages;
+
+ base = memblock_virt_alloc_try_nid_nopanic(
+ table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nid);
+ if (!base)
+ return -ENOMEM;
+ NODE_DATA(nid)->node_page_ext = base;
+ total_usage += table_size;
+ return 0;
+}
+
+void __init page_ext_init_flatmem(void)
+{
+
+ int nid, fail;
+
+ if (!invoke_need_callbacks())
+ return;
+
+ for_each_online_node(nid) {
+ fail = alloc_node_page_ext(nid);
+ if (fail)
+ goto fail;
+ }
+ pr_info("allocated %ld bytes of page_ext\n", total_usage);
+ invoke_init_callbacks();
+ return;
+
+fail:
+ pr_crit("allocation of page_ext failed.\n");
+ panic("Out of memory");
+}
+
+#else /* CONFIG_FLAT_NODE_MEM_MAP */
+
+struct page_ext *lookup_page_ext(struct page *page)
+{
+ unsigned long pfn = page_to_pfn(page);
+ struct mem_section *section = __pfn_to_section(pfn);
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * The sanity checks the page allocator does upon freeing a
+ * page can reach here before the page_ext arrays are
+ * allocated when feeding a range of pages to the allocator
+ * for the first time during bootup or memory hotplug.
+ */
+ if (!section->page_ext)
+ return NULL;
+#endif
+ return section->page_ext + pfn;
+}
+
+static void *__meminit alloc_page_ext(size_t size, int nid)
+{
+ gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
+ void *addr = NULL;
+
+ addr = alloc_pages_exact_nid(nid, size, flags);
+ if (addr) {
+ kmemleak_alloc(addr, size, 1, flags);
+ return addr;
+ }
+
+ if (node_state(nid, N_HIGH_MEMORY))
+ addr = vzalloc_node(size, nid);
+ else
+ addr = vzalloc(size);
+
+ return addr;
+}
+
+static int __meminit init_section_page_ext(unsigned long pfn, int nid)
+{
+ struct mem_section *section;
+ struct page_ext *base;
+ unsigned long table_size;
+
+ section = __pfn_to_section(pfn);
+
+ if (section->page_ext)
+ return 0;
+
+ table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
+ base = alloc_page_ext(table_size, nid);
+
+ /*
+ * The value stored in section->page_ext is (base - pfn)
+ * and it does not point to the memory block allocated above,
+ * causing kmemleak false positives.
+ */
+ kmemleak_not_leak(base);
+
+ if (!base) {
+ pr_err("page ext allocation failure\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * The passed "pfn" may not be aligned to SECTION. For the calculation
+ * we need to apply a mask.
+ */
+ pfn &= PAGE_SECTION_MASK;
+ section->page_ext = base - pfn;
+ total_usage += table_size;
+ return 0;
+}
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_page_ext(void *addr)
+{
+ if (is_vmalloc_addr(addr)) {
+ vfree(addr);
+ } else {
+ struct page *page = virt_to_page(addr);
+ size_t table_size;
+
+ table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
+
+ BUG_ON(PageReserved(page));
+ free_pages_exact(addr, table_size);
+ }
+}
+
+static void __free_page_ext(unsigned long pfn)
+{
+ struct mem_section *ms;
+ struct page_ext *base;
+
+ ms = __pfn_to_section(pfn);
+ if (!ms || !ms->page_ext)
+ return;
+ base = ms->page_ext + pfn;
+ free_page_ext(base);
+ ms->page_ext = NULL;
+}
+
+static int __meminit online_page_ext(unsigned long start_pfn,
+ unsigned long nr_pages,
+ int nid)
+{
+ unsigned long start, end, pfn;
+ int fail = 0;
+
+ start = SECTION_ALIGN_DOWN(start_pfn);
+ end = SECTION_ALIGN_UP(start_pfn + nr_pages);
+
+ if (nid == -1) {
+ /*
+ * In this case, "nid" already exists and contains valid memory.
+ * "start_pfn" passed to us is a pfn which is an arg for
+ * online__pages(), and start_pfn should exist.
+ */
+ nid = pfn_to_nid(start_pfn);
+ VM_BUG_ON(!node_state(nid, N_ONLINE));
+ }
+
+ for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
+ if (!pfn_present(pfn))
+ continue;
+ fail = init_section_page_ext(pfn, nid);
+ }
+ if (!fail)
+ return 0;
+
+ /* rollback */
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+ __free_page_ext(pfn);
+
+ return -ENOMEM;
+}
+
+static int __meminit offline_page_ext(unsigned long start_pfn,
+ unsigned long nr_pages, int nid)
+{
+ unsigned long start, end, pfn;
+
+ start = SECTION_ALIGN_DOWN(start_pfn);
+ end = SECTION_ALIGN_UP(start_pfn + nr_pages);
+
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+ __free_page_ext(pfn);
+ return 0;
+
+}
+
+static int __meminit page_ext_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ struct memory_notify *mn = arg;
+ int ret = 0;
+
+ switch (action) {
+ case MEM_GOING_ONLINE:
+ ret = online_page_ext(mn->start_pfn,
+ mn->nr_pages, mn->status_change_nid);
+ break;
+ case MEM_OFFLINE:
+ offline_page_ext(mn->start_pfn,
+ mn->nr_pages, mn->status_change_nid);
+ break;
+ case MEM_CANCEL_ONLINE:
+ offline_page_ext(mn->start_pfn,
+ mn->nr_pages, mn->status_change_nid);
+ break;
+ case MEM_GOING_OFFLINE:
+ break;
+ case MEM_ONLINE:
+ case MEM_CANCEL_OFFLINE:
+ break;
+ }
+
+ return notifier_from_errno(ret);
+}
+
+#endif
+
+void __init page_ext_init(void)
+{
+ unsigned long pfn;
+ int nid;
+
+ if (!invoke_need_callbacks())
+ return;
+
+ for_each_node_state(nid, N_MEMORY) {
+ unsigned long start_pfn, end_pfn;
+
+ start_pfn = node_start_pfn(nid);
+ end_pfn = node_end_pfn(nid);
+ /*
+ * start_pfn and end_pfn may not be aligned to SECTION and the
+ * page->flags of out of node pages are not initialized. So we
+ * scan [start_pfn, the biggest section's pfn < end_pfn) here.
+ */
+ for (pfn = start_pfn; pfn < end_pfn;
+ pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
+
+ if (!pfn_valid(pfn))
+ continue;
+ /*
+ * Nodes's pfns can be overlapping.
+ * We know some arch can have a nodes layout such as
+ * -------------pfn-------------->
+ * N0 | N1 | N2 | N0 | N1 | N2|....
+ */
+ if (pfn_to_nid(pfn) != nid)
+ continue;
+ if (init_section_page_ext(pfn, nid))
+ goto oom;
+ }
+ }
+ hotplug_memory_notifier(page_ext_callback, 0);
+ pr_info("allocated %ld bytes of page_ext\n", total_usage);
+ invoke_init_callbacks();
+ return;
+
+oom:
+ panic("Out of memory");
+}
+
+void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
+{
+}
+
+#endif
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index d1473b2e9481..72f5ac381ab3 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -60,6 +60,7 @@ out:
int migratetype = get_pageblock_migratetype(page);
set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+ zone->nr_isolate_pageblock++;
nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
__mod_zone_freepage_state(zone, -nr_pages, migratetype);
@@ -67,7 +68,7 @@ out:
spin_unlock_irqrestore(&zone->lock, flags);
if (!ret)
- drain_all_pages();
+ drain_all_pages(zone);
return ret;
}
@@ -75,16 +76,54 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
{
struct zone *zone;
unsigned long flags, nr_pages;
+ struct page *isolated_page = NULL;
+ unsigned int order;
+ unsigned long page_idx, buddy_idx;
+ struct page *buddy;
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
goto out;
- nr_pages = move_freepages_block(zone, page, migratetype);
- __mod_zone_freepage_state(zone, nr_pages, migratetype);
+
+ /*
+ * Because freepage with more than pageblock_order on isolated
+ * pageblock is restricted to merge due to freepage counting problem,
+ * it is possible that there is free buddy page.
+ * move_freepages_block() doesn't care of merge so we need other
+ * approach in order to merge them. Isolation and free will make
+ * these pages to be merged.
+ */
+ if (PageBuddy(page)) {
+ order = page_order(page);
+ if (order >= pageblock_order) {
+ page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
+ buddy_idx = __find_buddy_index(page_idx, order);
+ buddy = page + (buddy_idx - page_idx);
+
+ if (!is_migrate_isolate_page(buddy)) {
+ __isolate_free_page(page, order);
+ set_page_refcounted(page);
+ isolated_page = page;
+ }
+ }
+ }
+
+ /*
+ * If we isolate freepage with more than pageblock_order, there
+ * should be no freepage in the range, so we could avoid costly
+ * pageblock scanning for freepage moving.
+ */
+ if (!isolated_page) {
+ nr_pages = move_freepages_block(zone, page, migratetype);
+ __mod_zone_freepage_state(zone, nr_pages, migratetype);
+ }
set_pageblock_migratetype(page, migratetype);
+ zone->nr_isolate_pageblock--;
out:
spin_unlock_irqrestore(&zone->lock, flags);
+ if (isolated_page)
+ __free_pages(isolated_page, order);
}
static inline struct page *
diff --git a/mm/page_owner.c b/mm/page_owner.c
new file mode 100644
index 000000000000..9ab4a9b5bc09
--- /dev/null
+++ b/mm/page_owner.c
@@ -0,0 +1,311 @@
+#include <linux/debugfs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/bootmem.h>
+#include <linux/stacktrace.h>
+#include <linux/page_owner.h>
+#include "internal.h"
+
+static bool page_owner_disabled = true;
+bool page_owner_inited __read_mostly;
+
+static void init_early_allocated_pages(void);
+
+static int early_page_owner_param(char *buf)
+{
+ if (!buf)
+ return -EINVAL;
+
+ if (strcmp(buf, "on") == 0)
+ page_owner_disabled = false;
+
+ return 0;
+}
+early_param("page_owner", early_page_owner_param);
+
+static bool need_page_owner(void)
+{
+ if (page_owner_disabled)
+ return false;
+
+ return true;
+}
+
+static void init_page_owner(void)
+{
+ if (page_owner_disabled)
+ return;
+
+ page_owner_inited = true;
+ init_early_allocated_pages();
+}
+
+struct page_ext_operations page_owner_ops = {
+ .need = need_page_owner,
+ .init = init_page_owner,
+};
+
+void __reset_page_owner(struct page *page, unsigned int order)
+{
+ int i;
+ struct page_ext *page_ext;
+
+ for (i = 0; i < (1 << order); i++) {
+ page_ext = lookup_page_ext(page + i);
+ __clear_bit(PAGE_EXT_OWNER, &page_ext->flags);
+ }
+}
+
+void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
+{
+ struct page_ext *page_ext;
+ struct stack_trace *trace;
+
+ page_ext = lookup_page_ext(page);
+
+ trace = &page_ext->trace;
+ trace->nr_entries = 0;
+ trace->max_entries = ARRAY_SIZE(page_ext->trace_entries);
+ trace->entries = &page_ext->trace_entries[0];
+ trace->skip = 3;
+ save_stack_trace(&page_ext->trace);
+
+ page_ext->order = order;
+ page_ext->gfp_mask = gfp_mask;
+
+ __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+}
+
+static ssize_t
+print_page_owner(char __user *buf, size_t count, unsigned long pfn,
+ struct page *page, struct page_ext *page_ext)
+{
+ int ret;
+ int pageblock_mt, page_mt;
+ char *kbuf;
+
+ kbuf = kmalloc(count, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ ret = snprintf(kbuf, count,
+ "Page allocated via order %u, mask 0x%x\n",
+ page_ext->order, page_ext->gfp_mask);
+
+ if (ret >= count)
+ goto err;
+
+ /* Print information relevant to grouping pages by mobility */
+ pageblock_mt = get_pfnblock_migratetype(page, pfn);
+ page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
+ ret += snprintf(kbuf + ret, count - ret,
+ "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n",
+ pfn,
+ pfn >> pageblock_order,
+ pageblock_mt,
+ pageblock_mt != page_mt ? "Fallback" : " ",
+ PageLocked(page) ? "K" : " ",
+ PageError(page) ? "E" : " ",
+ PageReferenced(page) ? "R" : " ",
+ PageUptodate(page) ? "U" : " ",
+ PageDirty(page) ? "D" : " ",
+ PageLRU(page) ? "L" : " ",
+ PageActive(page) ? "A" : " ",
+ PageSlab(page) ? "S" : " ",
+ PageWriteback(page) ? "W" : " ",
+ PageCompound(page) ? "C" : " ",
+ PageSwapCache(page) ? "B" : " ",
+ PageMappedToDisk(page) ? "M" : " ");
+
+ if (ret >= count)
+ goto err;
+
+ ret += snprint_stack_trace(kbuf + ret, count - ret,
+ &page_ext->trace, 0);
+ if (ret >= count)
+ goto err;
+
+ ret += snprintf(kbuf + ret, count - ret, "\n");
+ if (ret >= count)
+ goto err;
+
+ if (copy_to_user(buf, kbuf, ret))
+ ret = -EFAULT;
+
+ kfree(kbuf);
+ return ret;
+
+err:
+ kfree(kbuf);
+ return -ENOMEM;
+}
+
+static ssize_t
+read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+ unsigned long pfn;
+ struct page *page;
+ struct page_ext *page_ext;
+
+ if (!page_owner_inited)
+ return -EINVAL;
+
+ page = NULL;
+ pfn = min_low_pfn + *ppos;
+
+ /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
+ while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
+ pfn++;
+
+ drain_all_pages(NULL);
+
+ /* Find an allocated page */
+ for (; pfn < max_pfn; pfn++) {
+ /*
+ * If the new page is in a new MAX_ORDER_NR_PAGES area,
+ * validate the area as existing, skip it if not
+ */
+ if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) {
+ pfn += MAX_ORDER_NR_PAGES - 1;
+ continue;
+ }
+
+ /* Check for holes within a MAX_ORDER area */
+ if (!pfn_valid_within(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+ if (PageBuddy(page)) {
+ unsigned long freepage_order = page_order_unsafe(page);
+
+ if (freepage_order < MAX_ORDER)
+ pfn += (1UL << freepage_order) - 1;
+ continue;
+ }
+
+ page_ext = lookup_page_ext(page);
+
+ /*
+ * Some pages could be missed by concurrent allocation or free,
+ * because we don't hold the zone lock.
+ */
+ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
+ continue;
+
+ /* Record the next PFN to read in the file offset */
+ *ppos = (pfn - min_low_pfn) + 1;
+
+ return print_page_owner(buf, count, pfn, page, page_ext);
+ }
+
+ return 0;
+}
+
+static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
+{
+ struct page *page;
+ struct page_ext *page_ext;
+ unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
+ unsigned long end_pfn = pfn + zone->spanned_pages;
+ unsigned long count = 0;
+
+ /* Scan block by block. First and last block may be incomplete */
+ pfn = zone->zone_start_pfn;
+
+ /*
+ * Walk the zone in pageblock_nr_pages steps. If a page block spans
+ * a zone boundary, it will be double counted between zones. This does
+ * not matter as the mixed block count will still be correct
+ */
+ for (; pfn < end_pfn; ) {
+ if (!pfn_valid(pfn)) {
+ pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ continue;
+ }
+
+ block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = min(block_end_pfn, end_pfn);
+
+ page = pfn_to_page(pfn);
+
+ for (; pfn < block_end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+
+ /*
+ * We are safe to check buddy flag and order, because
+ * this is init stage and only single thread runs.
+ */
+ if (PageBuddy(page)) {
+ pfn += (1UL << page_order(page)) - 1;
+ continue;
+ }
+
+ if (PageReserved(page))
+ continue;
+
+ page_ext = lookup_page_ext(page);
+
+ /* Maybe overraping zone */
+ if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
+ continue;
+
+ /* Found early allocated page */
+ set_page_owner(page, 0, 0);
+ count++;
+ }
+ }
+
+ pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
+ pgdat->node_id, zone->name, count);
+}
+
+static void init_zones_in_node(pg_data_t *pgdat)
+{
+ struct zone *zone;
+ struct zone *node_zones = pgdat->node_zones;
+ unsigned long flags;
+
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+ if (!populated_zone(zone))
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ init_pages_in_zone(pgdat, zone);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+}
+
+static void init_early_allocated_pages(void)
+{
+ pg_data_t *pgdat;
+
+ drain_all_pages(NULL);
+ for_each_online_pgdat(pgdat)
+ init_zones_in_node(pgdat);
+}
+
+static const struct file_operations proc_page_owner_operations = {
+ .read = read_page_owner,
+};
+
+static int __init pageowner_init(void)
+{
+ struct dentry *dentry;
+
+ if (!page_owner_inited) {
+ pr_info("page_owner is disabled\n");
+ return 0;
+ }
+
+ dentry = debugfs_create_file("page_owner", S_IRUSR, NULL,
+ NULL, &proc_page_owner_operations);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ return 0;
+}
+module_init(pageowner_init)
diff --git a/mm/percpu.c b/mm/percpu.c
index 014bab65e0ff..d39e2f4e335c 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1591,7 +1591,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
if (cpu == NR_CPUS)
continue;
- PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids);
+ PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
diff --git a/mm/rmap.c b/mm/rmap.c
index 1e542744e131..c5bc241127b2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,7 +23,7 @@
* inode->i_mutex (while writing or truncating, not reading or faulting)
* mm->mmap_sem
* page->flags PG_locked (lock_page)
- * mapping->i_mmap_mutex
+ * mapping->i_mmap_rwsem
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
* zone->lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -274,6 +274,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
struct anon_vma_chain *avc;
struct anon_vma *anon_vma;
+ int error;
/* Don't bother if the parent process has no anon_vma here. */
if (!pvma->anon_vma)
@@ -283,8 +284,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
* First, attach the new VMA to the parent VMA's anon_vmas,
* so rmap can find non-COWed pages in child processes.
*/
- if (anon_vma_clone(vma, pvma))
- return -ENOMEM;
+ error = anon_vma_clone(vma, pvma);
+ if (error)
+ return error;
/* Then add our own anon_vma. */
anon_vma = anon_vma_alloc();
@@ -1052,7 +1054,7 @@ void page_add_file_rmap(struct page *page)
__inc_zone_page_state(page, NR_FILE_MAPPED);
mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
}
- mem_cgroup_end_page_stat(memcg, locked, flags);
+ mem_cgroup_end_page_stat(memcg, &locked, &flags);
}
static void page_remove_file_rmap(struct page *page)
@@ -1082,7 +1084,7 @@ static void page_remove_file_rmap(struct page *page)
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
out:
- mem_cgroup_end_page_stat(memcg, locked, flags);
+ mem_cgroup_end_page_stat(memcg, &locked, &flags);
}
/**
@@ -1259,7 +1261,7 @@ out_mlock:
/*
* We need mmap_sem locking, Otherwise VM_LOCKED check makes
* unstable result and race. Plus, We can't wait here because
- * we now hold anon_vma->rwsem or mapping->i_mmap_mutex.
+ * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem.
* if trylock failed, the page remain in evictable lru and later
* vmscan could retry to move the page to unevictable lru if the
* page is actually mlocked.
@@ -1379,7 +1381,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush(vma, address, pte);
+ pteval = ptep_clear_flush_notify(vma, address, pte);
/* If nonlinear, store the file page offset in the pte. */
if (page->index != linear_page_index(vma, address)) {
@@ -1634,7 +1636,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma;
- pgoff_t pgoff = page_to_pgoff(page);
+ pgoff_t pgoff;
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
@@ -1642,6 +1644,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
if (!anon_vma)
return ret;
+ pgoff = page_to_pgoff(page);
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
@@ -1675,7 +1678,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
{
struct address_space *mapping = page->mapping;
- pgoff_t pgoff = page_to_pgoff(page);
+ pgoff_t pgoff;
struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
@@ -1683,13 +1686,15 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
* The page lock not only makes sure that page->mapping cannot
* suddenly be NULLified by truncation, it makes sure that the
* structure at mapping cannot be freed and reused yet,
- * so we can safely take mapping->i_mmap_mutex.
+ * so we can safely take mapping->i_mmap_rwsem.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (!mapping)
return ret;
- mutex_lock(&mapping->i_mmap_mutex);
+
+ pgoff = page_to_pgoff(page);
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
@@ -1710,9 +1715,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
goto done;
ret = rwc->file_nonlinear(page, mapping, rwc->arg);
-
done:
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_read(mapping);
return ret;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 185836ba53ef..73ba1df7c8ba 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1536,7 +1536,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
* holes of a sparse file, we actually need to allocate those pages,
* and even mark them dirty, so it cannot exceed the max_blocks limit.
*/
- if (segment_eq(get_fs(), KERNEL_DS))
+ if (!iter_is_iovec(to))
sgp = SGP_DIRTY;
index = *ppos >> PAGE_CACHE_SHIFT;
diff --git a/mm/slab.c b/mm/slab.c
index eb2b2ea30130..65b5dcb6f671 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2590,7 +2590,10 @@ static int cache_grow(struct kmem_cache *cachep,
* Be lazy and only check for valid flags here, keeping it out of the
* critical path in kmem_cache_alloc().
*/
- BUG_ON(flags & GFP_SLAB_BUG_MASK);
+ if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
+ pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
+ BUG();
+ }
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
/* Take the node list lock to change the colour_next on this node */
@@ -3012,7 +3015,7 @@ retry:
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
nid = zone_to_nid(zone);
- if (cpuset_zone_allowed_hardwall(zone, flags) &&
+ if (cpuset_zone_allowed(zone, flags) &&
get_node(cache, nid) &&
get_node(cache, nid)->free_objects) {
obj = ____cache_alloc_node(cache,
@@ -3076,7 +3079,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
void *obj;
int x;
- VM_BUG_ON(nodeid > num_online_nodes());
+ VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
n = get_node(cachep, nodeid);
BUG_ON(!n);
@@ -3179,6 +3182,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
memset(ptr, 0, cachep->object_size);
}
+ memcg_kmem_put_cache(cachep);
return ptr;
}
@@ -3244,6 +3248,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
memset(objp, 0, cachep->object_size);
}
+ memcg_kmem_put_cache(cachep);
return objp;
}
@@ -3580,11 +3585,11 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
for_each_online_node(node) {
- if (use_alien_caches) {
- new_alien = alloc_alien_cache(node, cachep->limit, gfp);
- if (!new_alien)
- goto fail;
- }
+ if (use_alien_caches) {
+ new_alien = alloc_alien_cache(node, cachep->limit, gfp);
+ if (!new_alien)
+ goto fail;
+ }
new_shared = NULL;
if (cachep->shared) {
@@ -4043,12 +4048,6 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
#ifdef CONFIG_DEBUG_SLAB_LEAK
-static void *leaks_start(struct seq_file *m, loff_t *pos)
-{
- mutex_lock(&slab_mutex);
- return seq_list_start(&slab_caches, *pos);
-}
-
static inline int add_caller(unsigned long *n, unsigned long v)
{
unsigned long *p;
@@ -4170,7 +4169,7 @@ static int leaks_show(struct seq_file *m, void *p)
}
static const struct seq_operations slabstats_op = {
- .start = leaks_start,
+ .start = slab_start,
.next = slab_next,
.stop = slab_stop,
.show = leaks_show,
diff --git a/mm/slab.h b/mm/slab.h
index ab019e63e3c2..1cf4005482dd 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -209,15 +209,15 @@ cache_from_memcg_idx(struct kmem_cache *s, int idx)
rcu_read_lock();
params = rcu_dereference(s->memcg_params);
- cachep = params->memcg_caches[idx];
- rcu_read_unlock();
/*
* Make sure we will access the up-to-date value. The code updating
* memcg_caches issues a write barrier to match this (see
* memcg_register_cache()).
*/
- smp_read_barrier_depends();
+ cachep = lockless_dereference(params->memcg_caches[idx]);
+ rcu_read_unlock();
+
return cachep;
}
@@ -357,7 +357,9 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
#endif
+void *slab_start(struct seq_file *m, loff_t *pos);
void *slab_next(struct seq_file *m, void *p, loff_t *pos);
void slab_stop(struct seq_file *m, void *p);
+int memcg_slab_show(struct seq_file *m, void *p);
#endif /* MM_SLAB_H */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 406944207b61..e03dd6f2a272 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -240,7 +240,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
size = ALIGN(size, align);
flags = kmem_cache_flags(size, flags, name, NULL);
- list_for_each_entry(s, &slab_caches, list) {
+ list_for_each_entry_reverse(s, &slab_caches, list) {
if (slab_unmergeable(s))
continue;
@@ -259,6 +259,10 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
if (s->size - size >= sizeof(void *))
continue;
+ if (IS_ENABLED(CONFIG_SLAB) && align &&
+ (align > s->align || s->align % align))
+ continue;
+
return s;
}
return NULL;
@@ -807,7 +811,7 @@ EXPORT_SYMBOL(kmalloc_order_trace);
#define SLABINFO_RIGHTS S_IRUSR
#endif
-void print_slabinfo_header(struct seq_file *m)
+static void print_slabinfo_header(struct seq_file *m)
{
/*
* Output format version, so at least we can change it
@@ -830,14 +834,9 @@ void print_slabinfo_header(struct seq_file *m)
seq_putc(m, '\n');
}
-static void *s_start(struct seq_file *m, loff_t *pos)
+void *slab_start(struct seq_file *m, loff_t *pos)
{
- loff_t n = *pos;
-
mutex_lock(&slab_mutex);
- if (!n)
- print_slabinfo_header(m);
-
return seq_list_start(&slab_caches, *pos);
}
@@ -877,7 +876,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
}
}
-int cache_show(struct kmem_cache *s, struct seq_file *m)
+static void cache_show(struct kmem_cache *s, struct seq_file *m)
{
struct slabinfo sinfo;
@@ -896,17 +895,32 @@ int cache_show(struct kmem_cache *s, struct seq_file *m)
sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
slabinfo_show_stats(m, s);
seq_putc(m, '\n');
+}
+
+static int slab_show(struct seq_file *m, void *p)
+{
+ struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+
+ if (p == slab_caches.next)
+ print_slabinfo_header(m);
+ if (is_root_cache(s))
+ cache_show(s, m);
return 0;
}
-static int s_show(struct seq_file *m, void *p)
+#ifdef CONFIG_MEMCG_KMEM
+int memcg_slab_show(struct seq_file *m, void *p)
{
struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- if (!is_root_cache(s))
- return 0;
- return cache_show(s, m);
+ if (p == slab_caches.next)
+ print_slabinfo_header(m);
+ if (!is_root_cache(s) && s->memcg_params->memcg == memcg)
+ cache_show(s, m);
+ return 0;
}
+#endif
/*
* slabinfo_op - iterator that generates /proc/slabinfo
@@ -922,10 +936,10 @@ static int s_show(struct seq_file *m, void *p)
* + further values on SMP and with statistics enabled
*/
static const struct seq_operations slabinfo_op = {
- .start = s_start,
+ .start = slab_start,
.next = slab_next,
.stop = slab_stop,
- .show = s_show,
+ .show = slab_show,
};
static int slabinfo_open(struct inode *inode, struct file *file)
diff --git a/mm/slub.c b/mm/slub.c
index ae7b9f1ad394..fe376fe1f4fe 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -849,12 +849,12 @@ static int check_slab(struct kmem_cache *s, struct page *page)
maxobj = order_objects(compound_order(page), s->size, s->reserved);
if (page->objects > maxobj) {
slab_err(s, page, "objects %u > max %u",
- s->name, page->objects, maxobj);
+ page->objects, maxobj);
return 0;
}
if (page->inuse > page->objects) {
slab_err(s, page, "inuse %u > max %u",
- s->name, page->inuse, page->objects);
+ page->inuse, page->objects);
return 0;
}
/* Slab_pad_check fixes things up after itself */
@@ -871,7 +871,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
int nr = 0;
void *fp;
void *object = NULL;
- unsigned long max_objects;
+ int max_objects;
fp = page->freelist;
while (fp && nr <= page->objects) {
@@ -1233,13 +1233,17 @@ static inline void kfree_hook(const void *x)
kmemleak_free(x);
}
-static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
+static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
+ gfp_t flags)
{
flags &= gfp_allowed_mask;
lockdep_trace_alloc(flags);
might_sleep_if(flags & __GFP_WAIT);
- return should_failslab(s->object_size, flags, s->flags);
+ if (should_failslab(s->object_size, flags, s->flags))
+ return NULL;
+
+ return memcg_kmem_get_cache(s, flags);
}
static inline void slab_post_alloc_hook(struct kmem_cache *s,
@@ -1248,6 +1252,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
flags &= gfp_allowed_mask;
kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
+ memcg_kmem_put_cache(s);
}
static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -1377,7 +1382,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
int order;
int idx;
- BUG_ON(flags & GFP_SLAB_BUG_MASK);
+ if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
+ pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
+ BUG();
+ }
page = allocate_slab(s,
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
@@ -1662,7 +1670,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
n = get_node(s, zone_to_nid(zone));
- if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
+ if (n && cpuset_zone_allowed(zone, flags) &&
n->nr_partial > s->min_partial) {
object = get_partial_node(s, n, c, flags);
if (object) {
@@ -2380,10 +2388,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
struct page *page;
unsigned long tid;
- if (slab_pre_alloc_hook(s, gfpflags))
+ s = slab_pre_alloc_hook(s, gfpflags);
+ if (!s)
return NULL;
-
- s = memcg_kmem_get_cache(s, gfpflags);
redo:
/*
* Must read kmem_cache cpu data via this cpu ptr. Preemption is
@@ -2554,7 +2561,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
} else { /* Needs to be taken off a list */
- n = get_node(s, page_to_nid(page));
+ n = get_node(s, page_to_nid(page));
/*
* Speculatively acquire the list_lock.
* If the cmpxchg does not succeed then we may
@@ -2587,10 +2594,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
* The list lock was not taken therefore no list
* activity can be necessary.
*/
- if (was_frozen)
- stat(s, FREE_FROZEN);
- return;
- }
+ if (was_frozen)
+ stat(s, FREE_FROZEN);
+ return;
+ }
if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
goto slab_empty;
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
new file mode 100644
index 000000000000..b5f7f24b8dd1
--- /dev/null
+++ b/mm/swap_cgroup.c
@@ -0,0 +1,208 @@
+#include <linux/swap_cgroup.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+
+#include <linux/swapops.h> /* depends on mm.h include */
+
+static DEFINE_MUTEX(swap_cgroup_mutex);
+struct swap_cgroup_ctrl {
+ struct page **map;
+ unsigned long length;
+ spinlock_t lock;
+};
+
+static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
+
+struct swap_cgroup {
+ unsigned short id;
+};
+#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
+
+/*
+ * SwapCgroup implements "lookup" and "exchange" operations.
+ * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
+ * against SwapCache. At swap_free(), this is accessed directly from swap.
+ *
+ * This means,
+ * - we have no race in "exchange" when we're accessed via SwapCache because
+ * SwapCache(and its swp_entry) is under lock.
+ * - When called via swap_free(), there is no user of this entry and no race.
+ * Then, we don't need lock around "exchange".
+ *
+ * TODO: we can push these buffers out to HIGHMEM.
+ */
+
+/*
+ * allocate buffer for swap_cgroup.
+ */
+static int swap_cgroup_prepare(int type)
+{
+ struct page *page;
+ struct swap_cgroup_ctrl *ctrl;
+ unsigned long idx, max;
+
+ ctrl = &swap_cgroup_ctrl[type];
+
+ for (idx = 0; idx < ctrl->length; idx++) {
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ goto not_enough_page;
+ ctrl->map[idx] = page;
+ }
+ return 0;
+not_enough_page:
+ max = idx;
+ for (idx = 0; idx < max; idx++)
+ __free_page(ctrl->map[idx]);
+
+ return -ENOMEM;
+}
+
+static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
+ struct swap_cgroup_ctrl **ctrlp)
+{
+ pgoff_t offset = swp_offset(ent);
+ struct swap_cgroup_ctrl *ctrl;
+ struct page *mappage;
+ struct swap_cgroup *sc;
+
+ ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+ if (ctrlp)
+ *ctrlp = ctrl;
+
+ mappage = ctrl->map[offset / SC_PER_PAGE];
+ sc = page_address(mappage);
+ return sc + offset % SC_PER_PAGE;
+}
+
+/**
+ * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
+ * @ent: swap entry to be cmpxchged
+ * @old: old id
+ * @new: new id
+ *
+ * Returns old id at success, 0 at failure.
+ * (There is no mem_cgroup using 0 as its id)
+ */
+unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
+ unsigned short old, unsigned short new)
+{
+ struct swap_cgroup_ctrl *ctrl;
+ struct swap_cgroup *sc;
+ unsigned long flags;
+ unsigned short retval;
+
+ sc = lookup_swap_cgroup(ent, &ctrl);
+
+ spin_lock_irqsave(&ctrl->lock, flags);
+ retval = sc->id;
+ if (retval == old)
+ sc->id = new;
+ else
+ retval = 0;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+ return retval;
+}
+
+/**
+ * swap_cgroup_record - record mem_cgroup for this swp_entry.
+ * @ent: swap entry to be recorded into
+ * @id: mem_cgroup to be recorded
+ *
+ * Returns old value at success, 0 at failure.
+ * (Of course, old value can be 0.)
+ */
+unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
+{
+ struct swap_cgroup_ctrl *ctrl;
+ struct swap_cgroup *sc;
+ unsigned short old;
+ unsigned long flags;
+
+ sc = lookup_swap_cgroup(ent, &ctrl);
+
+ spin_lock_irqsave(&ctrl->lock, flags);
+ old = sc->id;
+ sc->id = id;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+
+ return old;
+}
+
+/**
+ * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
+ * @ent: swap entry to be looked up.
+ *
+ * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
+ */
+unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
+{
+ return lookup_swap_cgroup(ent, NULL)->id;
+}
+
+int swap_cgroup_swapon(int type, unsigned long max_pages)
+{
+ void *array;
+ unsigned long array_size;
+ unsigned long length;
+ struct swap_cgroup_ctrl *ctrl;
+
+ if (!do_swap_account)
+ return 0;
+
+ length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
+ array_size = length * sizeof(void *);
+
+ array = vzalloc(array_size);
+ if (!array)
+ goto nomem;
+
+ ctrl = &swap_cgroup_ctrl[type];
+ mutex_lock(&swap_cgroup_mutex);
+ ctrl->length = length;
+ ctrl->map = array;
+ spin_lock_init(&ctrl->lock);
+ if (swap_cgroup_prepare(type)) {
+ /* memory shortage */
+ ctrl->map = NULL;
+ ctrl->length = 0;
+ mutex_unlock(&swap_cgroup_mutex);
+ vfree(array);
+ goto nomem;
+ }
+ mutex_unlock(&swap_cgroup_mutex);
+
+ return 0;
+nomem:
+ printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
+ printk(KERN_INFO
+ "swap_cgroup can be disabled by swapaccount=0 boot option\n");
+ return -ENOMEM;
+}
+
+void swap_cgroup_swapoff(int type)
+{
+ struct page **map;
+ unsigned long i, length;
+ struct swap_cgroup_ctrl *ctrl;
+
+ if (!do_swap_account)
+ return;
+
+ mutex_lock(&swap_cgroup_mutex);
+ ctrl = &swap_cgroup_ctrl[type];
+ map = ctrl->map;
+ length = ctrl->length;
+ ctrl->map = NULL;
+ ctrl->length = 0;
+ mutex_unlock(&swap_cgroup_mutex);
+
+ if (map) {
+ for (i = 0; i < length; i++) {
+ struct page *page = map[i];
+ if (page)
+ __free_page(page);
+ }
+ vfree(map);
+ }
+}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 154444918685..9711342987a0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,7 +17,6 @@
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
-#include <linux/page_cgroup.h>
#include <asm/pgtable.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8798b2e0ac59..63f55ccb9b26 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -38,7 +38,7 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
-#include <linux/page_cgroup.h>
+#include <linux/swap_cgroup.h>
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
diff --git a/mm/truncate.c b/mm/truncate.c
index 261eaf6e5a19..f1e4d6052369 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -715,8 +715,9 @@ EXPORT_SYMBOL(truncate_pagecache);
* necessary) to @newsize. It will be typically be called from the filesystem's
* setattr function when ATTR_SIZE is passed in.
*
- * Must be called with inode_mutex held and before all filesystem specific
- * block truncation has been performed.
+ * Must be called with a lock serializing truncates and writes (generally
+ * i_mutex but e.g. xfs uses a different lock) and before all filesystem
+ * specific block truncation has been performed.
*/
void truncate_setsize(struct inode *inode, loff_t newsize)
{
@@ -755,7 +756,6 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
struct page *page;
pgoff_t index;
- WARN_ON(!mutex_is_locked(&inode->i_mutex));
WARN_ON(to > inode->i_size);
if (from >= to || bsize == PAGE_CACHE_SIZE)
diff --git a/mm/vmacache.c b/mm/vmacache.c
index 9f25af825dec..b6e3662fe339 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -17,6 +17,8 @@ void vmacache_flush_all(struct mm_struct *mm)
{
struct task_struct *g, *p;
+ count_vm_vmacache_event(VMACACHE_FULL_FLUSHES);
+
/*
* Single threaded tasks need not iterate the entire
* list of process. We can avoid the flushing as well
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 90520af7f186..39c338896416 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -463,8 +463,7 @@ overflow:
goto retry;
}
if (printk_ratelimit())
- printk(KERN_WARNING
- "vmap allocation for size %lu failed: "
+ pr_warn("vmap allocation for size %lu failed: "
"use vmalloc=<size> to increase size.\n", size);
kfree(va);
return ERR_PTR(-EBUSY);
@@ -2575,10 +2574,10 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
if (!counters)
return;
- /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
- smp_rmb();
if (v->flags & VM_UNINITIALIZED)
return;
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
memset(counters, 0, nr_node_ids * sizeof(unsigned int));
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index d4042e75f7c7..c5afd573d7da 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -165,6 +165,7 @@ static void vmpressure_work_fn(struct work_struct *work)
unsigned long scanned;
unsigned long reclaimed;
+ spin_lock(&vmpr->sr_lock);
/*
* Several contexts might be calling vmpressure(), so it is
* possible that the work was rescheduled again before the old
@@ -173,11 +174,12 @@ static void vmpressure_work_fn(struct work_struct *work)
* here. No need for any locks here since we don't care if
* vmpr->reclaimed is in sync.
*/
- if (!vmpr->scanned)
+ scanned = vmpr->scanned;
+ if (!scanned) {
+ spin_unlock(&vmpr->sr_lock);
return;
+ }
- spin_lock(&vmpr->sr_lock);
- scanned = vmpr->scanned;
reclaimed = vmpr->reclaimed;
vmpr->scanned = 0;
vmpr->reclaimed = 0;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dcb47074ae03..bd9a72bc4a1b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -229,9 +229,10 @@ EXPORT_SYMBOL(unregister_shrinker);
#define SHRINK_BATCH 128
-static unsigned long
-shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
- unsigned long nr_pages_scanned, unsigned long lru_pages)
+static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
+ struct shrinker *shrinker,
+ unsigned long nr_scanned,
+ unsigned long nr_eligible)
{
unsigned long freed = 0;
unsigned long long delta;
@@ -255,13 +256,12 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
total_scan = nr;
- delta = (4 * nr_pages_scanned) / shrinker->seeks;
+ delta = (4 * nr_scanned) / shrinker->seeks;
delta *= freeable;
- do_div(delta, lru_pages + 1);
+ do_div(delta, nr_eligible + 1);
total_scan += delta;
if (total_scan < 0) {
- printk(KERN_ERR
- "shrink_slab: %pF negative objects to delete nr=%ld\n",
+ pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
shrinker->scan_objects, total_scan);
total_scan = freeable;
}
@@ -290,8 +290,8 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
total_scan = freeable * 2;
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
- nr_pages_scanned, lru_pages,
- freeable, delta, total_scan);
+ nr_scanned, nr_eligible,
+ freeable, delta, total_scan);
/*
* Normally, we should not scan less than batch_size objects in one
@@ -340,34 +340,37 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
return freed;
}
-/*
- * Call the shrink functions to age shrinkable caches
- *
- * Here we assume it costs one seek to replace a lru page and that it also
- * takes a seek to recreate a cache object. With this in mind we age equal
- * percentages of the lru and ageable caches. This should balance the seeks
- * generated by these structures.
+/**
+ * shrink_node_slabs - shrink slab caches of a given node
+ * @gfp_mask: allocation context
+ * @nid: node whose slab caches to target
+ * @nr_scanned: pressure numerator
+ * @nr_eligible: pressure denominator
*
- * If the vm encountered mapped pages on the LRU it increase the pressure on
- * slab to avoid swapping.
+ * Call the shrink functions to age shrinkable caches.
*
- * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
+ * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
+ * unaware shrinkers will receive a node id of 0 instead.
*
- * `lru_pages' represents the number of on-LRU pages in all the zones which
- * are eligible for the caller's allocation attempt. It is used for balancing
- * slab reclaim versus page reclaim.
+ * @nr_scanned and @nr_eligible form a ratio that indicate how much of
+ * the available objects should be scanned. Page reclaim for example
+ * passes the number of pages scanned and the number of pages on the
+ * LRU lists that it considered on @nid, plus a bias in @nr_scanned
+ * when it encountered mapped pages. The ratio is further biased by
+ * the ->seeks setting of the shrink function, which indicates the
+ * cost to recreate an object relative to that of an LRU page.
*
- * Returns the number of slab objects which we shrunk.
+ * Returns the number of reclaimed slab objects.
*/
-unsigned long shrink_slab(struct shrink_control *shrinkctl,
- unsigned long nr_pages_scanned,
- unsigned long lru_pages)
+unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
+ unsigned long nr_scanned,
+ unsigned long nr_eligible)
{
struct shrinker *shrinker;
unsigned long freed = 0;
- if (nr_pages_scanned == 0)
- nr_pages_scanned = SWAP_CLUSTER_MAX;
+ if (nr_scanned == 0)
+ nr_scanned = SWAP_CLUSTER_MAX;
if (!down_read_trylock(&shrinker_rwsem)) {
/*
@@ -381,20 +384,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
}
list_for_each_entry(shrinker, &shrinker_list, list) {
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
- shrinkctl->nid = 0;
- freed += shrink_slab_node(shrinkctl, shrinker,
- nr_pages_scanned, lru_pages);
- continue;
- }
+ struct shrink_control sc = {
+ .gfp_mask = gfp_mask,
+ .nid = nid,
+ };
- for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
- if (node_online(shrinkctl->nid))
- freed += shrink_slab_node(shrinkctl, shrinker,
- nr_pages_scanned, lru_pages);
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+ sc.nid = 0;
- }
+ freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible);
}
+
up_read(&shrinker_rwsem);
out:
cond_resched();
@@ -875,7 +875,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* end of the LRU a second time.
*/
mapping = page_mapping(page);
- if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
+ if (((dirty || writeback) && mapping &&
+ bdi_write_congested(mapping->backing_dev_info)) ||
(writeback && PageReclaim(page)))
nr_congested++;
@@ -1876,7 +1877,8 @@ enum scan_balance {
* nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
*/
static void get_scan_count(struct lruvec *lruvec, int swappiness,
- struct scan_control *sc, unsigned long *nr)
+ struct scan_control *sc, unsigned long *nr,
+ unsigned long *lru_pages)
{
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
u64 fraction[2];
@@ -2022,6 +2024,7 @@ out:
some_scanned = false;
/* Only use force_scan on second pass. */
for (pass = 0; !some_scanned && pass < 2; pass++) {
+ *lru_pages = 0;
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
unsigned long size;
@@ -2048,14 +2051,19 @@ out:
case SCAN_FILE:
case SCAN_ANON:
/* Scan one type exclusively */
- if ((scan_balance == SCAN_FILE) != file)
+ if ((scan_balance == SCAN_FILE) != file) {
+ size = 0;
scan = 0;
+ }
break;
default:
/* Look ma, no brain */
BUG();
}
+
+ *lru_pages += size;
nr[lru] = scan;
+
/*
* Skip the second pass and don't force_scan,
* if we found something to scan.
@@ -2069,7 +2077,7 @@ out:
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
- struct scan_control *sc)
+ struct scan_control *sc, unsigned long *lru_pages)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long targets[NR_LRU_LISTS];
@@ -2080,7 +2088,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
struct blk_plug plug;
bool scan_adjusted;
- get_scan_count(lruvec, swappiness, sc, nr);
+ get_scan_count(lruvec, swappiness, sc, nr, lru_pages);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
@@ -2249,7 +2257,7 @@ static inline bool should_continue_reclaim(struct zone *zone,
return true;
/* If compaction would go ahead or the allocation would succeed, stop */
- switch (compaction_suitable(zone, sc->order)) {
+ switch (compaction_suitable(zone, sc->order, 0, 0)) {
case COMPACT_PARTIAL:
case COMPACT_CONTINUE:
return false;
@@ -2258,7 +2266,8 @@ static inline bool should_continue_reclaim(struct zone *zone,
}
}
-static bool shrink_zone(struct zone *zone, struct scan_control *sc)
+static bool shrink_zone(struct zone *zone, struct scan_control *sc,
+ bool is_classzone)
{
unsigned long nr_reclaimed, nr_scanned;
bool reclaimable = false;
@@ -2269,6 +2278,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
.zone = zone,
.priority = sc->priority,
};
+ unsigned long zone_lru_pages = 0;
struct mem_cgroup *memcg;
nr_reclaimed = sc->nr_reclaimed;
@@ -2276,13 +2286,15 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
+ unsigned long lru_pages;
struct lruvec *lruvec;
int swappiness;
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
swappiness = mem_cgroup_swappiness(memcg);
- shrink_lruvec(lruvec, swappiness, sc);
+ shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
+ zone_lru_pages += lru_pages;
/*
* Direct reclaim and kswapd have to scan all memory
@@ -2302,6 +2314,25 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
memcg = mem_cgroup_iter(root, memcg, &reclaim);
} while (memcg);
+ /*
+ * Shrink the slab caches in the same proportion that
+ * the eligible LRU pages were scanned.
+ */
+ if (global_reclaim(sc) && is_classzone) {
+ struct reclaim_state *reclaim_state;
+
+ shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone),
+ sc->nr_scanned - nr_scanned,
+ zone_lru_pages);
+
+ reclaim_state = current->reclaim_state;
+ if (reclaim_state) {
+ sc->nr_reclaimed +=
+ reclaim_state->reclaimed_slab;
+ reclaim_state->reclaimed_slab = 0;
+ }
+ }
+
vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);
@@ -2346,7 +2377,7 @@ static inline bool compaction_ready(struct zone *zone, int order)
* If compaction is not ready to start and allocation is not likely
* to succeed without it, then keep reclaiming.
*/
- if (compaction_suitable(zone, order) == COMPACT_SKIPPED)
+ if (compaction_suitable(zone, order, 0, 0) == COMPACT_SKIPPED)
return false;
return watermark_ok;
@@ -2376,12 +2407,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
- unsigned long lru_pages = 0;
- struct reclaim_state *reclaim_state = current->reclaim_state;
gfp_t orig_mask;
- struct shrink_control shrink = {
- .gfp_mask = sc->gfp_mask,
- };
enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
bool reclaimable = false;
@@ -2394,23 +2420,27 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
if (buffer_heads_over_limit)
sc->gfp_mask |= __GFP_HIGHMEM;
- nodes_clear(shrink.nodes_to_scan);
-
for_each_zone_zonelist_nodemask(zone, z, zonelist,
- gfp_zone(sc->gfp_mask), sc->nodemask) {
+ requested_highidx, sc->nodemask) {
+ enum zone_type classzone_idx;
+
if (!populated_zone(zone))
continue;
+
+ classzone_idx = requested_highidx;
+ while (!populated_zone(zone->zone_pgdat->node_zones +
+ classzone_idx))
+ classzone_idx--;
+
/*
* Take care memory controller reclaiming has small influence
* to global LRU.
*/
if (global_reclaim(sc)) {
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ if (!cpuset_zone_allowed(zone,
+ GFP_KERNEL | __GFP_HARDWALL))
continue;
- lru_pages += zone_reclaimable_pages(zone);
- node_set(zone_to_nid(zone), shrink.nodes_to_scan);
-
if (sc->priority != DEF_PRIORITY &&
!zone_reclaimable(zone))
continue; /* Let kswapd poll it */
@@ -2449,7 +2479,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
/* need some check for avoid more shrink_zone() */
}
- if (shrink_zone(zone, sc))
+ if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx))
reclaimable = true;
if (global_reclaim(sc) &&
@@ -2458,20 +2488,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
}
/*
- * Don't shrink slabs when reclaiming memory from over limit cgroups
- * but do shrink slab at least once when aborting reclaim for
- * compaction to avoid unevenly scanning file/anon LRU pages over slab
- * pages.
- */
- if (global_reclaim(sc)) {
- shrink_slab(&shrink, sc->nr_scanned, lru_pages);
- if (reclaim_state) {
- sc->nr_reclaimed += reclaim_state->reclaimed_slab;
- reclaim_state->reclaimed_slab = 0;
- }
- }
-
- /*
* Restore to original mask to avoid the impact on the caller if we
* promoted it to __GFP_HIGHMEM.
*/
@@ -2735,6 +2751,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
};
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
int swappiness = mem_cgroup_swappiness(memcg);
+ unsigned long lru_pages;
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2750,7 +2767,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
- shrink_lruvec(lruvec, swappiness, &sc);
+ shrink_lruvec(lruvec, swappiness, &sc, &lru_pages);
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
@@ -2824,8 +2841,8 @@ static bool zone_balanced(struct zone *zone, int order,
balance_gap, classzone_idx, 0))
return false;
- if (IS_ENABLED(CONFIG_COMPACTION) && order &&
- compaction_suitable(zone, order) == COMPACT_SKIPPED)
+ if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
+ order, 0, classzone_idx) == COMPACT_SKIPPED)
return false;
return true;
@@ -2931,15 +2948,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
static bool kswapd_shrink_zone(struct zone *zone,
int classzone_idx,
struct scan_control *sc,
- unsigned long lru_pages,
unsigned long *nr_attempted)
{
int testorder = sc->order;
unsigned long balance_gap;
- struct reclaim_state *reclaim_state = current->reclaim_state;
- struct shrink_control shrink = {
- .gfp_mask = sc->gfp_mask,
- };
bool lowmem_pressure;
/* Reclaim above the high watermark. */
@@ -2952,8 +2964,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
* from memory. Do not reclaim more than needed for compaction.
*/
if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
- compaction_suitable(zone, sc->order) !=
- COMPACT_SKIPPED)
+ compaction_suitable(zone, sc->order, 0, classzone_idx)
+ != COMPACT_SKIPPED)
testorder = 0;
/*
@@ -2974,13 +2986,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
balance_gap, classzone_idx))
return true;
- shrink_zone(zone, sc);
- nodes_clear(shrink.nodes_to_scan);
- node_set(zone_to_nid(zone), shrink.nodes_to_scan);
-
- reclaim_state->reclaimed_slab = 0;
- shrink_slab(&shrink, sc->nr_scanned, lru_pages);
- sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+ shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
/* Account for the number of pages attempted to reclaim */
*nr_attempted += sc->nr_to_reclaim;
@@ -3041,7 +3047,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
count_vm_event(PAGEOUTRUN);
do {
- unsigned long lru_pages = 0;
unsigned long nr_attempted = 0;
bool raise_priority = true;
bool pgdat_needs_compaction = (order > 0);
@@ -3101,8 +3106,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
if (!populated_zone(zone))
continue;
- lru_pages += zone_reclaimable_pages(zone);
-
/*
* If any zone is currently balanced then kswapd will
* not call compaction as it is expected that the
@@ -3158,8 +3161,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
* that that high watermark would be met at 100%
* efficiency.
*/
- if (kswapd_shrink_zone(zone, end_zone, &sc,
- lru_pages, &nr_attempted))
+ if (kswapd_shrink_zone(zone, end_zone,
+ &sc, &nr_attempted))
raise_priority = false;
}
@@ -3388,7 +3391,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
if (!populated_zone(zone))
return;
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
return;
pgdat = zone->zone_pgdat;
if (pgdat->kswapd_max_order < order) {
@@ -3611,10 +3614,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
.may_swap = 1,
};
- struct shrink_control shrink = {
- .gfp_mask = sc.gfp_mask,
- };
- unsigned long nr_slab_pages0, nr_slab_pages1;
cond_resched();
/*
@@ -3633,44 +3632,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* priorities until we have enough memory freed.
*/
do {
- shrink_zone(zone, &sc);
+ shrink_zone(zone, &sc, true);
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
- nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
- if (nr_slab_pages0 > zone->min_slab_pages) {
- /*
- * shrink_slab() does not currently allow us to determine how
- * many pages were freed in this zone. So we take the current
- * number of slab pages and shake the slab until it is reduced
- * by the same nr_pages that we used for reclaiming unmapped
- * pages.
- */
- nodes_clear(shrink.nodes_to_scan);
- node_set(zone_to_nid(zone), shrink.nodes_to_scan);
- for (;;) {
- unsigned long lru_pages = zone_reclaimable_pages(zone);
-
- /* No reclaimable slab or very low memory pressure */
- if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
- break;
-
- /* Freed enough memory */
- nr_slab_pages1 = zone_page_state(zone,
- NR_SLAB_RECLAIMABLE);
- if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
- break;
- }
-
- /*
- * Update nr_reclaimed by the number of slab pages we
- * reclaimed from this zone.
- */
- nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
- if (nr_slab_pages1 < nr_slab_pages0)
- sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
- }
-
p->reclaim_state = NULL;
current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
lockdep_clear_current_reclaim_state();
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1b12d390dc68..1284f89fca08 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -22,6 +22,8 @@
#include <linux/writeback.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
+#include <linux/page_ext.h>
+#include <linux/page_owner.h>
#include "internal.h"
@@ -898,6 +900,7 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_DEBUG_VM_VMACACHE
"vmacache_find_calls",
"vmacache_find_hits",
+ "vmacache_full_flushes",
#endif
#endif /* CONFIG_VM_EVENTS_COUNTERS */
};
@@ -1017,6 +1020,104 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
return 0;
}
+#ifdef CONFIG_PAGE_OWNER
+static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
+ pg_data_t *pgdat,
+ struct zone *zone)
+{
+ struct page *page;
+ struct page_ext *page_ext;
+ unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
+ unsigned long end_pfn = pfn + zone->spanned_pages;
+ unsigned long count[MIGRATE_TYPES] = { 0, };
+ int pageblock_mt, page_mt;
+ int i;
+
+ /* Scan block by block. First and last block may be incomplete */
+ pfn = zone->zone_start_pfn;
+
+ /*
+ * Walk the zone in pageblock_nr_pages steps. If a page block spans
+ * a zone boundary, it will be double counted between zones. This does
+ * not matter as the mixed block count will still be correct
+ */
+ for (; pfn < end_pfn; ) {
+ if (!pfn_valid(pfn)) {
+ pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ continue;
+ }
+
+ block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = min(block_end_pfn, end_pfn);
+
+ page = pfn_to_page(pfn);
+ pageblock_mt = get_pfnblock_migratetype(page, pfn);
+
+ for (; pfn < block_end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+ if (PageBuddy(page)) {
+ pfn += (1UL << page_order(page)) - 1;
+ continue;
+ }
+
+ if (PageReserved(page))
+ continue;
+
+ page_ext = lookup_page_ext(page);
+
+ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
+ continue;
+
+ page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
+ if (pageblock_mt != page_mt) {
+ if (is_migrate_cma(pageblock_mt))
+ count[MIGRATE_MOVABLE]++;
+ else
+ count[pageblock_mt]++;
+
+ pfn = block_end_pfn;
+ break;
+ }
+ pfn += (1UL << page_ext->order) - 1;
+ }
+ }
+
+ /* Print counts */
+ seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+ for (i = 0; i < MIGRATE_TYPES; i++)
+ seq_printf(m, "%12lu ", count[i]);
+ seq_putc(m, '\n');
+}
+#endif /* CONFIG_PAGE_OWNER */
+
+/*
+ * Print out the number of pageblocks for each migratetype that contain pages
+ * of other types. This gives an indication of how well fallbacks are being
+ * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
+ * to determine what is going on
+ */
+static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
+{
+#ifdef CONFIG_PAGE_OWNER
+ int mtype;
+
+ if (!page_owner_inited)
+ return;
+
+ drain_all_pages(NULL);
+
+ seq_printf(m, "\n%-23s", "Number of mixed blocks ");
+ for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
+ seq_printf(m, "%12s ", migratetype_names[mtype]);
+ seq_putc(m, '\n');
+
+ walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print);
+#endif /* CONFIG_PAGE_OWNER */
+}
+
/*
* This prints out statistics in relation to grouping pages by mobility.
* It is expensive to collect so do not constantly read the file.
@@ -1034,6 +1135,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
seq_putc(m, '\n');
pagetypeinfo_showfree(m, pgdat);
pagetypeinfo_showblockcount(m, pgdat);
+ pagetypeinfo_showmixedcount(m, pgdat);
return 0;
}
diff --git a/mm/zbud.c b/mm/zbud.c
index ecf1dbef6983..4e387bea702e 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -132,7 +132,7 @@ static struct zbud_ops zbud_zpool_ops = {
static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
{
- return zbud_create_pool(gfp, &zbud_zpool_ops);
+ return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
}
static void zbud_zpool_destroy(void *pool)
@@ -619,5 +619,5 @@ module_init(init_zbud);
module_exit(exit_zbud);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
+MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 839a48c3ca27..b72403927aa4 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -155,8 +155,6 @@
* (reason above)
*/
#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8)
-#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
- ZS_SIZE_CLASS_DELTA + 1)
/*
* We do not maintain any list for completely empty or full pages
@@ -171,6 +169,11 @@ enum fullness_group {
};
/*
+ * number of size_classes
+ */
+static int zs_size_classes;
+
+/*
* We assign a page to ZS_ALMOST_EMPTY fullness group when:
* n <= N / f, where
* n = number of allocated objects
@@ -214,7 +217,7 @@ struct link_free {
};
struct zs_pool {
- struct size_class size_class[ZS_SIZE_CLASSES];
+ struct size_class **size_class;
gfp_t flags; /* allocation flags used when growing pool */
atomic_long_t pages_allocated;
@@ -468,7 +471,7 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool,
if (newfg == currfg)
goto out;
- class = &pool->size_class[class_idx];
+ class = pool->size_class[class_idx];
remove_zspage(page, class, currfg);
insert_zspage(page, class, newfg);
set_zspage_mapping(page, class_idx, newfg);
@@ -629,6 +632,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
struct page *next_page;
struct link_free *link;
unsigned int i = 1;
+ void *vaddr;
/*
* page->index stores offset of first object starting
@@ -639,8 +643,8 @@ static void init_zspage(struct page *first_page, struct size_class *class)
if (page != first_page)
page->index = off;
- link = (struct link_free *)kmap_atomic(page) +
- off / sizeof(*link);
+ vaddr = kmap_atomic(page);
+ link = (struct link_free *)vaddr + off / sizeof(*link);
while ((off += class->size) < PAGE_SIZE) {
link->next = obj_location_to_handle(page, i++);
@@ -654,7 +658,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
*/
next_page = get_next_page(page);
link->next = obj_location_to_handle(next_page, 0);
- kunmap_atomic(link);
+ kunmap_atomic(vaddr);
page = next_page;
off %= PAGE_SIZE;
}
@@ -784,7 +788,7 @@ static inline int __zs_cpu_up(struct mapping_area *area)
*/
if (area->vm_buf)
return 0;
- area->vm_buf = (char *)__get_free_page(GFP_KERNEL);
+ area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
if (!area->vm_buf)
return -ENOMEM;
return 0;
@@ -792,8 +796,7 @@ static inline int __zs_cpu_up(struct mapping_area *area)
static inline void __zs_cpu_down(struct mapping_area *area)
{
- if (area->vm_buf)
- free_page((unsigned long)area->vm_buf);
+ kfree(area->vm_buf);
area->vm_buf = NULL;
}
@@ -881,13 +884,26 @@ static struct notifier_block zs_cpu_nb = {
.notifier_call = zs_cpu_notifier
};
-static void zs_exit(void)
+static int zs_register_cpu_notifier(void)
{
- int cpu;
+ int cpu, uninitialized_var(ret);
-#ifdef CONFIG_ZPOOL
- zpool_unregister_driver(&zs_zpool_driver);
-#endif
+ cpu_notifier_register_begin();
+
+ __register_cpu_notifier(&zs_cpu_nb);
+ for_each_online_cpu(cpu) {
+ ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+ if (notifier_to_errno(ret))
+ break;
+ }
+
+ cpu_notifier_register_done();
+ return notifier_to_errno(ret);
+}
+
+static void zs_unregister_cpu_notifier(void)
+{
+ int cpu;
cpu_notifier_register_begin();
@@ -898,93 +914,129 @@ static void zs_exit(void)
cpu_notifier_register_done();
}
-static int zs_init(void)
+static void init_zs_size_classes(void)
{
- int cpu, ret;
+ int nr;
- cpu_notifier_register_begin();
+ nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
+ if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
+ nr += 1;
- __register_cpu_notifier(&zs_cpu_nb);
- for_each_online_cpu(cpu) {
- ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
- if (notifier_to_errno(ret)) {
- cpu_notifier_register_done();
- goto fail;
- }
- }
+ zs_size_classes = nr;
+}
- cpu_notifier_register_done();
+static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
+{
+ return pages_per_zspage * PAGE_SIZE / size;
+}
-#ifdef CONFIG_ZPOOL
- zpool_register_driver(&zs_zpool_driver);
-#endif
+static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
+{
+ if (prev->pages_per_zspage != pages_per_zspage)
+ return false;
- return 0;
-fail:
- zs_exit();
- return notifier_to_errno(ret);
+ if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
+ != get_maxobj_per_zspage(size, pages_per_zspage))
+ return false;
+
+ return true;
}
+unsigned long zs_get_total_pages(struct zs_pool *pool)
+{
+ return atomic_long_read(&pool->pages_allocated);
+}
+EXPORT_SYMBOL_GPL(zs_get_total_pages);
+
/**
- * zs_create_pool - Creates an allocation pool to work from.
- * @flags: allocation flags used to allocate pool metadata
+ * zs_map_object - get address of allocated object from handle.
+ * @pool: pool from which the object was allocated
+ * @handle: handle returned from zs_malloc
*
- * This function must be called before anything when using
- * the zsmalloc allocator.
+ * Before using an object allocated from zs_malloc, it must be mapped using
+ * this function. When done with the object, it must be unmapped using
+ * zs_unmap_object.
*
- * On success, a pointer to the newly created pool is returned,
- * otherwise NULL.
+ * Only one object can be mapped per cpu at a time. There is no protection
+ * against nested mappings.
+ *
+ * This function returns with preemption and page faults disabled.
*/
-struct zs_pool *zs_create_pool(gfp_t flags)
+void *zs_map_object(struct zs_pool *pool, unsigned long handle,
+ enum zs_mapmode mm)
{
- int i, ovhd_size;
- struct zs_pool *pool;
+ struct page *page;
+ unsigned long obj_idx, off;
- ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
- pool = kzalloc(ovhd_size, GFP_KERNEL);
- if (!pool)
- return NULL;
+ unsigned int class_idx;
+ enum fullness_group fg;
+ struct size_class *class;
+ struct mapping_area *area;
+ struct page *pages[2];
- for (i = 0; i < ZS_SIZE_CLASSES; i++) {
- int size;
- struct size_class *class;
+ BUG_ON(!handle);
- size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
- if (size > ZS_MAX_ALLOC_SIZE)
- size = ZS_MAX_ALLOC_SIZE;
+ /*
+ * Because we use per-cpu mapping areas shared among the
+ * pools/users, we can't allow mapping in interrupt context
+ * because it can corrupt another users mappings.
+ */
+ BUG_ON(in_interrupt());
- class = &pool->size_class[i];
- class->size = size;
- class->index = i;
- spin_lock_init(&class->lock);
- class->pages_per_zspage = get_pages_per_zspage(size);
+ obj_handle_to_location(handle, &page, &obj_idx);
+ get_zspage_mapping(get_first_page(page), &class_idx, &fg);
+ class = pool->size_class[class_idx];
+ off = obj_idx_to_offset(page, obj_idx, class->size);
+ area = &get_cpu_var(zs_map_area);
+ area->vm_mm = mm;
+ if (off + class->size <= PAGE_SIZE) {
+ /* this object is contained entirely within a page */
+ area->vm_addr = kmap_atomic(page);
+ return area->vm_addr + off;
}
- pool->flags = flags;
+ /* this object spans two pages */
+ pages[0] = page;
+ pages[1] = get_next_page(page);
+ BUG_ON(!pages[1]);
- return pool;
+ return __zs_map_object(area, pages, off, class->size);
}
-EXPORT_SYMBOL_GPL(zs_create_pool);
+EXPORT_SYMBOL_GPL(zs_map_object);
-void zs_destroy_pool(struct zs_pool *pool)
+void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
- int i;
+ struct page *page;
+ unsigned long obj_idx, off;
- for (i = 0; i < ZS_SIZE_CLASSES; i++) {
- int fg;
- struct size_class *class = &pool->size_class[i];
+ unsigned int class_idx;
+ enum fullness_group fg;
+ struct size_class *class;
+ struct mapping_area *area;
- for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
- if (class->fullness_list[fg]) {
- pr_info("Freeing non-empty class with size %db, fullness group %d\n",
- class->size, fg);
- }
- }
+ BUG_ON(!handle);
+
+ obj_handle_to_location(handle, &page, &obj_idx);
+ get_zspage_mapping(get_first_page(page), &class_idx, &fg);
+ class = pool->size_class[class_idx];
+ off = obj_idx_to_offset(page, obj_idx, class->size);
+
+ area = this_cpu_ptr(&zs_map_area);
+ if (off + class->size <= PAGE_SIZE)
+ kunmap_atomic(area->vm_addr);
+ else {
+ struct page *pages[2];
+
+ pages[0] = page;
+ pages[1] = get_next_page(page);
+ BUG_ON(!pages[1]);
+
+ __zs_unmap_object(area, pages, off, class->size);
}
- kfree(pool);
+ put_cpu_var(zs_map_area);
}
-EXPORT_SYMBOL_GPL(zs_destroy_pool);
+EXPORT_SYMBOL_GPL(zs_unmap_object);
/**
* zs_malloc - Allocate block of given size from pool.
@@ -999,8 +1051,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
{
unsigned long obj;
struct link_free *link;
- int class_idx;
struct size_class *class;
+ void *vaddr;
struct page *first_page, *m_page;
unsigned long m_objidx, m_offset;
@@ -1008,9 +1060,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
return 0;
- class_idx = get_size_class_index(size);
- class = &pool->size_class[class_idx];
- BUG_ON(class_idx != class->index);
+ class = pool->size_class[get_size_class_index(size)];
spin_lock(&class->lock);
first_page = find_get_zspage(class);
@@ -1031,11 +1081,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
obj_handle_to_location(obj, &m_page, &m_objidx);
m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
- link = (struct link_free *)kmap_atomic(m_page) +
- m_offset / sizeof(*link);
+ vaddr = kmap_atomic(m_page);
+ link = (struct link_free *)vaddr + m_offset / sizeof(*link);
first_page->freelist = link->next;
memset(link, POISON_INUSE, sizeof(*link));
- kunmap_atomic(link);
+ kunmap_atomic(vaddr);
first_page->inuse++;
/* Now move the zspage to another fullness group, if required */
@@ -1051,6 +1101,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
struct link_free *link;
struct page *first_page, *f_page;
unsigned long f_objidx, f_offset;
+ void *vaddr;
int class_idx;
struct size_class *class;
@@ -1063,16 +1114,16 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
first_page = get_first_page(f_page);
get_zspage_mapping(first_page, &class_idx, &fullness);
- class = &pool->size_class[class_idx];
+ class = pool->size_class[class_idx];
f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
spin_lock(&class->lock);
/* Insert this object in containing zspage's freelist */
- link = (struct link_free *)((unsigned char *)kmap_atomic(f_page)
- + f_offset);
+ vaddr = kmap_atomic(f_page);
+ link = (struct link_free *)(vaddr + f_offset);
link->next = first_page->freelist;
- kunmap_atomic(link);
+ kunmap_atomic(vaddr);
first_page->freelist = (void *)obj;
first_page->inuse--;
@@ -1088,100 +1139,137 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
EXPORT_SYMBOL_GPL(zs_free);
/**
- * zs_map_object - get address of allocated object from handle.
- * @pool: pool from which the object was allocated
- * @handle: handle returned from zs_malloc
- *
- * Before using an object allocated from zs_malloc, it must be mapped using
- * this function. When done with the object, it must be unmapped using
- * zs_unmap_object.
+ * zs_create_pool - Creates an allocation pool to work from.
+ * @flags: allocation flags used to allocate pool metadata
*
- * Only one object can be mapped per cpu at a time. There is no protection
- * against nested mappings.
+ * This function must be called before anything when using
+ * the zsmalloc allocator.
*
- * This function returns with preemption and page faults disabled.
+ * On success, a pointer to the newly created pool is returned,
+ * otherwise NULL.
*/
-void *zs_map_object(struct zs_pool *pool, unsigned long handle,
- enum zs_mapmode mm)
+struct zs_pool *zs_create_pool(gfp_t flags)
{
- struct page *page;
- unsigned long obj_idx, off;
+ int i;
+ struct zs_pool *pool;
+ struct size_class *prev_class = NULL;
- unsigned int class_idx;
- enum fullness_group fg;
- struct size_class *class;
- struct mapping_area *area;
- struct page *pages[2];
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool)
+ return NULL;
- BUG_ON(!handle);
+ pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
+ GFP_KERNEL);
+ if (!pool->size_class) {
+ kfree(pool);
+ return NULL;
+ }
/*
- * Because we use per-cpu mapping areas shared among the
- * pools/users, we can't allow mapping in interrupt context
- * because it can corrupt another users mappings.
+ * Iterate reversly, because, size of size_class that we want to use
+ * for merging should be larger or equal to current size.
*/
- BUG_ON(in_interrupt());
+ for (i = zs_size_classes - 1; i >= 0; i--) {
+ int size;
+ int pages_per_zspage;
+ struct size_class *class;
- obj_handle_to_location(handle, &page, &obj_idx);
- get_zspage_mapping(get_first_page(page), &class_idx, &fg);
- class = &pool->size_class[class_idx];
- off = obj_idx_to_offset(page, obj_idx, class->size);
+ size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
+ if (size > ZS_MAX_ALLOC_SIZE)
+ size = ZS_MAX_ALLOC_SIZE;
+ pages_per_zspage = get_pages_per_zspage(size);
- area = &get_cpu_var(zs_map_area);
- area->vm_mm = mm;
- if (off + class->size <= PAGE_SIZE) {
- /* this object is contained entirely within a page */
- area->vm_addr = kmap_atomic(page);
- return area->vm_addr + off;
+ /*
+ * size_class is used for normal zsmalloc operation such
+ * as alloc/free for that size. Although it is natural that we
+ * have one size_class for each size, there is a chance that we
+ * can get more memory utilization if we use one size_class for
+ * many different sizes whose size_class have same
+ * characteristics. So, we makes size_class point to
+ * previous size_class if possible.
+ */
+ if (prev_class) {
+ if (can_merge(prev_class, size, pages_per_zspage)) {
+ pool->size_class[i] = prev_class;
+ continue;
+ }
+ }
+
+ class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
+ if (!class)
+ goto err;
+
+ class->size = size;
+ class->index = i;
+ class->pages_per_zspage = pages_per_zspage;
+ spin_lock_init(&class->lock);
+ pool->size_class[i] = class;
+
+ prev_class = class;
}
- /* this object spans two pages */
- pages[0] = page;
- pages[1] = get_next_page(page);
- BUG_ON(!pages[1]);
+ pool->flags = flags;
- return __zs_map_object(area, pages, off, class->size);
+ return pool;
+
+err:
+ zs_destroy_pool(pool);
+ return NULL;
}
-EXPORT_SYMBOL_GPL(zs_map_object);
+EXPORT_SYMBOL_GPL(zs_create_pool);
-void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
+void zs_destroy_pool(struct zs_pool *pool)
{
- struct page *page;
- unsigned long obj_idx, off;
+ int i;
- unsigned int class_idx;
- enum fullness_group fg;
- struct size_class *class;
- struct mapping_area *area;
+ for (i = 0; i < zs_size_classes; i++) {
+ int fg;
+ struct size_class *class = pool->size_class[i];
- BUG_ON(!handle);
+ if (!class)
+ continue;
- obj_handle_to_location(handle, &page, &obj_idx);
- get_zspage_mapping(get_first_page(page), &class_idx, &fg);
- class = &pool->size_class[class_idx];
- off = obj_idx_to_offset(page, obj_idx, class->size);
+ if (class->index != i)
+ continue;
- area = this_cpu_ptr(&zs_map_area);
- if (off + class->size <= PAGE_SIZE)
- kunmap_atomic(area->vm_addr);
- else {
- struct page *pages[2];
+ for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
+ if (class->fullness_list[fg]) {
+ pr_info("Freeing non-empty class with size %db, fullness group %d\n",
+ class->size, fg);
+ }
+ }
+ kfree(class);
+ }
- pages[0] = page;
- pages[1] = get_next_page(page);
- BUG_ON(!pages[1]);
+ kfree(pool->size_class);
+ kfree(pool);
+}
+EXPORT_SYMBOL_GPL(zs_destroy_pool);
- __zs_unmap_object(area, pages, off, class->size);
+static int __init zs_init(void)
+{
+ int ret = zs_register_cpu_notifier();
+
+ if (ret) {
+ zs_unregister_cpu_notifier();
+ return ret;
}
- put_cpu_var(zs_map_area);
+
+ init_zs_size_classes();
+
+#ifdef CONFIG_ZPOOL
+ zpool_register_driver(&zs_zpool_driver);
+#endif
+ return 0;
}
-EXPORT_SYMBOL_GPL(zs_unmap_object);
-unsigned long zs_get_total_pages(struct zs_pool *pool)
+static void __exit zs_exit(void)
{
- return atomic_long_read(&pool->pages_allocated);
+#ifdef CONFIG_ZPOOL
+ zpool_unregister_driver(&zs_zpool_driver);
+#endif
+ zs_unregister_cpu_notifier();
}
-EXPORT_SYMBOL_GPL(zs_get_total_pages);
module_init(zs_init);
module_exit(zs_exit);
diff --git a/mm/zswap.c b/mm/zswap.c
index ea064c1a09ba..0cfce9bc51e4 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -149,11 +149,10 @@ static int __init zswap_comp_init(void)
return 0;
}
-static void zswap_comp_exit(void)
+static void __init zswap_comp_exit(void)
{
/* free percpu transforms */
- if (zswap_comp_pcpu_tfms)
- free_percpu(zswap_comp_pcpu_tfms);
+ free_percpu(zswap_comp_pcpu_tfms);
}
/*********************************
@@ -206,7 +205,7 @@ static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
**********************************/
static struct kmem_cache *zswap_entry_cache;
-static int zswap_entry_cache_create(void)
+static int __init zswap_entry_cache_create(void)
{
zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
return zswap_entry_cache == NULL;
@@ -389,7 +388,7 @@ static struct notifier_block zswap_cpu_notifier_block = {
.notifier_call = zswap_cpu_notifier
};
-static int zswap_cpu_init(void)
+static int __init zswap_cpu_init(void)
{
unsigned long cpu;
@@ -951,5 +950,5 @@ error:
late_initcall(init_zswap);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
+MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
MODULE_DESCRIPTION("Compressed cache for swap pages");