Diffstat (limited to 'mm')
60 files changed, 3557 insertions, 3410 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 4b2443254de2..56badfc4810a 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -1,8 +1,18 @@ +config PAGE_EXTENSION + bool "Extend memmap on extra space for more information on page" + ---help--- + Extend memmap on extra space for more information on page. This + could be used for debugging features that need to insert extra + field for every page. This extension enables us to save memory + by not allocating this extra memory according to boottime + configuration. + config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC depends on !KMEMCHECK + select PAGE_EXTENSION select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- diff --git a/mm/Makefile b/mm/Makefile index 8405eb0023a9..4bf586e66378 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -55,12 +55,15 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o -obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o +obj-$(CONFIG_PAGE_COUNTER) += page_counter.o +obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o +obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o +obj-$(CONFIG_PAGE_OWNER) += page_owner.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_ZPOOL) += zpool.o @@ -69,3 +72,4 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o +obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o diff --git a/mm/bootmem.c b/mm/bootmem.c index 8a000cebb0d7..477be696511d 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -243,13 +243,10 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) static int reset_managed_pages_done __initdata; -static inline void __init reset_node_managed_pages(pg_data_t *pgdat) +void reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; - if (reset_managed_pages_done) - return; - for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) z->managed_pages = 0; } @@ -258,8 +255,12 @@ void __init reset_all_zones_managed_pages(void) { struct pglist_data *pgdat; + if (reset_managed_pages_done) + return; + for_each_online_pgdat(pgdat) reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } @@ -33,6 +33,7 @@ #include <linux/log2.h> #include <linux/cma.h> #include <linux/highmem.h> +#include <linux/io.h> struct cma { unsigned long base_pfn; @@ -63,6 +64,17 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) return (1UL << (align_order - cma->order_per_bit)) - 1; } +static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) +{ + unsigned int alignment; + + if (align_order <= cma->order_per_bit) + return 0; + alignment = 1UL << (align_order - cma->order_per_bit); + return ALIGN(cma->base_pfn, alignment) - + (cma->base_pfn >> cma->order_per_bit); +} + static unsigned long cma_bitmap_maxno(struct cma *cma) { return cma->count >> cma->order_per_bit; @@ -124,6 +136,7 @@ static int __init 
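Each bit in a CMA area's bitmap stands for 2^order_per_bit pages, so the number of bitmap bits and the alignment mask for an allocation request are both expressed in that granularity; the new cma_bitmap_aligned_offset() additionally lets the search align relative to the area's physical base pfn rather than to bit 0. A minimal userspace sketch of the mask/maxno arithmetic with made-up numbers; struct cma_model and its fields are illustrative stand-ins that mirror only the two members involved, not the kernel structure.

#include <stdio.h>

struct cma_model {
        unsigned long count;            /* size of the area in pages */
        unsigned int order_per_bit;     /* pages represented by one bitmap bit */
};

static unsigned long bitmap_maxno(struct cma_model *cma)
{
        return cma->count >> cma->order_per_bit;
}

static unsigned long bitmap_aligned_mask(struct cma_model *cma, int align_order)
{
        if (align_order <= (int)cma->order_per_bit)
                return 0;
        return (1UL << (align_order - cma->order_per_bit)) - 1;
}

int main(void)
{
        struct cma_model cma = { .count = 4096, .order_per_bit = 2 };

        printf("bitmap bits: %lu\n", bitmap_maxno(&cma));               /* 1024 */
        printf("mask for an order-9 request: %lu\n",
               bitmap_aligned_mask(&cma, 9));                           /* 127 */
        return 0;
}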
cma_activate_area(struct cma *cma) err: kfree(cma->bitmap); + cma->count = 0; return -EINVAL; } @@ -214,12 +227,23 @@ int __init cma_declare_contiguous(phys_addr_t base, bool fixed, struct cma **res_cma) { phys_addr_t memblock_end = memblock_end_of_DRAM(); - phys_addr_t highmem_start = __pa(high_memory); + phys_addr_t highmem_start; int ret = 0; - pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", - __func__, (unsigned long)size, (unsigned long)base, - (unsigned long)limit, (unsigned long)alignment); +#ifdef CONFIG_X86 + /* + * high_memory isn't direct mapped memory so retrieving its physical + * address isn't appropriate. But it would be useful to check the + * physical address of the highmem boundary so it's justfiable to get + * the physical address from it. On x86 there is a validation check for + * this case, so the following workaround is needed to avoid it. + */ + highmem_start = __pa_nodebug(high_memory); +#else + highmem_start = __pa(high_memory); +#endif + pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", + __func__, &size, &base, &limit, &alignment); if (cma_area_count == ARRAY_SIZE(cma_areas)) { pr_err("Not enough slots for CMA reserved regions!\n"); @@ -244,52 +268,78 @@ int __init cma_declare_contiguous(phys_addr_t base, size = ALIGN(size, alignment); limit &= ~(alignment - 1); + if (!base) + fixed = false; + /* size should be aligned with order_per_bit */ if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) return -EINVAL; /* - * adjust limit to avoid crossing low/high memory boundary for - * automatically allocated regions + * If allocating at a fixed base the request region must not cross the + * low/high memory boundary. */ - if (((limit == 0 || limit > memblock_end) && - (memblock_end - size < highmem_start && - memblock_end > highmem_start)) || - (!fixed && limit > highmem_start && limit - size < highmem_start)) { - limit = highmem_start; - } - - if (fixed && base < highmem_start && base+size > highmem_start) { + if (fixed && base < highmem_start && base + size > highmem_start) { ret = -EINVAL; - pr_err("Region at %08lx defined on low/high memory boundary (%08lx)\n", - (unsigned long)base, (unsigned long)highmem_start); + pr_err("Region at %pa defined on low/high memory boundary (%pa)\n", + &base, &highmem_start); goto err; } + /* + * If the limit is unspecified or above the memblock end, its effective + * value will be the memblock end. Set it explicitly to simplify further + * checks. + */ + if (limit == 0 || limit > memblock_end) + limit = memblock_end; + /* Reserve memory */ - if (base && fixed) { + if (fixed) { if (memblock_is_region_reserved(base, size) || memblock_reserve(base, size) < 0) { ret = -EBUSY; goto err; } } else { - phys_addr_t addr = memblock_alloc_range(size, alignment, base, - limit); + phys_addr_t addr = 0; + + /* + * All pages in the reserved area must come from the same zone. + * If the requested region crosses the low/high memory boundary, + * try allocating from high memory first and fall back to low + * memory in case of failure. 
+ */ + if (base < highmem_start && limit > highmem_start) { + addr = memblock_alloc_range(size, alignment, + highmem_start, limit); + limit = highmem_start; + } + if (!addr) { - ret = -ENOMEM; - goto err; - } else { - base = addr; + addr = memblock_alloc_range(size, alignment, base, + limit); + if (!addr) { + ret = -ENOMEM; + goto err; + } } + + /* + * kmemleak scans/reads tracked objects for pointers to other + * objects but this address isn't mapped and accessible + */ + kmemleak_ignore(phys_to_virt(addr)); + base = addr; } ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma); if (ret) goto err; - pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, - (unsigned long)base); + totalcma_pages += (size / PAGE_SIZE); + pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, + &base); return 0; err: @@ -308,7 +358,7 @@ err: */ struct page *cma_alloc(struct cma *cma, int count, unsigned int align) { - unsigned long mask, pfn, start = 0; + unsigned long mask, offset, pfn, start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; struct page *page = NULL; int ret; @@ -323,13 +373,15 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align) return NULL; mask = cma_bitmap_aligned_mask(cma, align); + offset = cma_bitmap_aligned_offset(cma, align); bitmap_maxno = cma_bitmap_maxno(cma); bitmap_count = cma_bitmap_pages_to_bits(cma, count); for (;;) { mutex_lock(&cma->lock); - bitmap_no = bitmap_find_next_zero_area(cma->bitmap, - bitmap_maxno, start, bitmap_count, mask); + bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap, + bitmap_maxno, start, bitmap_count, mask, + offset); if (bitmap_no >= bitmap_maxno) { mutex_unlock(&cma->lock); break; diff --git a/mm/compaction.c b/mm/compaction.c index ec74cf0123ef..546e571e9d60 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -41,15 +41,17 @@ static inline void count_compact_events(enum vm_event_item item, long delta) static unsigned long release_freepages(struct list_head *freelist) { struct page *page, *next; - unsigned long count = 0; + unsigned long high_pfn = 0; list_for_each_entry_safe(page, next, freelist, lru) { + unsigned long pfn = page_to_pfn(page); list_del(&page->lru); __free_page(page); - count++; + if (pfn > high_pfn) + high_pfn = pfn; } - return count; + return high_pfn; } static void map_pages(struct list_head *list) @@ -195,16 +197,12 @@ static void update_pageblock_skip(struct compact_control *cc, /* Update where async and sync compaction should restart */ if (migrate_scanner) { - if (cc->finished_update_migrate) - return; if (pfn > zone->compact_cached_migrate_pfn[0]) zone->compact_cached_migrate_pfn[0] = pfn; if (cc->mode != MIGRATE_ASYNC && pfn > zone->compact_cached_migrate_pfn[1]) zone->compact_cached_migrate_pfn[1] = pfn; } else { - if (cc->finished_update_free) - return; if (pfn < zone->compact_cached_free_pfn) zone->compact_cached_free_pfn = pfn; } @@ -479,6 +477,16 @@ isolate_freepages_range(struct compact_control *cc, block_end_pfn = min(block_end_pfn, end_pfn); + /* + * pfn could pass the block_end_pfn if isolated freepage + * is more than pageblock order. In this case, we adjust + * scanning range to right one. 
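With the offset in hand, cma_alloc() switches from bitmap_find_next_zero_area() to bitmap_find_next_zero_area_off(), which aligns each candidate bit position relative to that offset before testing whether the whole run is free. Below is a stripped-down userspace model of that search; it uses one byte per bit for readability, and find_zero_area_off() with its parameters is an illustrative stand-in, not the kernel API.

#include <stdio.h>
#include <stddef.h>

#define ALIGN_MASK(x, mask)     (((x) + (mask)) & ~(mask))

/* one byte per bit, purely for readability */
static size_t find_zero_area_off(const unsigned char *bitmap, size_t size,
                                 size_t start, size_t nr,
                                 size_t align_mask, size_t align_offset)
{
        size_t index = start;

        while (index + nr <= size) {
                size_t i;

                /* align the candidate relative to align_offset, as the kernel does */
                index = ALIGN_MASK(index + align_offset, align_mask) - align_offset;
                if (index + nr > size)
                        break;
                for (i = 0; i < nr; i++)
                        if (bitmap[index + i])
                                break;
                if (i == nr)
                        return index;           /* found a free, aligned run */
                index = index + i + 1;          /* restart just past the busy bit */
        }
        return size;                            /* no fit */
}

int main(void)
{
        unsigned char map[32] = { 0 };

        map[4] = 1;     /* pretend bit 4 is already allocated */
        /* want 4 free bits, aligned to 4, where bit 0 sits at "physical" unit 2 */
        printf("allocation starts at bit %zu\n",
               find_zero_area_off(map, 32, 0, 4, 3, 2));
        return 0;
}

With align_mask 3 (alignment 4) and align_offset 2, the run at bit 2 is rejected because bit 4 is busy and the search lands on bit 6, whose "physical" position 6 + 2 is again a multiple of 4.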
+ */ + if (pfn >= block_end_pfn) { + block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = min(block_end_pfn, end_pfn); + } + if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) break; @@ -705,7 +713,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, del_page_from_lru_list(page, lruvec, page_lru(page)); isolate_success: - cc->finished_update_migrate = true; list_add(&page->lru, migratelist); cc->nr_migratepages++; nr_isolated++; @@ -879,15 +886,6 @@ static void isolate_freepages(struct compact_control *cc) block_start_pfn - pageblock_nr_pages; /* - * Set a flag that we successfully isolated in this pageblock. - * In the next loop iteration, zone->compact_cached_free_pfn - * will not be updated and thus it will effectively contain the - * highest pageblock we isolated pages from. - */ - if (isolated) - cc->finished_update_free = true; - - /* * isolate_freepages_block() might have aborted due to async * compaction being contended */ @@ -1029,8 +1027,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } acct_isolated(zone, cc); - /* Record where migration scanner will be restarted */ - cc->migrate_pfn = low_pfn; + /* + * Record where migration scanner will be restarted. If we end up in + * the same pageblock as the free scanner, make the scanners fully + * meet so that compact_finished() terminates compaction. + */ + cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn; return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; } @@ -1072,9 +1074,9 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, /* Compaction run is not finished if the watermark is not met */ watermark = low_wmark_pages(zone); - watermark += (1 << cc->order); - if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) + if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, + cc->alloc_flags)) return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ @@ -1100,7 +1102,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, * COMPACT_PARTIAL - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ -unsigned long compaction_suitable(struct zone *zone, int order) +unsigned long compaction_suitable(struct zone *zone, int order, + int alloc_flags, int classzone_idx) { int fragindex; unsigned long watermark; @@ -1112,21 +1115,30 @@ unsigned long compaction_suitable(struct zone *zone, int order) if (order == -1) return COMPACT_CONTINUE; + watermark = low_wmark_pages(zone); + /* + * If watermarks for high-order allocation are already met, there + * should be no need for compaction at all. + */ + if (zone_watermark_ok(zone, order, watermark, classzone_idx, + alloc_flags)) + return COMPACT_PARTIAL; + /* * Watermarks for order-0 must be met for compaction. Note the 2UL. 
* This is because during migration, copies of pages need to be * allocated and for a short time, the footprint is higher */ - watermark = low_wmark_pages(zone) + (2UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + watermark += (2UL << order); + if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags)) return COMPACT_SKIPPED; /* * fragmentation index determines if allocation failures are due to * low memory or external fragmentation * - * index of -1000 implies allocations might succeed depending on - * watermarks + * index of -1000 would imply allocations might succeed depending on + * watermarks, but we already failed the high-order watermark check * index towards 0 implies failure is due to lack of memory * index towards 1000 implies failure is due to fragmentation * @@ -1136,10 +1148,6 @@ unsigned long compaction_suitable(struct zone *zone, int order) if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) return COMPACT_SKIPPED; - if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark, - 0, 0)) - return COMPACT_PARTIAL; - return COMPACT_CONTINUE; } @@ -1150,8 +1158,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) unsigned long end_pfn = zone_end_pfn(zone); const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); const bool sync = cc->mode != MIGRATE_ASYNC; + unsigned long last_migrated_pfn = 0; - ret = compaction_suitable(zone, cc->order); + ret = compaction_suitable(zone, cc->order, cc->alloc_flags, + cc->classzone_idx); switch (ret) { case COMPACT_PARTIAL: case COMPACT_SKIPPED: @@ -1194,6 +1204,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) while ((ret = compact_finished(zone, cc, migratetype)) == COMPACT_CONTINUE) { int err; + unsigned long isolate_start_pfn = cc->migrate_pfn; switch (isolate_migratepages(zone, cc)) { case ISOLATE_ABORT: @@ -1202,7 +1213,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) cc->nr_migratepages = 0; goto out; case ISOLATE_NONE: - continue; + /* + * We haven't isolated and migrated anything, but + * there might still be unflushed migrations from + * previous cc->order aligned block. + */ + goto check_drain; case ISOLATE_SUCCESS: ; } @@ -1227,12 +1243,61 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) goto out; } } + + /* + * Record where we could have freed pages by migration and not + * yet flushed them to buddy allocator. We use the pfn that + * isolate_migratepages() started from in this loop iteration + * - this is the lowest page that could have been isolated and + * then freed by migration. + */ + if (!last_migrated_pfn) + last_migrated_pfn = isolate_start_pfn; + +check_drain: + /* + * Has the migration scanner moved away from the previous + * cc->order aligned block where we migrated from? If yes, + * flush the pages that were freed, so that they can merge and + * compact_finished() can detect immediately if allocation + * would succeed. 
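So the reworked compaction_suitable() decides in three steps: return COMPACT_PARTIAL when the order-N watermark is already met (compaction has nothing left to do), COMPACT_SKIPPED when there is not even order-0 headroom for the roughly 2 << order pages that migration temporarily needs, and COMPACT_SKIPPED again when the fragmentation index says the failure is lack of memory rather than fragmentation. The toy below models only that decision chain; zone_model, watermark_ok() and the enum names are crude stand-ins for the real zone state, zone_watermark_ok() and the kernel's compaction return codes.

#include <stdio.h>
#include <stdbool.h>

struct zone_model {
        unsigned long free_pages;       /* order-0 free pages */
        unsigned long low_wmark;        /* low watermark in pages */
        int largest_free_order;         /* largest order with a free block */
        int fragindex;                  /* stand-in for fragmentation_index() */
};

enum { MODEL_SKIPPED, MODEL_CONTINUE, MODEL_PARTIAL };

/* crude stand-in for zone_watermark_ok() */
static bool watermark_ok(struct zone_model *z, unsigned int order,
                         unsigned long mark)
{
        return z->free_pages >= mark && z->largest_free_order >= (int)order;
}

static int compaction_suitable_model(struct zone_model *z, int order,
                                     int extfrag_threshold)
{
        unsigned long watermark = z->low_wmark;

        if (order == -1)                        /* "compact everything" request */
                return MODEL_CONTINUE;

        /* allocation would already succeed: nothing for compaction to do */
        if (watermark_ok(z, order, watermark))
                return MODEL_PARTIAL;

        /* migration needs roughly 2 << order spare order-0 pages for copies */
        watermark += 2UL << order;
        if (!watermark_ok(z, 0, watermark))
                return MODEL_SKIPPED;

        /* a low fragindex means failure is lack of memory, not fragmentation */
        if (z->fragindex >= 0 && z->fragindex <= extfrag_threshold)
                return MODEL_SKIPPED;

        return MODEL_CONTINUE;
}

int main(void)
{
        static const char * const names[] = { "skipped", "continue", "partial" };
        struct zone_model z = {
                .free_pages = 600, .low_wmark = 512,
                .largest_free_order = 1, .fragindex = 800,
        };

        printf("order-3 request -> %s\n",
               names[compaction_suitable_model(&z, 3, 500)]);
        return 0;
}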
+ */ + if (cc->order > 0 && last_migrated_pfn) { + int cpu; + unsigned long current_block_start = + cc->migrate_pfn & ~((1UL << cc->order) - 1); + + if (last_migrated_pfn < current_block_start) { + cpu = get_cpu(); + lru_add_drain_cpu(cpu); + drain_local_pages(zone); + put_cpu(); + /* No more flushing until we migrate again */ + last_migrated_pfn = 0; + } + } + } out: - /* Release free pages and check accounting */ - cc->nr_freepages -= release_freepages(&cc->freepages); - VM_BUG_ON(cc->nr_freepages != 0); + /* + * Release free pages and update where the free scanner should restart, + * so we don't leave any returned pages behind in the next attempt. + */ + if (cc->nr_freepages > 0) { + unsigned long free_pfn = release_freepages(&cc->freepages); + + cc->nr_freepages = 0; + VM_BUG_ON(free_pfn == 0); + /* The cached pfn is always the first in a pageblock */ + free_pfn &= ~(pageblock_nr_pages-1); + /* + * Only go back, not forward. The cached pfn might have been + * already reset to zone end in compact_finished() + */ + if (free_pfn > zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = free_pfn; + } trace_mm_compaction_end(ret); @@ -1240,7 +1305,8 @@ out: } static unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, enum migrate_mode mode, int *contended) + gfp_t gfp_mask, enum migrate_mode mode, int *contended, + int alloc_flags, int classzone_idx) { unsigned long ret; struct compact_control cc = { @@ -1250,6 +1316,8 @@ static unsigned long compact_zone_order(struct zone *zone, int order, .gfp_mask = gfp_mask, .zone = zone, .mode = mode, + .alloc_flags = alloc_flags, + .classzone_idx = classzone_idx, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1274,14 +1342,13 @@ int sysctl_extfrag_threshold = 500; * @mode: The migration mode for async, sync light, or sync migration * @contended: Return value that determines if compaction was aborted due to * need_resched() or lock contention - * @candidate_zone: Return the zone where we think allocation should succeed * * This is the main entry point for direct page compaction. 
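The drain trigger itself is plain mask arithmetic: round the migration scanner's pfn down to the start of its cc->order aligned block and flush only once the scanner has left the block where pages were last freed. A small standalone illustration with made-up pfns and order:

#include <stdio.h>

/* round a pfn down to the start of its order-aligned block, as check_drain does */
static unsigned long order_block_start(unsigned long pfn, unsigned int order)
{
        return pfn & ~((1UL << order) - 1);
}

int main(void)
{
        unsigned int order = 9;                 /* 512-page blocks */
        unsigned long last_migrated_pfn = 1000; /* where pages were last freed */
        unsigned long migrate_pfn = 1600;       /* where the scanner is now */

        if (last_migrated_pfn < order_block_start(migrate_pfn, order))
                printf("scanner left the block starting at pfn %lu: drain now\n",
                       order_block_start(migrate_pfn, order));
        else
                printf("still inside the same block, keep batching\n");
        return 0;
}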
*/ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, enum migrate_mode mode, int *contended, - struct zone **candidate_zone) + int alloc_flags, int classzone_idx) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -1289,7 +1356,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; int rc = COMPACT_DEFERRED; - int alloc_flags = 0; int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ *contended = COMPACT_CONTENDED_NONE; @@ -1298,10 +1364,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, if (!order || !may_enter_fs || !may_perform_io) return COMPACT_SKIPPED; -#ifdef CONFIG_CMA - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; -#endif /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { @@ -1312,7 +1374,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, continue; status = compact_zone_order(zone, order, gfp_mask, mode, - &zone_contended); + &zone_contended, alloc_flags, classzone_idx); rc = max(status, rc); /* * It takes at least one zone that wasn't lock contended @@ -1321,9 +1383,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, all_zones_contended &= zone_contended; /* If a normal allocation would succeed, stop compacting */ - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, - alloc_flags)) { - *candidate_zone = zone; + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), + classzone_idx, alloc_flags)) { /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. The caller @@ -1345,7 +1406,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, goto break_loop; } - if (mode != MIGRATE_ASYNC) { + if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) { /* * We think that allocation won't succeed in this zone * so we defer compaction there. 
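The deferral relies on the kernel's existing defer_compaction()/compaction_deferred() machinery, which after each full-but-fruitless run skips an exponentially growing number of later attempts on that zone. The sketch below is only a rough model of that backoff, not the kernel code; zone_defer and MAX_DEFER_SHIFT are illustrative names.

#include <stdbool.h>
#include <stdio.h>

#define MAX_DEFER_SHIFT 6

struct zone_defer {
        unsigned int considered;
        unsigned int defer_shift;
};

/* a full compaction run failed to create the needed page: back off harder */
static void defer_compaction(struct zone_defer *z)
{
        z->considered = 0;
        if (z->defer_shift < MAX_DEFER_SHIFT)
                z->defer_shift++;
}

/* should this attempt be skipped, or has the backoff period passed? */
static bool compaction_deferred(struct zone_defer *z)
{
        unsigned int limit = 1U << z->defer_shift;

        if (++z->considered >= limit) {
                z->considered = limit;
                return false;           /* waited long enough, try again */
        }
        return true;                    /* still deferred */
}

int main(void)
{
        struct zone_defer z = { 0, 0 };
        int fails;

        for (fails = 1; fails <= 4; fails++) {
                int skipped = 0;

                defer_compaction(&z);
                while (compaction_deferred(&z))
                        skipped++;
                printf("after failure %d: skipped %d attempts before retrying\n",
                       fails, skipped);
        }
        return 0;
}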
If it ends up diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c index 789ff70c8a4a..5bf5906ce13b 100644 --- a/mm/debug-pagealloc.c +++ b/mm/debug-pagealloc.c @@ -2,23 +2,55 @@ #include <linux/string.h> #include <linux/mm.h> #include <linux/highmem.h> -#include <linux/page-debug-flags.h> +#include <linux/page_ext.h> #include <linux/poison.h> #include <linux/ratelimit.h> +static bool page_poisoning_enabled __read_mostly; + +static bool need_page_poisoning(void) +{ + if (!debug_pagealloc_enabled()) + return false; + + return true; +} + +static void init_page_poisoning(void) +{ + if (!debug_pagealloc_enabled()) + return; + + page_poisoning_enabled = true; +} + +struct page_ext_operations page_poisoning_ops = { + .need = need_page_poisoning, + .init = init_page_poisoning, +}; + static inline void set_page_poison(struct page *page) { - __set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); + struct page_ext *page_ext; + + page_ext = lookup_page_ext(page); + __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } static inline void clear_page_poison(struct page *page) { - __clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); + struct page_ext *page_ext; + + page_ext = lookup_page_ext(page); + __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } static inline bool page_poison(struct page *page) { - return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); + struct page_ext *page_ext; + + page_ext = lookup_page_ext(page); + return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } static void poison_page(struct page *page) @@ -93,8 +125,11 @@ static void unpoison_pages(struct page *page, int n) unpoison_page(page + i); } -void kernel_map_pages(struct page *page, int numpages, int enable) +void __kernel_map_pages(struct page *page, int numpages, int enable) { + if (!page_poisoning_enabled) + return; + if (enable) unpoison_pages(page, numpages); else diff --git a/mm/debug.c b/mm/debug.c index 5ce45c9a29b5..0e58f3211f89 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -95,7 +95,10 @@ void dump_page_badflags(struct page *page, const char *reason, dump_flags(page->flags & badflags, pageflag_names, ARRAY_SIZE(pageflag_names)); } - mem_cgroup_print_bad_page(page); +#ifdef CONFIG_MEMCG + if (page->mem_cgroup) + pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); +#endif } void dump_page(struct page *page, const char *reason) diff --git a/mm/fadvise.c b/mm/fadvise.c index 3bcfd81db45e..2ad7adf4f0a4 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -117,7 +117,11 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) __filemap_fdatawrite_range(mapping, offset, endbyte, WB_SYNC_NONE); - /* First and last FULL page! */ + /* + * First and last FULL page! Partial pages are deliberately + * preserved on the expectation that it is better to preserve + * needed memory than to discard unneeded memory. 
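Page poisoning now keeps its state in the page_ext side table instead of a field in struct page: lookup_page_ext() maps a page to its extension record, a flag bit there marks the page as poisoned, and the poison pattern is written on free and verified on the next allocation. The userspace model below indexes the side table by pfn; page_ext_model, NPAGES and EXT_POISON are illustrative stand-ins, while 0xaa mirrors the kernel's PAGE_POISON pattern.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NPAGES          8
#define PAGE_SZ         4096
#define POISON_PATTERN  0xaa
#define EXT_POISON      (1UL << 0)

struct page_ext_model { unsigned long flags; };  /* one per page, kept off to the side */

static struct page_ext_model ext_table[NPAGES];
static unsigned char memory[NPAGES][PAGE_SZ];

static struct page_ext_model *lookup_ext(unsigned long pfn)
{
        return &ext_table[pfn];
}

/* on free: mark the page and fill it with the pattern */
static void poison_page(unsigned long pfn)
{
        lookup_ext(pfn)->flags |= EXT_POISON;
        memset(memory[pfn], POISON_PATTERN, PAGE_SZ);
}

/* on the next allocation: verify nobody wrote to the free page */
static bool unpoison_page(unsigned long pfn)
{
        struct page_ext_model *ext = lookup_ext(pfn);
        size_t i;

        if (!(ext->flags & EXT_POISON))
                return true;                    /* was never poisoned */
        for (i = 0; i < PAGE_SZ; i++)
                if (memory[pfn][i] != POISON_PATTERN)
                        return false;           /* use-after-free style write */
        ext->flags &= ~EXT_POISON;
        return true;
}

int main(void)
{
        poison_page(3);
        memory[3][100] = 0x42;                  /* simulate a stray write */
        printf("page 3 clean on reallocation: %s\n",
               unpoison_page(3) ? "yes" : "no");
        return 0;
}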
+ */ start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; end_index = (endbyte >> PAGE_CACHE_SHIFT); diff --git a/mm/filemap.c b/mm/filemap.c index 14b4642279f1..bd8543c6508f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -62,16 +62,16 @@ /* * Lock ordering: * - * ->i_mmap_mutex (truncate_pagecache) + * ->i_mmap_rwsem (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock * * ->i_mutex - * ->i_mmap_mutex (truncate->unmap_mapping_range) + * ->i_mmap_rwsem (truncate->unmap_mapping_range) * * ->mmap_sem - * ->i_mmap_mutex + * ->i_mmap_rwsem * ->page_table_lock or pte_lock (various, mainly in memory.c) * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) * @@ -85,7 +85,7 @@ * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * - * ->i_mmap_mutex + * ->i_mmap_rwsem * ->anon_vma.lock (vma_adjust) * * ->anon_vma.lock @@ -105,7 +105,7 @@ * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * - * ->i_mmap_mutex + * ->i_mmap_rwsem * ->tasklist_lock (memory_failure, collect_procs_ao) */ @@ -2464,7 +2464,7 @@ ssize_t generic_perform_write(struct file *file, /* * Copies from kernel address space cannot fail (NFSD is a big user). */ - if (segment_eq(get_fs(), KERNEL_DS)) + if (!iter_is_iovec(i)) flags |= AOP_FLAG_UNINTERRUPTIBLE; do { diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index d8d9fe3f685c..0d105aeff82f 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -155,22 +155,14 @@ xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) EXPORT_SYMBOL_GPL(xip_file_read); /* - * __xip_unmap is invoked from xip_unmap and - * xip_write + * __xip_unmap is invoked from xip_unmap and xip_write * * This function walks all vmas of the address_space and unmaps the * __xip_sparse_page when found at pgoff. 
*/ -static void -__xip_unmap (struct address_space * mapping, - unsigned long pgoff) +static void __xip_unmap(struct address_space * mapping, unsigned long pgoff) { struct vm_area_struct *vma; - struct mm_struct *mm; - unsigned long address; - pte_t *pte; - pte_t pteval; - spinlock_t *ptl; struct page *page; unsigned count; int locked = 0; @@ -182,11 +174,14 @@ __xip_unmap (struct address_space * mapping, return; retry: - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - mm = vma->vm_mm; - address = vma->vm_start + + pte_t *pte, pteval; + spinlock_t *ptl; + struct mm_struct *mm = vma->vm_mm; + unsigned long address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + BUG_ON(address < vma->vm_start || address >= vma->vm_end); pte = page_check_address(page, mm, address, &ptl, 1); if (pte) { @@ -202,7 +197,7 @@ retry: page_cache_release(page); } } - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_read(mapping); if (locked) { mutex_unlock(&xip_sparse_mutex); diff --git a/mm/fremap.c b/mm/fremap.c index 72b8fa361433..2805d71cf476 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, if (pte_present(pte)) { flush_cache_page(vma, addr, pte_pfn(pte)); - pte = ptep_clear_flush(vma, addr, ptep); + pte = ptep_clear_flush_notify(vma, addr, ptep); page = vm_normal_page(vma, addr, pte); if (page) { if (pte_dirty(pte)) @@ -238,13 +238,13 @@ get_write_lock: } goto out_freed; } - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); flush_dcache_mmap_lock(mapping); vma->vm_flags |= VM_NONLINEAR; vma_interval_tree_remove(vma, &mapping->i_mmap); vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); flush_dcache_mmap_unlock(mapping); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); } if (vma->vm_flags & VM_LOCKED) { diff --git a/mm/frontswap.c b/mm/frontswap.c index c30eec536f03..8d82809eb085 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -182,7 +182,7 @@ void __frontswap_init(unsigned type, unsigned long *map) if (frontswap_ops) frontswap_ops->init(type); else { - BUG_ON(type > MAX_SWAPFILES); + BUG_ON(type >= MAX_SWAPFILES); set_bit(type, need_init); } } @@ -244,8 +244,10 @@ int __frontswap_store(struct page *page) the (older) page from frontswap */ inc_frontswap_failed_stores(); - if (dup) + if (dup) { __frontswap_clear(sis, offset); + frontswap_ops->invalidate_page(type, offset); + } } if (frontswap_writethrough_enabled) /* report failure so swap also writes to swap device */ @@ -3,7 +3,6 @@ #include <linux/err.h> #include <linux/spinlock.h> -#include <linux/hugetlb.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/rmap.h> @@ -12,6 +11,7 @@ #include <linux/sched.h> #include <linux/rwsem.h> +#include <linux/hugetlb.h> #include <asm/pgtable.h> #include "internal.h" @@ -875,6 +875,49 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 1; } +static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, + unsigned long end, int write, + struct page **pages, int *nr) +{ + int refs; + struct page *head, *page, *tail; + + if (write && !pgd_write(orig)) + return 0; + + refs = 0; + head = pgd_page(orig); + page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); + tail = page; + do { + VM_BUG_ON_PAGE(compound_head(page) != head, page); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if 
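The i_mmap_mutex to i_mmap_rwsem conversion running through these files lets pure walkers of a mapping's interval tree, like the __xip_unmap() loop above, share the lock, while paths that modify the tree still take it exclusively. As a loose userspace analogy only (POSIX rwlocks, not the kernel's rwsem API), the read/write split looks like this:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t i_mmap_model = PTHREAD_RWLOCK_INITIALIZER;

static void walk_mappings(void)         /* lookup path, e.g. an unmap walk */
{
        pthread_rwlock_rdlock(&i_mmap_model);   /* kernel: i_mmap_lock_read() */
        /* ... walk the interval tree ... */
        pthread_rwlock_unlock(&i_mmap_model);   /* kernel: i_mmap_unlock_read() */
}

static void modify_mappings(void)       /* insert/remove path */
{
        pthread_rwlock_wrlock(&i_mmap_model);   /* kernel: i_mmap_lock_write() */
        /* ... modify the interval tree ... */
        pthread_rwlock_unlock(&i_mmap_model);   /* kernel: i_mmap_unlock_write() */
}

int main(void)
{
        walk_mappings();
        modify_mappings();
        puts("lookups can run concurrently; modifications stay exclusive");
        return 0;
}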
(!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + + if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; + } + + return 1; +} + static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -902,6 +945,14 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pages, nr)) return 0; + } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) { + /* + * architecture have different format for hugetlbfs + * pmd format and THP pmd format + */ + if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, + PMD_SHIFT, next, write, pages, nr)) + return 0; } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); @@ -909,22 +960,26 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, return 1; } -static int gup_pud_range(pgd_t *pgdp, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) +static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; - pudp = pud_offset(pgdp, addr); + pudp = pud_offset(&pgd, addr); do { pud_t pud = READ_ONCE(*pudp); next = pud_addr_end(addr, end); if (pud_none(pud)) return 0; - if (pud_huge(pud)) { + if (unlikely(pud_huge(pud))) { if (!gup_huge_pud(pud, pudp, addr, next, write, - pages, nr)) + pages, nr)) + return 0; + } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { + if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, + PUD_SHIFT, next, write, pages, nr)) return 0; } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) return 0; @@ -970,10 +1025,20 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, local_irq_save(flags); pgdp = pgd_offset(mm, addr); do { + pgd_t pgd = ACCESS_ONCE(*pgdp); + next = pgd_addr_end(addr, end); - if (pgd_none(*pgdp)) + if (pgd_none(pgd)) break; - else if (!gup_pud_range(pgdp, addr, next, write, pages, &nr)) + if (unlikely(pgd_huge(pgd))) { + if (!gup_huge_pgd(pgd, pgdp, addr, next, write, + pages, &nr)) + break; + } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { + if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, + PGDIR_SHIFT, next, write, pages, &nr)) + break; + } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) break; } while (pgdp++, addr = next, addr != end); local_irq_restore(flags); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index de984159cf0b..817a875f2b8c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -784,7 +784,6 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, if (!pmd_none(*pmd)) return false; entry = mk_pmd(zero_page, vma->vm_page_prot); - entry = pmd_wrprotect(entry); entry = pmd_mkhuge(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); @@ -805,7 +804,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma, vma->vm_flags))) return VM_FAULT_OOM; - if (!(flags & FAULT_FLAG_WRITE) && + if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) && transparent_hugepage_use_zero_page()) { spinlock_t *ptl; pgtable_t pgtable; @@ -1036,7 +1035,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); - 
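gup_huge_pgd() follows the usual gup-fast recipe: take all the needed page references speculatively, then re-read the page-table entry and drop the references again if it changed underneath us. The userspace model below shows only that grab/recheck/rollback pattern with C11 atomics; head_page and gup_speculative() are made-up names, and unlike page_cache_add_speculative() the toy grab itself can never fail.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct head_page { atomic_int refcount; };

static bool gup_speculative(struct head_page *head, atomic_ulong *ptep,
                            unsigned long orig, int refs)
{
        atomic_fetch_add(&head->refcount, refs);         /* speculative grab */

        if (atomic_load(ptep) != orig) {                 /* entry changed meanwhile? */
                atomic_fetch_sub(&head->refcount, refs); /* undo, caller will retry */
                return false;
        }
        return true;                                     /* references are now stable */
}

int main(void)
{
        struct head_page head = { 1 };
        atomic_ulong pte = 0x1000;

        printf("first attempt:  %s\n",
               gup_speculative(&head, &pte, 0x1000, 4) ? "ok" : "raced");
        atomic_store(&pte, 0x2000);                      /* concurrent unmap/remap */
        printf("second attempt: %s\n",
               gup_speculative(&head, &pte, 0x1000, 4) ? "ok" : "raced");
        printf("refcount: %d\n", atomic_load(&head.refcount));
        return 0;
}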
pmdp_clear_flush(vma, haddr, pmd); + pmdp_clear_flush_notify(vma, haddr, pmd); /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(mm, pmd); @@ -1179,7 +1178,7 @@ alloc: pmd_t entry; entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_clear_flush(vma, haddr, pmd); + pmdp_clear_flush_notify(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); @@ -1400,7 +1399,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, * pgtable_trans_huge_withdraw after finishing pmdp related * operations. */ - orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); + orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd, + tlb->fullmm); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { @@ -1512,7 +1512,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, pmd_t entry; ret = 1; if (!prot_numa) { - entry = pmdp_get_and_clear(mm, addr, pmd); + entry = pmdp_get_and_clear_notify(mm, addr, pmd); if (pmd_numa(entry)) entry = pmd_mknonnuma(entry); entry = pmd_modify(entry, newprot); @@ -1644,6 +1644,7 @@ static int __split_huge_page_splitting(struct page *page, * serialize against split_huge_page*. */ pmdp_splitting_flush(vma, address, pmd); + ret = 1; spin_unlock(ptl); } @@ -2834,7 +2835,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pmd_t _pmd; int i; - pmdp_clear_flush(vma, haddr, pmd); + pmdp_clear_flush_notify(vma, haddr, pmd); /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(mm, pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9fd722769927..85032de5e20f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -582,7 +582,7 @@ retry_cpuset: for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) { + if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) { page = dequeue_huge_page_node(h, zone_to_nid(zone)); if (page) { if (avoid_reserve) @@ -1457,7 +1457,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) return 0; found: - BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1)); + BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); /* Put them into a private list first because mem_map is not up yet */ list_add(&m->list, &huge_boot_pages); m->hstate = h; @@ -2083,7 +2083,7 @@ static void hugetlb_register_node(struct node *node) * devices of nodes that have memory. All on-line nodes should have * registered their associated device by this time. 
*/ -static void hugetlb_register_all_nodes(void) +static void __init hugetlb_register_all_nodes(void) { int nid; @@ -2598,8 +2598,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, } set_huge_pte_at(dst, addr, dst_pte, entry); } else { - if (cow) + if (cow) { huge_ptep_set_wrprotect(src, addr, src_pte); + mmu_notifier_invalidate_range(src, mmun_start, + mmun_end); + } entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); @@ -2638,8 +2641,9 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_start_vma(tlb, vma); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + address = start; again: - for (address = start; address < end; address += sz) { + for (; address < end; address += sz) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; @@ -2686,6 +2690,7 @@ again: page_remove_rmap(page); force_flush = !__tlb_remove_page(tlb, page); if (force_flush) { + address += sz; spin_unlock(ptl); break; } @@ -2724,9 +2729,9 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, * on its way out. We're lucky that the flag has such an appropriate * name, and can in fact be safely cleared here. We could clear it * before the __unmap_hugepage_range above, but all that's necessary - * is to clear it before releasing the i_mmap_mutex. This works + * is to clear it before releasing the i_mmap_rwsem. This works * because in the context this is called, the VMA is about to be - * destroyed and the i_mmap_mutex is held. + * destroyed and the i_mmap_rwsem is held. */ vma->vm_flags &= ~VM_MAYSHARE; } @@ -2772,7 +2777,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * this mapping should be shared between all the VMAs, * __unmap_hugepage_range() is called as the lock is already held */ - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) @@ -2789,7 +2794,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, unmap_hugepage_range(iter_vma, address, address + huge_page_size(h), page); } - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); } /* @@ -2899,6 +2904,7 @@ retry_avoidcopy: /* Break COW */ huge_ptep_clear_flush(vma, address, ptep); + mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page); @@ -3346,7 +3352,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, flush_cache_range(vma, address, end); mmu_notifier_invalidate_range_start(mm, start, end); - mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); + i_mmap_lock_write(vma->vm_file->f_mapping); for (; address < end; address += huge_page_size(h)) { spinlock_t *ptl; ptep = huge_pte_offset(mm, address); @@ -3368,13 +3374,14 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, spin_unlock(ptl); } /* - * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare + * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare * may have cleared our pud entry and done put_page on the page table: - * once we release i_mmap_mutex, another task can do the final put_page + * once we release i_mmap_rwsem, another task can do the final put_page * and that page table be reused and filled with junk. 
*/ flush_tlb_range(vma, start, end); - mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + mmu_notifier_invalidate_range(mm, start, end); + i_mmap_unlock_write(vma->vm_file->f_mapping); mmu_notifier_invalidate_range_end(mm, start, end); return pages << h->order; @@ -3523,7 +3530,7 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the * code much cleaner. pmd allocation is essential for the shared case because - * pud has to be populated inside the same i_mmap_mutex section - otherwise + * pud has to be populated inside the same i_mmap_rwsem section - otherwise * racing tasks could either miss the sharing (see huge_pte_offset) or select a * bad pmd for sharing. */ @@ -3542,7 +3549,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -3570,7 +3577,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); return pte; } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index a67c26e0f360..037e1c00a5b7 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -14,6 +14,7 @@ */ #include <linux/cgroup.h> +#include <linux/page_counter.h> #include <linux/slab.h> #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> @@ -23,7 +24,7 @@ struct hugetlb_cgroup { /* * the counter to account for hugepages from hugetlb. 
*/ - struct res_counter hugepage[HUGE_MAX_HSTATE]; + struct page_counter hugepage[HUGE_MAX_HSTATE]; }; #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) @@ -60,7 +61,7 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) int idx; for (idx = 0; idx < hugetlb_max_hstate; idx++) { - if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) + if (page_counter_read(&h_cg->hugepage[idx])) return true; } return false; @@ -79,12 +80,12 @@ hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent_h_cgroup) { for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) - res_counter_init(&h_cgroup->hugepage[idx], - &parent_h_cgroup->hugepage[idx]); + page_counter_init(&h_cgroup->hugepage[idx], + &parent_h_cgroup->hugepage[idx]); } else { root_h_cgroup = h_cgroup; for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) - res_counter_init(&h_cgroup->hugepage[idx], NULL); + page_counter_init(&h_cgroup->hugepage[idx], NULL); } return &h_cgroup->css; } @@ -108,9 +109,8 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, struct page *page) { - int csize; - struct res_counter *counter; - struct res_counter *fail_res; + unsigned int nr_pages; + struct page_counter *counter; struct hugetlb_cgroup *page_hcg; struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); @@ -123,15 +123,15 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, if (!page_hcg || page_hcg != h_cg) goto out; - csize = PAGE_SIZE << compound_order(page); + nr_pages = 1 << compound_order(page); if (!parent) { parent = root_h_cgroup; /* root has no limit */ - res_counter_charge_nofail(&parent->hugepage[idx], - csize, &fail_res); + page_counter_charge(&parent->hugepage[idx], nr_pages); } counter = &h_cg->hugepage[idx]; - res_counter_uncharge_until(counter, counter->parent, csize); + /* Take the pages off the local counter */ + page_counter_cancel(counter, nr_pages); set_hugetlb_cgroup(page, parent); out: @@ -166,9 +166,8 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { int ret = 0; - struct res_counter *fail_res; + struct page_counter *counter; struct hugetlb_cgroup *h_cg = NULL; - unsigned long csize = nr_pages * PAGE_SIZE; if (hugetlb_cgroup_disabled()) goto done; @@ -187,7 +186,7 @@ again: } rcu_read_unlock(); - ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res); + ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter); css_put(&h_cg->css); done: *ptr = h_cg; @@ -213,7 +212,6 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page) { struct hugetlb_cgroup *h_cg; - unsigned long csize = nr_pages * PAGE_SIZE; if (hugetlb_cgroup_disabled()) return; @@ -222,61 +220,76 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, if (unlikely(!h_cg)) return; set_hugetlb_cgroup(page, NULL); - res_counter_uncharge(&h_cg->hugepage[idx], csize); + page_counter_uncharge(&h_cg->hugepage[idx], nr_pages); return; } void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { - unsigned long csize = nr_pages * PAGE_SIZE; - if (hugetlb_cgroup_disabled() || !h_cg) return; if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) return; - res_counter_uncharge(&h_cg->hugepage[idx], csize); + page_counter_uncharge(&h_cg->hugepage[idx], nr_pages); return; } +enum { + RES_USAGE, + RES_LIMIT, + RES_MAX_USAGE, + RES_FAILCNT, +}; + static u64 
hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - int idx, name; + struct page_counter *counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); - idx = MEMFILE_IDX(cft->private); - name = MEMFILE_ATTR(cft->private); + counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; - return res_counter_read_u64(&h_cg->hugepage[idx], name); + switch (MEMFILE_ATTR(cft->private)) { + case RES_USAGE: + return (u64)page_counter_read(counter) * PAGE_SIZE; + case RES_LIMIT: + return (u64)counter->limit * PAGE_SIZE; + case RES_MAX_USAGE: + return (u64)counter->watermark * PAGE_SIZE; + case RES_FAILCNT: + return counter->failcnt; + default: + BUG(); + } } +static DEFINE_MUTEX(hugetlb_limit_mutex); + static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - int idx, name, ret; - unsigned long long val; + int ret, idx; + unsigned long nr_pages; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); + if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ + return -EINVAL; + buf = strstrip(buf); + ret = page_counter_memparse(buf, &nr_pages); + if (ret) + return ret; + idx = MEMFILE_IDX(of_cft(of)->private); - name = MEMFILE_ATTR(of_cft(of)->private); - switch (name) { + switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_LIMIT: - if (hugetlb_cgroup_is_root(h_cg)) { - /* Can't set limit on root */ - ret = -EINVAL; - break; - } - /* This function does all necessary parse...reuse it */ - ret = res_counter_memparse_write_strategy(buf, &val); - if (ret) - break; - val = ALIGN(val, 1ULL << huge_page_shift(&hstates[idx])); - ret = res_counter_set_limit(&h_cg->hugepage[idx], val); + mutex_lock(&hugetlb_limit_mutex); + ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages); + mutex_unlock(&hugetlb_limit_mutex); break; default: ret = -EINVAL; @@ -288,18 +301,18 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - int idx, name, ret = 0; + int ret = 0; + struct page_counter *counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); - idx = MEMFILE_IDX(of_cft(of)->private); - name = MEMFILE_ATTR(of_cft(of)->private); + counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; - switch (name) { + switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_MAX_USAGE: - res_counter_reset_max(&h_cg->hugepage[idx]); + page_counter_reset_watermark(counter); break; case RES_FAILCNT: - res_counter_reset_failcnt(&h_cg->hugepage[idx]); + counter->failcnt = 0; break; default: ret = -EINVAL; diff --git a/mm/internal.h b/mm/internal.h index 829304090b90..efad241f7014 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -108,6 +108,31 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* * in mm/page_alloc.c */ + +/* + * Locate the struct page for both the matching buddy in our + * pair (buddy1) and the combined O(n+1) page they form (page). 
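The hugetlb controller's move from res_counter to page_counter means charges are tracked in pages and converted to bytes only when the cgroup files are read, while limit, high watermark and failure count keep their old meaning. Below is a minimal, non-hierarchical userspace model of such a counter; page_counter_model and MODEL_PAGE_SIZE are illustrative, and the real page_counter is lockless and charges a whole parent chain.

#include <stdbool.h>
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096UL

struct page_counter_model {
        unsigned long count;            /* pages currently charged */
        unsigned long limit;            /* maximum pages allowed */
        unsigned long watermark;        /* highest count ever observed */
        unsigned long failcnt;          /* failed charge attempts */
};

static bool try_charge(struct page_counter_model *c, unsigned long nr_pages)
{
        if (c->count + nr_pages > c->limit) {
                c->failcnt++;
                return false;                   /* charge rejected */
        }
        c->count += nr_pages;
        if (c->count > c->watermark)
                c->watermark = c->count;
        return true;
}

static void uncharge(struct page_counter_model *c, unsigned long nr_pages)
{
        c->count -= nr_pages;
}

int main(void)
{
        struct page_counter_model huge = { .limit = 512 };

        try_charge(&huge, 256);
        try_charge(&huge, 512);                 /* would exceed the limit: fails */
        uncharge(&huge, 128);

        printf("usage: %lu bytes, watermark: %lu pages, failcnt: %lu\n",
               huge.count * MODEL_PAGE_SIZE, huge.watermark, huge.failcnt);
        return 0;
}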
+ * + * 1) Any buddy B1 will have an order O twin B2 which satisfies + * the following equation: + * B2 = B1 ^ (1 << O) + * For example, if the starting buddy (buddy2) is #8 its order + * 1 buddy is #10: + * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 + * + * 2) Any buddy B will have an order O+1 parent P which + * satisfies the following equation: + * P = B & ~(1 << O) + * + * Assumption: *_mem_map is contiguous at least up to MAX_ORDER + */ +static inline unsigned long +__find_buddy_index(unsigned long page_idx, unsigned int order) +{ + return page_idx ^ (1 << order); +} + +extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); #ifdef CONFIG_MEMORY_FAILURE @@ -136,13 +161,10 @@ struct compact_control { unsigned long migrate_pfn; /* isolate_migratepages search base */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ - bool finished_update_free; /* True when the zone cached pfns are - * no longer being updated - */ - bool finished_update_migrate; - int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ + const int alloc_flags; /* alloc flags of a direct compactor */ + const int classzone_idx; /* zone index of a direct compactor */ struct zone *zone; int contended; /* Signal need_sched() or lock * contention detected during diff --git a/mm/iov_iter.c b/mm/iov_iter.c index eafcf60f6b83..a1599ca4ab0e 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c @@ -3,95 +3,136 @@ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/vmalloc.h> - -static size_t copy_to_iter_iovec(void *from, size_t bytes, struct iov_iter *i) -{ - size_t skip, copy, left, wanted; - const struct iovec *iov; - char __user *buf; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - wanted = bytes; - iov = i->iov; - skip = i->iov_offset; - buf = iov->iov_base + skip; - copy = min(bytes, iov->iov_len - skip); - - left = __copy_to_user(buf, from, copy); - copy -= left; - skip += copy; - from += copy; - bytes -= copy; - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_to_user(buf, from, copy); - copy -= left; - skip = copy; - from += copy; - bytes -= copy; - } - - if (skip == iov->iov_len) { - iov++; - skip = 0; - } - i->count -= wanted - bytes; - i->nr_segs -= iov - i->iov; - i->iov = iov; - i->iov_offset = skip; - return wanted - bytes; -} - -static size_t copy_from_iter_iovec(void *to, size_t bytes, struct iov_iter *i) -{ - size_t skip, copy, left, wanted; - const struct iovec *iov; - char __user *buf; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - wanted = bytes; - iov = i->iov; - skip = i->iov_offset; - buf = iov->iov_base + skip; - copy = min(bytes, iov->iov_len - skip); - - left = __copy_from_user(to, buf, copy); - copy -= left; - skip += copy; - to += copy; - bytes -= copy; - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_from_user(to, buf, copy); - copy -= left; - skip = copy; - to += copy; - bytes -= copy; - } - - if (skip == iov->iov_len) { - iov++; - skip = 0; - } - i->count -= wanted - bytes; - i->nr_segs -= iov - i->iov; - i->iov = iov; - i->iov_offset = skip; - return wanted - bytes; +#include 
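The buddy relations spelled out in the comment above __find_buddy_index() reduce to two bit operations, shown here on the comment's own example of index 8 at order 1:

#include <stdio.h>

static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);       /* B2 = B1 ^ (1 << O) */
}

static unsigned long combined_index(unsigned long page_idx, unsigned int order)
{
        return page_idx & ~(1UL << order);      /* P = B & ~(1 << O) */
}

int main(void)
{
        printf("order-1 buddy of index 8: %lu\n", find_buddy_index(8, 1));  /* 10 */
        printf("order-2 block they form:  %lu\n", combined_index(10, 1));   /* 8 */
        return 0;
}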
<net/checksum.h> + +#define iterate_iovec(i, n, __v, __p, skip, STEP) { \ + size_t left; \ + size_t wanted = n; \ + __p = i->iov; \ + __v.iov_len = min(n, __p->iov_len - skip); \ + if (likely(__v.iov_len)) { \ + __v.iov_base = __p->iov_base + skip; \ + left = (STEP); \ + __v.iov_len -= left; \ + skip += __v.iov_len; \ + n -= __v.iov_len; \ + } else { \ + left = 0; \ + } \ + while (unlikely(!left && n)) { \ + __p++; \ + __v.iov_len = min(n, __p->iov_len); \ + if (unlikely(!__v.iov_len)) \ + continue; \ + __v.iov_base = __p->iov_base; \ + left = (STEP); \ + __v.iov_len -= left; \ + skip = __v.iov_len; \ + n -= __v.iov_len; \ + } \ + n = wanted - n; \ +} + +#define iterate_kvec(i, n, __v, __p, skip, STEP) { \ + size_t wanted = n; \ + __p = i->kvec; \ + __v.iov_len = min(n, __p->iov_len - skip); \ + if (likely(__v.iov_len)) { \ + __v.iov_base = __p->iov_base + skip; \ + (void)(STEP); \ + skip += __v.iov_len; \ + n -= __v.iov_len; \ + } \ + while (unlikely(n)) { \ + __p++; \ + __v.iov_len = min(n, __p->iov_len); \ + if (unlikely(!__v.iov_len)) \ + continue; \ + __v.iov_base = __p->iov_base; \ + (void)(STEP); \ + skip = __v.iov_len; \ + n -= __v.iov_len; \ + } \ + n = wanted; \ +} + +#define iterate_bvec(i, n, __v, __p, skip, STEP) { \ + size_t wanted = n; \ + __p = i->bvec; \ + __v.bv_len = min_t(size_t, n, __p->bv_len - skip); \ + if (likely(__v.bv_len)) { \ + __v.bv_page = __p->bv_page; \ + __v.bv_offset = __p->bv_offset + skip; \ + (void)(STEP); \ + skip += __v.bv_len; \ + n -= __v.bv_len; \ + } \ + while (unlikely(n)) { \ + __p++; \ + __v.bv_len = min_t(size_t, n, __p->bv_len); \ + if (unlikely(!__v.bv_len)) \ + continue; \ + __v.bv_page = __p->bv_page; \ + __v.bv_offset = __p->bv_offset; \ + (void)(STEP); \ + skip = __v.bv_len; \ + n -= __v.bv_len; \ + } \ + n = wanted; \ +} + +#define iterate_all_kinds(i, n, v, I, B, K) { \ + size_t skip = i->iov_offset; \ + if (unlikely(i->type & ITER_BVEC)) { \ + const struct bio_vec *bvec; \ + struct bio_vec v; \ + iterate_bvec(i, n, v, bvec, skip, (B)) \ + } else if (unlikely(i->type & ITER_KVEC)) { \ + const struct kvec *kvec; \ + struct kvec v; \ + iterate_kvec(i, n, v, kvec, skip, (K)) \ + } else { \ + const struct iovec *iov; \ + struct iovec v; \ + iterate_iovec(i, n, v, iov, skip, (I)) \ + } \ +} + +#define iterate_and_advance(i, n, v, I, B, K) { \ + size_t skip = i->iov_offset; \ + if (unlikely(i->type & ITER_BVEC)) { \ + const struct bio_vec *bvec; \ + struct bio_vec v; \ + iterate_bvec(i, n, v, bvec, skip, (B)) \ + if (skip == bvec->bv_len) { \ + bvec++; \ + skip = 0; \ + } \ + i->nr_segs -= bvec - i->bvec; \ + i->bvec = bvec; \ + } else if (unlikely(i->type & ITER_KVEC)) { \ + const struct kvec *kvec; \ + struct kvec v; \ + iterate_kvec(i, n, v, kvec, skip, (K)) \ + if (skip == kvec->iov_len) { \ + kvec++; \ + skip = 0; \ + } \ + i->nr_segs -= kvec - i->kvec; \ + i->kvec = kvec; \ + } else { \ + const struct iovec *iov; \ + struct iovec v; \ + iterate_iovec(i, n, v, iov, skip, (I)) \ + if (skip == iov->iov_len) { \ + iov++; \ + skip = 0; \ + } \ + i->nr_segs -= iov - i->iov; \ + i->iov = iov; \ + } \ + i->count -= n; \ + i->iov_offset = skip; \ } static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, @@ -256,134 +297,6 @@ done: return wanted - bytes; } -static size_t zero_iovec(size_t bytes, struct iov_iter *i) -{ - size_t skip, copy, left, wanted; - const struct iovec *iov; - char __user *buf; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - wanted = bytes; - 
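These iterate_*() macros all encode one pattern: walk the iterator's segments, hand each contiguous chunk to a caller-supplied STEP expression, stop early if STEP reports bytes it could not process (the __copy_to_user() convention), and leave behind how much was consumed. Below is a stripped-down userspace reduction that handles plain iovecs only; iterate_iovec_model() is an illustrative name, and the kernel macros additionally cover kvec and bio_vec segments and advance the iterator state afterwards.

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

#define iterate_iovec_model(iov, nsegs, n, v, STEP) do {            \
        size_t left, wanted = (n), seg = 0;                         \
        while ((n) && seg < (nsegs)) {                              \
                (v).iov_len = ((n) < (iov)[seg].iov_len) ?          \
                               (n) : (iov)[seg].iov_len;            \
                (v).iov_base = (iov)[seg].iov_base;                 \
                left = (STEP);                                      \
                (v).iov_len -= left;                                \
                (n) -= (v).iov_len;                                 \
                if (left)                                           \
                        break;                                      \
                seg++;                                              \
        }                                                           \
        (n) = wanted - (n);                                         \
} while (0)

int main(void)
{
        char a[4], b[6];
        struct iovec iov[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
        struct iovec v;
        const char *from = "hello world";
        size_t n = 10;

        /* STEP copies one chunk and reports 0 bytes left uncopied */
        iterate_iovec_model(iov, 2, n, v,
                (memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
                 (size_t)0));

        printf("copied %zu bytes: %.4s|%.6s\n", n, a, b);
        return 0;
}

Here STEP is a memcpy that always succeeds, so the model copies 4 bytes into the first segment and the remaining 6 into the second.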
iov = i->iov; - skip = i->iov_offset; - buf = iov->iov_base + skip; - copy = min(bytes, iov->iov_len - skip); - - left = __clear_user(buf, copy); - copy -= left; - skip += copy; - bytes -= copy; - - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __clear_user(buf, copy); - copy -= left; - skip = copy; - bytes -= copy; - } - - if (skip == iov->iov_len) { - iov++; - skip = 0; - } - i->count -= wanted - bytes; - i->nr_segs -= iov - i->iov; - i->iov = iov; - i->iov_offset = skip; - return wanted - bytes; -} - -static size_t __iovec_copy_from_user_inatomic(char *vaddr, - const struct iovec *iov, size_t base, size_t bytes) -{ - size_t copied = 0, left = 0; - - while (bytes) { - char __user *buf = iov->iov_base + base; - int copy = min(bytes, iov->iov_len - base); - - base = 0; - left = __copy_from_user_inatomic(vaddr, buf, copy); - copied += copy; - bytes -= copy; - vaddr += copy; - iov++; - - if (unlikely(left)) - break; - } - return copied - left; -} - -/* - * Copy as much as we can into the page and return the number of bytes which - * were successfully copied. If a fault is encountered then return the number of - * bytes which were copied. - */ -static size_t copy_from_user_atomic_iovec(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t copied; - - kaddr = kmap_atomic(page); - if (likely(i->nr_segs == 1)) { - int left; - char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); - copied = bytes - left; - } else { - copied = __iovec_copy_from_user_inatomic(kaddr + offset, - i->iov, i->iov_offset, bytes); - } - kunmap_atomic(kaddr); - - return copied; -} - -static void advance_iovec(struct iov_iter *i, size_t bytes) -{ - BUG_ON(i->count < bytes); - - if (likely(i->nr_segs == 1)) { - i->iov_offset += bytes; - i->count -= bytes; - } else { - const struct iovec *iov = i->iov; - size_t base = i->iov_offset; - unsigned long nr_segs = i->nr_segs; - - /* - * The !iov->iov_len check ensures we skip over unlikely - * zero-length segments (without overruning the iovec). - */ - while (bytes || unlikely(i->count && !iov->iov_len)) { - int copy; - - copy = min(bytes, iov->iov_len - base); - BUG_ON(!i->count || i->count < copy); - i->count -= copy; - bytes -= copy; - base += copy; - if (iov->iov_len == base) { - iov++; - nr_segs--; - base = 0; - } - } - i->iov = iov; - i->iov_offset = base; - i->nr_segs = nr_segs; - } -} - /* * Fault in the first iovec of the given iov_iter, to a maximum length * of bytes. 
Returns 0 on success, or non-zero if the memory could not be @@ -395,7 +308,7 @@ static void advance_iovec(struct iov_iter *i, size_t bytes) */ int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) { - if (!(i->type & ITER_BVEC)) { + if (!(i->type & (ITER_BVEC|ITER_KVEC))) { char __user *buf = i->iov->iov_base + i->iov_offset; bytes = min(bytes, i->iov->iov_len - i->iov_offset); return fault_in_pages_readable(buf, bytes); @@ -404,136 +317,25 @@ int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) } EXPORT_SYMBOL(iov_iter_fault_in_readable); -static unsigned long alignment_iovec(const struct iov_iter *i) -{ - const struct iovec *iov = i->iov; - unsigned long res; - size_t size = i->count; - size_t n; - - if (!size) - return 0; - - res = (unsigned long)iov->iov_base + i->iov_offset; - n = iov->iov_len - i->iov_offset; - if (n >= size) - return res | size; - size -= n; - res |= n; - while (size > (++iov)->iov_len) { - res |= (unsigned long)iov->iov_base | iov->iov_len; - size -= iov->iov_len; - } - res |= (unsigned long)iov->iov_base | size; - return res; -} - void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, unsigned long nr_segs, size_t count) { /* It will get better. Eventually... */ - if (segment_eq(get_fs(), KERNEL_DS)) + if (segment_eq(get_fs(), KERNEL_DS)) { direction |= ITER_KVEC; - i->type = direction; - i->iov = iov; + i->type = direction; + i->kvec = (struct kvec *)iov; + } else { + i->type = direction; + i->iov = iov; + } i->nr_segs = nr_segs; i->iov_offset = 0; i->count = count; } EXPORT_SYMBOL(iov_iter_init); -static ssize_t get_pages_iovec(struct iov_iter *i, - struct page **pages, size_t maxsize, unsigned maxpages, - size_t *start) -{ - size_t offset = i->iov_offset; - const struct iovec *iov = i->iov; - size_t len; - unsigned long addr; - int n; - int res; - - len = iov->iov_len - offset; - if (len > i->count) - len = i->count; - if (len > maxsize) - len = maxsize; - addr = (unsigned long)iov->iov_base + offset; - len += *start = addr & (PAGE_SIZE - 1); - if (len > maxpages * PAGE_SIZE) - len = maxpages * PAGE_SIZE; - addr &= ~(PAGE_SIZE - 1); - n = (len + PAGE_SIZE - 1) / PAGE_SIZE; - res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages); - if (unlikely(res < 0)) - return res; - return (res == n ? len : res * PAGE_SIZE) - *start; -} - -static ssize_t get_pages_alloc_iovec(struct iov_iter *i, - struct page ***pages, size_t maxsize, - size_t *start) -{ - size_t offset = i->iov_offset; - const struct iovec *iov = i->iov; - size_t len; - unsigned long addr; - void *p; - int n; - int res; - - len = iov->iov_len - offset; - if (len > i->count) - len = i->count; - if (len > maxsize) - len = maxsize; - addr = (unsigned long)iov->iov_base + offset; - len += *start = addr & (PAGE_SIZE - 1); - addr &= ~(PAGE_SIZE - 1); - n = (len + PAGE_SIZE - 1) / PAGE_SIZE; - - p = kmalloc(n * sizeof(struct page *), GFP_KERNEL); - if (!p) - p = vmalloc(n * sizeof(struct page *)); - if (!p) - return -ENOMEM; - - res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p); - if (unlikely(res < 0)) { - kvfree(p); - return res; - } - *pages = p; - return (res == n ? 
len : res * PAGE_SIZE) - *start; -} - -static int iov_iter_npages_iovec(const struct iov_iter *i, int maxpages) -{ - size_t offset = i->iov_offset; - size_t size = i->count; - const struct iovec *iov = i->iov; - int npages = 0; - int n; - - for (n = 0; size && n < i->nr_segs; n++, iov++) { - unsigned long addr = (unsigned long)iov->iov_base + offset; - size_t len = iov->iov_len - offset; - offset = 0; - if (unlikely(!len)) /* empty segment */ - continue; - if (len > size) - len = size; - npages += (addr + len + PAGE_SIZE - 1) / PAGE_SIZE - - addr / PAGE_SIZE; - if (npages >= maxpages) /* don't bother going further */ - return maxpages; - size -= len; - offset = 0; - } - return min(npages, maxpages); -} - static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) { char *from = kmap_atomic(page); @@ -555,293 +357,78 @@ static void memzero_page(struct page *page, size_t offset, size_t len) kunmap_atomic(addr); } -static size_t copy_to_iter_bvec(void *from, size_t bytes, struct iov_iter *i) +size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i) { - size_t skip, copy, wanted; - const struct bio_vec *bvec; - + char *from = addr; if (unlikely(bytes > i->count)) bytes = i->count; if (unlikely(!bytes)) return 0; - wanted = bytes; - bvec = i->bvec; - skip = i->iov_offset; - copy = min_t(size_t, bytes, bvec->bv_len - skip); + iterate_and_advance(i, bytes, v, + __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len, + v.iov_len), + memcpy_to_page(v.bv_page, v.bv_offset, + (from += v.bv_len) - v.bv_len, v.bv_len), + memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len) + ) - memcpy_to_page(bvec->bv_page, skip + bvec->bv_offset, from, copy); - skip += copy; - from += copy; - bytes -= copy; - while (bytes) { - bvec++; - copy = min(bytes, (size_t)bvec->bv_len); - memcpy_to_page(bvec->bv_page, bvec->bv_offset, from, copy); - skip = copy; - from += copy; - bytes -= copy; - } - if (skip == bvec->bv_len) { - bvec++; - skip = 0; - } - i->count -= wanted - bytes; - i->nr_segs -= bvec - i->bvec; - i->bvec = bvec; - i->iov_offset = skip; - return wanted - bytes; + return bytes; } +EXPORT_SYMBOL(copy_to_iter); -static size_t copy_from_iter_bvec(void *to, size_t bytes, struct iov_iter *i) +size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { - size_t skip, copy, wanted; - const struct bio_vec *bvec; - + char *to = addr; if (unlikely(bytes > i->count)) bytes = i->count; if (unlikely(!bytes)) return 0; - wanted = bytes; - bvec = i->bvec; - skip = i->iov_offset; - - copy = min(bytes, bvec->bv_len - skip); - - memcpy_from_page(to, bvec->bv_page, bvec->bv_offset + skip, copy); - - to += copy; - skip += copy; - bytes -= copy; - - while (bytes) { - bvec++; - copy = min(bytes, (size_t)bvec->bv_len); - memcpy_from_page(to, bvec->bv_page, bvec->bv_offset, copy); - skip = copy; - to += copy; - bytes -= copy; - } - if (skip == bvec->bv_len) { - bvec++; - skip = 0; - } - i->count -= wanted; - i->nr_segs -= bvec - i->bvec; - i->bvec = bvec; - i->iov_offset = skip; - return wanted; -} - -static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, - size_t bytes, struct iov_iter *i) -{ - void *kaddr = kmap_atomic(page); - size_t wanted = copy_to_iter_bvec(kaddr + offset, bytes, i); - kunmap_atomic(kaddr); - return wanted; -} + iterate_and_advance(i, bytes, v, + __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base, + v.iov_len), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((to += v.iov_len) - 
v.iov_len, v.iov_base, v.iov_len) + ) -static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, - size_t bytes, struct iov_iter *i) -{ - void *kaddr = kmap_atomic(page); - size_t wanted = copy_from_iter_bvec(kaddr + offset, bytes, i); - kunmap_atomic(kaddr); - return wanted; + return bytes; } +EXPORT_SYMBOL(copy_from_iter); -static size_t zero_bvec(size_t bytes, struct iov_iter *i) +size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { - size_t skip, copy, wanted; - const struct bio_vec *bvec; - + char *to = addr; if (unlikely(bytes > i->count)) bytes = i->count; if (unlikely(!bytes)) return 0; - wanted = bytes; - bvec = i->bvec; - skip = i->iov_offset; - copy = min_t(size_t, bytes, bvec->bv_len - skip); + iterate_and_advance(i, bytes, v, + __copy_from_user_nocache((to += v.iov_len) - v.iov_len, + v.iov_base, v.iov_len), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) - memzero_page(bvec->bv_page, skip + bvec->bv_offset, copy); - skip += copy; - bytes -= copy; - while (bytes) { - bvec++; - copy = min(bytes, (size_t)bvec->bv_len); - memzero_page(bvec->bv_page, bvec->bv_offset, copy); - skip = copy; - bytes -= copy; - } - if (skip == bvec->bv_len) { - bvec++; - skip = 0; - } - i->count -= wanted - bytes; - i->nr_segs -= bvec - i->bvec; - i->bvec = bvec; - i->iov_offset = skip; - return wanted - bytes; -} - -static size_t copy_from_user_bvec(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t left; - const struct bio_vec *bvec; - size_t base = i->iov_offset; - - kaddr = kmap_atomic(page); - for (left = bytes, bvec = i->bvec; left; bvec++, base = 0) { - size_t copy = min(left, bvec->bv_len - base); - if (!bvec->bv_len) - continue; - memcpy_from_page(kaddr + offset, bvec->bv_page, - bvec->bv_offset + base, copy); - offset += copy; - left -= copy; - } - kunmap_atomic(kaddr); return bytes; } - -static void advance_bvec(struct iov_iter *i, size_t bytes) -{ - BUG_ON(i->count < bytes); - - if (likely(i->nr_segs == 1)) { - i->iov_offset += bytes; - i->count -= bytes; - } else { - const struct bio_vec *bvec = i->bvec; - size_t base = i->iov_offset; - unsigned long nr_segs = i->nr_segs; - - /* - * The !iov->iov_len check ensures we skip over unlikely - * zero-length segments (without overruning the iovec). 
- */ - while (bytes || unlikely(i->count && !bvec->bv_len)) { - int copy; - - copy = min(bytes, bvec->bv_len - base); - BUG_ON(!i->count || i->count < copy); - i->count -= copy; - bytes -= copy; - base += copy; - if (bvec->bv_len == base) { - bvec++; - nr_segs--; - base = 0; - } - } - i->bvec = bvec; - i->iov_offset = base; - i->nr_segs = nr_segs; - } -} - -static unsigned long alignment_bvec(const struct iov_iter *i) -{ - const struct bio_vec *bvec = i->bvec; - unsigned long res; - size_t size = i->count; - size_t n; - - if (!size) - return 0; - - res = bvec->bv_offset + i->iov_offset; - n = bvec->bv_len - i->iov_offset; - if (n >= size) - return res | size; - size -= n; - res |= n; - while (size > (++bvec)->bv_len) { - res |= bvec->bv_offset | bvec->bv_len; - size -= bvec->bv_len; - } - res |= bvec->bv_offset | size; - return res; -} - -static ssize_t get_pages_bvec(struct iov_iter *i, - struct page **pages, size_t maxsize, unsigned maxpages, - size_t *start) -{ - const struct bio_vec *bvec = i->bvec; - size_t len = bvec->bv_len - i->iov_offset; - if (len > i->count) - len = i->count; - if (len > maxsize) - len = maxsize; - /* can't be more than PAGE_SIZE */ - *start = bvec->bv_offset + i->iov_offset; - - get_page(*pages = bvec->bv_page); - - return len; -} - -static ssize_t get_pages_alloc_bvec(struct iov_iter *i, - struct page ***pages, size_t maxsize, - size_t *start) -{ - const struct bio_vec *bvec = i->bvec; - size_t len = bvec->bv_len - i->iov_offset; - if (len > i->count) - len = i->count; - if (len > maxsize) - len = maxsize; - *start = bvec->bv_offset + i->iov_offset; - - *pages = kmalloc(sizeof(struct page *), GFP_KERNEL); - if (!*pages) - return -ENOMEM; - - get_page(**pages = bvec->bv_page); - - return len; -} - -static int iov_iter_npages_bvec(const struct iov_iter *i, int maxpages) -{ - size_t offset = i->iov_offset; - size_t size = i->count; - const struct bio_vec *bvec = i->bvec; - int npages = 0; - int n; - - for (n = 0; size && n < i->nr_segs; n++, bvec++) { - size_t len = bvec->bv_len - offset; - offset = 0; - if (unlikely(!len)) /* empty segment */ - continue; - if (len > size) - len = size; - npages++; - if (npages >= maxpages) /* don't bother going further */ - return maxpages; - size -= len; - offset = 0; - } - return min(npages, maxpages); -} +EXPORT_SYMBOL(copy_from_iter_nocache); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { - if (i->type & ITER_BVEC) - return copy_page_to_iter_bvec(page, offset, bytes, i); - else + if (i->type & (ITER_BVEC|ITER_KVEC)) { + void *kaddr = kmap_atomic(page); + size_t wanted = copy_to_iter(kaddr + offset, bytes, i); + kunmap_atomic(kaddr); + return wanted; + } else return copy_page_to_iter_iovec(page, offset, bytes, i); } EXPORT_SYMBOL(copy_page_to_iter); @@ -849,57 +436,53 @@ EXPORT_SYMBOL(copy_page_to_iter); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { - if (i->type & ITER_BVEC) - return copy_page_from_iter_bvec(page, offset, bytes, i); - else + if (i->type & (ITER_BVEC|ITER_KVEC)) { + void *kaddr = kmap_atomic(page); + size_t wanted = copy_from_iter(kaddr + offset, bytes, i); + kunmap_atomic(kaddr); + return wanted; + } else return copy_page_from_iter_iovec(page, offset, bytes, i); } EXPORT_SYMBOL(copy_page_from_iter); -size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i) +size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { - if (i->type & ITER_BVEC) - return copy_to_iter_bvec(addr, bytes, i); - else - return 
copy_to_iter_iovec(addr, bytes, i); -} -EXPORT_SYMBOL(copy_to_iter); + if (unlikely(bytes > i->count)) + bytes = i->count; -size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) -{ - if (i->type & ITER_BVEC) - return copy_from_iter_bvec(addr, bytes, i); - else - return copy_from_iter_iovec(addr, bytes, i); -} -EXPORT_SYMBOL(copy_from_iter); + if (unlikely(!bytes)) + return 0; -size_t iov_iter_zero(size_t bytes, struct iov_iter *i) -{ - if (i->type & ITER_BVEC) { - return zero_bvec(bytes, i); - } else { - return zero_iovec(bytes, i); - } + iterate_and_advance(i, bytes, v, + __clear_user(v.iov_base, v.iov_len), + memzero_page(v.bv_page, v.bv_offset, v.bv_len), + memset(v.iov_base, 0, v.iov_len) + ) + + return bytes; } EXPORT_SYMBOL(iov_iter_zero); size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes) { - if (i->type & ITER_BVEC) - return copy_from_user_bvec(page, i, offset, bytes); - else - return copy_from_user_atomic_iovec(page, i, offset, bytes); + char *kaddr = kmap_atomic(page), *p = kaddr + offset; + iterate_all_kinds(i, bytes, v, + __copy_from_user_inatomic((p += v.iov_len) - v.iov_len, + v.iov_base, v.iov_len), + memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) + kunmap_atomic(kaddr); + return bytes; } EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); void iov_iter_advance(struct iov_iter *i, size_t size) { - if (i->type & ITER_BVEC) - advance_bvec(i, size); - else - advance_iovec(i, size); + iterate_and_advance(i, size, v, 0, 0, 0) } EXPORT_SYMBOL(iov_iter_advance); @@ -911,18 +494,39 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) if (i->nr_segs == 1) return i->count; else if (i->type & ITER_BVEC) - return min(i->count, i->iov->iov_len - i->iov_offset); - else return min(i->count, i->bvec->bv_len - i->iov_offset); + else + return min(i->count, i->iov->iov_len - i->iov_offset); } EXPORT_SYMBOL(iov_iter_single_seg_count); +void iov_iter_kvec(struct iov_iter *i, int direction, + const struct kvec *iov, unsigned long nr_segs, + size_t count) +{ + BUG_ON(!(direction & ITER_KVEC)); + i->type = direction; + i->kvec = (struct kvec *)iov; + i->nr_segs = nr_segs; + i->iov_offset = 0; + i->count = count; +} +EXPORT_SYMBOL(iov_iter_kvec); + unsigned long iov_iter_alignment(const struct iov_iter *i) { - if (i->type & ITER_BVEC) - return alignment_bvec(i); - else - return alignment_iovec(i); + unsigned long res = 0; + size_t size = i->count; + + if (!size) + return 0; + + iterate_all_kinds(i, size, v, + (res |= (unsigned long)v.iov_base | v.iov_len, 0), + res |= v.bv_offset | v.bv_len, + res |= (unsigned long)v.iov_base | v.iov_len + ) + return res; } EXPORT_SYMBOL(iov_iter_alignment); @@ -930,29 +534,207 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) { - if (i->type & ITER_BVEC) - return get_pages_bvec(i, pages, maxsize, maxpages, start); - else - return get_pages_iovec(i, pages, maxsize, maxpages, start); + if (maxsize > i->count) + maxsize = i->count; + + if (!maxsize) + return 0; + + iterate_all_kinds(i, maxsize, v, ({ + unsigned long addr = (unsigned long)v.iov_base; + size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); + int n; + int res; + + if (len > maxpages * PAGE_SIZE) + len = maxpages * PAGE_SIZE; + addr &= ~(PAGE_SIZE - 1); + n = DIV_ROUND_UP(len, PAGE_SIZE); + res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, 
pages); + if (unlikely(res < 0)) + return res; + return (res == n ? len : res * PAGE_SIZE) - *start; + 0;}),({ + /* can't be more than PAGE_SIZE */ + *start = v.bv_offset; + get_page(*pages = v.bv_page); + return v.bv_len; + }),({ + return -EFAULT; + }) + ) + return 0; } EXPORT_SYMBOL(iov_iter_get_pages); +static struct page **get_pages_array(size_t n) +{ + struct page **p = kmalloc(n * sizeof(struct page *), GFP_KERNEL); + if (!p) + p = vmalloc(n * sizeof(struct page *)); + return p; +} + ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start) { - if (i->type & ITER_BVEC) - return get_pages_alloc_bvec(i, pages, maxsize, start); - else - return get_pages_alloc_iovec(i, pages, maxsize, start); + struct page **p; + + if (maxsize > i->count) + maxsize = i->count; + + if (!maxsize) + return 0; + + iterate_all_kinds(i, maxsize, v, ({ + unsigned long addr = (unsigned long)v.iov_base; + size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); + int n; + int res; + + addr &= ~(PAGE_SIZE - 1); + n = DIV_ROUND_UP(len, PAGE_SIZE); + p = get_pages_array(n); + if (!p) + return -ENOMEM; + res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p); + if (unlikely(res < 0)) { + kvfree(p); + return res; + } + *pages = p; + return (res == n ? len : res * PAGE_SIZE) - *start; + 0;}),({ + /* can't be more than PAGE_SIZE */ + *start = v.bv_offset; + *pages = p = get_pages_array(1); + if (!p) + return -ENOMEM; + get_page(*p = v.bv_page); + return v.bv_len; + }),({ + return -EFAULT; + }) + ) + return 0; } EXPORT_SYMBOL(iov_iter_get_pages_alloc); +size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, + struct iov_iter *i) +{ + char *to = addr; + __wsum sum, next; + size_t off = 0; + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + sum = *csum; + iterate_and_advance(i, bytes, v, ({ + int err = 0; + next = csum_and_copy_from_user(v.iov_base, + (to += v.iov_len) - v.iov_len, + v.iov_len, 0, &err); + if (!err) { + sum = csum_block_add(sum, next, off); + off += v.iov_len; + } + err ? v.iov_len : 0; + }), ({ + char *p = kmap_atomic(v.bv_page); + next = csum_partial_copy_nocheck(p + v.bv_offset, + (to += v.bv_len) - v.bv_len, + v.bv_len, 0); + kunmap_atomic(p); + sum = csum_block_add(sum, next, off); + off += v.bv_len; + }),({ + next = csum_partial_copy_nocheck(v.iov_base, + (to += v.iov_len) - v.iov_len, + v.iov_len, 0); + sum = csum_block_add(sum, next, off); + off += v.iov_len; + }) + ) + *csum = sum; + return bytes; +} +EXPORT_SYMBOL(csum_and_copy_from_iter); + +size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum, + struct iov_iter *i) +{ + char *from = addr; + __wsum sum, next; + size_t off = 0; + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + sum = *csum; + iterate_and_advance(i, bytes, v, ({ + int err = 0; + next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len, + v.iov_base, + v.iov_len, 0, &err); + if (!err) { + sum = csum_block_add(sum, next, off); + off += v.iov_len; + } + err ? 
v.iov_len : 0; + }), ({ + char *p = kmap_atomic(v.bv_page); + next = csum_partial_copy_nocheck((from += v.bv_len) - v.bv_len, + p + v.bv_offset, + v.bv_len, 0); + kunmap_atomic(p); + sum = csum_block_add(sum, next, off); + off += v.bv_len; + }),({ + next = csum_partial_copy_nocheck((from += v.iov_len) - v.iov_len, + v.iov_base, + v.iov_len, 0); + sum = csum_block_add(sum, next, off); + off += v.iov_len; + }) + ) + *csum = sum; + return bytes; +} +EXPORT_SYMBOL(csum_and_copy_to_iter); + int iov_iter_npages(const struct iov_iter *i, int maxpages) { - if (i->type & ITER_BVEC) - return iov_iter_npages_bvec(i, maxpages); - else - return iov_iter_npages_iovec(i, maxpages); + size_t size = i->count; + int npages = 0; + + if (!size) + return 0; + + iterate_all_kinds(i, size, v, ({ + unsigned long p = (unsigned long)v.iov_base; + npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) + - p / PAGE_SIZE; + if (npages >= maxpages) + return maxpages; + 0;}),({ + npages++; + if (npages >= maxpages) + return maxpages; + }),({ + unsigned long p = (unsigned long)v.iov_base; + npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) + - p / PAGE_SIZE; + if (npages >= maxpages) + return maxpages; + }) + ) + return npages; } EXPORT_SYMBOL(iov_iter_npages); @@ -892,7 +892,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * this assure us that no O_DIRECT can happen after the check * or in the middle of the check. */ - entry = ptep_clear_flush(vma, addr, ptep); + entry = ptep_clear_flush_notify(vma, addr, ptep); /* * Check that no O_DIRECT or similar I/O is in progress on the * page @@ -960,7 +960,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, page_add_anon_rmap(kpage, vma, addr); flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush(vma, addr, ptep); + ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); page_remove_rmap(page); diff --git a/mm/madvise.c b/mm/madvise.c index 0938b30da4ab..a271adc93289 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -326,7 +326,7 @@ static long madvise_remove(struct vm_area_struct *vma, */ get_file(f); up_read(¤t->mm->mmap_sem); - error = do_fallocate(f, + error = vfs_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - start); fput(f); diff --git a/mm/memblock.c b/mm/memblock.c index 6ecb0d937fb5..252b77bdf65e 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -715,16 +715,13 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) } /** - * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. - * @base: the base phys addr of the region - * @size: the size of the region * - * This function isolates region [@base, @base + @size), and mark it with flag - * MEMBLOCK_HOTPLUG. + * This function isolates region [@base, @base + @size), and sets/clears flag * * Return 0 on succees, -errno on failure. 
*/ -int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) +static int __init_memblock memblock_setclr_flag(phys_addr_t base, + phys_addr_t size, int set, int flag) { struct memblock_type *type = &memblock.memory; int i, ret, start_rgn, end_rgn; @@ -734,37 +731,37 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) return ret; for (i = start_rgn; i < end_rgn; i++) - memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG); + if (set) + memblock_set_region_flags(&type->regions[i], flag); + else + memblock_clear_region_flags(&type->regions[i], flag); memblock_merge_regions(type); return 0; } /** - * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. + * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. * @base: the base phys addr of the region * @size: the size of the region * - * This function isolates region [@base, @base + @size), and clear flag - * MEMBLOCK_HOTPLUG for the isolated regions. + * Return 0 on succees, -errno on failure. + */ +int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG); +} + +/** + * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. + * @base: the base phys addr of the region + * @size: the size of the region * * Return 0 on succees, -errno on failure. */ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) { - struct memblock_type *type = &memblock.memory; - int i, ret, start_rgn, end_rgn; - - ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); - if (ret) - return ret; - - for (i = start_rgn; i < end_rgn; i++) - memblock_clear_region_flags(&type->regions[i], - MEMBLOCK_HOTPLUG); - - memblock_merge_regions(type); - return 0; + return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG); } /** diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d6ac0e33e150..ef91e856c7e4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -25,7 +25,7 @@ * GNU General Public License for more details. */ -#include <linux/res_counter.h> +#include <linux/page_counter.h> #include <linux/memcontrol.h> #include <linux/cgroup.h> #include <linux/mm.h> @@ -51,7 +51,7 @@ #include <linux/seq_file.h> #include <linux/vmpressure.h> #include <linux/mm_inline.h> -#include <linux/page_cgroup.h> +#include <linux/swap_cgroup.h> #include <linux/cpu.h> #include <linux/oom.h> #include <linux/lockdep.h> @@ -143,14 +143,8 @@ struct mem_cgroup_stat_cpu { unsigned long targets[MEM_CGROUP_NTARGETS]; }; -struct mem_cgroup_reclaim_iter { - /* - * last scanned hierarchy member. Valid only if last_dead_count - * matches memcg->dead_count of the hierarchy root group. 
- */ - struct mem_cgroup *last_visited; - int last_dead_count; - +struct reclaim_iter { + struct mem_cgroup *position; /* scan generation, increased every round-trip */ unsigned int generation; }; @@ -162,10 +156,10 @@ struct mem_cgroup_per_zone { struct lruvec lruvec; unsigned long lru_size[NR_LRU_LISTS]; - struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; + struct reclaim_iter iter[DEF_PRIORITY + 1]; struct rb_node tree_node; /* RB tree node */ - unsigned long long usage_in_excess;/* Set to the value by which */ + unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; struct mem_cgroup *memcg; /* Back pointer, we cannot */ @@ -198,7 +192,7 @@ static struct mem_cgroup_tree soft_limit_tree __read_mostly; struct mem_cgroup_threshold { struct eventfd_ctx *eventfd; - u64 threshold; + unsigned long threshold; }; /* For threshold */ @@ -284,10 +278,13 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); */ struct mem_cgroup { struct cgroup_subsys_state css; - /* - * the counter to account for memory usage - */ - struct res_counter res; + + /* Accounted resources */ + struct page_counter memory; + struct page_counter memsw; + struct page_counter kmem; + + unsigned long soft_limit; /* vmpressure notifications */ struct vmpressure vmpressure; @@ -296,19 +293,9 @@ struct mem_cgroup { int initialized; /* - * the counter to account for mem+swap usage. - */ - struct res_counter memsw; - - /* - * the counter to account for kernel memory usage. - */ - struct res_counter kmem; - /* * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; - unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ bool oom_lock; atomic_t under_oom; @@ -352,7 +339,6 @@ struct mem_cgroup { struct mem_cgroup_stat_cpu nocpu_base; spinlock_t pcp_counter_lock; - atomic_t dead_count; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct cg_proto tcp_mem; #endif @@ -379,38 +365,10 @@ struct mem_cgroup { /* WARNING: nodeinfo must be the last member here */ }; -/* internal only representation about the status of kmem accounting. */ -enum { - KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ - KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ -}; - #ifdef CONFIG_MEMCG_KMEM -static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) -{ - set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); -} - static bool memcg_kmem_is_active(struct mem_cgroup *memcg) { - return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); -} - -static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) -{ - /* - * Our caller must use css_get() first, because memcg_uncharge_kmem() - * will call css_put() if it sees the memcg is dead. 
- */ - smp_wmb(); - if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) - set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); -} - -static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) -{ - return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, - &memcg->kmem_account_flags); + return memcg->kmemcg_id >= 0; } #endif @@ -650,7 +608,7 @@ static void disarm_kmem_keys(struct mem_cgroup *memcg) * This check can't live in kmem destruction function, * since the charges will outlive the cgroup */ - WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); + WARN_ON(page_counter_read(&memcg->kmem)); } #else static void disarm_kmem_keys(struct mem_cgroup *memcg) @@ -664,8 +622,6 @@ static void disarm_static_keys(struct mem_cgroup *memcg) disarm_kmem_keys(memcg); } -static void drain_all_stock_async(struct mem_cgroup *memcg); - static struct mem_cgroup_per_zone * mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) { @@ -706,7 +662,7 @@ soft_limit_tree_from_page(struct page *page) static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz, - unsigned long long new_usage_in_excess) + unsigned long new_usage_in_excess) { struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node *parent = NULL; @@ -755,10 +711,21 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, spin_unlock_irqrestore(&mctz->lock, flags); } +static unsigned long soft_limit_excess(struct mem_cgroup *memcg) +{ + unsigned long nr_pages = page_counter_read(&memcg->memory); + unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); + unsigned long excess = 0; + + if (nr_pages > soft_limit) + excess = nr_pages - soft_limit; + + return excess; +} static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) { - unsigned long long excess; + unsigned long excess; struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; @@ -769,7 +736,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) */ for (; memcg; memcg = parent_mem_cgroup(memcg)) { mz = mem_cgroup_page_zoneinfo(memcg, page); - excess = res_counter_soft_limit_excess(&memcg->res); + excess = soft_limit_excess(memcg); /* * We have to update the tree if mz is on RB-tree or * mem is over its softlimit. @@ -825,7 +792,7 @@ retry: * position in the tree. */ __mem_cgroup_remove_exceeded(mz, mctz); - if (!res_counter_soft_limit_excess(&mz->memcg->res) || + if (!soft_limit_excess(mz->memcg) || !css_tryget_online(&mz->memcg->css)) goto retry; done: @@ -1062,122 +1029,6 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } -/* - * Returns a next (in a pre-order walk) alive memcg (with elevated css - * ref. count) or NULL if the whole root's subtree has been visited. - * - * helper function to be used by mem_cgroup_iter - */ -static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, - struct mem_cgroup *last_visited) -{ - struct cgroup_subsys_state *prev_css, *next_css; - - prev_css = last_visited ? &last_visited->css : NULL; -skip_node: - next_css = css_next_descendant_pre(prev_css, &root->css); - - /* - * Even if we found a group we have to make sure it is - * alive. css && !memcg means that the groups should be - * skipped and we should continue the tree walk. - * last_visited css is safe to use because it is - * protected by css_get and the tree walk is rcu safe. 
- * - * We do not take a reference on the root of the tree walk - * because we might race with the root removal when it would - * be the only node in the iterated hierarchy and mem_cgroup_iter - * would end up in an endless loop because it expects that at - * least one valid node will be returned. Root cannot disappear - * because caller of the iterator should hold it already so - * skipping css reference should be safe. - */ - if (next_css) { - struct mem_cgroup *memcg = mem_cgroup_from_css(next_css); - - if (next_css == &root->css) - return memcg; - - if (css_tryget_online(next_css)) { - /* - * Make sure the memcg is initialized: - * mem_cgroup_css_online() orders the the - * initialization against setting the flag. - */ - if (smp_load_acquire(&memcg->initialized)) - return memcg; - css_put(next_css); - } - - prev_css = next_css; - goto skip_node; - } - - return NULL; -} - -static void mem_cgroup_iter_invalidate(struct mem_cgroup *root) -{ - /* - * When a group in the hierarchy below root is destroyed, the - * hierarchy iterator can no longer be trusted since it might - * have pointed to the destroyed group. Invalidate it. - */ - atomic_inc(&root->dead_count); -} - -static struct mem_cgroup * -mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, - struct mem_cgroup *root, - int *sequence) -{ - struct mem_cgroup *position = NULL; - /* - * A cgroup destruction happens in two stages: offlining and - * release. They are separated by a RCU grace period. - * - * If the iterator is valid, we may still race with an - * offlining. The RCU lock ensures the object won't be - * released, tryget will fail if we lost the race. - */ - *sequence = atomic_read(&root->dead_count); - if (iter->last_dead_count == *sequence) { - smp_rmb(); - position = iter->last_visited; - - /* - * We cannot take a reference to root because we might race - * with root removal and returning NULL would end up in - * an endless loop on the iterator user level when root - * would be returned all the time. - */ - if (position && position != root && - !css_tryget_online(&position->css)) - position = NULL; - } - return position; -} - -static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, - struct mem_cgroup *last_visited, - struct mem_cgroup *new_position, - struct mem_cgroup *root, - int sequence) -{ - /* root reference counting symmetric to mem_cgroup_iter_load */ - if (last_visited && last_visited != root) - css_put(&last_visited->css); - /* - * We store the sequence count from the time @last_visited was - * loaded successfully instead of rereading it here so that we - * don't lose destruction events in between. We could have - * raced with the destruction of @new_position after all. 
- */ - iter->last_visited = new_position; - smp_wmb(); - iter->last_dead_count = sequence; -} - /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -1199,8 +1050,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, struct mem_cgroup_reclaim_cookie *reclaim) { + struct reclaim_iter *uninitialized_var(iter); + struct cgroup_subsys_state *css = NULL; struct mem_cgroup *memcg = NULL; - struct mem_cgroup *last_visited = NULL; + struct mem_cgroup *pos = NULL; if (mem_cgroup_disabled()) return NULL; @@ -1209,50 +1062,101 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, root = root_mem_cgroup; if (prev && !reclaim) - last_visited = prev; + pos = prev; if (!root->use_hierarchy && root != root_mem_cgroup) { if (prev) - goto out_css_put; + goto out; return root; } rcu_read_lock(); - while (!memcg) { - struct mem_cgroup_reclaim_iter *uninitialized_var(iter); - int uninitialized_var(seq); - - if (reclaim) { - struct mem_cgroup_per_zone *mz; - - mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); - iter = &mz->reclaim_iter[reclaim->priority]; - if (prev && reclaim->generation != iter->generation) { - iter->last_visited = NULL; - goto out_unlock; - } - last_visited = mem_cgroup_iter_load(iter, root, &seq); + if (reclaim) { + struct mem_cgroup_per_zone *mz; + + mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); + iter = &mz->iter[reclaim->priority]; + + if (prev && reclaim->generation != iter->generation) + goto out_unlock; + + do { + pos = ACCESS_ONCE(iter->position); + /* + * A racing update may change the position and + * put the last reference, hence css_tryget(), + * or retry to see the updated position. + */ + } while (pos && !css_tryget(&pos->css)); + } + + if (pos) + css = &pos->css; + + for (;;) { + css = css_next_descendant_pre(css, &root->css); + if (!css) { + /* + * Reclaimers share the hierarchy walk, and a + * new one might jump in right at the end of + * the hierarchy - make sure they see at least + * one group and restart from the beginning. + */ + if (!prev) + continue; + break; } - memcg = __mem_cgroup_iter_next(root, last_visited); + /* + * Verify the css and acquire a reference. The root + * is provided by the caller, so we know it's alive + * and kicking, and don't take an extra reference. + */ + memcg = mem_cgroup_from_css(css); + + if (css == &root->css) + break; + + if (css_tryget(css)) { + /* + * Make sure the memcg is initialized: + * mem_cgroup_css_online() orders the the + * initialization against setting the flag. + */ + if (smp_load_acquire(&memcg->initialized)) + break; + + css_put(css); + } - if (reclaim) { - mem_cgroup_iter_update(iter, last_visited, memcg, root, - seq); + memcg = NULL; + } - if (!memcg) - iter->generation++; - else if (!prev && memcg) - reclaim->generation = iter->generation; + if (reclaim) { + if (cmpxchg(&iter->position, pos, memcg) == pos) { + if (memcg) + css_get(&memcg->css); + if (pos) + css_put(&pos->css); } - if (prev && !memcg) - goto out_unlock; + /* + * pairs with css_tryget when dereferencing iter->position + * above. 
+ */ + if (pos) + css_put(&pos->css); + + if (!memcg) + iter->generation++; + else if (!prev) + reclaim->generation = iter->generation; } + out_unlock: rcu_read_unlock(); -out_css_put: +out: if (prev && prev != root) css_put(&prev->css); @@ -1346,15 +1250,18 @@ out: } /** - * mem_cgroup_page_lruvec - return lruvec for adding an lru page + * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * @page: the page * @zone: zone of the page + * + * This function is only safe when following the LRU page isolation + * and putback protocol: the LRU lock must be held, and the page must + * either be PageLRU() or the caller must have isolated/allocated it. */ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) { struct mem_cgroup_per_zone *mz; struct mem_cgroup *memcg; - struct page_cgroup *pc; struct lruvec *lruvec; if (mem_cgroup_disabled()) { @@ -1362,20 +1269,13 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) goto out; } - pc = lookup_page_cgroup(page); - memcg = pc->mem_cgroup; - + memcg = page->mem_cgroup; /* - * Surreptitiously switch any uncharged offlist page to root: - * an uncharged page off lru does nothing to secure - * its former mem_cgroup from sudden removal. - * - * Our caller holds lru_lock, and PageCgroupUsed is updated - * under page_cgroup lock: between them, they make all uses - * of pc->mem_cgroup safe. + * Swapcache readahead pages are added to the LRU - and + * possibly migrated - before they are charged. */ - if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) - pc->mem_cgroup = memcg = root_mem_cgroup; + if (!memcg) + memcg = root_mem_cgroup; mz = mem_cgroup_page_zoneinfo(memcg, page); lruvec = &mz->lruvec; @@ -1414,41 +1314,24 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, VM_BUG_ON((long)(*lru_size) < 0); } -/* - * Checks whether given mem is same or in the root_mem_cgroup's - * hierarchy subtree - */ -bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, - struct mem_cgroup *memcg) +bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) { - if (root_memcg == memcg) + if (root == memcg) return true; - if (!root_memcg->use_hierarchy || !memcg) + if (!root->use_hierarchy) return false; - return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); -} - -static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, - struct mem_cgroup *memcg) -{ - bool ret; - - rcu_read_lock(); - ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); - rcu_read_unlock(); - return ret; + return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); } -bool task_in_mem_cgroup(struct task_struct *task, - const struct mem_cgroup *memcg) +bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) { - struct mem_cgroup *curr = NULL; + struct mem_cgroup *task_memcg; struct task_struct *p; bool ret; p = find_lock_task_mm(task); if (p) { - curr = get_mem_cgroup_from_mm(p->mm); + task_memcg = get_mem_cgroup_from_mm(p->mm); task_unlock(p); } else { /* @@ -1457,19 +1340,12 @@ bool task_in_mem_cgroup(struct task_struct *task, * killed to prevent needlessly killing additional tasks. */ rcu_read_lock(); - curr = mem_cgroup_from_task(task); - if (curr) - css_get(&curr->css); + task_memcg = mem_cgroup_from_task(task); + css_get(&task_memcg->css); rcu_read_unlock(); } - /* - * We should check use_hierarchy of "memcg" not "curr". 
Because checking - * use_hierarchy of "curr" here make this function true if hierarchy is - * enabled in "curr" and "curr" is a child of "memcg" in *cgroup* - * hierarchy(even if use_hierarchy is disabled in "memcg"). - */ - ret = mem_cgroup_same_or_subtree(memcg, curr); - css_put(&curr->css); + ret = mem_cgroup_is_descendant(task_memcg, memcg); + css_put(&task_memcg->css); return ret; } @@ -1492,7 +1368,7 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return inactive * inactive_ratio < active; } -#define mem_cgroup_from_res_counter(counter, member) \ +#define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) /** @@ -1504,12 +1380,23 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) */ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) { - unsigned long long margin; + unsigned long margin = 0; + unsigned long count; + unsigned long limit; - margin = res_counter_margin(&memcg->res); - if (do_swap_account) - margin = min(margin, res_counter_margin(&memcg->memsw)); - return margin >> PAGE_SHIFT; + count = page_counter_read(&memcg->memory); + limit = ACCESS_ONCE(memcg->memory.limit); + if (count < limit) + margin = limit - count; + + if (do_swap_account) { + count = page_counter_read(&memcg->memsw); + limit = ACCESS_ONCE(memcg->memsw.limit); + if (count <= limit) + margin = min(margin, limit - count); + } + + return margin; } int mem_cgroup_swappiness(struct mem_cgroup *memcg) @@ -1522,37 +1409,6 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) } /* - * memcg->moving_account is used for checking possibility that some thread is - * calling move_account(). When a thread on CPU-A starts moving pages under - * a memcg, other threads should check memcg->moving_account under - * rcu_read_lock(), like this: - * - * CPU-A CPU-B - * rcu_read_lock() - * memcg->moving_account+1 if (memcg->mocing_account) - * take heavy locks. - * synchronize_rcu() update something. - * rcu_read_unlock() - * start move here. - */ - -static void mem_cgroup_start_move(struct mem_cgroup *memcg) -{ - atomic_inc(&memcg->moving_account); - synchronize_rcu(); -} - -static void mem_cgroup_end_move(struct mem_cgroup *memcg) -{ - /* - * Now, mem_cgroup_clear_mc() may call this function with NULL. - * We check NULL in callee rather than caller. - */ - if (memcg) - atomic_dec(&memcg->moving_account); -} - -/* * A routine for checking "mem" is under move_account() or not. * * Checking a cgroup is mc.from or mc.to or under hierarchy of @@ -1574,8 +1430,8 @@ static bool mem_cgroup_under_move(struct mem_cgroup *memcg) if (!from) goto unlock; - ret = mem_cgroup_same_or_subtree(memcg, from) - || mem_cgroup_same_or_subtree(memcg, to); + ret = mem_cgroup_is_descendant(from, memcg) || + mem_cgroup_is_descendant(to, memcg); unlock: spin_unlock(&mc.lock); return ret; @@ -1597,23 +1453,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) return false; } -/* - * Take this lock when - * - a code tries to modify page's memcg while it's USED. - * - a code tries to modify page state accounting in a memcg. - */ -static void move_lock_mem_cgroup(struct mem_cgroup *memcg, - unsigned long *flags) -{ - spin_lock_irqsave(&memcg->move_lock, *flags); -} - -static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, - unsigned long *flags) -{ - spin_unlock_irqrestore(&memcg->move_lock, *flags); -} - #define K(x) ((x) << (PAGE_SHIFT-10)) /** * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 
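
The margin calculation shown earlier in this file's conversion now works purely in pages: headroom is limit minus usage on the memory counter, further clamped by the mem+swap counter when swap accounting is enabled. As a rough, standalone illustration only (hypothetical struct and helper names, not the kernel's page_counter API), the arithmetic looks like this:

#include <stdio.h>

/* Hypothetical stand-in for a page counter: usage and limit, in pages.
 * Only what the margin calculation needs; not the kernel's real type. */
struct page_counter {
	unsigned long count;	/* pages currently charged */
	unsigned long limit;	/* maximum pages allowed */
};

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/* Mirrors the converted mem_cgroup_margin(): headroom is limit - usage on
 * the memory counter, clamped by the mem+swap counter when swap accounting
 * is enabled. */
static unsigned long margin(const struct page_counter *memory,
			    const struct page_counter *memsw,
			    int do_swap_account)
{
	unsigned long m = 0;

	if (memory->count < memory->limit)
		m = memory->limit - memory->count;

	if (do_swap_account && memsw->count <= memsw->limit)
		m = min_ul(m, memsw->limit - memsw->count);

	return m;
}

int main(void)
{
	struct page_counter memory = { .count = 700, .limit = 1024 };
	struct page_counter memsw  = { .count = 900, .limit = 1024 };

	/* Memory headroom is 324 pages, mem+swap headroom is 124 pages;
	 * the tighter one wins, so this prints 124. */
	printf("margin: %lu pages\n", margin(&memory, &memsw, 1));
	return 0;
}

Working in pages rather than bytes is what lets the rest of the file drop the PAGE_SHIFT and PAGE_SIZE conversions that the old res_counter interface required.
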
@@ -1644,18 +1483,15 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) rcu_read_unlock(); - pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", - res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, - res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, - res_counter_read_u64(&memcg->res, RES_FAILCNT)); - pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", - res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, - res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, - res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); - pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", - res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, - res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, - res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); + pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->memory)), + K((u64)memcg->memory.limit), memcg->memory.failcnt); + pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->memsw)), + K((u64)memcg->memsw.limit), memcg->memsw.failcnt); + pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->kmem)), + K((u64)memcg->kmem.limit), memcg->kmem.failcnt); for_each_mem_cgroup_tree(iter, memcg) { pr_info("Memory cgroup stats for "); @@ -1695,28 +1531,17 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) /* * Return the memory (and swap, if configured) limit for a memcg. */ -static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) +static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) { - u64 limit; - - limit = res_counter_read_u64(&memcg->res, RES_LIMIT); + unsigned long limit; - /* - * Do not consider swap space if we cannot swap due to swappiness - */ + limit = memcg->memory.limit; if (mem_cgroup_swappiness(memcg)) { - u64 memsw; - - limit += total_swap_pages << PAGE_SHIFT; - memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); + unsigned long memsw_limit; - /* - * If memsw is finite and limits the amount of swap space - * available to this memcg, return that limit. - */ - limit = min(limit, memsw); + memsw_limit = memcg->memsw.limit; + limit = min(limit + total_swap_pages, memsw_limit); } - return limit; } @@ -1734,13 +1559,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * select it. The goal is to allow it to allocate so that it may * quickly exit and free its memory. */ - if (fatal_signal_pending(current) || current->flags & PF_EXITING) { + if (fatal_signal_pending(current) || task_will_free_mem(current)) { set_thread_flag(TIF_MEMDIE); return; } check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); - totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; + totalpages = mem_cgroup_get_limit(memcg) ? 
: 1; for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; struct task_struct *task; @@ -1791,6 +1616,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, NULL, "Memory cgroup out of memory"); } +#if MAX_NUMNODES > 1 + /** * test_mem_cgroup_node_reclaimable * @memcg: the target memcg @@ -1813,7 +1640,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, return false; } -#if MAX_NUMNODES > 1 /* * Always updating the nodemask is not very good - even if we have an empty @@ -1880,52 +1706,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) memcg->last_scanned_node = node; return node; } - -/* - * Check all nodes whether it contains reclaimable pages or not. - * For quick scan, we make use of scan_nodes. This will allow us to skip - * unused nodes. But scan_nodes is lazily updated and may not cotain - * enough new information. We need to do double check. - */ -static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) -{ - int nid; - - /* - * quick check...making use of scan_node. - * We can skip unused nodes. - */ - if (!nodes_empty(memcg->scan_nodes)) { - for (nid = first_node(memcg->scan_nodes); - nid < MAX_NUMNODES; - nid = next_node(nid, memcg->scan_nodes)) { - - if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) - return true; - } - } - /* - * Check rest of nodes. - */ - for_each_node_state(nid, N_MEMORY) { - if (node_isset(nid, memcg->scan_nodes)) - continue; - if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) - return true; - } - return false; -} - #else int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) { return 0; } - -static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) -{ - return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); -} #endif static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, @@ -1943,7 +1728,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, .priority = 0, }; - excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; + excess = soft_limit_excess(root_memcg); while (1) { victim = mem_cgroup_iter(root_memcg, victim, &reclaim); @@ -1969,12 +1754,10 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, } continue; } - if (!mem_cgroup_reclaimable(victim, false)) - continue; total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, zone, &nr_scanned); *total_scanned += nr_scanned; - if (!res_counter_soft_limit_excess(&root_memcg->res)) + if (!soft_limit_excess(root_memcg)) break; } mem_cgroup_iter_break(root_memcg, victim); @@ -2081,12 +1864,8 @@ static int memcg_oom_wake_function(wait_queue_t *wait, oom_wait_info = container_of(wait, struct oom_wait_info, wait); oom_wait_memcg = oom_wait_info->memcg; - /* - * Both of oom_wait_info->memcg and wake_memcg are stable under us. - * Then we can use css_is_ancestor without taking care of RCU. 
- */ - if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) - && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) + if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && + !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) return 0; return autoremove_wake_function(wait, mode, sync, arg); } @@ -2228,26 +2007,23 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, unsigned long *flags) { struct mem_cgroup *memcg; - struct page_cgroup *pc; rcu_read_lock(); if (mem_cgroup_disabled()) return NULL; - - pc = lookup_page_cgroup(page); again: - memcg = pc->mem_cgroup; - if (unlikely(!memcg || !PageCgroupUsed(pc))) + memcg = page->mem_cgroup; + if (unlikely(!memcg)) return NULL; *locked = false; if (atomic_read(&memcg->moving_account) <= 0) return memcg; - move_lock_mem_cgroup(memcg, flags); - if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { - move_unlock_mem_cgroup(memcg, flags); + spin_lock_irqsave(&memcg->move_lock, *flags); + if (memcg != page->mem_cgroup) { + spin_unlock_irqrestore(&memcg->move_lock, *flags); goto again; } *locked = true; @@ -2261,11 +2037,11 @@ again: * @locked: value received from mem_cgroup_begin_page_stat() * @flags: value received from mem_cgroup_begin_page_stat() */ -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, - unsigned long flags) +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, + unsigned long *flags) { - if (memcg && locked) - move_unlock_mem_cgroup(memcg, &flags); + if (memcg && *locked) + spin_unlock_irqrestore(&memcg->move_lock, *flags); rcu_read_unlock(); } @@ -2316,33 +2092,32 @@ static DEFINE_MUTEX(percpu_charge_mutex); static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; - bool ret = true; + bool ret = false; if (nr_pages > CHARGE_BATCH) - return false; + return ret; stock = &get_cpu_var(memcg_stock); - if (memcg == stock->cached && stock->nr_pages >= nr_pages) + if (memcg == stock->cached && stock->nr_pages >= nr_pages) { stock->nr_pages -= nr_pages; - else /* need to call res_counter_charge */ - ret = false; + ret = true; + } put_cpu_var(memcg_stock); return ret; } /* - * Returns stocks cached in percpu to res_counter and reset cached information. + * Returns stocks cached in percpu and reset cached information. */ static void drain_stock(struct memcg_stock_pcp *stock) { struct mem_cgroup *old = stock->cached; if (stock->nr_pages) { - unsigned long bytes = stock->nr_pages * PAGE_SIZE; - - res_counter_uncharge(&old->res, bytes); + page_counter_uncharge(&old->memory, stock->nr_pages); if (do_swap_account) - res_counter_uncharge(&old->memsw, bytes); + page_counter_uncharge(&old->memsw, stock->nr_pages); + css_put_many(&old->css, stock->nr_pages); stock->nr_pages = 0; } stock->cached = NULL; @@ -2371,7 +2146,7 @@ static void __init memcg_stock_init(void) } /* - * Cache charges(val) which is from res_counter, to local per_cpu area. + * Cache charges(val) to local per_cpu area. * This will be consumed by consume_stock() function, later. */ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) @@ -2388,13 +2163,15 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) /* * Drains all per-CPU charge caches for given root_memcg resp. subtree - * of the hierarchy under it. sync flag says whether we should block - * until the work is done. + * of the hierarchy under it. 
*/ -static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) +static void drain_all_stock(struct mem_cgroup *root_memcg) { int cpu, curcpu; + /* If someone's already draining, avoid adding running more workers. */ + if (!mutex_trylock(&percpu_charge_mutex)) + return; /* Notify other cpus that system-wide "drain" is running */ get_online_cpus(); curcpu = get_cpu(); @@ -2405,7 +2182,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) memcg = stock->cached; if (!memcg || !stock->nr_pages) continue; - if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) + if (!mem_cgroup_is_descendant(memcg, root_memcg)) continue; if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { if (cpu == curcpu) @@ -2415,42 +2192,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) } } put_cpu(); - - if (!sync) - goto out; - - for_each_online_cpu(cpu) { - struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); - if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) - flush_work(&stock->work); - } -out: put_online_cpus(); -} - -/* - * Tries to drain stocked charges in other cpus. This function is asynchronous - * and just put a work per cpu for draining localy on each cpu. Caller can - * expects some charges will be back to res_counter later but cannot wait for - * it. - */ -static void drain_all_stock_async(struct mem_cgroup *root_memcg) -{ - /* - * If someone calls draining, avoid adding more kworker runs. - */ - if (!mutex_trylock(&percpu_charge_mutex)) - return; - drain_all_stock(root_memcg, false); - mutex_unlock(&percpu_charge_mutex); -} - -/* This is a synchronous drain interface. */ -static void drain_all_stock_sync(struct mem_cgroup *root_memcg) -{ - /* called when force_empty is called */ - mutex_lock(&percpu_charge_mutex); - drain_all_stock(root_memcg, true); mutex_unlock(&percpu_charge_mutex); } @@ -2506,9 +2248,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; - struct res_counter *fail_res; + struct page_counter *counter; unsigned long nr_reclaimed; - unsigned long long size; bool may_swap = true; bool drained = false; int ret = 0; @@ -2519,16 +2260,15 @@ retry: if (consume_stock(memcg, nr_pages)) goto done; - size = batch * PAGE_SIZE; if (!do_swap_account || - !res_counter_charge(&memcg->memsw, size, &fail_res)) { - if (!res_counter_charge(&memcg->res, size, &fail_res)) + !page_counter_try_charge(&memcg->memsw, batch, &counter)) { + if (!page_counter_try_charge(&memcg->memory, batch, &counter)) goto done_restock; if (do_swap_account) - res_counter_uncharge(&memcg->memsw, size); - mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); + page_counter_uncharge(&memcg->memsw, batch); + mem_over_limit = mem_cgroup_from_counter(counter, memory); } else { - mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); + mem_over_limit = mem_cgroup_from_counter(counter, memsw); may_swap = false; } @@ -2561,7 +2301,7 @@ retry: goto retry; if (!drained) { - drain_all_stock_async(mem_over_limit); + drain_all_stock(mem_over_limit); drained = true; goto retry; } @@ -2603,6 +2343,7 @@ bypass: return -EINTR; done_restock: + css_get_many(&memcg->css, batch); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); done: @@ -2611,32 +2352,14 @@ done: static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { - unsigned long bytes = nr_pages * PAGE_SIZE; - if 
(mem_cgroup_is_root(memcg)) return; - res_counter_uncharge(&memcg->res, bytes); + page_counter_uncharge(&memcg->memory, nr_pages); if (do_swap_account) - res_counter_uncharge(&memcg->memsw, bytes); -} - -/* - * Cancel chrages in this cgroup....doesn't propagate to parent cgroup. - * This is useful when moving usage to parent cgroup. - */ -static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, - unsigned int nr_pages) -{ - unsigned long bytes = nr_pages * PAGE_SIZE; + page_counter_uncharge(&memcg->memsw, nr_pages); - if (mem_cgroup_is_root(memcg)) - return; - - res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); - if (do_swap_account) - res_counter_uncharge_until(&memcg->memsw, - memcg->memsw.parent, bytes); + css_put_many(&memcg->css, nr_pages); } /* @@ -2665,17 +2388,15 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) */ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { - struct mem_cgroup *memcg = NULL; - struct page_cgroup *pc; + struct mem_cgroup *memcg; unsigned short id; swp_entry_t ent; VM_BUG_ON_PAGE(!PageLocked(page), page); - pc = lookup_page_cgroup(page); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - if (memcg && !css_tryget_online(&memcg->css)) + memcg = page->mem_cgroup; + if (memcg) { + if (!css_tryget_online(&memcg->css)) memcg = NULL; } else if (PageSwapCache(page)) { ent.val = page_private(page); @@ -2723,14 +2444,9 @@ static void unlock_page_lru(struct page *page, int isolated) static void commit_charge(struct page *page, struct mem_cgroup *memcg, bool lrucare) { - struct page_cgroup *pc = lookup_page_cgroup(page); int isolated; - VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); - /* - * we don't need page_cgroup_lock about tail pages, becase they are not - * accessed by any other context at this point. - */ + VM_BUG_ON_PAGE(page->mem_cgroup, page); /* * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page @@ -2741,7 +2457,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, /* * Nobody should be changing or seriously looking at - * pc->mem_cgroup and pc->flags at this point: + * page->mem_cgroup at this point: * * - the page is uncharged * @@ -2753,15 +2469,12 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, * - a page cache insertion, a swapin fault, or a migration * have the page locked */ - pc->mem_cgroup = memcg; - pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0); + page->mem_cgroup = memcg; if (lrucare) unlock_page_lru(page, isolated); } -static DEFINE_MUTEX(set_limit_mutex); - #ifdef CONFIG_MEMCG_KMEM /* * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or @@ -2769,8 +2482,6 @@ static DEFINE_MUTEX(set_limit_mutex); */ static DEFINE_MUTEX(memcg_slab_mutex); -static DEFINE_MUTEX(activate_kmem_mutex); - /* * This is a bit cumbersome, but it is rarely used and avoids a backpointer * in the memcg_cache_params struct. 
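
The try_charge() conversion above replaces res_counter_charge() with page_counter_try_charge() but keeps the charge ordering: with swap accounting enabled, mem+swap is charged first, then memory; a failed memory charge rolls the mem+swap charge back, and whichever counter hit its limit identifies the cgroup to reclaim from. A self-contained sketch of that ordering, using simplified stand-in types rather than the kernel's, might look like:

#include <stdio.h>
#include <errno.h>

/* Hypothetical, simplified stand-ins: enough to show the ordering used by
 * the converted try_charge(), not the kernel's real types. */
struct page_counter {
	unsigned long count;
	unsigned long limit;
	const char *name;
};

/* Charges on success; on failure reports the counter that hit its limit
 * via *fail, mirroring page_counter_try_charge()'s out-parameter. */
static int try_charge_counter(struct page_counter *c, unsigned long pages,
			      struct page_counter **fail)
{
	if (c->count + pages > c->limit) {
		*fail = c;
		return -ENOMEM;
	}
	c->count += pages;
	return 0;
}

static void uncharge_counter(struct page_counter *c, unsigned long pages)
{
	c->count -= pages;
}

/* Charge mem+swap first, then memory; roll back mem+swap if memory fails,
 * and report which counter is the bottleneck. */
static int charge(struct page_counter *memory, struct page_counter *memsw,
		  int do_swap_account, unsigned long pages,
		  struct page_counter **over_limit)
{
	struct page_counter *fail;

	if (do_swap_account && try_charge_counter(memsw, pages, &fail)) {
		*over_limit = fail;		/* mem+swap is the bottleneck */
		return -ENOMEM;
	}
	if (try_charge_counter(memory, pages, &fail)) {
		if (do_swap_account)
			uncharge_counter(memsw, pages);
		*over_limit = fail;		/* memory is the bottleneck */
		return -ENOMEM;
	}
	return 0;
}

int main(void)
{
	struct page_counter memory = { 1000, 1024, "memory" };
	struct page_counter memsw  = { 1020, 1024, "memsw"  };
	struct page_counter *over = NULL;

	if (charge(&memory, &memsw, 1, 32, &over))
		printf("charge failed, over limit: %s\n", over->name);
	return 0;
}

In the kernel code the failing counter is mapped back to its memcg with mem_cgroup_from_counter(), and a mem+swap failure additionally clears may_swap so that reclaim does not try to push pages to swap.
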
@@ -2784,36 +2495,17 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); } -#ifdef CONFIG_SLABINFO -static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - struct memcg_cache_params *params; - - if (!memcg_kmem_is_active(memcg)) - return -EIO; - - print_slabinfo_header(m); - - mutex_lock(&memcg_slab_mutex); - list_for_each_entry(params, &memcg->memcg_slab_caches, list) - cache_show(memcg_params_to_cache(params), m); - mutex_unlock(&memcg_slab_mutex); - - return 0; -} -#endif - -static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) +static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, + unsigned long nr_pages) { - struct res_counter *fail_res; + struct page_counter *counter; int ret = 0; - ret = res_counter_charge(&memcg->kmem, size, &fail_res); - if (ret) + ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); + if (ret < 0) return ret; - ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); + ret = try_charge(memcg, gfp, nr_pages); if (ret == -EINTR) { /* * try_charge() chose to bypass to root due to OOM kill or @@ -2830,37 +2522,27 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) * when the allocation triggers should have been already * directed to the root cgroup in memcontrol.h */ - res_counter_charge_nofail(&memcg->res, size, &fail_res); + page_counter_charge(&memcg->memory, nr_pages); if (do_swap_account) - res_counter_charge_nofail(&memcg->memsw, size, - &fail_res); + page_counter_charge(&memcg->memsw, nr_pages); + css_get_many(&memcg->css, nr_pages); ret = 0; } else if (ret) - res_counter_uncharge(&memcg->kmem, size); + page_counter_uncharge(&memcg->kmem, nr_pages); return ret; } -static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) +static void memcg_uncharge_kmem(struct mem_cgroup *memcg, + unsigned long nr_pages) { - res_counter_uncharge(&memcg->res, size); + page_counter_uncharge(&memcg->memory, nr_pages); if (do_swap_account) - res_counter_uncharge(&memcg->memsw, size); + page_counter_uncharge(&memcg->memsw, nr_pages); - /* Not down to 0 */ - if (res_counter_uncharge(&memcg->kmem, size)) - return; + page_counter_uncharge(&memcg->kmem, nr_pages); - /* - * Releases a reference taken in kmem_cgroup_css_offline in case - * this last uncharge is racing with the offlining code or it is - * outliving the memcg existence. - * - * The memory barrier imposed by test&clear is paired with the - * explicit one in memcg_kmem_mark_dead(). - */ - if (memcg_kmem_test_and_clear_dead(memcg)) - css_put(&memcg->css); + css_put_many(&memcg->css, nr_pages); } /* @@ -2953,7 +2635,6 @@ static void memcg_register_cache(struct mem_cgroup *memcg, if (!cachep) return; - css_get(&memcg->css); list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); /* @@ -2987,40 +2668,6 @@ static void memcg_unregister_cache(struct kmem_cache *cachep) list_del(&cachep->memcg_params->list); kmem_cache_destroy(cachep); - - /* drop the reference taken in memcg_register_cache */ - css_put(&memcg->css); -} - -/* - * During the creation a new cache, we need to disable our accounting mechanism - * altogether. This is true even if we are not creating, but rather just - * enqueing new caches to be created. 
- * - * This is because that process will trigger allocations; some visible, like - * explicit kmallocs to auxiliary data structures, name strings and internal - * cache structures; some well concealed, like INIT_WORK() that can allocate - * objects during debug. - * - * If any allocation happens during memcg_kmem_get_cache, we will recurse back - * to it. This may not be a bounded recursion: since the first cache creation - * failed to complete (waiting on the allocation), we'll just try to create the - * cache again, failing at the same point. - * - * memcg_kmem_get_cache is prepared to abort after seeing a positive count of - * memcg_kmem_skip_account. So we enclose anything that might allocate memory - * inside the following two functions. - */ -static inline void memcg_stop_kmem_account(void) -{ - VM_BUG_ON(!current->mm); - current->memcg_kmem_skip_account++; -} - -static inline void memcg_resume_kmem_account(void) -{ - VM_BUG_ON(!current->mm); - current->memcg_kmem_skip_account--; } int __memcg_cleanup_cache_params(struct kmem_cache *s) @@ -3054,9 +2701,7 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg) mutex_lock(&memcg_slab_mutex); list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { cachep = memcg_params_to_cache(params); - kmem_cache_shrink(cachep); - if (atomic_read(&cachep->memcg_params->nr_pages) == 0) - memcg_unregister_cache(cachep); + memcg_unregister_cache(cachep); } mutex_unlock(&memcg_slab_mutex); } @@ -3091,10 +2736,10 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, struct memcg_register_cache_work *cw; cw = kmalloc(sizeof(*cw), GFP_NOWAIT); - if (cw == NULL) { - css_put(&memcg->css); + if (!cw) return; - } + + css_get(&memcg->css); cw->memcg = memcg; cw->cachep = cachep; @@ -3117,26 +2762,23 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, * this point we can't allow ourselves back into memcg_kmem_get_cache, * the safest choice is to do it like this, wrapping the whole function. */ - memcg_stop_kmem_account(); + current->memcg_kmem_skip_account = 1; __memcg_schedule_register_cache(memcg, cachep); - memcg_resume_kmem_account(); + current->memcg_kmem_skip_account = 0; } int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) { - int res; + unsigned int nr_pages = 1 << order; - res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, - PAGE_SIZE << order); - if (!res) - atomic_add(1 << order, &cachep->memcg_params->nr_pages); - return res; + return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); } void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) { - memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); - atomic_sub(1 << order, &cachep->memcg_params->nr_pages); + unsigned int nr_pages = 1 << order; + + memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); } /* @@ -3152,8 +2794,7 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) * Can't be called in interrupt context or from kernel threads. * This function needs to be called with rcu_read_lock() held. 
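Above, the memcg_stop_kmem_account()/memcg_resume_kmem_account() helpers are folded into a plain per-task flag, current->memcg_kmem_skip_account, set around cache registration so the accounting hook does not recurse into itself. A hedged user-space sketch of that reentrancy-guard pattern (invented names; a thread-local int stands in for the task flag):

/*
 * Sketch of the reentrancy guard: a per-task flag is set around work
 * that might itself allocate, so the accounting hook bails out instead
 * of recursing. Illustrative only, not kernel code.
 */
#include <stdio.h>

static __thread int skip_account;   /* stands in for current->memcg_kmem_skip_account */

static void account_alloc(const char *what)
{
    if (skip_account) {
        printf("skipping accounting for %s\n", what);
        return;
    }
    printf("accounting %s\n", what);
}

static void create_cache(void)
{
    /* cache creation allocates internally; guard against recursion */
    skip_account = 1;
    account_alloc("internal allocation during cache creation");
    skip_account = 0;
}

int main(void)
{
    account_alloc("normal allocation");
    create_cache();
    return 0;
}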
*/ -struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, - gfp_t gfp) +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; @@ -3161,25 +2802,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, VM_BUG_ON(!cachep->memcg_params); VM_BUG_ON(!cachep->memcg_params->is_root_cache); - if (!current->mm || current->memcg_kmem_skip_account) + if (current->memcg_kmem_skip_account) return cachep; - rcu_read_lock(); - memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); - + memcg = get_mem_cgroup_from_mm(current->mm); if (!memcg_kmem_is_active(memcg)) goto out; memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); - if (likely(memcg_cachep)) { - cachep = memcg_cachep; - goto out; - } - - /* The corresponding put will be done in the workqueue. */ - if (!css_tryget_online(&memcg->css)) - goto out; - rcu_read_unlock(); + if (likely(memcg_cachep)) + return memcg_cachep; /* * If we are in a safe context (can wait, and not in interrupt @@ -3194,12 +2826,17 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, * defer everything. */ memcg_schedule_register_cache(memcg, cachep); - return cachep; out: - rcu_read_unlock(); + css_put(&memcg->css); return cachep; } +void __memcg_kmem_put_cache(struct kmem_cache *cachep) +{ + if (!is_root_cache(cachep)) + css_put(&cachep->memcg_params->memcg->css); +} + /* * We need to verify if the allocation against current->mm->owner's memcg is * possible for the given order. But the page is not allocated yet, so we'll @@ -3222,34 +2859,6 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) *_memcg = NULL; - /* - * Disabling accounting is only relevant for some specific memcg - * internal allocations. Therefore we would initially not have such - * check here, since direct calls to the page allocator that are - * accounted to kmemcg (alloc_kmem_pages and friends) only happen - * outside memcg core. We are mostly concerned with cache allocations, - * and by having this test at memcg_kmem_get_cache, we are already able - * to relay the allocation to the root cache and bypass the memcg cache - * altogether. - * - * There is one exception, though: the SLUB allocator does not create - * large order caches, but rather service large kmallocs directly from - * the page allocator. Therefore, the following sequence when backed by - * the SLUB allocator: - * - * memcg_stop_kmem_account(); - * kmalloc(<large_number>) - * memcg_resume_kmem_account(); - * - * would effectively ignore the fact that we should skip accounting, - * since it will drive us directly to this function without passing - * through the cache selector memcg_kmem_get_cache. Such large - * allocations are extremely rare but can happen, for instance, for the - * cache arrays. We bring this test here. 
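__memcg_kmem_get_cache() above now pins the memcg with get_mem_cgroup_from_mm() and drops the reference either on the fallback path or later via the new __memcg_kmem_put_cache(). A toy refcount model of that get/put pairing, with invented names and none of the RCU details:

/*
 * Illustrative get/put pairing around a per-memcg cache lookup:
 * a reference is held for the duration of the allocation and
 * dropped once the object has been charged. Toy model only.
 */
#include <assert.h>
#include <stdio.h>

struct toy_css { int refcnt; };

static void toy_css_get(struct toy_css *css) { css->refcnt++; }
static void toy_css_put(struct toy_css *css) { assert(css->refcnt > 0); css->refcnt--; }

static struct toy_css *toy_get_cache(struct toy_css *memcg_css)
{
    toy_css_get(memcg_css);     /* held while the allocation proceeds */
    return memcg_css;
}

static void toy_put_cache(struct toy_css *memcg_css)
{
    toy_css_put(memcg_css);     /* dropped after the object is charged */
}

int main(void)
{
    struct toy_css css = { .refcnt = 1 };
    struct toy_css *held = toy_get_cache(&css);

    /* ... allocation from the per-memcg cache would happen here ... */
    toy_put_cache(held);
    printf("refcnt back to %d\n", css.refcnt);
    return 0;
}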
- */ - if (!current->mm || current->memcg_kmem_skip_account) - return true; - memcg = get_mem_cgroup_from_mm(current->mm); if (!memcg_kmem_is_active(memcg)) { @@ -3257,7 +2866,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) return true; } - ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); + ret = memcg_charge_kmem(memcg, gfp, 1 << order); if (!ret) *_memcg = memcg; @@ -3268,50 +2877,27 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) { - struct page_cgroup *pc; - VM_BUG_ON(mem_cgroup_is_root(memcg)); /* The page allocation failed. Revert */ if (!page) { - memcg_uncharge_kmem(memcg, PAGE_SIZE << order); + memcg_uncharge_kmem(memcg, 1 << order); return; } - /* - * The page is freshly allocated and not visible to any - * outside callers yet. Set up pc non-atomically. - */ - pc = lookup_page_cgroup(page); - pc->mem_cgroup = memcg; - pc->flags = PCG_USED; + page->mem_cgroup = memcg; } void __memcg_kmem_uncharge_pages(struct page *page, int order) { - struct mem_cgroup *memcg = NULL; - struct page_cgroup *pc; - - - pc = lookup_page_cgroup(page); - if (!PageCgroupUsed(pc)) - return; - - memcg = pc->mem_cgroup; - pc->flags = 0; + struct mem_cgroup *memcg = page->mem_cgroup; - /* - * We trust that only if there is a memcg associated with the page, it - * is a valid allocation - */ if (!memcg) return; VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - memcg_uncharge_kmem(memcg, PAGE_SIZE << order); -} -#else -static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) -{ + + memcg_uncharge_kmem(memcg, 1 << order); + page->mem_cgroup = NULL; } #endif /* CONFIG_MEMCG_KMEM */ @@ -3325,21 +2911,15 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) */ void mem_cgroup_split_huge_fixup(struct page *head) { - struct page_cgroup *head_pc = lookup_page_cgroup(head); - struct page_cgroup *pc; - struct mem_cgroup *memcg; int i; if (mem_cgroup_disabled()) return; - memcg = head_pc->mem_cgroup; - for (i = 1; i < HPAGE_PMD_NR; i++) { - pc = head_pc + i; - pc->mem_cgroup = memcg; - pc->flags = head_pc->flags; - } - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], + for (i = 1; i < HPAGE_PMD_NR; i++) + head[i].mem_cgroup = head->mem_cgroup; + + __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -3348,7 +2928,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) * mem_cgroup_move_account - move account of the page * @page: the page * @nr_pages: number of regular pages (>1 for huge pages) - * @pc: page_cgroup of the page. * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. * @@ -3361,7 +2940,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) */ static int mem_cgroup_move_account(struct page *page, unsigned int nr_pages, - struct page_cgroup *pc, struct mem_cgroup *from, struct mem_cgroup *to) { @@ -3381,7 +2959,7 @@ static int mem_cgroup_move_account(struct page *page, goto out; /* - * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup + * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup * of its source page while we change it: page migration takes * both pages off the LRU, but page cache replacement doesn't. 
*/ @@ -3389,10 +2967,10 @@ static int mem_cgroup_move_account(struct page *page, goto out; ret = -EINVAL; - if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) + if (page->mem_cgroup != from) goto out_unlock; - move_lock_mem_cgroup(from, &flags); + spin_lock_irqsave(&from->move_lock, flags); if (!PageAnon(page) && page_mapped(page)) { __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], @@ -3409,14 +2987,15 @@ static int mem_cgroup_move_account(struct page *page, } /* - * It is safe to change pc->mem_cgroup here because the page + * It is safe to change page->mem_cgroup here because the page * is referenced, charged, and isolated - we can't race with * uncharging, charging, migration, or LRU putback. */ /* caller should have done css_get */ - pc->mem_cgroup = to; - move_unlock_mem_cgroup(from, &flags); + page->mem_cgroup = to; + spin_unlock_irqrestore(&from->move_lock, flags); + ret = 0; local_irq_disable(); @@ -3431,72 +3010,6 @@ out: return ret; } -/** - * mem_cgroup_move_parent - moves page to the parent group - * @page: the page to move - * @pc: page_cgroup of the page - * @child: page's cgroup - * - * move charges to its parent or the root cgroup if the group has no - * parent (aka use_hierarchy==0). - * Although this might fail (get_page_unless_zero, isolate_lru_page or - * mem_cgroup_move_account fails) the failure is always temporary and - * it signals a race with a page removal/uncharge or migration. In the - * first case the page is on the way out and it will vanish from the LRU - * on the next attempt and the call should be retried later. - * Isolation from the LRU fails only if page has been isolated from - * the LRU since we looked at it and that usually means either global - * reclaim or migration going on. The page will either get back to the - * LRU or vanish. - * Finaly mem_cgroup_move_account fails only if the page got uncharged - * (!PageCgroupUsed) or moved to a different group. The page will - * disappear in the next attempt. - */ -static int mem_cgroup_move_parent(struct page *page, - struct page_cgroup *pc, - struct mem_cgroup *child) -{ - struct mem_cgroup *parent; - unsigned int nr_pages; - unsigned long uninitialized_var(flags); - int ret; - - VM_BUG_ON(mem_cgroup_is_root(child)); - - ret = -EBUSY; - if (!get_page_unless_zero(page)) - goto out; - if (isolate_lru_page(page)) - goto put; - - nr_pages = hpage_nr_pages(page); - - parent = parent_mem_cgroup(child); - /* - * If no parent, move charges to root cgroup. - */ - if (!parent) - parent = root_mem_cgroup; - - if (nr_pages > 1) { - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - flags = compound_lock_irqsave(page); - } - - ret = mem_cgroup_move_account(page, nr_pages, - pc, child, parent); - if (!ret) - __mem_cgroup_cancel_local_charge(child, nr_pages); - - if (nr_pages > 1) - compound_unlock_irqrestore(page, flags); - putback_lru_page(page); -put: - put_page(page); -out: - return ret; -} - #ifdef CONFIG_MEMCG_SWAP static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, bool charge) @@ -3516,7 +3029,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, * * Returns 0 on success, -EINVAL on failure. * - * The caller must have charged to @to, IOW, called res_counter_charge() about + * The caller must have charged to @to, IOW, called page_counter_charge() about * both res and memsw, and called css_get(). 
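mem_cgroup_move_account() above now takes the source group's move_lock directly, rechecks page->mem_cgroup under it, shifts the statistics, and only then repoints the page. A rough user-space model of that critical section (a mutex stands in for the IRQ-safe spinlock; all types are made up):

/*
 * Rough model of the move_account critical section: lock the source
 * group, verify the page still belongs to it, shift the stats, then
 * repoint page->mem_cgroup.
 */
#include <pthread.h>
#include <stdio.h>

struct toy_memcg {
    pthread_mutex_t move_lock;   /* stands in for from->move_lock */
    long file_mapped;
};

struct toy_page {
    struct toy_memcg *mem_cgroup;
    int mapped;
};

static int toy_move_account(struct toy_page *page,
                            struct toy_memcg *from, struct toy_memcg *to)
{
    int ret = -1;

    pthread_mutex_lock(&from->move_lock);
    if (page->mem_cgroup != from)
        goto out;                /* raced with another move or an uncharge */

    if (page->mapped) {
        from->file_mapped--;
        to->file_mapped++;
    }
    page->mem_cgroup = to;       /* the actual ownership transfer */
    ret = 0;
out:
    pthread_mutex_unlock(&from->move_lock);
    return ret;
}

int main(void)
{
    struct toy_memcg from = { PTHREAD_MUTEX_INITIALIZER, 1 };
    struct toy_memcg to   = { PTHREAD_MUTEX_INITIALIZER, 0 };
    struct toy_page page  = { &from, 1 };

    printf("move_account: %d\n", toy_move_account(&page, &from, &to));
    return 0;
}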
*/ static int mem_cgroup_move_swap_account(swp_entry_t entry, @@ -3532,7 +3045,7 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, mem_cgroup_swap_statistics(to, true); /* * This function is only called from task migration context now. - * It postpones res_counter and refcount handling till the end + * It postpones page_counter and refcount handling till the end * of task migration(mem_cgroup_clear_mc()) for performance * improvement. But we cannot postpone css_get(to) because if * the process that has been moved to @to does swap-in, the @@ -3554,96 +3067,57 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, } #endif -#ifdef CONFIG_DEBUG_VM -static struct page_cgroup *lookup_page_cgroup_used(struct page *page) -{ - struct page_cgroup *pc; - - pc = lookup_page_cgroup(page); - /* - * Can be NULL while feeding pages into the page allocator for - * the first time, i.e. during boot or memory hotplug; - * or when mem_cgroup_disabled(). - */ - if (likely(pc) && PageCgroupUsed(pc)) - return pc; - return NULL; -} - -bool mem_cgroup_bad_page_check(struct page *page) -{ - if (mem_cgroup_disabled()) - return false; - - return lookup_page_cgroup_used(page) != NULL; -} - -void mem_cgroup_print_bad_page(struct page *page) -{ - struct page_cgroup *pc; - - pc = lookup_page_cgroup_used(page); - if (pc) { - pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", - pc, pc->flags, pc->mem_cgroup); - } -} -#endif +static DEFINE_MUTEX(memcg_limit_mutex); static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, - unsigned long long val) + unsigned long limit) { + unsigned long curusage; + unsigned long oldusage; + bool enlarge = false; int retry_count; - int ret = 0; - int children = mem_cgroup_count_children(memcg); - u64 curusage, oldusage; - int enlarge; + int ret; /* * For keeping hierarchical_reclaim simple, how long we should retry * is depends on callers. We set our retry-count to be function * of # of children which we should visit in this loop. */ - retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; + retry_count = MEM_CGROUP_RECLAIM_RETRIES * + mem_cgroup_count_children(memcg); - oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); + oldusage = page_counter_read(&memcg->memory); - enlarge = 0; - while (retry_count) { + do { if (signal_pending(current)) { ret = -EINTR; break; } - /* - * Rather than hide all in some function, I do this in - * open coded manner. You see what this really does. - * We have to guarantee memcg->res.limit <= memcg->memsw.limit. - */ - mutex_lock(&set_limit_mutex); - if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) { + + mutex_lock(&memcg_limit_mutex); + if (limit > memcg->memsw.limit) { + mutex_unlock(&memcg_limit_mutex); ret = -EINVAL; - mutex_unlock(&set_limit_mutex); break; } - - if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val) - enlarge = 1; - - ret = res_counter_set_limit(&memcg->res, val); - mutex_unlock(&set_limit_mutex); + if (limit > memcg->memory.limit) + enlarge = true; + ret = page_counter_limit(&memcg->memory, limit); + mutex_unlock(&memcg_limit_mutex); if (!ret) break; try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); - curusage = res_counter_read_u64(&memcg->res, RES_USAGE); + curusage = page_counter_read(&memcg->memory); /* Usage is reduced ? 
*/ if (curusage >= oldusage) retry_count--; else oldusage = curusage; - } + } while (retry_count); + if (!ret && enlarge) memcg_oom_recover(memcg); @@ -3651,52 +3125,53 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, } static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, - unsigned long long val) + unsigned long limit) { + unsigned long curusage; + unsigned long oldusage; + bool enlarge = false; int retry_count; - u64 oldusage, curusage; - int children = mem_cgroup_count_children(memcg); - int ret = -EBUSY; - int enlarge = 0; + int ret; /* see mem_cgroup_resize_res_limit */ - retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; - oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); - while (retry_count) { + retry_count = MEM_CGROUP_RECLAIM_RETRIES * + mem_cgroup_count_children(memcg); + + oldusage = page_counter_read(&memcg->memsw); + + do { if (signal_pending(current)) { ret = -EINTR; break; } - /* - * Rather than hide all in some function, I do this in - * open coded manner. You see what this really does. - * We have to guarantee memcg->res.limit <= memcg->memsw.limit. - */ - mutex_lock(&set_limit_mutex); - if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) { + + mutex_lock(&memcg_limit_mutex); + if (limit < memcg->memory.limit) { + mutex_unlock(&memcg_limit_mutex); ret = -EINVAL; - mutex_unlock(&set_limit_mutex); break; } - if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) - enlarge = 1; - ret = res_counter_set_limit(&memcg->memsw, val); - mutex_unlock(&set_limit_mutex); + if (limit > memcg->memsw.limit) + enlarge = true; + ret = page_counter_limit(&memcg->memsw, limit); + mutex_unlock(&memcg_limit_mutex); if (!ret) break; try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); - curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + curusage = page_counter_read(&memcg->memsw); /* Usage is reduced ? */ if (curusage >= oldusage) retry_count--; else oldusage = curusage; - } + } while (retry_count); + if (!ret && enlarge) memcg_oom_recover(memcg); + return ret; } @@ -3709,7 +3184,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, unsigned long reclaimed; int loop = 0; struct mem_cgroup_tree_per_zone *mctz; - unsigned long long excess; + unsigned long excess; unsigned long nr_scanned; if (order > 0) @@ -3735,35 +3210,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, nr_reclaimed += reclaimed; *total_scanned += nr_scanned; spin_lock_irq(&mctz->lock); + __mem_cgroup_remove_exceeded(mz, mctz); /* * If we failed to reclaim anything from this memory cgroup * it is time to move on to the next cgroup */ next_mz = NULL; - if (!reclaimed) { - do { - /* - * Loop until we find yet another one. - * - * By the time we get the soft_limit lock - * again, someone might have aded the - * group back on the RB tree. Iterate to - * make sure we get a different mem. - * mem_cgroup_largest_soft_limit_node returns - * NULL if no other cgroup is present on - * the tree - */ - next_mz = - __mem_cgroup_largest_soft_limit_node(mctz); - if (next_mz == mz) - css_put(&next_mz->memcg->css); - else /* next_mz == NULL or other memcg */ - break; - } while (1); - } - __mem_cgroup_remove_exceeded(mz, mctz); - excess = res_counter_soft_limit_excess(&mz->memcg->res); + if (!reclaimed) + next_mz = __mem_cgroup_largest_soft_limit_node(mctz); + + excess = soft_limit_excess(mz->memcg); /* * One school of thought says that we should not add * back the node to the tree if reclaim returns 0. 
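Both resize paths above share the same shape: take memcg_limit_mutex, try page_counter_limit(), and if usage is still above the requested limit, reclaim and retry only while reclaim keeps making progress. A minimal sketch of that retry loop with made-up numbers:

/*
 * Minimal model of the limit-resize retry loop: attempt to lower the
 * limit, and if usage is still above it, reclaim and retry only while
 * reclaim makes progress.
 */
#include <stdio.h>

static unsigned long usage = 100;

static int set_limit(unsigned long limit)
{
    return usage <= limit ? 0 : -1;   /* fails while usage exceeds limit */
}

static void reclaim(void)
{
    if (usage > 10)
        usage -= 10;                  /* pretend we freed some pages */
}

int main(void)
{
    unsigned long limit = 40;
    unsigned long old = usage;
    int retries = 5;
    int ret;

    do {
        ret = set_limit(limit);
        if (!ret)
            break;
        reclaim();
        if (usage >= old)
            retries--;                /* no progress: burn a retry */
        else
            old = usage;
    } while (retries);

    printf("ret=%d usage=%lu retries=%d\n", ret, usage, retries);
    return 0;
}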
@@ -3792,107 +3249,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, return nr_reclaimed; } -/** - * mem_cgroup_force_empty_list - clears LRU of a group - * @memcg: group to clear - * @node: NUMA node - * @zid: zone id - * @lru: lru to to clear - * - * Traverse a specified page_cgroup list and try to drop them all. This doesn't - * reclaim the pages page themselves - pages are moved to the parent (or root) - * group. - */ -static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, - int node, int zid, enum lru_list lru) -{ - struct lruvec *lruvec; - unsigned long flags; - struct list_head *list; - struct page *busy; - struct zone *zone; - - zone = &NODE_DATA(node)->node_zones[zid]; - lruvec = mem_cgroup_zone_lruvec(zone, memcg); - list = &lruvec->lists[lru]; - - busy = NULL; - do { - struct page_cgroup *pc; - struct page *page; - - spin_lock_irqsave(&zone->lru_lock, flags); - if (list_empty(list)) { - spin_unlock_irqrestore(&zone->lru_lock, flags); - break; - } - page = list_entry(list->prev, struct page, lru); - if (busy == page) { - list_move(&page->lru, list); - busy = NULL; - spin_unlock_irqrestore(&zone->lru_lock, flags); - continue; - } - spin_unlock_irqrestore(&zone->lru_lock, flags); - - pc = lookup_page_cgroup(page); - - if (mem_cgroup_move_parent(page, pc, memcg)) { - /* found lock contention or "pc" is obsolete. */ - busy = page; - } else - busy = NULL; - cond_resched(); - } while (!list_empty(list)); -} - -/* - * make mem_cgroup's charge to be 0 if there is no task by moving - * all the charges and pages to the parent. - * This enables deleting this mem_cgroup. - * - * Caller is responsible for holding css reference on the memcg. - */ -static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) -{ - int node, zid; - u64 usage; - - do { - /* This is for making all *used* pages to be on LRU. */ - lru_add_drain_all(); - drain_all_stock_sync(memcg); - mem_cgroup_start_move(memcg); - for_each_node_state(node, N_MEMORY) { - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - enum lru_list lru; - for_each_lru(lru) { - mem_cgroup_force_empty_list(memcg, - node, zid, lru); - } - } - } - mem_cgroup_end_move(memcg); - memcg_oom_recover(memcg); - cond_resched(); - - /* - * Kernel memory may not necessarily be trackable to a specific - * process. So they are not migrated, and therefore we can't - * expect their value to drop to 0 here. - * Having res filled up with kmem only is enough. - * - * This is a safety check because mem_cgroup_force_empty_list - * could have raced with mem_cgroup_replace_page_cache callers - * so the lru seemed empty but the page could have been added - * right after the check. RES_USAGE should be safe as we always - * charge before adding to the LRU. - */ - usage = res_counter_read_u64(&memcg->res, RES_USAGE) - - res_counter_read_u64(&memcg->kmem, RES_USAGE); - } while (usage > 0); -} - /* * Test whether @memcg has children, dead or alive. 
Note that this * function doesn't care whether @memcg has use_hierarchy enabled and @@ -3930,7 +3286,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) /* we call try-to-free pages for make this cgroup empty */ lru_add_drain_all(); /* try to free all pages in this cgroup */ - while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { + while (nr_retries && page_counter_read(&memcg->memory)) { int progress; if (signal_pending(current)) @@ -4001,8 +3357,8 @@ out: return retval; } -static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) +static unsigned long tree_stat(struct mem_cgroup *memcg, + enum mem_cgroup_stat_index idx) { struct mem_cgroup *iter; long val = 0; @@ -4020,55 +3376,71 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { u64 val; - if (!mem_cgroup_is_root(memcg)) { + if (mem_cgroup_is_root(memcg)) { + val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); + val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); + if (swap) + val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); + } else { if (!swap) - return res_counter_read_u64(&memcg->res, RES_USAGE); + val = page_counter_read(&memcg->memory); else - return res_counter_read_u64(&memcg->memsw, RES_USAGE); + val = page_counter_read(&memcg->memsw); } - - /* - * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS - * as well as in MEM_CGROUP_STAT_RSS_HUGE. - */ - val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); - - if (swap) - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); - return val << PAGE_SHIFT; } +enum { + RES_USAGE, + RES_LIMIT, + RES_MAX_USAGE, + RES_FAILCNT, + RES_SOFT_LIMIT, +}; static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - enum res_type type = MEMFILE_TYPE(cft->private); - int name = MEMFILE_ATTR(cft->private); + struct page_counter *counter; - switch (type) { + switch (MEMFILE_TYPE(cft->private)) { case _MEM: - if (name == RES_USAGE) - return mem_cgroup_usage(memcg, false); - return res_counter_read_u64(&memcg->res, name); + counter = &memcg->memory; + break; case _MEMSWAP: - if (name == RES_USAGE) - return mem_cgroup_usage(memcg, true); - return res_counter_read_u64(&memcg->memsw, name); + counter = &memcg->memsw; + break; case _KMEM: - return res_counter_read_u64(&memcg->kmem, name); + counter = &memcg->kmem; break; default: BUG(); } + + switch (MEMFILE_ATTR(cft->private)) { + case RES_USAGE: + if (counter == &memcg->memory) + return mem_cgroup_usage(memcg, false); + if (counter == &memcg->memsw) + return mem_cgroup_usage(memcg, true); + return (u64)page_counter_read(counter) * PAGE_SIZE; + case RES_LIMIT: + return (u64)counter->limit * PAGE_SIZE; + case RES_MAX_USAGE: + return (u64)counter->watermark * PAGE_SIZE; + case RES_FAILCNT: + return counter->failcnt; + case RES_SOFT_LIMIT: + return (u64)memcg->soft_limit * PAGE_SIZE; + default: + BUG(); + } } #ifdef CONFIG_MEMCG_KMEM -/* should be called with activate_kmem_mutex held */ -static int __memcg_activate_kmem(struct mem_cgroup *memcg, - unsigned long long limit) +static int memcg_activate_kmem(struct mem_cgroup *memcg, + unsigned long nr_pages) { int err = 0; int memcg_id; @@ -4077,12 +3449,6 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, return 0; /* - * We are going to allocate memory for data shared by all memory - * cgroups so let's stop accounting here. 
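mem_cgroup_read_u64() above keeps the accounting in pages internally but still reports bytes through the cgroup files, scaling by PAGE_SIZE on the way out (failcnt being the exception). An illustrative user-space sketch of that read path, with a hypothetical counter type:

/*
 * Sketch of the read path: internal accounting is in pages, but the
 * cgroup files report bytes, so values are scaled by the page size on
 * the way out. Illustrative only.
 */
#include <stdio.h>

#define TOY_PAGE_SIZE 4096UL

enum { RES_USAGE, RES_LIMIT, RES_MAX_USAGE, RES_FAILCNT };

struct toy_counter {
    unsigned long count, limit, watermark, failcnt;
};

static unsigned long long toy_read_u64(const struct toy_counter *c, int attr)
{
    switch (attr) {
    case RES_USAGE:     return (unsigned long long)c->count * TOY_PAGE_SIZE;
    case RES_LIMIT:     return (unsigned long long)c->limit * TOY_PAGE_SIZE;
    case RES_MAX_USAGE: return (unsigned long long)c->watermark * TOY_PAGE_SIZE;
    case RES_FAILCNT:   return c->failcnt;   /* a count, not bytes */
    }
    return 0;
}

int main(void)
{
    struct toy_counter memory = { .count = 3, .limit = 8, .watermark = 5 };

    printf("usage_in_bytes=%llu limit_in_bytes=%llu\n",
           toy_read_u64(&memory, RES_USAGE), toy_read_u64(&memory, RES_LIMIT));
    return 0;
}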
- */ - memcg_stop_kmem_account(); - - /* * For simplicity, we won't allow this to be disabled. It also can't * be changed if the cgroup has children already, or if tasks had * already joined. @@ -4108,48 +3474,36 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, goto out; } - memcg->kmemcg_id = memcg_id; - INIT_LIST_HEAD(&memcg->memcg_slab_caches); - /* - * We couldn't have accounted to this cgroup, because it hasn't got the - * active bit set yet, so this should succeed. + * We couldn't have accounted to this cgroup, because it hasn't got + * activated yet, so this should succeed. */ - err = res_counter_set_limit(&memcg->kmem, limit); + err = page_counter_limit(&memcg->kmem, nr_pages); VM_BUG_ON(err); static_key_slow_inc(&memcg_kmem_enabled_key); /* - * Setting the active bit after enabling static branching will + * A memory cgroup is considered kmem-active as soon as it gets + * kmemcg_id. Setting the id after enabling static branching will * guarantee no one starts accounting before all call sites are * patched. */ - memcg_kmem_set_active(memcg); + memcg->kmemcg_id = memcg_id; out: - memcg_resume_kmem_account(); return err; } -static int memcg_activate_kmem(struct mem_cgroup *memcg, - unsigned long long limit) -{ - int ret; - - mutex_lock(&activate_kmem_mutex); - ret = __memcg_activate_kmem(memcg, limit); - mutex_unlock(&activate_kmem_mutex); - return ret; -} - static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long long val) + unsigned long limit) { int ret; + mutex_lock(&memcg_limit_mutex); if (!memcg_kmem_is_active(memcg)) - ret = memcg_activate_kmem(memcg, val); + ret = memcg_activate_kmem(memcg, limit); else - ret = res_counter_set_limit(&memcg->kmem, val); + ret = page_counter_limit(&memcg->kmem, limit); + mutex_unlock(&memcg_limit_mutex); return ret; } @@ -4161,19 +3515,19 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) if (!parent) return 0; - mutex_lock(&activate_kmem_mutex); + mutex_lock(&memcg_limit_mutex); /* * If the parent cgroup is not kmem-active now, it cannot be activated * after this point, because it has at least one child already. 
*/ if (memcg_kmem_is_active(parent)) - ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); - mutex_unlock(&activate_kmem_mutex); + ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX); + mutex_unlock(&memcg_limit_mutex); return ret; } #else static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long long val) + unsigned long limit) { return -EINVAL; } @@ -4187,110 +3541,69 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - enum res_type type; - int name; - unsigned long long val; + unsigned long nr_pages; int ret; buf = strstrip(buf); - type = MEMFILE_TYPE(of_cft(of)->private); - name = MEMFILE_ATTR(of_cft(of)->private); + ret = page_counter_memparse(buf, &nr_pages); + if (ret) + return ret; - switch (name) { + switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_LIMIT: if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ ret = -EINVAL; break; } - /* This function does all necessary parse...reuse it */ - ret = res_counter_memparse_write_strategy(buf, &val); - if (ret) + switch (MEMFILE_TYPE(of_cft(of)->private)) { + case _MEM: + ret = mem_cgroup_resize_limit(memcg, nr_pages); break; - if (type == _MEM) - ret = mem_cgroup_resize_limit(memcg, val); - else if (type == _MEMSWAP) - ret = mem_cgroup_resize_memsw_limit(memcg, val); - else if (type == _KMEM) - ret = memcg_update_kmem_limit(memcg, val); - else - return -EINVAL; - break; - case RES_SOFT_LIMIT: - ret = res_counter_memparse_write_strategy(buf, &val); - if (ret) + case _MEMSWAP: + ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); break; - /* - * For memsw, soft limits are hard to implement in terms - * of semantics, for now, we support soft limits for - * control without swap - */ - if (type == _MEM) - ret = res_counter_set_soft_limit(&memcg->res, val); - else - ret = -EINVAL; + case _KMEM: + ret = memcg_update_kmem_limit(memcg, nr_pages); + break; + } break; - default: - ret = -EINVAL; /* should be BUG() ? 
*/ + case RES_SOFT_LIMIT: + memcg->soft_limit = nr_pages; + ret = 0; break; } return ret ?: nbytes; } -static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, - unsigned long long *mem_limit, unsigned long long *memsw_limit) -{ - unsigned long long min_limit, min_memsw_limit, tmp; - - min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); - min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); - if (!memcg->use_hierarchy) - goto out; - - while (memcg->css.parent) { - memcg = mem_cgroup_from_css(memcg->css.parent); - if (!memcg->use_hierarchy) - break; - tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); - min_limit = min(min_limit, tmp); - tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); - min_memsw_limit = min(min_memsw_limit, tmp); - } -out: - *mem_limit = min_limit; - *memsw_limit = min_memsw_limit; -} - static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - int name; - enum res_type type; + struct page_counter *counter; - type = MEMFILE_TYPE(of_cft(of)->private); - name = MEMFILE_ATTR(of_cft(of)->private); + switch (MEMFILE_TYPE(of_cft(of)->private)) { + case _MEM: + counter = &memcg->memory; + break; + case _MEMSWAP: + counter = &memcg->memsw; + break; + case _KMEM: + counter = &memcg->kmem; + break; + default: + BUG(); + } - switch (name) { + switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_MAX_USAGE: - if (type == _MEM) - res_counter_reset_max(&memcg->res); - else if (type == _MEMSWAP) - res_counter_reset_max(&memcg->memsw); - else if (type == _KMEM) - res_counter_reset_max(&memcg->kmem); - else - return -EINVAL; + page_counter_reset_watermark(counter); break; case RES_FAILCNT: - if (type == _MEM) - res_counter_reset_failcnt(&memcg->res); - else if (type == _MEMSWAP) - res_counter_reset_failcnt(&memcg->memsw); - else if (type == _KMEM) - res_counter_reset_failcnt(&memcg->kmem); - else - return -EINVAL; + counter->failcnt = 0; break; + default: + BUG(); } return nbytes; @@ -4379,17 +3692,15 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) } #endif /* CONFIG_NUMA */ -static inline void mem_cgroup_lru_names_not_uptodate(void) -{ - BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); -} - static int memcg_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long memory, memsw; struct mem_cgroup *mi; unsigned int i; + BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) continue; @@ -4406,14 +3717,16 @@ static int memcg_stat_show(struct seq_file *m, void *v) mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); /* Hierarchical information */ - { - unsigned long long limit, memsw_limit; - memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); - seq_printf(m, "hierarchical_memory_limit %llu\n", limit); - if (do_swap_account) - seq_printf(m, "hierarchical_memsw_limit %llu\n", - memsw_limit); + memory = memsw = PAGE_COUNTER_MAX; + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { + memory = min(memory, mi->memory.limit); + memsw = min(memsw, mi->memsw.limit); } + seq_printf(m, "hierarchical_memory_limit %llu\n", + (u64)memory * PAGE_SIZE); + if (do_swap_account) + seq_printf(m, "hierarchical_memsw_limit %llu\n", + (u64)memsw * PAGE_SIZE); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { long long val = 0; @@ -4497,7 +3810,7 @@ static int 
mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) { struct mem_cgroup_threshold_ary *t; - u64 usage; + unsigned long usage; int i; rcu_read_lock(); @@ -4596,10 +3909,11 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, { struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - u64 threshold, usage; + unsigned long threshold; + unsigned long usage; int i, size, ret; - ret = res_counter_memparse_write_strategy(args, &threshold); + ret = page_counter_memparse(args, &threshold); if (ret) return ret; @@ -4689,7 +4003,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, { struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - u64 usage; + unsigned long usage; int i, j, size; mutex_lock(&memcg->thresholds_lock); @@ -4843,7 +4157,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { int ret; - memcg->kmemcg_id = -1; ret = memcg_propagate_kmem(memcg); if (ret) return ret; @@ -4853,42 +4166,9 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) static void memcg_destroy_kmem(struct mem_cgroup *memcg) { + memcg_unregister_all_caches(memcg); mem_cgroup_sockets_destroy(memcg); } - -static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) -{ - if (!memcg_kmem_is_active(memcg)) - return; - - /* - * kmem charges can outlive the cgroup. In the case of slab - * pages, for instance, a page contain objects from various - * processes. As we prevent from taking a reference for every - * such allocation we have to be careful when doing uncharge - * (see memcg_uncharge_kmem) and here during offlining. - * - * The idea is that that only the _last_ uncharge which sees - * the dead memcg will drop the last reference. An additional - * reference is taken here before the group is marked dead - * which is then paired with css_put during uncharge resp. here. - * - * Although this might sound strange as this path is called from - * css_offline() when the referencemight have dropped down to 0 and - * shouldn't be incremented anymore (css_tryget_online() would - * fail) we do not have other options because of the kmem - * allocations lifetime. - */ - css_get(&memcg->css); - - memcg_kmem_mark_dead(memcg); - - if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) - return; - - if (memcg_kmem_test_and_clear_dead(memcg)) - css_put(&memcg->css); -} #else static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { @@ -4898,10 +4178,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) static void memcg_destroy_kmem(struct mem_cgroup *memcg) { } - -static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) -{ -} #endif /* @@ -5064,7 +4340,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, * * DO NOT ADD NEW FILES. */ - name = cfile.file->f_dentry->d_name.name; + name = cfile.file->f_path.dentry->d_name.name; if (!strcmp(name, "memory.usage_in_bytes")) { event->register_event = mem_cgroup_usage_register_event; @@ -5088,7 +4364,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, * automatically removed on cgroup destruction but the removal is * asynchronous, so take an extra ref on @css. 
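The threshold code above parses user input with page_counter_memparse() and stores thresholds in pages rather than bytes. A hedged sketch of what such a parse helper amounts to — assuming a plain byte string, with "-1" meaning "no limit" — using invented names and user-space parsing:

/*
 * Toy memparse-to-pages helper: accept a byte value (or "-1" for
 * "unlimited") and round down to whole pages. Not the kernel parser.
 */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TOY_PAGE_SHIFT 12
#define TOY_COUNTER_MAX (ULONG_MAX >> TOY_PAGE_SHIFT)

static int toy_memparse(const char *buf, unsigned long *nr_pages)
{
    unsigned long long bytes;
    char *end;

    if (!strcmp(buf, "-1")) {            /* "unlimited" */
        *nr_pages = TOY_COUNTER_MAX;
        return 0;
    }
    bytes = strtoull(buf, &end, 0);
    if (*end != '\0')
        return -1;
    *nr_pages = bytes >> TOY_PAGE_SHIFT; /* bytes -> pages, rounded down */
    return 0;
}

int main(void)
{
    unsigned long pages;

    if (!toy_memparse("1048576", &pages))
        printf("1048576 bytes -> %lu pages\n", pages);
    if (!toy_memparse("-1", &pages))
        printf("-1 -> %lu pages (max)\n", pages);
    return 0;
}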
*/ - cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent, + cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, &memory_cgrp_subsys); ret = -EINVAL; if (IS_ERR(cfile_css)) @@ -5228,7 +4504,10 @@ static struct cftype mem_cgroup_files[] = { #ifdef CONFIG_SLABINFO { .name = "kmem.slabinfo", - .seq_show = mem_cgroup_slabinfo_read, + .seq_start = slab_start, + .seq_next = slab_next, + .seq_stop = slab_stop, + .seq_show = memcg_slab_show, }, #endif #endif @@ -5343,17 +4622,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) free_percpu(memcg->stat); - /* - * We need to make sure that (at least for now), the jump label - * destruction code runs outside of the cgroup lock. This is because - * get_online_cpus(), which is called from the static_branch update, - * can't be called inside the cgroup_lock. cpusets are the ones - * enforcing this dependency, so if they ever change, we might as well. - * - * schedule_work() will guarantee this happens. Be careful if you need - * to move this code around, and make sure it is outside - * the cgroup_lock. - */ disarm_static_keys(memcg); kfree(memcg); } @@ -5363,9 +4631,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) */ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { - if (!memcg->res.parent) + if (!memcg->memory.parent) return NULL; - return mem_cgroup_from_res_counter(memcg->res.parent, res); + return mem_cgroup_from_counter(memcg->memory.parent, memory); } EXPORT_SYMBOL(parent_mem_cgroup); @@ -5410,9 +4678,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) /* root ? */ if (parent_css == NULL) { root_mem_cgroup = memcg; - res_counter_init(&memcg->res, NULL); - res_counter_init(&memcg->memsw, NULL); - res_counter_init(&memcg->kmem, NULL); + page_counter_init(&memcg->memory, NULL); + page_counter_init(&memcg->memsw, NULL); + page_counter_init(&memcg->kmem, NULL); } memcg->last_scanned_node = MAX_NUMNODES; @@ -5423,6 +4691,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) vmpressure_init(&memcg->vmpressure); INIT_LIST_HEAD(&memcg->event_list); spin_lock_init(&memcg->event_list_lock); +#ifdef CONFIG_MEMCG_KMEM + memcg->kmemcg_id = -1; + INIT_LIST_HEAD(&memcg->memcg_slab_caches); +#endif return &memcg->css; @@ -5451,18 +4723,18 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->swappiness = mem_cgroup_swappiness(parent); if (parent->use_hierarchy) { - res_counter_init(&memcg->res, &parent->res); - res_counter_init(&memcg->memsw, &parent->memsw); - res_counter_init(&memcg->kmem, &parent->kmem); + page_counter_init(&memcg->memory, &parent->memory); + page_counter_init(&memcg->memsw, &parent->memsw); + page_counter_init(&memcg->kmem, &parent->kmem); /* * No need to take a reference to the parent because cgroup * core guarantees its existence. */ } else { - res_counter_init(&memcg->res, NULL); - res_counter_init(&memcg->memsw, NULL); - res_counter_init(&memcg->kmem, NULL); + page_counter_init(&memcg->memory, NULL); + page_counter_init(&memcg->memsw, NULL); + page_counter_init(&memcg->kmem, NULL); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -5487,29 +4759,10 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) return 0; } -/* - * Announce all parents that a group from their hierarchy is gone. 
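In css_online() above, a child's counters are initialized against the parent's (page_counter_init(&memcg->memory, &parent->memory)) when use_hierarchy is set, so a charge has to fit at every ancestor. A toy model of such parent-chained counters (simplified: no rollback path, invented names):

/*
 * Toy model of hierarchical counters: each counter may have a parent,
 * and a charge must fit at every level up the chain.
 */
#include <stdio.h>

struct toy_counter {
    unsigned long count, limit;
    struct toy_counter *parent;
};

static int toy_try_charge(struct toy_counter *c, unsigned long nr_pages)
{
    struct toy_counter *level;

    for (level = c; level; level = level->parent) {
        if (level->count + nr_pages > level->limit)
            return -1;           /* over limit somewhere up the chain */
    }
    for (level = c; level; level = level->parent)
        level->count += nr_pages;
    return 0;
}

int main(void)
{
    struct toy_counter root  = { .limit = 8 };
    struct toy_counter child = { .limit = 16, .parent = &root };

    printf("charge 6: %d\n", toy_try_charge(&child, 6));   /* fits everywhere */
    printf("charge 4: %d\n", toy_try_charge(&child, 4));   /* root limit hit */
    return 0;
}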
- */ -static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) -{ - struct mem_cgroup *parent = memcg; - - while ((parent = parent_mem_cgroup(parent))) - mem_cgroup_iter_invalidate(parent); - - /* - * if the root memcg is not hierarchical we have to check it - * explicitely. - */ - if (!root_mem_cgroup->use_hierarchy) - mem_cgroup_iter_invalidate(root_mem_cgroup); -} - static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event, *tmp; - struct cgroup_subsys_state *iter; /* * Unregister events and notify userspace. @@ -5523,60 +4776,12 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); - kmem_cgroup_css_offline(memcg); - - mem_cgroup_invalidate_reclaim_iterators(memcg); - - /* - * This requires that offlining is serialized. Right now that is - * guaranteed because css_killed_work_fn() holds the cgroup_mutex. - */ - css_for_each_descendant_post(iter, css) - mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); - - memcg_unregister_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - /* - * XXX: css_offline() would be where we should reparent all - * memory to prepare the cgroup for destruction. However, - * memcg does not do css_tryget_online() and res_counter charging - * under the same RCU lock region, which means that charging - * could race with offlining. Offlining only happens to - * cgroups with no tasks in them but charges can show up - * without any tasks from the swapin path when the target - * memcg is looked up from the swapout record and not from the - * current task as it usually is. A race like this can leak - * charges and put pages with stale cgroup pointers into - * circulation: - * - * #0 #1 - * lookup_swap_cgroup_id() - * rcu_read_lock() - * mem_cgroup_lookup() - * css_tryget_online() - * rcu_read_unlock() - * disable css_tryget_online() - * call_rcu() - * offline_css() - * reparent_charges() - * res_counter_charge() - * css_put() - * css_free() - * pc->mem_cgroup = dead memcg - * add page to lru - * - * The bulk of the charges are still moved in offline_css() to - * avoid pinning a lot of pages in case a long-term reference - * like a swapout record is deferring the css_free() to long - * after offlining. 
But this makes sure we catch any charges - * made after offlining: - */ - mem_cgroup_reparent_charges(memcg); memcg_destroy_kmem(memcg); __mem_cgroup_free(memcg); @@ -5599,10 +4804,10 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - mem_cgroup_resize_limit(memcg, ULLONG_MAX); - mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX); - memcg_update_kmem_limit(memcg, ULLONG_MAX); - res_counter_set_soft_limit(&memcg->res, ULLONG_MAX); + mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); + mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); + memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); + memcg->soft_limit = 0; } #ifdef CONFIG_MMU @@ -5758,7 +4963,6 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { struct page *page = NULL; - struct page_cgroup *pc; enum mc_target_type ret = MC_TARGET_NONE; swp_entry_t ent = { .val = 0 }; @@ -5772,13 +4976,12 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, if (!page && !ent.val) return ret; if (page) { - pc = lookup_page_cgroup(page); /* * Do only loose check w/o serialization. - * mem_cgroup_move_account() checks the pc is valid or + * mem_cgroup_move_account() checks the page is valid or * not under LRU exclusion. */ - if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { + if (page->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; if (target) target->page = page; @@ -5806,15 +5009,13 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, union mc_target *target) { struct page *page = NULL; - struct page_cgroup *pc; enum mc_target_type ret = MC_TARGET_NONE; page = pmd_page(pmd); VM_BUG_ON_PAGE(!page || !PageHead(page), page); if (!move_anon()) return ret; - pc = lookup_page_cgroup(page); - if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { + if (page->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; if (target) { get_page(page); @@ -5897,7 +5098,6 @@ static void __mem_cgroup_clear_mc(void) { struct mem_cgroup *from = mc.from; struct mem_cgroup *to = mc.to; - int i; /* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { @@ -5916,19 +5116,17 @@ static void __mem_cgroup_clear_mc(void) if (mc.moved_swap) { /* uncharge swap account from the old cgroup */ if (!mem_cgroup_is_root(mc.from)) - res_counter_uncharge(&mc.from->memsw, - PAGE_SIZE * mc.moved_swap); - - for (i = 0; i < mc.moved_swap; i++) - css_put(&mc.from->css); + page_counter_uncharge(&mc.from->memsw, mc.moved_swap); /* - * we charged both to->res and to->memsw, so we should - * uncharge to->res. + * we charged both to->memory and to->memsw, so we + * should uncharge to->memory. */ if (!mem_cgroup_is_root(mc.to)) - res_counter_uncharge(&mc.to->res, - PAGE_SIZE * mc.moved_swap); + page_counter_uncharge(&mc.to->memory, mc.moved_swap); + + css_put_many(&mc.from->css, mc.moved_swap); + /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } @@ -5939,8 +5137,6 @@ static void __mem_cgroup_clear_mc(void) static void mem_cgroup_clear_mc(void) { - struct mem_cgroup *from = mc.from; - /* * we must clear moving_task before waking up waiters at the end of * task migration. 
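__mem_cgroup_clear_mc() above drops the moved-swap references with a single css_put_many() call instead of looping css_put() once per page. A minimal sketch of that bulk get/put on a toy refcount:

/*
 * Bulk reference counting: references are taken and dropped one per
 * charged page, but in a single arithmetic step. Toy model only.
 */
#include <assert.h>
#include <stdio.h>

struct toy_css { long refcnt; };

static void toy_css_get_many(struct toy_css *css, unsigned long n) { css->refcnt += n; }
static void toy_css_put_many(struct toy_css *css, unsigned long n)
{
    assert(css->refcnt >= (long)n);
    css->refcnt -= n;
}

int main(void)
{
    struct toy_css css = { .refcnt = 1 };
    unsigned long nr_pages = 32;

    toy_css_get_many(&css, nr_pages);   /* taken when the pages are charged */
    toy_css_put_many(&css, nr_pages);   /* dropped in one go at uncharge */
    printf("refcnt=%ld\n", css.refcnt);
    return 0;
}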
@@ -5951,7 +5147,6 @@ static void mem_cgroup_clear_mc(void) mc.from = NULL; mc.to = NULL; spin_unlock(&mc.lock); - mem_cgroup_end_move(from); } static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, @@ -5984,7 +5179,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, VM_BUG_ON(mc.precharge); VM_BUG_ON(mc.moved_charge); VM_BUG_ON(mc.moved_swap); - mem_cgroup_start_move(from); + spin_lock(&mc.lock); mc.from = from; mc.to = memcg; @@ -6004,7 +5199,8 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { - mem_cgroup_clear_mc(); + if (mc.to) + mem_cgroup_clear_mc(); } static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, @@ -6018,7 +5214,6 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, enum mc_target_type target_type; union mc_target target; struct page *page; - struct page_cgroup *pc; /* * We don't take compound_lock() here but no race with splitting thp @@ -6039,9 +5234,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, if (target_type == MC_TARGET_PAGE) { page = target.page; if (!isolate_lru_page(page)) { - pc = lookup_page_cgroup(page); if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, - pc, mc.from, mc.to)) { + mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; } @@ -6069,9 +5263,7 @@ retry: page = target.page; if (isolate_lru_page(page)) goto put; - pc = lookup_page_cgroup(page); - if (!mem_cgroup_move_account(page, 1, pc, - mc.from, mc.to)) { + if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; @@ -6115,6 +5307,13 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) struct vm_area_struct *vma; lru_add_drain_all(); + /* + * Signal mem_cgroup_begin_page_stat() to take the memcg's + * move_lock while we're moving its pages to another memcg. + * Then wait for already started RCU-only updates to finish. 
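The new comment above describes the handshake: the mover bumps mc.from->moving_account and waits for RCU readers, so statistics updaters that sampled the flag as clear have finished, while later updaters fall back to the move_lock (the mem_cgroup_begin_page_stat() side). A rough user-space model of that pattern — the synchronize_rcu() step is only indicated by a comment, and all names are invented:

/*
 * Sketch of the moving_account handshake: stat updaters normally run
 * lock-free, but while a move is in flight they take the move_lock.
 */
#include <pthread.h>
#include <stdio.h>

struct toy_memcg {
    int moving_account;            /* incremented around charge moving */
    pthread_mutex_t move_lock;
    long file_mapped;
};

static void toy_update_stat(struct toy_memcg *memcg, long diff)
{
    int locked = 0;

    if (memcg->moving_account) {   /* slow path while pages are moving */
        pthread_mutex_lock(&memcg->move_lock);
        locked = 1;
    }
    memcg->file_mapped += diff;
    if (locked)
        pthread_mutex_unlock(&memcg->move_lock);
}

static void toy_move_charges(struct toy_memcg *from)
{
    from->moving_account++;
    /* the kernel would synchronize_rcu() here so updaters that saw
     * moving_account == 0 have finished before the move proceeds */
    pthread_mutex_lock(&from->move_lock);
    /* ... repoint page->mem_cgroup and shift statistics ... */
    pthread_mutex_unlock(&from->move_lock);
    from->moving_account--;
}

int main(void)
{
    struct toy_memcg memcg = { 0, PTHREAD_MUTEX_INITIALIZER, 0 };

    toy_update_stat(&memcg, 1);
    toy_move_charges(&memcg);
    printf("file_mapped=%ld\n", memcg.file_mapped);
    return 0;
}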
+ */ + atomic_inc(&mc.from->moving_account); + synchronize_rcu(); retry: if (unlikely(!down_read_trylock(&mm->mmap_sem))) { /* @@ -6147,6 +5346,7 @@ retry: break; } up_read(&mm->mmap_sem); + atomic_dec(&mc.from->moving_account); } static void mem_cgroup_move_task(struct cgroup_subsys_state *css, @@ -6250,7 +5450,7 @@ static void __init enable_swap_cgroup(void) */ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { - struct page_cgroup *pc; + struct mem_cgroup *memcg; unsigned short oldid; VM_BUG_ON_PAGE(PageLRU(page), page); @@ -6259,20 +5459,26 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!do_swap_account) return; - pc = lookup_page_cgroup(page); + memcg = page->mem_cgroup; /* Readahead page, never charged */ - if (!PageCgroupUsed(pc)) + if (!memcg) return; - VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); - - oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); + mem_cgroup_swap_statistics(memcg, true); - pc->flags &= ~PCG_MEMSW; - css_get(&pc->mem_cgroup->css); - mem_cgroup_swap_statistics(pc->mem_cgroup, true); + page->mem_cgroup = NULL; + + if (!mem_cgroup_is_root(memcg)) + page_counter_uncharge(&memcg->memory, 1); + + /* XXX: caller holds IRQ-safe mapping->tree_lock */ + VM_BUG_ON(!irqs_disabled()); + + mem_cgroup_charge_statistics(memcg, page, -1); + memcg_check_events(memcg, page); } /** @@ -6294,7 +5500,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) memcg = mem_cgroup_lookup(id); if (memcg) { if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + page_counter_uncharge(&memcg->memsw, 1); mem_cgroup_swap_statistics(memcg, false); css_put(&memcg->css); } @@ -6330,7 +5536,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, goto out; if (PageSwapCache(page)) { - struct page_cgroup *pc = lookup_page_cgroup(page); /* * Every swap fault against a single page tries to charge the * page, bail as early as possible. shmem_unuse() encounters @@ -6338,7 +5543,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, * the page lock, which serializes swap cache removal, which * in turn serializes uncharging. 
*/ - if (PageCgroupUsed(pc)) + if (page->mem_cgroup) goto out; } @@ -6452,19 +5657,16 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) } static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, - unsigned long nr_mem, unsigned long nr_memsw, unsigned long nr_anon, unsigned long nr_file, unsigned long nr_huge, struct page *dummy_page) { + unsigned long nr_pages = nr_anon + nr_file; unsigned long flags; if (!mem_cgroup_is_root(memcg)) { - if (nr_mem) - res_counter_uncharge(&memcg->res, - nr_mem * PAGE_SIZE); - if (nr_memsw) - res_counter_uncharge(&memcg->memsw, - nr_memsw * PAGE_SIZE); + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_uncharge(&memcg->memsw, nr_pages); memcg_oom_recover(memcg); } @@ -6473,27 +5675,27 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); - __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); + __this_cpu_add(memcg->stat->nr_page_events, nr_pages); memcg_check_events(memcg, dummy_page); local_irq_restore(flags); + + if (!mem_cgroup_is_root(memcg)) + css_put_many(&memcg->css, nr_pages); } static void uncharge_list(struct list_head *page_list) { struct mem_cgroup *memcg = NULL; - unsigned long nr_memsw = 0; unsigned long nr_anon = 0; unsigned long nr_file = 0; unsigned long nr_huge = 0; unsigned long pgpgout = 0; - unsigned long nr_mem = 0; struct list_head *next; struct page *page; next = page_list->next; do { unsigned int nr_pages = 1; - struct page_cgroup *pc; page = list_entry(next, struct page, lru); next = page->lru.next; @@ -6501,24 +5703,22 @@ static void uncharge_list(struct list_head *page_list) VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); - pc = lookup_page_cgroup(page); - if (!PageCgroupUsed(pc)) + if (!page->mem_cgroup) continue; /* * Nobody should be changing or seriously looking at - * pc->mem_cgroup and pc->flags at this point, we have - * fully exclusive access to the page. + * page->mem_cgroup at this point, we have fully + * exclusive access to the page. 
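uncharge_batch() above folds memory and memsw uncharging into a single page count, and the uncharge_list() walk that continues below accumulates pages per memcg, flushing whenever the owning group changes. A toy model of that batching with made-up data:

/*
 * Model of the batched uncharge: pages are walked in order, counts are
 * accumulated while they belong to the same group, and a single flush
 * uncharges the whole batch.
 */
#include <stdio.h>

struct toy_memcg { const char *name; unsigned long usage; };

static void toy_flush_batch(struct toy_memcg *memcg, unsigned long nr_pages)
{
    memcg->usage -= nr_pages;      /* one counter update per batch */
    printf("uncharged %lu pages from %s\n", nr_pages, memcg->name);
}

int main(void)
{
    struct toy_memcg a = { "A", 3 }, b = { "B", 2 };
    struct toy_memcg *pages[] = { &a, &a, &a, &b, &b };
    struct toy_memcg *batch_memcg = NULL;
    unsigned long batch = 0;

    for (unsigned i = 0; i < sizeof(pages) / sizeof(pages[0]); i++) {
        if (batch_memcg && pages[i] != batch_memcg) {
            toy_flush_batch(batch_memcg, batch);
            batch = 0;
        }
        batch_memcg = pages[i];
        batch++;
    }
    if (batch_memcg)
        toy_flush_batch(batch_memcg, batch);
    return 0;
}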
*/ - if (memcg != pc->mem_cgroup) { + if (memcg != page->mem_cgroup) { if (memcg) { - uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, - nr_anon, nr_file, nr_huge, page); - pgpgout = nr_mem = nr_memsw = 0; - nr_anon = nr_file = nr_huge = 0; + uncharge_batch(memcg, pgpgout, nr_anon, nr_file, + nr_huge, page); + pgpgout = nr_anon = nr_file = nr_huge = 0; } - memcg = pc->mem_cgroup; + memcg = page->mem_cgroup; } if (PageTransHuge(page)) { @@ -6532,18 +5732,14 @@ static void uncharge_list(struct list_head *page_list) else nr_file += nr_pages; - if (pc->flags & PCG_MEM) - nr_mem += nr_pages; - if (pc->flags & PCG_MEMSW) - nr_memsw += nr_pages; - pc->flags = 0; + page->mem_cgroup = NULL; pgpgout++; } while (next != page_list); if (memcg) - uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, - nr_anon, nr_file, nr_huge, page); + uncharge_batch(memcg, pgpgout, nr_anon, nr_file, + nr_huge, page); } /** @@ -6555,14 +5751,11 @@ static void uncharge_list(struct list_head *page_list) */ void mem_cgroup_uncharge(struct page *page) { - struct page_cgroup *pc; - if (mem_cgroup_disabled()) return; /* Don't touch page->lru of any random page, pre-check: */ - pc = lookup_page_cgroup(page); - if (!PageCgroupUsed(pc)) + if (!page->mem_cgroup) return; INIT_LIST_HEAD(&page->lru); @@ -6598,7 +5791,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, bool lrucare) { - struct page_cgroup *pc; + struct mem_cgroup *memcg; int isolated; VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); @@ -6613,27 +5806,28 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, return; /* Page cache replacement: new page already charged? */ - pc = lookup_page_cgroup(newpage); - if (PageCgroupUsed(pc)) + if (newpage->mem_cgroup) return; - /* Re-entrant migration: old page already uncharged? */ - pc = lookup_page_cgroup(oldpage); - if (!PageCgroupUsed(pc)) + /* + * Swapcache readahead pages can get migrated before being + * charged, and migration from compaction can happen to an + * uncharged page when the PFN walker finds a page that + * reclaim just put back on the LRU but has not released yet. + */ + memcg = oldpage->mem_cgroup; + if (!memcg) return; - VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); - VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); - if (lrucare) lock_page_lru(oldpage, &isolated); - pc->flags = 0; + oldpage->mem_cgroup = NULL; if (lrucare) unlock_page_lru(oldpage, isolated); - commit_charge(newpage, pc->mem_cgroup, lrucare); + commit_charge(newpage, memcg, lrucare); } /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8639f6b28746..feb803bf3443 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -233,25 +233,20 @@ void shake_page(struct page *p, int access) lru_add_drain_all(); if (PageLRU(p)) return; - drain_all_pages(); + drain_all_pages(page_zone(p)); if (PageLRU(p) || is_free_buddy_page(p)) return; } /* - * Only call shrink_slab here (which would also shrink other caches) if - * access is not potentially fatal. + * Only call shrink_node_slabs here (which would also shrink + * other caches) if access is not potentially fatal. 
*/ if (access) { int nr; int nid = page_to_nid(p); do { - struct shrink_control shrink = { - .gfp_mask = GFP_KERNEL, - }; - node_set(nid, shrink.nodes_to_scan); - - nr = shrink_slab(&shrink, 1000, 1000); + nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000); if (page_count(p) == 1) break; } while (nr > 10); @@ -466,7 +461,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, struct task_struct *tsk; struct address_space *mapping = page->mapping; - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_read(mapping); read_lock(&tasklist_lock); for_each_process(tsk) { pgoff_t pgoff = page_to_pgoff(page); @@ -488,7 +483,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, } } read_unlock(&tasklist_lock); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_read(mapping); } /* @@ -860,7 +855,6 @@ static int page_action(struct page_state *ps, struct page *p, int count; result = ps->action(p, pfn); - action_result(pfn, ps->msg, result); count = page_count(p) - 1; if (ps->action == me_swapcache_dirty && result == DELAYED) @@ -871,6 +865,7 @@ static int page_action(struct page_state *ps, struct page *p, pfn, ps->msg, count); result = FAILED; } + action_result(pfn, ps->msg, result); /* Could do more checks here if page looks ok */ /* @@ -1661,7 +1656,7 @@ static int __soft_offline_page(struct page *page, int flags) if (!is_free_buddy_page(page)) lru_add_drain_all(); if (!is_free_buddy_page(page)) - drain_all_pages(); + drain_all_pages(page_zone(page)); SetPageHWPoison(page); if (!is_free_buddy_page(page)) pr_info("soft offline: %#lx: page leaked\n", diff --git a/mm/memory.c b/mm/memory.c index d86aa88902a0..649e7d440bd7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -220,9 +220,6 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long /* Is it from 0 to ~0? */ tlb->fullmm = !(start | (end+1)); tlb->need_flush_all = 0; - tlb->start = start; - tlb->end = end; - tlb->need_flush = 0; tlb->local.next = NULL; tlb->local.nr = 0; tlb->local.max = ARRAY_SIZE(tlb->__pages); @@ -232,15 +229,18 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb->batch = NULL; #endif + + __tlb_reset_range(tlb); } static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { - tlb->need_flush = 0; tlb_flush(tlb); + mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end); #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb_table_flush(tlb); #endif + __tlb_reset_range(tlb); } static void tlb_flush_mmu_free(struct mmu_gather *tlb) @@ -256,8 +256,9 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb) void tlb_flush_mmu(struct mmu_gather *tlb) { - if (!tlb->need_flush) + if (!tlb->end) return; + tlb_flush_mmu_tlbonly(tlb); tlb_flush_mmu_free(tlb); } @@ -292,7 +293,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { struct mmu_gather_batch *batch; - VM_BUG_ON(!tlb->need_flush); + VM_BUG_ON(!tlb->end); batch = tlb->active; batch->pages[batch->nr++] = page; @@ -359,8 +360,6 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct mmu_table_batch **batch = &tlb->batch; - tlb->need_flush = 1; - /* * When there's less then two users of this mm there cannot be a * concurrent page-table walk. @@ -815,20 +814,20 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!pte_file(pte)) { swp_entry_t entry = pte_to_swp_entry(pte); - if (swap_duplicate(entry) < 0) - return entry.val; - - /* make sure dst_mm is on swapoff's mmlist. 
*/ - if (unlikely(list_empty(&dst_mm->mmlist))) { - spin_lock(&mmlist_lock); - if (list_empty(&dst_mm->mmlist)) - list_add(&dst_mm->mmlist, - &src_mm->mmlist); - spin_unlock(&mmlist_lock); - } - if (likely(!non_swap_entry(entry))) + if (likely(!non_swap_entry(entry))) { + if (swap_duplicate(entry) < 0) + return entry.val; + + /* make sure dst_mm is on swapoff's mmlist. */ + if (unlikely(list_empty(&dst_mm->mmlist))) { + spin_lock(&mmlist_lock); + if (list_empty(&dst_mm->mmlist)) + list_add(&dst_mm->mmlist, + &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } rss[MM_SWAPENTS]++; - else if (is_migration_entry(entry)) { + } else if (is_migration_entry(entry)) { page = migration_entry_to_page(entry); if (PageAnon(page)) @@ -1186,20 +1185,8 @@ again: arch_leave_lazy_mmu_mode(); /* Do the actual TLB flush before dropping ptl */ - if (force_flush) { - unsigned long old_end; - - /* - * Flush the TLB just for the previous segment, - * then update the range to be the remaining - * TLB range. - */ - old_end = tlb->end; - tlb->end = addr; + if (force_flush) tlb_flush_mmu_tlbonly(tlb); - tlb->start = addr; - tlb->end = old_end; - } pte_unmap_unlock(start_pte, ptl); /* @@ -1340,9 +1327,9 @@ static void unmap_single_vma(struct mmu_gather *tlb, * safe to do nothing in this case. */ if (vma->vm_file) { - mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); + i_mmap_lock_write(vma->vm_file->f_mapping); __unmap_hugepage_range_final(tlb, vma, start, end, NULL); - mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + i_mmap_unlock_write(vma->vm_file->f_mapping); } } else unmap_page_range(tlb, vma, start, end, details); @@ -2234,7 +2221,7 @@ gotten: * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush(vma, address, page_table); + ptep_clear_flush_notify(vma, address, page_table); page_add_new_anon_rmap(new_page, vma, address); mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); @@ -2391,12 +2378,12 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_read(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_read(mapping); } EXPORT_SYMBOL(unmap_mapping_range); @@ -2641,7 +2628,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_SIGBUS; /* Use the zero-page for reads */ - if (!(flags & FAULT_FLAG_WRITE)) { + if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), vma->vm_page_prot)); page_table = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -3009,6 +2996,12 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (set_page_dirty(fault_page)) dirtied = 1; + /* + * Take a local copy of the address_space - page.mapping may be zeroed + * by truncate after unlock_page(). The address_space itself remains + * pinned by vma->vm_file's reference. We rely on unlock_page()'s + * release semantics to prevent the compiler from undoing this copying. 
+ */ mapping = fault_page->mapping; unlock_page(fault_page); if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { @@ -3388,6 +3381,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } +EXPORT_SYMBOL_GPL(handle_mm_fault); #ifndef __PAGETABLE_PUD_FOLDED /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 252e1dbbed86..9fab10795bea 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -31,6 +31,7 @@ #include <linux/stop_machine.h> #include <linux/hugetlb.h> #include <linux/memblock.h> +#include <linux/bootmem.h> #include <asm/tlbflush.h> @@ -1066,6 +1067,16 @@ out: } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ +static void reset_node_present_pages(pg_data_t *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + z->present_pages = 0; + + pgdat->node_present_pages = 0; +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) { @@ -1096,6 +1107,21 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) build_all_zonelists(pgdat, NULL); mutex_unlock(&zonelists_mutex); + /* + * zone->managed_pages is set to an approximate value in + * free_area_init_core(), which will cause + * /sys/device/system/node/nodeX/meminfo has wrong data. + * So reset it to 0 before any memory is onlined. + */ + reset_node_managed_pages(pgdat); + + /* + * When memory is hot-added, all the memory is in offline state. So + * clear all zones' present_pages because they will be updated in + * online_pages() and offline_pages(). + */ + reset_node_present_pages(pgdat); + return pgdat; } @@ -1699,7 +1725,7 @@ repeat: if (drain) { lru_add_drain_all(); cond_resched(); - drain_all_pages(); + drain_all_pages(zone); } pfn = scan_movable_pages(start_pfn, end_pfn); @@ -1721,7 +1747,7 @@ repeat: lru_add_drain_all(); yield(); /* drain pcp pages, this is synchronous. */ - drain_all_pages(); + drain_all_pages(zone); /* * dissolve free hugepages in the memory block before doing offlining * actually in order to make hugetlbfs's object counting consistent. 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e58725aff7e9..0e0961b8c39c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -162,12 +162,6 @@ static const struct mempolicy_operations { enum mpol_rebind_step step); } mpol_ops[MPOL_MAX]; -/* Check that the nodemask contains at least one populated zone */ -static int is_valid_nodemask(const nodemask_t *nodemask) -{ - return nodes_intersects(*nodemask, node_states[N_MEMORY]); -} - static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { return pol->flags & MPOL_MODE_FLAGS; @@ -202,7 +196,7 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) { - if (!is_valid_nodemask(nodes)) + if (nodes_empty(*nodes)) return -EINVAL; pol->v.nodes = *nodes; return 0; @@ -234,7 +228,7 @@ static int mpol_set_nodemask(struct mempolicy *pol, nodes = NULL; /* explicit local allocation */ else { if (pol->flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1); + mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); else nodes_and(nsc->mask2, *nodes, nsc->mask1); @@ -1047,10 +1041,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, down_read(&mm->mmap_sem); - err = migrate_vmas(mm, from, to, flags); - if (err) - goto out; - /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' @@ -1130,7 +1120,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, if (err < 0) break; } -out: up_read(&mm->mmap_sem); if (err < 0) return err; diff --git a/mm/migrate.c b/mm/migrate.c index 01439953abf5..344cdf692fc8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -746,7 +746,7 @@ static int fallback_migrate_page(struct address_space *mapping, * MIGRATEPAGE_SUCCESS - success */ static int move_to_new_page(struct page *newpage, struct page *page, - int remap_swapcache, enum migrate_mode mode) + int page_was_mapped, enum migrate_mode mode) { struct address_space *mapping; int rc; @@ -784,7 +784,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, newpage->mapping = NULL; } else { mem_cgroup_migrate(page, newpage, false); - if (remap_swapcache) + if (page_was_mapped) remove_migration_ptes(page, newpage); page->mapping = NULL; } @@ -798,7 +798,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, int force, enum migrate_mode mode) { int rc = -EAGAIN; - int remap_swapcache = 1; + int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; if (!trylock_page(page)) { @@ -870,7 +870,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * migrated but are not remapped when migration * completes */ - remap_swapcache = 0; } else { goto out_unlock; } @@ -910,13 +909,17 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } /* Establish migration ptes or remove ptes */ - try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + if (page_mapped(page)) { + try_to_unmap(page, + TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + page_was_mapped = 1; + } skip_unmap: if (!page_mapped(page)) - rc = move_to_new_page(newpage, page, remap_swapcache, mode); + rc = move_to_new_page(newpage, page, page_was_mapped, mode); - if (rc && remap_swapcache) + if (rc && page_was_mapped) remove_migration_ptes(page, page); /* Drop an anon_vma reference if we took one */ @@ -1017,6 +1020,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, { int rc = 0; 
int *result = NULL; + int page_was_mapped = 0; struct page *new_hpage; struct anon_vma *anon_vma = NULL; @@ -1047,12 +1051,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (PageAnon(hpage)) anon_vma = page_get_anon_vma(hpage); - try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + if (page_mapped(hpage)) { + try_to_unmap(hpage, + TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + page_was_mapped = 1; + } if (!page_mapped(hpage)) - rc = move_to_new_page(new_hpage, hpage, 1, mode); + rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped) remove_migration_ptes(hpage, hpage); if (anon_vma) @@ -1528,27 +1536,6 @@ out: return err; } -/* - * Call migration functions in the vma_ops that may prepare - * memory in a vm for migration. migration functions may perform - * the migration for vmas that do not have an underlying page struct. - */ -int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, - const nodemask_t *from, unsigned long flags) -{ - struct vm_area_struct *vma; - int err = 0; - - for (vma = mm->mmap; vma && !err; vma = vma->vm_next) { - if (vma->vm_ops && vma->vm_ops->migrate) { - err = vma->vm_ops->migrate(vma, to, from, flags); - if (err) - break; - } - } - return err; -} - #ifdef CONFIG_NUMA_BALANCING /* * Returns true if this is a safe migration target node for misplaced NUMA @@ -1854,7 +1841,7 @@ fail_putback: */ flush_cache_range(vma, mmun_start, mmun_end); page_add_anon_rmap(new_page, vma, mmun_start); - pmdp_clear_flush(vma, mmun_start, pmd); + pmdp_clear_flush_notify(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); flush_tlb_range(vma, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); @@ -1862,6 +1849,7 @@ fail_putback: if (page_count(page) != 2) { set_pmd_at(mm, mmun_start, pmd, orig_entry); flush_tlb_range(vma, mmun_start, mmun_end); + mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); page_remove_rmap(new_page); goto fail_putback; diff --git a/mm/mincore.c b/mm/mincore.c index 725c80961048..c8c528b36641 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -137,8 +137,11 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } else { /* pte is a swap entry */ swp_entry_t entry = pte_to_swp_entry(pte); - if (is_migration_entry(entry)) { - /* migration entries are always uptodate */ + if (non_swap_entry(entry)) { + /* + * migration or hwpoison entries are always + * uptodate + */ *vec = 1; } else { #ifdef CONFIG_SWAP diff --git a/mm/mmap.c b/mm/mmap.c index 87e82b38453c..7b36aa7cc89a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -232,7 +232,7 @@ error: } /* - * Requires inode->i_mapping->i_mmap_mutex + * Requires inode->i_mapping->i_mmap_rwsem */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) @@ -260,9 +260,9 @@ void unlink_file_vma(struct vm_area_struct *vma) if (file) { struct address_space *mapping = file->f_mapping; - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); __remove_shared_vm_struct(vma, file, mapping); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); } } @@ -674,14 +674,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, if (vma->vm_file) { mapping = vma->vm_file->f_mapping; - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); } __vma_link(mm, vma, prev, rb_link, rb_parent); 
__vma_link_file(vma); if (mapping) - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); mm->map_count++; validate_mm(mm); @@ -776,8 +776,11 @@ again: remove_next = 1 + (end > next->vm_end); * shrinking vma had, to cover any anon pages imported. */ if (exporter && exporter->anon_vma && !importer->anon_vma) { - if (anon_vma_clone(importer, exporter)) - return -ENOMEM; + int error; + + error = anon_vma_clone(importer, exporter); + if (error) + return error; importer->anon_vma = exporter->anon_vma; } } @@ -793,7 +796,7 @@ again: remove_next = 1 + (end > next->vm_end); next->vm_end); } - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); if (insert) { /* * Put into interval tree now, so instantiated pages @@ -880,7 +883,7 @@ again: remove_next = 1 + (end > next->vm_end); anon_vma_unlock_write(anon_vma); } if (mapping) - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); if (root) { uprobe_mmap(vma); @@ -2359,6 +2362,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) } #endif +EXPORT_SYMBOL_GPL(find_extend_vma); + /* * Ok - we have the memory areas we should free on the vma list, * so release them, and do the vma updates. @@ -2469,7 +2474,8 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (err) goto out_free_vma; - if (anon_vma_clone(new, vma)) + err = anon_vma_clone(new, vma); + if (err) goto out_free_mpol; if (new->vm_file) @@ -2597,6 +2603,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) detach_vmas_to_be_unmapped(mm, vma, prev, end); unmap_region(mm, vma, prev, start, end); + arch_unmap(mm, vma, start, end); + /* Fix up all other VM information */ remove_vma_list(mm, vma); @@ -2785,7 +2793,7 @@ void exit_mmap(struct mm_struct *mm) /* Insert vm structure into process list sorted by address * and into the inode's i_mmap tree. If vm_file is non-NULL - * then i_mmap_mutex is taken here. + * then i_mmap_rwsem is taken here. */ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { @@ -3080,7 +3088,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); - mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem); + down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem); } } @@ -3107,7 +3115,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * vma in this mm is backed by the same anon_vma or address_space. * * We can take all the locks in random order because the VM code - * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never + * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never * takes more than one of them in a row. Secondly we're protected * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. * @@ -3176,7 +3184,7 @@ static void vm_unlock_mapping(struct address_space *mapping) * AS_MM_ALL_LOCKS can't change to 0 from under us * because we hold the mm_all_locks_mutex. 
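Note on the i_mmap_mutex -> i_mmap_rwsem conversions running through memory.c, mmap.c and nommu.c in this diff: the i_mmap_lock_read/write() helpers are defined in include/linux/fs.h, outside this excerpt. They are presumably thin wrappers of the following shape (a sketch, not a quote of the header); the point of the change is that read-mostly rmap walkers can run concurrently while interval-tree updates still take the lock exclusively.

    static inline void i_mmap_lock_write(struct address_space *mapping)
    {
            down_write(&mapping->i_mmap_rwsem);
    }

    static inline void i_mmap_unlock_write(struct address_space *mapping)
    {
            up_write(&mapping->i_mmap_rwsem);
    }

    static inline void i_mmap_lock_read(struct address_space *mapping)
    {
            down_read(&mapping->i_mmap_rwsem);
    }

    static inline void i_mmap_unlock_read(struct address_space *mapping)
    {
            up_read(&mapping->i_mmap_rwsem);
    }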
*/ - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); if (!test_and_clear_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 2c8da9825fe3..3b9b3d0741b2 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -193,6 +193,16 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { + /* + * Call invalidate_range here too to avoid the need for the + * subsystem of having to register an invalidate_range_end + * call-back when there is invalidate_range already. Usually a + * subsystem registers either invalidate_range_start()/end() or + * invalidate_range(), so this will be no additional overhead + * (besides the pointer check). + */ + if (mn->ops->invalidate_range) + mn->ops->invalidate_range(mn, mm, start, end); if (mn->ops->invalidate_range_end) mn->ops->invalidate_range_end(mn, mm, start, end); } @@ -200,6 +210,21 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end); +void __mmu_notifier_invalidate_range(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct mmu_notifier *mn; + int id; + + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->invalidate_range) + mn->ops->invalidate_range(mn, mm, start, end); + } + srcu_read_unlock(&srcu, id); +} +EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range); + static int do_mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm, int take_mmap_sem) diff --git a/mm/mremap.c b/mm/mremap.c index b147f66f4c40..17fa018f5f39 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -99,7 +99,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, spinlock_t *old_ptl, *new_ptl; /* - * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma + * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma * locks to ensure that rmap will always observe either the old or the * new ptes. This is the easiest way to avoid races with * truncate_pagecache(), page migration, etc... 
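Note on the mm/mmu_notifier.c hunk above: because __mmu_notifier_invalidate_range_end() now also invokes ->invalidate_range(), a secondary-MMU driver that only needs the new callback does not have to register invalidate_range_start()/end() as well. A minimal sketch of such a driver follows; the driver names are hypothetical, while the ops layout and registration call match the notifier code above.

    #include <linux/mmu_notifier.h>

    static void my_invalidate_range(struct mmu_notifier *mn, struct mm_struct *mm,
                                    unsigned long start, unsigned long end)
    {
            /* shoot down secondary TLB / IOMMU entries covering [start, end) */
    }

    static const struct mmu_notifier_ops my_mn_ops = {
            .invalidate_range = my_invalidate_range,
    };

    static struct mmu_notifier my_mn = {
            .ops = &my_mn_ops,
    };

    static int my_attach(struct mm_struct *mm)
    {
            /* links the notifier into mm->mmu_notifier_mm under mmap_sem */
            return mmu_notifier_register(&my_mn, mm);
    }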
@@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, if (need_rmap_locks) { if (vma->vm_file) { mapping = vma->vm_file->f_mapping; - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); } if (vma->anon_vma) { anon_vma = vma->anon_vma; @@ -156,7 +156,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, if (anon_vma) anon_vma_unlock_write(anon_vma); if (mapping) - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); } #define LATENCY_LIMIT (64 * PAGE_SIZE) @@ -288,7 +288,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, old_len = new_len; old_addr = new_addr; new_addr = -ENOMEM; - } + } else if (vma->vm_file && vma->vm_file->f_op->mremap) + vma->vm_file->f_op->mremap(vma->vm_file, new_vma); /* Conceal VM_ACCOUNT so old reservation is not undone */ if (vm_flags & VM_ACCOUNT) { diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 7c7ab32ee503..90b50468333e 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -145,12 +145,10 @@ static unsigned long __init free_low_memory_core_early(void) static int reset_managed_pages_done __initdata; -static inline void __init reset_node_managed_pages(pg_data_t *pgdat) +void reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; - if (reset_managed_pages_done) - return; for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) z->managed_pages = 0; } @@ -159,8 +157,12 @@ void __init reset_all_zones_managed_pages(void) { struct pglist_data *pgdat; + if (reset_managed_pages_done) + return; + for_each_online_pgdat(pgdat) reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } diff --git a/mm/nommu.c b/mm/nommu.c index bd1808e194a7..b51eadf6d952 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -722,11 +722,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_file) { mapping = vma->vm_file->f_mapping; - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); } /* add the VMA to the tree */ @@ -795,11 +795,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) if (vma->vm_file) { mapping = vma->vm_file->f_mapping; - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); } /* remove from the MM's tree and list */ @@ -1149,8 +1149,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len, unsigned long capabilities) { - struct page *pages; - unsigned long total, point, n; + unsigned long total, point; void *base; int ret, order; @@ -1182,33 +1181,23 @@ static int do_mmap_private(struct vm_area_struct *vma, order = get_order(len); kdebug("alloc order %d for %lx", order, len); - pages = alloc_pages(GFP_KERNEL, order); - if (!pages) - goto enomem; - total = 1 << order; - atomic_long_add(total, &mmap_pages_allocated); - point = len >> PAGE_SHIFT; - /* we allocated a power-of-2 sized page set, so we may want to trim off - * the excess */ + /* we don't want to allocate a power-of-2 sized page set */ if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { - while (total > point) { - order = ilog2(total - point); - n = 1 << order; - kdebug("shave %lu/%lu @%lu", n, total - point, total); - 
atomic_long_sub(n, &mmap_pages_allocated); - total -= n; - set_page_refcounted(pages + total); - __free_pages(pages + total, order); - } + total = point; + kdebug("try to alloc exact %lu pages", total); + base = alloc_pages_exact(len, GFP_KERNEL); + } else { + base = (void *)__get_free_pages(GFP_KERNEL, order); } - for (point = 1; point < total; point++) - set_page_refcounted(&pages[point]); + if (!base) + goto enomem; + + atomic_long_add(total, &mmap_pages_allocated); - base = page_address(pages); region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; region->vm_start = (unsigned long) base; region->vm_end = region->vm_start + len; @@ -2094,14 +2083,14 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; down_write(&nommu_region_sem); - mutex_lock(&inode->i_mapping->i_mmap_mutex); + i_mmap_lock_read(inode->i_mapping); /* search for VMAs that fall within the dead zone */ vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { /* found one - only interested if it's shared out of the page * cache */ if (vma->vm_flags & VM_SHARED) { - mutex_unlock(&inode->i_mapping->i_mmap_mutex); + i_mmap_unlock_read(inode->i_mapping); up_write(&nommu_region_sem); return -ETXTBSY; /* not quite true, but near enough */ } @@ -2113,8 +2102,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, * we don't check for any regions that start beyond the EOF as there * shouldn't be any */ - vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, - 0, ULONG_MAX) { + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) { if (!(vma->vm_flags & VM_SHARED)) continue; @@ -2129,7 +2117,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, } } - mutex_unlock(&inode->i_mapping->i_mmap_mutex); + i_mmap_unlock_read(inode->i_mapping); up_write(&nommu_region_sem); return 0; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5340f6b91312..d503e9ce1c7b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -119,7 +119,7 @@ found: /* return true if the task is not adequate as candidate victim task. */ static bool oom_unkillable_task(struct task_struct *p, - const struct mem_cgroup *memcg, const nodemask_t *nodemask) + struct mem_cgroup *memcg, const nodemask_t *nodemask) { if (is_global_init(p)) return true; @@ -233,7 +233,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, /* Check this allocation failure is caused by cpuset's wall function */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) - if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) + if (!cpuset_zone_allowed(zone, gfp_mask)) cpuset_limited = true; if (cpuset_limited) { @@ -281,14 +281,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, if (oom_task_origin(task)) return OOM_SCAN_SELECT; - if (task->flags & PF_EXITING && !force_kill) { - /* - * If this task is not being ptraced on exit, then wait for it - * to finish before killing some other task unnecessarily. - */ - if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) - return OOM_SCAN_ABORT; - } + if (task_will_free_mem(task) && !force_kill) + return OOM_SCAN_ABORT; + return OOM_SCAN_OK; } @@ -353,7 +348,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, * swapents, oom_score_adj value, and name. 
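Note on the do_mmap_private() rework in mm/nommu.c above: instead of allocating a power-of-two block and hand-trimming the tail pages, the trimming case now uses alloc_pages_exact(), which internally allocates 2^order pages and immediately frees the ones beyond the requested length. A hedged sketch of the calling convention (the helper names here are illustrative):

    static void *grab_region(size_t len)
    {
            /* len is rounded up to whole pages, not to the next power of two */
            return alloc_pages_exact(len, GFP_KERNEL);
    }

    static void drop_region(void *base, size_t len)
    {
            free_pages_exact(base, len);    /* caller must pass the same length back */
    }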
*/ -static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) +static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *task; @@ -443,7 +438,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly */ - if (p->flags & PF_EXITING) { + if (task_will_free_mem(p)) { set_tsk_thread_flag(p, TIF_MEMDIE); put_task_struct(p); return; @@ -649,7 +644,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, * select it. The goal is to allow it to allocate so that it may * quickly exit and free its memory. */ - if (fatal_signal_pending(current) || current->flags & PF_EXITING) { + if (fatal_signal_pending(current) || task_will_free_mem(current)) { set_thread_flag(TIF_MEMDIE); return; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 19ceae87522d..d5d81f5384d1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2357,7 +2357,7 @@ int test_clear_page_writeback(struct page *page) dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - mem_cgroup_end_page_stat(memcg, locked, memcg_flags); + mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); return ret; } @@ -2399,7 +2399,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } - mem_cgroup_end_page_stat(memcg, locked, memcg_flags); + mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9cd36b822444..7633c503a116 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -48,7 +48,7 @@ #include <linux/backing-dev.h> #include <linux/fault-inject.h> #include <linux/page-isolation.h> -#include <linux/page_cgroup.h> +#include <linux/page_ext.h> #include <linux/debugobjects.h> #include <linux/kmemleak.h> #include <linux/compaction.h> @@ -56,9 +56,10 @@ #include <linux/prefetch.h> #include <linux/mm_inline.h> #include <linux/migrate.h> -#include <linux/page-debug-flags.h> +#include <linux/page_ext.h> #include <linux/hugetlb.h> #include <linux/sched/rt.h> +#include <linux/page_owner.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -110,6 +111,7 @@ static DEFINE_SPINLOCK(managed_page_count_lock); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; +unsigned long totalcma_pages __read_mostly; /* * When calculating the number of globally allowed dirty pages, there * is a certain number of per-zone reserves that should not be @@ -425,6 +427,42 @@ static inline void prep_zero_page(struct page *page, unsigned int order, #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; +bool _debug_pagealloc_enabled __read_mostly; +bool _debug_guardpage_enabled __read_mostly; + +static int __init early_debug_pagealloc(char *buf) +{ + if (!buf) + return -EINVAL; + + if (strcmp(buf, "on") == 0) + _debug_pagealloc_enabled = true; + + return 0; +} +early_param("debug_pagealloc", early_debug_pagealloc); + +static bool need_debug_guardpage(void) +{ + /* If we don't use debug_pagealloc, we don't need guard page */ + if (!debug_pagealloc_enabled()) + return false; + + return true; +} + +static void init_debug_guardpage(void) +{ + if (!debug_pagealloc_enabled()) + return; + + _debug_guardpage_enabled = true; +} + +struct page_ext_operations 
debug_guardpage_ops = { + .need = need_debug_guardpage, + .init = init_debug_guardpage, +}; static int __init debug_guardpage_minorder_setup(char *buf) { @@ -440,18 +478,44 @@ static int __init debug_guardpage_minorder_setup(char *buf) } __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); -static inline void set_page_guard_flag(struct page *page) +static inline void set_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) { - __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); + struct page_ext *page_ext; + + if (!debug_guardpage_enabled()) + return; + + page_ext = lookup_page_ext(page); + __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); + + INIT_LIST_HEAD(&page->lru); + set_page_private(page, order); + /* Guard pages are not available for any usage */ + __mod_zone_freepage_state(zone, -(1 << order), migratetype); } -static inline void clear_page_guard_flag(struct page *page) +static inline void clear_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) { - __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); + struct page_ext *page_ext; + + if (!debug_guardpage_enabled()) + return; + + page_ext = lookup_page_ext(page); + __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); + + set_page_private(page, 0); + if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, (1 << order), migratetype); } #else -static inline void set_page_guard_flag(struct page *page) { } -static inline void clear_page_guard_flag(struct page *page) { } +struct page_ext_operations debug_guardpage_ops = { NULL, }; +static inline void set_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) {} +static inline void clear_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) {} #endif static inline void set_page_order(struct page *page, unsigned int order) @@ -467,29 +531,6 @@ static inline void rmv_page_order(struct page *page) } /* - * Locate the struct page for both the matching buddy in our - * pair (buddy1) and the combined O(n+1) page they form (page). - * - * 1) Any buddy B1 will have an order O twin B2 which satisfies - * the following equation: - * B2 = B1 ^ (1 << O) - * For example, if the starting buddy (buddy2) is #8 its order - * 1 buddy is #10: - * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 - * - * 2) Any buddy B will have an order O+1 parent P which - * satisfies the following equation: - * P = B & ~(1 << O) - * - * Assumption: *_mem_map is contiguous at least up to MAX_ORDER - */ -static inline unsigned long -__find_buddy_index(unsigned long page_idx, unsigned int order) -{ - return page_idx ^ (1 << order); -} - -/* * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if * (a) the buddy is not in a hole && @@ -569,6 +610,7 @@ static inline void __free_one_page(struct page *page, unsigned long combined_idx; unsigned long uninitialized_var(buddy_idx); struct page *buddy; + int max_order = MAX_ORDER; VM_BUG_ON(!zone_is_initialized(zone)); @@ -577,13 +619,24 @@ static inline void __free_one_page(struct page *page, return; VM_BUG_ON(migratetype == -1); + if (is_migrate_isolate(migratetype)) { + /* + * We restrict max order of merging to prevent merge + * between freepages on isolate pageblock and normal + * pageblock. Without this, pageblock isolation + * could cause incorrect freepage accounting. 
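Note on the debug_guardpage_ops/page_ext hunks above: guard-page state moves out of struct page (the old debug_flags word) into the boot-time-optional page_ext array. A client supplies .need so the extended memmap is only allocated when its feature is actually enabled, .init for one-time setup, and then reaches its per-page bits via lookup_page_ext(). The sketch below shows a hypothetical client of this pattern; the client and its flag bit are invented for illustration, and wiring the ops into page_ext's client table happens in mm/page_ext.c, which is not part of this excerpt.

    #include <linux/mm.h>
    #include <linux/page_ext.h>

    static bool my_feature_enabled;         /* e.g. set by an early_param() */

    static bool need_my_feature(void)
    {
            return my_feature_enabled;      /* no page_ext space is reserved otherwise */
    }

    static void init_my_feature(void)
    {
            /* one-time setup once the page_ext storage exists */
    }

    struct page_ext_operations my_feature_ops = {
            .need = need_my_feature,
            .init = init_my_feature,
    };

    #define MY_FEATURE_BIT 1                /* hypothetical bit index in page_ext->flags */

    static void tag_page(struct page *page)
    {
            struct page_ext *page_ext = lookup_page_ext(page);

            __set_bit(MY_FEATURE_BIT, &page_ext->flags);
    }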
+ */ + max_order = min(MAX_ORDER, pageblock_order + 1); + } else { + __mod_zone_freepage_state(zone, 1 << order, migratetype); + } - page_idx = pfn & ((1 << MAX_ORDER) - 1); + page_idx = pfn & ((1 << max_order) - 1); VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); - while (order < MAX_ORDER-1) { + while (order < max_order - 1) { buddy_idx = __find_buddy_index(page_idx, order); buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) @@ -593,10 +646,7 @@ static inline void __free_one_page(struct page *page, * merge with it and move up one order. */ if (page_is_guard(buddy)) { - clear_page_guard_flag(buddy); - set_page_private(page, 0); - __mod_zone_freepage_state(zone, 1 << order, - migratetype); + clear_page_guard(zone, buddy, order, migratetype); } else { list_del(&buddy->lru); zone->free_area[order].nr_free--; @@ -650,8 +700,10 @@ static inline int free_pages_check(struct page *page) bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; bad_flags = PAGE_FLAGS_CHECK_AT_FREE; } - if (unlikely(mem_cgroup_bad_page_check(page))) - bad_reason = "cgroup check failed"; +#ifdef CONFIG_MEMCG + if (unlikely(page->mem_cgroup)) + bad_reason = "page still charged to cgroup"; +#endif if (unlikely(bad_reason)) { bad_page(page, bad_reason, bad_flags); return 1; @@ -715,14 +767,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* must delete as __free_one_page list manipulates */ list_del(&page->lru); mt = get_freepage_migratetype(page); + if (unlikely(has_isolate_pageblock(zone))) + mt = get_pageblock_migratetype(page); + /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); - if (likely(!is_migrate_isolate_page(page))) { - __mod_zone_page_state(zone, NR_FREE_PAGES, 1); - if (is_migrate_cma(mt)) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); - } } while (--to_free && --batch_free && !list_empty(list)); } spin_unlock(&zone->lock); @@ -739,9 +789,11 @@ static void free_one_page(struct zone *zone, if (nr_scanned) __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + if (unlikely(has_isolate_pageblock(zone) || + is_migrate_isolate(migratetype))) { + migratetype = get_pfnblock_migratetype(page, pfn); + } __free_one_page(page, pfn, zone, order, migratetype); - if (unlikely(!is_migrate_isolate(migratetype))) - __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); } @@ -750,6 +802,9 @@ static bool free_pages_prepare(struct page *page, unsigned int order) int i; int bad = 0; + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page); + trace_mm_page_free(page, order); kmemcheck_free_shadow(page, order); @@ -760,6 +815,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order) if (bad) return false; + reset_page_owner(page, order); + if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page), PAGE_SIZE << order); @@ -866,23 +923,18 @@ static inline void expand(struct zone *zone, struct page *page, size >>= 1; VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); -#ifdef CONFIG_DEBUG_PAGEALLOC - if (high < debug_guardpage_minorder()) { + if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && + debug_guardpage_enabled() && + high < debug_guardpage_minorder()) { /* * Mark as guard pages (or page), that will allow to * merge back to allocator when buddy will be freed. 
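Note on the reworked __free_one_page() above: the long comment describing the buddy arithmetic is dropped from this file, but the relations still hold — the order-O buddy of a page at index I is I ^ (1 << O), and the merged order-(O+1) block starts at I & buddy. The new max_order clamp only means that for pages on an isolated pageblock the merge loop stops at pageblock_order, so a free page never merges across the isolation boundary. A tiny stand-alone illustration of the index arithmetic (plain userspace C, nothing kernel-specific):

    #include <stdio.h>

    int main(void)
    {
            unsigned long page_idx  = 8, order = 1;
            unsigned long buddy_idx = page_idx ^ (1UL << order);    /* 8 ^ 2 = 10 */
            unsigned long combined  = page_idx & buddy_idx;         /* merged order-2 block starts at 8 */

            printf("buddy of %lu at order %lu: %lu, merged block starts at %lu\n",
                   page_idx, order, buddy_idx, combined);
            return 0;
    }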
* Corresponding page table entries will not be touched, * pages will stay not present in virtual address space */ - INIT_LIST_HEAD(&page[size].lru); - set_page_guard_flag(&page[size]); - set_page_private(&page[size], high); - /* Guard pages are not available for any usage */ - __mod_zone_freepage_state(zone, -(1 << high), - migratetype); + set_page_guard(zone, &page[size], high, migratetype); continue; } -#endif list_add(&page[size].lru, &area->free_list[migratetype]); area->nr_free++; set_page_order(&page[size], high); @@ -907,8 +959,10 @@ static inline int check_new_page(struct page *page) bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; bad_flags = PAGE_FLAGS_CHECK_AT_PREP; } - if (unlikely(mem_cgroup_bad_page_check(page))) - bad_reason = "cgroup check failed"; +#ifdef CONFIG_MEMCG + if (unlikely(page->mem_cgroup)) + bad_reason = "page still charged to cgroup"; +#endif if (unlikely(bad_reason)) { bad_page(page, bad_reason, bad_flags); return 1; @@ -938,6 +992,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); + set_page_owner(page, order, gfp_flags); + return 0; } @@ -1276,55 +1332,75 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) #endif /* - * Drain pages of the indicated processor. + * Drain pcplists of the indicated processor and zone. * * The processor must either be the current processor and the * thread pinned to the current processor or a processor that * is not online. */ -static void drain_pages(unsigned int cpu) +static void drain_pages_zone(unsigned int cpu, struct zone *zone) { unsigned long flags; - struct zone *zone; + struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; - for_each_populated_zone(zone) { - struct per_cpu_pageset *pset; - struct per_cpu_pages *pcp; + local_irq_save(flags); + pset = per_cpu_ptr(zone->pageset, cpu); - local_irq_save(flags); - pset = per_cpu_ptr(zone->pageset, cpu); + pcp = &pset->pcp; + if (pcp->count) { + free_pcppages_bulk(zone, pcp->count, pcp); + pcp->count = 0; + } + local_irq_restore(flags); +} - pcp = &pset->pcp; - if (pcp->count) { - free_pcppages_bulk(zone, pcp->count, pcp); - pcp->count = 0; - } - local_irq_restore(flags); +/* + * Drain pcplists of all zones on the indicated processor. + * + * The processor must either be the current processor and the + * thread pinned to the current processor or a processor that + * is not online. + */ +static void drain_pages(unsigned int cpu) +{ + struct zone *zone; + + for_each_populated_zone(zone) { + drain_pages_zone(cpu, zone); } } /* * Spill all of this CPU's per-cpu pages back into the buddy allocator. + * + * The CPU has to be pinned. When zone parameter is non-NULL, spill just + * the single zone's pages. */ -void drain_local_pages(void *arg) +void drain_local_pages(struct zone *zone) { - drain_pages(smp_processor_id()); + int cpu = smp_processor_id(); + + if (zone) + drain_pages_zone(cpu, zone); + else + drain_pages(cpu); } /* * Spill all the per-cpu pages from all CPUs back into the buddy allocator. * + * When zone parameter is non-NULL, spill just the single zone's pages. + * * Note that this code is protected against sending an IPI to an offline * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but * nothing keeps CPUs from showing up after we populated the cpumask and * before the call to on_each_cpu_mask(). 
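Note on the drain_pages()/drain_all_pages() rework above: both drain_local_pages() and drain_all_pages() now take a zone, so callers that already know which zone they disturbed (memory offlining, contiguous allocation, memory-failure) drain only that zone's pcplists, while passing NULL keeps the old drain-everything behaviour used by the direct-reclaim retry path. A minimal sketch of the new calling convention, with the helper name invented for illustration:

    static void settle_free_pages(struct zone *zone)
    {
            lru_add_drain_all();    /* push per-CPU LRU pagevecs back onto the LRU lists */
            drain_all_pages(zone);  /* this zone's pcplists only; NULL drains every populated zone */
    }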
*/ -void drain_all_pages(void) +void drain_all_pages(struct zone *zone) { int cpu; - struct per_cpu_pageset *pcp; - struct zone *zone; /* * Allocate in the BSS so we wont require allocation in @@ -1339,20 +1415,31 @@ void drain_all_pages(void) * disables preemption as part of its processing */ for_each_online_cpu(cpu) { + struct per_cpu_pageset *pcp; + struct zone *z; bool has_pcps = false; - for_each_populated_zone(zone) { + + if (zone) { pcp = per_cpu_ptr(zone->pageset, cpu); - if (pcp->pcp.count) { + if (pcp->pcp.count) has_pcps = true; - break; + } else { + for_each_populated_zone(z) { + pcp = per_cpu_ptr(z->pageset, cpu); + if (pcp->pcp.count) { + has_pcps = true; + break; + } } } + if (has_pcps) cpumask_set_cpu(cpu, &cpus_with_pcps); else cpumask_clear_cpu(cpu, &cpus_with_pcps); } - on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); + on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages, + zone, 1); } #ifdef CONFIG_HIBERNATION @@ -1479,12 +1566,15 @@ void split_page(struct page *page, unsigned int order) split_page(virt_to_page(page[0].shadow), order); #endif - for (i = 1; i < (1 << order); i++) + set_page_owner(page, 0, 0); + for (i = 1; i < (1 << order); i++) { set_page_refcounted(page + i); + set_page_owner(page + i, 0, 0); + } } EXPORT_SYMBOL_GPL(split_page); -static int __isolate_free_page(struct page *page, unsigned int order) +int __isolate_free_page(struct page *page, unsigned int order) { unsigned long watermark; struct zone *zone; @@ -1520,6 +1610,7 @@ static int __isolate_free_page(struct page *page, unsigned int order) } } + set_page_owner(page, order, 0); return 1UL << order; } @@ -1714,7 +1805,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, int alloc_flags, long free_pages) { - /* free_pages my go negative - that's OK */ + /* free_pages may go negative - that's OK */ long min = mark; int o; long free_cma = 0; @@ -1962,7 +2053,7 @@ zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. - * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. + * See also __cpuset_node_allowed() comment in kernel/cpuset.c. 
*/ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { @@ -1973,7 +2064,7 @@ zonelist_scan: continue; if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && - !cpuset_zone_allowed_softwall(zone, gfp_mask)) + !cpuset_zone_allowed(zone, gfp_mask)) continue; /* * Distribute pages in proportion to the individual @@ -2305,7 +2396,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, int classzone_idx, int migratetype, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { - struct zone *last_compact_zone = NULL; unsigned long compact_result; struct page *page; @@ -2316,7 +2406,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, compact_result = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, mode, contended_compaction, - &last_compact_zone); + alloc_flags, classzone_idx); current->flags &= ~PF_MEMALLOC; switch (compact_result) { @@ -2335,10 +2425,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTSTALL); - /* Page migration frees to the PCP lists but we want merging */ - drain_pages(get_cpu()); - put_cpu(); - page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -2354,14 +2440,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } /* - * last_compact_zone is where try_to_compact_pages thought allocation - * should succeed, so it did not defer compaction. But here we know - * that it didn't succeed, so we do the defer. - */ - if (last_compact_zone && mode != MIGRATE_ASYNC) - defer_compaction(last_compact_zone, order); - - /* * It's bad if compaction run occurs and fails. The most likely reason * is that pages exist, but not enough to satisfy watermarks. */ @@ -2442,7 +2520,7 @@ retry: * pages are pinned on the per-cpu lists. Drain them and try again */ if (!page && !drained) { - drain_all_pages(); + drain_all_pages(NULL); drained = true; goto retry; } @@ -2514,7 +2592,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= ALLOC_HARDER; /* * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the - * comment for __cpuset_node_allowed_softwall(). + * comment for __cpuset_node_allowed(). */ alloc_flags &= ~ALLOC_CPUSET; } else if (unlikely(rt_task(current)) && !in_interrupt()) @@ -3902,14 +3980,14 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) else page_group_by_mobility_disabled = 0; - printk("Built %i zonelists in %s order, mobility grouping %s. " + pr_info("Built %i zonelists in %s order, mobility grouping %s. " "Total pages: %ld\n", nr_online_nodes, zonelist_order_name[current_zonelist_order], page_group_by_mobility_disabled ? 
"off" : "on", vm_total_pages); #ifdef CONFIG_NUMA - printk("Policy zone: %s\n", zone_names[policy_zone]); + pr_info("Policy zone: %s\n", zone_names[policy_zone]); #endif } @@ -4841,7 +4919,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, #endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); - pgdat_page_cgroup_init(pgdat); + pgdat_page_ext_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; @@ -4860,16 +4938,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, * and per-cpu initialisations */ memmap_pages = calc_memmap_size(size, realsize); - if (freesize >= memmap_pages) { - freesize -= memmap_pages; - if (memmap_pages) - printk(KERN_DEBUG - " %s zone: %lu pages used for memmap\n", - zone_names[j], memmap_pages); - } else - printk(KERN_WARNING - " %s zone: %lu pages exceeds freesize %lu\n", - zone_names[j], memmap_pages, freesize); + if (!is_highmem_idx(j)) { + if (freesize >= memmap_pages) { + freesize -= memmap_pages; + if (memmap_pages) + printk(KERN_DEBUG + " %s zone: %lu pages used for memmap\n", + zone_names[j], memmap_pages); + } else + printk(KERN_WARNING + " %s zone: %lu pages exceeds freesize %lu\n", + zone_names[j], memmap_pages, freesize); + } /* Account for reserved pages */ if (j == 0 && freesize > dma_reserve) { @@ -5343,33 +5423,33 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) find_zone_movable_pfns_for_nodes(); /* Print out the zone ranges */ - printk("Zone ranges:\n"); + pr_info("Zone ranges:\n"); for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - printk(KERN_CONT " %-8s ", zone_names[i]); + pr_info(" %-8s ", zone_names[i]); if (arch_zone_lowest_possible_pfn[i] == arch_zone_highest_possible_pfn[i]) - printk(KERN_CONT "empty\n"); + pr_cont("empty\n"); else - printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", + pr_cont("[mem %0#10lx-%0#10lx]\n", arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, (arch_zone_highest_possible_pfn[i] << PAGE_SHIFT) - 1); } /* Print out the PFNs ZONE_MOVABLE begins at in each node */ - printk("Movable zone start for each node\n"); + pr_info("Movable zone start for each node\n"); for (i = 0; i < MAX_NUMNODES; i++) { if (zone_movable_pfn[i]) - printk(" Node %d: %#010lx\n", i, + pr_info(" Node %d: %#010lx\n", i, zone_movable_pfn[i] << PAGE_SHIFT); } /* Print out the early node map */ - printk("Early memory node ranges\n"); + pr_info("Early memory node ranges\n"); for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) - printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, + pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid, start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); /* Initialise every node */ @@ -5505,9 +5585,9 @@ void __init mem_init_print_info(const char *str) #undef adj_init_size - printk("Memory: %luK/%luK available " + pr_info("Memory: %luK/%luK available " "(%luK kernel code, %luK rwdata, %luK rodata, " - "%luK init, %luK bss, %luK reserved" + "%luK init, %luK bss, %luK reserved, %luK cma-reserved" #ifdef CONFIG_HIGHMEM ", %luK highmem" #endif @@ -5515,7 +5595,8 @@ void __init mem_init_print_info(const char *str) nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), codesize >> 10, datasize >> 10, rosize >> 10, (init_data_size + init_code_size) >> 10, bss_size >> 10, - (physpages - totalram_pages) << (PAGE_SHIFT-10), + (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10), + totalcma_pages << (PAGE_SHIFT-10), #ifdef CONFIG_HIGHMEM 
totalhigh_pages << (PAGE_SHIFT-10), #endif @@ -6207,9 +6288,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, if (!PageLRU(page)) found++; /* - * If there are RECLAIMABLE pages, we need to check it. - * But now, memory offline itself doesn't call shrink_slab() - * and it still to be fixed. + * If there are RECLAIMABLE pages, we need to check + * it. But now, memory offline itself doesn't call + * shrink_node_slabs() and it still to be fixed. */ /* * If the page is not RAM, page_count()should be 0. @@ -6394,7 +6475,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, */ lru_add_drain_all(); - drain_all_pages(); + drain_all_pages(cc.zone); order = 0; outer_start = start; @@ -6408,13 +6489,12 @@ int alloc_contig_range(unsigned long start, unsigned long end, /* Make sure the range is really isolated. */ if (test_pages_isolated(outer_start, end, false)) { - pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", - outer_start, end); + pr_info("%s: [%lx, %lx) PFNs busy\n", + __func__, outer_start, end); ret = -EBUSY; goto done; } - /* Grab isolated pages from freelists. */ outer_end = isolate_freepages_range(&cc, outer_start, end); if (!outer_end) { diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c deleted file mode 100644 index 5331c2bd85a2..000000000000 --- a/mm/page_cgroup.c +++ /dev/null @@ -1,530 +0,0 @@ -#include <linux/mm.h> -#include <linux/mmzone.h> -#include <linux/bootmem.h> -#include <linux/bit_spinlock.h> -#include <linux/page_cgroup.h> -#include <linux/hash.h> -#include <linux/slab.h> -#include <linux/memory.h> -#include <linux/vmalloc.h> -#include <linux/cgroup.h> -#include <linux/swapops.h> -#include <linux/kmemleak.h> - -static unsigned long total_usage; - -#if !defined(CONFIG_SPARSEMEM) - - -void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) -{ - pgdat->node_page_cgroup = NULL; -} - -struct page_cgroup *lookup_page_cgroup(struct page *page) -{ - unsigned long pfn = page_to_pfn(page); - unsigned long offset; - struct page_cgroup *base; - - base = NODE_DATA(page_to_nid(page))->node_page_cgroup; -#ifdef CONFIG_DEBUG_VM - /* - * The sanity checks the page allocator does upon freeing a - * page can reach here before the page_cgroup arrays are - * allocated when feeding a range of pages to the allocator - * for the first time during bootup or memory hotplug. 
- */ - if (unlikely(!base)) - return NULL; -#endif - offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; - return base + offset; -} - -static int __init alloc_node_page_cgroup(int nid) -{ - struct page_cgroup *base; - unsigned long table_size; - unsigned long nr_pages; - - nr_pages = NODE_DATA(nid)->node_spanned_pages; - if (!nr_pages) - return 0; - - table_size = sizeof(struct page_cgroup) * nr_pages; - - base = memblock_virt_alloc_try_nid_nopanic( - table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, nid); - if (!base) - return -ENOMEM; - NODE_DATA(nid)->node_page_cgroup = base; - total_usage += table_size; - return 0; -} - -void __init page_cgroup_init_flatmem(void) -{ - - int nid, fail; - - if (mem_cgroup_disabled()) - return; - - for_each_online_node(nid) { - fail = alloc_node_page_cgroup(nid); - if (fail) - goto fail; - } - printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); - printk(KERN_INFO "please try 'cgroup_disable=memory' option if you" - " don't want memory cgroups\n"); - return; -fail: - printk(KERN_CRIT "allocation of page_cgroup failed.\n"); - printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n"); - panic("Out of memory"); -} - -#else /* CONFIG_FLAT_NODE_MEM_MAP */ - -struct page_cgroup *lookup_page_cgroup(struct page *page) -{ - unsigned long pfn = page_to_pfn(page); - struct mem_section *section = __pfn_to_section(pfn); -#ifdef CONFIG_DEBUG_VM - /* - * The sanity checks the page allocator does upon freeing a - * page can reach here before the page_cgroup arrays are - * allocated when feeding a range of pages to the allocator - * for the first time during bootup or memory hotplug. - */ - if (!section->page_cgroup) - return NULL; -#endif - return section->page_cgroup + pfn; -} - -static void *__meminit alloc_page_cgroup(size_t size, int nid) -{ - gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; - void *addr = NULL; - - addr = alloc_pages_exact_nid(nid, size, flags); - if (addr) { - kmemleak_alloc(addr, size, 1, flags); - return addr; - } - - if (node_state(nid, N_HIGH_MEMORY)) - addr = vzalloc_node(size, nid); - else - addr = vzalloc(size); - - return addr; -} - -static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) -{ - struct mem_section *section; - struct page_cgroup *base; - unsigned long table_size; - - section = __pfn_to_section(pfn); - - if (section->page_cgroup) - return 0; - - table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; - base = alloc_page_cgroup(table_size, nid); - - /* - * The value stored in section->page_cgroup is (base - pfn) - * and it does not point to the memory block allocated above, - * causing kmemleak false positives. - */ - kmemleak_not_leak(base); - - if (!base) { - printk(KERN_ERR "page cgroup allocation failure\n"); - return -ENOMEM; - } - - /* - * The passed "pfn" may not be aligned to SECTION. For the calculation - * we need to apply a mask. 
- */ - pfn &= PAGE_SECTION_MASK; - section->page_cgroup = base - pfn; - total_usage += table_size; - return 0; -} -#ifdef CONFIG_MEMORY_HOTPLUG -static void free_page_cgroup(void *addr) -{ - if (is_vmalloc_addr(addr)) { - vfree(addr); - } else { - struct page *page = virt_to_page(addr); - size_t table_size = - sizeof(struct page_cgroup) * PAGES_PER_SECTION; - - BUG_ON(PageReserved(page)); - kmemleak_free(addr); - free_pages_exact(addr, table_size); - } -} - -static void __free_page_cgroup(unsigned long pfn) -{ - struct mem_section *ms; - struct page_cgroup *base; - - ms = __pfn_to_section(pfn); - if (!ms || !ms->page_cgroup) - return; - base = ms->page_cgroup + pfn; - free_page_cgroup(base); - ms->page_cgroup = NULL; -} - -static int __meminit online_page_cgroup(unsigned long start_pfn, - unsigned long nr_pages, - int nid) -{ - unsigned long start, end, pfn; - int fail = 0; - - start = SECTION_ALIGN_DOWN(start_pfn); - end = SECTION_ALIGN_UP(start_pfn + nr_pages); - - if (nid == -1) { - /* - * In this case, "nid" already exists and contains valid memory. - * "start_pfn" passed to us is a pfn which is an arg for - * online__pages(), and start_pfn should exist. - */ - nid = pfn_to_nid(start_pfn); - VM_BUG_ON(!node_state(nid, N_ONLINE)); - } - - for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { - if (!pfn_present(pfn)) - continue; - fail = init_section_page_cgroup(pfn, nid); - } - if (!fail) - return 0; - - /* rollback */ - for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) - __free_page_cgroup(pfn); - - return -ENOMEM; -} - -static int __meminit offline_page_cgroup(unsigned long start_pfn, - unsigned long nr_pages, int nid) -{ - unsigned long start, end, pfn; - - start = SECTION_ALIGN_DOWN(start_pfn); - end = SECTION_ALIGN_UP(start_pfn + nr_pages); - - for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) - __free_page_cgroup(pfn); - return 0; - -} - -static int __meminit page_cgroup_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct memory_notify *mn = arg; - int ret = 0; - switch (action) { - case MEM_GOING_ONLINE: - ret = online_page_cgroup(mn->start_pfn, - mn->nr_pages, mn->status_change_nid); - break; - case MEM_OFFLINE: - offline_page_cgroup(mn->start_pfn, - mn->nr_pages, mn->status_change_nid); - break; - case MEM_CANCEL_ONLINE: - offline_page_cgroup(mn->start_pfn, - mn->nr_pages, mn->status_change_nid); - break; - case MEM_GOING_OFFLINE: - break; - case MEM_ONLINE: - case MEM_CANCEL_OFFLINE: - break; - } - - return notifier_from_errno(ret); -} - -#endif - -void __init page_cgroup_init(void) -{ - unsigned long pfn; - int nid; - - if (mem_cgroup_disabled()) - return; - - for_each_node_state(nid, N_MEMORY) { - unsigned long start_pfn, end_pfn; - - start_pfn = node_start_pfn(nid); - end_pfn = node_end_pfn(nid); - /* - * start_pfn and end_pfn may not be aligned to SECTION and the - * page->flags of out of node pages are not initialized. So we - * scan [start_pfn, the biggest section's pfn < end_pfn) here. - */ - for (pfn = start_pfn; - pfn < end_pfn; - pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { - - if (!pfn_valid(pfn)) - continue; - /* - * Nodes's pfns can be overlapping. - * We know some arch can have a nodes layout such as - * -------------pfn--------------> - * N0 | N1 | N2 | N0 | N1 | N2|.... 
- */ - if (pfn_to_nid(pfn) != nid) - continue; - if (init_section_page_cgroup(pfn, nid)) - goto oom; - } - } - hotplug_memory_notifier(page_cgroup_callback, 0); - printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); - printk(KERN_INFO "please try 'cgroup_disable=memory' option if you " - "don't want memory cgroups\n"); - return; -oom: - printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); - panic("Out of memory"); -} - -void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) -{ - return; -} - -#endif - - -#ifdef CONFIG_MEMCG_SWAP - -static DEFINE_MUTEX(swap_cgroup_mutex); -struct swap_cgroup_ctrl { - struct page **map; - unsigned long length; - spinlock_t lock; -}; - -static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; - -struct swap_cgroup { - unsigned short id; -}; -#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) - -/* - * SwapCgroup implements "lookup" and "exchange" operations. - * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge - * against SwapCache. At swap_free(), this is accessed directly from swap. - * - * This means, - * - we have no race in "exchange" when we're accessed via SwapCache because - * SwapCache(and its swp_entry) is under lock. - * - When called via swap_free(), there is no user of this entry and no race. - * Then, we don't need lock around "exchange". - * - * TODO: we can push these buffers out to HIGHMEM. - */ - -/* - * allocate buffer for swap_cgroup. - */ -static int swap_cgroup_prepare(int type) -{ - struct page *page; - struct swap_cgroup_ctrl *ctrl; - unsigned long idx, max; - - ctrl = &swap_cgroup_ctrl[type]; - - for (idx = 0; idx < ctrl->length; idx++) { - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) - goto not_enough_page; - ctrl->map[idx] = page; - } - return 0; -not_enough_page: - max = idx; - for (idx = 0; idx < max; idx++) - __free_page(ctrl->map[idx]); - - return -ENOMEM; -} - -static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, - struct swap_cgroup_ctrl **ctrlp) -{ - pgoff_t offset = swp_offset(ent); - struct swap_cgroup_ctrl *ctrl; - struct page *mappage; - struct swap_cgroup *sc; - - ctrl = &swap_cgroup_ctrl[swp_type(ent)]; - if (ctrlp) - *ctrlp = ctrl; - - mappage = ctrl->map[offset / SC_PER_PAGE]; - sc = page_address(mappage); - return sc + offset % SC_PER_PAGE; -} - -/** - * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. - * @ent: swap entry to be cmpxchged - * @old: old id - * @new: new id - * - * Returns old id at success, 0 at failure. - * (There is no mem_cgroup using 0 as its id) - */ -unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, - unsigned short old, unsigned short new) -{ - struct swap_cgroup_ctrl *ctrl; - struct swap_cgroup *sc; - unsigned long flags; - unsigned short retval; - - sc = lookup_swap_cgroup(ent, &ctrl); - - spin_lock_irqsave(&ctrl->lock, flags); - retval = sc->id; - if (retval == old) - sc->id = new; - else - retval = 0; - spin_unlock_irqrestore(&ctrl->lock, flags); - return retval; -} - -/** - * swap_cgroup_record - record mem_cgroup for this swp_entry. - * @ent: swap entry to be recorded into - * @id: mem_cgroup to be recorded - * - * Returns old value at success, 0 at failure. - * (Of course, old value can be 0.) 
- */ -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) -{ - struct swap_cgroup_ctrl *ctrl; - struct swap_cgroup *sc; - unsigned short old; - unsigned long flags; - - sc = lookup_swap_cgroup(ent, &ctrl); - - spin_lock_irqsave(&ctrl->lock, flags); - old = sc->id; - sc->id = id; - spin_unlock_irqrestore(&ctrl->lock, flags); - - return old; -} - -/** - * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry - * @ent: swap entry to be looked up. - * - * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) - */ -unsigned short lookup_swap_cgroup_id(swp_entry_t ent) -{ - return lookup_swap_cgroup(ent, NULL)->id; -} - -int swap_cgroup_swapon(int type, unsigned long max_pages) -{ - void *array; - unsigned long array_size; - unsigned long length; - struct swap_cgroup_ctrl *ctrl; - - if (!do_swap_account) - return 0; - - length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); - array_size = length * sizeof(void *); - - array = vzalloc(array_size); - if (!array) - goto nomem; - - ctrl = &swap_cgroup_ctrl[type]; - mutex_lock(&swap_cgroup_mutex); - ctrl->length = length; - ctrl->map = array; - spin_lock_init(&ctrl->lock); - if (swap_cgroup_prepare(type)) { - /* memory shortage */ - ctrl->map = NULL; - ctrl->length = 0; - mutex_unlock(&swap_cgroup_mutex); - vfree(array); - goto nomem; - } - mutex_unlock(&swap_cgroup_mutex); - - return 0; -nomem: - printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); - printk(KERN_INFO - "swap_cgroup can be disabled by swapaccount=0 boot option\n"); - return -ENOMEM; -} - -void swap_cgroup_swapoff(int type) -{ - struct page **map; - unsigned long i, length; - struct swap_cgroup_ctrl *ctrl; - - if (!do_swap_account) - return; - - mutex_lock(&swap_cgroup_mutex); - ctrl = &swap_cgroup_ctrl[type]; - map = ctrl->map; - length = ctrl->length; - ctrl->map = NULL; - ctrl->length = 0; - mutex_unlock(&swap_cgroup_mutex); - - if (map) { - for (i = 0; i < length; i++) { - struct page *page = map[i]; - if (page) - __free_page(page); - } - vfree(map); - } -} - -#endif diff --git a/mm/page_counter.c b/mm/page_counter.c new file mode 100644 index 000000000000..a009574fbba9 --- /dev/null +++ b/mm/page_counter.c @@ -0,0 +1,192 @@ +/* + * Lockless hierarchical page accounting & limiting + * + * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner + */ + +#include <linux/page_counter.h> +#include <linux/atomic.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/sched.h> +#include <linux/bug.h> +#include <asm/page.h> + +/** + * page_counter_cancel - take pages out of the local counter + * @counter: counter + * @nr_pages: number of pages to cancel + */ +void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) +{ + long new; + + new = atomic_long_sub_return(nr_pages, &counter->count); + /* More uncharges than charges? */ + WARN_ON_ONCE(new < 0); +} + +/** + * page_counter_charge - hierarchically charge pages + * @counter: counter + * @nr_pages: number of pages to charge + * + * NOTE: This does not consider any configured counter limits. + */ +void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) +{ + struct page_counter *c; + + for (c = counter; c; c = c->parent) { + long new; + + new = atomic_long_add_return(nr_pages, &c->count); + /* + * This is indeed racy, but we can live with some + * inaccuracy in the watermark. 
+ */ + if (new > c->watermark) + c->watermark = new; + } +} + +/** + * page_counter_try_charge - try to hierarchically charge pages + * @counter: counter + * @nr_pages: number of pages to charge + * @fail: points first counter to hit its limit, if any + * + * Returns 0 on success, or -ENOMEM and @fail if the counter or one of + * its ancestors has hit its configured limit. + */ +int page_counter_try_charge(struct page_counter *counter, + unsigned long nr_pages, + struct page_counter **fail) +{ + struct page_counter *c; + + for (c = counter; c; c = c->parent) { + long new; + /* + * Charge speculatively to avoid an expensive CAS. If + * a bigger charge fails, it might falsely lock out a + * racing smaller charge and send it into reclaim + * early, but the error is limited to the difference + * between the two sizes, which is less than 2M/4M in + * case of a THP locking out a regular page charge. + * + * The atomic_long_add_return() implies a full memory + * barrier between incrementing the count and reading + * the limit. When racing with page_counter_limit(), + * we either see the new limit or the setter sees the + * counter has changed and retries. + */ + new = atomic_long_add_return(nr_pages, &c->count); + if (new > c->limit) { + atomic_long_sub(nr_pages, &c->count); + /* + * This is racy, but we can live with some + * inaccuracy in the failcnt. + */ + c->failcnt++; + *fail = c; + goto failed; + } + /* + * Just like with failcnt, we can live with some + * inaccuracy in the watermark. + */ + if (new > c->watermark) + c->watermark = new; + } + return 0; + +failed: + for (c = counter; c != *fail; c = c->parent) + page_counter_cancel(c, nr_pages); + + return -ENOMEM; +} + +/** + * page_counter_uncharge - hierarchically uncharge pages + * @counter: counter + * @nr_pages: number of pages to uncharge + */ +void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) +{ + struct page_counter *c; + + for (c = counter; c; c = c->parent) + page_counter_cancel(c, nr_pages); +} + +/** + * page_counter_limit - limit the number of pages allowed + * @counter: counter + * @limit: limit to set + * + * Returns 0 on success, -EBUSY if the current number of pages on the + * counter already exceeds the specified limit. + * + * The caller must serialize invocations on the same counter. + */ +int page_counter_limit(struct page_counter *counter, unsigned long limit) +{ + for (;;) { + unsigned long old; + long count; + + /* + * Update the limit while making sure that it's not + * below the concurrently-changing counter value. + * + * The xchg implies two full memory barriers before + * and after, so the read-swap-read is ordered and + * ensures coherency with page_counter_try_charge(): + * that function modifies the count before checking + * the limit, so if it sees the old limit, we see the + * modified counter and retry. + */ + count = atomic_long_read(&counter->count); + + if (count > limit) + return -EBUSY; + + old = xchg(&counter->limit, limit); + + if (atomic_long_read(&counter->count) <= count) + return 0; + + counter->limit = old; + cond_resched(); + } +} + +/** + * page_counter_memparse - memparse() for page counter limits + * @buf: string to parse + * @nr_pages: returns the result in number of pages + * + * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be + * limited to %PAGE_COUNTER_MAX. 
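For readers new to the page_counter API above, a minimal usage sketch follows. It assumes the page_counter_init() helper from the accompanying include/linux/page_counter.h header (not visible in this hunk); the parent/child counters and the numbers are purely illustrative, not part of the patch.

#include <linux/page_counter.h>

/* Illustrative two-level hierarchy, set up the way a controller would. */
static struct page_counter parent_cnt;
static struct page_counter child_cnt;

static int charge_example(void)
{
	struct page_counter *fail;

	page_counter_init(&parent_cnt, NULL);		/* assumed header helper */
	page_counter_init(&child_cnt, &parent_cnt);

	/* Cap the parent at 512 pages. */
	if (page_counter_limit(&parent_cnt, 512))
		return -EBUSY;

	/*
	 * Charging the child charges every ancestor as well; if any level
	 * is over its limit, *fail points at that counter and the partial
	 * charge has already been rolled back.
	 */
	if (page_counter_try_charge(&child_cnt, 32, &fail))
		return -ENOMEM;

	/* Uncharging walks the same ancestry. */
	page_counter_uncharge(&child_cnt, 32);
	return 0;
}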
+ */ +int page_counter_memparse(const char *buf, unsigned long *nr_pages) +{ + char unlimited[] = "-1"; + char *end; + u64 bytes; + + if (!strncmp(buf, unlimited, sizeof(unlimited))) { + *nr_pages = PAGE_COUNTER_MAX; + return 0; + } + + bytes = memparse(buf, &end); + if (*end != '\0') + return -EINVAL; + + *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); + + return 0; +} diff --git a/mm/page_ext.c b/mm/page_ext.c new file mode 100644 index 000000000000..d86fd2f5353f --- /dev/null +++ b/mm/page_ext.c @@ -0,0 +1,403 @@ +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/bootmem.h> +#include <linux/page_ext.h> +#include <linux/memory.h> +#include <linux/vmalloc.h> +#include <linux/kmemleak.h> +#include <linux/page_owner.h> + +/* + * struct page extension + * + * This is the feature to manage memory for extended data per page. + * + * Until now, we must modify struct page itself to store extra data per page. + * This requires rebuilding the kernel and it is really time consuming process. + * And, sometimes, rebuild is impossible due to third party module dependency. + * At last, enlarging struct page could cause un-wanted system behaviour change. + * + * This feature is intended to overcome above mentioned problems. This feature + * allocates memory for extended data per page in certain place rather than + * the struct page itself. This memory can be accessed by the accessor + * functions provided by this code. During the boot process, it checks whether + * allocation of huge chunk of memory is needed or not. If not, it avoids + * allocating memory at all. With this advantage, we can include this feature + * into the kernel in default and can avoid rebuild and solve related problems. + * + * To help these things to work well, there are two callbacks for clients. One + * is the need callback which is mandatory if user wants to avoid useless + * memory allocation at boot-time. The other is optional, init callback, which + * is used to do proper initialization after memory is allocated. + * + * The need callback is used to decide whether extended memory allocation is + * needed or not. Sometimes users want to deactivate some features in this + * boot and extra memory would be unneccessary. In this case, to avoid + * allocating huge chunk of memory, each clients represent their need of + * extra memory through the need callback. If one of the need callbacks + * returns true, it means that someone needs extra memory so that + * page extension core should allocates memory for page extension. If + * none of need callbacks return true, memory isn't needed at all in this boot + * and page extension core can skip to allocate memory. As result, + * none of memory is wasted. + * + * The init callback is used to do proper initialization after page extension + * is completely initialized. In sparse memory system, extra memory is + * allocated some time later than memmap is allocated. In other words, lifetime + * of memory for page extension isn't same with memmap for struct page. + * Therefore, clients can't store extra data until page extension is + * initialized, even if pages are allocated and used freely. This could + * cause inadequate state of extra data per page, so, to prevent it, client + * can utilize this callback to initialize the state of it correctly. 
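The need/init contract described in this comment is easiest to see from a client's point of view. A minimal, hypothetical client might look like the sketch below (the "foo" names are invented; a real client, such as page_owner later in this patch, must also be listed in the page_ext_ops[] array that follows):

#include <linux/page_ext.h>

static bool foo_enabled;	/* e.g. set from an early_param() */

/* Called at boot: return true only if the extra per-page memory is needed. */
static bool need_foo(void)
{
	return foo_enabled;
}

/* Called once the page_ext arrays are fully allocated. */
static void init_foo(void)
{
	/* safe to start recording per-page state from here on */
}

struct page_ext_operations foo_ops = {
	.need = need_foo,
	.init = init_foo,
};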
+ */ + +static struct page_ext_operations *page_ext_ops[] = { + &debug_guardpage_ops, +#ifdef CONFIG_PAGE_POISONING + &page_poisoning_ops, +#endif +#ifdef CONFIG_PAGE_OWNER + &page_owner_ops, +#endif +}; + +static unsigned long total_usage; + +static bool __init invoke_need_callbacks(void) +{ + int i; + int entries = ARRAY_SIZE(page_ext_ops); + + for (i = 0; i < entries; i++) { + if (page_ext_ops[i]->need && page_ext_ops[i]->need()) + return true; + } + + return false; +} + +static void __init invoke_init_callbacks(void) +{ + int i; + int entries = ARRAY_SIZE(page_ext_ops); + + for (i = 0; i < entries; i++) { + if (page_ext_ops[i]->init) + page_ext_ops[i]->init(); + } +} + +#if !defined(CONFIG_SPARSEMEM) + + +void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) +{ + pgdat->node_page_ext = NULL; +} + +struct page_ext *lookup_page_ext(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + unsigned long offset; + struct page_ext *base; + + base = NODE_DATA(page_to_nid(page))->node_page_ext; +#ifdef CONFIG_DEBUG_VM + /* + * The sanity checks the page allocator does upon freeing a + * page can reach here before the page_ext arrays are + * allocated when feeding a range of pages to the allocator + * for the first time during bootup or memory hotplug. + */ + if (unlikely(!base)) + return NULL; +#endif + offset = pfn - round_down(node_start_pfn(page_to_nid(page)), + MAX_ORDER_NR_PAGES); + return base + offset; +} + +static int __init alloc_node_page_ext(int nid) +{ + struct page_ext *base; + unsigned long table_size; + unsigned long nr_pages; + + nr_pages = NODE_DATA(nid)->node_spanned_pages; + if (!nr_pages) + return 0; + + /* + * Need extra space if node range is not aligned with + * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm + * checks buddy's status, range could be out of exact node range. + */ + if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) || + !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) + nr_pages += MAX_ORDER_NR_PAGES; + + table_size = sizeof(struct page_ext) * nr_pages; + + base = memblock_virt_alloc_try_nid_nopanic( + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nid); + if (!base) + return -ENOMEM; + NODE_DATA(nid)->node_page_ext = base; + total_usage += table_size; + return 0; +} + +void __init page_ext_init_flatmem(void) +{ + + int nid, fail; + + if (!invoke_need_callbacks()) + return; + + for_each_online_node(nid) { + fail = alloc_node_page_ext(nid); + if (fail) + goto fail; + } + pr_info("allocated %ld bytes of page_ext\n", total_usage); + invoke_init_callbacks(); + return; + +fail: + pr_crit("allocation of page_ext failed.\n"); + panic("Out of memory"); +} + +#else /* CONFIG_FLAT_NODE_MEM_MAP */ + +struct page_ext *lookup_page_ext(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + struct mem_section *section = __pfn_to_section(pfn); +#ifdef CONFIG_DEBUG_VM + /* + * The sanity checks the page allocator does upon freeing a + * page can reach here before the page_ext arrays are + * allocated when feeding a range of pages to the allocator + * for the first time during bootup or memory hotplug. 
+ */ + if (!section->page_ext) + return NULL; +#endif + return section->page_ext + pfn; +} + +static void *__meminit alloc_page_ext(size_t size, int nid) +{ + gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; + void *addr = NULL; + + addr = alloc_pages_exact_nid(nid, size, flags); + if (addr) { + kmemleak_alloc(addr, size, 1, flags); + return addr; + } + + if (node_state(nid, N_HIGH_MEMORY)) + addr = vzalloc_node(size, nid); + else + addr = vzalloc(size); + + return addr; +} + +static int __meminit init_section_page_ext(unsigned long pfn, int nid) +{ + struct mem_section *section; + struct page_ext *base; + unsigned long table_size; + + section = __pfn_to_section(pfn); + + if (section->page_ext) + return 0; + + table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; + base = alloc_page_ext(table_size, nid); + + /* + * The value stored in section->page_ext is (base - pfn) + * and it does not point to the memory block allocated above, + * causing kmemleak false positives. + */ + kmemleak_not_leak(base); + + if (!base) { + pr_err("page ext allocation failure\n"); + return -ENOMEM; + } + + /* + * The passed "pfn" may not be aligned to SECTION. For the calculation + * we need to apply a mask. + */ + pfn &= PAGE_SECTION_MASK; + section->page_ext = base - pfn; + total_usage += table_size; + return 0; +} +#ifdef CONFIG_MEMORY_HOTPLUG +static void free_page_ext(void *addr) +{ + if (is_vmalloc_addr(addr)) { + vfree(addr); + } else { + struct page *page = virt_to_page(addr); + size_t table_size; + + table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; + + BUG_ON(PageReserved(page)); + free_pages_exact(addr, table_size); + } +} + +static void __free_page_ext(unsigned long pfn) +{ + struct mem_section *ms; + struct page_ext *base; + + ms = __pfn_to_section(pfn); + if (!ms || !ms->page_ext) + return; + base = ms->page_ext + pfn; + free_page_ext(base); + ms->page_ext = NULL; +} + +static int __meminit online_page_ext(unsigned long start_pfn, + unsigned long nr_pages, + int nid) +{ + unsigned long start, end, pfn; + int fail = 0; + + start = SECTION_ALIGN_DOWN(start_pfn); + end = SECTION_ALIGN_UP(start_pfn + nr_pages); + + if (nid == -1) { + /* + * In this case, "nid" already exists and contains valid memory. + * "start_pfn" passed to us is a pfn which is an arg for + * online__pages(), and start_pfn should exist. 
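Returning to init_section_page_ext() above, the (base - pfn) value it stores is worth unpacking, because lookup_page_ext() simply adds the full pfn back. A worked example with made-up numbers:

/*
 * Say a section spans pfns 0x48000..0x4ffff and its table of
 * struct page_ext entries was allocated at base.  After masking,
 * init_section_page_ext() stores:
 *
 *	section->page_ext = base - 0x48000;
 *
 * so for any pfn in the section, e.g. 0x48123, the lookup becomes
 *
 *	section->page_ext + 0x48123 == base + 0x123,
 *
 * i.e. the 0x123rd entry of the table (pointer arithmetic is in units
 * of struct page_ext).  The lookup path is then a single addition,
 * with no subtraction of the section's start pfn on every call.
 */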
+ */ + nid = pfn_to_nid(start_pfn); + VM_BUG_ON(!node_state(nid, N_ONLINE)); + } + + for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { + if (!pfn_present(pfn)) + continue; + fail = init_section_page_ext(pfn, nid); + } + if (!fail) + return 0; + + /* rollback */ + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __free_page_ext(pfn); + + return -ENOMEM; +} + +static int __meminit offline_page_ext(unsigned long start_pfn, + unsigned long nr_pages, int nid) +{ + unsigned long start, end, pfn; + + start = SECTION_ALIGN_DOWN(start_pfn); + end = SECTION_ALIGN_UP(start_pfn + nr_pages); + + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __free_page_ext(pfn); + return 0; + +} + +static int __meminit page_ext_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mn = arg; + int ret = 0; + + switch (action) { + case MEM_GOING_ONLINE: + ret = online_page_ext(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; + case MEM_OFFLINE: + offline_page_ext(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; + case MEM_CANCEL_ONLINE: + offline_page_ext(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; + case MEM_GOING_OFFLINE: + break; + case MEM_ONLINE: + case MEM_CANCEL_OFFLINE: + break; + } + + return notifier_from_errno(ret); +} + +#endif + +void __init page_ext_init(void) +{ + unsigned long pfn; + int nid; + + if (!invoke_need_callbacks()) + return; + + for_each_node_state(nid, N_MEMORY) { + unsigned long start_pfn, end_pfn; + + start_pfn = node_start_pfn(nid); + end_pfn = node_end_pfn(nid); + /* + * start_pfn and end_pfn may not be aligned to SECTION and the + * page->flags of out of node pages are not initialized. So we + * scan [start_pfn, the biggest section's pfn < end_pfn) here. + */ + for (pfn = start_pfn; pfn < end_pfn; + pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { + + if (!pfn_valid(pfn)) + continue; + /* + * Nodes's pfns can be overlapping. + * We know some arch can have a nodes layout such as + * -------------pfn--------------> + * N0 | N1 | N2 | N0 | N1 | N2|.... 
+ */ + if (pfn_to_nid(pfn) != nid) + continue; + if (init_section_page_ext(pfn, nid)) + goto oom; + } + } + hotplug_memory_notifier(page_ext_callback, 0); + pr_info("allocated %ld bytes of page_ext\n", total_usage); + invoke_init_callbacks(); + return; + +oom: + panic("Out of memory"); +} + +void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) +{ +} + +#endif diff --git a/mm/page_isolation.c b/mm/page_isolation.c index d1473b2e9481..72f5ac381ab3 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -60,6 +60,7 @@ out: int migratetype = get_pageblock_migratetype(page); set_pageblock_migratetype(page, MIGRATE_ISOLATE); + zone->nr_isolate_pageblock++; nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); __mod_zone_freepage_state(zone, -nr_pages, migratetype); @@ -67,7 +68,7 @@ out: spin_unlock_irqrestore(&zone->lock, flags); if (!ret) - drain_all_pages(); + drain_all_pages(zone); return ret; } @@ -75,16 +76,54 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) { struct zone *zone; unsigned long flags, nr_pages; + struct page *isolated_page = NULL; + unsigned int order; + unsigned long page_idx, buddy_idx; + struct page *buddy; zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) goto out; - nr_pages = move_freepages_block(zone, page, migratetype); - __mod_zone_freepage_state(zone, nr_pages, migratetype); + + /* + * Because freepage with more than pageblock_order on isolated + * pageblock is restricted to merge due to freepage counting problem, + * it is possible that there is free buddy page. + * move_freepages_block() doesn't care of merge so we need other + * approach in order to merge them. Isolation and free will make + * these pages to be merged. + */ + if (PageBuddy(page)) { + order = page_order(page); + if (order >= pageblock_order) { + page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); + + if (!is_migrate_isolate_page(buddy)) { + __isolate_free_page(page, order); + set_page_refcounted(page); + isolated_page = page; + } + } + } + + /* + * If we isolate freepage with more than pageblock_order, there + * should be no freepage in the range, so we could avoid costly + * pageblock scanning for freepage moving. 
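The buddy arithmetic in unset_migratetype_isolate() above is terse; __find_buddy_index() (defined in mm/internal.h) is essentially an XOR of the order bit at this point in the tree, so the computation expands to something like the following worked example:

/*
 * With MAX_ORDER = 11 and a free page of order 10:
 *
 *	page_idx  = page_to_pfn(page) & ((1 << 11) - 1);   // low 11 bits
 *	buddy_idx = page_idx ^ (1 << 10);                   // flip the order bit
 *	buddy     = page + (buddy_idx - page_idx);
 *
 * A block whose page_idx is 0 therefore has its buddy at index 0x400,
 * and a block at index 0x400 finds its buddy back at 0, so the pointer
 * moves forward or backward by half a MAX_ORDER block as appropriate.
 */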
+ */ + if (!isolated_page) { + nr_pages = move_freepages_block(zone, page, migratetype); + __mod_zone_freepage_state(zone, nr_pages, migratetype); + } set_pageblock_migratetype(page, migratetype); + zone->nr_isolate_pageblock--; out: spin_unlock_irqrestore(&zone->lock, flags); + if (isolated_page) + __free_pages(isolated_page, order); } static inline struct page * diff --git a/mm/page_owner.c b/mm/page_owner.c new file mode 100644 index 000000000000..9ab4a9b5bc09 --- /dev/null +++ b/mm/page_owner.c @@ -0,0 +1,311 @@ +#include <linux/debugfs.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/bootmem.h> +#include <linux/stacktrace.h> +#include <linux/page_owner.h> +#include "internal.h" + +static bool page_owner_disabled = true; +bool page_owner_inited __read_mostly; + +static void init_early_allocated_pages(void); + +static int early_page_owner_param(char *buf) +{ + if (!buf) + return -EINVAL; + + if (strcmp(buf, "on") == 0) + page_owner_disabled = false; + + return 0; +} +early_param("page_owner", early_page_owner_param); + +static bool need_page_owner(void) +{ + if (page_owner_disabled) + return false; + + return true; +} + +static void init_page_owner(void) +{ + if (page_owner_disabled) + return; + + page_owner_inited = true; + init_early_allocated_pages(); +} + +struct page_ext_operations page_owner_ops = { + .need = need_page_owner, + .init = init_page_owner, +}; + +void __reset_page_owner(struct page *page, unsigned int order) +{ + int i; + struct page_ext *page_ext; + + for (i = 0; i < (1 << order); i++) { + page_ext = lookup_page_ext(page + i); + __clear_bit(PAGE_EXT_OWNER, &page_ext->flags); + } +} + +void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) +{ + struct page_ext *page_ext; + struct stack_trace *trace; + + page_ext = lookup_page_ext(page); + + trace = &page_ext->trace; + trace->nr_entries = 0; + trace->max_entries = ARRAY_SIZE(page_ext->trace_entries); + trace->entries = &page_ext->trace_entries[0]; + trace->skip = 3; + save_stack_trace(&page_ext->trace); + + page_ext->order = order; + page_ext->gfp_mask = gfp_mask; + + __set_bit(PAGE_EXT_OWNER, &page_ext->flags); +} + +static ssize_t +print_page_owner(char __user *buf, size_t count, unsigned long pfn, + struct page *page, struct page_ext *page_ext) +{ + int ret; + int pageblock_mt, page_mt; + char *kbuf; + + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + ret = snprintf(kbuf, count, + "Page allocated via order %u, mask 0x%x\n", + page_ext->order, page_ext->gfp_mask); + + if (ret >= count) + goto err; + + /* Print information relevant to grouping pages by mobility */ + pageblock_mt = get_pfnblock_migratetype(page, pfn); + page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); + ret += snprintf(kbuf + ret, count - ret, + "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n", + pfn, + pfn >> pageblock_order, + pageblock_mt, + pageblock_mt != page_mt ? "Fallback" : " ", + PageLocked(page) ? "K" : " ", + PageError(page) ? "E" : " ", + PageReferenced(page) ? "R" : " ", + PageUptodate(page) ? "U" : " ", + PageDirty(page) ? "D" : " ", + PageLRU(page) ? "L" : " ", + PageActive(page) ? "A" : " ", + PageSlab(page) ? "S" : " ", + PageWriteback(page) ? "W" : " ", + PageCompound(page) ? "C" : " ", + PageSwapCache(page) ? "B" : " ", + PageMappedToDisk(page) ? 
"M" : " "); + + if (ret >= count) + goto err; + + ret += snprint_stack_trace(kbuf + ret, count - ret, + &page_ext->trace, 0); + if (ret >= count) + goto err; + + ret += snprintf(kbuf + ret, count - ret, "\n"); + if (ret >= count) + goto err; + + if (copy_to_user(buf, kbuf, ret)) + ret = -EFAULT; + + kfree(kbuf); + return ret; + +err: + kfree(kbuf); + return -ENOMEM; +} + +static ssize_t +read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + unsigned long pfn; + struct page *page; + struct page_ext *page_ext; + + if (!page_owner_inited) + return -EINVAL; + + page = NULL; + pfn = min_low_pfn + *ppos; + + /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */ + while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) + pfn++; + + drain_all_pages(NULL); + + /* Find an allocated page */ + for (; pfn < max_pfn; pfn++) { + /* + * If the new page is in a new MAX_ORDER_NR_PAGES area, + * validate the area as existing, skip it if not + */ + if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) { + pfn += MAX_ORDER_NR_PAGES - 1; + continue; + } + + /* Check for holes within a MAX_ORDER area */ + if (!pfn_valid_within(pfn)) + continue; + + page = pfn_to_page(pfn); + if (PageBuddy(page)) { + unsigned long freepage_order = page_order_unsafe(page); + + if (freepage_order < MAX_ORDER) + pfn += (1UL << freepage_order) - 1; + continue; + } + + page_ext = lookup_page_ext(page); + + /* + * Some pages could be missed by concurrent allocation or free, + * because we don't hold the zone lock. + */ + if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) + continue; + + /* Record the next PFN to read in the file offset */ + *ppos = (pfn - min_low_pfn) + 1; + + return print_page_owner(buf, count, pfn, page, page_ext); + } + + return 0; +} + +static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) +{ + struct page *page; + struct page_ext *page_ext; + unsigned long pfn = zone->zone_start_pfn, block_end_pfn; + unsigned long end_pfn = pfn + zone->spanned_pages; + unsigned long count = 0; + + /* Scan block by block. First and last block may be incomplete */ + pfn = zone->zone_start_pfn; + + /* + * Walk the zone in pageblock_nr_pages steps. If a page block spans + * a zone boundary, it will be double counted between zones. This does + * not matter as the mixed block count will still be correct + */ + for (; pfn < end_pfn; ) { + if (!pfn_valid(pfn)) { + pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + continue; + } + + block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = min(block_end_pfn, end_pfn); + + page = pfn_to_page(pfn); + + for (; pfn < block_end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) + continue; + + page = pfn_to_page(pfn); + + /* + * We are safe to check buddy flag and order, because + * this is init stage and only single thread runs. 
+ */ + if (PageBuddy(page)) { + pfn += (1UL << page_order(page)) - 1; + continue; + } + + if (PageReserved(page)) + continue; + + page_ext = lookup_page_ext(page); + + /* Maybe overraping zone */ + if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) + continue; + + /* Found early allocated page */ + set_page_owner(page, 0, 0); + count++; + } + } + + pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n", + pgdat->node_id, zone->name, count); +} + +static void init_zones_in_node(pg_data_t *pgdat) +{ + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!populated_zone(zone)) + continue; + + spin_lock_irqsave(&zone->lock, flags); + init_pages_in_zone(pgdat, zone); + spin_unlock_irqrestore(&zone->lock, flags); + } +} + +static void init_early_allocated_pages(void) +{ + pg_data_t *pgdat; + + drain_all_pages(NULL); + for_each_online_pgdat(pgdat) + init_zones_in_node(pgdat); +} + +static const struct file_operations proc_page_owner_operations = { + .read = read_page_owner, +}; + +static int __init pageowner_init(void) +{ + struct dentry *dentry; + + if (!page_owner_inited) { + pr_info("page_owner is disabled\n"); + return 0; + } + + dentry = debugfs_create_file("page_owner", S_IRUSR, NULL, + NULL, &proc_page_owner_operations); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + return 0; +} +module_init(pageowner_init) diff --git a/mm/percpu.c b/mm/percpu.c index 014bab65e0ff..d39e2f4e335c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1591,7 +1591,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (cpu == NR_CPUS) continue; - PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids); + PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids); PCPU_SETUP_BUG_ON(!cpu_possible(cpu)); PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX); diff --git a/mm/rmap.c b/mm/rmap.c index 1e542744e131..c5bc241127b2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -23,7 +23,7 @@ * inode->i_mutex (while writing or truncating, not reading or faulting) * mm->mmap_sem * page->flags PG_locked (lock_page) - * mapping->i_mmap_mutex + * mapping->i_mmap_rwsem * anon_vma->rwsem * mm->page_table_lock or pte_lock * zone->lru_lock (in mark_page_accessed, isolate_lru_page) @@ -274,6 +274,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) { struct anon_vma_chain *avc; struct anon_vma *anon_vma; + int error; /* Don't bother if the parent process has no anon_vma here. */ if (!pvma->anon_vma) @@ -283,8 +284,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) * First, attach the new VMA to the parent VMA's anon_vmas, * so rmap can find non-COWed pages in child processes. */ - if (anon_vma_clone(vma, pvma)) - return -ENOMEM; + error = anon_vma_clone(vma, pvma); + if (error) + return error; /* Then add our own anon_vma. 
*/ anon_vma = anon_vma_alloc(); @@ -1052,7 +1054,7 @@ void page_add_file_rmap(struct page *page) __inc_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); } - mem_cgroup_end_page_stat(memcg, locked, flags); + mem_cgroup_end_page_stat(memcg, &locked, &flags); } static void page_remove_file_rmap(struct page *page) @@ -1082,7 +1084,7 @@ static void page_remove_file_rmap(struct page *page) if (unlikely(PageMlocked(page))) clear_page_mlock(page); out: - mem_cgroup_end_page_stat(memcg, locked, flags); + mem_cgroup_end_page_stat(memcg, &locked, &flags); } /** @@ -1259,7 +1261,7 @@ out_mlock: /* * We need mmap_sem locking, Otherwise VM_LOCKED check makes * unstable result and race. Plus, We can't wait here because - * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. + * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem. * if trylock failed, the page remain in evictable lru and later * vmscan could retry to move the page to unevictable lru if the * page is actually mlocked. @@ -1379,7 +1381,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush(vma, address, pte); + pteval = ptep_clear_flush_notify(vma, address, pte); /* If nonlinear, store the file page offset in the pte. */ if (page->index != linear_page_index(vma, address)) { @@ -1634,7 +1636,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; - pgoff_t pgoff = page_to_pgoff(page); + pgoff_t pgoff; struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1642,6 +1644,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) if (!anon_vma) return ret; + pgoff = page_to_pgoff(page); anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); @@ -1675,7 +1678,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) { struct address_space *mapping = page->mapping; - pgoff_t pgoff = page_to_pgoff(page); + pgoff_t pgoff; struct vm_area_struct *vma; int ret = SWAP_AGAIN; @@ -1683,13 +1686,15 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) * The page lock not only makes sure that page->mapping cannot * suddenly be NULLified by truncation, it makes sure that the * structure at mapping cannot be freed and reused yet, - * so we can safely take mapping->i_mmap_mutex. + * so we can safely take mapping->i_mmap_rwsem. 
*/ VM_BUG_ON_PAGE(!PageLocked(page), page); if (!mapping) return ret; - mutex_lock(&mapping->i_mmap_mutex); + + pgoff = page_to_pgoff(page); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); @@ -1710,9 +1715,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) goto done; ret = rwc->file_nonlinear(page, mapping, rwc->arg); - done: - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_read(mapping); return ret; } diff --git a/mm/shmem.c b/mm/shmem.c index 185836ba53ef..73ba1df7c8ba 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1536,7 +1536,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * holes of a sparse file, we actually need to allocate those pages, * and even mark them dirty, so it cannot exceed the max_blocks limit. */ - if (segment_eq(get_fs(), KERNEL_DS)) + if (!iter_is_iovec(to)) sgp = SGP_DIRTY; index = *ppos >> PAGE_CACHE_SHIFT; diff --git a/mm/slab.c b/mm/slab.c index eb2b2ea30130..65b5dcb6f671 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2590,7 +2590,10 @@ static int cache_grow(struct kmem_cache *cachep, * Be lazy and only check for valid flags here, keeping it out of the * critical path in kmem_cache_alloc(). */ - BUG_ON(flags & GFP_SLAB_BUG_MASK); + if (unlikely(flags & GFP_SLAB_BUG_MASK)) { + pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); + BUG(); + } local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); /* Take the node list lock to change the colour_next on this node */ @@ -3012,7 +3015,7 @@ retry: for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { nid = zone_to_nid(zone); - if (cpuset_zone_allowed_hardwall(zone, flags) && + if (cpuset_zone_allowed(zone, flags) && get_node(cache, nid) && get_node(cache, nid)->free_objects) { obj = ____cache_alloc_node(cache, @@ -3076,7 +3079,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, void *obj; int x; - VM_BUG_ON(nodeid > num_online_nodes()); + VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); n = get_node(cachep, nodeid); BUG_ON(!n); @@ -3179,6 +3182,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, memset(ptr, 0, cachep->object_size); } + memcg_kmem_put_cache(cachep); return ptr; } @@ -3244,6 +3248,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) memset(objp, 0, cachep->object_size); } + memcg_kmem_put_cache(cachep); return objp; } @@ -3580,11 +3585,11 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) for_each_online_node(node) { - if (use_alien_caches) { - new_alien = alloc_alien_cache(node, cachep->limit, gfp); - if (!new_alien) - goto fail; - } + if (use_alien_caches) { + new_alien = alloc_alien_cache(node, cachep->limit, gfp); + if (!new_alien) + goto fail; + } new_shared = NULL; if (cachep->shared) { @@ -4043,12 +4048,6 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, #ifdef CONFIG_DEBUG_SLAB_LEAK -static void *leaks_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&slab_mutex); - return seq_list_start(&slab_caches, *pos); -} - static inline int add_caller(unsigned long *n, unsigned long v) { unsigned long *p; @@ -4170,7 +4169,7 @@ static int leaks_show(struct seq_file *m, void *p) } static const struct seq_operations slabstats_op = { - .start = leaks_start, + .start = slab_start, .next = slab_next, .stop = slab_stop, .show = leaks_show, diff --git a/mm/slab.h b/mm/slab.h index ab019e63e3c2..1cf4005482dd 100644 --- a/mm/slab.h 
+++ b/mm/slab.h @@ -209,15 +209,15 @@ cache_from_memcg_idx(struct kmem_cache *s, int idx) rcu_read_lock(); params = rcu_dereference(s->memcg_params); - cachep = params->memcg_caches[idx]; - rcu_read_unlock(); /* * Make sure we will access the up-to-date value. The code updating * memcg_caches issues a write barrier to match this (see * memcg_register_cache()). */ - smp_read_barrier_depends(); + cachep = lockless_dereference(params->memcg_caches[idx]); + rcu_read_unlock(); + return cachep; } @@ -357,7 +357,9 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) #endif +void *slab_start(struct seq_file *m, loff_t *pos); void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); +int memcg_slab_show(struct seq_file *m, void *p); #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 406944207b61..e03dd6f2a272 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -240,7 +240,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, size = ALIGN(size, align); flags = kmem_cache_flags(size, flags, name, NULL); - list_for_each_entry(s, &slab_caches, list) { + list_for_each_entry_reverse(s, &slab_caches, list) { if (slab_unmergeable(s)) continue; @@ -259,6 +259,10 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, if (s->size - size >= sizeof(void *)) continue; + if (IS_ENABLED(CONFIG_SLAB) && align && + (align > s->align || s->align % align)) + continue; + return s; } return NULL; @@ -807,7 +811,7 @@ EXPORT_SYMBOL(kmalloc_order_trace); #define SLABINFO_RIGHTS S_IRUSR #endif -void print_slabinfo_header(struct seq_file *m) +static void print_slabinfo_header(struct seq_file *m) { /* * Output format version, so at least we can change it @@ -830,14 +834,9 @@ void print_slabinfo_header(struct seq_file *m) seq_putc(m, '\n'); } -static void *s_start(struct seq_file *m, loff_t *pos) +void *slab_start(struct seq_file *m, loff_t *pos) { - loff_t n = *pos; - mutex_lock(&slab_mutex); - if (!n) - print_slabinfo_header(m); - return seq_list_start(&slab_caches, *pos); } @@ -877,7 +876,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) } } -int cache_show(struct kmem_cache *s, struct seq_file *m) +static void cache_show(struct kmem_cache *s, struct seq_file *m) { struct slabinfo sinfo; @@ -896,17 +895,32 @@ int cache_show(struct kmem_cache *s, struct seq_file *m) sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); slabinfo_show_stats(m, s); seq_putc(m, '\n'); +} + +static int slab_show(struct seq_file *m, void *p) +{ + struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + + if (p == slab_caches.next) + print_slabinfo_header(m); + if (is_root_cache(s)) + cache_show(s, m); return 0; } -static int s_show(struct seq_file *m, void *p) +#ifdef CONFIG_MEMCG_KMEM +int memcg_slab_show(struct seq_file *m, void *p) { struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - if (!is_root_cache(s)) - return 0; - return cache_show(s, m); + if (p == slab_caches.next) + print_slabinfo_header(m); + if (!is_root_cache(s) && s->memcg_params->memcg == memcg) + cache_show(s, m); + return 0; } +#endif /* * slabinfo_op - iterator that generates /proc/slabinfo @@ -922,10 +936,10 @@ static int s_show(struct seq_file *m, void *p) * + further values on SMP and with statistics enabled */ static const struct seq_operations slabinfo_op = { - .start = s_start, + .start = slab_start, .next = slab_next, .stop = 
slab_stop, - .show = s_show, + .show = slab_show, }; static int slabinfo_open(struct inode *inode, struct file *file) diff --git a/mm/slub.c b/mm/slub.c index ae7b9f1ad394..fe376fe1f4fe 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -849,12 +849,12 @@ static int check_slab(struct kmem_cache *s, struct page *page) maxobj = order_objects(compound_order(page), s->size, s->reserved); if (page->objects > maxobj) { slab_err(s, page, "objects %u > max %u", - s->name, page->objects, maxobj); + page->objects, maxobj); return 0; } if (page->inuse > page->objects) { slab_err(s, page, "inuse %u > max %u", - s->name, page->inuse, page->objects); + page->inuse, page->objects); return 0; } /* Slab_pad_check fixes things up after itself */ @@ -871,7 +871,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) int nr = 0; void *fp; void *object = NULL; - unsigned long max_objects; + int max_objects; fp = page->freelist; while (fp && nr <= page->objects) { @@ -1233,13 +1233,17 @@ static inline void kfree_hook(const void *x) kmemleak_free(x); } -static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) +static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + gfp_t flags) { flags &= gfp_allowed_mask; lockdep_trace_alloc(flags); might_sleep_if(flags & __GFP_WAIT); - return should_failslab(s->object_size, flags, s->flags); + if (should_failslab(s->object_size, flags, s->flags)) + return NULL; + + return memcg_kmem_get_cache(s, flags); } static inline void slab_post_alloc_hook(struct kmem_cache *s, @@ -1248,6 +1252,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, flags &= gfp_allowed_mask; kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); + memcg_kmem_put_cache(s); } static inline void slab_free_hook(struct kmem_cache *s, void *x) @@ -1377,7 +1382,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) int order; int idx; - BUG_ON(flags & GFP_SLAB_BUG_MASK); + if (unlikely(flags & GFP_SLAB_BUG_MASK)) { + pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); + BUG(); + } page = allocate_slab(s, flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); @@ -1662,7 +1670,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, n = get_node(s, zone_to_nid(zone)); - if (n && cpuset_zone_allowed_hardwall(zone, flags) && + if (n && cpuset_zone_allowed(zone, flags) && n->nr_partial > s->min_partial) { object = get_partial_node(s, n, c, flags); if (object) { @@ -2380,10 +2388,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct page *page; unsigned long tid; - if (slab_pre_alloc_hook(s, gfpflags)) + s = slab_pre_alloc_hook(s, gfpflags); + if (!s) return NULL; - - s = memcg_kmem_get_cache(s, gfpflags); redo: /* * Must read kmem_cache cpu data via this cpu ptr. Preemption is @@ -2554,7 +2561,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, } else { /* Needs to be taken off a list */ - n = get_node(s, page_to_nid(page)); + n = get_node(s, page_to_nid(page)); /* * Speculatively acquire the list_lock. * If the cmpxchg does not succeed then we may @@ -2587,10 +2594,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * The list lock was not taken therefore no list * activity can be necessary. 
*/ - if (was_frozen) - stat(s, FREE_FROZEN); - return; - } + if (was_frozen) + stat(s, FREE_FROZEN); + return; + } if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) goto slab_empty; diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c new file mode 100644 index 000000000000..b5f7f24b8dd1 --- /dev/null +++ b/mm/swap_cgroup.c @@ -0,0 +1,208 @@ +#include <linux/swap_cgroup.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> + +#include <linux/swapops.h> /* depends on mm.h include */ + +static DEFINE_MUTEX(swap_cgroup_mutex); +struct swap_cgroup_ctrl { + struct page **map; + unsigned long length; + spinlock_t lock; +}; + +static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; + +struct swap_cgroup { + unsigned short id; +}; +#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) + +/* + * SwapCgroup implements "lookup" and "exchange" operations. + * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge + * against SwapCache. At swap_free(), this is accessed directly from swap. + * + * This means, + * - we have no race in "exchange" when we're accessed via SwapCache because + * SwapCache(and its swp_entry) is under lock. + * - When called via swap_free(), there is no user of this entry and no race. + * Then, we don't need lock around "exchange". + * + * TODO: we can push these buffers out to HIGHMEM. + */ + +/* + * allocate buffer for swap_cgroup. + */ +static int swap_cgroup_prepare(int type) +{ + struct page *page; + struct swap_cgroup_ctrl *ctrl; + unsigned long idx, max; + + ctrl = &swap_cgroup_ctrl[type]; + + for (idx = 0; idx < ctrl->length; idx++) { + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto not_enough_page; + ctrl->map[idx] = page; + } + return 0; +not_enough_page: + max = idx; + for (idx = 0; idx < max; idx++) + __free_page(ctrl->map[idx]); + + return -ENOMEM; +} + +static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, + struct swap_cgroup_ctrl **ctrlp) +{ + pgoff_t offset = swp_offset(ent); + struct swap_cgroup_ctrl *ctrl; + struct page *mappage; + struct swap_cgroup *sc; + + ctrl = &swap_cgroup_ctrl[swp_type(ent)]; + if (ctrlp) + *ctrlp = ctrl; + + mappage = ctrl->map[offset / SC_PER_PAGE]; + sc = page_address(mappage); + return sc + offset % SC_PER_PAGE; +} + +/** + * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. + * @ent: swap entry to be cmpxchged + * @old: old id + * @new: new id + * + * Returns old id at success, 0 at failure. + * (There is no mem_cgroup using 0 as its id) + */ +unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, + unsigned short old, unsigned short new) +{ + struct swap_cgroup_ctrl *ctrl; + struct swap_cgroup *sc; + unsigned long flags; + unsigned short retval; + + sc = lookup_swap_cgroup(ent, &ctrl); + + spin_lock_irqsave(&ctrl->lock, flags); + retval = sc->id; + if (retval == old) + sc->id = new; + else + retval = 0; + spin_unlock_irqrestore(&ctrl->lock, flags); + return retval; +} + +/** + * swap_cgroup_record - record mem_cgroup for this swp_entry. + * @ent: swap entry to be recorded into + * @id: mem_cgroup to be recorded + * + * Returns old value at success, 0 at failure. + * (Of course, old value can be 0.) 
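The swap_cgroup interface carried over into this new file is small: one unsigned short id slot per swap entry, valid between swap_cgroup_swapon() and swap_cgroup_swapoff() for that swap type, and only when swap accounting is enabled. A sketch of the calling pattern (the helpers are illustrative; the real callers sit in the memcg charge/uncharge paths):

#include <linux/bug.h>
#include <linux/swap.h>
#include <linux/swap_cgroup.h>

/* Record which memcg (by id) owns a page that is being swapped out. */
static void note_swapout(swp_entry_t entry, unsigned short memcg_id)
{
	unsigned short old;

	old = swap_cgroup_record(entry, memcg_id);
	WARN_ON(old);	/* the slot should have been free, i.e. id 0 */
}

/* Later, e.g. at swap-in or swap_free(), look the owner up again. */
static unsigned short swap_owner(swp_entry_t entry)
{
	return lookup_swap_cgroup_id(entry);	/* 0 means no owner */
}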
+ */ +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) +{ + struct swap_cgroup_ctrl *ctrl; + struct swap_cgroup *sc; + unsigned short old; + unsigned long flags; + + sc = lookup_swap_cgroup(ent, &ctrl); + + spin_lock_irqsave(&ctrl->lock, flags); + old = sc->id; + sc->id = id; + spin_unlock_irqrestore(&ctrl->lock, flags); + + return old; +} + +/** + * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry + * @ent: swap entry to be looked up. + * + * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) + */ +unsigned short lookup_swap_cgroup_id(swp_entry_t ent) +{ + return lookup_swap_cgroup(ent, NULL)->id; +} + +int swap_cgroup_swapon(int type, unsigned long max_pages) +{ + void *array; + unsigned long array_size; + unsigned long length; + struct swap_cgroup_ctrl *ctrl; + + if (!do_swap_account) + return 0; + + length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); + array_size = length * sizeof(void *); + + array = vzalloc(array_size); + if (!array) + goto nomem; + + ctrl = &swap_cgroup_ctrl[type]; + mutex_lock(&swap_cgroup_mutex); + ctrl->length = length; + ctrl->map = array; + spin_lock_init(&ctrl->lock); + if (swap_cgroup_prepare(type)) { + /* memory shortage */ + ctrl->map = NULL; + ctrl->length = 0; + mutex_unlock(&swap_cgroup_mutex); + vfree(array); + goto nomem; + } + mutex_unlock(&swap_cgroup_mutex); + + return 0; +nomem: + printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); + printk(KERN_INFO + "swap_cgroup can be disabled by swapaccount=0 boot option\n"); + return -ENOMEM; +} + +void swap_cgroup_swapoff(int type) +{ + struct page **map; + unsigned long i, length; + struct swap_cgroup_ctrl *ctrl; + + if (!do_swap_account) + return; + + mutex_lock(&swap_cgroup_mutex); + ctrl = &swap_cgroup_ctrl[type]; + map = ctrl->map; + length = ctrl->length; + ctrl->map = NULL; + ctrl->length = 0; + mutex_unlock(&swap_cgroup_mutex); + + if (map) { + for (i = 0; i < length; i++) { + struct page *page = map[i]; + if (page) + __free_page(page); + } + vfree(map); + } +} diff --git a/mm/swap_state.c b/mm/swap_state.c index 154444918685..9711342987a0 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -17,7 +17,6 @@ #include <linux/blkdev.h> #include <linux/pagevec.h> #include <linux/migrate.h> -#include <linux/page_cgroup.h> #include <asm/pgtable.h> diff --git a/mm/swapfile.c b/mm/swapfile.c index 8798b2e0ac59..63f55ccb9b26 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -38,7 +38,7 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <linux/swapops.h> -#include <linux/page_cgroup.h> +#include <linux/swap_cgroup.h> static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); diff --git a/mm/truncate.c b/mm/truncate.c index 261eaf6e5a19..f1e4d6052369 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -715,8 +715,9 @@ EXPORT_SYMBOL(truncate_pagecache); * necessary) to @newsize. It will be typically be called from the filesystem's * setattr function when ATTR_SIZE is passed in. * - * Must be called with inode_mutex held and before all filesystem specific - * block truncation has been performed. + * Must be called with a lock serializing truncates and writes (generally + * i_mutex but e.g. xfs uses a different lock) and before all filesystem + * specific block truncation has been performed. 
*/ void truncate_setsize(struct inode *inode, loff_t newsize) { @@ -755,7 +756,6 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) struct page *page; pgoff_t index; - WARN_ON(!mutex_is_locked(&inode->i_mutex)); WARN_ON(to > inode->i_size); if (from >= to || bsize == PAGE_CACHE_SIZE) diff --git a/mm/vmacache.c b/mm/vmacache.c index 9f25af825dec..b6e3662fe339 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -17,6 +17,8 @@ void vmacache_flush_all(struct mm_struct *mm) { struct task_struct *g, *p; + count_vm_vmacache_event(VMACACHE_FULL_FLUSHES); + /* * Single threaded tasks need not iterate the entire * list of process. We can avoid the flushing as well diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 90520af7f186..39c338896416 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -463,8 +463,7 @@ overflow: goto retry; } if (printk_ratelimit()) - printk(KERN_WARNING - "vmap allocation for size %lu failed: " + pr_warn("vmap allocation for size %lu failed: " "use vmalloc=<size> to increase size.\n", size); kfree(va); return ERR_PTR(-EBUSY); @@ -2575,10 +2574,10 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) if (!counters) return; - /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ - smp_rmb(); if (v->flags & VM_UNINITIALIZED) return; + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ + smp_rmb(); memset(counters, 0, nr_node_ids * sizeof(unsigned int)); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index d4042e75f7c7..c5afd573d7da 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -165,6 +165,7 @@ static void vmpressure_work_fn(struct work_struct *work) unsigned long scanned; unsigned long reclaimed; + spin_lock(&vmpr->sr_lock); /* * Several contexts might be calling vmpressure(), so it is * possible that the work was rescheduled again before the old @@ -173,11 +174,12 @@ static void vmpressure_work_fn(struct work_struct *work) * here. No need for any locks here since we don't care if * vmpr->reclaimed is in sync. 
*/ - if (!vmpr->scanned) + scanned = vmpr->scanned; + if (!scanned) { + spin_unlock(&vmpr->sr_lock); return; + } - spin_lock(&vmpr->sr_lock); - scanned = vmpr->scanned; reclaimed = vmpr->reclaimed; vmpr->scanned = 0; vmpr->reclaimed = 0; diff --git a/mm/vmscan.c b/mm/vmscan.c index dcb47074ae03..bd9a72bc4a1b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -229,9 +229,10 @@ EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 -static unsigned long -shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, - unsigned long nr_pages_scanned, unsigned long lru_pages) +static unsigned long shrink_slabs(struct shrink_control *shrinkctl, + struct shrinker *shrinker, + unsigned long nr_scanned, + unsigned long nr_eligible) { unsigned long freed = 0; unsigned long long delta; @@ -255,13 +256,12 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); total_scan = nr; - delta = (4 * nr_pages_scanned) / shrinker->seeks; + delta = (4 * nr_scanned) / shrinker->seeks; delta *= freeable; - do_div(delta, lru_pages + 1); + do_div(delta, nr_eligible + 1); total_scan += delta; if (total_scan < 0) { - printk(KERN_ERR - "shrink_slab: %pF negative objects to delete nr=%ld\n", + pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", shrinker->scan_objects, total_scan); total_scan = freeable; } @@ -290,8 +290,8 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, total_scan = freeable * 2; trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, - nr_pages_scanned, lru_pages, - freeable, delta, total_scan); + nr_scanned, nr_eligible, + freeable, delta, total_scan); /* * Normally, we should not scan less than batch_size objects in one @@ -340,34 +340,37 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, return freed; } -/* - * Call the shrink functions to age shrinkable caches - * - * Here we assume it costs one seek to replace a lru page and that it also - * takes a seek to recreate a cache object. With this in mind we age equal - * percentages of the lru and ageable caches. This should balance the seeks - * generated by these structures. +/** + * shrink_node_slabs - shrink slab caches of a given node + * @gfp_mask: allocation context + * @nid: node whose slab caches to target + * @nr_scanned: pressure numerator + * @nr_eligible: pressure denominator * - * If the vm encountered mapped pages on the LRU it increase the pressure on - * slab to avoid swapping. + * Call the shrink functions to age shrinkable caches. * - * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. + * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, + * unaware shrinkers will receive a node id of 0 instead. * - * `lru_pages' represents the number of on-LRU pages in all the zones which - * are eligible for the caller's allocation attempt. It is used for balancing - * slab reclaim versus page reclaim. + * @nr_scanned and @nr_eligible form a ratio that indicate how much of + * the available objects should be scanned. Page reclaim for example + * passes the number of pages scanned and the number of pages on the + * LRU lists that it considered on @nid, plus a bias in @nr_scanned + * when it encountered mapped pages. The ratio is further biased by + * the ->seeks setting of the shrink function, which indicates the + * cost to recreate an object relative to that of an LRU page. * - * Returns the number of slab objects which we shrunk. 
+ * Returns the number of reclaimed slab objects. */ -unsigned long shrink_slab(struct shrink_control *shrinkctl, - unsigned long nr_pages_scanned, - unsigned long lru_pages) +unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, + unsigned long nr_scanned, + unsigned long nr_eligible) { struct shrinker *shrinker; unsigned long freed = 0; - if (nr_pages_scanned == 0) - nr_pages_scanned = SWAP_CLUSTER_MAX; + if (nr_scanned == 0) + nr_scanned = SWAP_CLUSTER_MAX; if (!down_read_trylock(&shrinker_rwsem)) { /* @@ -381,20 +384,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl, } list_for_each_entry(shrinker, &shrinker_list, list) { - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { - shrinkctl->nid = 0; - freed += shrink_slab_node(shrinkctl, shrinker, - nr_pages_scanned, lru_pages); - continue; - } + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + }; - for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { - if (node_online(shrinkctl->nid)) - freed += shrink_slab_node(shrinkctl, shrinker, - nr_pages_scanned, lru_pages); + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + sc.nid = 0; - } + freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible); } + up_read(&shrinker_rwsem); out: cond_resched(); @@ -875,7 +875,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, * end of the LRU a second time. */ mapping = page_mapping(page); - if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || + if (((dirty || writeback) && mapping && + bdi_write_congested(mapping->backing_dev_info)) || (writeback && PageReclaim(page))) nr_congested++; @@ -1876,7 +1877,8 @@ enum scan_balance { * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ static void get_scan_count(struct lruvec *lruvec, int swappiness, - struct scan_control *sc, unsigned long *nr) + struct scan_control *sc, unsigned long *nr, + unsigned long *lru_pages) { struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; @@ -2022,6 +2024,7 @@ out: some_scanned = false; /* Only use force_scan on second pass. */ for (pass = 0; !some_scanned && pass < 2; pass++) { + *lru_pages = 0; for_each_evictable_lru(lru) { int file = is_file_lru(lru); unsigned long size; @@ -2048,14 +2051,19 @@ out: case SCAN_FILE: case SCAN_ANON: /* Scan one type exclusively */ - if ((scan_balance == SCAN_FILE) != file) + if ((scan_balance == SCAN_FILE) != file) { + size = 0; scan = 0; + } break; default: /* Look ma, no brain */ BUG(); } + + *lru_pages += size; nr[lru] = scan; + /* * Skip the second pass and don't force_scan, * if we found something to scan. @@ -2069,7 +2077,7 @@ out: * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 
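To see where the nr_scanned/nr_eligible ratio documented above ends up, it helps to recall the consumer side of the interface. Below is a minimal, hypothetical shrinker (names invented; the callback signatures follow the existing struct shrinker in include/linux/shrinker.h):

#include <linux/kernel.h>
#include <linux/shrinker.h>

static unsigned long demo_objects = 1024;	/* pretend cache size */

static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
{
	/* freeable objects on sc->nid (we are SHRINKER_NUMA_AWARE) */
	return demo_objects;
}

static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
{
	/*
	 * sc->nr_to_scan was derived from the caller's pressure ratio:
	 * roughly freeable * (4 * nr_scanned / seeks) / (nr_eligible + 1),
	 * handed out in SHRINK_BATCH chunks and deferred when it cannot
	 * all be done at once.
	 */
	unsigned long nr = min_t(unsigned long, sc->nr_to_scan, demo_objects);

	demo_objects -= nr;
	return nr;		/* number of objects actually freed */
}

static struct shrinker demo_shrinker = {
	.count_objects = demo_count,
	.scan_objects  = demo_scan,
	.seeks         = DEFAULT_SEEKS,
	.flags         = SHRINKER_NUMA_AWARE,
};

/* registration, e.g. from module init: register_shrinker(&demo_shrinker); */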
*/ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, - struct scan_control *sc) + struct scan_control *sc, unsigned long *lru_pages) { unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; @@ -2080,7 +2088,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, struct blk_plug plug; bool scan_adjusted; - get_scan_count(lruvec, swappiness, sc, nr); + get_scan_count(lruvec, swappiness, sc, nr, lru_pages); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); @@ -2249,7 +2257,7 @@ static inline bool should_continue_reclaim(struct zone *zone, return true; /* If compaction would go ahead or the allocation would succeed, stop */ - switch (compaction_suitable(zone, sc->order)) { + switch (compaction_suitable(zone, sc->order, 0, 0)) { case COMPACT_PARTIAL: case COMPACT_CONTINUE: return false; @@ -2258,7 +2266,8 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static bool shrink_zone(struct zone *zone, struct scan_control *sc) +static bool shrink_zone(struct zone *zone, struct scan_control *sc, + bool is_classzone) { unsigned long nr_reclaimed, nr_scanned; bool reclaimable = false; @@ -2269,6 +2278,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) .zone = zone, .priority = sc->priority, }; + unsigned long zone_lru_pages = 0; struct mem_cgroup *memcg; nr_reclaimed = sc->nr_reclaimed; @@ -2276,13 +2286,15 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) memcg = mem_cgroup_iter(root, NULL, &reclaim); do { + unsigned long lru_pages; struct lruvec *lruvec; int swappiness; lruvec = mem_cgroup_zone_lruvec(zone, memcg); swappiness = mem_cgroup_swappiness(memcg); - shrink_lruvec(lruvec, swappiness, sc); + shrink_lruvec(lruvec, swappiness, sc, &lru_pages); + zone_lru_pages += lru_pages; /* * Direct reclaim and kswapd have to scan all memory @@ -2302,6 +2314,25 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) memcg = mem_cgroup_iter(root, memcg, &reclaim); } while (memcg); + /* + * Shrink the slab caches in the same proportion that + * the eligible LRU pages were scanned. + */ + if (global_reclaim(sc) && is_classzone) { + struct reclaim_state *reclaim_state; + + shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone), + sc->nr_scanned - nr_scanned, + zone_lru_pages); + + reclaim_state = current->reclaim_state; + if (reclaim_state) { + sc->nr_reclaimed += + reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } + } + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); @@ -2346,7 +2377,7 @@ static inline bool compaction_ready(struct zone *zone, int order) * If compaction is not ready to start and allocation is not likely * to succeed without it, then keep reclaiming. 
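On the caller side, the shrink_zone() hunk above accumulates the eligible LRU pages reported by get_scan_count() for every memcg into zone_lru_pages and then feeds the scanned/eligible pair to the node's slab shrinkers. A rough userspace model of that flow follows; the struct and function names are invented for the example and are not kernel APIs.

#include <stdio.h>

struct memcg_model {
        unsigned long lru_pages;        /* eligible LRU pages, per memcg */
        unsigned long scanned;          /* pages scanned by shrink_lruvec() */
};

/*
 * Model of the new flow: sum per-memcg LRU sizes, sum the pages that
 * were actually scanned, and pass both as the pressure ratio for the
 * node's slab caches (shrink_node_slabs() in the patch).
 */
static void model_shrink_zone(const struct memcg_model *memcgs, int nr)
{
        unsigned long zone_lru_pages = 0, nr_scanned = 0;
        int i;

        for (i = 0; i < nr; i++) {
                zone_lru_pages += memcgs[i].lru_pages;
                nr_scanned += memcgs[i].scanned;
        }

        printf("slab pressure: %lu scanned / %lu eligible\n",
               nr_scanned, zone_lru_pages);
}

int main(void)
{
        struct memcg_model m[] = { { 80000, 500 }, { 20000, 500 } };

        model_shrink_zone(m, 2);
        return 0;
}

As the hunk shows, this only happens for global reclaim and only for the classzone, so slab caches are shrunk once per node rather than once per zone.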
*/ - if (compaction_suitable(zone, order) == COMPACT_SKIPPED) + if (compaction_suitable(zone, order, 0, 0) == COMPACT_SKIPPED) return false; return watermark_ok; @@ -2376,12 +2407,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; - unsigned long lru_pages = 0; - struct reclaim_state *reclaim_state = current->reclaim_state; gfp_t orig_mask; - struct shrink_control shrink = { - .gfp_mask = sc->gfp_mask, - }; enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); bool reclaimable = false; @@ -2394,23 +2420,27 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (buffer_heads_over_limit) sc->gfp_mask |= __GFP_HIGHMEM; - nodes_clear(shrink.nodes_to_scan); - for_each_zone_zonelist_nodemask(zone, z, zonelist, - gfp_zone(sc->gfp_mask), sc->nodemask) { + requested_highidx, sc->nodemask) { + enum zone_type classzone_idx; + if (!populated_zone(zone)) continue; + + classzone_idx = requested_highidx; + while (!populated_zone(zone->zone_pgdat->node_zones + + classzone_idx)) + classzone_idx--; + /* * Take care memory controller reclaiming has small influence * to global LRU. */ if (global_reclaim(sc)) { - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + if (!cpuset_zone_allowed(zone, + GFP_KERNEL | __GFP_HARDWALL)) continue; - lru_pages += zone_reclaimable_pages(zone); - node_set(zone_to_nid(zone), shrink.nodes_to_scan); - if (sc->priority != DEF_PRIORITY && !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ @@ -2449,7 +2479,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) /* need some check for avoid more shrink_zone() */ } - if (shrink_zone(zone, sc)) + if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx)) reclaimable = true; if (global_reclaim(sc) && @@ -2458,20 +2488,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) } /* - * Don't shrink slabs when reclaiming memory from over limit cgroups - * but do shrink slab at least once when aborting reclaim for - * compaction to avoid unevenly scanning file/anon LRU pages over slab - * pages. - */ - if (global_reclaim(sc)) { - shrink_slab(&shrink, sc->nr_scanned, lru_pages); - if (reclaim_state) { - sc->nr_reclaimed += reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } - } - - /* * Restore to original mask to avoid the impact on the caller if we * promoted it to __GFP_HIGHMEM. */ @@ -2735,6 +2751,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, }; struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); int swappiness = mem_cgroup_swappiness(memcg); + unsigned long lru_pages; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2750,7 +2767,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. 
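The classzone_idx computed in the loop above is simply the highest populated zone index at or below the one the allocation requested, and shrink_zone() is told whether the zone at hand is that classzone. A minimal model of the walk follows; the array stands in for pgdat->node_zones, the zone names are only an example, and it assumes, as the kernel loop does, that at least one zone at or below the request is populated.

#include <stdio.h>

/* mirrors: while (!populated_zone(node_zones + classzone_idx)) classzone_idx--; */
static int model_classzone_idx(const int *populated, int requested_highidx)
{
        int idx = requested_highidx;

        while (!populated[idx])
                idx--;
        return idx;
}

int main(void)
{
        /* e.g. DMA, DMA32 and Normal populated, no Movable zone on this node */
        int populated[] = { 1, 1, 1, 0 };

        printf("classzone_idx = %d\n", model_classzone_idx(populated, 3)); /* 2 */
        return 0;
}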
*/ - shrink_lruvec(lruvec, swappiness, &sc); + shrink_lruvec(lruvec, swappiness, &sc, &lru_pages); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2824,8 +2841,8 @@ static bool zone_balanced(struct zone *zone, int order, balance_gap, classzone_idx, 0)) return false; - if (IS_ENABLED(CONFIG_COMPACTION) && order && - compaction_suitable(zone, order) == COMPACT_SKIPPED) + if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, + order, 0, classzone_idx) == COMPACT_SKIPPED) return false; return true; @@ -2931,15 +2948,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, static bool kswapd_shrink_zone(struct zone *zone, int classzone_idx, struct scan_control *sc, - unsigned long lru_pages, unsigned long *nr_attempted) { int testorder = sc->order; unsigned long balance_gap; - struct reclaim_state *reclaim_state = current->reclaim_state; - struct shrink_control shrink = { - .gfp_mask = sc->gfp_mask, - }; bool lowmem_pressure; /* Reclaim above the high watermark. */ @@ -2952,8 +2964,8 @@ static bool kswapd_shrink_zone(struct zone *zone, * from memory. Do not reclaim more than needed for compaction. */ if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && - compaction_suitable(zone, sc->order) != - COMPACT_SKIPPED) + compaction_suitable(zone, sc->order, 0, classzone_idx) + != COMPACT_SKIPPED) testorder = 0; /* @@ -2974,13 +2986,7 @@ static bool kswapd_shrink_zone(struct zone *zone, balance_gap, classzone_idx)) return true; - shrink_zone(zone, sc); - nodes_clear(shrink.nodes_to_scan); - node_set(zone_to_nid(zone), shrink.nodes_to_scan); - - reclaim_state->reclaimed_slab = 0; - shrink_slab(&shrink, sc->nr_scanned, lru_pages); - sc->nr_reclaimed += reclaim_state->reclaimed_slab; + shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); /* Account for the number of pages attempted to reclaim */ *nr_attempted += sc->nr_to_reclaim; @@ -3041,7 +3047,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, count_vm_event(PAGEOUTRUN); do { - unsigned long lru_pages = 0; unsigned long nr_attempted = 0; bool raise_priority = true; bool pgdat_needs_compaction = (order > 0); @@ -3101,8 +3106,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (!populated_zone(zone)) continue; - lru_pages += zone_reclaimable_pages(zone); - /* * If any zone is currently balanced then kswapd will * not call compaction as it is expected that the @@ -3158,8 +3161,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * that that high watermark would be met at 100% * efficiency. */ - if (kswapd_shrink_zone(zone, end_zone, &sc, - lru_pages, &nr_attempted)) + if (kswapd_shrink_zone(zone, end_zone, + &sc, &nr_attempted)) raise_priority = false; } @@ -3388,7 +3391,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!populated_zone(zone)) return; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) return; pgdat = zone->zone_pgdat; if (pgdat->kswapd_max_order < order) { @@ -3611,10 +3614,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), .may_swap = 1, }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; - unsigned long nr_slab_pages0, nr_slab_pages1; cond_resched(); /* @@ -3633,44 +3632,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * priorities until we have enough memory freed. 
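kswapd_shrink_zone() above keeps its existing behaviour of reclaiming at order 0 once compaction is expected to assemble the requested order; only the compaction_suitable() call now also takes alloc_flags and the classzone index. A compact sketch of that decision follows; the enum and values are stand-ins, not the kernel's compaction return codes.

#include <stdio.h>
#include <stdbool.h>

enum { MODEL_COMPACT_SKIPPED, MODEL_COMPACT_CONTINUE, MODEL_COMPACT_PARTIAL };

/*
 * Model of the testorder logic: when compaction is enabled and not
 * skipped for the requested order, kswapd reclaims single pages and
 * leaves high-order assembly to compaction.
 */
static int model_testorder(int order, bool compaction_enabled, int suitable)
{
        int testorder = order;

        if (compaction_enabled && order && suitable != MODEL_COMPACT_SKIPPED)
                testorder = 0;
        return testorder;
}

int main(void)
{
        printf("%d\n", model_testorder(9, true, MODEL_COMPACT_CONTINUE)); /* 0 */
        printf("%d\n", model_testorder(9, true, MODEL_COMPACT_SKIPPED));  /* 9 */
        return 0;
}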
*/ do { - shrink_zone(zone, &sc); + shrink_zone(zone, &sc, true); } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } - nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); - if (nr_slab_pages0 > zone->min_slab_pages) { - /* - * shrink_slab() does not currently allow us to determine how - * many pages were freed in this zone. So we take the current - * number of slab pages and shake the slab until it is reduced - * by the same nr_pages that we used for reclaiming unmapped - * pages. - */ - nodes_clear(shrink.nodes_to_scan); - node_set(zone_to_nid(zone), shrink.nodes_to_scan); - for (;;) { - unsigned long lru_pages = zone_reclaimable_pages(zone); - - /* No reclaimable slab or very low memory pressure */ - if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) - break; - - /* Freed enough memory */ - nr_slab_pages1 = zone_page_state(zone, - NR_SLAB_RECLAIMABLE); - if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) - break; - } - - /* - * Update nr_reclaimed by the number of slab pages we - * reclaimed from this zone. - */ - nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); - if (nr_slab_pages1 < nr_slab_pages0) - sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; - } - p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); lockdep_clear_current_reclaim_state(); diff --git a/mm/vmstat.c b/mm/vmstat.c index 1b12d390dc68..1284f89fca08 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -22,6 +22,8 @@ #include <linux/writeback.h> #include <linux/compaction.h> #include <linux/mm_inline.h> +#include <linux/page_ext.h> +#include <linux/page_owner.h> #include "internal.h" @@ -898,6 +900,7 @@ const char * const vmstat_text[] = { #ifdef CONFIG_DEBUG_VM_VMACACHE "vmacache_find_calls", "vmacache_find_hits", + "vmacache_full_flushes", #endif #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; @@ -1017,6 +1020,104 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) return 0; } +#ifdef CONFIG_PAGE_OWNER +static void pagetypeinfo_showmixedcount_print(struct seq_file *m, + pg_data_t *pgdat, + struct zone *zone) +{ + struct page *page; + struct page_ext *page_ext; + unsigned long pfn = zone->zone_start_pfn, block_end_pfn; + unsigned long end_pfn = pfn + zone->spanned_pages; + unsigned long count[MIGRATE_TYPES] = { 0, }; + int pageblock_mt, page_mt; + int i; + + /* Scan block by block. First and last block may be incomplete */ + pfn = zone->zone_start_pfn; + + /* + * Walk the zone in pageblock_nr_pages steps. If a page block spans + * a zone boundary, it will be double counted between zones. 
This does + * not matter as the mixed block count will still be correct + */ + for (; pfn < end_pfn; ) { + if (!pfn_valid(pfn)) { + pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + continue; + } + + block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = min(block_end_pfn, end_pfn); + + page = pfn_to_page(pfn); + pageblock_mt = get_pfnblock_migratetype(page, pfn); + + for (; pfn < block_end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) + continue; + + page = pfn_to_page(pfn); + if (PageBuddy(page)) { + pfn += (1UL << page_order(page)) - 1; + continue; + } + + if (PageReserved(page)) + continue; + + page_ext = lookup_page_ext(page); + + if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) + continue; + + page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); + if (pageblock_mt != page_mt) { + if (is_migrate_cma(pageblock_mt)) + count[MIGRATE_MOVABLE]++; + else + count[pageblock_mt]++; + + pfn = block_end_pfn; + break; + } + pfn += (1UL << page_ext->order) - 1; + } + } + + /* Print counts */ + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (i = 0; i < MIGRATE_TYPES; i++) + seq_printf(m, "%12lu ", count[i]); + seq_putc(m, '\n'); +} +#endif /* CONFIG_PAGE_OWNER */ + +/* + * Print out the number of pageblocks for each migratetype that contain pages + * of other types. This gives an indication of how well fallbacks are being + * contained by rmqueue_fallback(). It requires information from PAGE_OWNER + * to determine what is going on + */ +static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) +{ +#ifdef CONFIG_PAGE_OWNER + int mtype; + + if (!page_owner_inited) + return; + + drain_all_pages(NULL); + + seq_printf(m, "\n%-23s", "Number of mixed blocks "); + for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) + seq_printf(m, "%12s ", migratetype_names[mtype]); + seq_putc(m, '\n'); + + walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print); +#endif /* CONFIG_PAGE_OWNER */ +} + /* * This prints out statistics in relation to grouping pages by mobility. * It is expensive to collect so do not constantly read the file. @@ -1034,6 +1135,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg) seq_putc(m, '\n'); pagetypeinfo_showfree(m, pgdat); pagetypeinfo_showblockcount(m, pgdat); + pagetypeinfo_showmixedcount(m, pgdat); return 0; } diff --git a/mm/zbud.c b/mm/zbud.c index ecf1dbef6983..4e387bea702e 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -132,7 +132,7 @@ static struct zbud_ops zbud_zpool_ops = { static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) { - return zbud_create_pool(gfp, &zbud_zpool_ops); + return zbud_create_pool(gfp, zpool_ops ? 
&zbud_zpool_ops : NULL); } static void zbud_zpool_destroy(void *pool) @@ -619,5 +619,5 @@ module_init(init_zbud); module_exit(exit_zbud); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>"); +MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>"); MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 839a48c3ca27..b72403927aa4 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -155,8 +155,6 @@ * (reason above) */ #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) -#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \ - ZS_SIZE_CLASS_DELTA + 1) /* * We do not maintain any list for completely empty or full pages @@ -171,6 +169,11 @@ enum fullness_group { }; /* + * number of size_classes + */ +static int zs_size_classes; + +/* * We assign a page to ZS_ALMOST_EMPTY fullness group when: * n <= N / f, where * n = number of allocated objects @@ -214,7 +217,7 @@ struct link_free { }; struct zs_pool { - struct size_class size_class[ZS_SIZE_CLASSES]; + struct size_class **size_class; gfp_t flags; /* allocation flags used when growing pool */ atomic_long_t pages_allocated; @@ -468,7 +471,7 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool, if (newfg == currfg) goto out; - class = &pool->size_class[class_idx]; + class = pool->size_class[class_idx]; remove_zspage(page, class, currfg); insert_zspage(page, class, newfg); set_zspage_mapping(page, class_idx, newfg); @@ -629,6 +632,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) struct page *next_page; struct link_free *link; unsigned int i = 1; + void *vaddr; /* * page->index stores offset of first object starting @@ -639,8 +643,8 @@ static void init_zspage(struct page *first_page, struct size_class *class) if (page != first_page) page->index = off; - link = (struct link_free *)kmap_atomic(page) + - off / sizeof(*link); + vaddr = kmap_atomic(page); + link = (struct link_free *)vaddr + off / sizeof(*link); while ((off += class->size) < PAGE_SIZE) { link->next = obj_location_to_handle(page, i++); @@ -654,7 +658,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) */ next_page = get_next_page(page); link->next = obj_location_to_handle(next_page, 0); - kunmap_atomic(link); + kunmap_atomic(vaddr); page = next_page; off %= PAGE_SIZE; } @@ -784,7 +788,7 @@ static inline int __zs_cpu_up(struct mapping_area *area) */ if (area->vm_buf) return 0; - area->vm_buf = (char *)__get_free_page(GFP_KERNEL); + area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL); if (!area->vm_buf) return -ENOMEM; return 0; @@ -792,8 +796,7 @@ static inline int __zs_cpu_up(struct mapping_area *area) static inline void __zs_cpu_down(struct mapping_area *area) { - if (area->vm_buf) - free_page((unsigned long)area->vm_buf); + kfree(area->vm_buf); area->vm_buf = NULL; } @@ -881,13 +884,26 @@ static struct notifier_block zs_cpu_nb = { .notifier_call = zs_cpu_notifier }; -static void zs_exit(void) +static int zs_register_cpu_notifier(void) { - int cpu; + int cpu, uninitialized_var(ret); -#ifdef CONFIG_ZPOOL - zpool_unregister_driver(&zs_zpool_driver); -#endif + cpu_notifier_register_begin(); + + __register_cpu_notifier(&zs_cpu_nb); + for_each_online_cpu(cpu) { + ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); + if (notifier_to_errno(ret)) + break; + } + + cpu_notifier_register_done(); + return notifier_to_errno(ret); +} + +static void zs_unregister_cpu_notifier(void) +{ + int cpu; 
cpu_notifier_register_begin(); @@ -898,93 +914,129 @@ static void zs_exit(void) cpu_notifier_register_done(); } -static int zs_init(void) +static void init_zs_size_classes(void) { - int cpu, ret; + int nr; - cpu_notifier_register_begin(); + nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1; + if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA) + nr += 1; - __register_cpu_notifier(&zs_cpu_nb); - for_each_online_cpu(cpu) { - ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - if (notifier_to_errno(ret)) { - cpu_notifier_register_done(); - goto fail; - } - } + zs_size_classes = nr; +} - cpu_notifier_register_done(); +static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) +{ + return pages_per_zspage * PAGE_SIZE / size; +} -#ifdef CONFIG_ZPOOL - zpool_register_driver(&zs_zpool_driver); -#endif +static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) +{ + if (prev->pages_per_zspage != pages_per_zspage) + return false; - return 0; -fail: - zs_exit(); - return notifier_to_errno(ret); + if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage) + != get_maxobj_per_zspage(size, pages_per_zspage)) + return false; + + return true; } +unsigned long zs_get_total_pages(struct zs_pool *pool) +{ + return atomic_long_read(&pool->pages_allocated); +} +EXPORT_SYMBOL_GPL(zs_get_total_pages); + /** - * zs_create_pool - Creates an allocation pool to work from. - * @flags: allocation flags used to allocate pool metadata + * zs_map_object - get address of allocated object from handle. + * @pool: pool from which the object was allocated + * @handle: handle returned from zs_malloc * - * This function must be called before anything when using - * the zsmalloc allocator. + * Before using an object allocated from zs_malloc, it must be mapped using + * this function. When done with the object, it must be unmapped using + * zs_unmap_object. * - * On success, a pointer to the newly created pool is returned, - * otherwise NULL. + * Only one object can be mapped per cpu at a time. There is no protection + * against nested mappings. + * + * This function returns with preemption and page faults disabled. */ -struct zs_pool *zs_create_pool(gfp_t flags) +void *zs_map_object(struct zs_pool *pool, unsigned long handle, + enum zs_mapmode mm) { - int i, ovhd_size; - struct zs_pool *pool; + struct page *page; + unsigned long obj_idx, off; - ovhd_size = roundup(sizeof(*pool), PAGE_SIZE); - pool = kzalloc(ovhd_size, GFP_KERNEL); - if (!pool) - return NULL; + unsigned int class_idx; + enum fullness_group fg; + struct size_class *class; + struct mapping_area *area; + struct page *pages[2]; - for (i = 0; i < ZS_SIZE_CLASSES; i++) { - int size; - struct size_class *class; + BUG_ON(!handle); - size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; - if (size > ZS_MAX_ALLOC_SIZE) - size = ZS_MAX_ALLOC_SIZE; + /* + * Because we use per-cpu mapping areas shared among the + * pools/users, we can't allow mapping in interrupt context + * because it can corrupt another users mappings. 
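init_zs_size_classes() above turns the old compile-time ZS_SIZE_CLASSES constant into a runtime count: a ceiling division of the allocation-size range by ZS_SIZE_CLASS_DELTA. A quick model of that computation follows; the 32-byte minimum, 4096-byte maximum and 16-byte delta correspond to a 4 KiB page and are assumptions for the example.

#include <stdio.h>

/* same shape as init_zs_size_classes(): range / delta, rounded up */
static int model_nr_size_classes(int min_size, int max_size, int delta)
{
        int nr = (max_size - min_size) / delta + 1;

        if ((max_size - min_size) % delta)
                nr += 1;
        return nr;
}

int main(void)
{
        printf("%d\n", model_nr_size_classes(32, 4096, 16)); /* 255 */
        return 0;
}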
+ */ + BUG_ON(in_interrupt()); - class = &pool->size_class[i]; - class->size = size; - class->index = i; - spin_lock_init(&class->lock); - class->pages_per_zspage = get_pages_per_zspage(size); + obj_handle_to_location(handle, &page, &obj_idx); + get_zspage_mapping(get_first_page(page), &class_idx, &fg); + class = pool->size_class[class_idx]; + off = obj_idx_to_offset(page, obj_idx, class->size); + area = &get_cpu_var(zs_map_area); + area->vm_mm = mm; + if (off + class->size <= PAGE_SIZE) { + /* this object is contained entirely within a page */ + area->vm_addr = kmap_atomic(page); + return area->vm_addr + off; } - pool->flags = flags; + /* this object spans two pages */ + pages[0] = page; + pages[1] = get_next_page(page); + BUG_ON(!pages[1]); - return pool; + return __zs_map_object(area, pages, off, class->size); } -EXPORT_SYMBOL_GPL(zs_create_pool); +EXPORT_SYMBOL_GPL(zs_map_object); -void zs_destroy_pool(struct zs_pool *pool) +void zs_unmap_object(struct zs_pool *pool, unsigned long handle) { - int i; + struct page *page; + unsigned long obj_idx, off; - for (i = 0; i < ZS_SIZE_CLASSES; i++) { - int fg; - struct size_class *class = &pool->size_class[i]; + unsigned int class_idx; + enum fullness_group fg; + struct size_class *class; + struct mapping_area *area; - for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { - if (class->fullness_list[fg]) { - pr_info("Freeing non-empty class with size %db, fullness group %d\n", - class->size, fg); - } - } + BUG_ON(!handle); + + obj_handle_to_location(handle, &page, &obj_idx); + get_zspage_mapping(get_first_page(page), &class_idx, &fg); + class = pool->size_class[class_idx]; + off = obj_idx_to_offset(page, obj_idx, class->size); + + area = this_cpu_ptr(&zs_map_area); + if (off + class->size <= PAGE_SIZE) + kunmap_atomic(area->vm_addr); + else { + struct page *pages[2]; + + pages[0] = page; + pages[1] = get_next_page(page); + BUG_ON(!pages[1]); + + __zs_unmap_object(area, pages, off, class->size); } - kfree(pool); + put_cpu_var(zs_map_area); } -EXPORT_SYMBOL_GPL(zs_destroy_pool); +EXPORT_SYMBOL_GPL(zs_unmap_object); /** * zs_malloc - Allocate block of given size from pool. 
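zs_map_object() and zs_unmap_object() above branch on whether the object fits inside the page it starts in: if the offset plus the class size stays within PAGE_SIZE a single kmap_atomic() is enough, otherwise the object spills into the next page and goes through the per-cpu mapping area. A small model of that check follows; the 4 KiB page size and the 208-byte class are assumptions for the example.

#include <stdio.h>
#include <stdbool.h>

#define MODEL_PAGE_SIZE 4096UL

/* true when an object starting at byte 'off' of a page crosses into the next page */
static bool object_spans_two_pages(unsigned long off, unsigned long class_size)
{
        return off + class_size > MODEL_PAGE_SIZE;
}

int main(void)
{
        printf("%d\n", object_spans_two_pages(4000, 208)); /* 1: 4208 > 4096 */
        printf("%d\n", object_spans_two_pages(3888, 208)); /* 0: ends exactly at 4096 */
        return 0;
}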
@@ -999,8 +1051,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) { unsigned long obj; struct link_free *link; - int class_idx; struct size_class *class; + void *vaddr; struct page *first_page, *m_page; unsigned long m_objidx, m_offset; @@ -1008,9 +1060,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) return 0; - class_idx = get_size_class_index(size); - class = &pool->size_class[class_idx]; - BUG_ON(class_idx != class->index); + class = pool->size_class[get_size_class_index(size)]; spin_lock(&class->lock); first_page = find_get_zspage(class); @@ -1031,11 +1081,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) obj_handle_to_location(obj, &m_page, &m_objidx); m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); - link = (struct link_free *)kmap_atomic(m_page) + - m_offset / sizeof(*link); + vaddr = kmap_atomic(m_page); + link = (struct link_free *)vaddr + m_offset / sizeof(*link); first_page->freelist = link->next; memset(link, POISON_INUSE, sizeof(*link)); - kunmap_atomic(link); + kunmap_atomic(vaddr); first_page->inuse++; /* Now move the zspage to another fullness group, if required */ @@ -1051,6 +1101,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj) struct link_free *link; struct page *first_page, *f_page; unsigned long f_objidx, f_offset; + void *vaddr; int class_idx; struct size_class *class; @@ -1063,16 +1114,16 @@ void zs_free(struct zs_pool *pool, unsigned long obj) first_page = get_first_page(f_page); get_zspage_mapping(first_page, &class_idx, &fullness); - class = &pool->size_class[class_idx]; + class = pool->size_class[class_idx]; f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); spin_lock(&class->lock); /* Insert this object in containing zspage's freelist */ - link = (struct link_free *)((unsigned char *)kmap_atomic(f_page) - + f_offset); + vaddr = kmap_atomic(f_page); + link = (struct link_free *)(vaddr + f_offset); link->next = first_page->freelist; - kunmap_atomic(link); + kunmap_atomic(vaddr); first_page->freelist = (void *)obj; first_page->inuse--; @@ -1088,100 +1139,137 @@ void zs_free(struct zs_pool *pool, unsigned long obj) EXPORT_SYMBOL_GPL(zs_free); /** - * zs_map_object - get address of allocated object from handle. - * @pool: pool from which the object was allocated - * @handle: handle returned from zs_malloc - * - * Before using an object allocated from zs_malloc, it must be mapped using - * this function. When done with the object, it must be unmapped using - * zs_unmap_object. + * zs_create_pool - Creates an allocation pool to work from. + * @flags: allocation flags used to allocate pool metadata * - * Only one object can be mapped per cpu at a time. There is no protection - * against nested mappings. + * This function must be called before anything when using + * the zsmalloc allocator. * - * This function returns with preemption and page faults disabled. + * On success, a pointer to the newly created pool is returned, + * otherwise NULL. 
*/ -void *zs_map_object(struct zs_pool *pool, unsigned long handle, - enum zs_mapmode mm) +struct zs_pool *zs_create_pool(gfp_t flags) { - struct page *page; - unsigned long obj_idx, off; + int i; + struct zs_pool *pool; + struct size_class *prev_class = NULL; - unsigned int class_idx; - enum fullness_group fg; - struct size_class *class; - struct mapping_area *area; - struct page *pages[2]; + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; - BUG_ON(!handle); + pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), + GFP_KERNEL); + if (!pool->size_class) { + kfree(pool); + return NULL; + } /* - * Because we use per-cpu mapping areas shared among the - * pools/users, we can't allow mapping in interrupt context - * because it can corrupt another users mappings. + * Iterate reversly, because, size of size_class that we want to use + * for merging should be larger or equal to current size. */ - BUG_ON(in_interrupt()); + for (i = zs_size_classes - 1; i >= 0; i--) { + int size; + int pages_per_zspage; + struct size_class *class; - obj_handle_to_location(handle, &page, &obj_idx); - get_zspage_mapping(get_first_page(page), &class_idx, &fg); - class = &pool->size_class[class_idx]; - off = obj_idx_to_offset(page, obj_idx, class->size); + size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; + if (size > ZS_MAX_ALLOC_SIZE) + size = ZS_MAX_ALLOC_SIZE; + pages_per_zspage = get_pages_per_zspage(size); - area = &get_cpu_var(zs_map_area); - area->vm_mm = mm; - if (off + class->size <= PAGE_SIZE) { - /* this object is contained entirely within a page */ - area->vm_addr = kmap_atomic(page); - return area->vm_addr + off; + /* + * size_class is used for normal zsmalloc operation such + * as alloc/free for that size. Although it is natural that we + * have one size_class for each size, there is a chance that we + * can get more memory utilization if we use one size_class for + * many different sizes whose size_class have same + * characteristics. So, we makes size_class point to + * previous size_class if possible. 
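The merge rule described above is what can_merge() and get_maxobj_per_zspage() implement: a size class can reuse the previously created, larger class when both need the same number of pages per zspage and that zspage holds the same number of objects either way. A worked model follows; the 4 KiB page, the single-page zspages and the sizes below are illustrative assumptions, and the real can_merge() additionally compares pages_per_zspage.

#include <stdio.h>
#include <stdbool.h>

#define MODEL_PAGE_SIZE 4096

/* same computation as get_maxobj_per_zspage() in the patch */
static int maxobj_per_zspage(int size, int pages_per_zspage)
{
        return pages_per_zspage * MODEL_PAGE_SIZE / size;
}

/* identical object count per zspage => the smaller class can share the larger one */
static bool classes_mergeable(int prev_size, int size, int pages_per_zspage)
{
        return maxobj_per_zspage(prev_size, pages_per_zspage) ==
               maxobj_per_zspage(size, pages_per_zspage);
}

int main(void)
{
        /* 1056- and 1040-byte classes both hold 3 objects per page: mergeable */
        printf("%d\n", classes_mergeable(1056, 1040, 1)); /* 1 */
        /* a 688-byte class holds 5 objects: it needs its own size_class */
        printf("%d\n", classes_mergeable(1056, 688, 1));  /* 0 */
        return 0;
}

Because zs_create_pool() walks the classes from largest to smallest, prev_class is always the larger of the pair, matching the comment above.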
+ */ + if (prev_class) { + if (can_merge(prev_class, size, pages_per_zspage)) { + pool->size_class[i] = prev_class; + continue; + } + } + + class = kzalloc(sizeof(struct size_class), GFP_KERNEL); + if (!class) + goto err; + + class->size = size; + class->index = i; + class->pages_per_zspage = pages_per_zspage; + spin_lock_init(&class->lock); + pool->size_class[i] = class; + + prev_class = class; } - /* this object spans two pages */ - pages[0] = page; - pages[1] = get_next_page(page); - BUG_ON(!pages[1]); + pool->flags = flags; - return __zs_map_object(area, pages, off, class->size); + return pool; + +err: + zs_destroy_pool(pool); + return NULL; } -EXPORT_SYMBOL_GPL(zs_map_object); +EXPORT_SYMBOL_GPL(zs_create_pool); -void zs_unmap_object(struct zs_pool *pool, unsigned long handle) +void zs_destroy_pool(struct zs_pool *pool) { - struct page *page; - unsigned long obj_idx, off; + int i; - unsigned int class_idx; - enum fullness_group fg; - struct size_class *class; - struct mapping_area *area; + for (i = 0; i < zs_size_classes; i++) { + int fg; + struct size_class *class = pool->size_class[i]; - BUG_ON(!handle); + if (!class) + continue; - obj_handle_to_location(handle, &page, &obj_idx); - get_zspage_mapping(get_first_page(page), &class_idx, &fg); - class = &pool->size_class[class_idx]; - off = obj_idx_to_offset(page, obj_idx, class->size); + if (class->index != i) + continue; - area = this_cpu_ptr(&zs_map_area); - if (off + class->size <= PAGE_SIZE) - kunmap_atomic(area->vm_addr); - else { - struct page *pages[2]; + for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { + if (class->fullness_list[fg]) { + pr_info("Freeing non-empty class with size %db, fullness group %d\n", + class->size, fg); + } + } + kfree(class); + } - pages[0] = page; - pages[1] = get_next_page(page); - BUG_ON(!pages[1]); + kfree(pool->size_class); + kfree(pool); +} +EXPORT_SYMBOL_GPL(zs_destroy_pool); - __zs_unmap_object(area, pages, off, class->size); +static int __init zs_init(void) +{ + int ret = zs_register_cpu_notifier(); + + if (ret) { + zs_unregister_cpu_notifier(); + return ret; } - put_cpu_var(zs_map_area); + + init_zs_size_classes(); + +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zs_zpool_driver); +#endif + return 0; } -EXPORT_SYMBOL_GPL(zs_unmap_object); -unsigned long zs_get_total_pages(struct zs_pool *pool) +static void __exit zs_exit(void) { - return atomic_long_read(&pool->pages_allocated); +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zs_zpool_driver); +#endif + zs_unregister_cpu_notifier(); } -EXPORT_SYMBOL_GPL(zs_get_total_pages); module_init(zs_init); module_exit(zs_exit); diff --git a/mm/zswap.c b/mm/zswap.c index ea064c1a09ba..0cfce9bc51e4 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -149,11 +149,10 @@ static int __init zswap_comp_init(void) return 0; } -static void zswap_comp_exit(void) +static void __init zswap_comp_exit(void) { /* free percpu transforms */ - if (zswap_comp_pcpu_tfms) - free_percpu(zswap_comp_pcpu_tfms); + free_percpu(zswap_comp_pcpu_tfms); } /********************************* @@ -206,7 +205,7 @@ static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; **********************************/ static struct kmem_cache *zswap_entry_cache; -static int zswap_entry_cache_create(void) +static int __init zswap_entry_cache_create(void) { zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); return zswap_entry_cache == NULL; @@ -389,7 +388,7 @@ static struct notifier_block zswap_cpu_notifier_block = { .notifier_call = zswap_cpu_notifier }; -static int zswap_cpu_init(void) +static int __init 
zswap_cpu_init(void) { unsigned long cpu; @@ -951,5 +950,5 @@ error: late_initcall(init_zswap); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>"); +MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>"); MODULE_DESCRIPTION("Compressed cache for swap pages");