Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 328
1 file changed, 250 insertions(+), 78 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7990ca154d1..d2186ecb36f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/memcontrol.h>
 #include <linux/prefetch.h>
+#include <linux/page-debug-flags.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -96,6 +97,14 @@ EXPORT_SYMBOL(node_states);
 
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
+/*
+ * When calculating the number of globally allowed dirty pages, there
+ * is a certain number of per-zone reserves that should not be
+ * considered dirtyable memory. This is the sum of those reserves
+ * over all existing zones that contribute dirtyable memory.
+ */
+unsigned long dirty_balance_reserve __read_mostly;
+
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
@@ -127,6 +136,13 @@
 	saved_gfp_mask = gfp_allowed_mask;
 	gfp_allowed_mask &= ~GFP_IOFS;
 }
+
+bool pm_suspended_storage(void)
+{
+	if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+		return false;
+	return true;
+}
 #endif /* CONFIG_PM_SLEEP */
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -381,6 +397,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
 		clear_highpage(page + i);
 }
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
+unsigned int _debug_guardpage_minorder;
+
+static int __init debug_guardpage_minorder_setup(char *buf)
+{
+	unsigned long res;
+
+	if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
+		printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+		return 0;
+	}
+	_debug_guardpage_minorder = res;
+	printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+	return 0;
+}
+__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+
+static inline void set_page_guard_flag(struct page *page)
+{
+	__set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+
+static inline void clear_page_guard_flag(struct page *page)
+{
+	__clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+#else
+static inline void set_page_guard_flag(struct page *page) { }
+static inline void clear_page_guard_flag(struct page *page) { }
+#endif
+
 static inline void set_page_order(struct page *page, int order)
 {
 	set_page_private(page, order);
@@ -438,6 +485,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
 	if (page_zone_id(page) != page_zone_id(buddy))
 		return 0;
 
+	if (page_is_guard(buddy) && page_order(buddy) == order) {
+		VM_BUG_ON(page_count(buddy) != 0);
+		return 1;
+	}
+
 	if (PageBuddy(buddy) && page_order(buddy) == order) {
 		VM_BUG_ON(page_count(buddy) != 0);
 		return 1;
@@ -494,11 +546,19 @@ static inline void __free_one_page(struct page *page,
 		buddy = page + (buddy_idx - page_idx);
 		if (!page_is_buddy(page, buddy, order))
 			break;
-
-		/* Our buddy is free, merge with it and move up one order. */
-		list_del(&buddy->lru);
-		zone->free_area[order].nr_free--;
-		rmv_page_order(buddy);
+		/*
+		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
+		 * merge with it and move up one order.
+		 */
+		if (page_is_guard(buddy)) {
+			clear_page_guard_flag(buddy);
+			set_page_private(page, 0);
+			__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+		} else {
+			list_del(&buddy->lru);
+			zone->free_area[order].nr_free--;
+			rmv_page_order(buddy);
+		}
 		combined_idx = buddy_idx & page_idx;
 		page = page + (combined_idx - page_idx);
 		page_idx = combined_idx;
@@ -632,7 +692,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
 	int i;
 	int bad = 0;
 
-	trace_mm_page_free_direct(page, order);
+	trace_mm_page_free(page, order);
 	kmemcheck_free_shadow(page, order);
 
 	if (PageAnon(page))
@@ -670,32 +730,23 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	local_irq_restore(flags);
 }
 
-/*
- * permit the bootmem allocator to evade page validation on high-order frees
- */
 void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
 {
-	if (order == 0) {
-		__ClearPageReserved(page);
-		set_page_count(page, 0);
-		set_page_refcounted(page);
-		__free_page(page);
-	} else {
-		int loop;
+	unsigned int nr_pages = 1 << order;
+	unsigned int loop;
 
-		prefetchw(page);
-		for (loop = 0; loop < (1 << order); loop++) {
-			struct page *p = &page[loop];
+	prefetchw(page);
+	for (loop = 0; loop < nr_pages; loop++) {
+		struct page *p = &page[loop];
 
-			if (loop + 1 < (1 << order))
-				prefetchw(p + 1);
-			__ClearPageReserved(p);
-			set_page_count(p, 0);
-		}
-
-		set_page_refcounted(page);
-		__free_pages(page, order);
+		if (loop + 1 < nr_pages)
+			prefetchw(p + 1);
+		__ClearPageReserved(p);
+		set_page_count(p, 0);
 	}
+
+	set_page_refcounted(page);
+	__free_pages(page, order);
 }
 
 
@@ -724,6 +775,23 @@ static inline void expand(struct zone *zone, struct page *page,
 		high--;
 		size >>= 1;
 		VM_BUG_ON(bad_range(zone, &page[size]));
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+		if (high < debug_guardpage_minorder()) {
+			/*
+			 * Mark as guard pages (or page), that will allow to
+			 * merge back to allocator when buddy will be freed.
+			 * Corresponding page table entries will not be touched,
+			 * pages will stay not present in virtual address space
+			 */
+			INIT_LIST_HEAD(&page[size].lru);
+			set_page_guard_flag(&page[size]);
+			set_page_private(&page[size], high);
+			/* Guard pages are not available for any usage */
+			__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
+			continue;
+		}
+#endif
 		list_add(&page[size].lru, &area->free_list[migratetype]);
 		area->nr_free++;
 		set_page_order(&page[size], high);
@@ -1189,6 +1257,19 @@ out:
 }
 
 /*
+ * Free a list of 0-order pages
+ */
+void free_hot_cold_page_list(struct list_head *list, int cold)
+{
+	struct page *page, *next;
+
+	list_for_each_entry_safe(page, next, list, lru) {
+		trace_mm_page_free_batched(page, cold);
+		free_hot_cold_page(page, cold);
+	}
+}
+
+/*
  * split_page takes a non-compound higher-order page, and splits it into
  * n (1<<order) sub-pages: page[0..n]
  * Each sub-page must be freed individually.
@@ -1435,7 +1516,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	long min = mark;
 	int o;
 
-	free_pages -= (1 << order) + 1;
+	free_pages -= (1 << order) - 1;
 	if (alloc_flags & ALLOC_HIGH)
 		min -= min / 2;
 	if (alloc_flags & ALLOC_HARDER)
@@ -1645,6 +1726,35 @@ zonelist_scan:
 		if ((alloc_flags & ALLOC_CPUSET) &&
 			!cpuset_zone_allowed_softwall(zone, gfp_mask))
 				continue;
+		/*
+		 * When allocating a page cache page for writing, we
+		 * want to get it from a zone that is within its dirty
+		 * limit, such that no single zone holds more than its
+		 * proportional share of globally allowed dirty pages.
+		 * The dirty limits take into account the zone's
+		 * lowmem reserves and high watermark so that kswapd
+		 * should be able to balance it without having to
+		 * write pages from its LRU list.
+		 *
+		 * This may look like it could increase pressure on
+		 * lower zones by failing allocations in higher zones
+		 * before they are full. But the pages that do spill
+		 * over are limited as the lower zones are protected
+		 * by this very same mechanism. It should not become
+		 * a practical burden to them.
+		 *
+		 * XXX: For now, allow allocations to potentially
+		 * exceed the per-zone dirty limit in the slowpath
+		 * (ALLOC_WMARK_LOW unset) before going into reclaim,
+		 * which is important when on a NUMA setup the allowed
+		 * zones are together not big enough to reach the
+		 * global limit. The proper fix for these situations
+		 * will require awareness of zones in the
+		 * dirty-throttling and the flusher threads.
+		 */
+		if ((alloc_flags & ALLOC_WMARK_LOW) &&
+		    (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+			goto this_zone_full;
 
 		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1734,7 +1844,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
 {
 	unsigned int filter = SHOW_MEM_FILTER_NODES;
 
-	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+	    debug_guardpage_minorder() > 0)
 		return;
 
 	/*
@@ -1773,12 +1884,25 @@
 
 static inline int
 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+				unsigned long did_some_progress,
 				unsigned long pages_reclaimed)
 {
 	/* Do not loop if specifically requested */
 	if (gfp_mask & __GFP_NORETRY)
 		return 0;
 
+	/* Always retry if specifically requested */
+	if (gfp_mask & __GFP_NOFAIL)
+		return 1;
+
+	/*
+	 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
+	 * making forward progress without invoking OOM. Suspend also disables
+	 * storage devices so kswapd will not help. Bail if we are suspending.
+	 */
+	if (!did_some_progress && pm_suspended_storage())
+		return 0;
+
 	/*
 	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
 	 * means __GFP_NOFAIL, but that may not be true in other
@@ -1797,13 +1921,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
 	if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
 		return 1;
 
-	/*
-	 * Don't let big-order allocations loop unless the caller
-	 * explicitly requests that.
-	 */
-	if (gfp_mask & __GFP_NOFAIL)
-		return 1;
-
 	return 0;
 }
 
@@ -1864,14 +1981,20 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, unsigned long *did_some_progress,
-	bool sync_migration)
+	int migratetype, bool sync_migration,
+	bool *deferred_compaction,
+	unsigned long *did_some_progress)
 {
 	struct page *page;
 
-	if (!order || compaction_deferred(preferred_zone))
+	if (!order)
 		return NULL;
 
+	if (compaction_deferred(preferred_zone)) {
+		*deferred_compaction = true;
+		return NULL;
+	}
+
 	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
 						nodemask, sync_migration);
@@ -1899,7 +2022,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		 * but not enough to satisfy watermarks.
 		 */
 		count_vm_event(COMPACTFAIL);
-		defer_compaction(preferred_zone);
+
+		/*
+		 * As async compaction considers a subset of pageblocks, only
+		 * defer if the failure was a sync compaction failure.
+		 */
+		if (sync_migration)
+			defer_compaction(preferred_zone);
 
 		cond_resched();
 	}
@@ -1911,8 +2040,9 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, unsigned long *did_some_progress,
-	bool sync_migration)
+	int migratetype, bool sync_migration,
+	bool *deferred_compaction,
+	unsigned long *did_some_progress)
 {
 	return NULL;
 }
@@ -2062,6 +2192,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned long pages_reclaimed = 0;
 	unsigned long did_some_progress;
 	bool sync_migration = false;
+	bool deferred_compaction = false;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2142,12 +2273,22 @@ rebalance:
 					zonelist, high_zoneidx,
 					nodemask,
 					alloc_flags, preferred_zone,
-					migratetype, &did_some_progress,
-					sync_migration);
+					migratetype, sync_migration,
+					&deferred_compaction,
+					&did_some_progress);
 	if (page)
 		goto got_pg;
 	sync_migration = true;
 
+	/*
+	 * If compaction is deferred for high-order allocations, it is because
+	 * sync compaction recently failed. In this is the case and the caller
+	 * has requested the system not be heavily disrupted, fail the
+	 * allocation now instead of entering direct reclaim
+	 */
+	if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+		goto nopage;
+
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order,
 					zonelist, high_zoneidx,
@@ -2196,7 +2337,8 @@ rebalance:
 
 	/* Check if we should retry the allocation */
 	pages_reclaimed += did_some_progress;
-	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+	if (should_alloc_retry(gfp_mask, order, did_some_progress,
+						pages_reclaimed)) {
 		/* Wait for some write requests to complete then retry */
 		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 		goto rebalance;
@@ -2210,8 +2352,9 @@ rebalance:
 					zonelist, high_zoneidx,
 					nodemask,
 					alloc_flags, preferred_zone,
-					migratetype, &did_some_progress,
-					sync_migration);
+					migratetype, sync_migration,
+					&deferred_compaction,
+					&did_some_progress);
 		if (page)
 			goto got_pg;
 	}
@@ -2306,16 +2449,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(get_zeroed_page);
 
-void __pagevec_free(struct pagevec *pvec)
-{
-	int i = pagevec_count(pvec);
-
-	while (--i >= 0) {
-		trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
-		free_hot_cold_page(pvec->pages[i], pvec->cold);
-	}
-}
-
 void __free_pages(struct page *page, unsigned int order)
 {
 	if (put_page_testzero(page)) {
@@ -3385,25 +3518,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
 		if (page_to_nid(page) != zone_to_nid(zone))
 			continue;
 
-		/* Blocks with reserved pages will never free, skip them. */
-		block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
-		if (pageblock_is_reserved(pfn, block_end_pfn))
-			continue;
-
 		block_migratetype = get_pageblock_migratetype(page);
 
-		/* If this block is reserved, account for it */
-		if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
-			reserve--;
-			continue;
-		}
+		/* Only test what is necessary when the reserves are not met */
+		if (reserve > 0) {
+			/*
+			 * Blocks with reserved pages will never free, skip
+			 * them.
+			 */
+			block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+			if (pageblock_is_reserved(pfn, block_end_pfn))
+				continue;
 
-		/* Suitable for reserving if this block is movable */
-		if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
-			set_pageblock_migratetype(page, MIGRATE_RESERVE);
-			move_freepages_block(zone, page, MIGRATE_RESERVE);
-			reserve--;
-			continue;
+			/* If this block is reserved, account for it */
+			if (block_migratetype == MIGRATE_RESERVE) {
+				reserve--;
+				continue;
+			}
+
+			/* Suitable for reserving if this block is movable */
+			if (block_migratetype == MIGRATE_MOVABLE) {
+				set_pageblock_migratetype(page,
+							MIGRATE_RESERVE);
+				move_freepages_block(zone, page,
							MIGRATE_RESERVE);
+				reserve--;
+				continue;
+			}
 		}
 
 		/*
@@ -4121,7 +4262,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize, memmap_pages;
-		enum lru_list l;
+		enum lru_list lru;
 
 		size = zone_spanned_pages_in_node(nid, j, zones_size);
 		realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4171,8 +4312,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		zone->zone_pgdat = pgdat;
 
 		zone_pcp_init(zone);
-		for_each_lru(l)
-			INIT_LIST_HEAD(&zone->lru[l].list);
+		for_each_lru(lru)
+			INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
 		zone->reclaim_stat.recent_rotated[0] = 0;
 		zone->reclaim_stat.recent_rotated[1] = 0;
 		zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4526,8 +4667,10 @@ static void check_for_regular_memory(pg_data_t *pgdat)
 
 	for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
 		struct zone *zone = &pgdat->node_zones[zone_type];
-		if (zone->present_pages)
+		if (zone->present_pages) {
 			node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+			break;
+		}
 	}
 #endif
 }
@@ -4734,8 +4877,19 @@ static void calculate_totalreserve_pages(void)
 			if (max > zone->present_pages)
 				max = zone->present_pages;
 			reserve_pages += max;
+			/*
+			 * Lowmem reserves are not available to
+			 * GFP_HIGHUSER page cache allocations and
+			 * kswapd tries to balance zones to their high
+			 * watermark. As a result, neither should be
+			 * regarded as dirtyable memory, to prevent a
+			 * situation where reclaim has to clean pages
+			 * in order to balance the zones.
+			 */
+			zone->dirty_balance_reserve = max;
 		}
 	}
+	dirty_balance_reserve = reserve_pages;
 	totalreserve_pages = reserve_pages;
 }
 
@@ -5259,7 +5413,25 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
 
 bool is_pageblock_removable_nolock(struct page *page)
 {
-	struct zone *zone = page_zone(page);
+	struct zone *zone;
+	unsigned long pfn;
+
+	/*
+	 * We have to be careful here because we are iterating over memory
+	 * sections which are not zone aware so we might end up outside of
+	 * the zone but still within the section.
+	 * We have to take care about the node as well. If the node is offline
+	 * its NODE_DATA will be NULL - see page_zone.
+	 */
+	if (!node_online(page_to_nid(page)))
+		return false;
+
+	zone = page_zone(page);
+	pfn = page_to_pfn(page);
+	if (zone->zone_start_pfn > pfn ||
+	    zone->zone_start_pfn + zone->spanned_pages <= pfn)
+		return false;
+
 	return __count_immobile_pages(zone, page, 0);
 }
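The free_hot_cold_page_list() added above replaces the removed __pagevec_free(): callers batch order-0 pages on a list threaded through page->lru and free them in one call, with one mm_page_free_batched trace event emitted per page. Below is a minimal sketch of such a caller; the function name example_release_pages() and the put_page_testzero() gating are illustrative assumptions, not part of this patch.

/*
 * Sketch only: batch-free a set of order-0 pages with the new helper.
 * example_release_pages() is hypothetical and not taken from this diff.
 */
#include <linux/list.h>
#include <linux/mm.h>

static void example_release_pages(struct page **pages, int nr)
{
	LIST_HEAD(pages_to_free);
	int i;

	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		/* Queue the page only if we dropped the last reference. */
		if (put_page_testzero(page))
			list_add(&page->lru, &pages_to_free);
	}

	/* cold == 0: treat the pages as cache-hot when refilling the per-cpu lists. */
	free_hot_cold_page_list(&pages_to_free, 0);
}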
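calculate_totalreserve_pages() above also sums the new dirty_balance_reserve, which the dirty-throttling code is meant to subtract when it sizes the pool of dirtyable memory; the consumer of this value lives outside this file and is not shown in this diff. The sketch below illustrates that intended use under stated assumptions: the function name and the exact choice of counters are illustrative, not taken from this patch.

/*
 * Sketch only: a global dirtyable-memory estimate that excludes the
 * per-zone reserves accumulated in dirty_balance_reserve.
 */
#include <linux/mmzone.h>
#include <linux/swap.h>
#include <linux/vmstat.h>

/* Global introduced by this patch; its header declaration is not shown here. */
extern unsigned long dirty_balance_reserve;

static unsigned long example_global_dirtyable_memory(void)
{
	unsigned long x;

	/* Free plus reclaimable pages approximate what could hold dirty data. */
	x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();

	/* High watermarks and lowmem reserves should never be dirtied. */
	if (x > dirty_balance_reserve)
		x -= dirty_balance_reserve;
	else
		x = 0;

	return x + 1;	/* never return zero; callers may divide by this */
}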