summaryrefslogtreecommitdiff
path: root/mm/vmscan.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--mm/vmscan.c397
1 files changed, 213 insertions, 184 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 196709f5ee5..88c5fed8b9a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -128,7 +128,7 @@ struct scan_control {
* From 0 .. 100. Higher means more swappy.
*/
int vm_swappiness = 60;
-long vm_total_pages; /* The total number of pages which the VM controls */
+unsigned long vm_total_pages; /* The total number of pages which the VM controls */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
@@ -1579,16 +1579,6 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec)
}
#endif
-static int inactive_file_is_low_global(struct zone *zone)
-{
- unsigned long active, inactive;
-
- active = zone_page_state(zone, NR_ACTIVE_FILE);
- inactive = zone_page_state(zone, NR_INACTIVE_FILE);
-
- return (active > inactive);
-}
-
/**
* inactive_file_is_low - check if file pages need to be deactivated
* @lruvec: LRU vector to check
@@ -1605,10 +1595,13 @@ static int inactive_file_is_low_global(struct zone *zone)
*/
static int inactive_file_is_low(struct lruvec *lruvec)
{
- if (!mem_cgroup_disabled())
- return mem_cgroup_inactive_file_is_low(lruvec);
+ unsigned long inactive;
+ unsigned long active;
+
+ inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
+ active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
- return inactive_file_is_low_global(lruvec_zone(lruvec));
+ return active > inactive;
}
static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
@@ -1638,6 +1631,13 @@ static int vmscan_swappiness(struct scan_control *sc)
return mem_cgroup_swappiness(sc->target_mem_cgroup);
}
+enum scan_balance {
+ SCAN_EQUAL,
+ SCAN_FRACT,
+ SCAN_ANON,
+ SCAN_FILE,
+};
+
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned. The relative value of each set of LRU lists is determined
@@ -1650,15 +1650,16 @@ static int vmscan_swappiness(struct scan_control *sc)
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long *nr)
{
- unsigned long anon, file, free;
+ struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+ u64 fraction[2];
+ u64 denominator = 0; /* gcc */
+ struct zone *zone = lruvec_zone(lruvec);
unsigned long anon_prio, file_prio;
+ enum scan_balance scan_balance;
+ unsigned long anon, file, free;
+ bool force_scan = false;
unsigned long ap, fp;
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
- u64 fraction[2], denominator;
enum lru_list lru;
- int noswap = 0;
- bool force_scan = false;
- struct zone *zone = lruvec_zone(lruvec);
/*
* If the zone or memcg is small, nr[l] can be 0. This
@@ -1676,11 +1677,30 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
force_scan = true;
/* If we have no swap space, do not bother scanning anon pages. */
- if (!sc->may_swap || (nr_swap_pages <= 0)) {
- noswap = 1;
- fraction[0] = 0;
- fraction[1] = 1;
- denominator = 1;
+ if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
+ scan_balance = SCAN_FILE;
+ goto out;
+ }
+
+ /*
+ * Global reclaim will swap to prevent OOM even with no
+ * swappiness, but memcg users want to use this knob to
+ * disable swapping for individual groups completely when
+ * using the memory controller's swap limit feature would be
+ * too expensive.
+ */
+ if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
+ scan_balance = SCAN_FILE;
+ goto out;
+ }
+
+ /*
+ * Do not apply any pressure balancing cleverness when the
+ * system is close to OOM, scan both anon and file equally
+ * (unless the swappiness setting disagrees with swapping).
+ */
+ if (!sc->priority && vmscan_swappiness(sc)) {
+ scan_balance = SCAN_EQUAL;
goto out;
}
@@ -1689,30 +1709,32 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
get_lru_size(lruvec, LRU_INACTIVE_FILE);
+ /*
+ * If it's foreseeable that reclaiming the file cache won't be
+ * enough to get the zone back into a desirable shape, we have
+ * to swap. Better start now and leave the - probably heavily
+ * thrashing - remaining file pages alone.
+ */
if (global_reclaim(sc)) {
- free = zone_page_state(zone, NR_FREE_PAGES);
+ free = zone_page_state(zone, NR_FREE_PAGES);
if (unlikely(file + free <= high_wmark_pages(zone))) {
- /*
- * If we have very few page cache pages, force-scan
- * anon pages.
- */
- fraction[0] = 1;
- fraction[1] = 0;
- denominator = 1;
- goto out;
- } else if (!inactive_file_is_low_global(zone)) {
- /*
- * There is enough inactive page cache, do not
- * reclaim anything from the working set right now.
- */
- fraction[0] = 0;
- fraction[1] = 1;
- denominator = 1;
+ scan_balance = SCAN_ANON;
goto out;
}
}
/*
+ * There is enough inactive page cache, do not reclaim
+ * anything from the anonymous working set right now.
+ */
+ if (!inactive_file_is_low(lruvec)) {
+ scan_balance = SCAN_FILE;
+ goto out;
+ }
+
+ scan_balance = SCAN_FRACT;
+
+ /*
* With swappiness at 100, anonymous and file have the same priority.
* This scanning priority is essentially the inverse of IO cost.
*/
@@ -1759,19 +1781,92 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
out:
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
+ unsigned long size;
unsigned long scan;
- scan = get_lru_size(lruvec, lru);
- if (sc->priority || noswap || !vmscan_swappiness(sc)) {
- scan >>= sc->priority;
- if (!scan && force_scan)
- scan = SWAP_CLUSTER_MAX;
+ size = get_lru_size(lruvec, lru);
+ scan = size >> sc->priority;
+
+ if (!scan && force_scan)
+ scan = min(size, SWAP_CLUSTER_MAX);
+
+ switch (scan_balance) {
+ case SCAN_EQUAL:
+ /* Scan lists relative to size */
+ break;
+ case SCAN_FRACT:
+ /*
+ * Scan types proportional to swappiness and
+ * their relative recent reclaim efficiency.
+ */
scan = div64_u64(scan * fraction[file], denominator);
+ break;
+ case SCAN_FILE:
+ case SCAN_ANON:
+ /* Scan one type exclusively */
+ if ((scan_balance == SCAN_FILE) != file)
+ scan = 0;
+ break;
+ default:
+ /* Look ma, no brain */
+ BUG();
}
nr[lru] = scan;
}
}
+/*
+ * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
+ */
+static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ unsigned long nr[NR_LRU_LISTS];
+ unsigned long nr_to_scan;
+ enum lru_list lru;
+ unsigned long nr_reclaimed = 0;
+ unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ struct blk_plug plug;
+
+ get_scan_count(lruvec, sc, nr);
+
+ blk_start_plug(&plug);
+ while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+ nr[LRU_INACTIVE_FILE]) {
+ for_each_evictable_lru(lru) {
+ if (nr[lru]) {
+ nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
+ nr[lru] -= nr_to_scan;
+
+ nr_reclaimed += shrink_list(lru, nr_to_scan,
+ lruvec, sc);
+ }
+ }
+ /*
+ * On large memory systems, scan >> priority can become
+ * really large. This is fine for the starting priority;
+ * we want to put equal scanning pressure on each zone.
+ * However, if the VM has a harder time of freeing pages,
+ * with multiple processes reclaiming pages, the total
+ * freeing target can get unreasonably large.
+ */
+ if (nr_reclaimed >= nr_to_reclaim &&
+ sc->priority < DEF_PRIORITY)
+ break;
+ }
+ blk_finish_plug(&plug);
+ sc->nr_reclaimed += nr_reclaimed;
+
+ /*
+ * Even if we did not try to evict anon pages at all, we want to
+ * rebalance the anon lru active/inactive ratio.
+ */
+ if (inactive_anon_is_low(lruvec))
+ shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+ sc, LRU_ACTIVE_ANON);
+
+ throttle_vm_writeout(sc->gfp_mask);
+}
+
/* Use reclaim/compaction for costly allocs or under memory pressure */
static bool in_reclaim_compaction(struct scan_control *sc)
{
@@ -1790,7 +1885,7 @@ static bool in_reclaim_compaction(struct scan_control *sc)
* calls try_to_compact_zone() that it will have enough free pages to succeed.
* It will give up earlier than that if there is difficulty reclaiming pages.
*/
-static inline bool should_continue_reclaim(struct lruvec *lruvec,
+static inline bool should_continue_reclaim(struct zone *zone,
unsigned long nr_reclaimed,
unsigned long nr_scanned,
struct scan_control *sc)
@@ -1830,15 +1925,15 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
* inactive lists are large enough, continue reclaiming
*/
pages_for_compaction = (2UL << sc->order);
- inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
- if (nr_swap_pages > 0)
- inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
+ inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
+ if (get_nr_swap_pages() > 0)
+ inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
if (sc->nr_reclaimed < pages_for_compaction &&
inactive_lru_pages > pages_for_compaction)
return true;
/* If compaction would go ahead or the allocation would succeed, stop */
- switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) {
+ switch (compaction_suitable(zone, sc->order)) {
case COMPACT_PARTIAL:
case COMPACT_CONTINUE:
return false;
@@ -1847,98 +1942,48 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
}
}
-/*
- * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
- */
-static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
{
- unsigned long nr[NR_LRU_LISTS];
- unsigned long nr_to_scan;
- enum lru_list lru;
unsigned long nr_reclaimed, nr_scanned;
- unsigned long nr_to_reclaim = sc->nr_to_reclaim;
- struct blk_plug plug;
-
-restart:
- nr_reclaimed = 0;
- nr_scanned = sc->nr_scanned;
- get_scan_count(lruvec, sc, nr);
-
- blk_start_plug(&plug);
- while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
- nr[LRU_INACTIVE_FILE]) {
- for_each_evictable_lru(lru) {
- if (nr[lru]) {
- nr_to_scan = min_t(unsigned long,
- nr[lru], SWAP_CLUSTER_MAX);
- nr[lru] -= nr_to_scan;
-
- nr_reclaimed += shrink_list(lru, nr_to_scan,
- lruvec, sc);
- }
- }
- /*
- * On large memory systems, scan >> priority can become
- * really large. This is fine for the starting priority;
- * we want to put equal scanning pressure on each zone.
- * However, if the VM has a harder time of freeing pages,
- * with multiple processes reclaiming pages, the total
- * freeing target can get unreasonably large.
- */
- if (nr_reclaimed >= nr_to_reclaim &&
- sc->priority < DEF_PRIORITY)
- break;
- }
- blk_finish_plug(&plug);
- sc->nr_reclaimed += nr_reclaimed;
- /*
- * Even if we did not try to evict anon pages at all, we want to
- * rebalance the anon lru active/inactive ratio.
- */
- if (inactive_anon_is_low(lruvec))
- shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
- sc, LRU_ACTIVE_ANON);
-
- /* reclaim/compaction might need reclaim to continue */
- if (should_continue_reclaim(lruvec, nr_reclaimed,
- sc->nr_scanned - nr_scanned, sc))
- goto restart;
+ do {
+ struct mem_cgroup *root = sc->target_mem_cgroup;
+ struct mem_cgroup_reclaim_cookie reclaim = {
+ .zone = zone,
+ .priority = sc->priority,
+ };
+ struct mem_cgroup *memcg;
- throttle_vm_writeout(sc->gfp_mask);
-}
+ nr_reclaimed = sc->nr_reclaimed;
+ nr_scanned = sc->nr_scanned;
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
-{
- struct mem_cgroup *root = sc->target_mem_cgroup;
- struct mem_cgroup_reclaim_cookie reclaim = {
- .zone = zone,
- .priority = sc->priority,
- };
- struct mem_cgroup *memcg;
+ memcg = mem_cgroup_iter(root, NULL, &reclaim);
+ do {
+ struct lruvec *lruvec;
- memcg = mem_cgroup_iter(root, NULL, &reclaim);
- do {
- struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- shrink_lruvec(lruvec, sc);
+ shrink_lruvec(lruvec, sc);
- /*
- * Limit reclaim has historically picked one memcg and
- * scanned it with decreasing priority levels until
- * nr_to_reclaim had been reclaimed. This priority
- * cycle is thus over after a single memcg.
- *
- * Direct reclaim and kswapd, on the other hand, have
- * to scan all memory cgroups to fulfill the overall
- * scan target for the zone.
- */
- if (!global_reclaim(sc)) {
- mem_cgroup_iter_break(root, memcg);
- break;
- }
- memcg = mem_cgroup_iter(root, memcg, &reclaim);
- } while (memcg);
+ /*
+ * Direct reclaim and kswapd have to scan all memory
+ * cgroups to fulfill the overall scan target for the
+ * zone.
+ *
+ * Limit reclaim, on the other hand, only cares about
+ * nr_to_reclaim pages to be reclaimed and it will
+ * retry with decreasing priority if one round over the
+ * whole hierarchy is not sufficient.
+ */
+ if (!global_reclaim(sc) &&
+ sc->nr_reclaimed >= sc->nr_to_reclaim) {
+ mem_cgroup_iter_break(root, memcg);
+ break;
+ }
+ memcg = mem_cgroup_iter(root, memcg, &reclaim);
+ } while (memcg);
+ } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
+ sc->nr_scanned - nr_scanned, sc));
}
/* Returns true if compaction should go ahead for a high-order request */
@@ -1958,7 +2003,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
* a reasonable chance of completing and allocating the page
*/
balance_gap = min(low_wmark_pages(zone),
- (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+ (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
KSWAPD_ZONE_BALANCE_GAP_RATIO);
watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
@@ -2150,6 +2195,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
goto out;
/*
+ * If we're getting trouble reclaiming, start doing
+ * writepage even in laptop mode.
+ */
+ if (sc->priority < DEF_PRIORITY - 2)
+ sc->may_writepage = 1;
+
+ /*
* Try to write back as many pages as we just scanned. This
* tends to cause slow streaming writers to write data to the
* disk smoothly, at the dirtying rate, which is nice. But
@@ -2300,7 +2352,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
{
unsigned long nr_reclaimed;
struct scan_control sc = {
- .gfp_mask = gfp_mask,
+ .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
.may_writepage = !laptop_mode,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_unmap = 1,
@@ -2473,7 +2525,7 @@ static bool zone_balanced(struct zone *zone, int order,
*/
static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
- unsigned long present_pages = 0;
+ unsigned long managed_pages = 0;
unsigned long balanced_pages = 0;
int i;
@@ -2484,7 +2536,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
if (!populated_zone(zone))
continue;
- present_pages += zone->present_pages;
+ managed_pages += zone->managed_pages;
/*
* A special case here:
@@ -2494,18 +2546,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
* they must be considered balanced here as well!
*/
if (zone->all_unreclaimable) {
- balanced_pages += zone->present_pages;
+ balanced_pages += zone->managed_pages;
continue;
}
if (zone_balanced(zone, order, 0, i))
- balanced_pages += zone->present_pages;
+ balanced_pages += zone->managed_pages;
else if (!order)
return false;
}
if (order)
- return balanced_pages >= (present_pages >> 2);
+ return balanced_pages >= (managed_pages >> 2);
else
return true;
}
@@ -2564,7 +2616,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
int *classzone_idx)
{
- struct zone *unbalanced_zone;
+ bool pgdat_is_balanced = false;
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long total_scanned;
@@ -2595,9 +2647,6 @@ loop_again:
do {
unsigned long lru_pages = 0;
- int has_under_min_watermark_zone = 0;
-
- unbalanced_zone = NULL;
/*
* Scan in the highmem->dma direction for the highest
@@ -2638,8 +2687,11 @@ loop_again:
zone_clear_flag(zone, ZONE_CONGESTED);
}
}
- if (i < 0)
+
+ if (i < 0) {
+ pgdat_is_balanced = true;
goto out;
+ }
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
@@ -2689,7 +2741,7 @@ loop_again:
* of the zone, whichever is smaller.
*/
balance_gap = min(low_wmark_pages(zone),
- (zone->present_pages +
+ (zone->managed_pages +
KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
KSWAPD_ZONE_BALANCE_GAP_RATIO);
/*
@@ -2720,12 +2772,10 @@ loop_again:
}
/*
- * If we've done a decent amount of scanning and
- * the reclaim ratio is low, start doing writepage
- * even in laptop mode
+ * If we're getting trouble reclaiming, start doing
+ * writepage even in laptop mode.
*/
- if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
- total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
+ if (sc.priority < DEF_PRIORITY - 2)
sc.may_writepage = 1;
if (zone->all_unreclaimable) {
@@ -2734,17 +2784,7 @@ loop_again:
continue;
}
- if (!zone_balanced(zone, testorder, 0, end_zone)) {
- unbalanced_zone = zone;
- /*
- * We are still under min water mark. This
- * means that we have a GFP_ATOMIC allocation
- * failure risk. Hurry up!
- */
- if (!zone_watermark_ok_safe(zone, order,
- min_wmark_pages(zone), end_zone, 0))
- has_under_min_watermark_zone = 1;
- } else {
+ if (zone_balanced(zone, testorder, 0, end_zone))
/*
* If a zone reaches its high watermark,
* consider it to be no longer congested. It's
@@ -2753,8 +2793,6 @@ loop_again:
* speculatively avoid congestion waits
*/
zone_clear_flag(zone, ZONE_CONGESTED);
- }
-
}
/*
@@ -2766,17 +2804,9 @@ loop_again:
pfmemalloc_watermark_ok(pgdat))
wake_up(&pgdat->pfmemalloc_wait);
- if (pgdat_balanced(pgdat, order, *classzone_idx))
+ if (pgdat_balanced(pgdat, order, *classzone_idx)) {
+ pgdat_is_balanced = true;
break; /* kswapd: all done */
- /*
- * OK, kswapd is getting into trouble. Take a nap, then take
- * another pass across the zones.
- */
- if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
- if (has_under_min_watermark_zone)
- count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
- else if (unbalanced_zone)
- wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
}
/*
@@ -2788,9 +2818,9 @@ loop_again:
if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
break;
} while (--sc.priority >= 0);
-out:
- if (!pgdat_balanced(pgdat, order, *classzone_idx)) {
+out:
+ if (!pgdat_is_balanced) {
cond_resched();
try_to_freeze();
@@ -3053,7 +3083,7 @@ unsigned long global_reclaimable_pages(void)
nr = global_page_state(NR_ACTIVE_FILE) +
global_page_state(NR_INACTIVE_FILE);
- if (nr_swap_pages > 0)
+ if (get_nr_swap_pages() > 0)
nr += global_page_state(NR_ACTIVE_ANON) +
global_page_state(NR_INACTIVE_ANON);
@@ -3067,7 +3097,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
nr = zone_page_state(zone, NR_ACTIVE_FILE) +
zone_page_state(zone, NR_INACTIVE_FILE);
- if (nr_swap_pages > 0)
+ if (get_nr_swap_pages() > 0)
nr += zone_page_state(zone, NR_ACTIVE_ANON) +
zone_page_state(zone, NR_INACTIVE_ANON);
@@ -3280,9 +3310,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
.may_swap = 1,
- .nr_to_reclaim = max_t(unsigned long, nr_pages,
- SWAP_CLUSTER_MAX),
- .gfp_mask = gfp_mask,
+ .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
+ .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
.order = order,
.priority = ZONE_RECLAIM_PRIORITY,
};