summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/sysctl/vm.txt14
-rw-r--r--include/linux/mmzone.h6
-rw-r--r--include/linux/swap.h1
-rw-r--r--include/linux/sysctl.h2
-rw-r--r--kernel/sysctl.c11
-rw-r--r--mm/page_alloc.c22
-rw-r--r--mm/vmscan.c27
7 files changed, 69 insertions, 14 deletions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 86754eb390d..7cee90223d3 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -28,6 +28,7 @@ Currently, these files are in /proc/sys/vm:
- block_dump
- drop-caches
- zone_reclaim_mode
+- min_unmapped_ratio
- panic_on_oom
==============================================================
@@ -168,6 +169,19 @@ in all nodes of the system.
=============================================================
+min_unmapped_ratio:
+
+This is available only on NUMA kernels.
+
+This is a percentage of the total pages in each zone. Zone reclaim will only
+occur if more than this percentage of pages are file backed and unmapped.
+This is to ensure that a minimal amount of local pages is still available for
+file I/O even if the node is overallocated.
+
+The default is 1 percent.
+
+=============================================================
+
panic_on_oom
This enables or disables panic on out-of-memory feature. If this is set to 1,
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 27e748eb72b..656b588a9f9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -150,6 +150,10 @@ struct zone {
unsigned long lowmem_reserve[MAX_NR_ZONES];
#ifdef CONFIG_NUMA
+ /*
+ * Threshold in pages: zone reclaim becomes active if more unmapped pages exist.
+ */
+ unsigned long min_unmapped_ratio;
struct per_cpu_pageset *pageset[NR_CPUS];
#else
struct per_cpu_pageset pageset[NR_CPUS];
@@ -414,6 +418,8 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
+int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
+ struct file *, void __user *, size_t *, loff_t *);
#include <linux/topology.h>
/* Returns the number of the current Node. */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index cf6ca6e377b..5e59184c909 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -189,6 +189,7 @@ extern long vm_total_pages;
#ifdef CONFIG_NUMA
extern int zone_reclaim_mode;
+extern int sysctl_min_unmapped_ratio;
extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
#else
#define zone_reclaim_mode 0
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 46e4d8f2771..e4b1a4d4dcf 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -188,7 +188,7 @@ enum
VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */
VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */
VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */
- VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */
+ VM_MIN_UNMAPPED=32, /* Set min percent of unmapped pages */
VM_PANIC_ON_OOM=33, /* panic at out-of-memory */
VM_VDSO_ENABLED=34, /* map VDSO into new processes? */
};
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 99a58f27907..362a0cc3713 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -932,6 +932,17 @@ static ctl_table vm_table[] = {
.strategy = &sysctl_intvec,
.extra1 = &zero,
},
+ {
+ .ctl_name = VM_MIN_UNMAPPED,
+ .procname = "min_unmapped_ratio",
+ .data = &sysctl_min_unmapped_ratio,
+ .maxlen = sizeof(sysctl_min_unmapped_ratio),
+ .mode = 0644,
+ .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
#endif
#ifdef CONFIG_X86_32
{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3e792a583f3..54a4f5375bb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2005,6 +2005,10 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
zone->spanned_pages = size;
zone->present_pages = realsize;
+#ifdef CONFIG_NUMA
+ zone->min_unmapped_ratio = (realsize*sysctl_min_unmapped_ratio)
+ / 100;
+#endif
zone->name = zone_names[j];
spin_lock_init(&zone->lock);
spin_lock_init(&zone->lru_lock);
@@ -2298,6 +2302,24 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
return 0;
}
+#ifdef CONFIG_NUMA
+int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ struct zone *zone;
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ if (rc)
+ return rc;
+
+ for_each_zone(zone)
+ zone->min_unmapped_ratio = (zone->present_pages *
+ sysctl_min_unmapped_ratio) / 100;
+ return 0;
+}
+#endif
+
/*
* lowmem_reserve_ratio_sysctl_handler - just a wrapper around
* proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ff2ebe9458a..5d4c4d02254 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1503,10 +1503,6 @@ module_init(kswapd_init)
*
* If non-zero call zone_reclaim when the number of free pages falls below
* the watermarks.
- *
- * In the future we may add flags to the mode. However, the page allocator
- * should only have to check that zone_reclaim_mode != 0 before calling
- * zone_reclaim().
*/
int zone_reclaim_mode __read_mostly;
@@ -1524,6 +1520,12 @@ int zone_reclaim_mode __read_mostly;
#define ZONE_RECLAIM_PRIORITY 4
/*
+ * Percentage of pages in a zone that must be unmapped file backed pages for
+ * zone_reclaim to occur.
+ */
+int sysctl_min_unmapped_ratio = 1;
+
+/*
* Try to free up some pages from this zone through reclaim.
*/
static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
@@ -1590,18 +1592,17 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
int node_id;
/*
- * Do not reclaim if there are not enough reclaimable pages in this
- * zone that would satify this allocations.
+ * Zone reclaim reclaims unmapped file backed pages.
*
- * All unmapped pagecache pages are reclaimable.
- *
- * Both counters may be temporarily off a bit so we use
- * SWAP_CLUSTER_MAX as the boundary. It may also be good to
- * leave a few frequently used unmapped pagecache pages around.
+ * A small portion of unmapped file backed pages is needed for
+ * file I/O otherwise pages read by file I/O will be immediately
+ * thrown out if the zone is overallocated. So we do not reclaim
+ * if less than a specified percentage of the zone is used by
+ * unmapped file backed pages.
*/
if (zone_page_state(zone, NR_FILE_PAGES) -
- zone_page_state(zone, NR_FILE_MAPPED) < SWAP_CLUSTER_MAX)
- return 0;
+ zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio)
+ return 0;
/*
* Avoid concurrent zone reclaims, do not reclaim in a zone that does