Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |   20
-rw-r--r--  mm/Makefile           |    3
-rw-r--r--  mm/backing-dev.c      |  469
-rw-r--r--  mm/bootmem.c          |   24
-rw-r--r--  mm/compaction.c       |  605
-rw-r--r--  mm/filemap.c          |   59
-rw-r--r--  mm/highmem.c          |    9
-rw-r--r--  mm/hugetlb.c          |   18
-rw-r--r--  mm/init-mm.c          |    6
-rw-r--r--  mm/kmemleak.c         |  100
-rw-r--r--  mm/ksm.c              |   71
-rw-r--r--  mm/memblock.c         |  541
-rw-r--r--  mm/memcontrol.c       | 1139
-rw-r--r--  mm/memory-failure.c   |   33
-rw-r--r--  mm/memory.c           |   41
-rw-r--r--  mm/memory_hotplug.c   |   36
-rw-r--r--  mm/mempolicy.c        |  314
-rw-r--r--  mm/migrate.c          |   70
-rw-r--r--  mm/mincore.c          |  263
-rw-r--r--  mm/mlock.c            |   41
-rw-r--r--  mm/mmap.c             |   50
-rw-r--r--  mm/msync.c            |    2
-rw-r--r--  mm/nommu.c            |   32
-rw-r--r--  mm/oom_kill.c         |  688
-rw-r--r--  mm/page-writeback.c   |  300
-rw-r--r--  mm/page_alloc.c       |  360
-rw-r--r--  mm/page_cgroup.c      |    7
-rw-r--r--  mm/page_io.c          |    2
-rw-r--r--  mm/percpu-km.c        |  104
-rw-r--r--  mm/percpu-vm.c        |  451
-rw-r--r--  mm/percpu.c           |  694
-rw-r--r--  mm/readahead.c        |    2
-rw-r--r--  mm/rmap.c             |  167
-rw-r--r--  mm/shmem.c            |  268
-rw-r--r--  mm/slab.c             |  254
-rw-r--r--  mm/slob.c             |   22
-rw-r--r--  mm/slub.c             |  172
-rw-r--r--  mm/sparse.c           |    9
-rw-r--r--  mm/swap.c             |    1
-rw-r--r--  mm/swapfile.c         |  114
-rw-r--r--  mm/truncate.c         |   38
-rw-r--r--  mm/util.c             |   11
-rw-r--r--  mm/vmalloc.c          |    9
-rw-r--r--  mm/vmscan.c           |  754
-rw-r--r--  mm/vmstat.c           |  261
45 files changed, 5885 insertions, 2749 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 9c61158308d..f4e516e9c37 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -128,6 +128,9 @@ config SPARSEMEM_VMEMMAP
pfn_to_page and page_to_pfn operations. This is the most
efficient option when sufficient kernel resources are available.
+config HAVE_MEMBLOCK
+ boolean
+
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
@@ -172,6 +175,15 @@ config SPLIT_PTLOCK_CPUS
default "4"
#
+# support for memory compaction
+config COMPACTION
+ bool "Allow for memory compaction"
+ select MIGRATION
+ depends on EXPERIMENTAL && HUGETLB_PAGE && MMU
+ help
+ Allows the compaction of memory for the allocation of huge pages.
+
+#
# support for page migration
#
config MIGRATION
@@ -180,9 +192,11 @@ config MIGRATION
depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
help
Allows the migration of the physical location of pages of processes
- while the virtual addresses are not changed. This is useful for
- example on NUMA systems to put pages nearer to the processors accessing
- the page.
+ while the virtual addresses are not changed. This is useful in
+ two situations. The first is on NUMA systems to put pages nearer
+ to the processors accessing them. The second is when allocating
+ huge pages, as migration can relocate pages to satisfy a huge page
+ allocation instead of reclaiming.
config PHYS_ADDR_T_64BIT
def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
diff --git a/mm/Makefile b/mm/Makefile
index 6c2a73a54a4..34b2546a9e3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,6 +15,8 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
$(mmu-y)
obj-y += init-mm.o
+obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
+
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
@@ -23,6 +25,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_COMPACTION) += compaction.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
obj-$(CONFIG_KSM) += ksm.o
obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 707d0dc6da0..eaa4a5bbe06 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
+#include <trace/events/writeback.h>
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
@@ -48,9 +49,6 @@ static struct timer_list sync_supers_timer;
static int bdi_sync_supers(void *);
static void sync_supers_timer_fn(unsigned long);
-static void arm_supers_timer(void);
-
-static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
@@ -66,31 +64,25 @@ static void bdi_debug_init(void)
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
struct backing_dev_info *bdi = m->private;
- struct bdi_writeback *wb;
+ struct bdi_writeback *wb = &bdi->wb;
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
struct inode *inode;
- /*
- * inode lock is enough here, the bdi->wb_list is protected by
- * RCU on the reader side
- */
nr_wb = nr_dirty = nr_io = nr_more_io = 0;
spin_lock(&inode_lock);
- list_for_each_entry(wb, &bdi->wb_list, list) {
- nr_wb++;
- list_for_each_entry(inode, &wb->b_dirty, i_list)
- nr_dirty++;
- list_for_each_entry(inode, &wb->b_io, i_list)
- nr_io++;
- list_for_each_entry(inode, &wb->b_more_io, i_list)
- nr_more_io++;
- }
+ list_for_each_entry(inode, &wb->b_dirty, i_list)
+ nr_dirty++;
+ list_for_each_entry(inode, &wb->b_io, i_list)
+ nr_io++;
+ list_for_each_entry(inode, &wb->b_more_io, i_list)
+ nr_more_io++;
spin_unlock(&inode_lock);
- get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
#define K(x) ((x) << (PAGE_SHIFT - 10))
seq_printf(m,
@@ -99,21 +91,16 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"BdiDirtyThresh: %8lu kB\n"
"DirtyThresh: %8lu kB\n"
"BackgroundThresh: %8lu kB\n"
- "WritebackThreads: %8lu\n"
"b_dirty: %8lu\n"
"b_io: %8lu\n"
"b_more_io: %8lu\n"
"bdi_list: %8u\n"
- "state: %8lx\n"
- "wb_mask: %8lx\n"
- "wb_list: %8u\n"
- "wb_cnt: %8u\n",
+ "state: %8lx\n",
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
(unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
K(bdi_thresh), K(dirty_thresh),
- K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
- !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
- !list_empty(&bdi->wb_list), bdi->wb_cnt);
+ K(background_thresh), nr_dirty, nr_io, nr_more_io,
+ !list_empty(&bdi->bdi_list), bdi->state);
#undef K
return 0;
@@ -250,9 +237,8 @@ static int __init default_bdi_init(void)
sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
BUG_ON(IS_ERR(sync_supers_tsk));
- init_timer(&sync_supers_timer);
setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
- arm_supers_timer();
+ bdi_arm_supers_timer();
err = bdi_init(&default_backing_dev_info);
if (!err)
@@ -262,77 +248,6 @@ static int __init default_bdi_init(void)
}
subsys_initcall(default_bdi_init);
-static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
-{
- memset(wb, 0, sizeof(*wb));
-
- wb->bdi = bdi;
- wb->last_old_flush = jiffies;
- INIT_LIST_HEAD(&wb->b_dirty);
- INIT_LIST_HEAD(&wb->b_io);
- INIT_LIST_HEAD(&wb->b_more_io);
-}
-
-static void bdi_task_init(struct backing_dev_info *bdi,
- struct bdi_writeback *wb)
-{
- struct task_struct *tsk = current;
-
- spin_lock(&bdi->wb_lock);
- list_add_tail_rcu(&wb->list, &bdi->wb_list);
- spin_unlock(&bdi->wb_lock);
-
- tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
- set_freezable();
-
- /*
- * Our parent may run at a different priority, just set us to normal
- */
- set_user_nice(tsk, 0);
-}
-
-static int bdi_start_fn(void *ptr)
-{
- struct bdi_writeback *wb = ptr;
- struct backing_dev_info *bdi = wb->bdi;
- int ret;
-
- /*
- * Add us to the active bdi_list
- */
- spin_lock_bh(&bdi_lock);
- list_add_rcu(&bdi->bdi_list, &bdi_list);
- spin_unlock_bh(&bdi_lock);
-
- bdi_task_init(bdi, wb);
-
- /*
- * Clear pending bit and wakeup anybody waiting to tear us down
- */
- clear_bit(BDI_pending, &bdi->state);
- smp_mb__after_clear_bit();
- wake_up_bit(&bdi->state, BDI_pending);
-
- ret = bdi_writeback_task(wb);
-
- /*
- * Remove us from the list
- */
- spin_lock(&bdi->wb_lock);
- list_del_rcu(&wb->list);
- spin_unlock(&bdi->wb_lock);
-
- /*
- * Flush any work that raced with us exiting. No new work
- * will be added, since this bdi isn't discoverable anymore.
- */
- if (!list_empty(&bdi->work_list))
- wb_do_writeback(wb, 1);
-
- wb->task = NULL;
- return ret;
-}
-
int bdi_has_dirty_io(struct backing_dev_info *bdi)
{
return wb_has_dirty_io(&bdi->wb);
@@ -341,21 +256,20 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
static void bdi_flush_io(struct backing_dev_info *bdi)
{
struct writeback_control wbc = {
- .bdi = bdi,
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.range_cyclic = 1,
.nr_to_write = 1024,
};
- writeback_inodes_wbc(&wbc);
+ writeback_inodes_wb(&bdi->wb, &wbc);
}
/*
- * kupdated() used to do this. We cannot do it from the bdi_forker_task()
+ * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
* or we risk deadlocking on ->s_umount. The longer term solution would be
* to implement sync_supers_bdi() or similar and simply do it from the
- * bdi writeback tasks individually.
+ * bdi writeback thread individually.
*/
static int bdi_sync_supers(void *unused)
{
@@ -374,10 +288,13 @@ static int bdi_sync_supers(void *unused)
return 0;
}
-static void arm_supers_timer(void)
+void bdi_arm_supers_timer(void)
{
unsigned long next;
+ if (!dirty_writeback_interval)
+ return;
+
next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
mod_timer(&sync_supers_timer, round_jiffies_up(next));
}
@@ -385,142 +302,199 @@ static void arm_supers_timer(void)
static void sync_supers_timer_fn(unsigned long unused)
{
wake_up_process(sync_supers_tsk);
- arm_supers_timer();
+ bdi_arm_supers_timer();
}
-static int bdi_forker_task(void *ptr)
+static void wakeup_timer_fn(unsigned long data)
{
- struct bdi_writeback *me = ptr;
-
- bdi_task_init(me->bdi, me);
-
- for (;;) {
- struct backing_dev_info *bdi, *tmp;
- struct bdi_writeback *wb;
+ struct backing_dev_info *bdi = (struct backing_dev_info *)data;
+ spin_lock_bh(&bdi->wb_lock);
+ if (bdi->wb.task) {
+ trace_writeback_wake_thread(bdi);
+ wake_up_process(bdi->wb.task);
+ } else {
/*
- * Temporary measure, we want to make sure we don't see
- * dirty data on the default backing_dev_info
+ * When bdi tasks are inactive for long time, they are killed.
+ * In this case we have to wake-up the forker thread which
+ * should create and run the bdi thread.
*/
- if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
- wb_do_writeback(me, 0);
+ trace_writeback_wake_forker_thread(bdi);
+ wake_up_process(default_backing_dev_info.wb.task);
+ }
+ spin_unlock_bh(&bdi->wb_lock);
+}
- spin_lock_bh(&bdi_lock);
+/*
+ * This function is used when the first inode for this bdi is marked dirty. It
+ * wakes-up the corresponding bdi thread which should then take care of the
+ * periodic background write-out of dirty inodes. Since the write-out would
+ * start only 'dirty_writeback_interval' centisecs from now anyway, we just
+ * set up a timer which wakes the bdi thread up later.
+ *
+ * Note, we wouldn't bother setting up the timer, but this function is on the
+ * fast-path (used by '__mark_inode_dirty()'), so we save a few context switches
+ * by delaying the wake-up.
+ */
+void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
+{
+ unsigned long timeout;
- /*
- * Check if any existing bdi's have dirty data without
- * a thread registered. If so, set that up.
- */
- list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
- if (bdi->wb.task)
- continue;
- if (list_empty(&bdi->work_list) &&
- !bdi_has_dirty_io(bdi))
- continue;
+ timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+ mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
+}
- bdi_add_default_flusher_task(bdi);
- }
+/*
+ * Calculate the longest interval (jiffies) bdi threads are allowed to be
+ * inactive.
+ */
+static unsigned long bdi_longest_inactive(void)
+{
+ unsigned long interval;
- set_current_state(TASK_INTERRUPTIBLE);
+ interval = msecs_to_jiffies(dirty_writeback_interval * 10);
+ return max(5UL * 60 * HZ, interval);
+}
- if (list_empty(&bdi_pending_list)) {
- unsigned long wait;
+static int bdi_forker_thread(void *ptr)
+{
+ struct bdi_writeback *me = ptr;
- spin_unlock_bh(&bdi_lock);
- wait = msecs_to_jiffies(dirty_writeback_interval * 10);
- schedule_timeout(wait);
- try_to_freeze();
- continue;
- }
+ current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+ set_freezable();
- __set_current_state(TASK_RUNNING);
+ /*
+ * Our parent may run at a different priority, just set us to normal
+ */
+ set_user_nice(current, 0);
- /*
- * This is our real job - check for pending entries in
- * bdi_pending_list, and create the tasks that got added
- */
- bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
- bdi_list);
- list_del_init(&bdi->bdi_list);
- spin_unlock_bh(&bdi_lock);
+ for (;;) {
+ struct task_struct *task = NULL;
+ struct backing_dev_info *bdi;
+ enum {
+ NO_ACTION, /* Nothing to do */
+ FORK_THREAD, /* Fork bdi thread */
+ KILL_THREAD, /* Kill inactive bdi thread */
+ } action = NO_ACTION;
- wb = &bdi->wb;
- wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
- dev_name(bdi->dev));
/*
- * If task creation fails, then readd the bdi to
- * the pending list and force writeout of the bdi
- * from this forker thread. That will free some memory
- * and we can try again.
+ * Temporary measure, we want to make sure we don't see
+ * dirty data on the default backing_dev_info
*/
- if (IS_ERR(wb->task)) {
- wb->task = NULL;
-
- /*
- * Add this 'bdi' to the back, so we get
- * a chance to flush other bdi's to free
- * memory.
- */
- spin_lock_bh(&bdi_lock);
- list_add_tail(&bdi->bdi_list, &bdi_pending_list);
- spin_unlock_bh(&bdi_lock);
-
- bdi_flush_io(bdi);
+ if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
+ del_timer(&me->wakeup_timer);
+ wb_do_writeback(me, 0);
}
- }
- return 0;
-}
+ spin_lock_bh(&bdi_lock);
+ set_current_state(TASK_INTERRUPTIBLE);
-static void bdi_add_to_pending(struct rcu_head *head)
-{
- struct backing_dev_info *bdi;
+ list_for_each_entry(bdi, &bdi_list, bdi_list) {
+ bool have_dirty_io;
- bdi = container_of(head, struct backing_dev_info, rcu_head);
- INIT_LIST_HEAD(&bdi->bdi_list);
+ if (!bdi_cap_writeback_dirty(bdi) ||
+ bdi_cap_flush_forker(bdi))
+ continue;
- spin_lock(&bdi_lock);
- list_add_tail(&bdi->bdi_list, &bdi_pending_list);
- spin_unlock(&bdi_lock);
+ WARN(!test_bit(BDI_registered, &bdi->state),
+ "bdi %p/%s is not registered!\n", bdi, bdi->name);
- /*
- * We are now on the pending list, wake up bdi_forker_task()
- * to finish the job and add us back to the active bdi_list
- */
- wake_up_process(default_backing_dev_info.wb.task);
-}
+ have_dirty_io = !list_empty(&bdi->work_list) ||
+ wb_has_dirty_io(&bdi->wb);
-/*
- * Add the default flusher task that gets created for any bdi
- * that has dirty data pending writeout
- */
-void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
-{
- if (!bdi_cap_writeback_dirty(bdi))
- return;
+ /*
+ * If the bdi has work to do, but the thread does not
+ * exist - create it.
+ */
+ if (!bdi->wb.task && have_dirty_io) {
+ /*
+ * Set the pending bit - if someone will try to
+ * unregister this bdi - it'll wait on this bit.
+ */
+ set_bit(BDI_pending, &bdi->state);
+ action = FORK_THREAD;
+ break;
+ }
+
+ spin_lock(&bdi->wb_lock);
- if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
- printk(KERN_ERR "bdi %p/%s is not registered!\n",
- bdi, bdi->name);
- return;
- }
+ /*
+ * If there is no work to do and the bdi thread was
+ * inactive long enough - kill it. The wb_lock is taken
+ * to make sure no-one adds more work to this bdi and
+ * wakes the bdi thread up.
+ */
+ if (bdi->wb.task && !have_dirty_io &&
+ time_after(jiffies, bdi->wb.last_active +
+ bdi_longest_inactive())) {
+ task = bdi->wb.task;
+ bdi->wb.task = NULL;
+ spin_unlock(&bdi->wb_lock);
+ set_bit(BDI_pending, &bdi->state);
+ action = KILL_THREAD;
+ break;
+ }
+ spin_unlock(&bdi->wb_lock);
+ }
+ spin_unlock_bh(&bdi_lock);
- /*
- * Check with the helper whether to proceed adding a task. Will only
- * abort if we two or more simultanous calls to
- * bdi_add_default_flusher_task() occured, further additions will block
- * waiting for previous additions to finish.
- */
- if (!test_and_set_bit(BDI_pending, &bdi->state)) {
- list_del_rcu(&bdi->bdi_list);
+ /* Keep working if default bdi still has things to do */
+ if (!list_empty(&me->bdi->work_list))
+ __set_current_state(TASK_RUNNING);
+
+ switch (action) {
+ case FORK_THREAD:
+ __set_current_state(TASK_RUNNING);
+ task = kthread_run(bdi_writeback_thread, &bdi->wb, "flush-%s",
+ dev_name(bdi->dev));
+ if (IS_ERR(task)) {
+ /*
+ * If thread creation fails, force writeout of
+ * the bdi from the thread.
+ */
+ bdi_flush_io(bdi);
+ } else {
+ /*
+ * The spinlock makes sure we do not lose
+ * wake-ups when racing with 'bdi_queue_work()'.
+ */
+ spin_lock_bh(&bdi->wb_lock);
+ bdi->wb.task = task;
+ spin_unlock_bh(&bdi->wb_lock);
+ }
+ break;
+
+ case KILL_THREAD:
+ __set_current_state(TASK_RUNNING);
+ kthread_stop(task);
+ break;
+
+ case NO_ACTION:
+ if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
+ /*
+ * There are no dirty data. The only thing we
+ * should now care about is checking for
+ * inactive bdi threads and killing them. Thus,
+ * let's sleep for longer time, save energy and
+ * be friendly for battery-driven devices.
+ */
+ schedule_timeout(bdi_longest_inactive());
+ else
+ schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
+ try_to_freeze();
+ /* Back to the main loop */
+ continue;
+ }
/*
- * We must wait for the current RCU period to end before
- * moving to the pending list. So schedule that operation
- * from an RCU callback.
+ * Clear pending bit and wakeup anybody waiting to tear us down.
*/
- call_rcu(&bdi->rcu_head, bdi_add_to_pending);
+ clear_bit(BDI_pending, &bdi->state);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&bdi->state, BDI_pending);
}
+
+ return 0;
}
/*
@@ -539,23 +513,16 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...)
{
va_list args;
- int ret = 0;
struct device *dev;
if (bdi->dev) /* The driver needs to use separate queues per device */
- goto exit;
+ return 0;
va_start(args, fmt);
dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
va_end(args);
- if (IS_ERR(dev)) {
- ret = PTR_ERR(dev);
- goto exit;
- }
-
- spin_lock_bh(&bdi_lock);
- list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
- spin_unlock_bh(&bdi_lock);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
bdi->dev = dev;
@@ -567,21 +534,21 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
if (bdi_cap_flush_forker(bdi)) {
struct bdi_writeback *wb = &bdi->wb;
- wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
+ wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
dev_name(dev));
- if (IS_ERR(wb->task)) {
- wb->task = NULL;
- ret = -ENOMEM;
-
- bdi_remove_from_list(bdi);
- goto exit;
- }
+ if (IS_ERR(wb->task))
+ return PTR_ERR(wb->task);
}
bdi_debug_register(bdi, dev_name(dev));
set_bit(BDI_registered, &bdi->state);
-exit:
- return ret;
+
+ spin_lock_bh(&bdi_lock);
+ list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+ spin_unlock_bh(&bdi_lock);
+
+ trace_writeback_bdi_register(bdi);
+ return 0;
}
EXPORT_SYMBOL(bdi_register);
@@ -596,31 +563,29 @@ EXPORT_SYMBOL(bdi_register_dev);
*/
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
- struct bdi_writeback *wb;
-
if (!bdi_cap_writeback_dirty(bdi))
return;
/*
- * If setup is pending, wait for that to complete first
+ * Make sure nobody finds us on the bdi_list anymore
*/
- wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
- TASK_UNINTERRUPTIBLE);
+ bdi_remove_from_list(bdi);
/*
- * Make sure nobody finds us on the bdi_list anymore
+ * If setup is pending, wait for that to complete first
*/
- bdi_remove_from_list(bdi);
+ wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
+ TASK_UNINTERRUPTIBLE);
/*
- * Finally, kill the kernel threads. We don't need to be RCU
+ * Finally, kill the kernel thread. We don't need to be RCU
* safe anymore, since the bdi is gone from visibility. Force
* unfreeze of the thread before calling kthread_stop(), otherwise
* it would never exet if it is currently stuck in the refrigerator.
*/
- list_for_each_entry(wb, &bdi->wb_list, list) {
- thaw_process(wb->task);
- kthread_stop(wb->task);
+ if (bdi->wb.task) {
+ thaw_process(bdi->wb.task);
+ kthread_stop(bdi->wb.task);
}
}
@@ -642,7 +607,9 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
if (bdi->dev) {
+ trace_writeback_bdi_unregister(bdi);
bdi_prune_sb(bdi);
+ del_timer_sync(&bdi->wb.wakeup_timer);
if (!bdi_cap_flush_forker(bdi))
bdi_wb_shutdown(bdi);
@@ -653,6 +620,18 @@ void bdi_unregister(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL(bdi_unregister);
+static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+{
+ memset(wb, 0, sizeof(*wb));
+
+ wb->bdi = bdi;
+ wb->last_old_flush = jiffies;
+ INIT_LIST_HEAD(&wb->b_dirty);
+ INIT_LIST_HEAD(&wb->b_io);
+ INIT_LIST_HEAD(&wb->b_more_io);
+ setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
+}
+
int bdi_init(struct backing_dev_info *bdi)
{
int i, err;
@@ -663,19 +642,11 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->max_ratio = 100;
bdi->max_prop_frac = PROP_FRAC_BASE;
spin_lock_init(&bdi->wb_lock);
- INIT_RCU_HEAD(&bdi->rcu_head);
INIT_LIST_HEAD(&bdi->bdi_list);
- INIT_LIST_HEAD(&bdi->wb_list);
INIT_LIST_HEAD(&bdi->work_list);
bdi_wb_init(&bdi->wb, bdi);
- /*
- * Just one thread support for now, hard code mask and count
- */
- bdi->wb_mask = 1;
- bdi->wb_cnt = 1;
-
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
err = percpu_counter_init(&bdi->bdi_stat[i], 0);
if (err)
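
The reworked bdi_forker_thread() above reduces each pass over the bdi list to a single decision: fork a flusher thread for a bdi that has dirty work but no thread, kill a thread that has been idle longer than bdi_longest_inactive(), or go back to sleep. The standalone C program below models only that decision as a pure function; it is not kernel code, and the names bdi_state, forker_action and pick_action are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

enum forker_action { NO_ACTION, FORK_THREAD, KILL_THREAD };

struct bdi_state {
        bool has_thread;                /* bdi->wb.task != NULL */
        bool has_dirty_io;              /* work_list non-empty or dirty inodes queued */
        unsigned long idle_jiffies;     /* time since wb.last_active */
        unsigned long longest_inactive; /* bdi_longest_inactive() */
};

static enum forker_action pick_action(const struct bdi_state *b)
{
        if (!b->has_thread && b->has_dirty_io)
                return FORK_THREAD;     /* work but no thread: create one */
        if (b->has_thread && !b->has_dirty_io &&
            b->idle_jiffies > b->longest_inactive)
                return KILL_THREAD;     /* idle too long: reap it */
        return NO_ACTION;               /* sleep and re-check later */
}

int main(void)
{
        struct bdi_state busy = { false, true, 0, 300000 };
        struct bdi_state idle = { true, false, 400000, 300000 };

        printf("busy bdi -> %d (expect FORK_THREAD=1)\n", pick_action(&busy));
        printf("idle bdi -> %d (expect KILL_THREAD=2)\n", pick_action(&idle));
        return 0;
}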
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 58c66cc5056..142c84a5499 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -833,15 +833,24 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
+ void *ptr;
+
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
#ifdef CONFIG_NO_BOOTMEM
- return __alloc_memory_core_early(pgdat->node_id, size, align,
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, -1ULL);
+ if (ptr)
+ return ptr;
+
+ ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
goal, -1ULL);
#else
- return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
+ ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
#endif
+
+ return ptr;
}
void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -977,14 +986,21 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
+ void *ptr;
+
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
#ifdef CONFIG_NO_BOOTMEM
- return __alloc_memory_core_early(pgdat->node_id, size, align,
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, ARCH_LOW_ADDRESS_LIMIT);
+ if (ptr)
+ return ptr;
+ ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
goal, ARCH_LOW_ADDRESS_LIMIT);
#else
- return ___alloc_bootmem_node(pgdat->bdata, size, align,
+ ptr = ___alloc_bootmem_node(pgdat->bdata, size, align,
goal, ARCH_LOW_ADDRESS_LIMIT);
#endif
+ return ptr;
}
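
The bootmem change above makes __alloc_bootmem_node() and __alloc_bootmem_low_node() retry with MAX_NUMNODES when the node-local early allocation returns NULL, instead of failing outright. A minimal userspace sketch of that preferred-node-then-anywhere fallback follows; alloc_on_node() and ANY_NODE are stand-ins for illustration, not the kernel API.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define ANY_NODE -1

/* Stand-in for __alloc_memory_core_early(): pretend node 0 is exhausted. */
static void *alloc_on_node(int nid, size_t size)
{
        if (nid == 0)
                return NULL;            /* node-local pool empty */
        return malloc(size);            /* "any node" succeeds */
}

static void *alloc_node_with_fallback(int nid, size_t size)
{
        void *ptr = alloc_on_node(nid, size);   /* try the preferred node */

        if (ptr)
                return ptr;
        return alloc_on_node(ANY_NODE, size);   /* fall back to any node */
}

int main(void)
{
        void *p = alloc_node_with_fallback(0, 64);

        printf("fallback allocation %s\n", p ? "succeeded" : "failed");
        free(p);
        return 0;
}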
diff --git a/mm/compaction.c b/mm/compaction.c
new file mode 100644
index 00000000000..94cce51b0b3
--- /dev/null
+++ b/mm/compaction.c
@@ -0,0 +1,605 @@
+/*
+ * linux/mm/compaction.c
+ *
+ * Memory compaction for the reduction of external fragmentation. Note that
+ * this heavily depends upon page migration to do all the real heavy
+ * lifting
+ *
+ * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
+ */
+#include <linux/swap.h>
+#include <linux/migrate.h>
+#include <linux/compaction.h>
+#include <linux/mm_inline.h>
+#include <linux/backing-dev.h>
+#include <linux/sysctl.h>
+#include <linux/sysfs.h>
+#include "internal.h"
+
+/*
+ * compact_control is used to track pages being migrated and the free pages
+ * they are being migrated to during memory compaction. The free_pfn starts
+ * at the end of a zone and migrate_pfn begins at the start. Movable pages
+ * are moved to the end of a zone during a compaction run and the run
+ * completes when free_pfn <= migrate_pfn
+ */
+struct compact_control {
+ struct list_head freepages; /* List of free pages to migrate to */
+ struct list_head migratepages; /* List of pages being migrated */
+ unsigned long nr_freepages; /* Number of isolated free pages */
+ unsigned long nr_migratepages; /* Number of pages to migrate */
+ unsigned long free_pfn; /* isolate_freepages search base */
+ unsigned long migrate_pfn; /* isolate_migratepages search base */
+
+ /* Account for isolated anon and file pages */
+ unsigned long nr_anon;
+ unsigned long nr_file;
+
+ unsigned int order; /* order a direct compactor needs */
+ int migratetype; /* MOVABLE, RECLAIMABLE etc */
+ struct zone *zone;
+};
+
+static unsigned long release_freepages(struct list_head *freelist)
+{
+ struct page *page, *next;
+ unsigned long count = 0;
+
+ list_for_each_entry_safe(page, next, freelist, lru) {
+ list_del(&page->lru);
+ __free_page(page);
+ count++;
+ }
+
+ return count;
+}
+
+/* Isolate free pages onto a private freelist. Must hold zone->lock */
+static unsigned long isolate_freepages_block(struct zone *zone,
+ unsigned long blockpfn,
+ struct list_head *freelist)
+{
+ unsigned long zone_end_pfn, end_pfn;
+ int total_isolated = 0;
+ struct page *cursor;
+
+ /* Get the last PFN we should scan for free pages at */
+ zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn);
+
+ /* Find the first usable PFN in the block to initialise the page cursor */
+ for (; blockpfn < end_pfn; blockpfn++) {
+ if (pfn_valid_within(blockpfn))
+ break;
+ }
+ cursor = pfn_to_page(blockpfn);
+
+ /* Isolate free pages. This assumes the block is valid */
+ for (; blockpfn < end_pfn; blockpfn++, cursor++) {
+ int isolated, i;
+ struct page *page = cursor;
+
+ if (!pfn_valid_within(blockpfn))
+ continue;
+
+ if (!PageBuddy(page))
+ continue;
+
+ /* Found a free page, break it into order-0 pages */
+ isolated = split_free_page(page);
+ total_isolated += isolated;
+ for (i = 0; i < isolated; i++) {
+ list_add(&page->lru, freelist);
+ page++;
+ }
+
+ /* If a page was split, advance to the end of it */
+ if (isolated) {
+ blockpfn += isolated - 1;
+ cursor += isolated - 1;
+ }
+ }
+
+ return total_isolated;
+}
+
+/* Returns true if the page is within a block suitable for migration to */
+static bool suitable_migration_target(struct page *page)
+{
+
+ int migratetype = get_pageblock_migratetype(page);
+
+ /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
+ if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
+ return false;
+
+ /* If the page is a large free page, then allow migration */
+ if (PageBuddy(page) && page_order(page) >= pageblock_order)
+ return true;
+
+ /* If the block is MIGRATE_MOVABLE, allow migration */
+ if (migratetype == MIGRATE_MOVABLE)
+ return true;
+
+ /* Otherwise skip the block */
+ return false;
+}
+
+/*
+ * Based on information in the current compact_control, find blocks
+ * suitable for isolating free pages from and then isolate them.
+ */
+static void isolate_freepages(struct zone *zone,
+ struct compact_control *cc)
+{
+ struct page *page;
+ unsigned long high_pfn, low_pfn, pfn;
+ unsigned long flags;
+ int nr_freepages = cc->nr_freepages;
+ struct list_head *freelist = &cc->freepages;
+
+ pfn = cc->free_pfn;
+ low_pfn = cc->migrate_pfn + pageblock_nr_pages;
+ high_pfn = low_pfn;
+
+ /*
+ * Isolate free pages until enough are available to migrate the
+ * pages on cc->migratepages. We stop searching if the migrate
+ * and free page scanners meet or enough free pages are isolated.
+ */
+ spin_lock_irqsave(&zone->lock, flags);
+ for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
+ pfn -= pageblock_nr_pages) {
+ unsigned long isolated;
+
+ if (!pfn_valid(pfn))
+ continue;
+
+ /*
+ * Check for overlapping nodes/zones. It's possible on some
+ * configurations to have a setup like
+ * node0 node1 node0
+ * i.e. it's possible that all pages within a zones range of
+ * pages do not belong to a single zone.
+ */
+ page = pfn_to_page(pfn);
+ if (page_zone(page) != zone)
+ continue;
+
+ /* Check the block is suitable for migration */
+ if (!suitable_migration_target(page))
+ continue;
+
+ /* Found a block suitable for isolating free pages from */
+ isolated = isolate_freepages_block(zone, pfn, freelist);
+ nr_freepages += isolated;
+
+ /*
+ * Record the highest PFN we isolated pages from. When next
+ * looking for free pages, the search will restart here as
+ * page migration may have returned some pages to the allocator
+ */
+ if (isolated)
+ high_pfn = max(high_pfn, pfn);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ /* split_free_page does not map the pages */
+ list_for_each_entry(page, freelist, lru) {
+ arch_alloc_page(page, 0);
+ kernel_map_pages(page, 1, 1);
+ }
+
+ cc->free_pfn = high_pfn;
+ cc->nr_freepages = nr_freepages;
+}
+
+/* Update the number of anon and file isolated pages in the zone */
+static void acct_isolated(struct zone *zone, struct compact_control *cc)
+{
+ struct page *page;
+ unsigned int count[NR_LRU_LISTS] = { 0, };
+
+ list_for_each_entry(page, &cc->migratepages, lru) {
+ int lru = page_lru_base_type(page);
+ count[lru]++;
+ }
+
+ cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
+ cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
+}
+
+/* Similar to reclaim, but different enough that they don't share logic */
+static bool too_many_isolated(struct zone *zone)
+{
+
+ unsigned long inactive, isolated;
+
+ inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
+ zone_page_state(zone, NR_INACTIVE_ANON);
+ isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
+ zone_page_state(zone, NR_ISOLATED_ANON);
+
+ return isolated > inactive;
+}
+
+/*
+ * Isolate all pages that can be migrated from the block pointed to by
+ * the migrate scanner within compact_control.
+ */
+static unsigned long isolate_migratepages(struct zone *zone,
+ struct compact_control *cc)
+{
+ unsigned long low_pfn, end_pfn;
+ struct list_head *migratelist = &cc->migratepages;
+
+ /* Do not scan outside zone boundaries */
+ low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
+
+ /* Only scan within a pageblock boundary */
+ end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
+
+ /* Do not cross the free scanner or scan within a memory hole */
+ if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
+ cc->migrate_pfn = end_pfn;
+ return 0;
+ }
+
+ /*
+ * Ensure that there are not too many pages isolated from the LRU
+ * list by either parallel reclaimers or compaction. If there are,
+ * delay for some time until fewer pages are isolated
+ */
+ while (unlikely(too_many_isolated(zone))) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+
+ if (fatal_signal_pending(current))
+ return 0;
+ }
+
+ /* Time to isolate some pages for migration */
+ spin_lock_irq(&zone->lru_lock);
+ for (; low_pfn < end_pfn; low_pfn++) {
+ struct page *page;
+ if (!pfn_valid_within(low_pfn))
+ continue;
+
+ /* Get the page and skip if free */
+ page = pfn_to_page(low_pfn);
+ if (PageBuddy(page))
+ continue;
+
+ /* Try isolate the page */
+ if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
+ continue;
+
+ /* Successfully isolated */
+ del_page_from_lru_list(zone, page, page_lru(page));
+ list_add(&page->lru, migratelist);
+ mem_cgroup_del_lru(page);
+ cc->nr_migratepages++;
+
+ /* Avoid isolating too much */
+ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
+ break;
+ }
+
+ acct_isolated(zone, cc);
+
+ spin_unlock_irq(&zone->lru_lock);
+ cc->migrate_pfn = low_pfn;
+
+ return cc->nr_migratepages;
+}
+
+/*
+ * This is a migrate-callback that "allocates" freepages by taking pages
+ * from the isolated freelists in the block we are migrating to.
+ */
+static struct page *compaction_alloc(struct page *migratepage,
+ unsigned long data,
+ int **result)
+{
+ struct compact_control *cc = (struct compact_control *)data;
+ struct page *freepage;
+
+ /* Isolate free pages if necessary */
+ if (list_empty(&cc->freepages)) {
+ isolate_freepages(cc->zone, cc);
+
+ if (list_empty(&cc->freepages))
+ return NULL;
+ }
+
+ freepage = list_entry(cc->freepages.next, struct page, lru);
+ list_del(&freepage->lru);
+ cc->nr_freepages--;
+
+ return freepage;
+}
+
+/*
+ * We cannot control nr_migratepages and nr_freepages fully when migration is
+ * running as migrate_pages() has no knowledge of compact_control. When
+ * migration is complete, we count the number of pages on the lists by hand.
+ */
+static void update_nr_listpages(struct compact_control *cc)
+{
+ int nr_migratepages = 0;
+ int nr_freepages = 0;
+ struct page *page;
+
+ list_for_each_entry(page, &cc->migratepages, lru)
+ nr_migratepages++;
+ list_for_each_entry(page, &cc->freepages, lru)
+ nr_freepages++;
+
+ cc->nr_migratepages = nr_migratepages;
+ cc->nr_freepages = nr_freepages;
+}
+
+static int compact_finished(struct zone *zone,
+ struct compact_control *cc)
+{
+ unsigned int order;
+ unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order);
+
+ if (fatal_signal_pending(current))
+ return COMPACT_PARTIAL;
+
+ /* Compaction run completes if the migrate and free scanner meet */
+ if (cc->free_pfn <= cc->migrate_pfn)
+ return COMPACT_COMPLETE;
+
+ /* Compaction run is not finished if the watermark is not met */
+ if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
+ return COMPACT_CONTINUE;
+
+ if (cc->order == -1)
+ return COMPACT_CONTINUE;
+
+ /* Direct compactor: Is a suitable page free? */
+ for (order = cc->order; order < MAX_ORDER; order++) {
+ /* Job done if page is free of the right migratetype */
+ if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
+ return COMPACT_PARTIAL;
+
+ /* Job done if allocation would set block type */
+ if (order >= pageblock_order && zone->free_area[order].nr_free)
+ return COMPACT_PARTIAL;
+ }
+
+ return COMPACT_CONTINUE;
+}
+
+static int compact_zone(struct zone *zone, struct compact_control *cc)
+{
+ int ret;
+
+ /* Setup to move all movable pages to the end of the zone */
+ cc->migrate_pfn = zone->zone_start_pfn;
+ cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
+ cc->free_pfn &= ~(pageblock_nr_pages-1);
+
+ migrate_prep_local();
+
+ while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+ unsigned long nr_migrate, nr_remaining;
+
+ if (!isolate_migratepages(zone, cc))
+ continue;
+
+ nr_migrate = cc->nr_migratepages;
+ migrate_pages(&cc->migratepages, compaction_alloc,
+ (unsigned long)cc, 0);
+ update_nr_listpages(cc);
+ nr_remaining = cc->nr_migratepages;
+
+ count_vm_event(COMPACTBLOCKS);
+ count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
+ if (nr_remaining)
+ count_vm_events(COMPACTPAGEFAILED, nr_remaining);
+
+ /* Release LRU pages not migrated */
+ if (!list_empty(&cc->migratepages)) {
+ putback_lru_pages(&cc->migratepages);
+ cc->nr_migratepages = 0;
+ }
+
+ }
+
+ /* Release free pages and check accounting */
+ cc->nr_freepages -= release_freepages(&cc->freepages);
+ VM_BUG_ON(cc->nr_freepages != 0);
+
+ return ret;
+}
+
+static unsigned long compact_zone_order(struct zone *zone,
+ int order, gfp_t gfp_mask)
+{
+ struct compact_control cc = {
+ .nr_freepages = 0,
+ .nr_migratepages = 0,
+ .order = order,
+ .migratetype = allocflags_to_migratetype(gfp_mask),
+ .zone = zone,
+ };
+ INIT_LIST_HEAD(&cc.freepages);
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ return compact_zone(zone, &cc);
+}
+
+int sysctl_extfrag_threshold = 500;
+
+/**
+ * try_to_compact_pages - Direct compact to satisfy a high-order allocation
+ * @zonelist: The zonelist used for the current allocation
+ * @order: The order of the current allocation
+ * @gfp_mask: The GFP mask of the current allocation
+ * @nodemask: The allowed nodes to allocate from
+ *
+ * This is the main entry point for direct page compaction.
+ */
+unsigned long try_to_compact_pages(struct zonelist *zonelist,
+ int order, gfp_t gfp_mask, nodemask_t *nodemask)
+{
+ enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ int may_enter_fs = gfp_mask & __GFP_FS;
+ int may_perform_io = gfp_mask & __GFP_IO;
+ unsigned long watermark;
+ struct zoneref *z;
+ struct zone *zone;
+ int rc = COMPACT_SKIPPED;
+
+ /*
+ * Check whether it is worth even starting compaction. The order check is
+ * made because an assumption is made that the page allocator can satisfy
+ * the "cheaper" orders without taking special steps
+ */
+ if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io)
+ return rc;
+
+ count_vm_event(COMPACTSTALL);
+
+ /* Compact each zone in the list */
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
+ nodemask) {
+ int fragindex;
+ int status;
+
+ /*
+ * Watermarks for order-0 must be met for compaction. Note
+ * the 2UL. This is because during migration, copies of
+ * pages need to be allocated and for a short time, the
+ * footprint is higher
+ */
+ watermark = low_wmark_pages(zone) + (2UL << order);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ continue;
+
+ /*
+ * fragmentation index determines if allocation failures are
+ * due to low memory or external fragmentation
+ *
+ * index of -1 implies allocations might succeed depending
+ * on watermarks
+ * index towards 0 implies failure is due to lack of memory
+ * index towards 1000 implies failure is due to fragmentation
+ *
+ * Only compact if a failure would be due to fragmentation.
+ */
+ fragindex = fragmentation_index(zone, order);
+ if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+ continue;
+
+ if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
+ rc = COMPACT_PARTIAL;
+ break;
+ }
+
+ status = compact_zone_order(zone, order, gfp_mask);
+ rc = max(status, rc);
+
+ if (zone_watermark_ok(zone, order, watermark, 0, 0))
+ break;
+ }
+
+ return rc;
+}
+
+
+/* Compact all zones within a node */
+static int compact_node(int nid)
+{
+ int zoneid;
+ pg_data_t *pgdat;
+ struct zone *zone;
+
+ if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
+ return -EINVAL;
+ pgdat = NODE_DATA(nid);
+
+ /* Flush pending updates to the LRU lists */
+ lru_add_drain_all();
+
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+ struct compact_control cc = {
+ .nr_freepages = 0,
+ .nr_migratepages = 0,
+ .order = -1,
+ };
+
+ zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ cc.zone = zone;
+ INIT_LIST_HEAD(&cc.freepages);
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ compact_zone(zone, &cc);
+
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
+ }
+
+ return 0;
+}
+
+/* Compact all nodes in the system */
+static int compact_nodes(void)
+{
+ int nid;
+
+ for_each_online_node(nid)
+ compact_node(nid);
+
+ return COMPACT_COMPLETE;
+}
+
+/* The written value is actually unused, all memory is compacted */
+int sysctl_compact_memory;
+
+/* This is the entry point for compacting all nodes via /proc/sys/vm */
+int sysctl_compaction_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ if (write)
+ return compact_nodes();
+
+ return 0;
+}
+
+int sysctl_extfrag_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ proc_dointvec_minmax(table, write, buffer, length, ppos);
+
+ return 0;
+}
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+ssize_t sysfs_compact_node(struct sys_device *dev,
+ struct sysdev_attribute *attr,
+ const char *buf, size_t count)
+{
+ compact_node(dev->id);
+
+ return count;
+}
+static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
+
+int compaction_register_node(struct node *node)
+{
+ return sysdev_create_file(&node->sysdev, &attr_compact);
+}
+
+void compaction_unregister_node(struct node *node)
+{
+ return sysdev_remove_file(&node->sysdev, &attr_compact);
+}
+#endif /* CONFIG_SYSFS && CONFIG_NUMA */
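
compaction.c above adds sysctl_compaction_handler(), which compacts every online node whenever its sysctl is written. Assuming the handler is exposed as /proc/sys/vm/compact_memory (the sysctl table hookup is not part of this diff), a root user could trigger a full compaction run with a small program like the sketch below.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/sys/vm/compact_memory", O_WRONLY);

        if (fd < 0) {
                perror("open /proc/sys/vm/compact_memory");
                return 1;
        }
        /* The written value is ignored; any write compacts all nodes. */
        if (write(fd, "1", 1) != 1) {
                perror("write");
                close(fd);
                return 1;
        }
        close(fd);
        puts("compaction requested on all nodes");
        return 0;
}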
diff --git a/mm/filemap.c b/mm/filemap.c
index 140ebda9640..3d4df44e422 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -151,6 +151,7 @@ void remove_from_page_cache(struct page *page)
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
}
+EXPORT_SYMBOL(remove_from_page_cache);
static int sync_page(void *word)
{
@@ -441,7 +442,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
/*
* Splice_read and readahead add shmem/tmpfs pages into the page cache
* before shmem_readpage has a chance to mark them as SwapBacked: they
- * need to go on the active_anon lru below, and mem_cgroup_cache_charge
+ * need to go on the anon lru below, and mem_cgroup_cache_charge
* (called in add_to_page_cache) needs to know where they're going too.
*/
if (mapping_cap_swap_backed(mapping))
@@ -452,7 +453,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
if (page_is_file_cache(page))
lru_cache_add_file(page);
else
- lru_cache_add_active_anon(page);
+ lru_cache_add_anon(page);
}
return ret;
}
@@ -461,9 +462,15 @@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
#ifdef CONFIG_NUMA
struct page *__page_cache_alloc(gfp_t gfp)
{
+ int n;
+ struct page *page;
+
if (cpuset_do_page_mem_spread()) {
- int n = cpuset_mem_spread_node();
- return alloc_pages_exact_node(n, gfp, 0);
+ get_mems_allowed();
+ n = cpuset_mem_spread_node();
+ page = alloc_pages_exact_node(n, gfp, 0);
+ put_mems_allowed();
+ return page;
}
return alloc_pages(gfp, 0);
}
@@ -1099,6 +1106,12 @@ page_not_up_to_date_locked:
}
readpage:
+ /*
+ * A previous I/O error may have been due to temporary
+ * failures, eg. multipath errors.
+ * PG_error will be set again if readpage fails.
+ */
+ ClearPageError(page);
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
@@ -1263,7 +1276,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
{
struct file *filp = iocb->ki_filp;
ssize_t retval;
- unsigned long seg;
+ unsigned long seg = 0;
size_t count;
loff_t *ppos = &iocb->ki_pos;
@@ -1290,21 +1303,47 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
retval = mapping->a_ops->direct_IO(READ, iocb,
iov, pos, nr_segs);
}
- if (retval > 0)
+ if (retval > 0) {
*ppos = pos + retval;
- if (retval) {
+ count -= retval;
+ }
+
+ /*
+ * Btrfs can have a short DIO read if we encounter
+ * compressed extents, so if there was an error, or if
+ * we've already read everything we wanted to, or if
+ * there was a short read because we hit EOF, go ahead
+ * and return. Otherwise fallthrough to buffered io for
+ * the rest of the read.
+ */
+ if (retval < 0 || !count || *ppos >= size) {
file_accessed(filp);
goto out;
}
}
}
+ count = retval;
for (seg = 0; seg < nr_segs; seg++) {
read_descriptor_t desc;
+ loff_t offset = 0;
+
+ /*
+ * If we did a short DIO read we need to skip the section of the
+ * iov that we've already read data into.
+ */
+ if (count) {
+ if (count > iov[seg].iov_len) {
+ count -= iov[seg].iov_len;
+ continue;
+ }
+ offset = count;
+ count = 0;
+ }
desc.written = 0;
- desc.arg.buf = iov[seg].iov_base;
- desc.count = iov[seg].iov_len;
+ desc.arg.buf = iov[seg].iov_base + offset;
+ desc.count = iov[seg].iov_len - offset;
if (desc.count == 0)
continue;
desc.error = 0;
@@ -2199,14 +2238,12 @@ static ssize_t generic_perform_write(struct file *file,
do {
struct page *page;
- pgoff_t index; /* Pagecache index for current page */
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
size_t copied; /* Bytes copied from user */
void *fsdata;
offset = (pos & (PAGE_CACHE_SIZE - 1));
- index = pos >> PAGE_CACHE_SHIFT;
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
iov_iter_count(i));
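
The generic_file_aio_read() hunk above handles short direct-I/O reads (e.g. Btrfs compressed extents) by skipping the portion of the iovec that was already satisfied before falling back to buffered reads. The userspace sketch below reproduces only that skip arithmetic; resume_point() is an invented helper, not the kernel function.

#include <stddef.h>
#include <stdio.h>
#include <sys/uio.h>

static void resume_point(const struct iovec *iov, unsigned long nr_segs,
                         size_t consumed, unsigned long *seg_out,
                         size_t *off_out)
{
        unsigned long seg;

        for (seg = 0; seg < nr_segs; seg++) {
                if (consumed >= iov[seg].iov_len) {
                        consumed -= iov[seg].iov_len;   /* segment fully read */
                        continue;
                }
                break;                  /* resume inside this segment */
        }
        *seg_out = seg;
        *off_out = consumed;
}

int main(void)
{
        char a[100], b[100], c[100];
        struct iovec iov[3] = {
                { a, sizeof(a) }, { b, sizeof(b) }, { c, sizeof(c) },
        };
        unsigned long seg;
        size_t off;

        resume_point(iov, 3, 150, &seg, &off);  /* short DIO read of 150 bytes */
        printf("resume at segment %lu, offset %zu\n", seg, off);        /* 1, 50 */
        return 0;
}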
diff --git a/mm/highmem.c b/mm/highmem.c
index bed8a8bfd01..7a0aa1be499 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,6 +26,7 @@
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
+#include <linux/kgdb.h>
#include <asm/tlbflush.h>
/*
@@ -422,7 +423,7 @@ void __init page_address_init(void)
#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
-#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT)
+#ifdef CONFIG_DEBUG_HIGHMEM
void debug_kmap_atomic(enum km_type type)
{
@@ -470,6 +471,12 @@ void debug_kmap_atomic(enum km_type type)
warn_count--;
}
}
+#ifdef CONFIG_KGDB_KDB
+ if (unlikely(type == KM_KDB && atomic_read(&kgdb_active) == -1)) {
+ WARN_ON(1);
+ warn_count--;
+ }
+#endif /* CONFIG_KGDB_KDB */
}
#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4c9e6bbf377..b61d2db9f34 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -465,11 +465,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
struct page *page = NULL;
struct mempolicy *mpol;
nodemask_t *nodemask;
- struct zonelist *zonelist = huge_zonelist(vma, address,
- htlb_alloc_mask, &mpol, &nodemask);
+ struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
+ get_mems_allowed();
+ zonelist = huge_zonelist(vma, address,
+ htlb_alloc_mask, &mpol, &nodemask);
/*
* A child process with MAP_PRIVATE mappings created by their parent
* have no page reserves. This check ensures that reservations are
@@ -477,11 +479,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
*/
if (!vma_has_reserves(vma) &&
h->free_huge_pages - h->resv_huge_pages == 0)
- return NULL;
+ goto err;
/* If reserves cannot be used, ensure enough pages are in the pool */
if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
- return NULL;
+ goto err;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
@@ -500,7 +502,9 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
break;
}
}
+err:
mpol_cond_put(mpol);
+ put_mems_allowed();
return page;
}
@@ -2345,11 +2349,17 @@ retry_avoidcopy:
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
if (likely(pte_same(huge_ptep_get(ptep), pte))) {
/* Break COW */
+ mmu_notifier_invalidate_range_start(mm,
+ address & huge_page_mask(h),
+ (address & huge_page_mask(h)) + huge_page_size(h));
huge_ptep_clear_flush(vma, address, ptep);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
/* Make the old page be freed below */
new_page = old_page;
+ mmu_notifier_invalidate_range_end(mm,
+ address & huge_page_mask(h),
+ (address & huge_page_mask(h)) + huge_page_size(h));
}
page_cache_release(new_page);
page_cache_release(old_page);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 57aba0da966..1d29cdfe8eb 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -7,6 +7,11 @@
#include <asm/atomic.h>
#include <asm/pgtable.h>
+#include <asm/mmu.h>
+
+#ifndef INIT_MM_CONTEXT
+#define INIT_MM_CONTEXT(name)
+#endif
struct mm_struct init_mm = {
.mm_rb = RB_ROOT,
@@ -17,4 +22,5 @@ struct mm_struct init_mm = {
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.cpu_vm_mask = CPU_MASK_ALL,
+ INIT_MM_CONTEXT(init_mm)
};
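
The init-mm.c hunk gives architectures an INIT_MM_CONTEXT() hook that defaults to expanding to nothing when they do not define it. The tiny standalone example below shows the same overridable-initializer idiom with a made-up WIDGET_EXTRA_INIT() macro; it illustrates the pattern only and is not the kernel macro.

#include <stdio.h>

/* An "architecture" would define this before the default kicks in. */
#ifndef WIDGET_EXTRA_INIT
#define WIDGET_EXTRA_INIT(name)         /* default: contribute nothing */
#endif

struct widget {
        int id;
        int extra;
};

static struct widget init_widget = {
        .id = 1,
        WIDGET_EXTRA_INIT(init_widget)  /* expands to nothing by default */
};

int main(void)
{
        printf("id=%d extra=%d\n", init_widget.id, init_widget.extra);
        return 0;
}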
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 2c0d032ac89..bd9bc214091 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -211,6 +211,9 @@ static signed long jiffies_scan_wait;
static int kmemleak_stack_scan = 1;
/* protects the memory scanning, parameters and debug/kmemleak file access */
static DEFINE_MUTEX(scan_mutex);
+/* setting kmemleak=on, will set this var, skipping the disable */
+static int kmemleak_skip_disable;
+
/*
* Early object allocation/freeing logging. Kmemleak is initialized after the
@@ -398,7 +401,9 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
object = prio_tree_entry(node, struct kmemleak_object,
tree_node);
if (!alias && object->pointer != ptr) {
- kmemleak_warn("Found object by alias");
+ pr_warning("Found object by alias at 0x%08lx\n", ptr);
+ dump_stack();
+ dump_object_info(object);
object = NULL;
}
} else
@@ -695,7 +700,7 @@ static void paint_ptr(unsigned long ptr, int color)
}
/*
- * Make a object permanently as gray-colored so that it can no longer be
+ * Mark an object permanently as gray-colored so that it can no longer be
* reported as a leak. This is used in general to mark a false positive.
*/
static void make_gray_object(unsigned long ptr)
@@ -838,10 +843,19 @@ out:
rcu_read_unlock();
}
-/*
- * Memory allocation function callback. This function is called from the
- * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
- * vmalloc etc.).
+/**
+ * kmemleak_alloc - register a newly allocated object
+ * @ptr: pointer to beginning of the object
+ * @size: size of the object
+ * @min_count: minimum number of references to this object. If during memory
+ * scanning a number of references less than @min_count is found,
+ * the object is reported as a memory leak. If @min_count is 0,
+ * the object is never reported as a leak. If @min_count is -1,
+ * the object is ignored (not scanned and not reported as a leak)
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is called from the kernel allocators when a new object
+ * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.).
*/
void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
gfp_t gfp)
@@ -855,9 +869,12 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
}
EXPORT_SYMBOL_GPL(kmemleak_alloc);
-/*
- * Memory freeing function callback. This function is called from the kernel
- * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
+/**
+ * kmemleak_free - unregister a previously registered object
+ * @ptr: pointer to beginning of the object
+ *
+ * This function is called from the kernel allocators when an object (memory
+ * block) is freed (kmem_cache_free, kfree, vfree etc.).
*/
void __ref kmemleak_free(const void *ptr)
{
@@ -870,9 +887,14 @@ void __ref kmemleak_free(const void *ptr)
}
EXPORT_SYMBOL_GPL(kmemleak_free);
-/*
- * Partial memory freeing function callback. This function is usually called
- * from bootmem allocator when (part of) a memory block is freed.
+/**
+ * kmemleak_free_part - partially unregister a previously registered object
+ * @ptr: pointer to the beginning or inside the object. This also
+ * represents the start of the range to be freed
+ * @size: size to be unregistered
+ *
+ * This function is called when only a part of a memory block is freed
+ * (usually from the bootmem allocator).
*/
void __ref kmemleak_free_part(const void *ptr, size_t size)
{
@@ -885,9 +907,12 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
}
EXPORT_SYMBOL_GPL(kmemleak_free_part);
-/*
- * Mark an already allocated memory block as a false positive. This will cause
- * the block to no longer be reported as leak and always be scanned.
+/**
+ * kmemleak_not_leak - mark an allocated object as false positive
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to no longer
+ * be reported as leak and always be scanned.
*/
void __ref kmemleak_not_leak(const void *ptr)
{
@@ -900,10 +925,14 @@ void __ref kmemleak_not_leak(const void *ptr)
}
EXPORT_SYMBOL(kmemleak_not_leak);
-/*
- * Ignore a memory block. This is usually done when it is known that the
- * corresponding block is not a leak and does not contain any references to
- * other allocated memory blocks.
+/**
+ * kmemleak_ignore - ignore an allocated object
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to be
+ * ignored (not scanned and not reported as a leak). This is usually done when
+ * it is known that the corresponding block is not a leak and does not contain
+ * any references to other allocated memory blocks.
*/
void __ref kmemleak_ignore(const void *ptr)
{
@@ -916,8 +945,16 @@ void __ref kmemleak_ignore(const void *ptr)
}
EXPORT_SYMBOL(kmemleak_ignore);
-/*
- * Limit the range to be scanned in an allocated memory block.
+/**
+ * kmemleak_scan_area - limit the range to be scanned in an allocated object
+ * @ptr: pointer to beginning or inside the object. This also
+ * represents the start of the scan area
+ * @size: size of the scan area
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is used when it is known that only certain parts of an object
+ * contain references to other objects. Kmemleak will only scan these areas
+ * reducing the number of false negatives.
*/
void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
{
@@ -930,8 +967,14 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
}
EXPORT_SYMBOL(kmemleak_scan_area);
-/*
- * Inform kmemleak not to scan the given memory block.
+/**
+ * kmemleak_no_scan - do not scan an allocated object
+ * @ptr: pointer to beginning of the object
+ *
+ * This function notifies kmemleak not to scan the given memory block. Useful
+ * in situations where it is known that the given object does not contain any
+ * references to other objects. Kmemleak will not scan such objects reducing
+ * the number of false negatives.
*/
void __ref kmemleak_no_scan(const void *ptr)
{
@@ -1602,7 +1645,9 @@ static int kmemleak_boot_config(char *str)
return -EINVAL;
if (strcmp(str, "off") == 0)
kmemleak_disable();
- else if (strcmp(str, "on") != 0)
+ else if (strcmp(str, "on") == 0)
+ kmemleak_skip_disable = 1;
+ else
return -EINVAL;
return 0;
}
@@ -1616,6 +1661,13 @@ void __init kmemleak_init(void)
int i;
unsigned long flags;
+#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
+ if (!kmemleak_skip_disable) {
+ kmemleak_disable();
+ return;
+ }
+#endif
+
jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
diff --git a/mm/ksm.c b/mm/ksm.c
index 956880f2ff4..e2ae0045832 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -33,6 +33,7 @@
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
+#include <linux/hash.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -153,8 +154,9 @@ struct rmap_item {
static struct rb_root root_stable_tree = RB_ROOT;
static struct rb_root root_unstable_tree = RB_ROOT;
-#define MM_SLOTS_HASH_HEADS 1024
-static struct hlist_head *mm_slots_hash;
+#define MM_SLOTS_HASH_SHIFT 10
+#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
+static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS];
static struct mm_slot ksm_mm_head = {
.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
@@ -269,28 +271,13 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
kmem_cache_free(mm_slot_cache, mm_slot);
}
-static int __init mm_slots_hash_init(void)
-{
- mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
- GFP_KERNEL);
- if (!mm_slots_hash)
- return -ENOMEM;
- return 0;
-}
-
-static void __init mm_slots_hash_free(void)
-{
- kfree(mm_slots_hash);
-}
-
static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
struct hlist_head *bucket;
struct hlist_node *node;
- bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
- % MM_SLOTS_HASH_HEADS];
+ bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
hlist_for_each_entry(mm_slot, node, bucket, link) {
if (mm == mm_slot->mm)
return mm_slot;
@@ -303,8 +290,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
{
struct hlist_head *bucket;
- bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
- % MM_SLOTS_HASH_HEADS];
+ bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
mm_slot->mm = mm;
hlist_add_head(&mm_slot->link, bucket);
}
@@ -318,19 +304,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item,
struct anon_vma *anon_vma)
{
rmap_item->anon_vma = anon_vma;
- atomic_inc(&anon_vma->ksm_refcount);
+ get_anon_vma(anon_vma);
}
-static void drop_anon_vma(struct rmap_item *rmap_item)
+static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
{
struct anon_vma *anon_vma = rmap_item->anon_vma;
- if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) {
- int empty = list_empty(&anon_vma->head);
- spin_unlock(&anon_vma->lock);
- if (empty)
- anon_vma_free(anon_vma);
- }
+ drop_anon_vma(anon_vma);
}
/*
@@ -415,7 +396,7 @@ static void break_cow(struct rmap_item *rmap_item)
* It is not an accident that whenever we want to break COW
* to undo, we also need to drop a reference to the anon_vma.
*/
- drop_anon_vma(rmap_item);
+ ksm_drop_anon_vma(rmap_item);
down_read(&mm->mmap_sem);
if (ksm_test_exit(mm))
@@ -470,7 +451,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
ksm_pages_sharing--;
else
ksm_pages_shared--;
- drop_anon_vma(rmap_item);
+ ksm_drop_anon_vma(rmap_item);
rmap_item->address &= PAGE_MASK;
cond_resched();
}
@@ -558,7 +539,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
else
ksm_pages_shared--;
- drop_anon_vma(rmap_item);
+ ksm_drop_anon_vma(rmap_item);
rmap_item->address &= PAGE_MASK;
} else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -1566,7 +1547,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
@@ -1589,7 +1570,7 @@ again:
if (!search_new_forks || !mapcount)
break;
}
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
if (!mapcount)
goto out;
}
@@ -1619,7 +1600,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
@@ -1637,11 +1618,11 @@ again:
ret = try_to_unmap_one(page, vma,
rmap_item->address, flags);
if (ret != SWAP_AGAIN || !page_mapped(page)) {
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
goto out;
}
}
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
}
if (!search_new_forks++)
goto again;
@@ -1671,7 +1652,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
@@ -1688,11 +1669,11 @@ again:
ret = rmap_one(page, vma, rmap_item->address, arg);
if (ret != SWAP_AGAIN) {
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
goto out;
}
}
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
}
if (!search_new_forks++)
goto again;
@@ -1943,15 +1924,11 @@ static int __init ksm_init(void)
if (err)
goto out;
- err = mm_slots_hash_init();
- if (err)
- goto out_free1;
-
ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
if (IS_ERR(ksm_thread)) {
printk(KERN_ERR "ksm: creating kthread failed\n");
err = PTR_ERR(ksm_thread);
- goto out_free2;
+ goto out_free;
}
#ifdef CONFIG_SYSFS
@@ -1959,7 +1936,7 @@ static int __init ksm_init(void)
if (err) {
printk(KERN_ERR "ksm: register sysfs failed\n");
kthread_stop(ksm_thread);
- goto out_free2;
+ goto out_free;
}
#else
ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
@@ -1975,9 +1952,7 @@ static int __init ksm_init(void)
#endif
return 0;
-out_free2:
- mm_slots_hash_free();
-out_free1:
+out_free:
ksm_slab_free();
out:
return err;
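
Editor's note: the ksm.c hunks above replace the runtime-allocated mm_slots hash table with a static array indexed through hash_ptr(). A minimal sketch of the resulting lookup follows; mm_slot_bucket() is a hypothetical helper written only to isolate the idiom, while the macros and the array are copied from the hunk above.

#include <linux/hash.h>
#include <linux/list.h>
#include <linux/mm_types.h>

#define MM_SLOTS_HASH_SHIFT 10
#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)

static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS];

/*
 * hash_ptr() mixes the pointer bits before reducing them to
 * MM_SLOTS_HASH_SHIFT bits, so slab-aligned mm_struct addresses spread
 * across all 1024 buckets; the old
 * "((unsigned long)mm / sizeof(struct mm_struct)) % MM_SLOTS_HASH_HEADS"
 * index did no mixing at all.
 */
static struct hlist_head *mm_slot_bucket(struct mm_struct *mm)
{
	return &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
}
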
diff --git a/mm/memblock.c b/mm/memblock.c
new file mode 100644
index 00000000000..43840b305ec
--- /dev/null
+++ b/mm/memblock.c
@@ -0,0 +1,541 @@
+/*
+ * Procedures for maintaining information about logical memory blocks.
+ *
+ * Peter Bergner, IBM Corp. June 2001.
+ * Copyright (C) 2001 Peter Bergner.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/memblock.h>
+
+#define MEMBLOCK_ALLOC_ANYWHERE 0
+
+struct memblock memblock;
+
+static int memblock_debug;
+
+static int __init early_memblock(char *p)
+{
+ if (p && strstr(p, "debug"))
+ memblock_debug = 1;
+ return 0;
+}
+early_param("memblock", early_memblock);
+
+static void memblock_dump(struct memblock_region *region, char *name)
+{
+ unsigned long long base, size;
+ int i;
+
+ pr_info(" %s.cnt = 0x%lx\n", name, region->cnt);
+
+ for (i = 0; i < region->cnt; i++) {
+ base = region->region[i].base;
+ size = region->region[i].size;
+
+ pr_info(" %s[0x%x]\t0x%016llx - 0x%016llx, 0x%llx bytes\n",
+ name, i, base, base + size - 1, size);
+ }
+}
+
+void memblock_dump_all(void)
+{
+ if (!memblock_debug)
+ return;
+
+ pr_info("MEMBLOCK configuration:\n");
+ pr_info(" rmo_size = 0x%llx\n", (unsigned long long)memblock.rmo_size);
+ pr_info(" memory.size = 0x%llx\n", (unsigned long long)memblock.memory.size);
+
+ memblock_dump(&memblock.memory, "memory");
+ memblock_dump(&memblock.reserved, "reserved");
+}
+
+static unsigned long memblock_addrs_overlap(u64 base1, u64 size1, u64 base2,
+ u64 size2)
+{
+ return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
+}
+
+static long memblock_addrs_adjacent(u64 base1, u64 size1, u64 base2, u64 size2)
+{
+ if (base2 == base1 + size1)
+ return 1;
+ else if (base1 == base2 + size2)
+ return -1;
+
+ return 0;
+}
+
+static long memblock_regions_adjacent(struct memblock_region *rgn,
+ unsigned long r1, unsigned long r2)
+{
+ u64 base1 = rgn->region[r1].base;
+ u64 size1 = rgn->region[r1].size;
+ u64 base2 = rgn->region[r2].base;
+ u64 size2 = rgn->region[r2].size;
+
+ return memblock_addrs_adjacent(base1, size1, base2, size2);
+}
+
+static void memblock_remove_region(struct memblock_region *rgn, unsigned long r)
+{
+ unsigned long i;
+
+ for (i = r; i < rgn->cnt - 1; i++) {
+ rgn->region[i].base = rgn->region[i + 1].base;
+ rgn->region[i].size = rgn->region[i + 1].size;
+ }
+ rgn->cnt--;
+}
+
+/* Assumption: base addr of region 1 < base addr of region 2 */
+static void memblock_coalesce_regions(struct memblock_region *rgn,
+ unsigned long r1, unsigned long r2)
+{
+ rgn->region[r1].size += rgn->region[r2].size;
+ memblock_remove_region(rgn, r2);
+}
+
+void __init memblock_init(void)
+{
+ /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
+ * This simplifies the memblock_add() code below...
+ */
+ memblock.memory.region[0].base = 0;
+ memblock.memory.region[0].size = 0;
+ memblock.memory.cnt = 1;
+
+ /* Ditto. */
+ memblock.reserved.region[0].base = 0;
+ memblock.reserved.region[0].size = 0;
+ memblock.reserved.cnt = 1;
+}
+
+void __init memblock_analyze(void)
+{
+ int i;
+
+ memblock.memory.size = 0;
+
+ for (i = 0; i < memblock.memory.cnt; i++)
+ memblock.memory.size += memblock.memory.region[i].size;
+}
+
+static long memblock_add_region(struct memblock_region *rgn, u64 base, u64 size)
+{
+ unsigned long coalesced = 0;
+ long adjacent, i;
+
+ if ((rgn->cnt == 1) && (rgn->region[0].size == 0)) {
+ rgn->region[0].base = base;
+ rgn->region[0].size = size;
+ return 0;
+ }
+
+ /* First try and coalesce this MEMBLOCK with another. */
+ for (i = 0; i < rgn->cnt; i++) {
+ u64 rgnbase = rgn->region[i].base;
+ u64 rgnsize = rgn->region[i].size;
+
+ if ((rgnbase == base) && (rgnsize == size))
+ /* Already have this region, so we're done */
+ return 0;
+
+ adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize);
+ if (adjacent > 0) {
+ rgn->region[i].base -= size;
+ rgn->region[i].size += size;
+ coalesced++;
+ break;
+ } else if (adjacent < 0) {
+ rgn->region[i].size += size;
+ coalesced++;
+ break;
+ }
+ }
+
+ if ((i < rgn->cnt - 1) && memblock_regions_adjacent(rgn, i, i+1)) {
+ memblock_coalesce_regions(rgn, i, i+1);
+ coalesced++;
+ }
+
+ if (coalesced)
+ return coalesced;
+ if (rgn->cnt >= MAX_MEMBLOCK_REGIONS)
+ return -1;
+
+ /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */
+ for (i = rgn->cnt - 1; i >= 0; i--) {
+ if (base < rgn->region[i].base) {
+ rgn->region[i+1].base = rgn->region[i].base;
+ rgn->region[i+1].size = rgn->region[i].size;
+ } else {
+ rgn->region[i+1].base = base;
+ rgn->region[i+1].size = size;
+ break;
+ }
+ }
+
+ if (base < rgn->region[0].base) {
+ rgn->region[0].base = base;
+ rgn->region[0].size = size;
+ }
+ rgn->cnt++;
+
+ return 0;
+}
+
+long memblock_add(u64 base, u64 size)
+{
+ struct memblock_region *_rgn = &memblock.memory;
+
+ /* On pSeries LPAR systems, the first MEMBLOCK is our RMO region. */
+ if (base == 0)
+ memblock.rmo_size = size;
+
+ return memblock_add_region(_rgn, base, size);
+
+}
+
+static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size)
+{
+ u64 rgnbegin, rgnend;
+ u64 end = base + size;
+ int i;
+
+ rgnbegin = rgnend = 0; /* suppress gcc warnings */
+
+ /* Find the region where (base, size) belongs to */
+ for (i=0; i < rgn->cnt; i++) {
+ rgnbegin = rgn->region[i].base;
+ rgnend = rgnbegin + rgn->region[i].size;
+
+ if ((rgnbegin <= base) && (end <= rgnend))
+ break;
+ }
+
+ /* Didn't find the region */
+ if (i == rgn->cnt)
+ return -1;
+
+ /* Check to see if we are removing entire region */
+ if ((rgnbegin == base) && (rgnend == end)) {
+ memblock_remove_region(rgn, i);
+ return 0;
+ }
+
+ /* Check to see if region is matching at the front */
+ if (rgnbegin == base) {
+ rgn->region[i].base = end;
+ rgn->region[i].size -= size;
+ return 0;
+ }
+
+ /* Check to see if the region is matching at the end */
+ if (rgnend == end) {
+ rgn->region[i].size -= size;
+ return 0;
+ }
+
+ /*
+ * We need to split the entry - adjust the current one to the
+ * beginning of the hole and add the region after the hole.
+ */
+ rgn->region[i].size = base - rgn->region[i].base;
+ return memblock_add_region(rgn, end, rgnend - end);
+}
+
+long memblock_remove(u64 base, u64 size)
+{
+ return __memblock_remove(&memblock.memory, base, size);
+}
+
+long __init memblock_free(u64 base, u64 size)
+{
+ return __memblock_remove(&memblock.reserved, base, size);
+}
+
+long __init memblock_reserve(u64 base, u64 size)
+{
+ struct memblock_region *_rgn = &memblock.reserved;
+
+ BUG_ON(0 == size);
+
+ return memblock_add_region(_rgn, base, size);
+}
+
+long memblock_overlaps_region(struct memblock_region *rgn, u64 base, u64 size)
+{
+ unsigned long i;
+
+ for (i = 0; i < rgn->cnt; i++) {
+ u64 rgnbase = rgn->region[i].base;
+ u64 rgnsize = rgn->region[i].size;
+ if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
+ break;
+ }
+
+ return (i < rgn->cnt) ? i : -1;
+}
+
+static u64 memblock_align_down(u64 addr, u64 size)
+{
+ return addr & ~(size - 1);
+}
+
+static u64 memblock_align_up(u64 addr, u64 size)
+{
+ return (addr + (size - 1)) & ~(size - 1);
+}
+
+static u64 __init memblock_alloc_nid_unreserved(u64 start, u64 end,
+ u64 size, u64 align)
+{
+ u64 base, res_base;
+ long j;
+
+ base = memblock_align_down((end - size), align);
+ while (start <= base) {
+ j = memblock_overlaps_region(&memblock.reserved, base, size);
+ if (j < 0) {
+ /* this area isn't reserved, take it */
+ if (memblock_add_region(&memblock.reserved, base, size) < 0)
+ base = ~(u64)0;
+ return base;
+ }
+ res_base = memblock.reserved.region[j].base;
+ if (res_base < size)
+ break;
+ base = memblock_align_down(res_base - size, align);
+ }
+
+ return ~(u64)0;
+}
+
+static u64 __init memblock_alloc_nid_region(struct memblock_property *mp,
+ u64 (*nid_range)(u64, u64, int *),
+ u64 size, u64 align, int nid)
+{
+ u64 start, end;
+
+ start = mp->base;
+ end = start + mp->size;
+
+ start = memblock_align_up(start, align);
+ while (start < end) {
+ u64 this_end;
+ int this_nid;
+
+ this_end = nid_range(start, end, &this_nid);
+ if (this_nid == nid) {
+ u64 ret = memblock_alloc_nid_unreserved(start, this_end,
+ size, align);
+ if (ret != ~(u64)0)
+ return ret;
+ }
+ start = this_end;
+ }
+
+ return ~(u64)0;
+}
+
+u64 __init memblock_alloc_nid(u64 size, u64 align, int nid,
+ u64 (*nid_range)(u64 start, u64 end, int *nid))
+{
+ struct memblock_region *mem = &memblock.memory;
+ int i;
+
+ BUG_ON(0 == size);
+
+ size = memblock_align_up(size, align);
+
+ for (i = 0; i < mem->cnt; i++) {
+ u64 ret = memblock_alloc_nid_region(&mem->region[i],
+ nid_range,
+ size, align, nid);
+ if (ret != ~(u64)0)
+ return ret;
+ }
+
+ return memblock_alloc(size, align);
+}
+
+u64 __init memblock_alloc(u64 size, u64 align)
+{
+ return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
+}
+
+u64 __init memblock_alloc_base(u64 size, u64 align, u64 max_addr)
+{
+ u64 alloc;
+
+ alloc = __memblock_alloc_base(size, align, max_addr);
+
+ if (alloc == 0)
+ panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
+ (unsigned long long) size, (unsigned long long) max_addr);
+
+ return alloc;
+}
+
+u64 __init __memblock_alloc_base(u64 size, u64 align, u64 max_addr)
+{
+ long i, j;
+ u64 base = 0;
+ u64 res_base;
+
+ BUG_ON(0 == size);
+
+ size = memblock_align_up(size, align);
+
+ /* On some platforms, make sure we allocate lowmem */
+ /* Note that MEMBLOCK_REAL_LIMIT may be MEMBLOCK_ALLOC_ANYWHERE */
+ if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
+ max_addr = MEMBLOCK_REAL_LIMIT;
+
+ for (i = memblock.memory.cnt - 1; i >= 0; i--) {
+ u64 memblockbase = memblock.memory.region[i].base;
+ u64 memblocksize = memblock.memory.region[i].size;
+
+ if (memblocksize < size)
+ continue;
+ if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
+ base = memblock_align_down(memblockbase + memblocksize - size, align);
+ else if (memblockbase < max_addr) {
+ base = min(memblockbase + memblocksize, max_addr);
+ base = memblock_align_down(base - size, align);
+ } else
+ continue;
+
+ while (base && memblockbase <= base) {
+ j = memblock_overlaps_region(&memblock.reserved, base, size);
+ if (j < 0) {
+ /* this area isn't reserved, take it */
+ if (memblock_add_region(&memblock.reserved, base, size) < 0)
+ return 0;
+ return base;
+ }
+ res_base = memblock.reserved.region[j].base;
+ if (res_base < size)
+ break;
+ base = memblock_align_down(res_base - size, align);
+ }
+ }
+ return 0;
+}
+
+/* You must call memblock_analyze() before this. */
+u64 __init memblock_phys_mem_size(void)
+{
+ return memblock.memory.size;
+}
+
+u64 memblock_end_of_DRAM(void)
+{
+ int idx = memblock.memory.cnt - 1;
+
+ return (memblock.memory.region[idx].base + memblock.memory.region[idx].size);
+}
+
+/* You must call memblock_analyze() after this. */
+void __init memblock_enforce_memory_limit(u64 memory_limit)
+{
+ unsigned long i;
+ u64 limit;
+ struct memblock_property *p;
+
+ if (!memory_limit)
+ return;
+
+ /* Truncate the memblock regions to satisfy the memory limit. */
+ limit = memory_limit;
+ for (i = 0; i < memblock.memory.cnt; i++) {
+ if (limit > memblock.memory.region[i].size) {
+ limit -= memblock.memory.region[i].size;
+ continue;
+ }
+
+ memblock.memory.region[i].size = limit;
+ memblock.memory.cnt = i + 1;
+ break;
+ }
+
+ if (memblock.memory.region[0].size < memblock.rmo_size)
+ memblock.rmo_size = memblock.memory.region[0].size;
+
+ memory_limit = memblock_end_of_DRAM();
+
+ /* And truncate any reserves above the limit also. */
+ for (i = 0; i < memblock.reserved.cnt; i++) {
+ p = &memblock.reserved.region[i];
+
+ if (p->base > memory_limit)
+ p->size = 0;
+ else if ((p->base + p->size) > memory_limit)
+ p->size = memory_limit - p->base;
+
+ if (p->size == 0) {
+ memblock_remove_region(&memblock.reserved, i);
+ i--;
+ }
+ }
+}
+
+int __init memblock_is_reserved(u64 addr)
+{
+ int i;
+
+ for (i = 0; i < memblock.reserved.cnt; i++) {
+ u64 upper = memblock.reserved.region[i].base +
+ memblock.reserved.region[i].size - 1;
+ if ((addr >= memblock.reserved.region[i].base) && (addr <= upper))
+ return 1;
+ }
+ return 0;
+}
+
+int memblock_is_region_reserved(u64 base, u64 size)
+{
+ return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
+}
+
+/*
+ * Given a <base, len>, find which memory regions belong to this range.
+ * Adjust the request and return a contiguous chunk.
+ */
+int memblock_find(struct memblock_property *res)
+{
+ int i;
+ u64 rstart, rend;
+
+ rstart = res->base;
+ rend = rstart + res->size - 1;
+
+ for (i = 0; i < memblock.memory.cnt; i++) {
+ u64 start = memblock.memory.region[i].base;
+ u64 end = start + memblock.memory.region[i].size - 1;
+
+ if (start > rend)
+ return -1;
+
+ if ((end >= rstart) && (start < rend)) {
+ /* adjust the request */
+ if (rstart < start)
+ rstart = start;
+ if (rend > end)
+ rend = end;
+ res->base = rstart;
+ res->size = rend - rstart + 1;
+ return 0;
+ }
+ }
+ return -1;
+}
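
Editor's note: since mm/memblock.c is a new file, a compressed usage sketch may help when reading it. Everything below is an illustration and not part of the patch: example_early_setup() is hypothetical and the addresses and sizes are made up; only the memblock_* calls come from the file above.

void __init example_early_setup(void)
{
	u64 scratch;

	memblock_init();			/* install the dummy zero-size regions */
	memblock_add(0x00000000, 0x40000000);	/* register 1 GiB of RAM */
	memblock_analyze();			/* recompute memblock.memory.size */

	memblock_reserve(0x00000000, 0x00100000);	/* keep the first 1 MiB out of play */

	/* 16 KiB, 16 KiB aligned, allocated top-down; panics via
	 * memblock_alloc_base() if no suitable free range exists */
	scratch = memblock_alloc(0x4000, 0x4000);

	memblock_dump_all();			/* prints only when booted with memblock=debug */
}
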
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8a79a6f0f02..3eed583895a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -47,10 +47,13 @@
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
+#include <linux/oom.h>
#include "internal.h"
#include <asm/uaccess.h>
+#include <trace/events/vmscan.h>
+
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES 5
struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -149,16 +152,35 @@ struct mem_cgroup_threshold {
u64 threshold;
};
+/* For threshold */
struct mem_cgroup_threshold_ary {
/* An array index points to threshold just below usage. */
- atomic_t current_threshold;
+ int current_threshold;
/* Size of entries[] */
unsigned int size;
/* Array of thresholds */
struct mem_cgroup_threshold entries[0];
};
+struct mem_cgroup_thresholds {
+ /* Primary thresholds array */
+ struct mem_cgroup_threshold_ary *primary;
+ /*
+ * Spare threshold array.
+ * This is needed to make mem_cgroup_unregister_event() "never fail".
+ * It must be able to store at least primary->size - 1 entries.
+ */
+ struct mem_cgroup_threshold_ary *spare;
+};
+
+/* for OOM */
+struct mem_cgroup_eventfd_list {
+ struct list_head list;
+ struct eventfd_ctx *eventfd;
+};
+
static void mem_cgroup_threshold(struct mem_cgroup *mem);
+static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
/*
* The memory controller data structure. The memory controller controls both
@@ -192,8 +214,6 @@ struct mem_cgroup {
*/
spinlock_t reclaim_param_lock;
- int prev_priority; /* for recording reclaim priority */
-
/*
* While reclaiming in a hierarchy, we cache the last child we
* reclaimed from.
@@ -207,6 +227,8 @@ struct mem_cgroup {
atomic_t refcnt;
unsigned int swappiness;
+ /* OOM-Killer disable */
+ int oom_kill_disable;
/* set when res.limit == memsw.limit */
bool memsw_is_minimum;
@@ -215,17 +237,19 @@ struct mem_cgroup {
struct mutex thresholds_lock;
/* thresholds for memory usage. RCU-protected */
- struct mem_cgroup_threshold_ary *thresholds;
+ struct mem_cgroup_thresholds thresholds;
/* thresholds for mem+swap usage. RCU-protected */
- struct mem_cgroup_threshold_ary *memsw_thresholds;
+ struct mem_cgroup_thresholds memsw_thresholds;
+
+ /* For oom notifier event fd */
+ struct list_head oom_notify;
/*
* Should we move charges of a task when a task is moved into this
* mem_cgroup ? And what type of charges should we move ?
*/
unsigned long move_charge_at_immigrate;
-
/*
* percpu counter.
*/
@@ -239,11 +263,13 @@ struct mem_cgroup {
*/
enum move_type {
MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
+ MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
NR_MOVE_TYPE,
};
/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
+ spinlock_t lock; /* for from, to, moving_task */
struct mem_cgroup *from;
struct mem_cgroup *to;
unsigned long precharge;
@@ -252,9 +278,22 @@ static struct move_charge_struct {
struct task_struct *moving_task; /* a task moving charges */
wait_queue_head_t waitq; /* a waitq for other context */
} mc = {
+ .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
+static bool move_anon(void)
+{
+ return test_bit(MOVE_CHARGE_TYPE_ANON,
+ &mc.to->move_charge_at_immigrate);
+}
+
+static bool move_file(void)
+{
+ return test_bit(MOVE_CHARGE_TYPE_FILE,
+ &mc.to->move_charge_at_immigrate);
+}
+
/*
* Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
* limit reclaim to prevent infinite loops, if they ever occur.
@@ -282,9 +321,12 @@ enum charge_type {
/* for encoding cft->private value on file */
#define _MEM (0)
#define _MEMSWAP (1)
+#define _OOM_TYPE (2)
#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
+/* Used for OOM notifier */
+#define OOM_CONTROL (0)
/*
* Reclaim flags for mem_cgroup_hierarchical_reclaim
@@ -797,12 +839,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
int ret;
struct mem_cgroup *curr = NULL;
+ struct task_struct *p;
- task_lock(task);
- rcu_read_lock();
- curr = try_get_mem_cgroup_from_mm(task->mm);
- rcu_read_unlock();
- task_unlock(task);
+ p = find_lock_task_mm(task);
+ if (!p)
+ return 0;
+ curr = try_get_mem_cgroup_from_mm(p->mm);
+ task_unlock(p);
if (!curr)
return 0;
/*
@@ -819,35 +862,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
return ret;
}
-/*
- * prev_priority control...this will be used in memory reclaim path.
- */
-int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
-{
- int prev_priority;
-
- spin_lock(&mem->reclaim_param_lock);
- prev_priority = mem->prev_priority;
- spin_unlock(&mem->reclaim_param_lock);
-
- return prev_priority;
-}
-
-void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
- spin_lock(&mem->reclaim_param_lock);
- if (priority < mem->prev_priority)
- mem->prev_priority = priority;
- spin_unlock(&mem->reclaim_param_lock);
-}
-
-void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
- spin_lock(&mem->reclaim_param_lock);
- mem->prev_priority = priority;
- spin_unlock(&mem->reclaim_param_lock);
-}
-
static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
{
unsigned long active;
@@ -905,7 +919,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
struct zone *zone,
enum lru_list lru)
{
- int nid = zone->zone_pgdat->node_id;
+ int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
@@ -915,7 +929,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
struct zone *zone)
{
- int nid = zone->zone_pgdat->node_id;
+ int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
@@ -960,7 +974,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
LIST_HEAD(pc_list);
struct list_head *src;
struct page_cgroup *pc, *tmp;
- int nid = z->zone_pgdat->node_id;
+ int nid = zone_to_nid(z);
int zid = zone_idx(z);
struct mem_cgroup_per_zone *mz;
int lru = LRU_FILE * file + active;
@@ -999,6 +1013,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
}
*scanned = scan;
+
+ trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
+ 0, 0, 0, mode);
+
return nr_taken;
}
@@ -1033,6 +1051,47 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
return swappiness;
}
+/* A routine for testing whether mem is under move_account */
+
+static bool mem_cgroup_under_move(struct mem_cgroup *mem)
+{
+ struct mem_cgroup *from;
+ struct mem_cgroup *to;
+ bool ret = false;
+ /*
+ * Unlike task_move routines, we access mc.to, mc.from not under
+ * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
+ */
+ spin_lock(&mc.lock);
+ from = mc.from;
+ to = mc.to;
+ if (!from)
+ goto unlock;
+ if (from == mem || to == mem
+ || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
+ || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
+ ret = true;
+unlock:
+ spin_unlock(&mc.lock);
+ return ret;
+}
+
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
+{
+ if (mc.moving_task && current != mc.moving_task) {
+ if (mem_cgroup_under_move(mem)) {
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
+ /* moving charge context might have finished. */
+ if (mc.moving_task)
+ schedule();
+ finish_wait(&mc.waitq, &wait);
+ return true;
+ }
+ }
+ return false;
+}
+
static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
{
int *val = data;
@@ -1119,6 +1178,24 @@ static int mem_cgroup_count_children(struct mem_cgroup *mem)
}
/*
+ * Return the memory (and swap, if configured) limit for a memcg.
+ */
+u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+{
+ u64 limit;
+ u64 memsw;
+
+ limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
+ total_swap_pages;
+ memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+ /*
+ * If memsw is finite and limits the amount of swap space available
+ * to this memcg, return that limit.
+ */
+ return min(limit, memsw);
+}
+
+/*
* Visit the first child (need not be the first child as per the ordering
* of the cgroup list, since we track last_scanned_child) of @mem and use
* that to reclaim free pages from.
@@ -1223,8 +1300,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
/* we use swappiness of local cgroup */
if (check_soft)
ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
- noswap, get_swappiness(victim), zone,
- zone->zone_pgdat->node_id);
+ noswap, get_swappiness(victim), zone);
else
ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
noswap, get_swappiness(victim));
@@ -1293,14 +1369,62 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
static DEFINE_MUTEX(memcg_oom_mutex);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+struct oom_wait_info {
+ struct mem_cgroup *mem;
+ wait_queue_t wait;
+};
+
+static int memcg_oom_wake_function(wait_queue_t *wait,
+ unsigned mode, int sync, void *arg)
+{
+ struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
+ struct oom_wait_info *oom_wait_info;
+
+ oom_wait_info = container_of(wait, struct oom_wait_info, wait);
+
+ if (oom_wait_info->mem == wake_mem)
+ goto wakeup;
+ /* if no hierarchy, no match */
+ if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
+ return 0;
+ /*
+ * Both of oom_wait_info->mem and wake_mem are stable under us.
+ * Then we can use css_is_ancestor without taking care of RCU.
+ */
+ if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
+ !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
+ return 0;
+
+wakeup:
+ return autoremove_wake_function(wait, mode, sync, arg);
+}
+
+static void memcg_wakeup_oom(struct mem_cgroup *mem)
+{
+ /* for filtering, pass "mem" as argument. */
+ __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
+}
+
+static void memcg_oom_recover(struct mem_cgroup *mem)
+{
+ if (mem && atomic_read(&mem->oom_lock))
+ memcg_wakeup_oom(mem);
+}
+
/*
* try to call OOM killer. returns false if we should exit memory-reclaim loop.
*/
bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
{
- DEFINE_WAIT(wait);
- bool locked;
+ struct oom_wait_info owait;
+ bool locked, need_to_kill;
+ owait.mem = mem;
+ owait.wait.flags = 0;
+ owait.wait.func = memcg_oom_wake_function;
+ owait.wait.private = current;
+ INIT_LIST_HEAD(&owait.wait.task_list);
+ need_to_kill = true;
/* At first, try to OOM lock hierarchy under mem.*/
mutex_lock(&memcg_oom_mutex);
locked = mem_cgroup_oom_lock(mem);
@@ -1309,32 +1433,23 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
* accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
* under OOM is always welcomed, use TASK_KILLABLE here.
*/
- if (!locked)
- prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE);
+ prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+ if (!locked || mem->oom_kill_disable)
+ need_to_kill = false;
+ if (locked)
+ mem_cgroup_oom_notify(mem);
mutex_unlock(&memcg_oom_mutex);
- if (locked)
+ if (need_to_kill) {
+ finish_wait(&memcg_oom_waitq, &owait.wait);
mem_cgroup_out_of_memory(mem, mask);
- else {
+ } else {
schedule();
- finish_wait(&memcg_oom_waitq, &wait);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
}
mutex_lock(&memcg_oom_mutex);
mem_cgroup_oom_unlock(mem);
- /*
- * Here, we use global waitq .....more fine grained waitq ?
- * Assume following hierarchy.
- * A/
- * 01
- * 02
- * assume OOM happens both in A and 01 at the same time. Tthey are
- * mutually exclusive by lock. (kill in 01 helps A.)
- * When we use per memcg waitq, we have to wake up waiters on A and 02
- * in addtion to waiters on 01. We use global waitq for avoiding mess.
- * It will not be a big problem.
- * (And a task may be moved to other groups while it's waiting for OOM.)
- */
- wake_up_all(&memcg_oom_waitq);
+ memcg_wakeup_oom(mem);
mutex_unlock(&memcg_oom_mutex);
if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
@@ -1438,7 +1553,7 @@ static void drain_local_stock(struct work_struct *dummy)
/*
* Cache charges(val) which is from res_counter, to local per_cpu area.
- * This will be consumed by consumt_stock() function, later.
+ * This will be consumed by consume_stock() function, later.
*/
static void refill_stock(struct mem_cgroup *mem, int val)
{
@@ -1504,16 +1619,83 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
return NOTIFY_OK;
}
+
+/* See __mem_cgroup_try_charge() for details */
+enum {
+ CHARGE_OK, /* success */
+ CHARGE_RETRY, /* need to retry but retry is not bad */
+ CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
+ CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
+ CHARGE_OOM_DIE, /* the current is killed because of OOM */
+};
+
+static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+ int csize, bool oom_check)
+{
+ struct mem_cgroup *mem_over_limit;
+ struct res_counter *fail_res;
+ unsigned long flags = 0;
+ int ret;
+
+ ret = res_counter_charge(&mem->res, csize, &fail_res);
+
+ if (likely(!ret)) {
+ if (!do_swap_account)
+ return CHARGE_OK;
+ ret = res_counter_charge(&mem->memsw, csize, &fail_res);
+ if (likely(!ret))
+ return CHARGE_OK;
+
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+ flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+ } else
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+
+ if (csize > PAGE_SIZE) /* change csize and retry */
+ return CHARGE_RETRY;
+
+ if (!(gfp_mask & __GFP_WAIT))
+ return CHARGE_WOULDBLOCK;
+
+ ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+ gfp_mask, flags);
+ /*
+ * try_to_free_mem_cgroup_pages() might not give us a full
+ * picture of reclaim. Some pages are reclaimed and might be
+ * moved to swap cache or just unmapped from the cgroup.
+ * Check the limit again to see if the reclaim reduced the
+ * current usage of the cgroup before giving up
+ */
+ if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+ return CHARGE_RETRY;
+
+ /*
+ * At task move, charge accounts can be doubly counted. So, it's
+ * better to wait until the end of task_move if something is going on.
+ */
+ if (mem_cgroup_wait_acct_move(mem_over_limit))
+ return CHARGE_RETRY;
+
+ /* If we don't need to call oom-killer at all, return immediately */
+ if (!oom_check)
+ return CHARGE_NOMEM;
+ /* check OOM */
+ if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+ return CHARGE_OOM_DIE;
+
+ return CHARGE_RETRY;
+}
+
/*
* Unlike exported interface, "oom" parameter is added. if oom==true,
* oom-killer can be invoked.
*/
static int __mem_cgroup_try_charge(struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+ gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
{
- struct mem_cgroup *mem, *mem_over_limit;
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct res_counter *fail_res;
+ int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ struct mem_cgroup *mem = NULL;
+ int ret;
int csize = CHARGE_SIZE;
/*
@@ -1531,126 +1713,108 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
- mem = *memcg;
- if (likely(!mem)) {
- mem = try_get_mem_cgroup_from_mm(mm);
- *memcg = mem;
- } else {
- css_get(&mem->css);
- }
- if (unlikely(!mem))
- return 0;
-
- VM_BUG_ON(css_is_removed(&mem->css));
- if (mem_cgroup_is_root(mem))
- goto done;
-
- while (1) {
- int ret = 0;
- unsigned long flags = 0;
-
+ if (!*memcg && !mm)
+ goto bypass;
+again:
+ if (*memcg) { /* css should be a valid one */
+ mem = *memcg;
+ VM_BUG_ON(css_is_removed(&mem->css));
+ if (mem_cgroup_is_root(mem))
+ goto done;
if (consume_stock(mem))
goto done;
+ css_get(&mem->css);
+ } else {
+ struct task_struct *p;
- ret = res_counter_charge(&mem->res, csize, &fail_res);
- if (likely(!ret)) {
- if (!do_swap_account)
- break;
- ret = res_counter_charge(&mem->memsw, csize, &fail_res);
- if (likely(!ret))
- break;
- /* mem+swap counter fails */
- res_counter_uncharge(&mem->res, csize);
- flags |= MEM_CGROUP_RECLAIM_NOSWAP;
- mem_over_limit = mem_cgroup_from_res_counter(fail_res,
- memsw);
- } else
- /* mem counter fails */
- mem_over_limit = mem_cgroup_from_res_counter(fail_res,
- res);
-
- /* reduce request size and retry */
- if (csize > PAGE_SIZE) {
- csize = PAGE_SIZE;
- continue;
- }
- if (!(gfp_mask & __GFP_WAIT))
- goto nomem;
-
- ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
- gfp_mask, flags);
- if (ret)
- continue;
-
+ rcu_read_lock();
+ p = rcu_dereference(mm->owner);
+ VM_BUG_ON(!p);
/*
- * try_to_free_mem_cgroup_pages() might not give us a full
- * picture of reclaim. Some pages are reclaimed and might be
- * moved to swap cache or just unmapped from the cgroup.
- * Check the limit again to see if the reclaim reduced the
- * current usage of the cgroup before giving up
- *
+ * because we don't have task_lock(), "p" can exit while
+ * we're here. In that case, "mem" can point to root
+ * cgroup but never be NULL. (and task_struct itself is freed
+ * by RCU, cgroup itself is RCU safe.) Then, we have small
+ * risk here to get wrong cgroup. But such kind of mis-account
+ * by race always happens because we don't have cgroup_mutex().
+ * It's overkill and we allow that small race, here.
*/
- if (mem_cgroup_check_under_limit(mem_over_limit))
- continue;
-
- /* try to avoid oom while someone is moving charge */
- if (mc.moving_task && current != mc.moving_task) {
- struct mem_cgroup *from, *to;
- bool do_continue = false;
+ mem = mem_cgroup_from_task(p);
+ VM_BUG_ON(!mem);
+ if (mem_cgroup_is_root(mem)) {
+ rcu_read_unlock();
+ goto done;
+ }
+ if (consume_stock(mem)) {
/*
- * There is a small race that "from" or "to" can be
- * freed by rmdir, so we use css_tryget().
+ * It seems dangerous to access memcg without css_get().
+ * But considering how consume_stock works, it's not
+ * necessary. If consume_stock succeeds, some charges
+ * from this memcg are cached on this cpu. So, we
+ * don't need to call css_get()/css_tryget() before
+ * calling consume_stock().
*/
- from = mc.from;
- to = mc.to;
- if (from && css_tryget(&from->css)) {
- if (mem_over_limit->use_hierarchy)
- do_continue = css_is_ancestor(
- &from->css,
- &mem_over_limit->css);
- else
- do_continue = (from == mem_over_limit);
- css_put(&from->css);
- }
- if (!do_continue && to && css_tryget(&to->css)) {
- if (mem_over_limit->use_hierarchy)
- do_continue = css_is_ancestor(
- &to->css,
- &mem_over_limit->css);
- else
- do_continue = (to == mem_over_limit);
- css_put(&to->css);
- }
- if (do_continue) {
- DEFINE_WAIT(wait);
- prepare_to_wait(&mc.waitq, &wait,
- TASK_INTERRUPTIBLE);
- /* moving charge context might have finished. */
- if (mc.moving_task)
- schedule();
- finish_wait(&mc.waitq, &wait);
- continue;
- }
+ rcu_read_unlock();
+ goto done;
+ }
+ /* after here, we may be blocked. we need to get refcnt */
+ if (!css_tryget(&mem->css)) {
+ rcu_read_unlock();
+ goto again;
+ }
+ rcu_read_unlock();
+ }
+
+ do {
+ bool oom_check;
+
+ /* If killed, bypass charge */
+ if (fatal_signal_pending(current)) {
+ css_put(&mem->css);
+ goto bypass;
+ }
+
+ oom_check = false;
+ if (oom && !nr_oom_retries) {
+ oom_check = true;
+ nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
}
- if (!nr_retries--) {
- if (!oom)
+ ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
+
+ switch (ret) {
+ case CHARGE_OK:
+ break;
+ case CHARGE_RETRY: /* not in OOM situation but retry */
+ csize = PAGE_SIZE;
+ css_put(&mem->css);
+ mem = NULL;
+ goto again;
+ case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+ css_put(&mem->css);
+ goto nomem;
+ case CHARGE_NOMEM: /* OOM routine works */
+ if (!oom) {
+ css_put(&mem->css);
goto nomem;
- if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- continue;
}
- /* When we reach here, current task is dying .*/
+ /* If oom, we never return -ENOMEM */
+ nr_oom_retries--;
+ break;
+ case CHARGE_OOM_DIE: /* Killed by OOM Killer */
css_put(&mem->css);
goto bypass;
}
- }
+ } while (ret != CHARGE_OK);
+
if (csize > PAGE_SIZE)
refill_stock(mem, csize - PAGE_SIZE);
+ css_put(&mem->css);
done:
+ *memcg = mem;
return 0;
nomem:
- css_put(&mem->css);
+ *memcg = NULL;
return -ENOMEM;
bypass:
*memcg = NULL;
@@ -1669,11 +1833,7 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
res_counter_uncharge(&mem->res, PAGE_SIZE * count);
if (do_swap_account)
res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
- VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
- WARN_ON_ONCE(count > INT_MAX);
- __css_put(&mem->css, (int)count);
}
- /* we don't need css_put for root */
}
static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
@@ -1901,10 +2061,9 @@ out:
* < 0 if the cgroup is over its limit
*/
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, enum charge_type ctype,
- struct mem_cgroup *memcg)
+ gfp_t gfp_mask, enum charge_type ctype)
{
- struct mem_cgroup *mem;
+ struct mem_cgroup *mem = NULL;
struct page_cgroup *pc;
int ret;
@@ -1914,7 +2073,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
return 0;
prefetchw(pc);
- mem = memcg;
ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
if (ret || !mem)
return ret;
@@ -1942,7 +2100,7 @@ int mem_cgroup_newpage_charge(struct page *page,
if (unlikely(!mm))
mm = &init_mm;
return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
+ MEM_CGROUP_CHARGE_TYPE_MAPPED);
}
static void
@@ -1952,7 +2110,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
{
- struct mem_cgroup *mem = NULL;
int ret;
if (mem_cgroup_disabled())
@@ -1973,7 +2130,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
if (!(gfp_mask & __GFP_WAIT)) {
struct page_cgroup *pc;
-
pc = lookup_page_cgroup(page);
if (!pc)
return 0;
@@ -1985,22 +2141,24 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
unlock_page_cgroup(pc);
}
- if (unlikely(!mm && !mem))
+ if (unlikely(!mm))
mm = &init_mm;
if (page_is_file_cache(page))
return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
+ MEM_CGROUP_CHARGE_TYPE_CACHE);
/* shmem */
if (PageSwapCache(page)) {
+ struct mem_cgroup *mem = NULL;
+
ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
if (!ret)
__mem_cgroup_commit_charge_swapin(page, mem,
MEM_CGROUP_CHARGE_TYPE_SHMEM);
} else
ret = mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
+ MEM_CGROUP_CHARGE_TYPE_SHMEM);
return ret;
}
@@ -2036,7 +2194,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
goto charge_cur_mm;
*ptr = mem;
ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
- /* drop extra refcnt from tryget */
css_put(&mem->css);
return ret;
charge_cur_mm:
@@ -2118,15 +2275,6 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
/* If swapout, usage of swap doesn't decrease */
if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
uncharge_memsw = false;
- /*
- * do_batch > 0 when unmapping pages or inode invalidate/truncate.
- * In those cases, all pages freed continously can be expected to be in
- * the same cgroup and we have chance to coalesce uncharges.
- * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
- * because we want to do uncharge as soon as possible.
- */
- if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
- goto direct_uncharge;
batch = &current->memcg_batch;
/*
@@ -2137,6 +2285,17 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
if (!batch->memcg)
batch->memcg = mem;
/*
+ * do_batch > 0 when unmapping pages or inode invalidate/truncate.
+ * In those cases, all pages freed continuously can be expected to be in
+ * the same cgroup and we have chance to coalesce uncharges.
+ * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
+ * because we want to do uncharge as soon as possible.
+ */
+
+ if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
+ goto direct_uncharge;
+
+ /*
* In typical case, batch->memcg == mem. This means we can
* merge a series of uncharges to an uncharge of res_counter.
* If not, we uncharge res_counter ony by one.
@@ -2152,6 +2311,8 @@ direct_uncharge:
res_counter_uncharge(&mem->res, PAGE_SIZE);
if (uncharge_memsw)
res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+ if (unlikely(batch->memcg != mem))
+ memcg_oom_recover(mem);
return;
}
@@ -2163,7 +2324,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
- struct mem_cgroup_per_zone *mz;
if (mem_cgroup_disabled())
return NULL;
@@ -2188,7 +2348,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
switch (ctype) {
case MEM_CGROUP_CHARGE_TYPE_MAPPED:
case MEM_CGROUP_CHARGE_TYPE_DROP:
- if (page_mapped(page))
+ /* See mem_cgroup_prepare_migration() */
+ if (page_mapped(page) || PageCgroupMigration(pc))
goto unlock_out;
break;
case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
@@ -2202,10 +2363,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
break;
}
- if (!mem_cgroup_is_root(mem))
- __do_uncharge(mem, ctype);
- if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
- mem_cgroup_swap_statistics(mem, true);
mem_cgroup_charge_statistics(mem, pc, false);
ClearPageCgroupUsed(pc);
@@ -2216,13 +2373,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
* special functions.
*/
- mz = page_cgroup_zoneinfo(pc);
unlock_page_cgroup(pc);
-
+ /*
+ * even after unlock, we have mem->res.usage here and this memcg
+ * will never be freed.
+ */
memcg_check_events(mem, page);
- /* at swapout, this memcg will be accessed to record to swap */
- if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
- css_put(&mem->css);
+ if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
+ mem_cgroup_swap_statistics(mem, true);
+ mem_cgroup_get(mem);
+ }
+ if (!mem_cgroup_is_root(mem))
+ __do_uncharge(mem, ctype);
return mem;
@@ -2288,6 +2450,7 @@ void mem_cgroup_uncharge_end(void)
res_counter_uncharge(&batch->memcg->res, batch->bytes);
if (batch->memsw_bytes)
res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
+ memcg_oom_recover(batch->memcg);
/* forget this pointer (for sanity check) */
batch->memcg = NULL;
}
@@ -2308,13 +2471,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
memcg = __mem_cgroup_uncharge_common(page, ctype);
- /* record memcg information */
- if (do_swap_account && swapout && memcg) {
+ /*
+ * record memcg information, if swapout && memcg != NULL,
+ * mem_cgroup_get() was called in uncharge().
+ */
+ if (do_swap_account && swapout && memcg)
swap_cgroup_record(ent, css_id(&memcg->css));
- mem_cgroup_get(memcg);
- }
- if (swapout && memcg)
- css_put(&memcg->css);
}
#endif
@@ -2392,7 +2554,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
*/
if (!mem_cgroup_is_root(to))
res_counter_uncharge(&to->res, PAGE_SIZE);
- css_put(&to->css);
}
return 0;
}
@@ -2410,10 +2571,12 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
* Before starting migration, account PAGE_SIZE to mem_cgroup that the old
* page belongs to.
*/
-int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
+int mem_cgroup_prepare_migration(struct page *page,
+ struct page *newpage, struct mem_cgroup **ptr)
{
struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
+ enum charge_type ctype;
int ret = 0;
if (mem_cgroup_disabled())
@@ -2424,69 +2587,122 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
if (PageCgroupUsed(pc)) {
mem = pc->mem_cgroup;
css_get(&mem->css);
+ /*
+ * At migrating an anonymous page, its mapcount goes down
+ * to 0 and uncharge() will be called. But, even if it's fully
+ * unmapped, migration may fail and this page has to be
+ * charged again. We set MIGRATION flag here and delay uncharge
+ * until end_migration() is called
+ *
+ * Corner Case Thinking
+ * A)
+ * When the old page was mapped as Anon and it's unmap-and-freed
+ * while migration was ongoing.
+ * If unmap finds the old page, uncharge() of it will be delayed
+ * until end_migration(). If unmap finds a new page, it's
+ * uncharged when it makes the mapcount go from 1 to 0. If unmap code
+ * finds swap_migration_entry, the new page will not be mapped
+ * and end_migration() will find it (mapcount==0).
+ *
+ * B)
+ * When the old page was mapped but migration fails, the kernel
+ * remaps it. A charge for it is kept by MIGRATION flag even
+ * if mapcount goes down to 0. We can do remap successfully
+ * without charging it again.
+ *
+ * C)
+ * The "old" page is under lock_page() until the end of
+ * migration, so, the old page itself will not be swapped-out.
+ * If the new page is swapped out before end_migration, our
+ * hook to usual swap-out path will catch the event.
+ */
+ if (PageAnon(page))
+ SetPageCgroupMigration(pc);
}
unlock_page_cgroup(pc);
+ /*
+ * If the page is not charged at this point,
+ * we return here.
+ */
+ if (!mem)
+ return 0;
*ptr = mem;
- if (mem) {
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
- css_put(&mem->css);
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
+ css_put(&mem->css);/* drop extra refcnt */
+ if (ret || *ptr == NULL) {
+ if (PageAnon(page)) {
+ lock_page_cgroup(pc);
+ ClearPageCgroupMigration(pc);
+ unlock_page_cgroup(pc);
+ /*
+ * The old page may be fully unmapped while we kept it.
+ */
+ mem_cgroup_uncharge_page(page);
+ }
+ return -ENOMEM;
}
+ /*
+ * We charge new page before it's used/mapped. So, even if unlock_page()
+ * is called before end_migration, we can catch all events on this new
+ * page. In the case new page is migrated but not remapped, new page's
+ * mapcount will be finally 0 and we call uncharge in end_migration().
+ */
+ pc = lookup_page_cgroup(newpage);
+ if (PageAnon(page))
+ ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
+ else if (page_is_file_cache(page))
+ ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+ else
+ ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
+ __mem_cgroup_commit_charge(mem, pc, ctype);
return ret;
}
/* remove redundant charge if migration failed*/
void mem_cgroup_end_migration(struct mem_cgroup *mem,
- struct page *oldpage, struct page *newpage)
+ struct page *oldpage, struct page *newpage)
{
- struct page *target, *unused;
+ struct page *used, *unused;
struct page_cgroup *pc;
- enum charge_type ctype;
if (!mem)
return;
+ /* blocks rmdir() */
cgroup_exclude_rmdir(&mem->css);
/* at migration success, oldpage->mapping is NULL. */
if (oldpage->mapping) {
- target = oldpage;
- unused = NULL;
+ used = oldpage;
+ unused = newpage;
} else {
- target = newpage;
+ used = newpage;
unused = oldpage;
}
-
- if (PageAnon(target))
- ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
- else if (page_is_file_cache(target))
- ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
- else
- ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
-
- /* unused page is not on radix-tree now. */
- if (unused)
- __mem_cgroup_uncharge_common(unused, ctype);
-
- pc = lookup_page_cgroup(target);
/*
- * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup.
- * So, double-counting is effectively avoided.
+ * We disallowed uncharge of pages under migration because mapcount
+ * of the page goes down to zero, temporarily.
+ * Clear the flag and check whether the page should be charged.
*/
- __mem_cgroup_commit_charge(mem, pc, ctype);
+ pc = lookup_page_cgroup(oldpage);
+ lock_page_cgroup(pc);
+ ClearPageCgroupMigration(pc);
+ unlock_page_cgroup(pc);
+
+ __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
/*
- * Both of oldpage and newpage are still under lock_page().
- * Then, we don't have to care about race in radix-tree.
- * But we have to be careful that this page is unmapped or not.
- *
- * There is a case for !page_mapped(). At the start of
- * migration, oldpage was mapped. But now, it's zapped.
- * But we know *target* page is not freed/reused under us.
- * mem_cgroup_uncharge_page() does all necessary checks.
+ * If a page is a file cache, radix-tree replacement is very atomic
+ * and we can skip this check. When it was an Anon page, its mapcount
+ * goes down to 0. But because we added the MIGRATION flag, it's not
+ * uncharged yet. There are several cases, but the page->mapcount check
+ * and USED bit check in mem_cgroup_uncharge_page() will do enough
+ * check. (see prepare_charge() also)
*/
- if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
- mem_cgroup_uncharge_page(target);
+ if (PageAnon(used))
+ mem_cgroup_uncharge_page(used);
/*
- * At migration, we may charge account against cgroup which has no tasks
+ * At migration, we may charge account against cgroup which has no
+ * tasks.
* So, rmdir()->pre_destroy() can be called while we do this charge.
* In that case, we need to call pre_destroy() again. check it here.
*/
@@ -2524,10 +2740,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
unsigned long long val)
{
int retry_count;
- u64 memswlimit;
+ u64 memswlimit, memlimit;
int ret = 0;
int children = mem_cgroup_count_children(memcg);
u64 curusage, oldusage;
+ int enlarge;
/*
* For keeping hierarchical_reclaim simple, how long we should retry
@@ -2538,6 +2755,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
+ enlarge = 0;
while (retry_count) {
if (signal_pending(current)) {
ret = -EINTR;
@@ -2555,6 +2773,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
mutex_unlock(&set_limit_mutex);
break;
}
+
+ memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+ if (memlimit < val)
+ enlarge = 1;
+
ret = res_counter_set_limit(&memcg->res, val);
if (!ret) {
if (memswlimit == val)
@@ -2576,6 +2799,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
else
oldusage = curusage;
}
+ if (!ret && enlarge)
+ memcg_oom_recover(memcg);
return ret;
}
@@ -2584,9 +2809,10 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
unsigned long long val)
{
int retry_count;
- u64 memlimit, oldusage, curusage;
+ u64 memlimit, memswlimit, oldusage, curusage;
int children = mem_cgroup_count_children(memcg);
int ret = -EBUSY;
+ int enlarge = 0;
/* see mem_cgroup_resize_res_limit */
retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
@@ -2608,6 +2834,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
mutex_unlock(&set_limit_mutex);
break;
}
+ memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+ if (memswlimit < val)
+ enlarge = 1;
ret = res_counter_set_limit(&memcg->memsw, val);
if (!ret) {
if (memlimit == val)
@@ -2630,12 +2859,13 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
else
oldusage = curusage;
}
+ if (!ret && enlarge)
+ memcg_oom_recover(memcg);
return ret;
}
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
- gfp_t gfp_mask, int nid,
- int zid)
+ gfp_t gfp_mask)
{
unsigned long nr_reclaimed = 0;
struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -2647,7 +2877,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
if (order > 0)
return 0;
- mctz = soft_limit_tree_node_zone(nid, zid);
+ mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
/*
* This loop can run a while, specially if mem_cgroup's continuously
* keep exceeding their soft limit and putting the system under
@@ -2821,6 +3051,7 @@ move_account:
if (ret)
break;
}
+ memcg_oom_recover(mem);
/* it seems parent cgroup doesn't have enough mem */
if (ret == -ENOMEM)
goto try_to_free;
@@ -3311,9 +3542,9 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
rcu_read_lock();
if (!swap)
- t = rcu_dereference(memcg->thresholds);
+ t = rcu_dereference(memcg->thresholds.primary);
else
- t = rcu_dereference(memcg->memsw_thresholds);
+ t = rcu_dereference(memcg->memsw_thresholds.primary);
if (!t)
goto unlock;
@@ -3325,7 +3556,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
* If it's not true, a threshold was crossed after last
* call of __mem_cgroup_threshold().
*/
- i = atomic_read(&t->current_threshold);
+ i = t->current_threshold;
/*
* Iterate backward over array of thresholds starting from
@@ -3349,7 +3580,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
eventfd_signal(t->entries[i].eventfd, 1);
/* Update current_threshold */
- atomic_set(&t->current_threshold, i - 1);
+ t->current_threshold = i - 1;
unlock:
rcu_read_unlock();
}
@@ -3369,106 +3600,117 @@ static int compare_thresholds(const void *a, const void *b)
return _a->threshold - _b->threshold;
}
-static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft,
- struct eventfd_ctx *eventfd, const char *args)
+static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
+{
+ struct mem_cgroup_eventfd_list *ev;
+
+ list_for_each_entry(ev, &mem->oom_notify, list)
+ eventfd_signal(ev->eventfd, 1);
+ return 0;
+}
+
+static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
+{
+ mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb);
+}
+
+static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
+ struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
- struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
+ struct mem_cgroup_thresholds *thresholds;
+ struct mem_cgroup_threshold_ary *new;
int type = MEMFILE_TYPE(cft->private);
u64 threshold, usage;
- int size;
- int i, ret;
+ int i, size, ret;
ret = res_counter_memparse_write_strategy(args, &threshold);
if (ret)
return ret;
mutex_lock(&memcg->thresholds_lock);
+
if (type == _MEM)
- thresholds = memcg->thresholds;
+ thresholds = &memcg->thresholds;
else if (type == _MEMSWAP)
- thresholds = memcg->memsw_thresholds;
+ thresholds = &memcg->memsw_thresholds;
else
BUG();
usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
/* Check if a threshold crossed before adding a new one */
- if (thresholds)
+ if (thresholds->primary)
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
- if (thresholds)
- size = thresholds->size + 1;
- else
- size = 1;
+ size = thresholds->primary ? thresholds->primary->size + 1 : 1;
/* Allocate memory for new array of thresholds */
- thresholds_new = kmalloc(sizeof(*thresholds_new) +
- size * sizeof(struct mem_cgroup_threshold),
+ new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
GFP_KERNEL);
- if (!thresholds_new) {
+ if (!new) {
ret = -ENOMEM;
goto unlock;
}
- thresholds_new->size = size;
+ new->size = size;
/* Copy thresholds (if any) to new array */
- if (thresholds)
- memcpy(thresholds_new->entries, thresholds->entries,
- thresholds->size *
+ if (thresholds->primary) {
+ memcpy(new->entries, thresholds->primary->entries, (size - 1) *
sizeof(struct mem_cgroup_threshold));
+ }
+
/* Add new threshold */
- thresholds_new->entries[size - 1].eventfd = eventfd;
- thresholds_new->entries[size - 1].threshold = threshold;
+ new->entries[size - 1].eventfd = eventfd;
+ new->entries[size - 1].threshold = threshold;
/* Sort thresholds. Registering of new threshold isn't time-critical */
- sort(thresholds_new->entries, size,
- sizeof(struct mem_cgroup_threshold),
+ sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
compare_thresholds, NULL);
/* Find current threshold */
- atomic_set(&thresholds_new->current_threshold, -1);
+ new->current_threshold = -1;
for (i = 0; i < size; i++) {
- if (thresholds_new->entries[i].threshold < usage) {
+ if (new->entries[i].threshold < usage) {
/*
- * thresholds_new->current_threshold will not be used
- * until rcu_assign_pointer(), so it's safe to increment
+ * new->current_threshold will not be used until
+ * rcu_assign_pointer(), so it's safe to increment
* it here.
*/
- atomic_inc(&thresholds_new->current_threshold);
+ ++new->current_threshold;
}
}
- if (type == _MEM)
- rcu_assign_pointer(memcg->thresholds, thresholds_new);
- else
- rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
+ /* Free old spare buffer and save old primary buffer as spare */
+ kfree(thresholds->spare);
+ thresholds->spare = thresholds->primary;
+
+ rcu_assign_pointer(thresholds->primary, new);
- /* To be sure that nobody uses thresholds before freeing it */
+ /* To be sure that nobody uses thresholds */
synchronize_rcu();
- kfree(thresholds);
unlock:
mutex_unlock(&memcg->thresholds_lock);
return ret;
}
-static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft,
- struct eventfd_ctx *eventfd)
+static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
+ struct cftype *cft, struct eventfd_ctx *eventfd)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
- struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
+ struct mem_cgroup_thresholds *thresholds;
+ struct mem_cgroup_threshold_ary *new;
int type = MEMFILE_TYPE(cft->private);
u64 usage;
- int size = 0;
- int i, j, ret;
+ int i, j, size;
mutex_lock(&memcg->thresholds_lock);
if (type == _MEM)
- thresholds = memcg->thresholds;
+ thresholds = &memcg->thresholds;
else if (type == _MEMSWAP)
- thresholds = memcg->memsw_thresholds;
+ thresholds = &memcg->memsw_thresholds;
else
BUG();
@@ -3484,59 +3726,136 @@ static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft,
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
/* Calculate new number of threshold */
- for (i = 0; i < thresholds->size; i++) {
- if (thresholds->entries[i].eventfd != eventfd)
+ size = 0;
+ for (i = 0; i < thresholds->primary->size; i++) {
+ if (thresholds->primary->entries[i].eventfd != eventfd)
size++;
}
+ new = thresholds->spare;
+
/* Set thresholds array to NULL if we don't have thresholds */
if (!size) {
- thresholds_new = NULL;
- goto assign;
+ kfree(new);
+ new = NULL;
+ goto swap_buffers;
}
- /* Allocate memory for new array of thresholds */
- thresholds_new = kmalloc(sizeof(*thresholds_new) +
- size * sizeof(struct mem_cgroup_threshold),
- GFP_KERNEL);
- if (!thresholds_new) {
- ret = -ENOMEM;
- goto unlock;
- }
- thresholds_new->size = size;
+ new->size = size;
/* Copy thresholds and find current threshold */
- atomic_set(&thresholds_new->current_threshold, -1);
- for (i = 0, j = 0; i < thresholds->size; i++) {
- if (thresholds->entries[i].eventfd == eventfd)
+ new->current_threshold = -1;
+ for (i = 0, j = 0; i < thresholds->primary->size; i++) {
+ if (thresholds->primary->entries[i].eventfd == eventfd)
continue;
- thresholds_new->entries[j] = thresholds->entries[i];
- if (thresholds_new->entries[j].threshold < usage) {
+ new->entries[j] = thresholds->primary->entries[i];
+ if (new->entries[j].threshold < usage) {
/*
- * thresholds_new->current_threshold will not be used
+ * new->current_threshold will not be used
* until rcu_assign_pointer(), so it's safe to increment
* it here.
*/
- atomic_inc(&thresholds_new->current_threshold);
+ ++new->current_threshold;
}
j++;
}
-assign:
- if (type == _MEM)
- rcu_assign_pointer(memcg->thresholds, thresholds_new);
- else
- rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
+swap_buffers:
+ /* Swap primary and spare array */
+ thresholds->spare = thresholds->primary;
+ rcu_assign_pointer(thresholds->primary, new);
- /* To be sure that nobody uses thresholds before freeing it */
+ /* To be sure that nobody uses thresholds */
synchronize_rcu();
- kfree(thresholds);
-unlock:
mutex_unlock(&memcg->thresholds_lock);
+}
- return ret;
+static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
+ struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup_eventfd_list *event;
+ int type = MEMFILE_TYPE(cft->private);
+
+ BUG_ON(type != _OOM_TYPE);
+ event = kmalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ return -ENOMEM;
+
+ mutex_lock(&memcg_oom_mutex);
+
+ event->eventfd = eventfd;
+ list_add(&event->list, &memcg->oom_notify);
+
+ /* already in OOM ? */
+ if (atomic_read(&memcg->oom_lock))
+ eventfd_signal(eventfd, 1);
+ mutex_unlock(&memcg_oom_mutex);
+
+ return 0;
+}
+
+static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
+ struct cftype *cft, struct eventfd_ctx *eventfd)
+{
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup_eventfd_list *ev, *tmp;
+ int type = MEMFILE_TYPE(cft->private);
+
+ BUG_ON(type != _OOM_TYPE);
+
+ mutex_lock(&memcg_oom_mutex);
+
+ list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
+ if (ev->eventfd == eventfd) {
+ list_del(&ev->list);
+ kfree(ev);
+ }
+ }
+
+ mutex_unlock(&memcg_oom_mutex);
+}
+
+static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
+ struct cftype *cft, struct cgroup_map_cb *cb)
+{
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+
+ cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
+
+ if (atomic_read(&mem->oom_lock))
+ cb->fill(cb, "under_oom", 1);
+ else
+ cb->fill(cb, "under_oom", 0);
+ return 0;
+}
+
+static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
+ struct cftype *cft, u64 val)
+{
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup *parent;
+
+ /* cannot set to root cgroup and only 0 and 1 are allowed */
+ if (!cgrp->parent || !((val == 0) || (val == 1)))
+ return -EINVAL;
+
+ parent = mem_cgroup_from_cont(cgrp->parent);
+
+ cgroup_lock();
+ /* oom-kill-disable is a flag for subhierarchy. */
+ if ((parent->use_hierarchy) ||
+ (mem->use_hierarchy && !list_empty(&cgrp->children))) {
+ cgroup_unlock();
+ return -EINVAL;
+ }
+ mem->oom_kill_disable = val;
+ if (!val)
+ memcg_oom_recover(mem);
+ cgroup_unlock();
+ return 0;
}
static struct cftype mem_cgroup_files[] = {
@@ -3544,8 +3863,8 @@ static struct cftype mem_cgroup_files[] = {
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
.read_u64 = mem_cgroup_read,
- .register_event = mem_cgroup_register_event,
- .unregister_event = mem_cgroup_unregister_event,
+ .register_event = mem_cgroup_usage_register_event,
+ .unregister_event = mem_cgroup_usage_unregister_event,
},
{
.name = "max_usage_in_bytes",
@@ -3594,6 +3913,14 @@ static struct cftype mem_cgroup_files[] = {
.read_u64 = mem_cgroup_move_charge_read,
.write_u64 = mem_cgroup_move_charge_write,
},
+ {
+ .name = "oom_control",
+ .read_map = mem_cgroup_oom_control_read,
+ .write_u64 = mem_cgroup_oom_control_write,
+ .register_event = mem_cgroup_oom_register_event,
+ .unregister_event = mem_cgroup_oom_unregister_event,
+ .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
+ },
};
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -3602,8 +3929,8 @@ static struct cftype memsw_cgroup_files[] = {
.name = "memsw.usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
.read_u64 = mem_cgroup_read,
- .register_event = mem_cgroup_register_event,
- .unregister_event = mem_cgroup_unregister_event,
+ .register_event = mem_cgroup_usage_register_event,
+ .unregister_event = mem_cgroup_usage_unregister_event,
},
{
.name = "memsw.max_usage_in_bytes",
@@ -3831,6 +4158,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
} else {
parent = mem_cgroup_from_cont(cont->parent);
mem->use_hierarchy = parent->use_hierarchy;
+ mem->oom_kill_disable = parent->oom_kill_disable;
}
if (parent && parent->use_hierarchy) {
@@ -3849,6 +4177,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
}
mem->last_scanned_child = 0;
spin_lock_init(&mem->reclaim_param_lock);
+ INIT_LIST_HEAD(&mem->oom_notify);
if (parent)
mem->swappiness = get_swappiness(parent);
@@ -3922,9 +4251,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
goto one_by_one;
}
mc.precharge += count;
- VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
- WARN_ON_ONCE(count > INT_MAX);
- __css_get(&mem->css, (int)count);
return ret;
}
one_by_one:
@@ -3976,6 +4302,80 @@ enum mc_target_type {
MC_TARGET_SWAP,
};
+static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t ptent)
+{
+ struct page *page = vm_normal_page(vma, addr, ptent);
+
+ if (!page || !page_mapped(page))
+ return NULL;
+ if (PageAnon(page)) {
+ /* we don't move shared anon */
+ if (!move_anon() || page_mapcount(page) > 2)
+ return NULL;
+ } else if (!move_file())
+ /* we ignore mapcount for file pages */
+ return NULL;
+ if (!get_page_unless_zero(page))
+ return NULL;
+
+ return page;
+}
+
+static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t ptent, swp_entry_t *entry)
+{
+ int usage_count;
+ struct page *page = NULL;
+ swp_entry_t ent = pte_to_swp_entry(ptent);
+
+ if (!move_anon() || non_swap_entry(ent))
+ return NULL;
+ usage_count = mem_cgroup_count_swap_user(ent, &page);
+ if (usage_count > 1) { /* we don't move shared anon */
+ if (page)
+ put_page(page);
+ return NULL;
+ }
+ if (do_swap_account)
+ entry->val = ent.val;
+
+ return page;
+}
+
+static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t ptent, swp_entry_t *entry)
+{
+ struct page *page = NULL;
+ struct inode *inode;
+ struct address_space *mapping;
+ pgoff_t pgoff;
+
+ if (!vma->vm_file) /* anonymous vma */
+ return NULL;
+ if (!move_file())
+ return NULL;
+
+ inode = vma->vm_file->f_path.dentry->d_inode;
+ mapping = vma->vm_file->f_mapping;
+ if (pte_none(ptent))
+ pgoff = linear_page_index(vma, addr);
+ else /* pte_file(ptent) is true */
+ pgoff = pte_to_pgoff(ptent);
+
+ /* page is moved even if it's not RSS of this task (page-faulted). */
+ if (!mapping_cap_swap_backed(mapping)) { /* normal file */
+ page = find_get_page(mapping, pgoff);
+ } else { /* shmem/tmpfs file. we should take account of swap too. */
+ swp_entry_t ent;
+ mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
+ if (do_swap_account)
+ entry->val = ent.val;
+ }
+
+ return page;
+}
+
static int is_target_pte_for_mc(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
@@ -3983,43 +4383,16 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
struct page_cgroup *pc;
int ret = 0;
swp_entry_t ent = { .val = 0 };
- int usage_count = 0;
- bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON,
- &mc.to->move_charge_at_immigrate);
- if (!pte_present(ptent)) {
- /* TODO: handle swap of shmes/tmpfs */
- if (pte_none(ptent) || pte_file(ptent))
- return 0;
- else if (is_swap_pte(ptent)) {
- ent = pte_to_swp_entry(ptent);
- if (!move_anon || non_swap_entry(ent))
- return 0;
- usage_count = mem_cgroup_count_swap_user(ent, &page);
- }
- } else {
- page = vm_normal_page(vma, addr, ptent);
- if (!page || !page_mapped(page))
- return 0;
- /*
- * TODO: We don't move charges of file(including shmem/tmpfs)
- * pages for now.
- */
- if (!move_anon || !PageAnon(page))
- return 0;
- if (!get_page_unless_zero(page))
- return 0;
- usage_count = page_mapcount(page);
- }
- if (usage_count > 1) {
- /*
- * TODO: We don't move charges of shared(used by multiple
- * processes) pages for now.
- */
- if (page)
- put_page(page);
+ if (pte_present(ptent))
+ page = mc_handle_present_pte(vma, addr, ptent);
+ else if (is_swap_pte(ptent))
+ page = mc_handle_swap_pte(vma, addr, ptent, &ent);
+ else if (pte_none(ptent) || pte_file(ptent))
+ page = mc_handle_file_pte(vma, addr, ptent, &ent);
+
+ if (!page && !ent.val)
return 0;
- }
if (page) {
pc = lookup_page_cgroup(page);
/*
@@ -4035,8 +4408,8 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
if (!ret || !target)
put_page(page);
}
- /* throught */
- if (ent.val && do_swap_account && !ret &&
+ /* There is a swap entry and a page doesn't exist or isn't charged */
+ if (ent.val && !ret &&
css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
ret = MC_TARGET_SWAP;
if (target)
@@ -4077,9 +4450,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
};
if (is_vm_hugetlb_page(vma))
continue;
- /* TODO: We don't move charges of shmem/tmpfs pages for now. */
- if (vma->vm_flags & VM_SHARED)
- continue;
walk_page_range(vma->vm_start, vma->vm_end,
&mem_cgroup_count_precharge_walk);
}
@@ -4098,6 +4468,9 @@ static int mem_cgroup_precharge_mc(struct mm_struct *mm)
static void mem_cgroup_clear_mc(void)
{
+ struct mem_cgroup *from = mc.from;
+ struct mem_cgroup *to = mc.to;
+
/* we must uncharge all the leftover precharges from mc.to */
if (mc.precharge) {
__mem_cgroup_cancel_charge(mc.to, mc.precharge);
@@ -4113,7 +4486,6 @@ static void mem_cgroup_clear_mc(void)
}
/* we must fixup refcnts and charges */
if (mc.moved_swap) {
- WARN_ON_ONCE(mc.moved_swap > INT_MAX);
/* uncharge swap account from the old cgroup */
if (!mem_cgroup_is_root(mc.from))
res_counter_uncharge(&mc.from->memsw,
@@ -4127,16 +4499,18 @@ static void mem_cgroup_clear_mc(void)
*/
res_counter_uncharge(&mc.to->res,
PAGE_SIZE * mc.moved_swap);
- VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
- __css_put(&mc.to->css, mc.moved_swap);
}
/* we've already done mem_cgroup_get(mc.to) */
mc.moved_swap = 0;
}
+ spin_lock(&mc.lock);
mc.from = NULL;
mc.to = NULL;
mc.moving_task = NULL;
+ spin_unlock(&mc.lock);
+ memcg_oom_recover(from);
+ memcg_oom_recover(to);
wake_up_all(&mc.waitq);
}
@@ -4165,12 +4539,14 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);
VM_BUG_ON(mc.moving_task);
+ spin_lock(&mc.lock);
mc.from = from;
mc.to = mem;
mc.precharge = 0;
mc.moved_charge = 0;
mc.moved_swap = 0;
mc.moving_task = current;
+ spin_unlock(&mc.lock);
ret = mem_cgroup_precharge_mc(mm);
if (ret)
@@ -4274,9 +4650,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
};
if (is_vm_hugetlb_page(vma))
continue;
- /* TODO: We don't move charges of shmem/tmpfs pages for now. */
- if (vma->vm_flags & VM_SHARED)
- continue;
ret = walk_page_range(vma->vm_start, vma->vm_end,
&mem_cgroup_move_charge_walk);
if (ret)
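
The memcontrol.c hunks above expose the new eventfd-based notification interface: usage thresholds (mem_cgroup_usage_register_event) and OOM events (mem_cgroup_oom_register_event, memory.oom_control), both armed through cgroup.event_control. As a minimal userspace sketch of the intended consumer — the cgroup mount point and group name below are assumptions, not part of the patch:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	/* mount point and group name are assumptions */
	const char *grp = "/cgroup/memory/mygroup";
	char path[256], cmd[64];
	uint64_t count;
	int efd, oomfd, ctlfd;

	efd = eventfd(0, 0);
	snprintf(path, sizeof(path), "%s/memory.oom_control", grp);
	oomfd = open(path, O_RDONLY);
	snprintf(path, sizeof(path), "%s/cgroup.event_control", grp);
	ctlfd = open(path, O_WRONLY);
	if (efd < 0 || oomfd < 0 || ctlfd < 0) {
		perror("setup");
		return 1;
	}

	/* "<eventfd> <fd of memory.oom_control>" arms the OOM notification */
	snprintf(cmd, sizeof(cmd), "%d %d", efd, oomfd);
	if (write(ctlfd, cmd, strlen(cmd)) < 0) {
		perror("cgroup.event_control");
		return 1;
	}

	/* blocks until mem_cgroup_oom_notify() signals the eventfd */
	if (read(efd, &count, sizeof(count)) == sizeof(count))
		printf("cgroup hit OOM %llu time(s)\n",
		       (unsigned long long)count);
	return 0;
}

Registering a usage threshold works the same way, except the string written is "<eventfd> <fd of memory.usage_in_bytes> <threshold in bytes>", which is what lands in mem_cgroup_usage_register_event() above.
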
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 620b0b46159..6b44e52caca 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -45,6 +45,7 @@
#include <linux/page-isolation.h>
#include <linux/suspend.h>
#include <linux/slab.h>
+#include <linux/swapops.h>
#include "internal.h"
int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1296,3 +1297,35 @@ done:
/* keep elevated page count for bad page */
return ret;
}
+
+/*
+ * The caller must hold current->mm->mmap_sem in read mode.
+ */
+int is_hwpoison_address(unsigned long addr)
+{
+ pgd_t *pgdp;
+ pud_t pud, *pudp;
+ pmd_t pmd, *pmdp;
+ pte_t pte, *ptep;
+ swp_entry_t entry;
+
+ pgdp = pgd_offset(current->mm, addr);
+ if (!pgd_present(*pgdp))
+ return 0;
+ pudp = pud_offset(pgdp, addr);
+ pud = *pudp;
+ if (!pud_present(pud) || pud_large(pud))
+ return 0;
+ pmdp = pmd_offset(pudp, addr);
+ pmd = *pmdp;
+ if (!pmd_present(pmd) || pmd_large(pmd))
+ return 0;
+ ptep = pte_offset_map(pmdp, addr);
+ pte = *ptep;
+ pte_unmap(ptep);
+ if (!is_swap_pte(pte))
+ return 0;
+ entry = pte_to_swp_entry(pte);
+ return is_hwpoison_entry(entry);
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_address);
diff --git a/mm/memory.c b/mm/memory.c
index 833952d8b74..858829d06a9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -307,7 +307,6 @@ void free_pgd_range(struct mmu_gather *tlb,
{
pgd_t *pgd;
unsigned long next;
- unsigned long start;
/*
* The next few lines have given us lots of grief...
@@ -351,7 +350,6 @@ void free_pgd_range(struct mmu_gather *tlb,
if (addr > end - 1)
return;
- start = addr;
pgd = pgd_offset(tlb->mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -1227,8 +1225,17 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
-/*
- * Do a quick page-table lookup for a single page.
+/**
+ * follow_page - look up a page descriptor from a user-virtual address
+ * @vma: vm_area_struct mapping @address
+ * @address: virtual address to look up
+ * @flags: flags modifying lookup behaviour
+ *
+ * @flags can have FOLL_ flags set, defined in <linux/mm.h>
+ *
+ * Returns the mapped (struct page *), %NULL if no mapping exists, or
+ * an error pointer if there is a mapping to something not represented
+ * by a page descriptor (see also vm_normal_page()).
*/
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
@@ -1385,10 +1392,20 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
return i ? : -EFAULT;
}
if (pages) {
- struct page *page = vm_normal_page(gate_vma, start, *pte);
+ struct page *page;
+
+ page = vm_normal_page(gate_vma, start, *pte);
+ if (!page) {
+ if (!(gup_flags & FOLL_DUMP) &&
+ is_zero_pfn(pte_pfn(*pte)))
+ page = pte_page(*pte);
+ else {
+ pte_unmap(pte);
+ return i ? : -EFAULT;
+ }
+ }
pages[i] = page;
- if (page)
- get_page(page);
+ get_page(page);
}
pte_unmap(pte);
if (vmas)
@@ -1989,11 +2006,10 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
{
pgd_t *pgd;
unsigned long next;
- unsigned long start = addr, end = addr + size;
+ unsigned long end = addr + size;
int err;
BUG_ON(addr >= end);
- mmu_notifier_invalidate_range_start(mm, start, end);
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -2001,7 +2017,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
if (err)
break;
} while (pgd++, addr = next, addr != end);
- mmu_notifier_invalidate_range_end(mm, start, end);
+
return err;
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -2611,6 +2627,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
swp_entry_t entry;
pte_t pte;
struct mem_cgroup *ptr = NULL;
+ int exclusive = 0;
int ret = 0;
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@@ -2705,10 +2722,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
flags &= ~FAULT_FLAG_WRITE;
+ ret |= VM_FAULT_WRITE;
+ exclusive = 1;
}
flush_icache_page(vma, page);
set_pte_at(mm, address, page_table, pte);
- page_add_anon_rmap(page, vma, address);
+ do_page_add_anon_rmap(page, vma, address, exclusive);
/* It's better to call commit-charge after rmap is established */
mem_cgroup_commit_charge_swapin(page, ptr);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index be211a58293..a4cfcdc0045 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -415,12 +415,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
* This means the page allocator ignores this zone.
* So, zonelist must be updated after online.
*/
+ mutex_lock(&zonelists_mutex);
if (!populated_zone(zone))
need_zonelists_rebuild = 1;
ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
online_pages_range);
if (ret) {
+ mutex_unlock(&zonelists_mutex);
printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
nr_pages, pfn);
memory_notify(MEM_CANCEL_ONLINE, &arg);
@@ -429,8 +431,12 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
zone->present_pages += onlined_pages;
zone->zone_pgdat->node_present_pages += onlined_pages;
+ if (need_zonelists_rebuild)
+ build_all_zonelists(zone);
+ else
+ zone_pcp_update(zone);
- zone_pcp_update(zone);
+ mutex_unlock(&zonelists_mutex);
setup_per_zone_wmarks();
calculate_zone_inactive_ratio(zone);
if (onlined_pages) {
@@ -438,10 +444,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
}
- if (need_zonelists_rebuild)
- build_all_zonelists();
- else
- vm_total_pages = nr_free_pagecache_pages();
+ vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
@@ -482,6 +485,29 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
}
+/*
+ * called by cpu_up() to online a node without onlined memory.
+ */
+int mem_online_node(int nid)
+{
+ pg_data_t *pgdat;
+ int ret;
+
+ lock_system_sleep();
+ pgdat = hotadd_new_pgdat(nid, 0);
+ if (!pgdat) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ node_set_online(nid);
+ ret = register_one_node(nid);
+ BUG_ON(ret);
+
+out:
+ unlock_system_sleep();
+ return ret;
+}
+
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory(int nid, u64 start, u64 size)
{
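
mem_online_node() gives cpu_up() a way to online a memoryless node from inside the kernel; memory added through add_memory() is still brought online from userspace via the memory-block sysfs files, which end up in online_pages() above. A rough sketch, where the block number and path are assumptions about the running system:

#include <stdio.h>

int main(void)
{
	/* block number is an assumption; real blocks appear under sysfs */
	FILE *f = fopen("/sys/devices/system/memory/memory32/state", "w");

	if (!f) {
		perror("open state file");
		return 1;
	}
	/* triggers online_pages() for the sections backing this block */
	if (fputs("online", f) == EOF || fclose(f) == EOF) {
		perror("write online");
		return 1;
	}
	return 0;
}
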
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08f40a2f3fe..f969da5dd8a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -119,7 +119,22 @@ struct mempolicy default_policy = {
static const struct mempolicy_operations {
int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
- void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
+ /*
+ * If the read-side task has no lock to protect task->mempolicy, the
+ * write-side task rebinds task->mempolicy in two steps. The first step
+ * sets all the newly allowed nodes and the second step clears all the
+ * disallowed nodes. This way readers never see an empty nodemask and
+ * can always find a node to allocate a page from.
+ * If a lock protects task->mempolicy on the read side, we rebind
+ * directly.
+ *
+ * step:
+ * MPOL_REBIND_ONCE - do the rebind work at once
+ * MPOL_REBIND_STEP1 - set all the newly allowed nodes
+ * MPOL_REBIND_STEP2 - clear all the disallowed nodes
+ */
+ void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
+ enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];
/* Check that the nodemask contains at least one populated zone */
@@ -127,9 +142,6 @@ static int is_valid_nodemask(const nodemask_t *nodemask)
{
int nd, k;
- /* Check that there is something useful in this mask */
- k = policy_zone;
-
for_each_node_mask(nd, *nodemask) {
struct zone *z;
@@ -145,7 +157,7 @@ static int is_valid_nodemask(const nodemask_t *nodemask)
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
- return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
+ return pol->flags & MPOL_MODE_FLAGS;
}
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
@@ -277,12 +289,19 @@ void __mpol_put(struct mempolicy *p)
kmem_cache_free(policy_cache, p);
}
-static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
+static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
+ enum mpol_rebind_step step)
{
}
-static void mpol_rebind_nodemask(struct mempolicy *pol,
- const nodemask_t *nodes)
+/*
+ * step:
+ * MPOL_REBIND_ONCE - do the rebind work at once
+ * MPOL_REBIND_STEP1 - set all the newly allowed nodes
+ * MPOL_REBIND_STEP2 - clear all the disallowed nodes
+ */
+static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
+ enum mpol_rebind_step step)
{
nodemask_t tmp;
@@ -291,12 +310,31 @@ static void mpol_rebind_nodemask(struct mempolicy *pol,
else if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
else {
- nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
- *nodes);
- pol->w.cpuset_mems_allowed = *nodes;
+ /*
+ * if step == 1, we use ->w.cpuset_mems_allowed to cache the
+ * result
+ */
+ if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
+ nodes_remap(tmp, pol->v.nodes,
+ pol->w.cpuset_mems_allowed, *nodes);
+ pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
+ } else if (step == MPOL_REBIND_STEP2) {
+ tmp = pol->w.cpuset_mems_allowed;
+ pol->w.cpuset_mems_allowed = *nodes;
+ } else
+ BUG();
}
- pol->v.nodes = tmp;
+ if (nodes_empty(tmp))
+ tmp = *nodes;
+
+ if (step == MPOL_REBIND_STEP1)
+ nodes_or(pol->v.nodes, pol->v.nodes, tmp);
+ else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
+ pol->v.nodes = tmp;
+ else
+ BUG();
+
if (!node_isset(current->il_next, tmp)) {
current->il_next = next_node(current->il_next, tmp);
if (current->il_next >= MAX_NUMNODES)
@@ -307,7 +345,8 @@ static void mpol_rebind_nodemask(struct mempolicy *pol,
}
static void mpol_rebind_preferred(struct mempolicy *pol,
- const nodemask_t *nodes)
+ const nodemask_t *nodes,
+ enum mpol_rebind_step step)
{
nodemask_t tmp;
@@ -330,16 +369,45 @@ static void mpol_rebind_preferred(struct mempolicy *pol,
}
}
-/* Migrate a policy to a different set of nodes */
-static void mpol_rebind_policy(struct mempolicy *pol,
- const nodemask_t *newmask)
+/*
+ * mpol_rebind_policy - Migrate a policy to a different set of nodes
+ *
+ * If the read-side task has no lock to protect task->mempolicy, the
+ * write-side task rebinds task->mempolicy in two steps. The first step
+ * sets all the newly allowed nodes and the second step clears all the
+ * disallowed nodes. This way readers never see an empty nodemask and
+ * can always find a node to allocate a page from.
+ * If a lock protects task->mempolicy on the read side, we rebind
+ * directly.
+ *
+ * step:
+ * MPOL_REBIND_ONCE - do the rebind work at once
+ * MPOL_REBIND_STEP1 - set all the newly allowed nodes
+ * MPOL_REBIND_STEP2 - clear all the disallowed nodes
+ */
+static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
+ enum mpol_rebind_step step)
{
if (!pol)
return;
- if (!mpol_store_user_nodemask(pol) &&
+ if (!mpol_store_user_nodemask(pol) && step == 0 &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
- mpol_ops[pol->mode].rebind(pol, newmask);
+
+ if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
+ return;
+
+ if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
+ BUG();
+
+ if (step == MPOL_REBIND_STEP1)
+ pol->flags |= MPOL_F_REBINDING;
+ else if (step == MPOL_REBIND_STEP2)
+ pol->flags &= ~MPOL_F_REBINDING;
+ else if (step >= MPOL_REBIND_NSTEP)
+ BUG();
+
+ mpol_ops[pol->mode].rebind(pol, newmask, step);
}
/*
@@ -349,9 +417,10 @@ static void mpol_rebind_policy(struct mempolicy *pol,
* Called with task's alloc_lock held.
*/
-void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
+ enum mpol_rebind_step step)
{
- mpol_rebind_policy(tsk->mempolicy, new);
+ mpol_rebind_policy(tsk->mempolicy, new, step);
}
/*
@@ -366,7 +435,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
down_write(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next)
- mpol_rebind_policy(vma->vm_policy, new);
+ mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
up_write(&mm->mmap_sem);
}
@@ -859,7 +928,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
nodes_clear(nmask);
node_set(source, nmask);
- check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
+ check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist))
@@ -1206,33 +1275,42 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
const unsigned long __user *, new_nodes)
{
const struct cred *cred = current_cred(), *tcred;
- struct mm_struct *mm;
+ struct mm_struct *mm = NULL;
struct task_struct *task;
- nodemask_t old;
- nodemask_t new;
nodemask_t task_nodes;
int err;
+ nodemask_t *old;
+ nodemask_t *new;
+ NODEMASK_SCRATCH(scratch);
- err = get_nodes(&old, old_nodes, maxnode);
+ if (!scratch)
+ return -ENOMEM;
+
+ old = &scratch->mask1;
+ new = &scratch->mask2;
+
+ err = get_nodes(old, old_nodes, maxnode);
if (err)
- return err;
+ goto out;
- err = get_nodes(&new, new_nodes, maxnode);
+ err = get_nodes(new, new_nodes, maxnode);
if (err)
- return err;
+ goto out;
/* Find the mm_struct */
read_lock(&tasklist_lock);
task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
read_unlock(&tasklist_lock);
- return -ESRCH;
+ err = -ESRCH;
+ goto out;
}
mm = get_task_mm(task);
read_unlock(&tasklist_lock);
+ err = -EINVAL;
if (!mm)
- return -EINVAL;
+ goto out;
/*
* Check if this process has the right to modify the specified
@@ -1253,12 +1331,12 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
task_nodes = cpuset_mems_allowed(task);
/* Is the user allowed to access the target nodes? */
- if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
+ if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out;
}
- if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
+ if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
err = -EINVAL;
goto out;
}
@@ -1267,10 +1345,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
if (err)
goto out;
- err = do_migrate_pages(mm, &old, &new,
+ err = do_migrate_pages(mm, old, new,
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
- mmput(mm);
+ if (mm)
+ mmput(mm);
+ NODEMASK_SCRATCH_FREE(scratch);
+
return err;
}
@@ -1444,15 +1525,13 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
/*
* Normally, MPOL_BIND allocations are node-local within the
* allowed nodemask. However, if __GFP_THISNODE is set and the
- * current node is part of the mask, we use the zonelist for
+ * current node isn't part of the mask, we use the zonelist for
* the first node in the mask instead.
*/
if (unlikely(gfp & __GFP_THISNODE) &&
unlikely(!node_isset(nd, policy->v.nodes)))
nd = first_node(policy->v.nodes);
break;
- case MPOL_INTERLEAVE: /* should not happen */
- break;
default:
BUG();
}
@@ -1572,6 +1651,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
* to the struct mempolicy for conditional unref after allocation.
* If the effective policy is 'BIND, returns a pointer to the mempolicy's
* @nodemask for filtering the zonelist.
+ *
+ * Must be protected by get_mems_allowed()
*/
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
gfp_t gfp_flags, struct mempolicy **mpol,
@@ -1617,6 +1698,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
if (!(mask && current->mempolicy))
return false;
+ task_lock(current);
mempolicy = current->mempolicy;
switch (mempolicy->mode) {
case MPOL_PREFERRED:
@@ -1636,11 +1718,56 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
default:
BUG();
}
+ task_unlock(current);
return true;
}
#endif
+/*
+ * mempolicy_nodemask_intersects
+ *
+ * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
+ * policy. Otherwise, check for intersection between mask and the policy
+ * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
+ * policy, always return true since it may allocate elsewhere on fallback.
+ *
+ * Takes task_lock(tsk) to prevent freeing of its mempolicy.
+ */
+bool mempolicy_nodemask_intersects(struct task_struct *tsk,
+ const nodemask_t *mask)
+{
+ struct mempolicy *mempolicy;
+ bool ret = true;
+
+ if (!mask)
+ return ret;
+ task_lock(tsk);
+ mempolicy = tsk->mempolicy;
+ if (!mempolicy)
+ goto out;
+
+ switch (mempolicy->mode) {
+ case MPOL_PREFERRED:
+ /*
+ * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
+ * allocate from; the task may fall back to other nodes when OOM.
+ * Thus, it's possible for tsk to have allocated memory from
+ * nodes in mask.
+ */
+ break;
+ case MPOL_BIND:
+ case MPOL_INTERLEAVE:
+ ret = nodes_intersects(mempolicy->v.nodes, *mask);
+ break;
+ default:
+ BUG();
+ }
+out:
+ task_unlock(tsk);
+ return ret;
+}
+
/* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -1683,13 +1810,17 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
struct zonelist *zl;
+ struct page *page;
+ get_mems_allowed();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
mpol_cond_put(pol);
- return alloc_page_interleave(gfp, 0, nid);
+ page = alloc_page_interleave(gfp, 0, nid);
+ put_mems_allowed();
+ return page;
}
zl = policy_zonelist(gfp, pol);
if (unlikely(mpol_needs_cond_ref(pol))) {
@@ -1699,12 +1830,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
struct page *page = __alloc_pages_nodemask(gfp, 0,
zl, policy_nodemask(gfp, pol));
__mpol_put(pol);
+ put_mems_allowed();
return page;
}
/*
* fast path: default or task policy
*/
- return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+ page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+ put_mems_allowed();
+ return page;
}
/**
@@ -1729,18 +1863,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = current->mempolicy;
+ struct page *page;
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
+ get_mems_allowed();
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
*/
if (pol->mode == MPOL_INTERLEAVE)
- return alloc_page_interleave(gfp, order, interleave_nodes(pol));
- return __alloc_pages_nodemask(gfp, order,
+ page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
+ else
+ page = __alloc_pages_nodemask(gfp, order,
policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
+ put_mems_allowed();
+ return page;
}
EXPORT_SYMBOL(alloc_pages_current);
@@ -1750,6 +1889,9 @@ EXPORT_SYMBOL(alloc_pages_current);
* with the mems_allowed returned by cpuset_mems_allowed(). This
* keeps mempolicies cpuset relative after its cpuset moves. See
* further kernel/cpuset.c update_nodemask().
+ *
+ * current's mempolicy may be rebound by another task (the task that changes
+ * the cpuset's mems), so we needn't do the rebind work for the current task.
*/
/* Slow path of a mempolicy duplicate */
@@ -1759,13 +1901,24 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
if (!new)
return ERR_PTR(-ENOMEM);
+
+ /* task's mempolicy is protected by alloc_lock */
+ if (old == current->mempolicy) {
+ task_lock(current);
+ *new = *old;
+ task_unlock(current);
+ } else
+ *new = *old;
+
rcu_read_lock();
if (current_cpuset_is_being_rebound()) {
nodemask_t mems = cpuset_mems_allowed(current);
- mpol_rebind_policy(old, &mems);
+ if (new->flags & MPOL_F_REBINDING)
+ mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
+ else
+ mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
}
rcu_read_unlock();
- *new = *old;
atomic_set(&new->refcnt, 1);
return new;
}
@@ -1792,16 +1945,6 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
return tompol;
}
-static int mpol_match_intent(const struct mempolicy *a,
- const struct mempolicy *b)
-{
- if (a->flags != b->flags)
- return 0;
- if (!mpol_store_user_nodemask(a))
- return 1;
- return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
-}
-
/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
@@ -1809,8 +1952,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
return 0;
if (a->mode != b->mode)
return 0;
- if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
+ if (a->flags != b->flags)
return 0;
+ if (mpol_store_user_nodemask(a))
+ if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
+ return 0;
+
switch (a->mode) {
case MPOL_BIND:
/* Fall through */
@@ -2003,31 +2150,29 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
NODEMASK_SCRATCH(scratch);
if (!scratch)
- return;
+ goto put_mpol;
/* contextualize the tmpfs mount point mempolicy */
new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
- if (IS_ERR(new)) {
- mpol_put(mpol); /* drop our ref on sb mpol */
- NODEMASK_SCRATCH_FREE(scratch);
- return; /* no valid nodemask intersection */
- }
+ if (IS_ERR(new))
+ goto free_scratch; /* no valid nodemask intersection */
task_lock(current);
ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
task_unlock(current);
- mpol_put(mpol); /* drop our ref on sb mpol */
- if (ret) {
- NODEMASK_SCRATCH_FREE(scratch);
- mpol_put(new);
- return;
- }
+ if (ret)
+ goto put_new;
/* Create pseudo-vma that contains just the policy */
memset(&pvma, 0, sizeof(struct vm_area_struct));
pvma.vm_end = TASK_SIZE; /* policy covers entire file */
mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
+
+put_new:
mpol_put(new); /* drop initial ref */
+free_scratch:
NODEMASK_SCRATCH_FREE(scratch);
+put_mpol:
+ mpol_put(mpol); /* drop our incoming ref on sb mpol */
}
}
@@ -2132,9 +2277,15 @@ void numa_default_policy(void)
* "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
* Used only for mpol_parse_str() and mpol_to_str()
*/
-#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
-static const char * const policy_types[] =
- { "default", "prefer", "bind", "interleave", "local" };
+#define MPOL_LOCAL MPOL_MAX
+static const char * const policy_modes[] =
+{
+ [MPOL_DEFAULT] = "default",
+ [MPOL_PREFERRED] = "prefer",
+ [MPOL_BIND] = "bind",
+ [MPOL_INTERLEAVE] = "interleave",
+ [MPOL_LOCAL] = "local"
+};
#ifdef CONFIG_TMPFS
@@ -2159,12 +2310,11 @@ static const char * const policy_types[] =
int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
{
struct mempolicy *new = NULL;
- unsigned short uninitialized_var(mode);
+ unsigned short mode;
unsigned short uninitialized_var(mode_flags);
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
- int i;
int err = 1;
if (nodelist) {
@@ -2180,13 +2330,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
if (flags)
*flags++ = '\0'; /* terminate mode string */
- for (i = 0; i <= MPOL_LOCAL; i++) {
- if (!strcmp(str, policy_types[i])) {
- mode = i;
+ for (mode = 0; mode <= MPOL_LOCAL; mode++) {
+ if (!strcmp(str, policy_modes[mode])) {
break;
}
}
- if (i > MPOL_LOCAL)
+ if (mode > MPOL_LOCAL)
goto out;
switch (mode) {
@@ -2250,7 +2399,10 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
if (IS_ERR(new))
goto out;
- {
+ if (no_context) {
+ /* save for contextualization */
+ new->w.user_nodemask = nodes;
+ } else {
int ret;
NODEMASK_SCRATCH(scratch);
if (scratch) {
@@ -2266,10 +2418,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
}
}
err = 0;
- if (no_context) {
- /* save for contextualization */
- new->w.user_nodemask = nodes;
- }
out:
/* Restore string for error message */
@@ -2338,11 +2486,11 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
BUG();
}
- l = strlen(policy_types[mode]);
+ l = strlen(policy_modes[mode]);
if (buffer + maxlen < p + l + 1)
return -ENOSPC;
- strcpy(p, policy_types[mode]);
+ strcpy(p, policy_modes[mode]);
p += l;
if (flags & MPOL_MODE_FLAGS) {
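
The two-step rebind introduced above exists so that lockless readers of a task's nodemask never observe an empty mask while the cpuset's mems change underneath them: STEP1 only adds bits, STEP2 only removes them. A toy sketch of that invariant — plain C with a 64-bit atomic standing in for nodemask_t, not kernel code:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* one bit per node; starts with nodes 0-3 allowed */
static _Atomic uint64_t nodemask = 0x0fULL;

static void rebind(uint64_t newmask)
{
	/* STEP1: set the newly allowed nodes; readers now see old | new */
	atomic_fetch_or(&nodemask, newmask);
	/* STEP2: clear the disallowed nodes; readers now see exactly new */
	atomic_fetch_and(&nodemask, newmask);
	/* at no point in between could a reader observe an empty mask */
}

int main(void)
{
	rebind(0xf0ULL);	/* migrate the policy from nodes 0-3 to 4-7 */
	printf("mask is now %#llx\n",
	       (unsigned long long)atomic_load(&nodemask));
	return 0;
}
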
diff --git a/mm/migrate.c b/mm/migrate.c
index d3f3f7f8107..38e7cad782f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -40,7 +40,8 @@
/*
* migrate_prep() needs to be called before we start compiling a list of pages
- * to be migrated using isolate_lru_page().
+ * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
+ * undesirable, use migrate_prep_local()
*/
int migrate_prep(void)
{
@@ -55,26 +56,29 @@ int migrate_prep(void)
return 0;
}
+/* Do the necessary work of migrate_prep but not if it involves other CPUs */
+int migrate_prep_local(void)
+{
+ lru_add_drain();
+
+ return 0;
+}
+
/*
* Add isolated pages on the list back to the LRU under page lock
* to avoid leaking evictable pages back onto unevictable list.
- *
- * returns the number of pages put back.
*/
-int putback_lru_pages(struct list_head *l)
+void putback_lru_pages(struct list_head *l)
{
struct page *page;
struct page *page2;
- int count = 0;
list_for_each_entry_safe(page, page2, l, lru) {
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
putback_lru_page(page);
- count++;
}
- return count;
}
/*
@@ -490,7 +494,8 @@ static int fallback_migrate_page(struct address_space *mapping,
* < 0 - error code
* == 0 - success
*/
-static int move_to_new_page(struct page *newpage, struct page *page)
+static int move_to_new_page(struct page *newpage, struct page *page,
+ int remap_swapcache)
{
struct address_space *mapping;
int rc;
@@ -525,10 +530,12 @@ static int move_to_new_page(struct page *newpage, struct page *page)
else
rc = fallback_migrate_page(mapping, newpage, page);
- if (!rc)
- remove_migration_ptes(page, newpage);
- else
+ if (rc) {
newpage->mapping = NULL;
+ } else {
+ if (remap_swapcache)
+ remove_migration_ptes(page, newpage);
+ }
unlock_page(newpage);
@@ -545,9 +552,11 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
int rc = 0;
int *result = NULL;
struct page *newpage = get_new_page(page, private, &result);
+ int remap_swapcache = 1;
int rcu_locked = 0;
int charge = 0;
struct mem_cgroup *mem = NULL;
+ struct anon_vma *anon_vma = NULL;
if (!newpage)
return -ENOMEM;
@@ -581,7 +590,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
}
/* charge against new page */
- charge = mem_cgroup_prepare_migration(page, &mem);
+ charge = mem_cgroup_prepare_migration(page, newpage, &mem);
if (charge == -ENOMEM) {
rc = -ENOMEM;
goto unlock;
@@ -604,6 +613,34 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
if (PageAnon(page)) {
rcu_read_lock();
rcu_locked = 1;
+
+ /* Determine how to safely use anon_vma */
+ if (!page_mapped(page)) {
+ if (!PageSwapCache(page))
+ goto rcu_unlock;
+
+ /*
+ * We cannot be sure that the anon_vma of an unmapped
+ * swapcache page is safe to use because we don't
+ * know in advance if the VMA that this page belonged
+ * to still exists. If the VMA and others sharing the
+ * data have been freed, then the anon_vma could
+ * already be invalid.
+ *
+ * To avoid this possibility, swapcache pages get
+ * migrated but are not remapped when migration
+ * completes
+ */
+ remap_swapcache = 0;
+ } else {
+ /*
+ * Take a reference count on the anon_vma if the
+ * page is mapped so that it is guaranteed to
+ * exist when the page is remapped later
+ */
+ anon_vma = page_anon_vma(page);
+ get_anon_vma(anon_vma);
+ }
}
/*
@@ -638,11 +675,16 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
skip_unmap:
if (!page_mapped(page))
- rc = move_to_new_page(newpage, page);
+ rc = move_to_new_page(newpage, page, remap_swapcache);
- if (rc)
+ if (rc && remap_swapcache)
remove_migration_ptes(page, page);
rcu_unlock:
+
+ /* Drop an anon_vma reference if we took one */
+ if (anon_vma)
+ drop_anon_vma(anon_vma);
+
if (rcu_locked)
rcu_read_unlock();
uncharge:
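
The migrate.c changes above harden the core unmap_and_move() path (holding an anon_vma reference, skipping the remap of unmapped swapcache). From userspace the same machinery is typically driven through interfaces such as move_pages(2); a hedged sketch using the libnuma wrapper, assuming a kernel with at least two online NUMA nodes and numactl's headers installed (the target node number is an assumption):

#include <numaif.h>		/* move_pages(); link with -lnuma */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	void *buf = NULL;
	void *pages[1];
	int nodes[1] = { 1 };	/* assumed target node */
	int status[1];

	if (posix_memalign(&buf, psz, psz))
		return 1;
	memset(buf, 0, psz);	/* touch it so there is a page to migrate */

	pages[0] = buf;
	if (move_pages(0 /* current process */, 1, pages, nodes, status,
		       MPOL_MF_MOVE) < 0) {
		perror("move_pages");
		return 1;
	}
	printf("page now resides on node %d\n", status[0]);
	free(buf);
	return 0;
}
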
diff --git a/mm/mincore.c b/mm/mincore.c
index f77433c2027..9ac42dc6d7b 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -19,6 +19,40 @@
#include <asm/uaccess.h>
#include <asm/pgtable.h>
+static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ struct hstate *h;
+
+ h = hstate_vma(vma);
+ while (1) {
+ unsigned char present;
+ pte_t *ptep;
+ /*
+ * Huge pages are always in RAM for now, but
+ * theoretically it needs to be checked.
+ */
+ ptep = huge_pte_offset(current->mm,
+ addr & huge_page_mask(h));
+ present = ptep && !huge_pte_none(huge_ptep_get(ptep));
+ while (1) {
+ *vec = present;
+ vec++;
+ addr += PAGE_SIZE;
+ if (addr == end)
+ return;
+ /* check hugepage border */
+ if (!(addr & ~huge_page_mask(h)))
+ break;
+ }
+ }
+#else
+ BUG();
+#endif
+}
+
/*
* Later we can get more picky about what "in core" means precisely.
* For now, simply check to see if the page is in the page cache,
@@ -49,145 +83,150 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
return present;
}
-/*
- * Do a chunk of "sys_mincore()". We've already checked
- * all the arguments, we hold the mmap semaphore: we should
- * just return the amount of info we're asked for.
- */
-static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
+static void mincore_unmapped_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *ptep;
- spinlock_t *ptl;
- unsigned long nr;
+ unsigned long nr = (end - addr) >> PAGE_SHIFT;
int i;
- pgoff_t pgoff;
- struct vm_area_struct *vma = find_vma(current->mm, addr);
- /*
- * find_vma() didn't find anything above us, or we're
- * in an unmapped hole in the address space: ENOMEM.
- */
- if (!vma || addr < vma->vm_start)
- return -ENOMEM;
-
-#ifdef CONFIG_HUGETLB_PAGE
- if (is_vm_hugetlb_page(vma)) {
- struct hstate *h;
- unsigned long nr_huge;
- unsigned char present;
+ if (vma->vm_file) {
+ pgoff_t pgoff;
- i = 0;
- nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT);
- h = hstate_vma(vma);
- nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h))
- - (addr >> huge_page_shift(h)) + 1;
- nr_huge = min(nr_huge,
- (vma->vm_end - addr) >> huge_page_shift(h));
- while (1) {
- /* hugepage always in RAM for now,
- * but generally it needs to be check */
- ptep = huge_pte_offset(current->mm,
- addr & huge_page_mask(h));
- present = !!(ptep &&
- !huge_pte_none(huge_ptep_get(ptep)));
- while (1) {
- vec[i++] = present;
- addr += PAGE_SIZE;
- /* reach buffer limit */
- if (i == nr)
- return nr;
- /* check hugepage border */
- if (!((addr & ~huge_page_mask(h))
- >> PAGE_SHIFT))
- break;
- }
- }
- return nr;
+ pgoff = linear_page_index(vma, addr);
+ for (i = 0; i < nr; i++, pgoff++)
+ vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+ } else {
+ for (i = 0; i < nr; i++)
+ vec[i] = 0;
}
-#endif
-
- /*
- * Calculate how many pages there are left in the last level of the
- * PTE array for our address.
- */
- nr = PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1));
-
- /*
- * Don't overrun this vma
- */
- nr = min(nr, (vma->vm_end - addr) >> PAGE_SHIFT);
-
- /*
- * Don't return more than the caller asked for
- */
- nr = min(nr, pages);
+}
- pgd = pgd_offset(vma->vm_mm, addr);
- if (pgd_none_or_clear_bad(pgd))
- goto none_mapped;
- pud = pud_offset(pgd, addr);
- if (pud_none_or_clear_bad(pud))
- goto none_mapped;
- pmd = pmd_offset(pud, addr);
- if (pmd_none_or_clear_bad(pmd))
- goto none_mapped;
+static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+ unsigned long next;
+ spinlock_t *ptl;
+ pte_t *ptep;
ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) {
- unsigned char present;
+ do {
pte_t pte = *ptep;
+ pgoff_t pgoff;
- if (pte_present(pte)) {
- present = 1;
-
- } else if (pte_none(pte)) {
- if (vma->vm_file) {
- pgoff = linear_page_index(vma, addr);
- present = mincore_page(vma->vm_file->f_mapping,
- pgoff);
- } else
- present = 0;
-
- } else if (pte_file(pte)) {
+ next = addr + PAGE_SIZE;
+ if (pte_none(pte))
+ mincore_unmapped_range(vma, addr, next, vec);
+ else if (pte_present(pte))
+ *vec = 1;
+ else if (pte_file(pte)) {
pgoff = pte_to_pgoff(pte);
- present = mincore_page(vma->vm_file->f_mapping, pgoff);
-
+ *vec = mincore_page(vma->vm_file->f_mapping, pgoff);
} else { /* pte is a swap entry */
swp_entry_t entry = pte_to_swp_entry(pte);
+
if (is_migration_entry(entry)) {
/* migration entries are always uptodate */
- present = 1;
+ *vec = 1;
} else {
#ifdef CONFIG_SWAP
pgoff = entry.val;
- present = mincore_page(&swapper_space, pgoff);
+ *vec = mincore_page(&swapper_space, pgoff);
#else
WARN_ON(1);
- present = 1;
+ *vec = 1;
#endif
}
}
+ vec++;
+ } while (ptep++, addr = next, addr != end);
+ pte_unmap_unlock(ptep - 1, ptl);
+}
- vec[i] = present;
- }
- pte_unmap_unlock(ptep-1, ptl);
+static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+ unsigned long next;
+ pmd_t *pmd;
- return nr;
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none_or_clear_bad(pmd))
+ mincore_unmapped_range(vma, addr, next, vec);
+ else
+ mincore_pte_range(vma, pmd, addr, next, vec);
+ vec += (next - addr) >> PAGE_SHIFT;
+ } while (pmd++, addr = next, addr != end);
+}
-none_mapped:
- if (vma->vm_file) {
- pgoff = linear_page_index(vma, addr);
- for (i = 0; i < nr; i++, pgoff++)
- vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
- } else {
- for (i = 0; i < nr; i++)
- vec[i] = 0;
+static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+ unsigned long next;
+ pud_t *pud;
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud))
+ mincore_unmapped_range(vma, addr, next, vec);
+ else
+ mincore_pmd_range(vma, pud, addr, next, vec);
+ vec += (next - addr) >> PAGE_SHIFT;
+ } while (pud++, addr = next, addr != end);
+}
+
+static void mincore_page_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+ unsigned long next;
+ pgd_t *pgd;
+
+ pgd = pgd_offset(vma->vm_mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ mincore_unmapped_range(vma, addr, next, vec);
+ else
+ mincore_pud_range(vma, pgd, addr, next, vec);
+ vec += (next - addr) >> PAGE_SHIFT;
+ } while (pgd++, addr = next, addr != end);
+}
+
+/*
+ * Do a chunk of "sys_mincore()". We've already checked
+ * all the arguments, we hold the mmap semaphore: we should
+ * just return the amount of info we're asked for.
+ */
+static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
+{
+ struct vm_area_struct *vma;
+ unsigned long end;
+
+ vma = find_vma(current->mm, addr);
+ if (!vma || addr < vma->vm_start)
+ return -ENOMEM;
+
+ end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
+
+ if (is_vm_hugetlb_page(vma)) {
+ mincore_hugetlb_page_range(vma, addr, end, vec);
+ return (end - addr) >> PAGE_SHIFT;
}
- return nr;
+ end = pmd_addr_end(addr, end);
+
+ if (is_vm_hugetlb_page(vma))
+ mincore_hugetlb_page_range(vma, addr, end, vec);
+ else
+ mincore_page_range(vma, addr, end, vec);
+
+ return (end - addr) >> PAGE_SHIFT;
}
/*
@@ -247,7 +286,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
* the temporary buffer size.
*/
down_read(&current->mm->mmap_sem);
- retval = do_mincore(start, tmp, min(pages, PAGE_SIZE));
+ retval = do_mincore(start, min(pages, PAGE_SIZE), tmp);
up_read(&current->mm->mmap_sem);
if (retval <= 0)
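
The rework above replaces do_mincore()'s single flat loop with a conventional pgd/pud/pmd/pte range walk, but the user-visible contract of mincore(2) is unchanged: one byte per page, bit 0 set when the page is resident. A small userspace sketch exercising it (the file name is only an example):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>

int main(void)
{
	const char *name = "/etc/hosts";	/* any readable file */
	int fd = open(name, O_RDONLY);
	struct stat st;
	long psz = sysconf(_SC_PAGESIZE);

	if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
		return 1;

	size_t npages = (st.st_size + psz - 1) / psz;
	void *map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	unsigned char *vec = malloc(npages);

	if (map == MAP_FAILED || !vec)
		return 1;
	if (mincore(map, st.st_size, vec) < 0) {
		perror("mincore");
		return 1;
	}
	for (size_t i = 0; i < npages; i++)
		printf("page %zu: %s\n", i,
		       (vec[i] & 1) ? "resident" : "not resident");
	return 0;
}
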
diff --git a/mm/mlock.c b/mm/mlock.c
index 8f4e2dfceec..3f82720e051 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -607,44 +607,3 @@ void user_shm_unlock(size_t size, struct user_struct *user)
spin_unlock(&shmlock_user_lock);
free_uid(user);
}
-
-int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
- size_t size)
-{
- unsigned long lim, vm, pgsz;
- int error = -ENOMEM;
-
- pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
- down_write(&mm->mmap_sem);
-
- lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT;
- vm = mm->total_vm + pgsz;
- if (lim < vm)
- goto out;
-
- lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT;
- vm = mm->locked_vm + pgsz;
- if (lim < vm)
- goto out;
-
- mm->total_vm += pgsz;
- mm->locked_vm += pgsz;
-
- error = 0;
- out:
- up_write(&mm->mmap_sem);
- return error;
-}
-
-void refund_locked_memory(struct mm_struct *mm, size_t size)
-{
- unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
- down_write(&mm->mmap_sem);
-
- mm->total_vm -= pgsz;
- mm->locked_vm -= pgsz;
-
- up_write(&mm->mmap_sem);
-}
diff --git a/mm/mmap.c b/mm/mmap.c
index 456ec6f2788..31003338b97 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -452,12 +452,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
spin_lock(&mapping->i_mmap_lock);
vma->vm_truncate_count = mapping->truncate_count;
}
- anon_vma_lock(vma);
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
- anon_vma_unlock(vma);
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
@@ -506,6 +504,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
struct vm_area_struct *importer = NULL;
struct address_space *mapping = NULL;
struct prio_tree_root *root = NULL;
+ struct anon_vma *anon_vma = NULL;
struct file *file = vma->vm_file;
long adjust_next = 0;
int remove_next = 0;
@@ -578,6 +577,17 @@ again: remove_next = 1 + (end > next->vm_end);
}
}
+ /*
+ * When changing only vma->vm_end, we don't really need anon_vma
+ * lock. This is a fairly rare case by itself, but the anon_vma
+ * lock may be shared between many sibling processes. Skipping
+ * the lock for brk adjustments makes a difference sometimes.
+ */
+ if (vma->anon_vma && (insert || importer || start != vma->vm_start)) {
+ anon_vma = vma->anon_vma;
+ anon_vma_lock(anon_vma);
+ }
+
if (root) {
flush_dcache_mmap_lock(mapping);
vma_prio_tree_remove(vma, root);
@@ -617,6 +627,8 @@ again: remove_next = 1 + (end > next->vm_end);
__insert_vm_struct(mm, insert);
}
+ if (anon_vma)
+ anon_vma_unlock(anon_vma);
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
@@ -1710,7 +1722,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
*/
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
- anon_vma_lock(vma);
+ vma_lock_anon_vma(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -1721,7 +1733,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (address < PAGE_ALIGN(address+4))
address = PAGE_ALIGN(address+4);
else {
- anon_vma_unlock(vma);
+ vma_unlock_anon_vma(vma);
return -ENOMEM;
}
error = 0;
@@ -1734,10 +1746,12 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
grow = (address - vma->vm_end) >> PAGE_SHIFT;
error = acct_stack_growth(vma, size, grow);
- if (!error)
+ if (!error) {
vma->vm_end = address;
+ perf_event_mmap(vma);
+ }
}
- anon_vma_unlock(vma);
+ vma_unlock_anon_vma(vma);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1762,7 +1776,7 @@ static int expand_downwards(struct vm_area_struct *vma,
if (error)
return error;
- anon_vma_lock(vma);
+ vma_lock_anon_vma(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -1781,9 +1795,10 @@ static int expand_downwards(struct vm_area_struct *vma,
if (!error) {
vma->vm_start = address;
vma->vm_pgoff -= grow;
+ perf_event_mmap(vma);
}
}
- anon_vma_unlock(vma);
+ vma_unlock_anon_vma(vma);
return error;
}
@@ -2208,6 +2223,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
vma->vm_page_prot = vm_get_page_prot(flags);
vma_link(mm, vma, prev, rb_link, rb_parent);
out:
+ perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED) {
if (!mlock_vma_pages_range(vma, addr, addr + len))
@@ -2466,23 +2482,23 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
- if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+ if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
/*
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
- spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
+ spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem);
/*
* We can safely modify head.next after taking the
- * anon_vma->lock. If some other vma in this mm shares
+ * anon_vma->root->lock. If some other vma in this mm shares
* the same anon_vma we won't take it again.
*
* No need of atomic instructions here, head.next
* can't change from under us thanks to the
- * anon_vma->lock.
+ * anon_vma->root->lock.
*/
if (__test_and_set_bit(0, (unsigned long *)
- &anon_vma->head.next))
+ &anon_vma->root->head.next))
BUG();
}
}
@@ -2573,7 +2589,7 @@ out_unlock:
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
- if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+ if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
/*
* The LSB of head.next can't change to 0 from under
* us because we hold the mm_all_locks_mutex.
@@ -2584,12 +2600,12 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
*
* No need of atomic instructions here, head.next
* can't change from under us until we release the
- * anon_vma->lock.
+ * anon_vma->root->lock.
*/
if (!__test_and_clear_bit(0, (unsigned long *)
- &anon_vma->head.next))
+ &anon_vma->root->head.next))
BUG();
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
}
}
diff --git a/mm/msync.c b/mm/msync.c
index 4083209b7f0..632df4527c0 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -82,7 +82,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
(vma->vm_flags & VM_SHARED)) {
get_file(file);
up_read(&mm->mmap_sem);
- error = vfs_fsync(file, file->f_path.dentry, 0);
+ error = vfs_fsync(file, 0);
fput(file);
if (error || start >= end)
goto out;
diff --git a/mm/nommu.c b/mm/nommu.c
index 63fa17d121f..b76f3ee0abe 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -918,14 +918,6 @@ static int validate_mmap_request(struct file *file,
if (!(capabilities & BDI_CAP_MAP_DIRECT))
return -ENODEV;
- if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) ||
- ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
- ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP))
- ) {
- printk("MAP_SHARED not completely supported on !MMU\n");
- return -EINVAL;
- }
-
/* we mustn't privatise shared mappings */
capabilities &= ~BDI_CAP_MAP_COPY;
}
@@ -941,6 +933,20 @@ static int validate_mmap_request(struct file *file,
capabilities &= ~BDI_CAP_MAP_DIRECT;
}
+ if (capabilities & BDI_CAP_MAP_DIRECT) {
+ if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) ||
+ ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
+ ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP))
+ ) {
+ capabilities &= ~BDI_CAP_MAP_DIRECT;
+ if (flags & MAP_SHARED) {
+ printk(KERN_WARNING
+ "MAP_SHARED not completely supported on !MMU\n");
+ return -EINVAL;
+ }
+ }
+ }
+
/* handle executable mappings and implied executable
* mappings */
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
@@ -996,22 +1002,20 @@ static unsigned long determine_vm_flags(struct file *file,
unsigned long vm_flags;
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
- vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
/* vm_flags |= mm->def_flags; */
if (!(capabilities & BDI_CAP_MAP_DIRECT)) {
/* attempt to share read-only copies of mapped file chunks */
+ vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (file && !(prot & PROT_WRITE))
vm_flags |= VM_MAYSHARE;
- }
- else {
+ } else {
/* overlay a shareable mapping on the backing device or inode
* if possible - used for chardevs, ramfs/tmpfs/shmfs and
* romfs/cramfs */
+ vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS);
if (flags & MAP_SHARED)
- vm_flags |= VM_MAYSHARE | VM_SHARED;
- else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0)
- vm_flags |= VM_MAYSHARE;
+ vm_flags |= VM_SHARED;
}
/* refuse to let anyone share private mappings with this process if
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b68e802a7a7..5014e50644d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -4,6 +4,8 @@
* Copyright (C) 1998,2000 Rik van Riel
* Thanks go out to Claus Fischer for some serious inspiration and
* for goading me into coding this file...
+ * Copyright (C) 2010 Google, Inc.
+ * Rewritten by David Rientjes
*
* The routines in this file are used to kill a process when
* we're seriously out of memory. This gets called from __alloc_pages()
@@ -27,171 +29,188 @@
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
+#include <linux/mempolicy.h>
#include <linux/security.h>
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
-int sysctl_oom_dump_tasks;
+int sysctl_oom_dump_tasks = 1;
static DEFINE_SPINLOCK(zone_scan_lock);
-/* #define DEBUG */
+
+#ifdef CONFIG_NUMA
+/**
+ * has_intersects_mems_allowed() - check task eligibility for kill
+ * @tsk: task struct of which task to consider
+ * @mask: nodemask passed to page allocator for mempolicy ooms
+ *
+ * Task eligibility is determined by whether or not a candidate task, @tsk,
+ * shares the same mempolicy nodes as current if it is bound by such a policy
+ * and whether or not it has the same set of allowed cpuset nodes.
+ */
+static bool has_intersects_mems_allowed(struct task_struct *tsk,
+ const nodemask_t *mask)
+{
+ struct task_struct *start = tsk;
+
+ do {
+ if (mask) {
+ /*
+ * If this is a mempolicy constrained oom, tsk's
+ * cpuset is irrelevant. Only return true if its
+ * mempolicy intersects current, otherwise it may be
+ * needlessly killed.
+ */
+ if (mempolicy_nodemask_intersects(tsk, mask))
+ return true;
+ } else {
+ /*
+ * This is not a mempolicy constrained oom, so only
+ * check the mems of tsk's cpuset.
+ */
+ if (cpuset_mems_allowed_intersects(current, tsk))
+ return true;
+ }
+ } while_each_thread(start, tsk);
+
+ return false;
+}
+#else
+static bool has_intersects_mems_allowed(struct task_struct *tsk,
+ const nodemask_t *mask)
+{
+ return true;
+}
+#endif /* CONFIG_NUMA */
/*
- * Is all threads of the target process nodes overlap ours?
+ * If this is a system OOM (not a memcg OOM) and the task selected to be
+ * killed is not already running at high (RT) priorities, speed up the
+ * recovery by boosting the dying task to the lowest FIFO priority.
+ * That helps with the recovery and avoids interfering with RT tasks.
*/
-static int has_intersects_mems_allowed(struct task_struct *tsk)
+static void boost_dying_task_prio(struct task_struct *p,
+ struct mem_cgroup *mem)
{
- struct task_struct *t;
+ struct sched_param param = { .sched_priority = 1 };
+
+ if (mem)
+ return;
+
+ if (!rt_task(p))
+ sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
+}
+
+/*
+ * The process p may have detached its own ->mm while exiting or through
+ * use_mm(), but one or more of its subthreads may still have a valid
+ * pointer. Return p, or any of its subthreads with a valid ->mm, with
+ * task_lock() held.
+ */
+struct task_struct *find_lock_task_mm(struct task_struct *p)
+{
+ struct task_struct *t = p;
- t = tsk;
do {
- if (cpuset_mems_allowed_intersects(current, t))
- return 1;
- t = next_thread(t);
- } while (t != tsk);
+ task_lock(t);
+ if (likely(t->mm))
+ return t;
+ task_unlock(t);
+ } while_each_thread(p, t);
- return 0;
+ return NULL;
+}
+
+/* return true if the task is not an adequate candidate victim task. */
+static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem,
+ const nodemask_t *nodemask)
+{
+ if (is_global_init(p))
+ return true;
+ if (p->flags & PF_KTHREAD)
+ return true;
+
+ /* When called from mem_cgroup_out_of_memory() and p is not a member of the group */
+ if (mem && !task_in_mem_cgroup(p, mem))
+ return true;
+
+ /* p may not have freeable memory in nodemask */
+ if (!has_intersects_mems_allowed(p, nodemask))
+ return true;
+
+ return false;
}
/**
- * badness - calculate a numeric value for how bad this task has been
+ * oom_badness - heuristic function to determine which candidate task to kill
* @p: task struct of which task we should calculate
- * @uptime: current uptime in seconds
- *
- * The formula used is relatively simple and documented inline in the
- * function. The main rationale is that we want to select a good task
- * to kill when we run out of memory.
+ * @totalpages: total present RAM allowed for page allocation
*
- * Good in this context means that:
- * 1) we lose the minimum amount of work done
- * 2) we recover a large amount of memory
- * 3) we don't kill anything innocent of eating tons of memory
- * 4) we want to kill the minimum amount of processes (one)
- * 5) we try to kill the process the user expects us to kill, this
- * algorithm has been meticulously tuned to meet the principle
- * of least surprise ... (be careful when you change it)
+ * The heuristic for determining which task to kill is made to be as simple and
+ * predictable as possible. The goal is to return the highest value for the
+ * task consuming the most memory to avoid subsequent oom failures.
*/
-
-unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
+ const nodemask_t *nodemask, unsigned long totalpages)
{
- unsigned long points, cpu_time, run_time;
- struct mm_struct *mm;
- struct task_struct *child;
- int oom_adj = p->signal->oom_adj;
- struct task_cputime task_time;
- unsigned long utime;
- unsigned long stime;
+ int points;
- if (oom_adj == OOM_DISABLE)
+ if (oom_unkillable_task(p, mem, nodemask))
return 0;
- task_lock(p);
- mm = p->mm;
- if (!mm) {
- task_unlock(p);
+ p = find_lock_task_mm(p);
+ if (!p)
return 0;
- }
/*
- * The memory size of the process is the basis for the badness.
+ * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't
+ * need to be executed for something that cannot be killed.
*/
- points = mm->total_vm;
-
- /*
- * After this unlock we can no longer dereference local variable `mm'
- */
- task_unlock(p);
-
- /*
- * swapoff can easily use up all memory, so kill those first.
- */
- if (p->flags & PF_OOM_ORIGIN)
- return ULONG_MAX;
-
- /*
- * Processes which fork a lot of child processes are likely
- * a good choice. We add half the vmsize of the children if they
- * have an own mm. This prevents forking servers to flood the
- * machine with an endless amount of children. In case a single
- * child is eating the vast majority of memory, adding only half
- * to the parents will make the child our kill candidate of choice.
- */
- list_for_each_entry(child, &p->children, sibling) {
- task_lock(child);
- if (child->mm != mm && child->mm)
- points += child->mm->total_vm/2 + 1;
- task_unlock(child);
+ if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ task_unlock(p);
+ return 0;
}
/*
- * CPU time is in tens of seconds and run time is in thousands
- * of seconds. There is no particular reason for this other than
- * that it turned out to work very well in practice.
- */
- thread_group_cputime(p, &task_time);
- utime = cputime_to_jiffies(task_time.utime);
- stime = cputime_to_jiffies(task_time.stime);
- cpu_time = (utime + stime) >> (SHIFT_HZ + 3);
-
-
- if (uptime >= p->start_time.tv_sec)
- run_time = (uptime - p->start_time.tv_sec) >> 10;
- else
- run_time = 0;
-
- if (cpu_time)
- points /= int_sqrt(cpu_time);
- if (run_time)
- points /= int_sqrt(int_sqrt(run_time));
-
- /*
- * Niced processes are most likely less important, so double
- * their badness points.
+ * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
+ * priority for oom killing.
*/
- if (task_nice(p) > 0)
- points *= 2;
+ if (p->flags & PF_OOM_ORIGIN) {
+ task_unlock(p);
+ return 1000;
+ }
/*
- * Superuser processes are usually more important, so we make it
- * less likely that we kill those.
+ * The memory controller may have a limit of 0 bytes, so avoid a divide
+ * by zero, if necessary.
*/
- if (has_capability_noaudit(p, CAP_SYS_ADMIN) ||
- has_capability_noaudit(p, CAP_SYS_RESOURCE))
- points /= 4;
+ if (!totalpages)
+ totalpages = 1;
/*
- * We don't want to kill a process with direct hardware access.
- * Not only could that mess up the hardware, but usually users
- * tend to only have this flag set on applications they think
- * of as important.
+ * The baseline for the badness score is the proportion of RAM that each
+ * task's rss and swap space use.
*/
- if (has_capability_noaudit(p, CAP_SYS_RAWIO))
- points /= 4;
+ points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
+ totalpages;
+ task_unlock(p);
/*
- * If p's nodes don't overlap ours, it may still help to kill p
- * because p may have allocated or otherwise mapped memory on
- * this node before. However it will be less likely.
+ * Root processes get a 3% bonus, just like the __vm_enough_memory()
+ * implementation used by LSMs.
*/
- if (!has_intersects_mems_allowed(p))
- points /= 8;
+ if (has_capability_noaudit(p, CAP_SYS_ADMIN))
+ points -= 30;
/*
- * Adjust the score by oom_adj.
+ * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
+ * either completely disable oom killing or always prefer a certain
+ * task.
*/
- if (oom_adj) {
- if (oom_adj > 0) {
- if (!points)
- points = 1;
- points <<= oom_adj;
- } else
- points >>= -(oom_adj);
- }
+ points += p->signal->oom_score_adj;
-#ifdef DEBUG
- printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n",
- p->pid, p->comm, points);
-#endif
- return points;
+ if (points < 0)
+ return 0;
+ return (points < 1000) ? points : 1000;
}
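
The new heuristic above is easy to trace by hand. As an editorial aside (not part
of the patch), a minimal userspace sketch of the same arithmetic; the helper name
and the sample numbers are invented for illustration:

#include <stdio.h>

/* Same proportional scoring as the new oom_badness(), in plain arithmetic. */
static int badness_sketch(unsigned long rss, unsigned long swapents,
                          unsigned long totalpages, int is_root,
                          int oom_score_adj)
{
	int points;

	if (!totalpages)
		totalpages = 1;          /* guard against a zero-byte memcg limit */
	points = (rss + swapents) * 1000 / totalpages;
	if (is_root)
		points -= 30;            /* 3% bonus for CAP_SYS_ADMIN tasks */
	points += oom_score_adj;         /* userspace bias, -1000 .. +1000 */
	if (points < 0)
		return 0;
	return points < 1000 ? points : 1000;
}

int main(void)
{
	/* 200000 of 1000000 pages resident, root, no adjustment -> 170 */
	printf("%d\n", badness_sketch(200000, 0, 1000000, 1, 0));
	return 0;
}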
/*
@@ -199,12 +218,20 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
*/
#ifdef CONFIG_NUMA
static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask, nodemask_t *nodemask)
+ gfp_t gfp_mask, nodemask_t *nodemask,
+ unsigned long *totalpages)
{
struct zone *zone;
struct zoneref *z;
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ bool cpuset_limited = false;
+ int nid;
+
+ /* Default to all available memory */
+ *totalpages = totalram_pages + total_swap_pages;
+ if (!zonelist)
+ return CONSTRAINT_NONE;
/*
* Reach here only when __GFP_NOFAIL is used. So, we should avoid
* to kill current.We have to random task kill in this case.
@@ -214,26 +241,37 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
return CONSTRAINT_NONE;
/*
- * The nodemask here is a nodemask passed to alloc_pages(). Now,
- * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
- * feature. mempolicy is an only user of nodemask here.
- * check mempolicy's nodemask contains all N_HIGH_MEMORY
+ * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
+ * the page allocator means a mempolicy is in effect. Cpuset policy
+ * is enforced in get_page_from_freelist().
*/
- if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask))
+ if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
+ *totalpages = total_swap_pages;
+ for_each_node_mask(nid, *nodemask)
+ *totalpages += node_spanned_pages(nid);
return CONSTRAINT_MEMORY_POLICY;
+ }
/* Check this allocation failure is caused by cpuset's wall function */
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask)
if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
- return CONSTRAINT_CPUSET;
+ cpuset_limited = true;
+ if (cpuset_limited) {
+ *totalpages = total_swap_pages;
+ for_each_node_mask(nid, cpuset_current_mems_allowed)
+ *totalpages += node_spanned_pages(nid);
+ return CONSTRAINT_CPUSET;
+ }
return CONSTRAINT_NONE;
}
#else
static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask, nodemask_t *nodemask)
+ gfp_t gfp_mask, nodemask_t *nodemask,
+ unsigned long *totalpages)
{
+ *totalpages = totalram_pages + total_swap_pages;
return CONSTRAINT_NONE;
}
#endif
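
To make the denominator concrete (numbers invented for illustration): for a
mempolicy-constrained OOM bound to two nodes of 524288 spanned pages each, with
262144 pages of swap, *totalpages becomes 262144 + 2 * 524288 = 1310720, so a
task with 131072 resident pages starts from a badness of
131072 * 1000 / 1310720 = 100 before the root bonus and oom_score_adj apply.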
@@ -244,28 +282,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
*
* (not docbooked, we don't want this one cluttering up the manual)
*/
-static struct task_struct *select_bad_process(unsigned long *ppoints,
- struct mem_cgroup *mem)
+static struct task_struct *select_bad_process(unsigned int *ppoints,
+ unsigned long totalpages, struct mem_cgroup *mem,
+ const nodemask_t *nodemask)
{
struct task_struct *p;
struct task_struct *chosen = NULL;
- struct timespec uptime;
*ppoints = 0;
- do_posix_clock_monotonic_gettime(&uptime);
for_each_process(p) {
- unsigned long points;
+ unsigned int points;
- /*
- * skip kernel threads and tasks which have already released
- * their mm.
- */
- if (!p->mm)
- continue;
- /* skip the init task */
- if (is_global_init(p))
- continue;
- if (mem && !task_in_mem_cgroup(p, mem))
+ if (oom_unkillable_task(p, mem, nodemask))
continue;
/*
@@ -290,19 +318,16 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
* the process of exiting and releasing its resources.
* Otherwise we could get an easy OOM deadlock.
*/
- if (p->flags & PF_EXITING) {
+ if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) {
if (p != current)
return ERR_PTR(-1UL);
chosen = p;
- *ppoints = ULONG_MAX;
+ *ppoints = 1000;
}
- if (p->signal->oom_adj == OOM_DISABLE)
- continue;
-
- points = badness(p, uptime.tv_sec);
- if (points > *ppoints || !chosen) {
+ points = oom_badness(p, mem, nodemask, totalpages);
+ if (points > *ppoints) {
chosen = p;
*ppoints = points;
}
@@ -313,11 +338,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
/**
* dump_tasks - dump current memory state of all system tasks
- * @mem: target memory controller
+ * @mem: current's memory controller, if constrained
*
* Dumps the current memory state of all system tasks, excluding kernel threads.
* State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
- * score, and name.
+ * value, oom_score_adj value, and name.
*
* If the actual is non-NULL, only tasks that are a member of the mem_cgroup are
* shown.
@@ -326,44 +351,43 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
*/
static void dump_tasks(const struct mem_cgroup *mem)
{
- struct task_struct *g, *p;
-
- printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
- "name\n");
- do_each_thread(g, p) {
- struct mm_struct *mm;
+ struct task_struct *p;
+ struct task_struct *task;
- if (mem && !task_in_mem_cgroup(p, mem))
+ pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
+ for_each_process(p) {
+ if (p->flags & PF_KTHREAD)
continue;
- if (!thread_group_leader(p))
+ if (mem && !task_in_mem_cgroup(p, mem))
continue;
- task_lock(p);
- mm = p->mm;
- if (!mm) {
+ task = find_lock_task_mm(p);
+ if (!task) {
/*
- * total_vm and rss sizes do not exist for tasks with no
- * mm so there's no need to report them; they can't be
- * oom killed anyway.
+ * This is a kthread or all of p's threads have already
+ * detached their mm's. There's no need to report
+ * them; they can't be oom killed anyway.
*/
- task_unlock(p);
continue;
}
- printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
- p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
- get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
- p->comm);
- task_unlock(p);
- } while_each_thread(g, p);
+
+ pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n",
+ task->pid, __task_cred(task)->uid, task->tgid,
+ task->mm->total_vm, get_mm_rss(task->mm),
+ task_cpu(task), task->signal->oom_adj,
+ task->signal->oom_score_adj, task->comm);
+ task_unlock(task);
+ }
}
static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
struct mem_cgroup *mem)
{
- pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
- "oom_adj=%d\n",
- current->comm, gfp_mask, order, current->signal->oom_adj);
task_lock(current);
+ pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
+ "oom_adj=%d, oom_score_adj=%d\n",
+ current->comm, gfp_mask, order, current->signal->oom_adj,
+ current->signal->oom_score_adj);
cpuset_print_task_mems_allowed(current);
task_unlock(current);
dump_stack();
@@ -374,72 +398,43 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
}
#define K(x) ((x) << (PAGE_SHIFT-10))
-
-/*
- * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
- * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
- * set.
- */
-static void __oom_kill_task(struct task_struct *p, int verbose)
+static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
{
- if (is_global_init(p)) {
- WARN_ON(1);
- printk(KERN_WARNING "tried to kill init!\n");
- return;
- }
-
- task_lock(p);
- if (!p->mm) {
- WARN_ON(1);
- printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n",
- task_pid_nr(p), p->comm);
+ p = find_lock_task_mm(p);
+ if (!p) {
task_unlock(p);
- return;
+ return 1;
}
-
- if (verbose)
- printk(KERN_ERR "Killed process %d (%s) "
- "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
- task_pid_nr(p), p->comm,
- K(p->mm->total_vm),
- K(get_mm_counter(p->mm, MM_ANONPAGES)),
- K(get_mm_counter(p->mm, MM_FILEPAGES)));
+ pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+ task_pid_nr(p), p->comm, K(p->mm->total_vm),
+ K(get_mm_counter(p->mm, MM_ANONPAGES)),
+ K(get_mm_counter(p->mm, MM_FILEPAGES)));
task_unlock(p);
+
+ set_tsk_thread_flag(p, TIF_MEMDIE);
+ force_sig(SIGKILL, p);
+
/*
* We give our sacrificial lamb high priority and access to
* all the memory it needs. That way it should be able to
* exit() and clear out its resources quickly...
*/
- p->rt.time_slice = HZ;
- set_tsk_thread_flag(p, TIF_MEMDIE);
-
- force_sig(SIGKILL, p);
-}
-
-static int oom_kill_task(struct task_struct *p)
-{
- /* WARNING: mm may not be dereferenced since we did not obtain its
- * value from get_task_mm(p). This is OK since all we need to do is
- * compare mm to q->mm below.
- *
- * Furthermore, even if mm contains a non-NULL value, p->mm may
- * change to NULL at any time since we do not hold task_lock(p).
- * However, this is of no concern to us.
- */
- if (!p->mm || p->signal->oom_adj == OOM_DISABLE)
- return 1;
-
- __oom_kill_task(p, 1);
+ boost_dying_task_prio(p, mem);
return 0;
}
+#undef K
static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
- unsigned long points, struct mem_cgroup *mem,
+ unsigned int points, unsigned long totalpages,
+ struct mem_cgroup *mem, nodemask_t *nodemask,
const char *message)
{
- struct task_struct *c;
+ struct task_struct *victim = p;
+ struct task_struct *child;
+ struct task_struct *t = p;
+ unsigned int victim_points = 0;
if (printk_ratelimit())
dump_header(p, gfp_mask, order, mem);
@@ -449,43 +444,81 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* its children or threads, just set TIF_MEMDIE so it can die quickly
*/
if (p->flags & PF_EXITING) {
- __oom_kill_task(p, 0);
+ set_tsk_thread_flag(p, TIF_MEMDIE);
+ boost_dying_task_prio(p, mem);
return 0;
}
- printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
- message, task_pid_nr(p), p->comm, points);
+ task_lock(p);
+ pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
+ message, task_pid_nr(p), p->comm, points);
+ task_unlock(p);
- /* Try to kill a child first */
- list_for_each_entry(c, &p->children, sibling) {
- if (c->mm == p->mm)
- continue;
- if (mem && !task_in_mem_cgroup(c, mem))
- continue;
- if (!oom_kill_task(c))
- return 0;
+ /*
+ * If any of p's children has a different mm and is eligible for kill,
+ * the one with the highest badness() score is sacrificed for its
+ * parent. This attempts to lose the minimal amount of work done while
+ * still freeing memory.
+ */
+ do {
+ list_for_each_entry(child, &t->children, sibling) {
+ unsigned int child_points;
+
+ /*
+ * oom_badness() returns 0 if the thread is unkillable
+ */
+ child_points = oom_badness(child, mem, nodemask,
+ totalpages);
+ if (child_points > victim_points) {
+ victim = child;
+ victim_points = child_points;
+ }
+ }
+ } while_each_thread(p, t);
+
+ return oom_kill_task(victim, mem);
+}
+
+/*
+ * Determines whether the kernel must panic because of the panic_on_oom sysctl.
+ */
+static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
+ int order)
+{
+ if (likely(!sysctl_panic_on_oom))
+ return;
+ if (sysctl_panic_on_oom != 2) {
+ /*
+ * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
+ * does not panic for cpuset, mempolicy, or memcg allocation
+ * failures.
+ */
+ if (constraint != CONSTRAINT_NONE)
+ return;
}
- return oom_kill_task(p);
+ read_lock(&tasklist_lock);
+ dump_header(NULL, gfp_mask, order, NULL);
+ read_unlock(&tasklist_lock);
+ panic("Out of memory: %s panic_on_oom is enabled\n",
+ sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
{
- unsigned long points = 0;
+ unsigned long limit;
+ unsigned int points = 0;
struct task_struct *p;
- if (sysctl_panic_on_oom == 2)
- panic("out of memory(memcg). panic_on_oom is selected.\n");
+ check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0);
+ limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
read_lock(&tasklist_lock);
retry:
- p = select_bad_process(&points, mem);
- if (PTR_ERR(p) == -1UL)
+ p = select_bad_process(&points, limit, mem, NULL);
+ if (!p || PTR_ERR(p) == -1UL)
goto out;
- if (!p)
- p = current;
-
- if (oom_kill_process(p, gfp_mask, 0, points, mem,
+ if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL,
"Memory cgroup out of memory"))
goto retry;
out:
@@ -512,7 +545,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
* if a parallel OOM killing is already taking place that includes a zone in
* the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
*/
-int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
+int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
{
struct zoneref *z;
struct zone *zone;
@@ -529,7 +562,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
/*
* Lock each zone in the zonelist under zone_scan_lock so a
- * parallel invocation of try_set_zone_oom() doesn't succeed
+ * parallel invocation of try_set_zonelist_oom() doesn't succeed
* when it shouldn't.
*/
zone_set_flag(zone, ZONE_OOM_LOCKED);
@@ -558,65 +591,40 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
}
/*
- * Must be called with tasklist_lock held for read.
+ * Try to acquire the oom killer lock for all system zones. Returns zero if a
+ * parallel oom killing is taking place, otherwise locks all zones and returns
+ * non-zero.
*/
-static void __out_of_memory(gfp_t gfp_mask, int order)
+static int try_set_system_oom(void)
{
- struct task_struct *p;
- unsigned long points;
-
- if (sysctl_oom_kill_allocating_task)
- if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
- "Out of memory (oom_kill_allocating_task)"))
- return;
-retry:
- /*
- * Rambo mode: Shoot down a process and hope it solves whatever
- * issues we may have.
- */
- p = select_bad_process(&points, NULL);
-
- if (PTR_ERR(p) == -1UL)
- return;
-
- /* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p) {
- read_unlock(&tasklist_lock);
- dump_header(NULL, gfp_mask, order, NULL);
- panic("Out of memory and no killable processes...\n");
- }
+ struct zone *zone;
+ int ret = 1;
- if (oom_kill_process(p, gfp_mask, order, points, NULL,
- "Out of memory"))
- goto retry;
+ spin_lock(&zone_scan_lock);
+ for_each_populated_zone(zone)
+ if (zone_is_oom_locked(zone)) {
+ ret = 0;
+ goto out;
+ }
+ for_each_populated_zone(zone)
+ zone_set_flag(zone, ZONE_OOM_LOCKED);
+out:
+ spin_unlock(&zone_scan_lock);
+ return ret;
}
/*
- * pagefault handler calls into here because it is out of memory but
- * doesn't know exactly how or why.
+ * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
+ * attempts or page faults may now recall the oom killer, if necessary.
*/
-void pagefault_out_of_memory(void)
+static void clear_system_oom(void)
{
- unsigned long freed = 0;
-
- blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
- if (freed > 0)
- /* Got some memory back in the last second. */
- return;
-
- if (sysctl_panic_on_oom)
- panic("out of memory from page fault. panic_on_oom is selected.\n");
-
- read_lock(&tasklist_lock);
- __out_of_memory(0, 0); /* unknown gfp_mask and order */
- read_unlock(&tasklist_lock);
+ struct zone *zone;
- /*
- * Give "p" a good chance of killing itself before we
- * retry to allocate memory.
- */
- if (!test_thread_flag(TIF_MEMDIE))
- schedule_timeout_uninterruptible(1);
+ spin_lock(&zone_scan_lock);
+ for_each_populated_zone(zone)
+ zone_clear_flag(zone, ZONE_OOM_LOCKED);
+ spin_unlock(&zone_scan_lock);
}
/**
@@ -624,6 +632,7 @@ void pagefault_out_of_memory(void)
* @zonelist: zonelist pointer
* @gfp_mask: memory allocation flags
* @order: amount of memory being requested as a power of 2
+ * @nodemask: nodemask passed to page allocator
*
* If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse)
@@ -633,43 +642,68 @@ void pagefault_out_of_memory(void)
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *nodemask)
{
+ struct task_struct *p;
+ unsigned long totalpages;
unsigned long freed = 0;
- enum oom_constraint constraint;
+ unsigned int points;
+ enum oom_constraint constraint = CONSTRAINT_NONE;
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
if (freed > 0)
/* Got some memory back in the last second. */
return;
- if (sysctl_panic_on_oom == 2) {
- dump_header(NULL, gfp_mask, order, NULL);
- panic("out of memory. Compulsory panic_on_oom is selected.\n");
+ /*
+ * If current has a pending SIGKILL, then automatically select it. The
+ * goal is to allow it to allocate so that it may quickly exit and free
+ * its memory.
+ */
+ if (fatal_signal_pending(current)) {
+ set_thread_flag(TIF_MEMDIE);
+ boost_dying_task_prio(current, NULL);
+ return;
}
/*
* Check if there were limitations on the allocation (only relevant for
* NUMA) that may require different handling.
*/
- constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
+ constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
+ &totalpages);
+ check_panic_on_oom(constraint, gfp_mask, order);
+
read_lock(&tasklist_lock);
+ if (sysctl_oom_kill_allocating_task &&
+ !oom_unkillable_task(current, NULL, nodemask) &&
+ (current->signal->oom_adj != OOM_DISABLE)) {
+ /*
+ * oom_kill_process() needs tasklist_lock held. If it returns
+ * non-zero, current could not be killed so we must fall back to
+ * the tasklist scan.
+ */
+ if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
+ NULL, nodemask,
+ "Out of memory (oom_kill_allocating_task)"))
+ return;
+ }
- switch (constraint) {
- case CONSTRAINT_MEMORY_POLICY:
- oom_kill_process(current, gfp_mask, order, 0, NULL,
- "No available memory (MPOL_BIND)");
- break;
+retry:
+ p = select_bad_process(&points, totalpages, NULL,
+ constraint == CONSTRAINT_MEMORY_POLICY ? nodemask :
+ NULL);
+ if (PTR_ERR(p) == -1UL)
+ return;
- case CONSTRAINT_NONE:
- if (sysctl_panic_on_oom) {
- dump_header(NULL, gfp_mask, order, NULL);
- panic("out of memory. panic_on_oom is selected\n");
- }
- /* Fall-through */
- case CONSTRAINT_CPUSET:
- __out_of_memory(gfp_mask, order);
- break;
+ /* Found nothing?!?! Either we hang forever, or we panic. */
+ if (!p) {
+ dump_header(NULL, gfp_mask, order, NULL);
+ read_unlock(&tasklist_lock);
+ panic("Out of memory and no killable processes...\n");
}
+ if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
+ nodemask, "Out of memory"))
+ goto retry;
read_unlock(&tasklist_lock);
/*
@@ -679,3 +713,19 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
if (!test_thread_flag(TIF_MEMDIE))
schedule_timeout_uninterruptible(1);
}
+
+/*
+ * The pagefault handler calls here because it is out of memory, so kill a
+ * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel
+ * oom killing is already in progress so do nothing. If a task is found with
+ * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
+ */
+void pagefault_out_of_memory(void)
+{
+ if (try_set_system_oom()) {
+ out_of_memory(NULL, 0, 0, NULL);
+ clear_system_oom();
+ }
+ if (!test_thread_flag(TIF_MEMDIE))
+ schedule_timeout_uninterruptible(1);
+}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943ecf8..20890d80c7e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
#include <linux/syscalls.h>
#include <linux/buffer_head.h>
#include <linux/pagevec.h>
+#include <trace/events/writeback.h>
/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
@@ -252,32 +253,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
}
}
-/*
- * Clip the earned share of dirty pages to that which is actually available.
- * This avoids exceeding the total dirty_limit when the floating averages
- * fluctuate too quickly.
- */
-static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
- unsigned long dirty, unsigned long *pbdi_dirty)
-{
- unsigned long avail_dirty;
-
- avail_dirty = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_WRITEBACK) +
- global_page_state(NR_UNSTABLE_NFS) +
- global_page_state(NR_WRITEBACK_TEMP);
-
- if (avail_dirty < dirty)
- avail_dirty = dirty - avail_dirty;
- else
- avail_dirty = 0;
-
- avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
- bdi_stat(bdi, BDI_WRITEBACK);
-
- *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
-}
-
static inline void task_dirties_fraction(struct task_struct *tsk,
long *numerator, long *denominator)
{
@@ -286,16 +261,24 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
}
/*
- * scale the dirty limit
+ * task_dirty_limit - scale down dirty throttling threshold for one task
*
* task specific dirty limit:
*
* dirty -= (dirty/8) * p_{t}
+ *
+ * To protect light/slow dirtying tasks from heavier/fast ones, we start
+ * throttling individual tasks before reaching the bdi dirty limit.
+ * Relatively low thresholds will be allocated to heavy dirtiers. So when
+ * dirty pages grow large, heavy dirtiers will be throttled first, which will
+ * effectively curb the growth of dirty pages. Light dirtiers with high enough
+ * dirty threshold may never get throttled.
*/
-static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
+static unsigned long task_dirty_limit(struct task_struct *tsk,
+ unsigned long bdi_dirty)
{
long numerator, denominator;
- unsigned long dirty = *pdirty;
+ unsigned long dirty = bdi_dirty;
u64 inv = dirty >> 3;
task_dirties_fraction(tsk, &numerator, &denominator);
@@ -303,10 +286,8 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
do_div(inv, denominator);
dirty -= inv;
- if (dirty < *pdirty/2)
- dirty = *pdirty/2;
- *pdirty = dirty;
+ return max(dirty, bdi_dirty/2);
}
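
As a quick illustration of the scaling above (an editorial sketch with invented
numbers, not part of the patch; in the real code task_dirties_fraction() supplies
the numerator/denominator):

/* dirty -= (dirty/8) * task_share, floored at half the bdi limit */
static unsigned long task_dirty_limit_sketch(unsigned long bdi_dirty,
                                             long numerator, long denominator)
{
	unsigned long dirty = bdi_dirty;
	unsigned long inv = dirty >> 3;      /* dirty/8 */

	inv = inv * numerator / denominator; /* scale by the task's dirty share */
	dirty -= inv;

	return dirty > bdi_dirty / 2 ? dirty : bdi_dirty / 2;
}

/* e.g. bdi_dirty = 8000 pages, share = 1/2  ->  8000 - 500 = 7500 */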
/*
@@ -416,9 +397,16 @@ unsigned long determine_dirtyable_memory(void)
return x + 1; /* Ensure that we never return 0 */
}
-void
-get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
- unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
+/**
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ *
+ * Calculate the dirty thresholds based on sysctl parameters
+ * - vm.dirty_background_ratio or vm.dirty_background_bytes
+ * - vm.dirty_ratio or vm.dirty_bytes
+ * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+ * real-time tasks.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
unsigned long background;
unsigned long dirty;
@@ -450,27 +438,37 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
}
*pbackground = background;
*pdirty = dirty;
+}
- if (bdi) {
- u64 bdi_dirty;
- long numerator, denominator;
+/**
+ * bdi_dirty_limit - @bdi's share of dirty throttling threshold
+ *
+ * Allocate high/low dirty limits to fast/slow devices, in order to prevent
+ * - starving fast devices
+ * - piling up dirty pages (that will take a long time to sync) on slow devices
+ *
+ * The bdi's share of dirty limit will be adapting to its throughput and
+ * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
+ */
+unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
+{
+ u64 bdi_dirty;
+ long numerator, denominator;
- /*
- * Calculate this BDI's share of the dirty ratio.
- */
- bdi_writeout_fraction(bdi, &numerator, &denominator);
-
- bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
- bdi_dirty *= numerator;
- do_div(bdi_dirty, denominator);
- bdi_dirty += (dirty * bdi->min_ratio) / 100;
- if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
- bdi_dirty = dirty * bdi->max_ratio / 100;
-
- *pbdi_dirty = bdi_dirty;
- clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
- task_dirty_limit(current, pbdi_dirty);
- }
+ /*
+ * Calculate this BDI's share of the dirty ratio.
+ */
+ bdi_writeout_fraction(bdi, &numerator, &denominator);
+
+ bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
+ bdi_dirty *= numerator;
+ do_div(bdi_dirty, denominator);
+
+ bdi_dirty += (dirty * bdi->min_ratio) / 100;
+ if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
+ bdi_dirty = dirty * bdi->max_ratio / 100;
+
+ return bdi_dirty;
}
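
Worked through with invented numbers: with a global dirty limit of 1000 pages,
no registered min ratios (bdi_min_ratio = 0, bdi->min_ratio = 0),
bdi->max_ratio = 100 and a writeout fraction of 1/4, the device is allowed
1000 * (100 - 0) / 100 * 1/4 + 0 = 250 pages, well under the 1000-page
max_ratio cap.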
/*
@@ -490,30 +488,22 @@ static void balance_dirty_pages(struct address_space *mapping,
unsigned long bdi_thresh;
unsigned long pages_written = 0;
unsigned long pause = 1;
-
+ bool dirty_exceeded = false;
struct backing_dev_info *bdi = mapping->backing_dev_info;
for (;;) {
struct writeback_control wbc = {
- .bdi = bdi,
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.nr_to_write = write_chunk,
.range_cyclic = 1,
};
- get_dirty_limits(&background_thresh, &dirty_thresh,
- &bdi_thresh, bdi);
-
nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
nr_writeback = global_page_state(NR_WRITEBACK);
- bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
- bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
-
- if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
- break;
+ global_dirty_limits(&background_thresh, &dirty_thresh);
/*
* Throttle it only when the background writeback cannot
@@ -524,24 +514,8 @@ static void balance_dirty_pages(struct address_space *mapping,
(background_thresh + dirty_thresh) / 2)
break;
- if (!bdi->dirty_exceeded)
- bdi->dirty_exceeded = 1;
-
- /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
- * Unstable writes are a feature of certain networked
- * filesystems (i.e. NFS) in which data may have been
- * written to the server's write cache, but has not yet
- * been flushed to permanent storage.
- * Only move pages to writeback if this bdi is over its
- * threshold otherwise wait until the disk writes catch
- * up.
- */
- if (bdi_nr_reclaimable > bdi_thresh) {
- writeback_inodes_wbc(&wbc);
- pages_written += write_chunk - wbc.nr_to_write;
- get_dirty_limits(&background_thresh, &dirty_thresh,
- &bdi_thresh, bdi);
- }
+ bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+ bdi_thresh = task_dirty_limit(current, bdi_thresh);
/*
* In order to avoid the stacked BDI deadlock we need
@@ -556,16 +530,45 @@ static void balance_dirty_pages(struct address_space *mapping,
if (bdi_thresh < 2*bdi_stat_error(bdi)) {
bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
- } else if (bdi_nr_reclaimable) {
+ } else {
bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
}
- if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+ /*
+ * The bdi thresh is a somewhat "soft" limit derived from the
+ * global "hard" limit. The former helps to prevent a heavy-IO
+ * bdi or process from holding back light ones; the latter is
+ * the last-resort safeguard.
+ */
+ dirty_exceeded =
+ (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh)
+ || (nr_reclaimable + nr_writeback >= dirty_thresh);
+
+ if (!dirty_exceeded)
break;
- if (pages_written >= write_chunk)
- break; /* We've done our duty */
+ if (!bdi->dirty_exceeded)
+ bdi->dirty_exceeded = 1;
+
+ /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
+ * Unstable writes are a feature of certain networked
+ * filesystems (i.e. NFS) in which data may have been
+ * written to the server's write cache, but has not yet
+ * been flushed to permanent storage.
+ * Only move pages to writeback if this bdi is over its
+ * threshold otherwise wait until the disk writes catch
+ * up.
+ */
+ trace_wbc_balance_dirty_start(&wbc, bdi);
+ if (bdi_nr_reclaimable > bdi_thresh) {
+ writeback_inodes_wb(&bdi->wb, &wbc);
+ pages_written += write_chunk - wbc.nr_to_write;
+ trace_wbc_balance_dirty_written(&wbc, bdi);
+ if (pages_written >= write_chunk)
+ break; /* We've done our duty */
+ }
+ trace_wbc_balance_dirty_wait(&wbc, bdi);
__set_current_state(TASK_INTERRUPTIBLE);
io_schedule_timeout(pause);
@@ -578,8 +581,7 @@ static void balance_dirty_pages(struct address_space *mapping,
pause = HZ / 10;
}
- if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
- bdi->dirty_exceeded)
+ if (!dirty_exceeded && bdi->dirty_exceeded)
bdi->dirty_exceeded = 0;
if (writeback_in_progress(bdi))
@@ -594,10 +596,8 @@ static void balance_dirty_pages(struct address_space *mapping,
* background_thresh, to keep the amount of dirty memory low.
*/
if ((laptop_mode && pages_written) ||
- (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
- + global_page_state(NR_UNSTABLE_NFS))
- > background_thresh)))
- bdi_start_writeback(bdi, NULL, 0);
+ (!laptop_mode && (nr_reclaimable > background_thresh)))
+ bdi_start_background_writeback(bdi);
}
void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -660,7 +660,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
unsigned long dirty_thresh;
for ( ; ; ) {
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+ global_dirty_limits(&background_thresh, &dirty_thresh);
/*
* Boost the allowable dirty threshold a bit for page
@@ -683,10 +683,6 @@ void throttle_vm_writeout(gfp_t gfp_mask)
}
}
-static void laptop_timer_fn(unsigned long unused);
-
-static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
-
/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
@@ -694,24 +690,23 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec(table, write, buffer, length, ppos);
+ bdi_arm_supers_timer();
return 0;
}
-static void do_laptop_sync(struct work_struct *work)
-{
- wakeup_flusher_threads(0);
- kfree(work);
-}
-
-static void laptop_timer_fn(unsigned long unused)
+#ifdef CONFIG_BLOCK
+void laptop_mode_timer_fn(unsigned long data)
{
- struct work_struct *work;
+ struct request_queue *q = (struct request_queue *)data;
+ int nr_pages = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
- work = kmalloc(sizeof(*work), GFP_ATOMIC);
- if (work) {
- INIT_WORK(work, do_laptop_sync);
- schedule_work(work);
- }
+ /*
+ * We want to write everything out, not just down to the dirty
+ * threshold
+ */
+ if (bdi_has_dirty_io(&q->backing_dev_info))
+ bdi_start_writeback(&q->backing_dev_info, nr_pages);
}
/*
@@ -719,9 +714,9 @@ static void laptop_timer_fn(unsigned long unused)
* of all dirty data a few seconds from now. If the flush is already scheduled
* then push it back - the user is still using the disk.
*/
-void laptop_io_completion(void)
+void laptop_io_completion(struct backing_dev_info *info)
{
- mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
+ mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
}
/*
@@ -731,8 +726,16 @@ void laptop_io_completion(void)
*/
void laptop_sync_completion(void)
{
- del_timer(&laptop_mode_wb_timer);
+ struct backing_dev_info *bdi;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
+ del_timer(&bdi->laptop_mode_wb_timer);
+
+ rcu_read_unlock();
}
+#endif
/*
* If ratelimit_pages is too high then we can get into dirty-data overload
@@ -803,6 +806,41 @@ void __init page_writeback_init(void)
}
/**
+ * tag_pages_for_writeback - tag pages to be written by write_cache_pages
+ * @mapping: address space structure to write
+ * @start: starting page index
+ * @end: ending page index (inclusive)
+ *
+ * This function scans the page range from @start to @end (inclusive) and tags
+ * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
+ * that write_cache_pages (or whoever calls this function) will then use
+ * TOWRITE tag to identify pages eligible for writeback. This mechanism is
+ * used to avoid livelocking of writeback by a process steadily creating new
+ * dirty pages in the file (thus it is important for this function to be quick
+ * so that it can tag pages faster than a dirtying process can create them).
+ */
+/*
+ * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
+ */
+void tag_pages_for_writeback(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+#define WRITEBACK_TAG_BATCH 4096
+ unsigned long tagged;
+
+ do {
+ spin_lock_irq(&mapping->tree_lock);
+ tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
+ &start, end, WRITEBACK_TAG_BATCH,
+ PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
+ spin_unlock_irq(&mapping->tree_lock);
+ WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
+ cond_resched();
+ } while (tagged >= WRITEBACK_TAG_BATCH);
+}
+EXPORT_SYMBOL(tag_pages_for_writeback);
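
Condensed from the write_cache_pages() changes further down, the intended
calling pattern looks roughly like this (an editorial sketch, not a complete
function from the patch):

static void sync_range_sketch(struct address_space *mapping,
                              pgoff_t index, pgoff_t end)
{
	/* 1. Mark every page that is dirty right now as TOWRITE ... */
	tag_pages_for_writeback(mapping, index, end);

	/*
	 * 2. ... then iterate with pagevec_lookup_tag(&pvec, mapping, &index,
	 *    PAGECACHE_TAG_TOWRITE, ...) and write those pages back. Pages
	 *    dirtied after step 1 are not tagged, so a steady dirtier cannot
	 *    livelock a WB_SYNC_ALL sync.
	 */
}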
+
+/**
* write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
* @mapping: address space structure to write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
@@ -816,6 +854,13 @@ void __init page_writeback_init(void)
* the call was made get new I/O started against them. If wbc->sync_mode is
* WB_SYNC_ALL then we were called for data integrity and we must wait for
* existing IO to complete.
+ *
+ * To avoid livelocks (when other process dirties new pages), we first tag
+ * pages which should be written back with TOWRITE tag and only then start
+ * writing them. For data-integrity sync we have to be careful so that we do
+ * not miss some pages (e.g., because some other process has cleared TOWRITE
+ * tag we set). The rule we follow is that TOWRITE tag can be cleared only
+ * by the process clearing the DIRTY tag (and submitting the page for IO).
*/
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
@@ -831,7 +876,7 @@ int write_cache_pages(struct address_space *mapping,
pgoff_t done_index;
int cycled;
int range_whole = 0;
- long nr_to_write = wbc->nr_to_write;
+ int tag;
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
@@ -849,13 +894,18 @@ int write_cache_pages(struct address_space *mapping,
range_whole = 1;
cycled = 1; /* ignore range_cyclic tests */
}
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
retry:
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag_pages_for_writeback(mapping, index, end);
done_index = index;
while (!done && (index <= end)) {
int i;
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
break;
@@ -913,6 +963,7 @@ continue_unlock:
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
+ trace_wbc_writepage(wbc, mapping->backing_dev_info);
ret = (*writepage)(page, wbc, data);
if (unlikely(ret)) {
if (ret == AOP_WRITEPAGE_ACTIVATE) {
@@ -931,11 +982,10 @@ continue_unlock:
done = 1;
break;
}
- }
+ }
- if (nr_to_write > 0) {
- nr_to_write--;
- if (nr_to_write == 0 &&
+ if (wbc->nr_to_write > 0) {
+ if (--wbc->nr_to_write == 0 &&
wbc->sync_mode == WB_SYNC_NONE) {
/*
* We stop writing back only if we are
@@ -966,11 +1016,8 @@ continue_unlock:
end = writeback_index - 1;
goto retry;
}
- if (!wbc->no_nrwrite_index_update) {
- if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
- mapping->writeback_index = done_index;
- wbc->nr_to_write = nr_to_write;
- }
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = done_index;
return ret;
}
@@ -1315,6 +1362,9 @@ int test_set_page_writeback(struct page *page)
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_DIRTY);
+ radix_tree_tag_clear(&mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_TOWRITE);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestSetPageWriteback(page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d03c946d556..a9649f4b261 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -49,6 +49,7 @@
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/memory.h>
+#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <linux/ftrace_event.h>
@@ -56,6 +57,22 @@
#include <asm/div64.h>
#include "internal.h"
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+DEFINE_PER_CPU(int, numa_node);
+EXPORT_PER_CPU_SYMBOL(numa_node);
+#endif
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
+ * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
+ * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
+ * defined in <linux/topology.h>.
+ */
+DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
+EXPORT_PER_CPU_SYMBOL(_numa_mem_);
+#endif
+
/*
* Array of node states.
*/
@@ -475,6 +492,8 @@ static inline void __free_one_page(struct page *page,
int migratetype)
{
unsigned long page_idx;
+ unsigned long combined_idx;
+ struct page *buddy;
if (unlikely(PageCompound(page)))
if (unlikely(destroy_compound_page(page, order)))
@@ -488,9 +507,6 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON(bad_range(zone, page));
while (order < MAX_ORDER-1) {
- unsigned long combined_idx;
- struct page *buddy;
-
buddy = __page_find_buddy(page, page_idx, order);
if (!page_is_buddy(page, buddy, order))
break;
@@ -505,8 +521,29 @@ static inline void __free_one_page(struct page *page,
order++;
}
set_page_order(page, order);
- list_add(&page->lru,
- &zone->free_area[order].free_list[migratetype]);
+
+ /*
+ * If this is not the largest possible page, check if the buddy
+ * of the next-highest order is free. If it is, it's possible
+ * that pages are being freed that will coalesce soon. In case
+ * that is happening, add the free page to the tail of the list
+ * so it's less likely to be used soon and more likely to be merged
+ * as a higher order page
+ */
+ if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) {
+ struct page *higher_page, *higher_buddy;
+ combined_idx = __find_combined_index(page_idx, order);
+ higher_page = page + combined_idx - page_idx;
+ higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+ if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
+ list_add_tail(&page->lru,
+ &zone->free_area[order].free_list[migratetype]);
+ goto out;
+ }
+ }
+
+ list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
+out:
zone->free_area[order].nr_free++;
}
@@ -599,20 +636,23 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
spin_unlock(&zone->lock);
}
-static void __free_pages_ok(struct page *page, unsigned int order)
+static bool free_pages_prepare(struct page *page, unsigned int order)
{
- unsigned long flags;
int i;
int bad = 0;
- int wasMlocked = __TestClearPageMlocked(page);
trace_mm_page_free_direct(page, order);
kmemcheck_free_shadow(page, order);
- for (i = 0 ; i < (1 << order) ; ++i)
- bad += free_pages_check(page + i);
+ for (i = 0; i < (1 << order); i++) {
+ struct page *pg = page + i;
+
+ if (PageAnon(pg))
+ pg->mapping = NULL;
+ bad += free_pages_check(pg);
+ }
if (bad)
- return;
+ return false;
if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
@@ -622,6 +662,17 @@ static void __free_pages_ok(struct page *page, unsigned int order)
arch_free_page(page, order);
kernel_map_pages(page, 1 << order, 0);
+ return true;
+}
+
+static void __free_pages_ok(struct page *page, unsigned int order)
+{
+ unsigned long flags;
+ int wasMlocked = __TestClearPageMlocked(page);
+
+ if (!free_pages_prepare(page, order))
+ return;
+
local_irq_save(flags);
if (unlikely(wasMlocked))
free_page_mlock(page);
@@ -1107,21 +1158,9 @@ void free_hot_cold_page(struct page *page, int cold)
int migratetype;
int wasMlocked = __TestClearPageMlocked(page);
- trace_mm_page_free_direct(page, 0);
- kmemcheck_free_shadow(page, 0);
-
- if (PageAnon(page))
- page->mapping = NULL;
- if (free_pages_check(page))
+ if (!free_pages_prepare(page, 0))
return;
- if (!PageHighMem(page)) {
- debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
- debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
- }
- arch_free_page(page, 0);
- kernel_map_pages(page, 1, 0);
-
migratetype = get_pageblock_migratetype(page);
set_page_private(page, migratetype);
local_irq_save(flags);
@@ -1188,6 +1227,51 @@ void split_page(struct page *page, unsigned int order)
}
/*
+ * Similar to split_page except the page is already free. As this is only
+ * being used for migration, the migratetype of the block also changes.
+ * As this is called with interrupts disabled, the caller is responsible
+ * for calling arch_alloc_page() and kernel_map_pages() after interrupts
+ * are enabled.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+int split_free_page(struct page *page)
+{
+ unsigned int order;
+ unsigned long watermark;
+ struct zone *zone;
+
+ BUG_ON(!PageBuddy(page));
+
+ zone = page_zone(page);
+ order = page_order(page);
+
+ /* Obey watermarks as if the page was being allocated */
+ watermark = low_wmark_pages(zone) + (1 << order);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ return 0;
+
+ /* Remove page from free list */
+ list_del(&page->lru);
+ zone->free_area[order].nr_free--;
+ rmv_page_order(page);
+ __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
+
+ /* Split into individual pages */
+ set_page_refcounted(page);
+ split_page(page, order);
+
+ if (order >= pageblock_order - 1) {
+ struct page *endpage = page + (1 << order) - 1;
+ for (; page < endpage; page += pageblock_nr_pages)
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ }
+
+ return 1 << order;
+}
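
For context, a migration-oriented caller would be expected to use it roughly as
below (a hypothetical helper sketched by the editor, called under zone->lock
with interrupts disabled, as the comment above requires):

static unsigned long grab_free_block(struct page *page,
                                     struct list_head *freelist)
{
	unsigned long i, isolated;

	if (!PageBuddy(page))
		return 0;

	isolated = split_free_page(page);   /* 0 if watermarks forbid the split */
	for (i = 0; i < isolated; i++) {
		list_add(&page->lru, freelist);
		page++;
	}
	return isolated;
}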
+
+/*
* Really, prep_compound_page() should be called from __rmqueue_bulk(). But
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
@@ -1654,7 +1738,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
struct page *page;
/* Acquire the OOM killer lock for the zones in zonelist */
- if (!try_set_zone_oom(zonelist, gfp_mask)) {
+ if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
schedule_timeout_uninterruptible(1);
return NULL;
}
@@ -1675,6 +1759,9 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/* The OOM killer will not help higher order allocs */
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto out;
+ /* The OOM killer does not needlessly kill tasks for lowmem */
+ if (high_zoneidx < ZONE_NORMAL)
+ goto out;
/*
* GFP_THISNODE contains __GFP_NORETRY and we never hit this.
* Sanity check for bare calls of __GFP_THISNODE, not real OOM.
@@ -1693,6 +1780,62 @@ out:
return page;
}
+#ifdef CONFIG_COMPACTION
+/* Try memory compaction for high-order allocations before reclaim */
+static struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+ int migratetype, unsigned long *did_some_progress)
+{
+ struct page *page;
+
+ if (!order || compaction_deferred(preferred_zone))
+ return NULL;
+
+ *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
+ nodemask);
+ if (*did_some_progress != COMPACT_SKIPPED) {
+
+ /* Page migration frees to the PCP lists but we want merging */
+ drain_pages(get_cpu());
+ put_cpu();
+
+ page = get_page_from_freelist(gfp_mask, nodemask,
+ order, zonelist, high_zoneidx,
+ alloc_flags, preferred_zone,
+ migratetype);
+ if (page) {
+ preferred_zone->compact_considered = 0;
+ preferred_zone->compact_defer_shift = 0;
+ count_vm_event(COMPACTSUCCESS);
+ return page;
+ }
+
+ /*
+ * It's bad if a compaction run occurs and fails.
+ * The most likely reason is that pages exist,
+ * but not enough to satisfy watermarks.
+ */
+ count_vm_event(COMPACTFAIL);
+ defer_compaction(preferred_zone);
+
+ cond_resched();
+ }
+
+ return NULL;
+}
+#else
+static inline struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+ int migratetype, unsigned long *did_some_progress)
+{
+ return NULL;
+}
+#endif /* CONFIG_COMPACTION */
+
/* The really slow allocator path where we enter direct reclaim */
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -1879,6 +2022,15 @@ rebalance:
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage;
+ /* Try direct compaction */
+ page = __alloc_pages_direct_compact(gfp_mask, order,
+ zonelist, high_zoneidx,
+ nodemask,
+ alloc_flags, preferred_zone,
+ migratetype, &did_some_progress);
+ if (page)
+ goto got_pg;
+
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
@@ -1903,15 +2055,23 @@ rebalance:
if (page)
goto got_pg;
- /*
- * The OOM killer does not trigger for high-order
- * ~__GFP_NOFAIL allocations so if no progress is being
- * made, there are no other options and retrying is
- * unlikely to help.
- */
- if (order > PAGE_ALLOC_COSTLY_ORDER &&
- !(gfp_mask & __GFP_NOFAIL))
- goto nopage;
+ if (!(gfp_mask & __GFP_NOFAIL)) {
+ /*
+ * The oom killer is not called for high-order
+ * allocations that may fail, so if no progress
+ * is being made, there are no other options and
+ * retrying is unlikely to help.
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ goto nopage;
+ /*
+ * The oom killer is not called for lowmem
+ * allocations to prevent needlessly killing
+ * innocent tasks.
+ */
+ if (high_zoneidx < ZONE_NORMAL)
+ goto nopage;
+ }
goto restart;
}
@@ -1970,10 +2130,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
+ get_mems_allowed();
/* The preferred zone is used for statistics later */
first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
- if (!preferred_zone)
+ if (!preferred_zone) {
+ put_mems_allowed();
return NULL;
+ }
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -1983,6 +2146,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
+ put_mems_allowed();
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
return page;
@@ -2434,8 +2598,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
strncpy((char*)table->data, saved_string,
NUMA_ZONELIST_ORDER_LEN);
user_zonelist_order = oldval;
- } else if (oldval != user_zonelist_order)
- build_all_zonelists();
+ } else if (oldval != user_zonelist_order) {
+ mutex_lock(&zonelists_mutex);
+ build_all_zonelists(NULL);
+ mutex_unlock(&zonelists_mutex);
+ }
}
out:
mutex_unlock(&zl_order_mutex);
@@ -2579,10 +2746,10 @@ static int default_zonelist_order(void)
struct zone *z;
int average_size;
/*
- * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem.
+ * ZONE_DMA and ZONE_DMA32 can be very small areas in the system.
* If they are really small and used heavily, the system can fall
* into OOM very easily.
- * This function detect ZONE_DMA/DMA32 size and confgigures zone order.
+ * This function detects ZONE_DMA/DMA32 size and configures zone order.
*/
/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
low_kmem_size = 0;
@@ -2594,6 +2761,15 @@ static int default_zonelist_order(void)
if (zone_type < ZONE_NORMAL)
low_kmem_size += z->present_pages;
total_size += z->present_pages;
+ } else if (zone_type == ZONE_NORMAL) {
+ /*
+ * If any node has only lowmem, then node order
+ * is preferred to allow kernel allocations
+ * locally; otherwise, they can easily infringe
+ * on other nodes when there is an abundance of
+ * lowmem available to allocate from.
+ */
+ return ZONELIST_ORDER_NODE;
}
}
}
@@ -2707,6 +2883,24 @@ static void build_zonelist_cache(pg_data_t *pgdat)
zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
}
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * Return node id of node used for "local" allocations.
+ * I.e., first node id of first zone in arg node's generic zonelist.
+ * Used for initializing percpu 'numa_mem', which is used primarily
+ * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
+ */
+int local_memory_node(int node)
+{
+ struct zone *zone;
+
+ (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
+ gfp_zone(GFP_KERNEL),
+ NULL,
+ &zone);
+ return zone->node;
+}
+#endif
#else /* CONFIG_NUMA */
@@ -2776,9 +2970,16 @@ static void build_zonelist_cache(pg_data_t *pgdat)
*/
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static void setup_zone_pageset(struct zone *zone);
+
+/*
+ * Global mutex to protect against size modification of zonelists
+ * as well as to serialize pageset setup for the new populated zone.
+ */
+DEFINE_MUTEX(zonelists_mutex);
/* return values int ....just for stop_machine() */
-static int __build_all_zonelists(void *dummy)
+static __init_refok int __build_all_zonelists(void *data)
{
int nid;
int cpu;
@@ -2793,6 +2994,14 @@ static int __build_all_zonelists(void *dummy)
build_zonelist_cache(pgdat);
}
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /* Setup real pagesets for the new zone */
+ if (data) {
+ struct zone *zone = data;
+ setup_zone_pageset(zone);
+ }
+#endif
+
/*
* Initialize the boot_pagesets that are going to be used
* for bootstrapping processors. The real pagesets for
@@ -2806,13 +3015,31 @@ static int __build_all_zonelists(void *dummy)
* needs the percpu allocator in order to allocate its pagesets
* (a chicken-egg dilemma).
*/
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+ /*
+ * We now know the "local memory node" for each node--
+ * i.e., the node of the first zone in the generic zonelist.
+ * Set up numa_mem percpu variable for on-line cpus. During
+ * boot, only the boot cpu should be on-line; we'll init the
+ * secondary cpus' numa_mem as they come on-line. During
+ * node/memory hotplug, we'll fixup all on-line cpus.
+ */
+ if (cpu_online(cpu))
+ set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
+#endif
+ }
+
return 0;
}
-void build_all_zonelists(void)
+/*
+ * Called with zonelists_mutex held always
+ * unless system_state == SYSTEM_BOOTING.
+ */
+void build_all_zonelists(void *data)
{
set_zonelist_order();
@@ -2823,7 +3050,7 @@ void build_all_zonelists(void)
} else {
/* we have to stop all cpus to guarantee there is no user
of zonelist */
- stop_machine(__build_all_zonelists, NULL, NULL);
+ stop_machine(__build_all_zonelists, data, NULL);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
@@ -3146,31 +3373,34 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
pcp->batch = PAGE_SHIFT * 8;
}
+static __meminit void setup_zone_pageset(struct zone *zone)
+{
+ int cpu;
+
+ zone->pageset = alloc_percpu(struct per_cpu_pageset);
+
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+
+ setup_pageset(pcp, zone_batchsize(zone));
+
+ if (percpu_pagelist_fraction)
+ setup_pagelist_highmark(pcp,
+ (zone->present_pages /
+ percpu_pagelist_fraction));
+ }
+}
+
/*
* Allocate per cpu pagesets and initialize them.
* Before this call only boot pagesets were available.
- * Boot pagesets will no longer be used by this processorr
- * after setup_per_cpu_pageset().
*/
void __init setup_per_cpu_pageset(void)
{
struct zone *zone;
- int cpu;
-
- for_each_populated_zone(zone) {
- zone->pageset = alloc_percpu(struct per_cpu_pageset);
- for_each_possible_cpu(cpu) {
- struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
-
- setup_pageset(pcp, zone_batchsize(zone));
-
- if (percpu_pagelist_fraction)
- setup_pagelist_highmark(pcp,
- (zone->present_pages /
- percpu_pagelist_fraction));
- }
- }
+ for_each_populated_zone(zone)
+ setup_zone_pageset(zone);
}
static noinline __init_refok
@@ -3415,6 +3645,9 @@ void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
int i;
void *ptr;
+ if (limit > get_max_mapped())
+ limit = get_max_mapped();
+
/* need to go over early_node_map to find out good range for node */
for_each_active_range_index_in_nid(i, nid) {
u64 addr;
@@ -3440,6 +3673,11 @@ void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
ptr = phys_to_virt(addr);
memset(ptr, 0, size);
reserve_early_without_check(addr, addr + size, "BOOTMEM");
+ /*
+ * The min_count is set to 0 so that bootmem allocated blocks
+ * are never reported as leaks.
+ */
+ kmemleak_alloc(ptr, size, 0, 0);
return ptr;
}
@@ -3862,8 +4100,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone_seqlock_init(zone);
zone->zone_pgdat = pgdat;
- zone->prev_priority = DEF_PRIORITY;
-
zone_pcp_init(zone);
for_each_lru(l) {
INIT_LIST_HEAD(&zone->lru[l].list);
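The page_alloc.c hunks above change build_all_zonelists() to take a pointer (a newly populated zone during memory hotplug) and to require zonelists_mutex except at boot. A minimal sketch of how a hotplug caller would be expected to use the new interface follows; the example_online_zone() wrapper and its surrounding onlining logic are invented for illustration and are not part of this patch.

/*
 * Illustrative only: example_online_zone() is not from this patch.
 * It assumes the usual kernel context (struct zone, zonelists_mutex
 * and build_all_zonelists() declared as in the hunks above).
 */
static int example_online_zone(struct zone *zone)
{
	/* Serialize zonelist rebuilds and pageset setup (see above). */
	mutex_lock(&zonelists_mutex);

	/*
	 * Passing the zone lets __build_all_zonelists() call
	 * setup_zone_pageset() for the newly populated zone under
	 * CONFIG_MEMORY_HOTPLUG, as shown in the hunk above.
	 */
	build_all_zonelists(zone);

	mutex_unlock(&zonelists_mutex);
	return 0;
}

Passing NULL keeps the plain rebuild behaviour, which is what the numa_zonelist_order sysctl handler above does.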
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 6c0081441a3..5bffada7cde 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -9,6 +9,7 @@
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
+#include <linux/kmemleak.h>
static void __meminit
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
@@ -126,6 +127,12 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
if (!base)
base = vmalloc(table_size);
}
+ /*
+ * The value stored in section->page_cgroup is (base - pfn)
+ * and it does not point to the memory block allocated above,
+ * causing kmemleak false positives.
+ */
+ kmemleak_not_leak(base);
} else {
/*
* We don't have to allocate page_cgroup again, but
diff --git a/mm/page_io.c b/mm/page_io.c
index 31a3b962230..2dee975bf46 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
goto out;
}
if (wbc->sync_mode == WB_SYNC_ALL)
- rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
+ rw |= REQ_SYNC | REQ_UNPLUG;
count_vm_event(PSWPOUT);
set_page_writeback(page);
unlock_page(page);
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
new file mode 100644
index 00000000000..df680855540
--- /dev/null
+++ b/mm/percpu-km.c
@@ -0,0 +1,104 @@
+/*
+ * mm/percpu-km.c - kernel memory based chunk allocation
+ *
+ * Copyright (C) 2010 SUSE Linux Products GmbH
+ * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Chunks are allocated as contiguous kernel memory using gfp
+ * allocation. This is to be used on nommu architectures.
+ *
+ * To use percpu-km,
+ *
+ * - define CONFIG_NEED_PER_CPU_KM from the arch Kconfig.
+ *
+ * - CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK must not be defined. It's
+ * not compatible with PER_CPU_KM. EMBED_FIRST_CHUNK should work
+ * fine.
+ *
+ * - NUMA is not supported. When setting up the first chunk,
+ * @cpu_distance_fn should be NULL or report all CPUs to be nearer
+ * than or at LOCAL_DISTANCE.
+ *
+ * - It's best if the chunk size is a power of two multiple of
+ * PAGE_SIZE. Because each chunk is allocated as a contiguous
+ * kernel memory block using alloc_pages(), memory will be wasted if
+ * chunk size is not aligned. percpu-km code will whine about it.
+ */
+
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+#error "contiguous percpu allocation is incompatible with paged first chunk"
+#endif
+
+#include <linux/log2.h>
+
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+ /* noop */
+ return 0;
+}
+
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+ /* nada */
+}
+
+static struct pcpu_chunk *pcpu_create_chunk(void)
+{
+ const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
+ struct pcpu_chunk *chunk;
+ struct page *pages;
+ int i;
+
+ chunk = pcpu_alloc_chunk();
+ if (!chunk)
+ return NULL;
+
+ pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages));
+ if (!pages) {
+ pcpu_free_chunk(chunk);
+ return NULL;
+ }
+
+ for (i = 0; i < nr_pages; i++)
+ pcpu_set_page_chunk(nth_page(pages, i), chunk);
+
+ chunk->data = pages;
+ chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
+ return chunk;
+}
+
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
+{
+ const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
+
+ if (chunk && chunk->data)
+ __free_pages(chunk->data, order_base_2(nr_pages));
+ pcpu_free_chunk(chunk);
+}
+
+static struct page *pcpu_addr_to_page(void *addr)
+{
+ return virt_to_page(addr);
+}
+
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
+{
+ size_t nr_pages, alloc_pages;
+
+ /* all units must be in a single group */
+ if (ai->nr_groups != 1) {
+ printk(KERN_CRIT "percpu: can't handle more than one groups\n");
+ return -EINVAL;
+ }
+
+ nr_pages = (ai->groups[0].nr_units * ai->unit_size) >> PAGE_SHIFT;
+ alloc_pages = roundup_pow_of_two(nr_pages);
+
+ if (alloc_pages > nr_pages)
+ printk(KERN_WARNING "percpu: wasting %zu pages per chunk\n",
+ alloc_pages - nr_pages);
+
+ return 0;
+}
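The waste warning printed by pcpu_verify_alloc_info() above follows from plain arithmetic: the chunk is allocated as a power-of-two number of pages, so everything between nr_pages and the next power of two is unused. A standalone sketch with made-up numbers (4 units of 20K, 4K pages) is shown below; it is not kernel code, and roundup_pow_of_two() is a simplified stand-in for the kernel helper.

#include <stdio.h>

/* simplified stand-in for the kernel's roundup_pow_of_two() */
static unsigned long roundup_pow_of_two(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	const unsigned long page_size = 4096;		/* assumed 4K pages */
	const unsigned long nr_units = 4;		/* e.g. 4 CPUs, one group */
	const unsigned long unit_size = 20 * 1024;	/* 20K per unit, made up */

	unsigned long nr_pages = nr_units * unit_size / page_size;	/* 20 */
	unsigned long alloc_pages = roundup_pow_of_two(nr_pages);	/* 32 */

	if (alloc_pages > nr_pages)
		printf("percpu: wasting %lu pages per chunk\n",
		       alloc_pages - nr_pages);
	return 0;
}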
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
new file mode 100644
index 00000000000..7d9c1d0ebd3
--- /dev/null
+++ b/mm/percpu-vm.c
@@ -0,0 +1,451 @@
+/*
+ * mm/percpu-vm.c - vmalloc area based chunk allocation
+ *
+ * Copyright (C) 2010 SUSE Linux Products GmbH
+ * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Chunks are mapped into vmalloc areas and populated page by page.
+ * This is the default chunk allocator.
+ */
+
+static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
+ unsigned int cpu, int page_idx)
+{
+ /* must not be used on pre-mapped chunk */
+ WARN_ON(chunk->immutable);
+
+ return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
+}
+
+/**
+ * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
+ * @chunk: chunk of interest
+ * @bitmapp: output parameter for bitmap
+ * @may_alloc: may allocate the array
+ *
+ * Returns pointer to array of pointers to struct page and bitmap,
+ * both of which can be indexed with pcpu_page_idx(). The returned
+ * array is cleared to zero and *@bitmapp is copied from
+ * @chunk->populated. Note that there is only one array and bitmap
+ * and access exclusion is the caller's responsibility.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
+ * Otherwise, don't care.
+ *
+ * RETURNS:
+ * Pointer to temp pages array on success, NULL on failure.
+ */
+static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
+ unsigned long **bitmapp,
+ bool may_alloc)
+{
+ static struct page **pages;
+ static unsigned long *bitmap;
+ size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
+ size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
+ sizeof(unsigned long);
+
+ if (!pages || !bitmap) {
+ if (may_alloc && !pages)
+ pages = pcpu_mem_alloc(pages_size);
+ if (may_alloc && !bitmap)
+ bitmap = pcpu_mem_alloc(bitmap_size);
+ if (!pages || !bitmap)
+ return NULL;
+ }
+
+ memset(pages, 0, pages_size);
+ bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
+
+ *bitmapp = bitmap;
+ return pages;
+}
+
+/**
+ * pcpu_free_pages - free pages which were allocated for @chunk
+ * @chunk: chunk pages were allocated for
+ * @pages: array of pages to be freed, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be freed
+ * @page_end: page index of the last page to be freed + 1
+ *
+ * Free pages [@page_start,@page_end) in @pages for all units.
+ * The pages were allocated for @chunk.
+ */
+static void pcpu_free_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
+{
+ unsigned int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page *page = pages[pcpu_page_idx(cpu, i)];
+
+ if (page)
+ __free_page(page);
+ }
+ }
+}
+
+/**
+ * pcpu_alloc_pages - allocates pages for @chunk
+ * @chunk: target chunk
+ * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be allocated
+ * @page_end: page index of the last page to be allocated + 1
+ *
+ * Allocate pages [@page_start,@page_end) into @pages for all units.
+ * The allocation is for @chunk. Percpu core doesn't care about the
+ * content of @pages and will pass it verbatim to pcpu_map_pages().
+ */
+static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
+{
+ const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
+ unsigned int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
+
+ *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
+ if (!*pagep) {
+ pcpu_free_pages(chunk, pages, populated,
+ page_start, page_end);
+ return -ENOMEM;
+ }
+ }
+ }
+ return 0;
+}
+
+/**
+ * pcpu_pre_unmap_flush - flush cache prior to unmapping
+ * @chunk: chunk the regions to be flushed belongs to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages in [@page_start,@page_end) of @chunk are about to be
+ * unmapped. Flush cache. As each flushing trial can be very
+ * expensive, issue flush on the whole region at once rather than
+ * doing it for each cpu. This could be overkill but is more
+ * scalable.
+ */
+static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_cache_vunmap(
+ pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+}
+
+static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
+{
+ unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
+}
+
+/**
+ * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @pages: pages array which can be used to pass information to free
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to unmap
+ * @page_end: page index of the last page to unmap + 1
+ *
+ * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
+ * Corresponding elements in @pages were cleared by the caller and can
+ * be used to carry information to pcpu_free_pages() which will be
+ * called after all unmaps are finished. The caller should call
+ * proper pre/post flush functions.
+ */
+static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
+{
+ unsigned int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page *page;
+
+ page = pcpu_chunk_page(chunk, cpu, i);
+ WARN_ON(!page);
+ pages[pcpu_page_idx(cpu, i)] = page;
+ }
+ __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+ page_end - page_start);
+ }
+
+ for (i = page_start; i < page_end; i++)
+ __clear_bit(i, populated);
+}
+
+/**
+ * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
+ * TLB for the regions. This can be skipped if the area is to be
+ * returned to vmalloc as vmalloc will handle TLB flushing lazily.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region.
+ */
+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_tlb_kernel_range(
+ pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+}
+
+static int __pcpu_map_pages(unsigned long addr, struct page **pages,
+ int nr_pages)
+{
+ return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
+ PAGE_KERNEL, pages);
+}
+
+/**
+ * pcpu_map_pages - map pages into a pcpu_chunk
+ * @chunk: chunk of interest
+ * @pages: pages array containing pages to be mapped
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to map
+ * @page_end: page index of the last page to map + 1
+ *
+ * For each cpu, map pages [@page_start,@page_end) into @chunk. The
+ * caller is responsible for calling pcpu_post_map_flush() after all
+ * mappings are complete.
+ *
+ * This function is responsible for setting corresponding bits in
+ * @chunk->populated bitmap and whatever is necessary for reverse
+ * lookup (addr -> chunk).
+ */
+static int pcpu_map_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
+{
+ unsigned int cpu, tcpu;
+ int i, err;
+
+ for_each_possible_cpu(cpu) {
+ err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+ &pages[pcpu_page_idx(cpu, page_start)],
+ page_end - page_start);
+ if (err < 0)
+ goto err;
+ }
+
+ /* mapping successful, link chunk and mark populated */
+ for (i = page_start; i < page_end; i++) {
+ for_each_possible_cpu(cpu)
+ pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
+ chunk);
+ __set_bit(i, populated);
+ }
+
+ return 0;
+
+err:
+ for_each_possible_cpu(tcpu) {
+ if (tcpu == cpu)
+ break;
+ __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
+ page_end - page_start);
+ }
+ return err;
+}
+
+/**
+ * pcpu_post_map_flush - flush cache after mapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
+ * cache.
+ *
+ * As with pcpu_pre_unmap_flush(), the cache flush is also done
+ * at once for the whole region.
+ */
+static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_cache_vmap(
+ pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+}
+
+/**
+ * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @off: offset to the area to populate
+ * @size: size of the area to populate in bytes
+ *
+ * For each cpu, populate and map pages [@page_start,@page_end) into
+ * @chunk. The area is cleared on return.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, does GFP_KERNEL allocation.
+ */
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+ int page_start = PFN_DOWN(off);
+ int page_end = PFN_UP(off + size);
+ int free_end = page_start, unmap_end = page_start;
+ struct page **pages;
+ unsigned long *populated;
+ unsigned int cpu;
+ int rs, re, rc;
+
+ /* quick path, check whether all pages are already there */
+ rs = page_start;
+ pcpu_next_pop(chunk, &rs, &re, page_end);
+ if (rs == page_start && re == page_end)
+ goto clear;
+
+ /* need to allocate and map pages, this chunk can't be immutable */
+ WARN_ON(chunk->immutable);
+
+ pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
+ if (!pages)
+ return -ENOMEM;
+
+ /* alloc and map */
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
+ if (rc)
+ goto err_free;
+ free_end = re;
+ }
+
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ rc = pcpu_map_pages(chunk, pages, populated, rs, re);
+ if (rc)
+ goto err_unmap;
+ unmap_end = re;
+ }
+ pcpu_post_map_flush(chunk, page_start, page_end);
+
+ /* commit new bitmap */
+ bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
+clear:
+ for_each_possible_cpu(cpu)
+ memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
+ return 0;
+
+err_unmap:
+ pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
+ pcpu_unmap_pages(chunk, pages, populated, rs, re);
+ pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
+err_free:
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
+ pcpu_free_pages(chunk, pages, populated, rs, re);
+ return rc;
+}
+
+/**
+ * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
+ * @chunk: chunk to depopulate
+ * @off: offset to the area to depopulate
+ * @size: size of the area to depopulate in bytes
+ *
+ * For each cpu, depopulate and unmap pages [@page_start,@page_end)
+ * from @chunk. The cache is flushed before unmapping; the TLB
+ * flush is skipped because vmalloc purges the area lazily (see
+ * the comment in the function body).
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
+ */
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+ int page_start = PFN_DOWN(off);
+ int page_end = PFN_UP(off + size);
+ struct page **pages;
+ unsigned long *populated;
+ int rs, re;
+
+ /* quick path, check whether it's empty already */
+ rs = page_start;
+ pcpu_next_unpop(chunk, &rs, &re, page_end);
+ if (rs == page_start && re == page_end)
+ return;
+
+ /* immutable chunks can't be depopulated */
+ WARN_ON(chunk->immutable);
+
+ /*
+ * If control reaches here, there must have been at least one
+ * successful population attempt so the temp pages array must
+ * be available now.
+ */
+ pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
+ BUG_ON(!pages);
+
+ /* unmap and free */
+ pcpu_pre_unmap_flush(chunk, page_start, page_end);
+
+ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+ pcpu_unmap_pages(chunk, pages, populated, rs, re);
+
+ /* no need to flush tlb, vmalloc will handle it lazily */
+
+ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+ pcpu_free_pages(chunk, pages, populated, rs, re);
+
+ /* commit new bitmap */
+ bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
+}
+
+static struct pcpu_chunk *pcpu_create_chunk(void)
+{
+ struct pcpu_chunk *chunk;
+ struct vm_struct **vms;
+
+ chunk = pcpu_alloc_chunk();
+ if (!chunk)
+ return NULL;
+
+ vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
+ pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL);
+ if (!vms) {
+ pcpu_free_chunk(chunk);
+ return NULL;
+ }
+
+ chunk->data = vms;
+ chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];
+ return chunk;
+}
+
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
+{
+ if (chunk && chunk->data)
+ pcpu_free_vm_areas(chunk->data, pcpu_nr_groups);
+ pcpu_free_chunk(chunk);
+}
+
+static struct page *pcpu_addr_to_page(void *addr)
+{
+ return vmalloc_to_page(addr);
+}
+
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
+{
+ /* no extra restriction */
+ return 0;
+}
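pcpu_populate_chunk() and pcpu_depopulate_chunk() above walk the chunk's populated bitmap in [rs, re) runs via pcpu_next_pop()/pcpu_next_unpop() and the pcpu_for_each_unpop_region()/pcpu_for_each_pop_region() loops. The userspace sketch below imitates that walk with a plain bool array standing in for the bitmap; the helper, the sample bitmap, and the loop shape are simplified stand-ins written for illustration only.

#include <stdio.h>
#include <stdbool.h>

#define NR_PAGES 16

/* true == populated; stands in for chunk->populated */
static bool populated[NR_PAGES] = {
	[0] = true, [1] = true, [5] = true, [6] = true, [7] = true,
};

/* simplified pcpu_next_unpop(): [rs, re) is the next unpopulated run */
static void next_unpop(int *rs, int *re, int end)
{
	while (*rs < end && populated[*rs])
		(*rs)++;		/* like find_next_zero_bit() */
	*re = *rs;
	while (*re < end && !populated[*re])
		(*re)++;		/* like find_next_bit() */
}

int main(void)
{
	int rs, re;

	/* loosely follows pcpu_for_each_unpop_region(chunk, rs, re, 0, NR_PAGES) */
	for (rs = 0, next_unpop(&rs, &re, NR_PAGES); rs < re;
	     rs = re + 1, next_unpop(&rs, &re, NR_PAGES))
		printf("allocate and map pages [%d, %d)\n", rs, re);

	return 0;
}

Running it prints the two unpopulated runs, [2, 5) and [8, 16), which is the granularity at which pcpu_alloc_pages() and pcpu_map_pages() are invoked above.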
diff --git a/mm/percpu.c b/mm/percpu.c
index 6e09741ddc6..e61dc2cc587 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1,5 +1,5 @@
/*
- * linux/mm/percpu.c - percpu memory allocator
+ * mm/percpu.c - percpu memory allocator
*
* Copyright (C) 2009 SUSE Linux Products GmbH
* Copyright (C) 2009 Tejun Heo <tj@kernel.org>
@@ -7,14 +7,13 @@
* This file is released under the GPLv2.
*
* This is percpu allocator which can handle both static and dynamic
- * areas. Percpu areas are allocated in chunks in vmalloc area. Each
- * chunk is consisted of boot-time determined number of units and the
- * first chunk is used for static percpu variables in the kernel image
+ * areas. Percpu areas are allocated in chunks. Each chunk
+ * consists of a boot-time determined number of units and the first
+ * chunk is used for static percpu variables in the kernel image
* (special boot time alloc/init handling necessary as these areas
* need to be brought up before allocation services are running).
* Unit grows as necessary and all units grow or shrink in unison.
- * When a chunk is filled up, another chunk is allocated. ie. in
- * vmalloc area
+ * When a chunk is filled up, another chunk is allocated.
*
* c0 c1 c2
* ------------------- ------------------- ------------
@@ -99,7 +98,7 @@ struct pcpu_chunk {
int map_used; /* # of map entries used */
int map_alloc; /* # of map entries allocated */
int *map; /* allocation map */
- struct vm_struct **vms; /* mapped vmalloc regions */
+ void *data; /* chunk data */
bool immutable; /* no [de]population allowed */
unsigned long populated[]; /* populated bitmap */
};
@@ -177,6 +176,21 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
static void pcpu_reclaim(struct work_struct *work);
static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
+static bool pcpu_addr_in_first_chunk(void *addr)
+{
+ void *first_start = pcpu_first_chunk->base_addr;
+
+ return addr >= first_start && addr < first_start + pcpu_unit_size;
+}
+
+static bool pcpu_addr_in_reserved_chunk(void *addr)
+{
+ void *first_start = pcpu_first_chunk->base_addr;
+
+ return addr >= first_start &&
+ addr < first_start + pcpu_reserved_chunk_limit;
+}
+
static int __pcpu_size_to_slot(int size)
{
int highbit = fls(size); /* size is in bytes */
@@ -198,27 +212,6 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
return pcpu_size_to_slot(chunk->free_size);
}
-static int pcpu_page_idx(unsigned int cpu, int page_idx)
-{
- return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
-}
-
-static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
- unsigned int cpu, int page_idx)
-{
- return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
- (page_idx << PAGE_SHIFT);
-}
-
-static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
- unsigned int cpu, int page_idx)
-{
- /* must not be used on pre-mapped chunk */
- WARN_ON(chunk->immutable);
-
- return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
-}
-
/* set the pointer to a chunk in a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
@@ -231,13 +224,27 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
return (struct pcpu_chunk *)page->index;
}
-static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
+static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
+{
+ return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
+}
+
+static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
+ unsigned int cpu, int page_idx)
+{
+ return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
+ (page_idx << PAGE_SHIFT);
+}
+
+static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
+ int *rs, int *re, int end)
{
*rs = find_next_zero_bit(chunk->populated, end, *rs);
*re = find_next_bit(chunk->populated, end, *rs + 1);
}
-static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
+static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
+ int *rs, int *re, int end)
{
*rs = find_next_bit(chunk->populated, end, *rs);
*re = find_next_zero_bit(chunk->populated, end, *rs + 1);
@@ -275,6 +282,9 @@ static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
*/
static void *pcpu_mem_alloc(size_t size)
{
+ if (WARN_ON_ONCE(!slab_is_available()))
+ return NULL;
+
if (size <= PAGE_SIZE)
return kzalloc(size, GFP_KERNEL);
else {
@@ -326,36 +336,6 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
}
/**
- * pcpu_chunk_addr_search - determine chunk containing specified address
- * @addr: address for which the chunk needs to be determined.
- *
- * RETURNS:
- * The address of the found chunk.
- */
-static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
-{
- void *first_start = pcpu_first_chunk->base_addr;
-
- /* is it in the first chunk? */
- if (addr >= first_start && addr < first_start + pcpu_unit_size) {
- /* is it in the reserved area? */
- if (addr < first_start + pcpu_reserved_chunk_limit)
- return pcpu_reserved_chunk;
- return pcpu_first_chunk;
- }
-
- /*
- * The address is relative to unit0 which might be unused and
- * thus unmapped. Offset the address to the unit space of the
- * current processor before looking it up in the vmalloc
- * space. Note that any possible cpu id can be used here, so
- * there's no need to worry about preemption or cpu hotplug.
- */
- addr += pcpu_unit_offsets[raw_smp_processor_id()];
- return pcpu_get_page_chunk(vmalloc_to_page(addr));
-}
-
-/**
* pcpu_need_to_extend - determine whether chunk area map needs to be extended
* @chunk: chunk of interest
*
@@ -415,13 +395,6 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
old_size = chunk->map_alloc * sizeof(chunk->map[0]);
memcpy(new, chunk->map, old_size);
- /*
- * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
- * one of the first chunks and still using static map.
- */
- if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
- old = chunk->map;
-
chunk->map_alloc = new_alloc;
chunk->map = new;
new = NULL;
@@ -623,434 +596,92 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
pcpu_chunk_relocate(chunk, oslot);
}
-/**
- * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
- * @chunk: chunk of interest
- * @bitmapp: output parameter for bitmap
- * @may_alloc: may allocate the array
- *
- * Returns pointer to array of pointers to struct page and bitmap,
- * both of which can be indexed with pcpu_page_idx(). The returned
- * array is cleared to zero and *@bitmapp is copied from
- * @chunk->populated. Note that there is only one array and bitmap
- * and access exclusion is the caller's responsibility.
- *
- * CONTEXT:
- * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
- * Otherwise, don't care.
- *
- * RETURNS:
- * Pointer to temp pages array on success, NULL on failure.
- */
-static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
- unsigned long **bitmapp,
- bool may_alloc)
+static struct pcpu_chunk *pcpu_alloc_chunk(void)
{
- static struct page **pages;
- static unsigned long *bitmap;
- size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
- size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
- sizeof(unsigned long);
-
- if (!pages || !bitmap) {
- if (may_alloc && !pages)
- pages = pcpu_mem_alloc(pages_size);
- if (may_alloc && !bitmap)
- bitmap = pcpu_mem_alloc(bitmap_size);
- if (!pages || !bitmap)
- return NULL;
- }
-
- memset(pages, 0, pages_size);
- bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
-
- *bitmapp = bitmap;
- return pages;
-}
-
-/**
- * pcpu_free_pages - free pages which were allocated for @chunk
- * @chunk: chunk pages were allocated for
- * @pages: array of pages to be freed, indexed by pcpu_page_idx()
- * @populated: populated bitmap
- * @page_start: page index of the first page to be freed
- * @page_end: page index of the last page to be freed + 1
- *
- * Free pages [@page_start and @page_end) in @pages for all units.
- * The pages were allocated for @chunk.
- */
-static void pcpu_free_pages(struct pcpu_chunk *chunk,
- struct page **pages, unsigned long *populated,
- int page_start, int page_end)
-{
- unsigned int cpu;
- int i;
+ struct pcpu_chunk *chunk;
- for_each_possible_cpu(cpu) {
- for (i = page_start; i < page_end; i++) {
- struct page *page = pages[pcpu_page_idx(cpu, i)];
+ chunk = pcpu_mem_alloc(pcpu_chunk_struct_size);
+ if (!chunk)
+ return NULL;
- if (page)
- __free_page(page);
- }
+ chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+ if (!chunk->map) {
+ kfree(chunk);
+ return NULL;
}
-}
-/**
- * pcpu_alloc_pages - allocates pages for @chunk
- * @chunk: target chunk
- * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
- * @populated: populated bitmap
- * @page_start: page index of the first page to be allocated
- * @page_end: page index of the last page to be allocated + 1
- *
- * Allocate pages [@page_start,@page_end) into @pages for all units.
- * The allocation is for @chunk. Percpu core doesn't care about the
- * content of @pages and will pass it verbatim to pcpu_map_pages().
- */
-static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
- struct page **pages, unsigned long *populated,
- int page_start, int page_end)
-{
- const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
- unsigned int cpu;
- int i;
+ chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
+ chunk->map[chunk->map_used++] = pcpu_unit_size;
- for_each_possible_cpu(cpu) {
- for (i = page_start; i < page_end; i++) {
- struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
-
- *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
- if (!*pagep) {
- pcpu_free_pages(chunk, pages, populated,
- page_start, page_end);
- return -ENOMEM;
- }
- }
- }
- return 0;
-}
+ INIT_LIST_HEAD(&chunk->list);
+ chunk->free_size = pcpu_unit_size;
+ chunk->contig_hint = pcpu_unit_size;
-/**
- * pcpu_pre_unmap_flush - flush cache prior to unmapping
- * @chunk: chunk the regions to be flushed belongs to
- * @page_start: page index of the first page to be flushed
- * @page_end: page index of the last page to be flushed + 1
- *
- * Pages in [@page_start,@page_end) of @chunk are about to be
- * unmapped. Flush cache. As each flushing trial can be very
- * expensive, issue flush on the whole region at once rather than
- * doing it for each cpu. This could be an overkill but is more
- * scalable.
- */
-static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
- int page_start, int page_end)
-{
- flush_cache_vunmap(
- pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
- pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+ return chunk;
}
-static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
+static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
- unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
+ if (!chunk)
+ return;
+ pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
+ kfree(chunk);
}
-/**
- * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
- * @chunk: chunk of interest
- * @pages: pages array which can be used to pass information to free
- * @populated: populated bitmap
- * @page_start: page index of the first page to unmap
- * @page_end: page index of the last page to unmap + 1
- *
- * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
- * Corresponding elements in @pages were cleared by the caller and can
- * be used to carry information to pcpu_free_pages() which will be
- * called after all unmaps are finished. The caller should call
- * proper pre/post flush functions.
+/*
+ * Chunk management implementation.
+ *
+ * To allow different implementations, chunk alloc/free and
+ * [de]population are implemented in a separate file which is pulled
+ * into this file and compiled together. The following functions
+ * should be implemented.
+ *
+ * pcpu_populate_chunk - populate the specified range of a chunk
+ * pcpu_depopulate_chunk - depopulate the specified range of a chunk
+ * pcpu_create_chunk - create a new chunk
+ * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop
+ * pcpu_addr_to_page - translate address to the corresponding struct page
+ * pcpu_verify_alloc_info - check alloc_info is acceptable during init
*/
-static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
- struct page **pages, unsigned long *populated,
- int page_start, int page_end)
-{
- unsigned int cpu;
- int i;
-
- for_each_possible_cpu(cpu) {
- for (i = page_start; i < page_end; i++) {
- struct page *page;
-
- page = pcpu_chunk_page(chunk, cpu, i);
- WARN_ON(!page);
- pages[pcpu_page_idx(cpu, i)] = page;
- }
- __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
- page_end - page_start);
- }
-
- for (i = page_start; i < page_end; i++)
- __clear_bit(i, populated);
-}
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
+static struct pcpu_chunk *pcpu_create_chunk(void);
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
+static struct page *pcpu_addr_to_page(void *addr);
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
+
+#ifdef CONFIG_NEED_PER_CPU_KM
+#include "percpu-km.c"
+#else
+#include "percpu-vm.c"
+#endif
/**
- * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
- * @chunk: pcpu_chunk the regions to be flushed belong to
- * @page_start: page index of the first page to be flushed
- * @page_end: page index of the last page to be flushed + 1
- *
- * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
- * TLB for the regions. This can be skipped if the area is to be
- * returned to vmalloc as vmalloc will handle TLB flushing lazily.
+ * pcpu_chunk_addr_search - determine chunk containing specified address
+ * @addr: address for which the chunk needs to be determined.
*
- * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
- * for the whole region.
- */
-static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
- int page_start, int page_end)
-{
- flush_tlb_kernel_range(
- pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
- pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
-}
-
-static int __pcpu_map_pages(unsigned long addr, struct page **pages,
- int nr_pages)
-{
- return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
- PAGE_KERNEL, pages);
-}
-
-/**
- * pcpu_map_pages - map pages into a pcpu_chunk
- * @chunk: chunk of interest
- * @pages: pages array containing pages to be mapped
- * @populated: populated bitmap
- * @page_start: page index of the first page to map
- * @page_end: page index of the last page to map + 1
- *
- * For each cpu, map pages [@page_start,@page_end) into @chunk. The
- * caller is responsible for calling pcpu_post_map_flush() after all
- * mappings are complete.
- *
- * This function is responsible for setting corresponding bits in
- * @chunk->populated bitmap and whatever is necessary for reverse
- * lookup (addr -> chunk).
+ * RETURNS:
+ * The address of the found chunk.
*/
-static int pcpu_map_pages(struct pcpu_chunk *chunk,
- struct page **pages, unsigned long *populated,
- int page_start, int page_end)
+static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
- unsigned int cpu, tcpu;
- int i, err;
-
- for_each_possible_cpu(cpu) {
- err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
- &pages[pcpu_page_idx(cpu, page_start)],
- page_end - page_start);
- if (err < 0)
- goto err;
- }
-
- /* mapping successful, link chunk and mark populated */
- for (i = page_start; i < page_end; i++) {
- for_each_possible_cpu(cpu)
- pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
- chunk);
- __set_bit(i, populated);
- }
-
- return 0;
-
-err:
- for_each_possible_cpu(tcpu) {
- if (tcpu == cpu)
- break;
- __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
- page_end - page_start);
+ /* is it in the first chunk? */
+ if (pcpu_addr_in_first_chunk(addr)) {
+ /* is it in the reserved area? */
+ if (pcpu_addr_in_reserved_chunk(addr))
+ return pcpu_reserved_chunk;
+ return pcpu_first_chunk;
}
- return err;
-}
-
-/**
- * pcpu_post_map_flush - flush cache after mapping
- * @chunk: pcpu_chunk the regions to be flushed belong to
- * @page_start: page index of the first page to be flushed
- * @page_end: page index of the last page to be flushed + 1
- *
- * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
- * cache.
- *
- * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
- * for the whole region.
- */
-static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
- int page_start, int page_end)
-{
- flush_cache_vmap(
- pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
- pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
-}
-
-/**
- * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
- * @chunk: chunk to depopulate
- * @off: offset to the area to depopulate
- * @size: size of the area to depopulate in bytes
- * @flush: whether to flush cache and tlb or not
- *
- * For each cpu, depopulate and unmap pages [@page_start,@page_end)
- * from @chunk. If @flush is true, vcache is flushed before unmapping
- * and tlb after.
- *
- * CONTEXT:
- * pcpu_alloc_mutex.
- */
-static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
-{
- int page_start = PFN_DOWN(off);
- int page_end = PFN_UP(off + size);
- struct page **pages;
- unsigned long *populated;
- int rs, re;
-
- /* quick path, check whether it's empty already */
- rs = page_start;
- pcpu_next_unpop(chunk, &rs, &re, page_end);
- if (rs == page_start && re == page_end)
- return;
-
- /* immutable chunks can't be depopulated */
- WARN_ON(chunk->immutable);
/*
- * If control reaches here, there must have been at least one
- * successful population attempt so the temp pages array must
- * be available now.
+ * The address is relative to unit0 which might be unused and
+ * thus unmapped. Offset the address to the unit space of the
+ * current processor before looking it up in the vmalloc
+ * space. Note that any possible cpu id can be used here, so
+ * there's no need to worry about preemption or cpu hotplug.
*/
- pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
- BUG_ON(!pages);
-
- /* unmap and free */
- pcpu_pre_unmap_flush(chunk, page_start, page_end);
-
- pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
- pcpu_unmap_pages(chunk, pages, populated, rs, re);
-
- /* no need to flush tlb, vmalloc will handle it lazily */
-
- pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
- pcpu_free_pages(chunk, pages, populated, rs, re);
-
- /* commit new bitmap */
- bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
-}
-
-/**
- * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
- * @chunk: chunk of interest
- * @off: offset to the area to populate
- * @size: size of the area to populate in bytes
- *
- * For each cpu, populate and map pages [@page_start,@page_end) into
- * @chunk. The area is cleared on return.
- *
- * CONTEXT:
- * pcpu_alloc_mutex, does GFP_KERNEL allocation.
- */
-static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
-{
- int page_start = PFN_DOWN(off);
- int page_end = PFN_UP(off + size);
- int free_end = page_start, unmap_end = page_start;
- struct page **pages;
- unsigned long *populated;
- unsigned int cpu;
- int rs, re, rc;
-
- /* quick path, check whether all pages are already there */
- rs = page_start;
- pcpu_next_pop(chunk, &rs, &re, page_end);
- if (rs == page_start && re == page_end)
- goto clear;
-
- /* need to allocate and map pages, this chunk can't be immutable */
- WARN_ON(chunk->immutable);
-
- pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
- if (!pages)
- return -ENOMEM;
-
- /* alloc and map */
- pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
- rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
- if (rc)
- goto err_free;
- free_end = re;
- }
-
- pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
- rc = pcpu_map_pages(chunk, pages, populated, rs, re);
- if (rc)
- goto err_unmap;
- unmap_end = re;
- }
- pcpu_post_map_flush(chunk, page_start, page_end);
-
- /* commit new bitmap */
- bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
-clear:
- for_each_possible_cpu(cpu)
- memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
- return 0;
-
-err_unmap:
- pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
- pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
- pcpu_unmap_pages(chunk, pages, populated, rs, re);
- pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
-err_free:
- pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
- pcpu_free_pages(chunk, pages, populated, rs, re);
- return rc;
-}
-
-static void free_pcpu_chunk(struct pcpu_chunk *chunk)
-{
- if (!chunk)
- return;
- if (chunk->vms)
- pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups);
- pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
- kfree(chunk);
-}
-
-static struct pcpu_chunk *alloc_pcpu_chunk(void)
-{
- struct pcpu_chunk *chunk;
-
- chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
- if (!chunk)
- return NULL;
-
- chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
- chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
- chunk->map[chunk->map_used++] = pcpu_unit_size;
-
- chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
- pcpu_nr_groups, pcpu_atom_size,
- GFP_KERNEL);
- if (!chunk->vms) {
- free_pcpu_chunk(chunk);
- return NULL;
- }
-
- INIT_LIST_HEAD(&chunk->list);
- chunk->free_size = pcpu_unit_size;
- chunk->contig_hint = pcpu_unit_size;
- chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0];
-
- return chunk;
+ addr += pcpu_unit_offsets[raw_smp_processor_id()];
+ return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
}
/**
@@ -1142,7 +773,7 @@ restart:
/* hmmm... no space left, create a new chunk */
spin_unlock_irqrestore(&pcpu_lock, flags);
- chunk = alloc_pcpu_chunk();
+ chunk = pcpu_create_chunk();
if (!chunk) {
err = "failed to allocate new chunk";
goto fail_unlock_mutex;
@@ -1254,7 +885,7 @@ static void pcpu_reclaim(struct work_struct *work)
list_for_each_entry_safe(chunk, next, &todo, list) {
pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
- free_pcpu_chunk(chunk);
+ pcpu_destroy_chunk(chunk);
}
mutex_unlock(&pcpu_alloc_mutex);
@@ -1343,25 +974,39 @@ bool is_kernel_percpu_address(unsigned long addr)
*/
phys_addr_t per_cpu_ptr_to_phys(void *addr)
{
- if ((unsigned long)addr < VMALLOC_START ||
- (unsigned long)addr >= VMALLOC_END)
- return __pa(addr);
- else
- return page_to_phys(vmalloc_to_page(addr));
-}
-
-static inline size_t pcpu_calc_fc_sizes(size_t static_size,
- size_t reserved_size,
- ssize_t *dyn_sizep)
-{
- size_t size_sum;
+ void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
+ bool in_first_chunk = false;
+ unsigned long first_start, first_end;
+ unsigned int cpu;
- size_sum = PFN_ALIGN(static_size + reserved_size +
- (*dyn_sizep >= 0 ? *dyn_sizep : 0));
- if (*dyn_sizep != 0)
- *dyn_sizep = size_sum - static_size - reserved_size;
+ /*
+ * The following test on first_start/end isn't strictly
+ * necessary but will speed up lookups of addresses which
+ * aren't in the first chunk.
+ */
+ first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0);
+ first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu,
+ pcpu_unit_pages);
+ if ((unsigned long)addr >= first_start &&
+ (unsigned long)addr < first_end) {
+ for_each_possible_cpu(cpu) {
+ void *start = per_cpu_ptr(base, cpu);
+
+ if (addr >= start && addr < start + pcpu_unit_size) {
+ in_first_chunk = true;
+ break;
+ }
+ }
+ }
- return size_sum;
+ if (in_first_chunk) {
+ if ((unsigned long)addr < VMALLOC_START ||
+ (unsigned long)addr >= VMALLOC_END)
+ return __pa(addr);
+ else
+ return page_to_phys(vmalloc_to_page(addr));
+ } else
+ return page_to_phys(pcpu_addr_to_page(addr));
}
/**
@@ -1422,7 +1067,7 @@ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
/**
* pcpu_build_alloc_info - build alloc_info considering distances between CPUs
* @reserved_size: the size of reserved percpu area in bytes
- * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
+ * @dyn_size: minimum free size for dynamic allocation in bytes
* @atom_size: allocation atom size
* @cpu_distance_fn: callback to determine distance between cpus, optional
*
@@ -1440,15 +1085,15 @@ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
* On success, pointer to the new allocation_info is returned. On
* failure, ERR_PTR value is returned.
*/
-struct pcpu_alloc_info * __init pcpu_build_alloc_info(
- size_t reserved_size, ssize_t dyn_size,
+static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+ size_t reserved_size, size_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
static int group_map[NR_CPUS] __initdata;
static int group_cnt[NR_CPUS] __initdata;
const size_t static_size = __per_cpu_end - __per_cpu_start;
- int group_cnt_max = 0, nr_groups = 1, nr_units = 0;
+ int nr_groups = 1, nr_units = 0;
size_t size_sum, min_unit_size, alloc_size;
int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
int last_allocs, group, unit;
@@ -1458,7 +1103,12 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info(
/* this function may be called multiple times */
memset(group_map, 0, sizeof(group_map));
- memset(group_cnt, 0, sizeof(group_map));
+ memset(group_cnt, 0, sizeof(group_cnt));
+
+ /* calculate size_sum and ensure dyn_size is enough for early alloc */
+ size_sum = PFN_ALIGN(static_size + reserved_size +
+ max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
+ dyn_size = size_sum - static_size - reserved_size;
/*
* Determine min_unit_size, alloc_size and max_upa such that
@@ -1466,7 +1116,6 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info(
 * which can accommodate 4k aligned segments which are equal to
* or larger than min_unit_size.
*/
- size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
alloc_size = roundup(min_unit_size, atom_size);
@@ -1492,7 +1141,6 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info(
}
group_map[cpu] = group;
group_cnt[group]++;
- group_cnt_max = max(group_cnt_max, group_cnt[group]);
}
/*
@@ -1688,7 +1336,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
void *base_addr)
{
static char cpus_buf[4096] __initdata;
- static int smap[2], dmap[2];
+ static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
+ static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
size_t dyn_size = ai->dyn_size;
size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
struct pcpu_chunk *schunk, *dchunk = NULL;
@@ -1711,14 +1360,14 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
} while (0)
/* sanity checks */
- BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
- ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
PCPU_SETUP_BUG_ON(!ai->static_size);
PCPU_SETUP_BUG_ON(!base_addr);
PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
+ PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
+ PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
/* process group information and build config tables accordingly */
group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
@@ -1869,7 +1518,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
/**
* pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
* @reserved_size: the size of reserved percpu area in bytes
- * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
+ * @dyn_size: minimum free size for dynamic allocation in bytes
* @atom_size: allocation atom size
* @cpu_distance_fn: callback to determine distance between cpus, optional
* @alloc_fn: function to allocate percpu page
@@ -1890,10 +1539,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
* vmalloc space is not orders of magnitude larger than distances
* between node memory addresses (ie. 32bit NUMA machines).
*
- * When @dyn_size is positive, dynamic area might be larger than
- * specified to fill page alignment. When @dyn_size is auto,
- * @dyn_size is just big enough to fill page alignment after static
- * and reserved areas.
+ * @dyn_size specifies the minimum dynamic area size.
*
* If the needed size is smaller than the minimum or specified unit
* size, the leftover is returned using @free_fn.
@@ -1901,7 +1547,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
* RETURNS:
* 0 on success, -errno on failure.
*/
-int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
+int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
pcpu_fc_alloc_fn_t alloc_fn,
@@ -2032,7 +1678,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
- ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL);
+ ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
if (IS_ERR(ai))
return PTR_ERR(ai);
BUG_ON(ai->nr_groups != 1);
@@ -2158,3 +1804,33 @@ void __init setup_per_cpu_areas(void)
__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
+
+/*
+ * First and reserved chunks are initialized with temporary allocation
+ * map in initdata so that they can be used before slab is online.
+ * This function is called after slab is brought up and replaces those
+ * with properly allocated maps.
+ */
+void __init percpu_init_late(void)
+{
+ struct pcpu_chunk *target_chunks[] =
+ { pcpu_first_chunk, pcpu_reserved_chunk, NULL };
+ struct pcpu_chunk *chunk;
+ unsigned long flags;
+ int i;
+
+ for (i = 0; (chunk = target_chunks[i]); i++) {
+ int *map;
+ const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);
+
+ BUILD_BUG_ON(size > PAGE_SIZE);
+
+ map = pcpu_mem_alloc(size);
+ BUG_ON(!map);
+
+ spin_lock_irqsave(&pcpu_lock, flags);
+ memcpy(map, chunk->map, size);
+ chunk->map = map;
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+ }
+}
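With the ssize_t/-1 convention gone, pcpu_build_alloc_info() above treats @dyn_size as a minimum: it is raised to at least PERCPU_DYNAMIC_EARLY_SIZE, the total is page-aligned, and the alignment slack is handed back to the dynamic area. The standalone sketch below reproduces only that calculation; the 12K value assumed for PERCPU_DYNAMIC_EARLY_SIZE and all of the sizes are invented for the example.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PFN_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* assumed value; the real constant lives in the percpu headers */
#define PERCPU_DYNAMIC_EARLY_SIZE	(12UL << 10)

int main(void)
{
	unsigned long static_size = 102000;	/* made-up static percpu size */
	unsigned long reserved_size = 8192;	/* made-up reserved area */
	unsigned long dyn_size = 4096;		/* caller asked for only 4K */

	/* mirror max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE) */
	unsigned long min_dyn = dyn_size > PERCPU_DYNAMIC_EARLY_SIZE ?
				dyn_size : PERCPU_DYNAMIC_EARLY_SIZE;
	unsigned long size_sum = PFN_ALIGN(static_size + reserved_size + min_dyn);

	/* alignment slack is handed back to the dynamic area */
	dyn_size = size_sum - static_size - reserved_size;

	printf("size_sum=%lu dyn_size=%lu\n", size_sum, dyn_size);
	return 0;
}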
diff --git a/mm/readahead.c b/mm/readahead.c
index dfa9a1a03a1..77506a291a2 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -523,7 +523,7 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
* @req_size: hint: total size of the read which the caller is performing in
* pagecache pages
*
- * page_cache_async_ondemand() should be called when a page is used which
+ * page_cache_async_readahead() should be called when a page is used which
* has the PG_readahead flag; this is a marker to suggest that the application
* has used up enough of the readahead window that we should start pulling in
* more pages.
diff --git a/mm/rmap.c b/mm/rmap.c
index 0feeef860a8..a7d0f548263 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -132,9 +132,14 @@ int anon_vma_prepare(struct vm_area_struct *vma)
if (unlikely(!anon_vma))
goto out_enomem_free_avc;
allocated = anon_vma;
+ /*
+ * This VMA had no anon_vma yet. This anon_vma is
+ * the root of any anon_vma tree that might form.
+ */
+ anon_vma->root = anon_vma;
}
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
/* page_table_lock to protect against threads */
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
@@ -142,12 +147,12 @@ int anon_vma_prepare(struct vm_area_struct *vma)
avc->anon_vma = anon_vma;
avc->vma = vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
- list_add(&avc->same_anon_vma, &anon_vma->head);
+ list_add_tail(&avc->same_anon_vma, &anon_vma->head);
allocated = NULL;
avc = NULL;
}
spin_unlock(&mm->page_table_lock);
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
if (unlikely(allocated))
anon_vma_free(allocated);
@@ -170,9 +175,9 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
avc->anon_vma = anon_vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
list_add_tail(&avc->same_anon_vma, &anon_vma->head);
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
}
/*
@@ -224,9 +229,21 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
avc = anon_vma_chain_alloc();
if (!avc)
goto out_error_free_anon_vma;
- anon_vma_chain_link(vma, avc, anon_vma);
+
+ /*
+ * The root anon_vma's spinlock is the lock actually used when we
+ * lock any of the anon_vmas in this anon_vma tree.
+ */
+ anon_vma->root = pvma->anon_vma->root;
+ /*
+ * With KSM refcounts, an anon_vma can stay around longer than the
+ * process it belongs to. The root anon_vma needs to be pinned
+ * until this anon_vma is freed, because the lock lives in the root.
+ */
+ get_anon_vma(anon_vma->root);
/* Mark this anon_vma as the one where our new (COWed) pages go. */
vma->anon_vma = anon_vma;
+ anon_vma_chain_link(vma, avc, anon_vma);
return 0;
@@ -246,22 +263,29 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
if (!anon_vma)
return;
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
list_del(&anon_vma_chain->same_anon_vma);
/* We must garbage collect the anon_vma if it's empty */
- empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
- spin_unlock(&anon_vma->lock);
+ empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma);
+ anon_vma_unlock(anon_vma);
- if (empty)
+ if (empty) {
+ /* We no longer need the root anon_vma */
+ if (anon_vma->root != anon_vma)
+ drop_anon_vma(anon_vma->root);
anon_vma_free(anon_vma);
+ }
}
void unlink_anon_vmas(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc, *next;
- /* Unlink each anon_vma chained to the VMA. */
+ /*
+ * Unlink each anon_vma chained to the VMA. This list is ordered
+ * from newest to oldest, ensuring the root anon_vma gets freed last.
+ */
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
anon_vma_unlink(avc);
list_del(&avc->same_vma);
@@ -274,7 +298,7 @@ static void anon_vma_ctor(void *data)
struct anon_vma *anon_vma = data;
spin_lock_init(&anon_vma->lock);
- ksm_refcount_init(anon_vma);
+ anonvma_external_refcount_init(anon_vma);
INIT_LIST_HEAD(&anon_vma->head);
}
@@ -302,7 +326,7 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
return anon_vma;
out:
rcu_read_unlock();
@@ -311,7 +335,7 @@ out:
void page_unlock_anon_vma(struct anon_vma *anon_vma)
{
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
rcu_read_unlock();
}
@@ -340,9 +364,10 @@ vma_address(struct page *page, struct vm_area_struct *vma)
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
- if (PageAnon(page))
- ;
- else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
+ if (PageAnon(page)) {
+ if (vma->anon_vma->root != page_anon_vma(page)->root)
+ return -EFAULT;
+ } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
if (!vma->vm_file ||
vma->vm_file->f_mapping != page->mapping)
return -EFAULT;
@@ -743,14 +768,20 @@ static void __page_set_anon_rmap(struct page *page,
* If the page isn't exclusively mapped into this vma,
* we must use the _oldest_ possible anon_vma for the
* page mapping!
- *
- * So take the last AVC chain entry in the vma, which is
- * the deepest ancestor, and use the anon_vma from that.
*/
if (!exclusive) {
- struct anon_vma_chain *avc;
- avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma);
- anon_vma = avc->anon_vma;
+ if (PageAnon(page))
+ return;
+ anon_vma = anon_vma->root;
+ } else {
+ /*
+ * In this case, swapped-out-but-not-discarded swap-cache
+ * is remapped, so there is no need to update page->mapping here.
+ * We can trust that the anon_vma pointed to by page->mapping is not
+ * obsolete, because vma->anon_vma must belong to the same family.
+ */
+ if (PageAnon(page))
+ return;
}
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
@@ -780,6 +811,7 @@ static void __page_check_anon_rmap(struct page *page,
* are initially only visible via the pagetables, and the pte is locked
* over the call to page_add_new_anon_rmap.
*/
+ BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
BUG_ON(page->index != linear_page_index(vma, address));
#endif
}
@@ -798,6 +830,17 @@ static void __page_check_anon_rmap(struct page *page,
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
+ do_page_add_anon_rmap(page, vma, address, 0);
+}
+
+/*
+ * Special version of the above for do_swap_page, which often runs
+ * into pages that are exclusively owned by the current process.
+ * Everybody else should continue to use page_add_anon_rmap above.
+ */
+void do_page_add_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
int first = atomic_inc_and_test(&page->_mapcount);
if (first)
__inc_zone_page_state(page, NR_ANON_PAGES);
@@ -807,7 +850,7 @@ void page_add_anon_rmap(struct page *page,
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
if (first)
- __page_set_anon_rmap(page, vma, address, 0);
+ __page_set_anon_rmap(page, vma, address, exclusive);
else
__page_check_anon_rmap(page, vma, address);
}
@@ -1131,6 +1174,20 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
return ret;
}
+static bool is_vma_temporary_stack(struct vm_area_struct *vma)
+{
+ int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
+
+ if (!maybe_stack)
+ return false;
+
+ if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
+ VM_STACK_INCOMPLETE_SETUP)
+ return true;
+
+ return false;
+}
+
/**
* try_to_unmap_anon - unmap or unlock anonymous page using the object-based
* rmap method
@@ -1159,7 +1216,21 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
struct vm_area_struct *vma = avc->vma;
- unsigned long address = vma_address(page, vma);
+ unsigned long address;
+
+ /*
+ * During exec, a temporary VMA is setup and later moved.
+ * The VMA is moved under the anon_vma lock but not the
+ * page tables leading to a race where migration cannot
+ * find the migration ptes. Rather than increasing the
+ * locking requirements of exec(), migration skips
+ * temporary VMAs until after exec() completes.
+ */
+ if (PAGE_MIGRATION && (flags & TTU_MIGRATION) &&
+ is_vma_temporary_stack(vma))
+ continue;
+
+ address = vma_address(page, vma);
if (address == -EFAULT)
continue;
ret = try_to_unmap_one(page, vma, address, flags);
@@ -1340,6 +1411,42 @@ int try_to_munlock(struct page *page)
return try_to_unmap_file(page, TTU_MUNLOCK);
}
+#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION)
+/*
+ * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root
+ * if necessary. Be careful to do all the tests under the lock. Once
+ * we know we are the last user, nobody else can get a reference and we
+ * can do the freeing without the lock.
+ */
+void drop_anon_vma(struct anon_vma *anon_vma)
+{
+ BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0);
+ if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
+ struct anon_vma *root = anon_vma->root;
+ int empty = list_empty(&anon_vma->head);
+ int last_root_user = 0;
+ int root_empty = 0;
+
+ /*
+ * The refcount on a non-root anon_vma got dropped. Drop
+ * the refcount on the root and check if we need to free it.
+ */
+ if (empty && anon_vma != root) {
+ BUG_ON(atomic_read(&root->external_refcount) <= 0);
+ last_root_user = atomic_dec_and_test(&root->external_refcount);
+ root_empty = list_empty(&root->head);
+ }
+ anon_vma_unlock(anon_vma);
+
+ if (empty) {
+ anon_vma_free(anon_vma);
+ if (root_empty && last_root_user)
+ anon_vma_free(root);
+ }
+ }
+}
+#endif
+
#ifdef CONFIG_MIGRATION
/*
* rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
@@ -1355,15 +1462,13 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
/*
* Note: remove_migration_ptes() cannot use page_lock_anon_vma()
* because that depends on page_mapped(); but not all its usages
- * are holding mmap_sem, which also gave the necessary guarantee
- * (that this anon_vma's slab has not already been destroyed).
- * This needs to be reviewed later: avoiding page_lock_anon_vma()
- * is risky, and currently limits the usefulness of rmap_walk().
+ * are holding mmap_sem. Users without mmap_sem are required to
+ * take a reference count to prevent the anon_vma from disappearing.
*/
anon_vma = page_anon_vma(page);
if (!anon_vma)
return ret;
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
@@ -1373,7 +1478,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
if (ret != SWAP_AGAIN)
break;
}
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
return ret;
}
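
The rmap hunks above convert every direct spin_lock(&anon_vma->lock) into anon_vma_lock()/anon_vma_unlock(). Those helpers live outside mm/rmap.c and are not shown in this diff; judging from drop_anon_vma(), which takes anon_vma->root->lock, they presumably reduce to locking the root anon_vma's spinlock, so every anon_vma in one tree serializes on a single lock. A minimal sketch under that assumption:

static inline void anon_vma_lock(struct anon_vma *anon_vma)
{
	/* every anon_vma in a tree shares the root's spinlock */
	spin_lock(&anon_vma->root->lock);
}

static inline void anon_vma_unlock(struct anon_vma *anon_vma)
{
	spin_unlock(&anon_vma->root->lock);
}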
diff --git a/mm/shmem.c b/mm/shmem.c
index eef4ebea515..dfaa0f4e978 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,6 +28,7 @@
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/percpu_counter.h>
#include <linux/swap.h>
static struct vfsmount *shm_mnt;
@@ -233,10 +234,10 @@ static void shmem_free_blocks(struct inode *inode, long pages)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks) {
- spin_lock(&sbinfo->stat_lock);
- sbinfo->free_blocks += pages;
+ percpu_counter_add(&sbinfo->used_blocks, -pages);
+ spin_lock(&inode->i_lock);
inode->i_blocks -= pages*BLOCKS_PER_PAGE;
- spin_unlock(&sbinfo->stat_lock);
+ spin_unlock(&inode->i_lock);
}
}
@@ -416,25 +417,21 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
if (sgp == SGP_READ)
return shmem_swp_map(ZERO_PAGE(0));
/*
- * Test free_blocks against 1 not 0, since we have 1 data
+ * Test used_blocks against max_blocks - 1, since we have 1 data
* page (and perhaps indirect index pages) yet to allocate:
* a waste to allocate index if we cannot allocate data.
*/
if (sbinfo->max_blocks) {
- spin_lock(&sbinfo->stat_lock);
- if (sbinfo->free_blocks <= 1) {
- spin_unlock(&sbinfo->stat_lock);
+ if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0)
return ERR_PTR(-ENOSPC);
- }
- sbinfo->free_blocks--;
+ percpu_counter_inc(&sbinfo->used_blocks);
+ spin_lock(&inode->i_lock);
inode->i_blocks += BLOCKS_PER_PAGE;
- spin_unlock(&sbinfo->stat_lock);
+ spin_unlock(&inode->i_lock);
}
spin_unlock(&info->lock);
page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
- if (page)
- set_page_private(page, 0);
spin_lock(&info->lock);
if (!page) {
@@ -729,10 +726,11 @@ done2:
if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
/*
* Call truncate_inode_pages again: racing shmem_unuse_inode
- * may have swizzled a page in from swap since vmtruncate or
- * generic_delete_inode did it, before we lowered next_index.
- * Also, though shmem_getpage checks i_size before adding to
- * cache, no recheck after: so fix the narrow window there too.
+ * may have swizzled a page in from swap since
+ * truncate_pagecache or generic_delete_inode did it, before we
+ * lowered next_index. Also, though shmem_getpage checks
+ * i_size before adding to cache, no recheck after: so fix the
+ * narrow window there too.
*
* Recalling truncate_inode_pages_range and unmap_mapping_range
* every time for punch_hole (which never got a chance to clear
@@ -762,19 +760,21 @@ done2:
}
}
-static void shmem_truncate(struct inode *inode)
-{
- shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
-}
-
static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
- struct page *page = NULL;
+ loff_t newsize = attr->ia_size;
int error;
- if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
- if (attr->ia_size < inode->i_size) {
+ error = inode_change_ok(inode, attr);
+ if (error)
+ return error;
+
+ if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)
+ && newsize != inode->i_size) {
+ struct page *page = NULL;
+
+ if (newsize < inode->i_size) {
/*
* If truncating down to a partial page, then
* if that page is already allocated, hold it
@@ -782,9 +782,9 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 * truncate_partial_page cannot miss it were
* it assigned to swap.
*/
- if (attr->ia_size & (PAGE_CACHE_SIZE-1)) {
+ if (newsize & (PAGE_CACHE_SIZE-1)) {
(void) shmem_getpage(inode,
- attr->ia_size>>PAGE_CACHE_SHIFT,
+ newsize >> PAGE_CACHE_SHIFT,
&page, SGP_READ, NULL);
if (page)
unlock_page(page);
@@ -796,36 +796,38 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
* if it's being fully truncated to zero-length: the
* nrpages check is efficient enough in that case.
*/
- if (attr->ia_size) {
+ if (newsize) {
struct shmem_inode_info *info = SHMEM_I(inode);
spin_lock(&info->lock);
info->flags &= ~SHMEM_PAGEIN;
spin_unlock(&info->lock);
}
}
+
+ /* XXX(truncate): truncate_setsize should be called last */
+ truncate_setsize(inode, newsize);
+ if (page)
+ page_cache_release(page);
+ shmem_truncate_range(inode, newsize, (loff_t)-1);
}
- error = inode_change_ok(inode, attr);
- if (!error)
- error = inode_setattr(inode, attr);
+ setattr_copy(inode, attr);
#ifdef CONFIG_TMPFS_POSIX_ACL
- if (!error && (attr->ia_valid & ATTR_MODE))
+ if (attr->ia_valid & ATTR_MODE)
error = generic_acl_chmod(inode);
#endif
- if (page)
- page_cache_release(page);
return error;
}
-static void shmem_delete_inode(struct inode *inode)
+static void shmem_evict_inode(struct inode *inode)
{
struct shmem_inode_info *info = SHMEM_I(inode);
- if (inode->i_op->truncate == shmem_truncate) {
+ if (inode->i_mapping->a_ops == &shmem_aops) {
truncate_inode_pages(inode->i_mapping, 0);
shmem_unacct_size(info->flags, inode->i_size);
inode->i_size = 0;
- shmem_truncate(inode);
+ shmem_truncate_range(inode, 0, (loff_t)-1);
if (!list_empty(&info->swaplist)) {
mutex_lock(&shmem_swaplist_mutex);
list_del_init(&info->swaplist);
@@ -834,7 +836,7 @@ static void shmem_delete_inode(struct inode *inode)
}
BUG_ON(inode->i_blocks);
shmem_free_inode(inode->i_sb);
- clear_inode(inode);
+ end_writeback(inode);
}
static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
@@ -931,7 +933,7 @@ found:
/*
* Move _head_ to start search for next from here.
- * But be careful: shmem_delete_inode checks list_empty without taking
+ * But be careful: shmem_evict_inode checks list_empty without taking
* mutex, and there's an instant in list_move_tail when info->swaplist
* would appear empty, if it were the only one on shmem_swaplist. We
* could avoid doing it if inode NULL; or use this minor optimization.
@@ -1221,6 +1223,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
struct shmem_sb_info *sbinfo;
struct page *filepage = *pagep;
struct page *swappage;
+ struct page *prealloc_page = NULL;
swp_entry_t *entry;
swp_entry_t swap;
gfp_t gfp;
@@ -1245,7 +1248,6 @@ repeat:
filepage = find_lock_page(mapping, idx);
if (filepage && PageUptodate(filepage))
goto done;
- error = 0;
gfp = mapping_gfp_mask(mapping);
if (!filepage) {
/*
@@ -1256,7 +1258,19 @@ repeat:
if (error)
goto failed;
radix_tree_preload_end();
+ if (sgp != SGP_READ && !prealloc_page) {
+ /* We don't care if this fails */
+ prealloc_page = shmem_alloc_page(gfp, info, idx);
+ if (prealloc_page) {
+ if (mem_cgroup_cache_charge(prealloc_page,
+ current->mm, GFP_KERNEL)) {
+ page_cache_release(prealloc_page);
+ prealloc_page = NULL;
+ }
+ }
+ }
}
+ error = 0;
spin_lock(&info->lock);
shmem_recalc_inode(inode);
@@ -1385,17 +1399,16 @@ repeat:
shmem_swp_unmap(entry);
sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks) {
- spin_lock(&sbinfo->stat_lock);
- if (sbinfo->free_blocks == 0 ||
+ if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) ||
shmem_acct_block(info->flags)) {
- spin_unlock(&sbinfo->stat_lock);
spin_unlock(&info->lock);
error = -ENOSPC;
goto failed;
}
- sbinfo->free_blocks--;
+ percpu_counter_inc(&sbinfo->used_blocks);
+ spin_lock(&inode->i_lock);
inode->i_blocks += BLOCKS_PER_PAGE;
- spin_unlock(&sbinfo->stat_lock);
+ spin_unlock(&inode->i_lock);
} else if (shmem_acct_block(info->flags)) {
spin_unlock(&info->lock);
error = -ENOSPC;
@@ -1405,28 +1418,38 @@ repeat:
if (!filepage) {
int ret;
- spin_unlock(&info->lock);
- filepage = shmem_alloc_page(gfp, info, idx);
- if (!filepage) {
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- error = -ENOMEM;
- goto failed;
- }
- SetPageSwapBacked(filepage);
+ if (!prealloc_page) {
+ spin_unlock(&info->lock);
+ filepage = shmem_alloc_page(gfp, info, idx);
+ if (!filepage) {
+ shmem_unacct_blocks(info->flags, 1);
+ shmem_free_blocks(inode, 1);
+ error = -ENOMEM;
+ goto failed;
+ }
+ SetPageSwapBacked(filepage);
- /* Precharge page while we can wait, compensate after */
- error = mem_cgroup_cache_charge(filepage, current->mm,
- GFP_KERNEL);
- if (error) {
- page_cache_release(filepage);
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- filepage = NULL;
- goto failed;
+ /*
+ * Precharge page while we can wait, compensate
+ * after
+ */
+ error = mem_cgroup_cache_charge(filepage,
+ current->mm, GFP_KERNEL);
+ if (error) {
+ page_cache_release(filepage);
+ shmem_unacct_blocks(info->flags, 1);
+ shmem_free_blocks(inode, 1);
+ filepage = NULL;
+ goto failed;
+ }
+
+ spin_lock(&info->lock);
+ } else {
+ filepage = prealloc_page;
+ prealloc_page = NULL;
+ SetPageSwapBacked(filepage);
}
- spin_lock(&info->lock);
entry = shmem_swp_alloc(info, idx, sgp);
if (IS_ERR(entry))
error = PTR_ERR(entry);
@@ -1467,13 +1490,19 @@ repeat:
}
done:
*pagep = filepage;
- return 0;
+ error = 0;
+ goto out;
failed:
if (*pagep != filepage) {
unlock_page(filepage);
page_cache_release(filepage);
}
+out:
+ if (prealloc_page) {
+ mem_cgroup_uncharge_cache_page(prealloc_page);
+ page_cache_release(prealloc_page);
+ }
return error;
}
@@ -1545,8 +1574,8 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
-static struct inode *shmem_get_inode(struct super_block *sb, int mode,
- dev_t dev, unsigned long flags)
+static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
+ int mode, dev_t dev, unsigned long flags)
{
struct inode *inode;
struct shmem_inode_info *info;
@@ -1557,9 +1586,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode,
inode = new_inode(sb);
if (inode) {
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
+ inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -1791,17 +1818,16 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = TMPFS_MAGIC;
buf->f_bsize = PAGE_CACHE_SIZE;
buf->f_namelen = NAME_MAX;
- spin_lock(&sbinfo->stat_lock);
if (sbinfo->max_blocks) {
buf->f_blocks = sbinfo->max_blocks;
- buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+ buf->f_bavail = buf->f_bfree =
+ sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks);
}
if (sbinfo->max_inodes) {
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
}
/* else leave those fields 0 like simple_statfs */
- spin_unlock(&sbinfo->stat_lock);
return 0;
}
@@ -1814,7 +1840,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
struct inode *inode;
int error = -ENOSPC;
- inode = shmem_get_inode(dir->i_sb, mode, dev, VM_NORESERVE);
+ inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (inode) {
error = security_inode_init_security(inode, dir, NULL, NULL,
NULL);
@@ -1833,11 +1859,6 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
#else
error = 0;
#endif
- if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- inode->i_mode |= S_ISGID;
- }
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
d_instantiate(dentry, inode);
@@ -1957,7 +1978,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
if (len > PAGE_CACHE_SIZE)
return -ENAMETOOLONG;
- inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
+ inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
if (!inode)
return -ENOSPC;
@@ -1992,8 +2013,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
unlock_page(page);
page_cache_release(page);
}
- if (dir->i_mode & S_ISGID)
- inode->i_gid = dir->i_gid;
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
d_instantiate(dentry, inode);
@@ -2033,7 +2052,6 @@ static const struct inode_operations shmem_symlink_inline_operations = {
};
static const struct inode_operations shmem_symlink_inode_operations = {
- .truncate = shmem_truncate,
.readlink = generic_readlink,
.follow_link = shmem_follow_link,
.put_link = shmem_put_link,
@@ -2071,14 +2089,14 @@ static int shmem_xattr_security_set(struct dentry *dentry, const char *name,
size, flags);
}
-static struct xattr_handler shmem_xattr_security_handler = {
+static const struct xattr_handler shmem_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = shmem_xattr_security_list,
.get = shmem_xattr_security_get,
.set = shmem_xattr_security_set,
};
-static struct xattr_handler *shmem_xattr_handlers[] = {
+static const struct xattr_handler *shmem_xattr_handlers[] = {
&generic_acl_access_handler,
&generic_acl_default_handler,
&shmem_xattr_security_handler,
@@ -2250,7 +2268,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
struct shmem_sb_info config = *sbinfo;
- unsigned long blocks;
unsigned long inodes;
int error = -EINVAL;
@@ -2258,9 +2275,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
return error;
spin_lock(&sbinfo->stat_lock);
- blocks = sbinfo->max_blocks - sbinfo->free_blocks;
inodes = sbinfo->max_inodes - sbinfo->free_inodes;
- if (config.max_blocks < blocks)
+ if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
goto out;
if (config.max_inodes < inodes)
goto out;
@@ -2277,7 +2293,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
error = 0;
sbinfo->max_blocks = config.max_blocks;
- sbinfo->free_blocks = config.max_blocks - blocks;
sbinfo->max_inodes = config.max_inodes;
sbinfo->free_inodes = config.max_inodes - inodes;
@@ -2352,7 +2367,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
#endif
spin_lock_init(&sbinfo->stat_lock);
- sbinfo->free_blocks = sbinfo->max_blocks;
+ percpu_counter_init(&sbinfo->used_blocks, 0);
sbinfo->free_inodes = sbinfo->max_inodes;
sb->s_maxbytes = SHMEM_MAX_BYTES;
@@ -2366,7 +2381,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
sb->s_flags |= MS_POSIXACL;
#endif
- inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
+ inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
if (!inode)
goto failed;
inode->i_uid = sbinfo->uid;
@@ -2444,14 +2459,13 @@ static const struct file_operations shmem_file_operations = {
.write = do_sync_write,
.aio_read = shmem_file_aio_read,
.aio_write = generic_file_aio_write,
- .fsync = simple_sync_file,
+ .fsync = noop_fsync,
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
#endif
};
static const struct inode_operations shmem_inode_operations = {
- .truncate = shmem_truncate,
.setattr = shmem_notify_change,
.truncate_range = shmem_truncate_range,
#ifdef CONFIG_TMPFS_POSIX_ACL
@@ -2505,7 +2519,7 @@ static const struct super_operations shmem_ops = {
.remount_fs = shmem_remount_fs,
.show_options = shmem_show_options,
#endif
- .delete_inode = shmem_delete_inode,
+ .evict_inode = shmem_evict_inode,
.drop_inode = generic_delete_inode,
.put_super = shmem_put_super,
};
@@ -2570,6 +2584,45 @@ out4:
return error;
}
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/**
+ * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
+ * @inode: the inode to be searched
+ * @pgoff: the offset to be searched
+ * @pagep: the pointer for the found page to be stored
+ * @ent: the pointer for the found swap entry to be stored
+ *
+ * If a page is found, its refcount is incremented. The caller should
+ * handle this reference.
+ */
+void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
+ struct page **pagep, swp_entry_t *ent)
+{
+ swp_entry_t entry = { .val = 0 }, *ptr;
+ struct page *page = NULL;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+
+ if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
+ goto out;
+
+ spin_lock(&info->lock);
+ ptr = shmem_swp_entry(info, pgoff, NULL);
+#ifdef CONFIG_SWAP
+ if (ptr && ptr->val) {
+ entry.val = ptr->val;
+ page = find_get_page(&swapper_space, entry.val);
+ } else
+#endif
+ page = find_get_page(inode->i_mapping, pgoff);
+ if (ptr)
+ shmem_swp_unmap(ptr);
+ spin_unlock(&info->lock);
+out:
+ *pagep = page;
+ *ent = entry;
+}
+#endif
+
#else /* !CONFIG_SHMEM */
/*
@@ -2609,9 +2662,34 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
return 0;
}
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/**
+ * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
+ * @inode: the inode to be searched
+ * @pgoff: the offset to be searched
+ * @pagep: the pointer for the found page to be stored
+ * @ent: the pointer for the found swap entry to be stored
+ *
+ * If a page is found, its refcount is incremented. The caller should
+ * handle this reference.
+ */
+void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
+ struct page **pagep, swp_entry_t *ent)
+{
+ struct page *page = NULL;
+
+ if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
+ goto out;
+ page = find_get_page(inode->i_mapping, pgoff);
+out:
+ *pagep = page;
+ *ent = (swp_entry_t){ .val = 0 };
+}
+#endif
+
#define shmem_vm_ops generic_file_vm_ops
#define shmem_file_operations ramfs_file_operations
-#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev)
+#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size) 0
#define shmem_unacct_size(flags, size) do {} while (0)
#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE
@@ -2655,7 +2733,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
path.mnt = mntget(shm_mnt);
error = -ENOSPC;
- inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags);
+ inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
if (!inode)
goto put_dentry;
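
The shmem hunks above drop the stat_lock-protected free_blocks count in favour of a percpu_counter of used blocks, so block accounting on the common paths no longer serializes on a single superblock spinlock; the inode->i_blocks updates move under inode->i_lock instead. A rough sketch of the reserve/release pattern, mirroring the comparison against max_blocks used in the hunks (shmem_reserve_block() and shmem_release_blocks() are hypothetical names for illustration only):

static int shmem_reserve_block(struct shmem_sb_info *sbinfo)
{
	if (!sbinfo->max_blocks)
		return 0;		/* no block limit configured */
	if (percpu_counter_compare(&sbinfo->used_blocks,
				   sbinfo->max_blocks) > 0)
		return -ENOSPC;		/* over the limit */
	percpu_counter_inc(&sbinfo->used_blocks);
	return 0;
}

static void shmem_release_blocks(struct shmem_sb_info *sbinfo, long pages)
{
	percpu_counter_add(&sbinfo->used_blocks, -pages);
}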
diff --git a/mm/slab.c b/mm/slab.c
index bac0f4fcc21..88435fcc838 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -102,7 +102,6 @@
#include <linux/cpu.h>
#include <linux/sysctl.h>
#include <linux/module.h>
-#include <linux/kmemtrace.h>
#include <linux/rcupdate.h>
#include <linux/string.h>
#include <linux/uaccess.h>
@@ -115,6 +114,7 @@
#include <linux/reciprocal_div.h>
#include <linux/debugobjects.h>
#include <linux/kmemcheck.h>
+#include <linux/memory.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -144,30 +144,6 @@
#define BYTES_PER_WORD sizeof(void *)
#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
-#ifndef ARCH_KMALLOC_MINALIGN
-/*
- * Enforce a minimum alignment for the kmalloc caches.
- * Usually, the kmalloc caches are cache_line_size() aligned, except when
- * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
- * Some archs want to perform DMA into kmalloc caches and need a guaranteed
- * alignment larger than the alignment of a 64-bit integer.
- * ARCH_KMALLOC_MINALIGN allows that.
- * Note that increasing this value may disable some debug features.
- */
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-/*
- * Enforce a minimum alignment for all caches.
- * Intended for archs that get misalignment faults even for BYTES_PER_WORD
- * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
- * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
- * some debug features.
- */
-#define ARCH_SLAB_MINALIGN 0
-#endif
-
#ifndef ARCH_KMALLOC_FLAGS
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif
@@ -418,7 +394,7 @@ static void kmem_list3_init(struct kmem_list3 *parent)
#define STATS_DEC_ACTIVE(x) do { } while (0)
#define STATS_INC_ALLOCED(x) do { } while (0)
#define STATS_INC_GROWN(x) do { } while (0)
-#define STATS_ADD_REAPED(x,y) do { } while (0)
+#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
#define STATS_SET_HIGH(x) do { } while (0)
#define STATS_INC_ERR(x) do { } while (0)
#define STATS_INC_NODEALLOCS(x) do { } while (0)
@@ -844,7 +820,7 @@ static void init_reap_node(int cpu)
{
int node;
- node = next_node(cpu_to_node(cpu), node_online_map);
+ node = next_node(cpu_to_mem(cpu), node_online_map);
if (node == MAX_NUMNODES)
node = first_node(node_online_map);
@@ -884,7 +860,7 @@ static void __cpuinit start_cpu_timer(int cpu)
*/
if (keventd_up() && reap_work->work.func == NULL) {
init_reap_node(cpu);
- INIT_DELAYED_WORK(reap_work, cache_reap);
+ INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap);
schedule_delayed_work_on(cpu, reap_work,
__round_jiffies_relative(HZ, cpu));
}
@@ -1073,7 +1049,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
struct array_cache *alien = NULL;
int node;
- node = numa_node_id();
+ node = numa_mem_id();
/*
* Make sure we are not freeing a object from another node to the array
@@ -1102,11 +1078,57 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
}
#endif
+/*
+ * Allocates and initializes nodelists for a node on each slab cache, used for
+ * either memory or cpu hotplug. If memory is being hot-added, the kmem_list3
+ * will be allocated off-node since memory is not yet online for the new node.
+ * When hotplugging memory or a cpu, existing nodelists are not replaced if
+ * already in use.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int init_cache_nodelists_node(int node)
+{
+ struct kmem_cache *cachep;
+ struct kmem_list3 *l3;
+ const int memsize = sizeof(struct kmem_list3);
+
+ list_for_each_entry(cachep, &cache_chain, next) {
+ /*
+ * Set up the size64 kmemlist for cpu before we can
+ * begin anything. Make sure some other cpu on this
+ * node has not already allocated this
+ */
+ if (!cachep->nodelists[node]) {
+ l3 = kmalloc_node(memsize, GFP_KERNEL, node);
+ if (!l3)
+ return -ENOMEM;
+ kmem_list3_init(l3);
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+
+ /*
+ * The l3s don't come and go as CPUs come and
+ * go. cache_chain_mutex is sufficient
+ * protection here.
+ */
+ cachep->nodelists[node] = l3;
+ }
+
+ spin_lock_irq(&cachep->nodelists[node]->list_lock);
+ cachep->nodelists[node]->free_limit =
+ (1 + nr_cpus_node(node)) *
+ cachep->batchcount + cachep->num;
+ spin_unlock_irq(&cachep->nodelists[node]->list_lock);
+ }
+ return 0;
+}
+
static void __cpuinit cpuup_canceled(long cpu)
{
struct kmem_cache *cachep;
struct kmem_list3 *l3 = NULL;
- int node = cpu_to_node(cpu);
+ int node = cpu_to_mem(cpu);
const struct cpumask *mask = cpumask_of_node(node);
list_for_each_entry(cachep, &cache_chain, next) {
@@ -1171,8 +1193,8 @@ static int __cpuinit cpuup_prepare(long cpu)
{
struct kmem_cache *cachep;
struct kmem_list3 *l3 = NULL;
- int node = cpu_to_node(cpu);
- const int memsize = sizeof(struct kmem_list3);
+ int node = cpu_to_mem(cpu);
+ int err;
/*
* We need to do this right in the beginning since
@@ -1180,35 +1202,9 @@ static int __cpuinit cpuup_prepare(long cpu)
* kmalloc_node allows us to add the slab to the right
* kmem_list3 and not this cpu's kmem_list3
*/
-
- list_for_each_entry(cachep, &cache_chain, next) {
- /*
- * Set up the size64 kmemlist for cpu before we can
- * begin anything. Make sure some other cpu on this
- * node has not already allocated this
- */
- if (!cachep->nodelists[node]) {
- l3 = kmalloc_node(memsize, GFP_KERNEL, node);
- if (!l3)
- goto bad;
- kmem_list3_init(l3);
- l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-
- /*
- * The l3s don't come and go as CPUs come and
- * go. cache_chain_mutex is sufficient
- * protection here.
- */
- cachep->nodelists[node] = l3;
- }
-
- spin_lock_irq(&cachep->nodelists[node]->list_lock);
- cachep->nodelists[node]->free_limit =
- (1 + nr_cpus_node(node)) *
- cachep->batchcount + cachep->num;
- spin_unlock_irq(&cachep->nodelists[node]->list_lock);
- }
+ err = init_cache_nodelists_node(node);
+ if (err < 0)
+ goto bad;
/*
* Now we can go ahead with allocating the shared arrays and
@@ -1324,18 +1320,82 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
mutex_unlock(&cache_chain_mutex);
break;
}
- return err ? NOTIFY_BAD : NOTIFY_OK;
+ return notifier_from_errno(err);
}
static struct notifier_block __cpuinitdata cpucache_notifier = {
&cpuup_callback, NULL, 0
};
+#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * Drains freelist for a node on each slab cache, used for memory hot-remove.
+ * Returns -EBUSY if all objects cannot be drained so that the node is not
+ * removed.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int __meminit drain_cache_nodelists_node(int node)
+{
+ struct kmem_cache *cachep;
+ int ret = 0;
+
+ list_for_each_entry(cachep, &cache_chain, next) {
+ struct kmem_list3 *l3;
+
+ l3 = cachep->nodelists[node];
+ if (!l3)
+ continue;
+
+ drain_freelist(cachep, l3, l3->free_objects);
+
+ if (!list_empty(&l3->slabs_full) ||
+ !list_empty(&l3->slabs_partial)) {
+ ret = -EBUSY;
+ break;
+ }
+ }
+ return ret;
+}
+
+static int __meminit slab_memory_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ struct memory_notify *mnb = arg;
+ int ret = 0;
+ int nid;
+
+ nid = mnb->status_change_nid;
+ if (nid < 0)
+ goto out;
+
+ switch (action) {
+ case MEM_GOING_ONLINE:
+ mutex_lock(&cache_chain_mutex);
+ ret = init_cache_nodelists_node(nid);
+ mutex_unlock(&cache_chain_mutex);
+ break;
+ case MEM_GOING_OFFLINE:
+ mutex_lock(&cache_chain_mutex);
+ ret = drain_cache_nodelists_node(nid);
+ mutex_unlock(&cache_chain_mutex);
+ break;
+ case MEM_ONLINE:
+ case MEM_OFFLINE:
+ case MEM_CANCEL_ONLINE:
+ case MEM_CANCEL_OFFLINE:
+ break;
+ }
+out:
+ return ret ? notifier_from_errno(ret) : NOTIFY_OK;
+}
+#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
+
/*
* swap the static kmem_list3 with kmalloced memory
*/
-static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
- int nodeid)
+static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
+ int nodeid)
{
struct kmem_list3 *ptr;
@@ -1418,7 +1478,7 @@ void __init kmem_cache_init(void)
* 6) Resize the head arrays of the kmalloc caches to their final sizes.
*/
- node = numa_node_id();
+ node = numa_mem_id();
/* 1) create the cache_cache */
INIT_LIST_HEAD(&cache_chain);
@@ -1580,6 +1640,14 @@ void __init kmem_cache_init_late(void)
*/
register_cpu_notifier(&cpucache_notifier);
+#ifdef CONFIG_NUMA
+ /*
+ * Register a memory hotplug callback that initializes and frees
+ * nodelists.
+ */
+ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+#endif
+
/*
* The reap timers are started later, with a module init call: That part
* of the kernel is not yet operational.
@@ -2052,7 +2120,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
}
}
}
- cachep->nodelists[numa_node_id()]->next_reap =
+ cachep->nodelists[numa_mem_id()]->next_reap =
jiffies + REAPTIMEOUT_LIST3 +
((unsigned long)cachep) % REAPTIMEOUT_LIST3;
@@ -2220,8 +2288,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if (ralign < align) {
ralign = align;
}
- /* disable debug if necessary */
- if (ralign > __alignof__(unsigned long long))
+ /* disable debug if not aligning with REDZONE_ALIGN */
+ if (ralign & (__alignof__(unsigned long long) - 1))
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
/*
* 4) Store it.
@@ -2247,8 +2315,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
*/
if (flags & SLAB_RED_ZONE) {
/* add space for red zone words */
- cachep->obj_offset += sizeof(unsigned long long);
- size += 2 * sizeof(unsigned long long);
+ cachep->obj_offset += align;
+ size += align + sizeof(unsigned long long);
}
if (flags & SLAB_STORE_USER) {
/* user store requires one word storage behind the end of
@@ -2383,7 +2451,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep)
{
#ifdef CONFIG_SMP
check_irq_off();
- assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
+ assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock);
#endif
}
@@ -2410,7 +2478,7 @@ static void do_drain(void *arg)
{
struct kmem_cache *cachep = arg;
struct array_cache *ac;
- int node = numa_node_id();
+ int node = numa_mem_id();
check_irq_off();
ac = cpu_cache_get(cachep);
@@ -2943,7 +3011,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
retry:
check_irq_off();
- node = numa_node_id();
+ node = numa_mem_id();
ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3147,11 +3215,13 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
if (in_interrupt() || (flags & __GFP_THISNODE))
return NULL;
- nid_alloc = nid_here = numa_node_id();
+ nid_alloc = nid_here = numa_mem_id();
+ get_mems_allowed();
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
- nid_alloc = cpuset_mem_spread_node();
+ nid_alloc = cpuset_slab_spread_node();
else if (current->mempolicy)
nid_alloc = slab_node(current->mempolicy);
+ put_mems_allowed();
if (nid_alloc != nid_here)
return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
@@ -3178,6 +3248,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
if (flags & __GFP_THISNODE)
return NULL;
+ get_mems_allowed();
zonelist = node_zonelist(slab_node(current->mempolicy), flags);
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
@@ -3209,7 +3280,7 @@ retry:
if (local_flags & __GFP_WAIT)
local_irq_enable();
kmem_flagcheck(cache, flags);
- obj = kmem_getpages(cache, local_flags, numa_node_id());
+ obj = kmem_getpages(cache, local_flags, numa_mem_id());
if (local_flags & __GFP_WAIT)
local_irq_disable();
if (obj) {
@@ -3233,6 +3304,7 @@ retry:
}
}
}
+ put_mems_allowed();
return obj;
}
@@ -3316,6 +3388,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
{
unsigned long save_flags;
void *ptr;
+ int slab_node = numa_mem_id();
flags &= gfp_allowed_mask;
@@ -3328,7 +3401,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
local_irq_save(save_flags);
if (nodeid == -1)
- nodeid = numa_node_id();
+ nodeid = slab_node;
if (unlikely(!cachep->nodelists[nodeid])) {
/* Node not bootstrapped yet */
@@ -3336,7 +3409,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
goto out;
}
- if (nodeid == numa_node_id()) {
+ if (nodeid == slab_node) {
/*
* Use the locally cached objects if possible.
* However ____cache_alloc does not allow fallback
@@ -3380,8 +3453,8 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
* We may just have run out of memory on the local node.
* ____cache_alloc_node() knows how to locate memory on other nodes
*/
- if (!objp)
- objp = ____cache_alloc_node(cache, flags, numa_node_id());
+ if (!objp)
+ objp = ____cache_alloc_node(cache, flags, numa_mem_id());
out:
return objp;
@@ -3478,7 +3551,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
{
int batchcount;
struct kmem_list3 *l3;
- int node = numa_node_id();
+ int node = numa_mem_id();
batchcount = ac->batchcount;
#if DEBUG
@@ -3912,7 +3985,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
return -ENOMEM;
for_each_online_cpu(i) {
- new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
+ new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
batchcount, gfp);
if (!new->new[i]) {
for (i--; i >= 0; i--)
@@ -3934,9 +4007,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
struct array_cache *ccold = new->new[i];
if (!ccold)
continue;
- spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
- free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
- spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
+ spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
+ free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
+ spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
kfree(ccold);
}
kfree(new);
@@ -4042,7 +4115,7 @@ static void cache_reap(struct work_struct *w)
{
struct kmem_cache *searchp;
struct kmem_list3 *l3;
- int node = numa_node_id();
+ int node = numa_mem_id();
struct delayed_work *work = to_delayed_work(w);
if (!mutex_trylock(&cache_chain_mutex))
@@ -4216,10 +4289,11 @@ static int s_show(struct seq_file *m, void *p)
unsigned long node_frees = cachep->node_frees;
unsigned long overflows = cachep->node_overflow;
- seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
- %4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
- reaped, errors, max_freeable, node_allocs,
- node_frees, overflows);
+ seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
+ "%4lu %4lu %4lu %4lu %4lu",
+ allocs, high, grown,
+ reaped, errors, max_freeable, node_allocs,
+ node_frees, overflows);
}
/* cpu stats */
{
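
A small but easy-to-miss change in the slab.c hunks is that the CPU notifier now returns notifier_from_errno(err) instead of the err ? NOTIFY_BAD : NOTIFY_OK pair, and the new slab_memory_callback() follows the same convention, so the errno is propagated to the notifier caller rather than being flattened into NOTIFY_BAD. notifier_from_errno() comes from <linux/notifier.h>; roughly, it behaves like the sketch below (treat the exact encoding as an approximation of the header):

static inline int sketch_notifier_from_errno(int err)
{
	/* encode the negative errno so notifier_to_errno() can recover it */
	if (err)
		return NOTIFY_STOP_MASK | (NOTIFY_OK - err);
	return NOTIFY_OK;
}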
diff --git a/mm/slob.c b/mm/slob.c
index 837ebd64cc3..d582171c810 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -66,8 +66,10 @@
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
-#include <linux/kmemtrace.h>
#include <linux/kmemleak.h>
+
+#include <trace/events/kmem.h>
+
#include <asm/atomic.h>
/*
@@ -394,6 +396,7 @@ static void slob_free(void *block, int size)
slob_t *prev, *next, *b = (slob_t *)block;
slobidx_t units;
unsigned long flags;
+ struct list_head *slob_list;
if (unlikely(ZERO_OR_NULL_PTR(block)))
return;
@@ -422,7 +425,13 @@ static void slob_free(void *block, int size)
set_slob(b, units,
(void *)((unsigned long)(b +
SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
- set_slob_page_free(sp, &free_slob_small);
+ if (size < SLOB_BREAK1)
+ slob_list = &free_slob_small;
+ else if (size < SLOB_BREAK2)
+ slob_list = &free_slob_medium;
+ else
+ slob_list = &free_slob_large;
+ set_slob_page_free(sp, slob_list);
goto out;
}
@@ -467,14 +476,6 @@ out:
* End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
*/
-#ifndef ARCH_KMALLOC_MINALIGN
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long)
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-#define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
-#endif
-
void *__kmalloc_node(size_t size, gfp_t gfp, int node)
{
unsigned int *m;
@@ -647,7 +648,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) {
struct slob_rcu *slob_rcu;
slob_rcu = b + (c->size - sizeof(struct slob_rcu));
- INIT_RCU_HEAD(&slob_rcu->head);
slob_rcu->size = c->size;
call_rcu(&slob_rcu->head, kmem_rcu_free);
} else {
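
The slob.c hunk above changes slob_free() so that a freed page is put back on the free list matching its size class rather than always on free_slob_small (SLOB_BREAK1 and SLOB_BREAK2 are the size-class thresholds defined earlier in slob.c). The selection logic, pulled out into a hypothetical helper for clarity:

static struct list_head *slob_list_for_size(size_t size)
{
	if (size < SLOB_BREAK1)
		return &free_slob_small;
	if (size < SLOB_BREAK2)
		return &free_slob_medium;
	return &free_slob_large;
}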
diff --git a/mm/slub.c b/mm/slub.c
index d2a54fe71ea..13fffe1f0f3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -17,7 +17,6 @@
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#include <linux/kmemtrace.h>
#include <linux/kmemcheck.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
@@ -107,11 +106,17 @@
* the fast path and disables lockless freelists.
*/
+#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
+ SLAB_TRACE | SLAB_DEBUG_FREE)
+
+static inline int kmem_cache_debug(struct kmem_cache *s)
+{
#ifdef CONFIG_SLUB_DEBUG
-#define SLABDEBUG 1
+ return unlikely(s->flags & SLAB_DEBUG_FLAGS);
#else
-#define SLABDEBUG 0
+ return 0;
#endif
+}
/*
* Issues still to be resolved:
@@ -157,21 +162,13 @@
#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
SLAB_CACHE_DMA | SLAB_NOTRACK)
-#ifndef ARCH_KMALLOC_MINALIGN
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
-#endif
-
#define OO_SHIFT 16
#define OO_MASK ((1 << OO_SHIFT) - 1)
#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */
/* Internal SLUB flags */
-#define __OBJECT_POISON 0x80000000 /* Poison object */
-#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */
+#define __OBJECT_POISON 0x80000000UL /* Poison object */
+#define __SYSFS_ADD_DEFERRED 0x40000000UL /* Not yet visible via sysfs */
static int kmem_size = sizeof(struct kmem_cache);
@@ -1081,10 +1078,10 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node,
flags |= __GFP_NOTRACK;
- if (node == -1)
+ if (node == NUMA_NO_NODE)
return alloc_pages(flags, order);
else
- return alloc_pages_node(node, flags, order);
+ return alloc_pages_exact_node(node, flags, order);
}
static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1165,9 +1162,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
inc_slabs_node(s, page_to_nid(page), page->objects);
page->slab = s;
page->flags |= 1 << PG_slab;
- if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
- SLAB_STORE_USER | SLAB_TRACE))
- __SetPageSlubDebug(page);
start = page_address(page);
@@ -1194,14 +1188,13 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
int order = compound_order(page);
int pages = 1 << order;
- if (unlikely(SLABDEBUG && PageSlubDebug(page))) {
+ if (kmem_cache_debug(s)) {
void *p;
slab_pad_check(s, page);
for_each_object(p, s, page_address(page),
page->objects)
check_object(s, page, p, 0);
- __ClearPageSlubDebug(page);
}
kmemcheck_free_shadow(page, compound_order(page));
@@ -1368,6 +1361,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
get_cycles() % 1024 > s->remote_node_defrag_ratio)
return NULL;
+ get_mems_allowed();
zonelist = node_zonelist(slab_node(current->mempolicy), flags);
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
struct kmem_cache_node *n;
@@ -1377,10 +1371,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
n->nr_partial > s->min_partial) {
page = get_partial_node(n);
- if (page)
+ if (page) {
+ put_mems_allowed();
return page;
+ }
}
}
+ put_mems_allowed();
#endif
return NULL;
}
@@ -1391,10 +1388,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
{
struct page *page;
- int searchnode = (node == -1) ? numa_node_id() : node;
+ int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
page = get_partial_node(get_node(s, searchnode));
- if (page || (flags & __GFP_THISNODE))
+ if (page || node != -1)
return page;
return get_any_partial(s, flags);
@@ -1419,8 +1416,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
} else {
stat(s, DEACTIVATE_FULL);
- if (SLABDEBUG && PageSlubDebug(page) &&
- (s->flags & SLAB_STORE_USER))
+ if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER))
add_full(n, page);
}
slab_unlock(page);
@@ -1519,7 +1515,7 @@ static void flush_all(struct kmem_cache *s)
static inline int node_match(struct kmem_cache_cpu *c, int node)
{
#ifdef CONFIG_NUMA
- if (node != -1 && c->node != node)
+ if (node != NUMA_NO_NODE && c->node != node)
return 0;
#endif
return 1;
@@ -1628,7 +1624,7 @@ load_freelist:
object = c->page->freelist;
if (unlikely(!object))
goto another_slab;
- if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
+ if (kmem_cache_debug(s))
goto debug;
c->freelist = get_freepointer(s, object);
@@ -1731,7 +1727,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
- void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags);
@@ -1742,7 +1738,7 @@ EXPORT_SYMBOL(kmem_cache_alloc);
#ifdef CONFIG_TRACING
void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
{
- return slab_alloc(s, gfpflags, -1, _RET_IP_);
+ return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_alloc_notrace);
#endif
@@ -1787,7 +1783,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
stat(s, FREE_SLOWPATH);
slab_lock(page);
- if (unlikely(SLABDEBUG && PageSlubDebug(page)))
+ if (kmem_cache_debug(s))
goto debug;
checks_ok:
@@ -2141,7 +2137,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = s->node[node];
- if (n && n != &s->local_node)
+ if (n)
kmem_cache_free(kmalloc_caches, n);
s->node[node] = NULL;
}
@@ -2150,33 +2146,22 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
{
int node;
- int local_node;
-
- if (slab_state >= UP && (s < kmalloc_caches ||
- s >= kmalloc_caches + KMALLOC_CACHES))
- local_node = page_to_nid(virt_to_page(s));
- else
- local_node = 0;
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n;
- if (local_node == node)
- n = &s->local_node;
- else {
- if (slab_state == DOWN) {
- early_kmem_cache_node_alloc(gfpflags, node);
- continue;
- }
- n = kmem_cache_alloc_node(kmalloc_caches,
- gfpflags, node);
-
- if (!n) {
- free_kmem_cache_nodes(s);
- return 0;
- }
+ if (slab_state == DOWN) {
+ early_kmem_cache_node_alloc(gfpflags, node);
+ continue;
+ }
+ n = kmem_cache_alloc_node(kmalloc_caches,
+ gfpflags, node);
+ if (!n) {
+ free_kmem_cache_nodes(s);
+ return 0;
}
+
s->node[node] = n;
init_kmem_cache_node(n, s);
}
@@ -2429,9 +2414,11 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
#ifdef CONFIG_SLUB_DEBUG
void *addr = page_address(page);
void *p;
- DECLARE_BITMAP(map, page->objects);
+ long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long),
+ GFP_ATOMIC);
- bitmap_zero(map, page->objects);
+ if (!map)
+ return;
slab_err(s, page, "%s", text);
slab_lock(page);
for_each_free_object(p, s, page->freelist)
@@ -2446,6 +2433,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
}
}
slab_unlock(page);
+ kfree(map);
#endif
}
@@ -2502,7 +2490,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
s->refcount--;
if (!s->refcount) {
list_del(&s->list);
- up_write(&slub_lock);
if (kmem_cache_close(s)) {
printk(KERN_ERR "SLUB %s: %s called for cache that "
"still has objects.\n", s->name, __func__);
@@ -2511,8 +2498,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->flags & SLAB_DESTROY_BY_RCU)
rcu_barrier();
sysfs_slab_remove(s);
- } else
- up_write(&slub_lock);
+ }
+ up_write(&slub_lock);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2740,7 +2727,7 @@ void *__kmalloc(size_t size, gfp_t flags)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, flags, -1, _RET_IP_);
+ ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_);
trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
@@ -3130,9 +3117,12 @@ void __init kmem_cache_init(void)
slab_state = UP;
/* Provide the correct kmalloc names now that the caches are up */
- for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
- kmalloc_caches[i]. name =
- kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
+ for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
+ char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
+
+ BUG_ON(!s);
+ kmalloc_caches[i].name = s;
+ }
#ifdef CONFIG_SMP
register_cpu_notifier(&slab_notifier);
@@ -3235,14 +3225,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
*/
s->objsize = max(s->objsize, (int)size);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
- up_write(&slub_lock);
if (sysfs_slab_alias(s, name)) {
- down_write(&slub_lock);
s->refcount--;
- up_write(&slub_lock);
goto err;
}
+ up_write(&slub_lock);
return s;
}
@@ -3251,14 +3239,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
if (kmem_cache_open(s, GFP_KERNEL, name,
size, align, flags, ctor)) {
list_add(&s->list, &slab_caches);
- up_write(&slub_lock);
if (sysfs_slab_add(s)) {
- down_write(&slub_lock);
list_del(&s->list);
- up_write(&slub_lock);
kfree(s);
goto err;
}
+ up_write(&slub_lock);
return s;
}
kfree(s);
@@ -3324,7 +3310,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, gfpflags, -1, caller);
+ ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller);
/* Honor the call site pointer we recieved. */
trace_kmalloc(caller, ret, size, s->size, gfpflags);
@@ -3338,8 +3324,15 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
struct kmem_cache *s;
void *ret;
- if (unlikely(size > SLUB_MAX_SIZE))
- return kmalloc_large_node(size, gfpflags, node);
+ if (unlikely(size > SLUB_MAX_SIZE)) {
+ ret = kmalloc_large_node(size, gfpflags, node);
+
+ trace_kmalloc_node(caller, ret,
+ size, PAGE_SIZE << get_order(size),
+ gfpflags, node);
+
+ return ret;
+ }
s = get_slab(size, gfpflags);
@@ -3400,16 +3393,6 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page,
} else
printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
s->name, page);
-
- if (s->flags & DEBUG_DEFAULT_FLAGS) {
- if (!PageSlubDebug(page))
- printk(KERN_ERR "SLUB %s: SlubDebug not set "
- "on slab 0x%p\n", s->name, page);
- } else {
- if (PageSlubDebug(page))
- printk(KERN_ERR "SLUB %s: SlubDebug set on "
- "slab 0x%p\n", s->name, page);
- }
}
static int validate_slab_node(struct kmem_cache *s,
@@ -3651,10 +3634,10 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
}
static void process_slab(struct loc_track *t, struct kmem_cache *s,
- struct page *page, enum track_item alloc)
+ struct page *page, enum track_item alloc,
+ long *map)
{
void *addr = page_address(page);
- DECLARE_BITMAP(map, page->objects);
void *p;
bitmap_zero(map, page->objects);
@@ -3673,11 +3656,14 @@ static int list_locations(struct kmem_cache *s, char *buf,
unsigned long i;
struct loc_track t = { 0, 0, NULL };
int node;
+ unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
+ sizeof(unsigned long), GFP_KERNEL);
- if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
- GFP_TEMPORARY))
+ if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
+ GFP_TEMPORARY)) {
+ kfree(map);
return sprintf(buf, "Out of memory\n");
-
+ }
/* Push back cpu slabs */
flush_all(s);
@@ -3691,9 +3677,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, lru)
- process_slab(&t, s, page, alloc);
+ process_slab(&t, s, page, alloc, map);
list_for_each_entry(page, &n->full, lru)
- process_slab(&t, s, page, alloc);
+ process_slab(&t, s, page, alloc, map);
spin_unlock_irqrestore(&n->list_lock, flags);
}
@@ -3744,6 +3730,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
}
free_loc_track(&t);
+ kfree(map);
if (!t.count)
len += sprintf(buf, "No data\n");
return len;
@@ -4505,6 +4492,13 @@ static int sysfs_slab_add(struct kmem_cache *s)
static void sysfs_slab_remove(struct kmem_cache *s)
{
+ if (slab_state < SYSFS)
+ /*
+ * Sysfs has not been set up yet, so there is no need to remove
+ * the cache from sysfs.
+ */
+ return;
+
kobject_uevent(&s->kobj, KOBJ_REMOVE);
kobject_del(&s->kobj);
kobject_put(&s->kobj);
@@ -4550,8 +4544,11 @@ static int __init slab_sysfs_init(void)
struct kmem_cache *s;
int err;
+ down_write(&slub_lock);
+
slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
if (!slab_kset) {
+ up_write(&slub_lock);
printk(KERN_ERR "Cannot register slab subsystem.\n");
return -ENOSYS;
}
@@ -4576,6 +4573,7 @@ static int __init slab_sysfs_init(void)
kfree(al);
}
+ up_write(&slub_lock);
resiliency_test();
return 0;
}
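
Two of the slub.c hunks (list_slab_objects() and list_locations()/process_slab()) replace on-stack DECLARE_BITMAP(map, page->objects) declarations with heap allocations, since page->objects is not a compile-time constant and a large slab could overflow the stack. The shared allocation pattern, expressed as a hypothetical helper:

static unsigned long *alloc_object_map(unsigned long nr_objects, gfp_t gfp)
{
	unsigned long *map;

	map = kmalloc(BITS_TO_LONGS(nr_objects) * sizeof(unsigned long), gfp);
	if (map)
		bitmap_zero(map, nr_objects);
	return map;	/* caller must kfree() the map when done */
}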
diff --git a/mm/sparse.c b/mm/sparse.c
index dc0cc4d43ff..95ac219af37 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -382,13 +382,15 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
{
struct page *map;
+ unsigned long size;
map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
if (map)
return map;
- map = alloc_bootmem_pages_node(NODE_DATA(nid),
- PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
+ size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
+ map = __alloc_bootmem_node_high(NODE_DATA(nid), size,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
return map;
}
void __init sparse_mem_maps_populate_node(struct page **map_map,
@@ -412,7 +414,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
}
size = PAGE_ALIGN(size);
- map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count);
+ map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
if (map) {
for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
if (!present_section_nr(pnum))
diff --git a/mm/swap.c b/mm/swap.c
index 7cd60bf0a97..3ce7bc373a5 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -224,6 +224,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru)
____pagevec_lru_add(pvec, lru);
put_cpu_var(lru_add_pvecs);
}
+EXPORT_SYMBOL(__lru_cache_add);
/**
* lru_cache_add_lru - add a page to a page list
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6cd0a8f90dc..1f3f9c59a73 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,6 +47,8 @@ long nr_swap_pages;
long total_swap_pages;
static int least_priority;
+static bool swap_for_hibernation;
+
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
@@ -139,7 +141,8 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
if (nr_blocks) {
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+ nr_blocks, GFP_KERNEL,
+ BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
if (err)
return err;
cond_resched();
@@ -150,7 +153,8 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+ nr_blocks, GFP_KERNEL,
+ BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
if (err)
break;
@@ -189,7 +193,8 @@ static void discard_swap_cluster(struct swap_info_struct *si,
start_block <<= PAGE_SHIFT - 9;
nr_blocks <<= PAGE_SHIFT - 9;
if (blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER))
+ nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT |
+ BLKDEV_IFL_BARRIER))
break;
}
@@ -315,8 +320,10 @@ checks:
if (offset > si->highest_bit)
scan_base = offset = si->lowest_bit;
- /* reuse swap entry of cache-only swap if not busy. */
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ /* reuse swap entry of cache-only swap unless allocating for hibernation. */
+ if (vm_swap_full()
+ && usage == SWAP_HAS_CACHE
+ && si->swap_map[offset] == SWAP_HAS_CACHE) {
int swap_was_freed;
spin_unlock(&swap_lock);
swap_was_freed = __try_to_reclaim_swap(si, offset);
@@ -446,6 +453,8 @@ swp_entry_t get_swap_page(void)
spin_lock(&swap_lock);
if (nr_swap_pages <= 0)
goto noswap;
+ if (swap_for_hibernation)
+ goto noswap;
nr_swap_pages--;
for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
@@ -478,28 +487,6 @@ noswap:
return (swp_entry_t) {0};
}
-/* The only caller of this function is now susupend routine */
-swp_entry_t get_swap_page_of_type(int type)
-{
- struct swap_info_struct *si;
- pgoff_t offset;
-
- spin_lock(&swap_lock);
- si = swap_info[type];
- if (si && (si->flags & SWP_WRITEOK)) {
- nr_swap_pages--;
- /* This is called for allocating swap entry, not cache */
- offset = scan_swap_map(si, 1);
- if (offset) {
- spin_unlock(&swap_lock);
- return swp_entry(type, offset);
- }
- nr_swap_pages++;
- }
- spin_unlock(&swap_lock);
- return (swp_entry_t) {0};
-}
-
static struct swap_info_struct *swap_info_get(swp_entry_t entry)
{
struct swap_info_struct *p;
@@ -574,6 +561,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
/* free if no reference */
if (!usage) {
+ struct gendisk *disk = p->bdev->bd_disk;
if (offset < p->lowest_bit)
p->lowest_bit = offset;
if (offset > p->highest_bit)
@@ -583,6 +571,9 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
swap_list.next = p->type;
nr_swap_pages++;
p->inuse_pages--;
+ if ((p->flags & SWP_BLKDEV) &&
+ disk->fops->swap_slot_free_notify)
+ disk->fops->swap_slot_free_notify(p->bdev, offset);
}
return usage;
@@ -755,6 +746,74 @@ int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
#endif
#ifdef CONFIG_HIBERNATION
+
+static pgoff_t hibernation_offset[MAX_SWAPFILES];
+/*
+ * Once hibernation starts to use swap, we freeze swap_map[]. Otherwise,
+ * the swap_map[] image saved to disk would be incomplete because it keeps
+ * changing without being synchronized with the hibernation snapshot.
+ * At resume, we just set swap_for_hibernation=false and the offsets used
+ * during hibernation are simply forgotten.
+ */
+void hibernation_freeze_swap(void)
+{
+ int i;
+
+ spin_lock(&swap_lock);
+
+ printk(KERN_INFO "PM: Freeze Swap\n");
+ swap_for_hibernation = true;
+ for (i = 0; i < MAX_SWAPFILES; i++)
+ hibernation_offset[i] = 1;
+ spin_unlock(&swap_lock);
+}
+
+void hibernation_thaw_swap(void)
+{
+ spin_lock(&swap_lock);
+ if (swap_for_hibernation) {
+ printk(KERN_INFO "PM: Thaw Swap\n");
+ swap_for_hibernation = false;
+ }
+ spin_unlock(&swap_lock);
+}
+
+/*
+ * Because updating swap_map[] would change state that is not part of the
+ * saved image, we use our own simple allocator instead.
+ * See kernel/power/swap.c: the swap slots used for the image are
+ * recorded in an RB-tree there.
+ */
+swp_entry_t get_swap_for_hibernation(int type)
+{
+ pgoff_t off;
+ swp_entry_t val = {0};
+ struct swap_info_struct *si;
+
+ spin_lock(&swap_lock);
+
+ si = swap_info[type];
+ if (!si || !(si->flags & SWP_WRITEOK))
+ goto done;
+
+ for (off = hibernation_offset[type]; off < si->max; ++off) {
+ if (!si->swap_map[off])
+ break;
+ }
+ if (off < si->max) {
+ val = swp_entry(type, off);
+ hibernation_offset[type] = off + 1;
+ }
+done:
+ spin_unlock(&swap_lock);
+ return val;
+}
+
+void swap_free_for_hibernation(swp_entry_t ent)
+{
+ /* Nothing to do */
+}
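
The cursor behaviour of the allocator above is easier to see in isolation. Below is a toy userspace model (not kernel code; the map contents and size are invented) that skips already-used slots without ever writing to the frozen swap_map[]:

#include <stdio.h>

#define MAX_SLOTS 8	/* invented, tiny "swap area" */

/* frozen snapshot of swap_map[]: non-zero means the slot is in use */
static unsigned char swap_map[MAX_SLOTS] = { 1, 1, 0, 1, 0, 0, 1, 0 };
static unsigned long cursor = 1;	/* offset 0 is never handed out */

/* return the next free offset without touching swap_map, or 0 if none */
static unsigned long get_offset_for_hibernation(void)
{
	unsigned long off;

	for (off = cursor; off < MAX_SLOTS; off++) {
		if (!swap_map[off]) {
			cursor = off + 1;
			return off;
		}
	}
	return 0;
}

int main(void)
{
	unsigned long off;

	while ((off = get_offset_for_hibernation()) != 0)
		printf("allocated offset %lu\n", off);
	/* prints 2, 4, 5 and 7: used slots are skipped, nothing is freed */
	return 0;
}
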
+
/*
* Find the swap type that corresponds to given device (if any).
*
@@ -1884,6 +1943,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (error < 0)
goto bad_swap;
p->bdev = bdev;
+ p->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {
p->bdev = inode->i_sb->s_bdev;
mutex_lock(&inode->i_mutex);
diff --git a/mm/truncate.c b/mm/truncate.c
index f42675a3615..ba887bff48c 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -541,28 +541,48 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
EXPORT_SYMBOL(truncate_pagecache);
/**
+ * truncate_setsize - update inode and pagecache for a new file size
+ * @inode: inode
+ * @newsize: new file size
+ *
+ * truncate_setsize updates i_size and performs pagecache truncation
+ * (if necessary) for a file size update. It will typically be called
+ * from the filesystem's setattr function when ATTR_SIZE is passed in.
+ *
+ * Must be called with inode_mutex held and after all filesystem
+ * specific block truncation has been performed.
+ */
+void truncate_setsize(struct inode *inode, loff_t newsize)
+{
+ loff_t oldsize;
+
+ oldsize = inode->i_size;
+ i_size_write(inode, newsize);
+
+ truncate_pagecache(inode, oldsize, newsize);
+}
+EXPORT_SYMBOL(truncate_setsize);
+
+/**
* vmtruncate - unmap mappings "freed" by truncate() syscall
* @inode: inode of the file used
* @offset: file offset to start truncating
*
- * NOTE! We have to be ready to update the memory sharing
- * between the file and the memory map for a potential last
- * incomplete page. Ugly, but necessary.
+ * This function is deprecated and truncate_setsize or truncate_pagecache
+ * should be used instead, together with filesystem specific block truncation.
*/
int vmtruncate(struct inode *inode, loff_t offset)
{
- loff_t oldsize;
int error;
error = inode_newsize_ok(inode, offset);
if (error)
return error;
- oldsize = inode->i_size;
- i_size_write(inode, offset);
- truncate_pagecache(inode, oldsize, offset);
+
+ truncate_setsize(inode, offset);
if (inode->i_op->truncate)
inode->i_op->truncate(inode);
-
- return error;
+ return 0;
}
EXPORT_SYMBOL(vmtruncate);
diff --git a/mm/util.c b/mm/util.c
index f5712e8964b..4735ea48181 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -225,15 +225,10 @@ char *strndup_user(const char __user *s, long n)
if (length > n)
return ERR_PTR(-EINVAL);
- p = kmalloc(length, GFP_KERNEL);
+ p = memdup_user(s, length);
- if (!p)
- return ERR_PTR(-ENOMEM);
-
- if (copy_from_user(p, s, length)) {
- kfree(p);
- return ERR_PTR(-EFAULT);
- }
+ if (IS_ERR(p))
+ return p;
p[length - 1] = '\0';
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 7f35fe2cf9e..6b8889da69a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -736,7 +736,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
node, gfp_mask);
if (unlikely(IS_ERR(va))) {
kfree(vb);
- return ERR_PTR(PTR_ERR(va));
+ return ERR_CAST(va);
}
err = radix_tree_preload(gfp_mask);
@@ -2407,7 +2407,7 @@ static int s_show(struct seq_file *m, void *p)
seq_printf(m, " pages=%d", v->nr_pages);
if (v->phys_addr)
- seq_printf(m, " phys=%lx", v->phys_addr);
+ seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr);
if (v->flags & VM_IOREMAP)
seq_printf(m, " ioremap");
@@ -2441,8 +2441,11 @@ static int vmalloc_open(struct inode *inode, struct file *file)
unsigned int *ptr = NULL;
int ret;
- if (NUMA_BUILD)
+ if (NUMA_BUILD) {
ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
+ if (ptr == NULL)
+ return -ENOMEM;
+ }
ret = seq_open(file, &vmalloc_op);
if (!ret) {
struct seq_file *m = file->private_data;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3ff3311447f..c391c320dba 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,9 @@
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/vmscan.h>
+
struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
@@ -73,10 +76,14 @@ struct scan_control {
int swappiness;
- int all_unreclaimable;
-
int order;
+ /*
+ * Intended to reclaim enough contiguous memory rather than just enough
+ * memory overall, i.e. it is the mode for high-order allocations.
+ */
+ bool lumpy_reclaim_mode;
+
/* Which cgroup do we reclaim from */
struct mem_cgroup *mem_cgroup;
@@ -85,12 +92,6 @@ struct scan_control {
* are scanned.
*/
nodemask_t *nodemask;
-
- /* Pluggable isolate pages callback */
- unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
- unsigned long *scanned, int order, int mode,
- struct zone *z, struct mem_cgroup *mem_cont,
- int active, int file);
};
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -215,8 +216,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
unsigned long total_scan;
- unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);
+ unsigned long max_pass;
+ max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask);
delta = (4 * scanned) / shrinker->seeks;
delta *= max_pass;
do_div(delta, lru_pages + 1);
@@ -244,8 +246,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
int shrink_ret;
int nr_before;
- nr_before = (*shrinker->shrink)(0, gfp_mask);
- shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
+ nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask);
+ shrink_ret = (*shrinker->shrink)(shrinker, this_scan,
+ gfp_mask);
if (shrink_ret == -1)
break;
if (shrink_ret < nr_before)
@@ -298,7 +301,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
static void handle_write_error(struct address_space *mapping,
struct page *page, int error)
{
- lock_page(page);
+ lock_page_nosync(page);
if (page_mapping(page) == mapping)
mapping_set_error(mapping, error);
unlock_page(page);
@@ -398,6 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
+ trace_mm_vmscan_writepage(page,
+ trace_reclaim_flags(page, sync_writeback));
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -575,7 +580,7 @@ static enum page_references page_check_references(struct page *page,
referenced_page = TestClearPageReferenced(page);
/* Lumpy reclaim - ignore references */
- if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+ if (sc->lumpy_reclaim_mode)
return PAGEREF_RECLAIM;
/*
@@ -617,6 +622,24 @@ static enum page_references page_check_references(struct page *page,
return PAGEREF_RECLAIM;
}
+static noinline_for_stack void free_page_list(struct list_head *free_pages)
+{
+ struct pagevec freed_pvec;
+ struct page *page, *tmp;
+
+ pagevec_init(&freed_pvec, 1);
+
+ list_for_each_entry_safe(page, tmp, free_pages, lru) {
+ list_del(&page->lru);
+ if (!pagevec_add(&freed_pvec, page)) {
+ __pagevec_free(&freed_pvec);
+ pagevec_reinit(&freed_pvec);
+ }
+ }
+
+ pagevec_free(&freed_pvec);
+}
+
/*
* shrink_page_list() returns the number of reclaimed pages
*/
@@ -625,13 +648,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
enum pageout_io sync_writeback)
{
LIST_HEAD(ret_pages);
- struct pagevec freed_pvec;
+ LIST_HEAD(free_pages);
int pgactivate = 0;
unsigned long nr_reclaimed = 0;
cond_resched();
- pagevec_init(&freed_pvec, 1);
while (!list_empty(page_list)) {
enum page_references references;
struct address_space *mapping;
@@ -806,10 +828,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
__clear_page_locked(page);
free_it:
nr_reclaimed++;
- if (!pagevec_add(&freed_pvec, page)) {
- __pagevec_free(&freed_pvec);
- pagevec_reinit(&freed_pvec);
- }
+
+ /*
+ * Is there a need to call free_page_list() periodically? It would
+ * appear not, as the counts here should stay low.
+ */
+ list_add(&page->lru, &free_pages);
continue;
cull_mlocked:
@@ -832,18 +856,14 @@ keep:
list_add(&page->lru, &ret_pages);
VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
}
+
+ free_page_list(&free_pages);
+
list_splice(&ret_pages, page_list);
- if (pagevec_count(&freed_pvec))
- __pagevec_free(&freed_pvec);
count_vm_events(PGACTIVATE, pgactivate);
return nr_reclaimed;
}
-/* LRU Isolation modes. */
-#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */
-#define ISOLATE_ACTIVE 1 /* Isolate active pages. */
-#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */
-
/*
* Attempt to remove the specified page from its LRU. Only take this page
* if it is of the appropriate PageActive status. Pages which are being
@@ -921,6 +941,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long *scanned, int order, int mode, int file)
{
unsigned long nr_taken = 0;
+ unsigned long nr_lumpy_taken = 0;
+ unsigned long nr_lumpy_dirty = 0;
+ unsigned long nr_lumpy_failed = 0;
unsigned long scan;
for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
@@ -998,12 +1021,25 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
list_move(&cursor_page->lru, dst);
mem_cgroup_del_lru(cursor_page);
nr_taken++;
+ nr_lumpy_taken++;
+ if (PageDirty(cursor_page))
+ nr_lumpy_dirty++;
scan++;
+ } else {
+ if (mode == ISOLATE_BOTH &&
+ page_count(cursor_page))
+ nr_lumpy_failed++;
}
}
}
*scanned = scan;
+
+ trace_mm_vmscan_lru_isolate(order,
+ nr_to_scan, scan,
+ nr_taken,
+ nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
+ mode);
return nr_taken;
}
@@ -1011,7 +1047,6 @@ static unsigned long isolate_pages_global(unsigned long nr,
struct list_head *dst,
unsigned long *scanned, int order,
int mode, struct zone *z,
- struct mem_cgroup *mem_cont,
int active, int file)
{
int lru = LRU_BASE;
@@ -1041,7 +1076,8 @@ static unsigned long clear_active_flags(struct list_head *page_list,
ClearPageActive(page);
nr_active++;
}
- count[lru]++;
+ if (count)
+ count[lru]++;
}
return nr_active;
@@ -1118,176 +1154,212 @@ static int too_many_isolated(struct zone *zone, int file,
}
/*
- * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
- * of reclaimed pages
 * TODO: Try merging with migration's version of putback_lru_pages
*/
-static unsigned long shrink_inactive_list(unsigned long max_scan,
- struct zone *zone, struct scan_control *sc,
- int priority, int file)
+static noinline_for_stack void
+putback_lru_pages(struct zone *zone, struct scan_control *sc,
+ unsigned long nr_anon, unsigned long nr_file,
+ struct list_head *page_list)
{
- LIST_HEAD(page_list);
+ struct page *page;
struct pagevec pvec;
- unsigned long nr_scanned = 0;
- unsigned long nr_reclaimed = 0;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
- int lumpy_reclaim = 0;
-
- while (unlikely(too_many_isolated(zone, file, sc))) {
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- /* We are about to die and free our memory. Return now. */
- if (fatal_signal_pending(current))
- return SWAP_CLUSTER_MAX;
- }
+ pagevec_init(&pvec, 1);
/*
- * If we need a large contiguous chunk of memory, or have
- * trouble getting a small set of contiguous pages, we
- * will reclaim both active and inactive pages.
- *
- * We use the same threshold as pageout congestion_wait below.
+ * Put back any unfreeable pages.
*/
- if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
- lumpy_reclaim = 1;
- else if (sc->order && priority < DEF_PRIORITY - 2)
- lumpy_reclaim = 1;
+ spin_lock(&zone->lru_lock);
+ while (!list_empty(page_list)) {
+ int lru;
+ page = lru_to_page(page_list);
+ VM_BUG_ON(PageLRU(page));
+ list_del(&page->lru);
+ if (unlikely(!page_evictable(page, NULL))) {
+ spin_unlock_irq(&zone->lru_lock);
+ putback_lru_page(page);
+ spin_lock_irq(&zone->lru_lock);
+ continue;
+ }
+ SetPageLRU(page);
+ lru = page_lru(page);
+ add_page_to_lru_list(zone, page, lru);
+ if (is_active_lru(lru)) {
+ int file = is_file_lru(lru);
+ reclaim_stat->recent_rotated[file]++;
+ }
+ if (!pagevec_add(&pvec, page)) {
+ spin_unlock_irq(&zone->lru_lock);
+ __pagevec_release(&pvec);
+ spin_lock_irq(&zone->lru_lock);
+ }
+ }
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
- pagevec_init(&pvec, 1);
+ spin_unlock_irq(&zone->lru_lock);
+ pagevec_release(&pvec);
+}
- lru_add_drain();
- spin_lock_irq(&zone->lru_lock);
- do {
- struct page *page;
- unsigned long nr_taken;
- unsigned long nr_scan;
- unsigned long nr_freed;
- unsigned long nr_active;
- unsigned int count[NR_LRU_LISTS] = { 0, };
- int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
- unsigned long nr_anon;
- unsigned long nr_file;
-
- nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX,
- &page_list, &nr_scan, sc->order, mode,
- zone, sc->mem_cgroup, 0, file);
+static noinline_for_stack void update_isolated_counts(struct zone *zone,
+ struct scan_control *sc,
+ unsigned long *nr_anon,
+ unsigned long *nr_file,
+ struct list_head *isolated_list)
+{
+ unsigned long nr_active;
+ unsigned int count[NR_LRU_LISTS] = { 0, };
+ struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
- if (scanning_global_lru(sc)) {
- zone->pages_scanned += nr_scan;
- if (current_is_kswapd())
- __count_zone_vm_events(PGSCAN_KSWAPD, zone,
- nr_scan);
- else
- __count_zone_vm_events(PGSCAN_DIRECT, zone,
- nr_scan);
- }
+ nr_active = clear_active_flags(isolated_list, count);
+ __count_vm_events(PGDEACTIVATE, nr_active);
+
+ __mod_zone_page_state(zone, NR_ACTIVE_FILE,
+ -count[LRU_ACTIVE_FILE]);
+ __mod_zone_page_state(zone, NR_INACTIVE_FILE,
+ -count[LRU_INACTIVE_FILE]);
+ __mod_zone_page_state(zone, NR_ACTIVE_ANON,
+ -count[LRU_ACTIVE_ANON]);
+ __mod_zone_page_state(zone, NR_INACTIVE_ANON,
+ -count[LRU_INACTIVE_ANON]);
+
+ *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
+ *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
+
+ reclaim_stat->recent_scanned[0] += *nr_anon;
+ reclaim_stat->recent_scanned[1] += *nr_file;
+}
- if (nr_taken == 0)
- goto done;
+/*
+ * Returns true if the caller should wait to clean dirty/writeback pages.
+ *
+ * If we are direct reclaiming for contiguous pages and we do not reclaim
+ * everything in the list, try again and wait for writeback IO to complete.
+ * This will stall high-order allocations noticeably, so only do it when we
+ * really need to free pages under high memory pressure.
+ */
+static inline bool should_reclaim_stall(unsigned long nr_taken,
+ unsigned long nr_freed,
+ int priority,
+ struct scan_control *sc)
+{
+ int lumpy_stall_priority;
- nr_active = clear_active_flags(&page_list, count);
- __count_vm_events(PGDEACTIVATE, nr_active);
+ /* kswapd should not stall on sync IO */
+ if (current_is_kswapd())
+ return false;
- __mod_zone_page_state(zone, NR_ACTIVE_FILE,
- -count[LRU_ACTIVE_FILE]);
- __mod_zone_page_state(zone, NR_INACTIVE_FILE,
- -count[LRU_INACTIVE_FILE]);
- __mod_zone_page_state(zone, NR_ACTIVE_ANON,
- -count[LRU_ACTIVE_ANON]);
- __mod_zone_page_state(zone, NR_INACTIVE_ANON,
- -count[LRU_INACTIVE_ANON]);
+ /* Only stall on lumpy reclaim */
+ if (!sc->lumpy_reclaim_mode)
+ return false;
- nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
- nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
+ /* If we have reclaimed everything on the isolated list, no stall */
+ if (nr_freed == nr_taken)
+ return false;
- reclaim_stat->recent_scanned[0] += nr_anon;
- reclaim_stat->recent_scanned[1] += nr_file;
+ /*
+ * For high-order allocations, there are two stall thresholds.
+ * High-cost allocations stall immediately, whereas lower
+ * order allocations such as stacks require the scanning
+ * priority to be much higher before stalling.
+ */
+ if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+ lumpy_stall_priority = DEF_PRIORITY;
+ else
+ lumpy_stall_priority = DEF_PRIORITY / 3;
- spin_unlock_irq(&zone->lru_lock);
+ return priority <= lumpy_stall_priority;
+}
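
For a rough feel of the thresholds, here is a standalone sketch of the stall decision for a non-kswapd caller that is already in lumpy reclaim mode; DEF_PRIORITY and PAGE_ALLOC_COSTLY_ORDER are assumed to be the usual 12 and 3, and the sample counts are invented:

#include <stdio.h>
#include <stdbool.h>

/* assumed values, for illustration only */
#define DEF_PRIORITY			12
#define PAGE_ALLOC_COSTLY_ORDER		3

/* stall decision for a non-kswapd caller already in lumpy reclaim mode */
static bool should_stall(unsigned long nr_taken, unsigned long nr_freed,
			 int priority, int order)
{
	int lumpy_stall_priority;

	if (nr_freed == nr_taken)	/* everything was reclaimed: no stall */
		return false;

	if (order > PAGE_ALLOC_COSTLY_ORDER)
		lumpy_stall_priority = DEF_PRIORITY;	/* stall right away */
	else
		lumpy_stall_priority = DEF_PRIORITY / 3;

	return priority <= lumpy_stall_priority;
}

int main(void)
{
	/* order-2 request, scanning priority still high: no stall (0) */
	printf("%d\n", should_stall(32, 10, 10, 2));
	/* same request once priority has dropped to 3: stall for IO (1) */
	printf("%d\n", should_stall(32, 10, 3, 2));
	/* order-5 (costly) request stalls as soon as pages were missed (1) */
	printf("%d\n", should_stall(32, 10, 10, 5));
	return 0;
}
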
- nr_scanned += nr_scan;
- nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+/*
+ * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
+ * of reclaimed pages
+ */
+static noinline_for_stack unsigned long
+shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
+ struct scan_control *sc, int priority, int file)
+{
+ LIST_HEAD(page_list);
+ unsigned long nr_scanned;
+ unsigned long nr_reclaimed = 0;
+ unsigned long nr_taken;
+ unsigned long nr_active;
+ unsigned long nr_anon;
+ unsigned long nr_file;
+
+ while (unlikely(too_many_isolated(zone, file, sc))) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+
+ /* We are about to die and free our memory. Return now. */
+ if (fatal_signal_pending(current))
+ return SWAP_CLUSTER_MAX;
+ }
+
+
+ lru_add_drain();
+ spin_lock_irq(&zone->lru_lock);
+ if (scanning_global_lru(sc)) {
+ nr_taken = isolate_pages_global(nr_to_scan,
+ &page_list, &nr_scanned, sc->order,
+ sc->lumpy_reclaim_mode ?
+ ISOLATE_BOTH : ISOLATE_INACTIVE,
+ zone, 0, file);
+ zone->pages_scanned += nr_scanned;
+ if (current_is_kswapd())
+ __count_zone_vm_events(PGSCAN_KSWAPD, zone,
+ nr_scanned);
+ else
+ __count_zone_vm_events(PGSCAN_DIRECT, zone,
+ nr_scanned);
+ } else {
+ nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
+ &page_list, &nr_scanned, sc->order,
+ sc->lumpy_reclaim_mode ?
+ ISOLATE_BOTH : ISOLATE_INACTIVE,
+ zone, sc->mem_cgroup,
+ 0, file);
/*
- * If we are direct reclaiming for contiguous pages and we do
- * not reclaim everything in the list, try again and wait
- * for IO to complete. This will stall high-order allocations
- * but that should be acceptable to the caller
+ * mem_cgroup_isolate_pages() keeps track of
+ * scanned pages on its own.
*/
- if (nr_freed < nr_taken && !current_is_kswapd() &&
- lumpy_reclaim) {
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ }
- /*
- * The attempt at page out may have made some
- * of the pages active, mark them inactive again.
- */
- nr_active = clear_active_flags(&page_list, count);
- count_vm_events(PGDEACTIVATE, nr_active);
+ if (nr_taken == 0) {
+ spin_unlock_irq(&zone->lru_lock);
+ return 0;
+ }
- nr_freed += shrink_page_list(&page_list, sc,
- PAGEOUT_IO_SYNC);
- }
+ update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
- nr_reclaimed += nr_freed;
+ spin_unlock_irq(&zone->lru_lock);
- local_irq_disable();
- if (current_is_kswapd())
- __count_vm_events(KSWAPD_STEAL, nr_freed);
- __count_zone_vm_events(PGSTEAL, zone, nr_freed);
+ nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+
+ /* Check if we should synchronously wait for writeback */
+ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
- spin_lock(&zone->lru_lock);
/*
- * Put back any unfreeable pages.
+ * The attempt at page out may have made some
+ * of the pages active, mark them inactive again.
*/
- while (!list_empty(&page_list)) {
- int lru;
- page = lru_to_page(&page_list);
- VM_BUG_ON(PageLRU(page));
- list_del(&page->lru);
- if (unlikely(!page_evictable(page, NULL))) {
- spin_unlock_irq(&zone->lru_lock);
- putback_lru_page(page);
- spin_lock_irq(&zone->lru_lock);
- continue;
- }
- SetPageLRU(page);
- lru = page_lru(page);
- add_page_to_lru_list(zone, page, lru);
- if (is_active_lru(lru)) {
- int file = is_file_lru(lru);
- reclaim_stat->recent_rotated[file]++;
- }
- if (!pagevec_add(&pvec, page)) {
- spin_unlock_irq(&zone->lru_lock);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
- }
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+ nr_active = clear_active_flags(&page_list, NULL);
+ count_vm_events(PGDEACTIVATE, nr_active);
- } while (nr_scanned < max_scan);
+ nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
+ }
-done:
- spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
- return nr_reclaimed;
-}
+ local_irq_disable();
+ if (current_is_kswapd())
+ __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
+ __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
-/*
- * We are about to scan this zone at a certain priority level. If that priority
- * level is smaller (ie: more urgent) than the previous priority, then note
- * that priority level within the zone. This is done so that when the next
- * process comes in to scan this zone, it will immediately start out at this
- * priority level rather than having to build up its own scanning priority.
- * Here, this priority affects only the reclaim-mapped threshold.
- */
-static inline void note_zone_scanning_priority(struct zone *zone, int priority)
-{
- if (priority < zone->prev_priority)
- zone->prev_priority = priority;
+ putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
+ return nr_reclaimed;
}
/*
@@ -1356,16 +1428,23 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
- nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
- ISOLATE_ACTIVE, zone,
- sc->mem_cgroup, 1, file);
- /*
- * zone->pages_scanned is used for detect zone's oom
- * mem_cgroup remembers nr_scan by itself.
- */
if (scanning_global_lru(sc)) {
+ nr_taken = isolate_pages_global(nr_pages, &l_hold,
+ &pgscanned, sc->order,
+ ISOLATE_ACTIVE, zone,
+ 1, file);
zone->pages_scanned += pgscanned;
+ } else {
+ nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
+ &pgscanned, sc->order,
+ ISOLATE_ACTIVE, zone,
+ sc->mem_cgroup, 1, file);
+ /*
+ * mem_cgroup_isolate_pages() keeps track of
+ * scanned pages on its own.
+ */
}
+
reclaim_stat->recent_scanned[file] += nr_taken;
__count_zone_vm_events(PGREFILL, zone, pgscanned);
@@ -1519,21 +1598,52 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
}
/*
+ * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
+ * until we collected @swap_cluster_max pages to scan.
+ */
+static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
+ unsigned long *nr_saved_scan)
+{
+ unsigned long nr;
+
+ *nr_saved_scan += nr_to_scan;
+ nr = *nr_saved_scan;
+
+ if (nr >= SWAP_CLUSTER_MAX)
+ *nr_saved_scan = 0;
+ else
+ nr = 0;
+
+ return nr;
+}
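
A minimal standalone sketch of the batching the comment describes: small requests accumulate in *nr_saved_scan and only a batch of at least SWAP_CLUSTER_MAX pages (assumed to be 32 here) is handed back at once; the request sizes are invented:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32	/* assumed value, for illustration only */

static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
				       unsigned long *nr_saved_scan)
{
	unsigned long nr;

	*nr_saved_scan += nr_to_scan;
	nr = *nr_saved_scan;

	if (nr >= SWAP_CLUSTER_MAX)
		*nr_saved_scan = 0;
	else
		nr = 0;

	return nr;
}

int main(void)
{
	unsigned long saved = 0;
	unsigned long requests[] = { 10, 10, 10, 40 };
	unsigned int i;

	for (i = 0; i < sizeof(requests) / sizeof(requests[0]); i++) {
		unsigned long scan = nr_scan_try_batch(requests[i], &saved);

		printf("requested %lu -> scan %lu now (carried over %lu)\n",
		       requests[i], scan, saved);
	}
	/* the first three requests return 0; the fourth returns the
	 * accumulated 70 pages and resets the carry-over to 0 */
	return 0;
}
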
+
+/*
* Determine how aggressively the anon and file LRU lists should be
* scanned. The relative value of each set of LRU lists is determined
* by looking at the fraction of the pages scanned we did rotate back
* onto the active list instead of evict.
*
- * percent[0] specifies how much pressure to put on ram/swap backed
- * memory, while percent[1] determines pressure on the file LRUs.
+ * nr[0] = anon pages to scan; nr[1] = file pages to scan
*/
-static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
- unsigned long *percent)
+static void get_scan_count(struct zone *zone, struct scan_control *sc,
+ unsigned long *nr, int priority)
{
unsigned long anon, file, free;
unsigned long anon_prio, file_prio;
unsigned long ap, fp;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+ u64 fraction[2], denominator;
+ enum lru_list l;
+ int noswap = 0;
+
+ /* If we have no swap space, do not bother scanning anon pages. */
+ if (!sc->may_swap || (nr_swap_pages <= 0)) {
+ noswap = 1;
+ fraction[0] = 0;
+ fraction[1] = 1;
+ denominator = 1;
+ goto out;
+ }
anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
@@ -1545,13 +1655,21 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
/* If we have very few page cache pages,
force-scan anon pages. */
if (unlikely(file + free <= high_wmark_pages(zone))) {
- percent[0] = 100;
- percent[1] = 0;
- return;
+ fraction[0] = 1;
+ fraction[1] = 0;
+ denominator = 1;
+ goto out;
}
}
/*
+ * With swappiness at 100, anonymous and file have the same priority.
+ * This scanning priority is essentially the inverse of IO cost.
+ */
+ anon_prio = sc->swappiness;
+ file_prio = 200 - sc->swappiness;
+
+ /*
* OK, so we have swap space and a fair amount of page cache
* pages. We use the recently rotated / recently scanned
* ratios to determine how valuable each cache is.
@@ -1562,28 +1680,18 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
*
* anon in [0], file in [1]
*/
+ spin_lock_irq(&zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
- spin_lock_irq(&zone->lru_lock);
reclaim_stat->recent_scanned[0] /= 2;
reclaim_stat->recent_rotated[0] /= 2;
- spin_unlock_irq(&zone->lru_lock);
}
if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
- spin_lock_irq(&zone->lru_lock);
reclaim_stat->recent_scanned[1] /= 2;
reclaim_stat->recent_rotated[1] /= 2;
- spin_unlock_irq(&zone->lru_lock);
}
/*
- * With swappiness at 100, anonymous and file have the same priority.
- * This scanning priority is essentially the inverse of IO cost.
- */
- anon_prio = sc->swappiness;
- file_prio = 200 - sc->swappiness;
-
- /*
* The amount of pressure on anon vs file pages is inversely
* proportional to the fraction of recently scanned pages on
* each list that were recently referenced and in active use.
@@ -1593,30 +1701,39 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
fp /= reclaim_stat->recent_rotated[1] + 1;
+ spin_unlock_irq(&zone->lru_lock);
+
+ fraction[0] = ap;
+ fraction[1] = fp;
+ denominator = ap + fp + 1;
+out:
+ for_each_evictable_lru(l) {
+ int file = is_file_lru(l);
+ unsigned long scan;
- /* Normalize to percentages */
- percent[0] = 100 * ap / (ap + fp + 1);
- percent[1] = 100 - percent[0];
+ scan = zone_nr_lru_pages(zone, sc, l);
+ if (priority || noswap) {
+ scan >>= priority;
+ scan = div64_u64(scan * fraction[file], denominator);
+ }
+ nr[l] = nr_scan_try_batch(scan,
+ &reclaim_stat->nr_saved_scan[l]);
+ }
}
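
Only the anon/file weighting of get_scan_count() is mirrored in the standalone sketch below; the reclaim statistics, LRU size and priority are invented, and swappiness is taken at its usual default of 60:

#include <stdio.h>

int main(void)
{
	unsigned int swappiness = 60;		/* the default vm.swappiness */
	unsigned long long anon_prio = swappiness;
	unsigned long long file_prio = 200 - swappiness;

	/* invented per-zone reclaim statistics: anon in [0], file in [1] */
	unsigned long recent_scanned[2] = { 1000, 2000 };
	unsigned long recent_rotated[2] = { 500, 100 };

	unsigned long long ap, fp, denominator;
	unsigned long lru_pages = 100000;	/* pages on the file LRU */
	int priority = 6;			/* mid-way scanning priority */

	ap = (anon_prio + 1) * (recent_scanned[0] + 1) /
	     (recent_rotated[0] + 1);
	fp = (file_prio + 1) * (recent_scanned[1] + 1) /
	     (recent_rotated[1] + 1);
	denominator = ap + fp + 1;

	/* file pages were rarely rotated back, so they take most pressure */
	printf("anon fraction %llu/%llu, file fraction %llu/%llu\n",
	       ap, denominator, fp, denominator);
	printf("file pages to scan at priority %d: %llu\n",
	       priority, (lru_pages >> priority) * fp / denominator);
	return 0;
}
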
-/*
- * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
- * until we collected @swap_cluster_max pages to scan.
- */
-static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
- unsigned long *nr_saved_scan)
+static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc)
{
- unsigned long nr;
-
- *nr_saved_scan += nr_to_scan;
- nr = *nr_saved_scan;
-
- if (nr >= SWAP_CLUSTER_MAX)
- *nr_saved_scan = 0;
+ /*
+ * If we need a large contiguous chunk of memory, or have
+ * trouble getting a small set of contiguous pages, we
+ * will reclaim both active and inactive pages.
+ */
+ if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+ sc->lumpy_reclaim_mode = 1;
+ else if (sc->order && priority < DEF_PRIORITY - 2)
+ sc->lumpy_reclaim_mode = 1;
else
- nr = 0;
-
- return nr;
+ sc->lumpy_reclaim_mode = 0;
}
/*
@@ -1627,33 +1744,13 @@ static void shrink_zone(int priority, struct zone *zone,
{
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
- unsigned long percent[2]; /* anon @ 0; file @ 1 */
enum lru_list l;
unsigned long nr_reclaimed = sc->nr_reclaimed;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
- struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
- int noswap = 0;
- /* If we have no swap space, do not bother scanning anon pages. */
- if (!sc->may_swap || (nr_swap_pages <= 0)) {
- noswap = 1;
- percent[0] = 0;
- percent[1] = 100;
- } else
- get_scan_ratio(zone, sc, percent);
+ get_scan_count(zone, sc, nr, priority);
- for_each_evictable_lru(l) {
- int file = is_file_lru(l);
- unsigned long scan;
-
- scan = zone_nr_lru_pages(zone, sc, l);
- if (priority || noswap) {
- scan >>= priority;
- scan = (scan * percent[file]) / 100;
- }
- nr[l] = nr_scan_try_batch(scan,
- &reclaim_stat->nr_saved_scan[l]);
- }
+ set_lumpy_reclaim_mode(priority, sc);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
@@ -1707,16 +1804,15 @@ static void shrink_zone(int priority, struct zone *zone,
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static void shrink_zones(int priority, struct zonelist *zonelist,
+static bool shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
- enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
struct zoneref *z;
struct zone *zone;
+ bool all_unreclaimable = true;
- sc->all_unreclaimable = 1;
- for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
- sc->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask) {
if (!populated_zone(zone))
continue;
/*
@@ -1726,23 +1822,14 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
if (scanning_global_lru(sc)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- note_zone_scanning_priority(zone, priority);
-
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
- sc->all_unreclaimable = 0;
- } else {
- /*
- * Ignore cpuset limitation here. We just want to reduce
- * # of used pages by us regardless of memory shortage.
- */
- sc->all_unreclaimable = 0;
- mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
- priority);
}
shrink_zone(priority, zone, sc);
+ all_unreclaimable = false;
}
+ return all_unreclaimable;
}
/*
@@ -1765,42 +1852,38 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
int priority;
- unsigned long ret = 0;
+ bool all_unreclaimable;
unsigned long total_scanned = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
- unsigned long lru_pages = 0;
struct zoneref *z;
struct zone *zone;
- enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
unsigned long writeback_threshold;
+ get_mems_allowed();
delayacct_freepages_start();
if (scanning_global_lru(sc))
count_vm_event(ALLOCSTALL);
- /*
- * mem_cgroup will not do shrink_slab.
- */
- if (scanning_global_lru(sc)) {
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
-
- lru_pages += zone_reclaimable_pages(zone);
- }
- }
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
sc->nr_scanned = 0;
if (!priority)
disable_swap_token();
- shrink_zones(priority, zonelist, sc);
+ all_unreclaimable = shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
*/
if (scanning_global_lru(sc)) {
+ unsigned long lru_pages = 0;
+ for_each_zone_zonelist(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask)) {
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+
+ lru_pages += zone_reclaimable_pages(zone);
+ }
+
shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -1808,10 +1891,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
}
}
total_scanned += sc->nr_scanned;
- if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
- ret = sc->nr_reclaimed;
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim)
goto out;
- }
/*
* Try to write back as many pages as we just scanned. This
@@ -1831,9 +1912,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
priority < DEF_PRIORITY - 2)
congestion_wait(BLK_RW_ASYNC, HZ/10);
}
- /* top priority shrink_zones still had more to do? don't OOM, then */
- if (!sc->all_unreclaimable && scanning_global_lru(sc))
- ret = sc->nr_reclaimed;
+
out:
/*
* Now that we've scanned all the zones at this priority level, note
@@ -1845,25 +1924,23 @@ out:
if (priority < 0)
priority = 0;
- if (scanning_global_lru(sc)) {
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
+ delayacct_freepages_end();
+ put_mems_allowed();
- zone->prev_priority = priority;
- }
- } else
- mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
+ if (sc->nr_reclaimed)
+ return sc->nr_reclaimed;
- delayacct_freepages_end();
+ /* top priority shrink_zones still had more to do? don't OOM, then */
+ if (scanning_global_lru(sc) && !all_unreclaimable)
+ return 1;
- return ret;
+ return 0;
}
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *nodemask)
{
+ unsigned long nr_reclaimed;
struct scan_control sc = {
.gfp_mask = gfp_mask,
.may_writepage = !laptop_mode,
@@ -1873,11 +1950,18 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.swappiness = vm_swappiness,
.order = order,
.mem_cgroup = NULL,
- .isolate_pages = isolate_pages_global,
.nodemask = nodemask,
};
- return do_try_to_free_pages(zonelist, &sc);
+ trace_mm_vmscan_direct_reclaim_begin(order,
+ sc.may_writepage,
+ gfp_mask);
+
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+ trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+
+ return nr_reclaimed;
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
@@ -1885,24 +1969,24 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
unsigned int swappiness,
- struct zone *zone, int nid)
+ struct zone *zone)
{
struct scan_control sc = {
+ .nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = !noswap,
.swappiness = swappiness,
.order = 0,
.mem_cgroup = mem,
- .isolate_pages = mem_cgroup_isolate_pages,
};
- nodemask_t nm = nodemask_of_node(nid);
-
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
- sc.nodemask = &nm;
- sc.nr_reclaimed = 0;
- sc.nr_scanned = 0;
+
+ trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
+ sc.may_writepage,
+ sc.gfp_mask);
+
/*
* NOTE: Although we can get the priority field, using it
* here is not a good idea, since it limits the pages we can scan.
@@ -1911,6 +1995,9 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
* the priority and make it zero.
*/
shrink_zone(0, zone, &sc);
+
+ trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+
return sc.nr_reclaimed;
}
@@ -1920,6 +2007,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
unsigned int swappiness)
{
struct zonelist *zonelist;
+ unsigned long nr_reclaimed;
struct scan_control sc = {
.may_writepage = !laptop_mode,
.may_unmap = 1,
@@ -1928,14 +2016,22 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
.swappiness = swappiness,
.order = 0,
.mem_cgroup = mem_cont,
- .isolate_pages = mem_cgroup_isolate_pages,
.nodemask = NULL, /* we don't care the placement */
};
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
zonelist = NODE_DATA(numa_node_id())->node_zonelists;
- return do_try_to_free_pages(zonelist, &sc);
+
+ trace_mm_vmscan_memcg_reclaim_begin(0,
+ sc.may_writepage,
+ sc.gfp_mask);
+
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+ trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+
+ return nr_reclaimed;
}
#endif
@@ -2006,24 +2102,13 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
.swappiness = vm_swappiness,
.order = order,
.mem_cgroup = NULL,
- .isolate_pages = isolate_pages_global,
};
- /*
- * temp_priority is used to remember the scanning priority at which
- * this zone was successfully refilled to
- * free_pages == high_wmark_pages(zone).
- */
- int temp_priority[MAX_NR_ZONES];
-
loop_again:
total_scanned = 0;
sc.nr_reclaimed = 0;
sc.may_writepage = !laptop_mode;
count_vm_event(PAGEOUTRUN);
- for (i = 0; i < pgdat->nr_zones; i++)
- temp_priority[i] = DEF_PRIORITY;
-
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long lru_pages = 0;
@@ -2083,7 +2168,6 @@ loop_again:
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
int nr_slab;
- int nid, zid;
if (!populated_zone(zone))
continue;
@@ -2091,18 +2175,14 @@ loop_again:
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;
- temp_priority[i] = priority;
sc.nr_scanned = 0;
- note_zone_scanning_priority(zone, priority);
- nid = pgdat->node_id;
- zid = zone_idx(zone);
/*
* Call soft limit reclaim before calling shrink_zone.
* For now we ignore the return value
*/
- mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
- nid, zid);
+ mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
+
/*
* We put equal pressure on every zone, unless one
* zone has way too many pages free already.
@@ -2166,16 +2246,6 @@ loop_again:
break;
}
out:
- /*
- * Note within each zone the priority level at which this zone was
- * brought into a happy state. So that the next thread which scans this
- * zone will start out at that priority level.
- */
- for (i = 0; i < pgdat->nr_zones; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- zone->prev_priority = temp_priority[i];
- }
if (!all_zones_ok) {
cond_resched();
@@ -2279,9 +2349,10 @@ static int kswapd(void *p)
* premature sleep. If not, then go fully
* to sleep until explicitly woken up
*/
- if (!sleeping_prematurely(pgdat, order, remaining))
+ if (!sleeping_prematurely(pgdat, order, remaining)) {
+ trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
schedule();
- else {
+ } else {
if (remaining)
count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
else
@@ -2301,8 +2372,10 @@ static int kswapd(void *p)
* We can speed up thawing tasks if we don't call balance_pgdat
* after returning from the refrigerator
*/
- if (!ret)
+ if (!ret) {
+ trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
balance_pgdat(pgdat, order);
+ }
}
return 0;
}
@@ -2322,6 +2395,7 @@ void wakeup_kswapd(struct zone *zone, int order)
return;
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
return;
if (!waitqueue_active(&pgdat->kswapd_wait))
@@ -2385,7 +2459,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
.hibernation_mode = 1,
.swappiness = vm_swappiness,
.order = 0,
- .isolate_pages = isolate_pages_global,
};
struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
struct task_struct *p = current;
@@ -2570,11 +2643,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
.gfp_mask = gfp_mask,
.swappiness = vm_swappiness,
.order = order,
- .isolate_pages = isolate_pages_global,
};
- unsigned long slab_reclaimable;
+ unsigned long nr_slab_pages0, nr_slab_pages1;
- disable_swap_token();
cond_resched();
/*
* We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -2593,14 +2664,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
*/
priority = ZONE_RECLAIM_PRIORITY;
do {
- note_zone_scanning_priority(zone, priority);
shrink_zone(priority, zone, &sc);
priority--;
} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
}
- slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
- if (slab_reclaimable > zone->min_slab_pages) {
+ nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ if (nr_slab_pages0 > zone->min_slab_pages) {
/*
* shrink_slab() does not currently allow us to determine how
* many pages were freed in this zone. So we take the current
@@ -2611,17 +2681,27 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* Note that shrink_slab will free memory on all zones and may
* take a long time.
*/
- while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
- zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
- slab_reclaimable - nr_pages)
- ;
+ for (;;) {
+ unsigned long lru_pages = zone_reclaimable_pages(zone);
+
+ /* No reclaimable slab or very low memory pressure */
+ if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
+ break;
+
+ /* Freed enough memory */
+ nr_slab_pages1 = zone_page_state(zone,
+ NR_SLAB_RECLAIMABLE);
+ if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
+ break;
+ }
/*
* Update nr_reclaimed by the number of slab pages we
* reclaimed from this zone.
*/
- sc.nr_reclaimed += slab_reclaimable -
- zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ if (nr_slab_pages1 < nr_slab_pages0)
+ sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
}
p->reclaim_state = NULL;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index fa12ea3051f..f389168f9a8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -16,19 +16,20 @@
#include <linux/cpu.h>
#include <linux/vmstat.h>
#include <linux/sched.h>
+#include <linux/math64.h>
#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);
-static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
+static void sum_vm_events(unsigned long *ret)
{
int cpu;
int i;
memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
- for_each_cpu(cpu, cpumask) {
+ for_each_online_cpu(cpu) {
struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
@@ -44,7 +45,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
void all_vm_events(unsigned long *ret)
{
get_online_cpus();
- sum_vm_events(ret, cpu_online_mask);
+ sum_vm_events(ret);
put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);
@@ -379,7 +380,86 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
}
#endif
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_COMPACTION
+struct contig_page_info {
+ unsigned long free_pages;
+ unsigned long free_blocks_total;
+ unsigned long free_blocks_suitable;
+};
+
+/*
+ * Calculate the number of free pages in a zone, how many contiguous
+ * pages are free and how many are large enough to satisfy an allocation of
+ * the target size. Note that this function makes no attempt to estimate
+ * how many suitable free blocks there *might* be if MOVABLE pages were
+ * migrated. Calculating that is possible, but expensive and can be
+ * figured out from userspace.
+ */
+static void fill_contig_page_info(struct zone *zone,
+ unsigned int suitable_order,
+ struct contig_page_info *info)
+{
+ unsigned int order;
+
+ info->free_pages = 0;
+ info->free_blocks_total = 0;
+ info->free_blocks_suitable = 0;
+
+ for (order = 0; order < MAX_ORDER; order++) {
+ unsigned long blocks;
+
+ /* Count number of free blocks */
+ blocks = zone->free_area[order].nr_free;
+ info->free_blocks_total += blocks;
+
+ /* Count free base pages */
+ info->free_pages += blocks << order;
+
+ /* Count the suitable free blocks */
+ if (order >= suitable_order)
+ info->free_blocks_suitable += blocks <<
+ (order - suitable_order);
+ }
+}
+
+/*
+ * A fragmentation index only makes sense if an allocation of a requested
+ * size would fail. If that is true, the fragmentation index indicates
+ * whether external fragmentation or a lack of memory was the problem.
+ * The value can be used to determine if page reclaim or compaction
+ * should be used
+ */
+static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
+{
+ unsigned long requested = 1UL << order;
+
+ if (!info->free_blocks_total)
+ return 0;
+
+ /* Fragmentation index only makes sense when a request would fail */
+ if (info->free_blocks_suitable)
+ return -1000;
+
+ /*
+ * Index is between 0 and 1 so return within 3 decimal places
+ *
+ * 0 => allocation would fail due to lack of memory
+ * 1 => allocation would fail due to fragmentation
+ */
+ return 1000 - div_u64((1000 + div_u64(info->free_pages * 1000ULL, requested)), info->free_blocks_total);
+}
+
+/* Same as __fragmentation index but allocs contig_page_info on stack */
+int fragmentation_index(struct zone *zone, unsigned int order)
+{
+ struct contig_page_info info;
+
+ fill_contig_page_info(zone, order, &info);
+ return __fragmentation_index(order, &info);
+}
+#endif
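
A standalone sketch of the same arithmetic, with an invented zone snapshot, shows how the index reads: plenty of free pages but no block large enough pushes the value towards 1, i.e. fragmentation rather than lack of memory is the problem:

#include <stdio.h>

/* only the fields the formula needs, mirroring struct contig_page_info */
struct snapshot {
	unsigned long free_pages;
	unsigned long free_blocks_total;
	unsigned long free_blocks_suitable;
};

/* plain-C version of the __fragmentation_index() arithmetic above */
static int frag_index(unsigned int order, const struct snapshot *info)
{
	unsigned long requested = 1UL << order;

	if (!info->free_blocks_total)
		return 0;
	/* a suitable block exists, so the request would not fail at all */
	if (info->free_blocks_suitable)
		return -1000;
	/* 0.000 => failing for lack of memory, 1.000 => for fragmentation */
	return 1000 - (int)((1000 + info->free_pages * 1000UL / requested) /
			    info->free_blocks_total);
}

int main(void)
{
	/* invented snapshot: 1000 free pages spread over 250 small blocks,
	 * none of them big enough for the order-4 request */
	struct snapshot info = {
		.free_pages = 1000,
		.free_blocks_total = 250,
		.free_blocks_suitable = 0,
	};
	int index = frag_index(4, &info);

	printf("fragmentation index: %d.%03d\n", index / 1000, index % 1000);
	/* prints 0.746: the failure is mostly down to fragmentation */
	return 0;
}
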
+
+#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -432,7 +512,9 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
spin_unlock_irqrestore(&zone->lock, flags);
}
}
+#endif
+#ifdef CONFIG_PROC_FS
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone)
{
@@ -693,6 +775,16 @@ static const char * const vmstat_text[] = {
"allocstall",
"pgrotated",
+
+#ifdef CONFIG_COMPACTION
+ "compact_blocks_moved",
+ "compact_pages_moved",
+ "compact_pagemigrate_failed",
+ "compact_stall",
+ "compact_fail",
+ "compact_success",
+#endif
+
#ifdef CONFIG_HUGETLB_PAGE
"htlb_buddy_alloc_success",
"htlb_buddy_alloc_fail",
@@ -761,11 +853,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
}
seq_printf(m,
"\n all_unreclaimable: %u"
- "\n prev_priority: %i"
"\n start_pfn: %lu"
"\n inactive_ratio: %u",
zone->all_unreclaimable,
- zone->prev_priority,
zone->zone_start_pfn,
zone->inactive_ratio);
seq_putc(m, '\n');
@@ -954,3 +1044,162 @@ static int __init setup_vmstat(void)
return 0;
}
module_init(setup_vmstat)
+
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
+#include <linux/debugfs.h>
+
+static struct dentry *extfrag_debug_root;
+
+/*
+ * Return an index indicating how much of the available free memory is
+ * unusable for an allocation of the requested size.
+ */
+static int unusable_free_index(unsigned int order,
+ struct contig_page_info *info)
+{
+ /* No free memory is interpreted as all free memory is unusable */
+ if (info->free_pages == 0)
+ return 1000;
+
+ /*
+ * Index should be a value between 0 and 1. Return a value to 3
+ * decimal places.
+ *
+ * 0 => no fragmentation
+ * 1 => high fragmentation
+ */
+ return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
+
+}
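
Reproducing the arithmetic standalone with invented numbers: 1000 free pages of which only ten order-4 blocks are contiguous leave 84% of the free memory unusable for an order-4 request:

#include <stdio.h>

/* only the fields the formula needs */
struct snapshot {
	unsigned long free_pages;
	unsigned long free_blocks_suitable;
};

/* plain-C version of the unusable_free_index() arithmetic above */
static int unusable_index(unsigned int order, const struct snapshot *info)
{
	if (info->free_pages == 0)
		return 1000;	/* no free memory: all of it is "unusable" */
	return (int)((info->free_pages -
		      (info->free_blocks_suitable << order)) * 1000UL /
		     info->free_pages);
}

int main(void)
{
	/* invented snapshot: 1000 free pages, ten free order-4 blocks */
	struct snapshot info = {
		.free_pages = 1000,
		.free_blocks_suitable = 10,
	};
	int index = unusable_index(4, &info);

	printf("unusable index: %d.%03d\n", index / 1000, index % 1000);
	/* prints 0.840: 84% of the free memory cannot back an order-4 page */
	return 0;
}
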
+
+static void unusable_show_print(struct seq_file *m,
+ pg_data_t *pgdat, struct zone *zone)
+{
+ unsigned int order;
+ int index;
+ struct contig_page_info info;
+
+ seq_printf(m, "Node %d, zone %8s ",
+ pgdat->node_id,
+ zone->name);
+ for (order = 0; order < MAX_ORDER; ++order) {
+ fill_contig_page_info(zone, order, &info);
+ index = unusable_free_index(order, &info);
+ seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
+ }
+
+ seq_putc(m, '\n');
+}
+
+/*
+ * Display unusable free space index
+ *
+ * The unusable free space index measures how much of the available free
+ * memory cannot be used to satisfy an allocation of a given size and is a
+ * value between 0 and 1. The higher the value, the more of free memory is
+ * unusable and by implication, the worse the external fragmentation is. This
+ * can be expressed as a percentage by multiplying by 100.
+ */
+static int unusable_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+
+ /* check memoryless node */
+ if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
+ return 0;
+
+ walk_zones_in_node(m, pgdat, unusable_show_print);
+
+ return 0;
+}
+
+static const struct seq_operations unusable_op = {
+ .start = frag_start,
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = unusable_show,
+};
+
+static int unusable_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &unusable_op);
+}
+
+static const struct file_operations unusable_file_ops = {
+ .open = unusable_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void extfrag_show_print(struct seq_file *m,
+ pg_data_t *pgdat, struct zone *zone)
+{
+ unsigned int order;
+ int index;
+
+ /* Alloc on stack as interrupts are disabled for zone walk */
+ struct contig_page_info info;
+
+ seq_printf(m, "Node %d, zone %8s ",
+ pgdat->node_id,
+ zone->name);
+ for (order = 0; order < MAX_ORDER; ++order) {
+ fill_contig_page_info(zone, order, &info);
+ index = __fragmentation_index(order, &info);
+ seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
+ }
+
+ seq_putc(m, '\n');
+}
+
+/*
+ * Display the fragmentation index for orders at which allocations would fail
+ */
+static int extfrag_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+
+ walk_zones_in_node(m, pgdat, extfrag_show_print);
+
+ return 0;
+}
+
+static const struct seq_operations extfrag_op = {
+ .start = frag_start,
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = extfrag_show,
+};
+
+static int extfrag_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &extfrag_op);
+}
+
+static const struct file_operations extfrag_file_ops = {
+ .open = extfrag_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init extfrag_debug_init(void)
+{
+ extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
+ if (!extfrag_debug_root)
+ return -ENOMEM;
+
+ if (!debugfs_create_file("unusable_index", 0444,
+ extfrag_debug_root, NULL, &unusable_file_ops))
+ return -ENOMEM;
+
+ if (!debugfs_create_file("extfrag_index", 0444,
+ extfrag_debug_root, NULL, &extfrag_file_ops))
+ return -ENOMEM;
+
+ return 0;
+}
+
+module_init(extfrag_debug_init);
+#endif