From 41f2df62894bfcd3bf868af916b32b90aa7168dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Jun 2010 08:54:16 +0200 Subject: block: BARRIER request should imply SYNC A barrier request should by definition have priority in get_request and let the queue be unplugged immediately as it's blocking all forward progress due to the queue draining. Most filesystems already get this implicitly through the way submit_bh treats the buffer_ordered flag, and gfs2 sets it explicitly. But btrfs and XFS are still forgetting to set the flag, as is blkdev_issue_flush and some places in DM/MD. For XFS on metadata heavy workloads this gives a consistent speedup in the 2-3% range. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/gfs2/log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 6a857e24f94..efc3539ac5a 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -595,7 +595,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) goto skip_barrier; get_bh(bh); - submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh); + submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); -- cgit v1.2.3 From 7b6d91daee5cac6402186ff224c3af39d79f4a0e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 7 Aug 2010 18:20:39 +0200 Subject: block: unify flags for struct bio and struct request Remove the current bio flags and reuse the request flags for the bio, too. This makes it easier to trace the type of I/O from the filesystem down to the block driver. There were two flags in the bio that were missing in the requests: BIO_RW_UNPLUG and BIO_RW_AHEAD. Also I've renamed two request flags that had a superfluous RW in them. 
Note that the flags are in bio.h despite having the REQ_ name - as blkdev.h includes bio.h that is the only way to go for now. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/bio.c | 5 +++-- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/inode.c | 6 +++--- fs/btrfs/volumes.c | 18 +++++++++--------- fs/exofs/ios.c | 2 +- fs/gfs2/log.c | 4 ++-- fs/gfs2/meta_io.c | 8 ++++---- fs/gfs2/ops_fstype.c | 2 +- fs/nilfs2/segbuf.c | 2 +- 9 files changed, 28 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index e7bf6ca64dc..8abb2dfb2e7 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -843,7 +843,8 @@ struct bio *bio_copy_user_iov(struct request_queue *q, if (!bio) goto out_bmd; - bio->bi_rw |= (!write_to_vm << BIO_RW); + if (!write_to_vm) + bio->bi_rw |= REQ_WRITE; ret = 0; @@ -1024,7 +1025,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, * set data direction, and check if mapped pages need bouncing */ if (!write_to_vm) - bio->bi_rw |= (1 << BIO_RW); + bio->bi_rw |= REQ_WRITE; bio->bi_bdev = bdev; bio->bi_flags |= (1 << BIO_USER_MAPPED); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 34f7c375567..64f10082f04 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -480,7 +480,7 @@ static void end_workqueue_bio(struct bio *bio, int err) end_io_wq->work.func = end_workqueue_fn; end_io_wq->work.flags = 0; - if (bio->bi_rw & (1 << BIO_RW)) { + if (bio->bi_rw & REQ_WRITE) { if (end_io_wq->metadata) btrfs_queue_worker(&fs_info->endio_meta_write_workers, &end_io_wq->work); @@ -604,7 +604,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, atomic_inc(&fs_info->nr_async_submits); - if (rw & (1 << BIO_RW_SYNCIO)) + if (rw & REQ_SYNC) btrfs_set_work_high_prio(&async->work); btrfs_queue_worker(&fs_info->workers, &async->work); @@ -668,7 +668,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, bio, 1); BUG_ON(ret); - if (!(rw & (1 << BIO_RW))) { + if (!(rw & 
REQ_WRITE)) { /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads @@ -1427,7 +1427,7 @@ static void end_workqueue_fn(struct btrfs_work *work) * ram and up to date before trying to verify things. For * blocksize <= pagesize, it is basically a noop */ - if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata && + if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata && !bio_ready_for_csum(bio)) { btrfs_queue_worker(&fs_info->endio_meta_workers, &end_io_wq->work); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1bff92ad474..e975d7180a8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1429,7 +1429,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); BUG_ON(ret); - if (!(rw & (1 << BIO_RW))) { + if (!(rw & REQ_WRITE)) { if (bio_flags & EXTENT_BIO_COMPRESSED) { return btrfs_submit_compressed_read(inode, bio, mirror_num, bio_flags); @@ -1841,7 +1841,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, bio->bi_size = 0; bio_add_page(bio, page, failrec->len, start - page_offset(page)); - if (failed_bio->bi_rw & (1 << BIO_RW)) + if (failed_bio->bi_rw & REQ_WRITE) rw = WRITE; else rw = READ; @@ -5642,7 +5642,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, struct bio_vec *bvec = bio->bi_io_vec; u64 start; int skip_sum; - int write = rw & (1 << BIO_RW); + int write = rw & REQ_WRITE; int ret = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d6e3af8be95..dd318ff280b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -258,7 +258,7 @@ loop_lock: BUG_ON(atomic_read(&cur->bi_cnt) == 0); - if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) + if (cur->bi_rw & REQ_SYNC) num_sync_run++; submit_bio(cur->bi_rw, cur); @@ -2651,7 +2651,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int max_errors = 0; 
struct btrfs_multi_bio *multi = NULL; - if (multi_ret && !(rw & (1 << BIO_RW))) + if (multi_ret && !(rw & REQ_WRITE)) stripes_allocated = 1; again: if (multi_ret) { @@ -2687,7 +2687,7 @@ again: mirror_num = 0; /* if our multi bio struct is too small, back off and try again */ - if (rw & (1 << BIO_RW)) { + if (rw & REQ_WRITE) { if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) { stripes_required = map->num_stripes; @@ -2697,7 +2697,7 @@ again: max_errors = 1; } } - if (multi_ret && (rw & (1 << BIO_RW)) && + if (multi_ret && (rw & REQ_WRITE) && stripes_allocated < stripes_required) { stripes_allocated = map->num_stripes; free_extent_map(em); @@ -2733,7 +2733,7 @@ again: num_stripes = 1; stripe_index = 0; if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (unplug_page || (rw & (1 << BIO_RW))) + if (unplug_page || (rw & REQ_WRITE)) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -2744,7 +2744,7 @@ again: } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & (1 << BIO_RW)) + if (rw & REQ_WRITE) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -2755,7 +2755,7 @@ again: stripe_index = do_div(stripe_nr, factor); stripe_index *= map->sub_stripes; - if (unplug_page || (rw & (1 << BIO_RW))) + if (unplug_page || (rw & REQ_WRITE)) num_stripes = map->sub_stripes; else if (mirror_num) stripe_index += mirror_num - 1; @@ -2945,7 +2945,7 @@ static noinline int schedule_bio(struct btrfs_root *root, struct btrfs_pending_bios *pending_bios; /* don't bother with additional async steps for reads, right now */ - if (!(rw & (1 << BIO_RW))) { + if (!(rw & REQ_WRITE)) { bio_get(bio); submit_bio(rw, bio); bio_put(bio); @@ -2964,7 +2964,7 @@ static noinline int schedule_bio(struct btrfs_root *root, bio->bi_rw |= rw; spin_lock(&device->io_lock); - if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) + if (bio->bi_rw & REQ_SYNC) pending_bios = &device->pending_sync_bios; else pending_bios = 
&device->pending_bios; diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 4337cad7777..e2732203fa9 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -599,7 +599,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) } else { bio = master_dev->bio; /* FIXME: bio_set_dir() */ - bio->bi_rw |= (1 << BIO_RW); + bio->bi_rw |= REQ_WRITE; } osd_req_write(or, &ios->obj, per_dev->offset, bio, diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index efc3539ac5a..cde1248a622 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -595,7 +595,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) goto skip_barrier; get_bh(bh); - submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh); + submit_bh(WRITE_BARRIER | REQ_META, bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); @@ -605,7 +605,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) lock_buffer(bh); skip_barrier: get_bh(bh); - submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh); + submit_bh(WRITE_SYNC | REQ_META, bh); wait_on_buffer(bh); } if (!buffer_uptodate(bh)) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 18176d0b75d..f3b071f921a 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -36,8 +36,8 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb { struct buffer_head *bh, *head; int nr_underway = 0; - int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC_PLUG : WRITE)); + int write_op = REQ_META | + (wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC_PLUG : WRITE); BUG_ON(!PageLocked(page)); BUG_ON(!page_has_buffers(page)); @@ -225,7 +225,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, } bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(READ_SYNC | (1 << BIO_RW_META), bh); + submit_bh(READ_SYNC | REQ_META, bh); if (!(flags & DIO_WAIT)) return 0; @@ -432,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(READ_SYNC | (1 << BIO_RW_META), 1, &first_bh); + ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 3593b3a7290..fd4f8946abf 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -275,7 +275,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) bio->bi_end_io = end_bio_io_page; bio->bi_private = page; - submit_bio(READ_SYNC | (1 << BIO_RW_META), bio); + submit_bio(READ_SYNC | REQ_META, bio); wait_on_page_locked(page); bio_put(bio); if (!PageUptodate(page)) { diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 2e6a2723b8f..4588fb9e93d 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -508,7 +508,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, * Last BIO is always sent through the following * submission. */ - rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); + rw |= REQ_SYNC | REQ_UNPLUG; res = nilfs_segbuf_submit_bio(segbuf, &wi, rw); } -- cgit v1.2.3 From c1955ce32fdb0877b7a1b22feb2669358f65be76 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 19 Jun 2010 23:08:06 +0200 Subject: writeback: remove wb_list The wb_list member of struct backing_device_info always has exactly one element. Just use the direct bdi->wb pointer instead and simplify some code. Also remove bdi_task_init which is now trivial to prepare for the next patch. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d5be1693ac9..d67989b8ba4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -73,9 +73,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi, * If the default thread isn't there, make sure we add it. When * it gets created and wakes up, we'll run this work. */ - if (unlikely(list_empty_careful(&bdi->wb_list))) + if (unlikely(!bdi->wb.task)) { wake_up_process(default_backing_dev_info.wb.task); - else { + } else { struct bdi_writeback *wb = &bdi->wb; if (wb->task) -- cgit v1.2.3 From 082439004b31adc146e96e5f1c574dd2b57dcd93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 19 Jun 2010 23:08:22 +0200 Subject: writeback: merge bdi_writeback_task and bdi_start_fn Move all code for the writeback thread into fs/fs-writeback.c instead of splitting it over two functions in two files. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d67989b8ba4..c8471b3ddcc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -775,12 +775,36 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * Handle writeback of dirty data for the device backed by this bdi. Also * wakes up periodically and does kupdated style flushing. 
*/ -int bdi_writeback_task(struct bdi_writeback *wb) +int bdi_writeback_thread(void *data) { + struct bdi_writeback *wb = data; + struct backing_dev_info *bdi = wb->bdi; unsigned long last_active = jiffies; unsigned long wait_jiffies = -1UL; long pages_written; + /* + * Add us to the active bdi_list + */ + spin_lock_bh(&bdi_lock); + list_add_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); + + current->flags |= PF_FLUSHER | PF_SWAPWRITE; + set_freezable(); + + /* + * Our parent may run at a different priority, just set us to normal + */ + set_user_nice(current, 0); + + /* + * Clear pending bit and wakeup anybody waiting to tear us down + */ + clear_bit(BDI_pending, &bdi->state); + smp_mb__after_clear_bit(); + wake_up_bit(&bdi->state, BDI_pending); + while (!kthread_should_stop()) { pages_written = wb_do_writeback(wb, 0); @@ -813,9 +837,18 @@ int bdi_writeback_task(struct bdi_writeback *wb) try_to_freeze(); } + wb->task = NULL; + + /* + * Flush any work that raced with us exiting. No new work + * will be added, since this bdi isn't discoverable anymore. + */ + if (!list_empty(&bdi->work_list)) + wb_do_writeback(wb, 1); return 0; } + /* * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. -- cgit v1.2.3 From 1676effca4cd2a6b32e6e8e0ecaa91522dfda6fa Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 21 Jun 2010 11:02:48 +0200 Subject: gcc-4.6: fs: fix unused but set warnings No real bugs I believe, just some dead code, and some shut up code. 
Signed-off-by: Andi Kleen Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/splice.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index efdbfece993..ec11c52d646 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -597,7 +597,6 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct page *pages[PIPE_DEF_BUFFERS]; struct partial_page partial[PIPE_DEF_BUFFERS]; struct iovec *vec, __vec[PIPE_DEF_BUFFERS]; - pgoff_t index; ssize_t res; size_t this_len; int error; @@ -621,7 +620,6 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, goto shrink_ret; } - index = *ppos >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -- cgit v1.2.3 From 455b2864686d3591b3b2f39eb46290c95f76471f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 7 Jul 2010 13:24:06 +1000 Subject: writeback: Initial tracing support Trace queue/sched/exec parts of the writeback loop. This provides insight into when and why flusher threads are scheduled to run. e.g a sync invocation leaves traces like: sync-[...]: writeback_queue: bdi 8:0: sb_dev 8:1 nr_pages=7712 sync_mode=0 kupdate=0 range_cyclic=0 background=0 flush-8:0-[...]: writeback_exec: bdi 8:0: sb_dev 8:1 nr_pages=7712 sync_mode=0 kupdate=0 range_cyclic=0 background=0 This also lays the foundation for adding more writeback tracing to provide deeper insight into the whole writeback path. The original tracing code is from Jens Axboe, though this version is a rewrite as a result of the code being traced changing significantly. 
Signed-off-by: Dave Chinner Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c8471b3ddcc..73acab4dc2b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -26,15 +26,9 @@ #include #include #include +#include #include "internal.h" -#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) - -/* - * We don't actually have pdflush, but this one is exported though /proc... - */ -int nr_pdflush_threads; - /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -50,6 +44,21 @@ struct wb_writeback_work { struct completion *done; /* set if the caller waits */ }; +/* + * Include the creation of the trace points after defining the + * wb_writeback_work structure so that the definition remains local to this + * file. + */ +#define CREATE_TRACE_POINTS +#include + +#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) + +/* + * We don't actually have pdflush, but this one is exported though /proc... + */ +int nr_pdflush_threads; + /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. @@ -65,6 +74,8 @@ int writeback_in_progress(struct backing_dev_info *bdi) static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { + trace_writeback_queue(bdi, work); + spin_lock(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); spin_unlock(&bdi->wb_lock); @@ -74,6 +85,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, * it gets created and wakes up, we'll run this work. 
*/ if (unlikely(!bdi->wb.task)) { + trace_writeback_nothread(bdi, work); wake_up_process(default_backing_dev_info.wb.task); } else { struct bdi_writeback *wb = &bdi->wb; @@ -95,8 +107,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - if (bdi->wb.task) + if (bdi->wb.task) { + trace_writeback_nowork(bdi); wake_up_process(bdi->wb.task); + } return; } @@ -751,6 +765,8 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) if (force_wait) work->sync_mode = WB_SYNC_ALL; + trace_writeback_exec(bdi, work); + wrote += wb_writeback(wb, work); /* @@ -805,9 +821,13 @@ int bdi_writeback_thread(void *data) smp_mb__after_clear_bit(); wake_up_bit(&bdi->state, BDI_pending); + trace_writeback_thread_start(bdi); + while (!kthread_should_stop()) { pages_written = wb_do_writeback(wb, 0); + trace_writeback_pages_written(pages_written); + if (pages_written) last_active = jiffies; else if (wait_jiffies != -1UL) { @@ -845,6 +865,8 @@ int bdi_writeback_thread(void *data) */ if (!list_empty(&bdi->work_list)) wb_do_writeback(wb, 1); + + trace_writeback_thread_stop(bdi); return 0; } -- cgit v1.2.3 From 028c2dd184c097809986684f2f0627eea5529fea Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 7 Jul 2010 13:24:07 +1000 Subject: writeback: Add tracing to balance_dirty_pages Tracing high level background writeback events is good, but it doesn't give the entire picture. Add visibility into write throttling to catch IO dispatched by foreground throttling of processing dirtying lots of pages. 
Signed-off-by: Dave Chinner Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 73acab4dc2b..bf10cbf379d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -656,10 +656,14 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.more_io = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.pages_skipped = 0; + + trace_wbc_writeback_start(&wbc, wb->bdi); if (work->sb) __writeback_inodes_sb(work->sb, wb, &wbc); else writeback_inodes_wb(wb, &wbc); + trace_wbc_writeback_written(&wbc, wb->bdi); + work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; @@ -687,6 +691,7 @@ static long wb_writeback(struct bdi_writeback *wb, if (!list_empty(&wb->b_more_io)) { inode = list_entry(wb->b_more_io.prev, struct inode, i_list); + trace_wbc_writeback_wait(&wbc, wb->bdi); inode_wait_for_writeback(inode); } spin_unlock(&inode_lock); -- cgit v1.2.3 From 6e9624b8caec290d28b4c6d9ec75749df6372b87 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 7 Aug 2010 18:25:34 +0200 Subject: block: push down BKL into .open and .release The open and release block_device_operations are currently called with the BKL held. In order to change that, we must first make sure that all drivers that currently rely on this have no regressions. This blindly pushes the BKL into all .open and .release operations for all block drivers to prepare for the next step. The drivers can subsequently replace the BKL with their own locks or remove it completely when it can be shown that it is not needed. The functions blkdev_get and blkdev_put are the only remaining users of the big kernel lock in the block layer, besides a few uses in the ioctl code, none of which need to serialize with blkdev_{get,put}. 
Most of these two functions is also under the protection of bdev->bd_mutex, including the actual calls to ->open and ->release, and the common code does not access any global data structures that need the BKL. Signed-off-by: Arnd Bergmann Acked-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 99d6af81174..693c2bf5d65 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1345,13 +1345,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) return ret; } - lock_kernel(); restart: ret = -ENXIO; disk = get_gendisk(bdev->bd_dev, &partno); if (!disk) - goto out_unlock_kernel; + goto out; mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { @@ -1431,7 +1430,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (for_part) bdev->bd_part_count++; mutex_unlock(&bdev->bd_mutex); - unlock_kernel(); return 0; out_clear: @@ -1444,9 +1442,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_contains = NULL; out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); - out_unlock_kernel: - unlock_kernel(); - + out: if (disk) module_put(disk->fops->owner); put_disk(disk); @@ -1515,7 +1511,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) struct block_device *victim = NULL; mutex_lock_nested(&bdev->bd_mutex, for_part); - lock_kernel(); if (for_part) bdev->bd_part_count--; @@ -1540,7 +1535,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) victim = bdev->bd_contains; bdev->bd_contains = NULL; } - unlock_kernel(); mutex_unlock(&bdev->bd_mutex); bdput(bdev); if (victim) -- cgit v1.2.3 From 6965031d331a642e31278fa1b5bd47f372ffdd5d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 3 Aug 2010 12:48:50 +0200 Subject: splice: fix misuse of SPLICE_F_NONBLOCK 
SPLICE_F_NONBLOCK is clearly documented to only affect blocking on the pipe. In __generic_file_splice_read(), however, it causes an EAGAIN if the page is currently being read. This makes it impossible to write an application that only wants failure if the pipe is full. For example if the same process is handling both ends of a pipe and isn't otherwise able to determine whether a splice to the pipe will fill it or not. We could make the read non-blocking on O_NONBLOCK or some other splice flag, but for now this is the simplest fix. Signed-off-by: Miklos Szeredi CC: stable@kernel.org Signed-off-by: Jens Axboe --- fs/splice.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index ec11c52d646..8f1dfaecc8f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -399,17 +399,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * If the page isn't uptodate, we may need to start io on it */ if (!PageUptodate(page)) { - /* - * If in nonblock mode then dont block on waiting - * for an in-flight io page - */ - if (flags & SPLICE_F_NONBLOCK) { - if (!trylock_page(page)) { - error = -EAGAIN; - break; - } - } else - lock_page(page); + lock_page(page); /* * Page was truncated, or invalidated by the -- cgit v1.2.3 From 08852b6d6c40f387f2b75e199e2ca1df68970f4c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 3 Aug 2010 12:51:16 +0200 Subject: writeback: remove wb in get_next_work_item 83ba7b07 cleans up the writeback. So we don't use wb any more in get_next_work_item. Let's remove unnecessary argument. 
CC: Christoph Hellwig Signed-off-by: Minchan Kim Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index bf10cbf379d..261570deb22 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -704,7 +704,7 @@ static long wb_writeback(struct bdi_writeback *wb, * Return the next wb_writeback_work struct that hasn't been processed yet. */ static struct wb_writeback_work * -get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb) +get_next_work_item(struct backing_dev_info *bdi) { struct wb_writeback_work *work = NULL; @@ -762,7 +762,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) struct wb_writeback_work *work; long wrote = 0; - while ((work = get_next_work_item(bdi, wb)) != NULL) { + while ((work = get_next_work_item(bdi)) != NULL) { /* * Override sync mode, in case we must wait for completion * because this thread is exiting now. -- cgit v1.2.3 From 4aeefdc69f7b6f3f287e6fd8d4b213953b9e92d8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 3 Aug 2010 13:22:51 +0200 Subject: coda: fixup clash with block layer REQ_* defines CODA should not be using defines in the global name space of that nature, prefix them with CODA_. Signed-off-by: Jens Axboe --- fs/coda/psdev.c | 12 ++++++------ fs/coda/upcall.c | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 66b9cf79c5b..de89645777c 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -177,7 +177,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, nbytes = req->uc_outSize; /* don't have more space! 
*/ } if (copy_from_user(req->uc_data, buf, nbytes)) { - req->uc_flags |= REQ_ABORT; + req->uc_flags |= CODA_REQ_ABORT; wake_up(&req->uc_sleep); retval = -EFAULT; goto out; @@ -254,8 +254,8 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf, retval = -EFAULT; /* If request was not a signal, enqueue and don't free */ - if (!(req->uc_flags & REQ_ASYNC)) { - req->uc_flags |= REQ_READ; + if (!(req->uc_flags & CODA_REQ_ASYNC)) { + req->uc_flags |= CODA_REQ_READ; list_add_tail(&(req->uc_chain), &vcp->vc_processing); goto out; } @@ -315,19 +315,19 @@ static int coda_psdev_release(struct inode * inode, struct file * file) list_del(&req->uc_chain); /* Async requests need to be freed here */ - if (req->uc_flags & REQ_ASYNC) { + if (req->uc_flags & CODA_REQ_ASYNC) { CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr)); kfree(req); continue; } - req->uc_flags |= REQ_ABORT; + req->uc_flags |= CODA_REQ_ABORT; wake_up(&req->uc_sleep); } list_for_each_entry_safe(req, tmp, &vcp->vc_processing, uc_chain) { list_del(&req->uc_chain); - req->uc_flags |= REQ_ABORT; + req->uc_flags |= CODA_REQ_ABORT; wake_up(&req->uc_sleep); } diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index f09c5ed76f6..b8893ab6f9e 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -604,7 +604,7 @@ static void coda_unblock_signals(sigset_t *old) (((r)->uc_opcode != CODA_CLOSE && \ (r)->uc_opcode != CODA_STORE && \ (r)->uc_opcode != CODA_RELEASE) || \ - (r)->uc_flags & REQ_READ)) + (r)->uc_flags & CODA_REQ_READ)) static inline void coda_waitfor_upcall(struct upc_req *req) { @@ -624,7 +624,7 @@ static inline void coda_waitfor_upcall(struct upc_req *req) set_current_state(TASK_UNINTERRUPTIBLE); /* got a reply */ - if (req->uc_flags & (REQ_WRITE | REQ_ABORT)) + if (req->uc_flags & (CODA_REQ_WRITE | CODA_REQ_ABORT)) break; if (blocked && time_after(jiffies, timeout) && @@ -708,7 +708,7 @@ static int coda_upcall(struct venus_comm *vcp, coda_waitfor_upcall(req); /* Op went through, interrupt or 
not... */ - if (req->uc_flags & REQ_WRITE) { + if (req->uc_flags & CODA_REQ_WRITE) { out = (union outputArgs *)req->uc_data; /* here we map positive Venus errors to kernel errors */ error = -out->oh.result; @@ -717,13 +717,13 @@ static int coda_upcall(struct venus_comm *vcp, } error = -EINTR; - if ((req->uc_flags & REQ_ABORT) || !signal_pending(current)) { + if ((req->uc_flags & CODA_REQ_ABORT) || !signal_pending(current)) { printk(KERN_WARNING "coda: Unexpected interruption.\n"); goto exit; } /* Interrupted before venus read it. */ - if (!(req->uc_flags & REQ_READ)) + if (!(req->uc_flags & CODA_REQ_READ)) goto exit; /* Venus saw the upcall, make sure we can send interrupt signal */ @@ -747,7 +747,7 @@ static int coda_upcall(struct venus_comm *vcp, sig_inputArgs->ih.opcode = CODA_SIGNAL; sig_inputArgs->ih.unique = req->uc_unique; - sig_req->uc_flags = REQ_ASYNC; + sig_req->uc_flags = CODA_REQ_ASYNC; sig_req->uc_opcode = sig_inputArgs->ih.opcode; sig_req->uc_unique = sig_inputArgs->ih.unique; sig_req->uc_inSize = sizeof(struct coda_in_hdr); -- cgit v1.2.3 From 6f904ff0e39ea88f81eb77e8dfb4e1238492f0a8 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:11 +0300 Subject: writeback: harmonize writeback threads naming The write-back code mixes words "thread" and "task" for the same things. This is not a big deal, but still an inconsistency. hch: a convention I tend to use and I've seen in various places is to always use _task for the storage of the task_struct pointer, and thread everywhere else. This especially helps with having foo_thread for the actual thread and foo_task for a global variable keeping the task_struct pointer This patch renames: * 'bdi_add_default_flusher_task()' -> 'bdi_add_default_flusher_thread()' * 'bdi_forker_task()' -> 'bdi_forker_thread()' because bdi threads are 'bdi_writeback_thread()', so these names are more consistent. 
This patch also amends the comments so that they refer to the forker and bdi threads as "thread", not "task". Also, while at it, make the 'bdi_add_default_flusher_thread()' declaration use 'static void' instead of 'void static' and make checkpatch.pl happy. Signed-off-by: Artem Bityutskiy Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 261570deb22..002be0ff2ab 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -840,7 +840,7 @@ int bdi_writeback_thread(void *data) /* * Longest period of inactivity that we tolerate. If we - * see dirty data again later, the task will get + * see dirty data again later, the thread will get * recreated automatically. */ max_idle = max(5UL * 60 * HZ, wait_jiffies); -- cgit v1.2.3 From 297252c81de8043ca6c36e5984c24fdb5aab9013 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:15 +0300 Subject: writeback: do not lose wake-ups in bdi threads Currently, bdi threads ('bdi_writeback_thread()') can lose wake-ups. For example, a wake-up is lost if 'bdi_queue_work()' is executed after the bdi thread has finished 'wb_do_writeback()' but before it has called 'schedule_timeout_interruptible()'. To fix this issue, we have to check whether we have work to process after we have changed the task state to 'TASK_INTERRUPTIBLE'. This patch also cleans up the handling of the cases when 'dirty_writeback_interval' is zero or non-zero. Additionally, this patch also removes an unneeded 'list_empty_careful()' call. 
Signed-off-by: Artem Bityutskiy Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 002be0ff2ab..05444eaa3f3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -848,17 +848,18 @@ int bdi_writeback_thread(void *data) break; } - if (dirty_writeback_interval) { - wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); - schedule_timeout_interruptible(wait_jiffies); - } else { - set_current_state(TASK_INTERRUPTIBLE); - if (list_empty_careful(&wb->bdi->work_list) && - !kthread_should_stop()) - schedule(); + set_current_state(TASK_INTERRUPTIBLE); + if (!list_empty(&bdi->work_list)) { __set_current_state(TASK_RUNNING); + continue; } + if (dirty_writeback_interval) { + wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); + schedule_timeout(wait_jiffies); + } else + schedule(); + try_to_freeze(); } -- cgit v1.2.3 From 78c40cb6581a74adc48821f3de6b864a54d4c34d Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:17 +0300 Subject: writeback: do not remove bdi from bdi_list The forker thread removes bdis from 'bdi_list' before forking the bdi thread. But this is wrong for at least 2 reasons. Reason #1: if we temporarily remove a bdi from the list, we may miss works which would otherwise be given to us. Reason #2: this is racy; indeed, 'bdi_wb_shutdown()' expects that bdis are always in the 'bdi_list' (see 'bdi_remove_from_list()'), and when it races with the forker thread, it can shut down the bdi thread at the same time as the forker creates it. This patch makes sure the forker thread never removes bdis from 'bdi_list' (which was suggested by Christoph Hellwig). In order to make sure that we do not race with 'bdi_wb_shutdown()', we have to hold the 'bdi_lock' while walking the 'bdi_list' and setting the 'BDI_pending' flag. NOTE! The error path is interesting.
Currently, when we fail to create a bdi thread, we move the bdi to the tail of 'bdi_list'. But if we never remove the bdi from the list, we cannot move it to the tail either, because then we can mess up the RCU readers which walk the list. And also, we'll have the race described above in "Reason #2". But I do not think that adding to the tail is important, so I just do not do that. Signed-off-by: Artem Bityutskiy Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 05444eaa3f3..57fbfd0ebc5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -804,13 +804,6 @@ int bdi_writeback_thread(void *data) unsigned long wait_jiffies = -1UL; long pages_written; - /* - * Add us to the active bdi_list - */ - spin_lock_bh(&bdi_lock); - list_add_rcu(&bdi->bdi_list, &bdi_list); - spin_unlock_bh(&bdi_lock); - current->flags |= PF_FLUSHER | PF_SWAPWRITE; set_freezable(); -- cgit v1.2.3 From ecd584030da67ede1bf17955746a6ce834d9fc6b Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:18 +0300 Subject: writeback: move last_active to bdi Currently bdi threads use a local variable, 'last_active', which stores the last time when the bdi thread did some useful work. Move this local variable to 'struct bdi_writeback'. This is just a preparation for the further patches which will make the forker thread decide when bdi threads should be killed.
Signed-off-by: Artem Bityutskiy Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 57fbfd0ebc5..9f5cab75c15 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -800,12 +800,12 @@ int bdi_writeback_thread(void *data) { struct bdi_writeback *wb = data; struct backing_dev_info *bdi = wb->bdi; - unsigned long last_active = jiffies; unsigned long wait_jiffies = -1UL; long pages_written; current->flags |= PF_FLUSHER | PF_SWAPWRITE; set_freezable(); + wb->last_active = jiffies; /* * Our parent may run at a different priority, just set us to normal @@ -827,7 +827,7 @@ int bdi_writeback_thread(void *data) trace_writeback_pages_written(pages_written); if (pages_written) - last_active = jiffies; + wb->last_active = jiffies; else if (wait_jiffies != -1UL) { unsigned long max_idle; @@ -837,7 +837,7 @@ int bdi_writeback_thread(void *data) * recreated automatically. */ max_idle = max(5UL * 60 * HZ, wait_jiffies); - if (time_after(jiffies, max_idle + last_active)) + if (time_after(jiffies, max_idle + wb->last_active)) break; } -- cgit v1.2.3 From fff5b85aa4225a7be157f208277a055822039a9e Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:20 +0300 Subject: writeback: move bdi threads exiting logic to the forker thread Currently, bdi threads can decide to exit if there were no useful activities for 5 minutes. However, this causes nasty races: we can easily oops in 'bdi_queue_work()' if the bdi thread decides to exit while we are waking it up. And even if we do not oops, but the bdi thread exits immediately after we wake it up, we'd lose the wake-up event and have an unnecessary delay (up to 5 secs) in the bdi work processing. This patch makes the forker thread the central place which not only creates bdi threads, but also kills them if they were inactive long enough.
This is better design-wise. Another reason why this change was done is to prepare for the further changes which will prevent the bdi threads from waking up every 5 sec and wasting power. Indeed, when the task does not wake up periodically anymore, it won't be able to exit either. This patch also moves the 'wake_up_bit()' call from the bdi thread to the forker thread as well. So now the forker thread sets the BDI_pending bit, then forks the task or kills it, then clears the bit and wakes up the waiting process. The only process which may wait on the bit is 'bdi_wb_shutdown()'. This function was changed as well - now it first removes the bdi from the 'bdi_list', then waits on the 'BDI_pending' bit. Once it wakes up, it is guaranteed that the forker thread won't race with it, because the bdi is not visible. Note, the forker thread sets the 'BDI_pending' bit under the 'bdi->wb_lock' which is essential for proper serialization. And additionally, when we change 'bdi->wb.task', we now take the 'bdi->work_lock', to make sure that we do not lose wake-ups which we otherwise would when racing with, say, 'bdi_queue_work()'. Signed-off-by: Artem Bityutskiy Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 54 ++++++++++++------------------------------------------ 1 file changed, 12 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9f5cab75c15..905f3ea3848 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -78,21 +78,17 @@ static void bdi_queue_work(struct backing_dev_info *bdi, spin_lock(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); - spin_unlock(&bdi->wb_lock); - - /* - * If the default thread isn't there, make sure we add it. When - * it gets created and wakes up, we'll run this work.
- */ - if (unlikely(!bdi->wb.task)) { + if (bdi->wb.task) { + wake_up_process(bdi->wb.task); + } else { + /* + * The bdi thread isn't there, wake up the forker thread which + * will create and run it. + */ trace_writeback_nothread(bdi, work); wake_up_process(default_backing_dev_info.wb.task); - } else { - struct bdi_writeback *wb = &bdi->wb; - - if (wb->task) - wake_up_process(wb->task); } + spin_unlock(&bdi->wb_lock); } static void @@ -800,7 +796,6 @@ int bdi_writeback_thread(void *data) { struct bdi_writeback *wb = data; struct backing_dev_info *bdi = wb->bdi; - unsigned long wait_jiffies = -1UL; long pages_written; current->flags |= PF_FLUSHER | PF_SWAPWRITE; @@ -812,13 +807,6 @@ int bdi_writeback_thread(void *data) */ set_user_nice(current, 0); - /* - * Clear pending bit and wakeup anybody waiting to tear us down - */ - clear_bit(BDI_pending, &bdi->state); - smp_mb__after_clear_bit(); - wake_up_bit(&bdi->state, BDI_pending); - trace_writeback_thread_start(bdi); while (!kthread_should_stop()) { @@ -828,18 +816,6 @@ int bdi_writeback_thread(void *data) if (pages_written) wb->last_active = jiffies; - else if (wait_jiffies != -1UL) { - unsigned long max_idle; - - /* - * Longest period of inactivity that we tolerate. If we - * see dirty data again later, the thread will get - * recreated automatically. - */ - max_idle = max(5UL * 60 * HZ, wait_jiffies); - if (time_after(jiffies, max_idle + wb->last_active)) - break; - } set_current_state(TASK_INTERRUPTIBLE); if (!list_empty(&bdi->work_list)) { @@ -847,21 +823,15 @@ int bdi_writeback_thread(void *data) continue; } - if (dirty_writeback_interval) { - wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); - schedule_timeout(wait_jiffies); - } else + if (dirty_writeback_interval) + schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); + else schedule(); try_to_freeze(); } - wb->task = NULL; - - /* - * Flush any work that raced with us exiting. 
No new work - * will be added, since this bdi isn't discoverable anymore. - */ + /* Flush any work that raced with us exiting */ if (!list_empty(&bdi->work_list)) wb_do_writeback(wb, 1); -- cgit v1.2.3 From 253c34e9b10c30d3064be654b5b78fbc1a8b1896 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:21 +0300 Subject: writeback: prevent unnecessary bdi threads wakeups Finally, we can get rid of unnecessary wake-ups in bdi threads, which are very bad for battery-driven devices. There are two types of activities bdi threads do: 1. process bdi works from the 'bdi->work_list' 2. periodic write-back So there are 2 sources of wake-up events for bdi threads: 1. 'bdi_queue_work()' - submits bdi works 2. '__mark_inode_dirty()' - adds dirty I/O to bdi's The former already has bdi wake-up code. The latter does not, and this patch adds it. '__mark_inode_dirty()' is a hot-path function, but this patch adds another 'spin_lock(&bdi->wb_lock)' there. However, it is taken only in rare cases when the bdi has no dirty inodes. So adding this spinlock should be fine and should not affect performance. This patch makes sure bdi threads and the forker thread do not wake-up if there is nothing to do. The forker thread will nevertheless wake up at least every 5 min. to check whether it has to kill a bdi thread. This can also be optimized, but is not worth it. This patch also tidies up the warning about unregistered bdi, and turns it from an ugly crocodile to a simple 'WARN()' statement.
Signed-off-by: Artem Bityutskiy Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 59 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 905f3ea3848..55f6e46e06f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -823,10 +823,16 @@ int bdi_writeback_thread(void *data) continue; } - if (dirty_writeback_interval) + if (wb_has_dirty_io(wb) && dirty_writeback_interval) schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - else + else { + /* + * We have nothing to do, so can go sleep without any + * timeout and save power. When a work is queued or + * something is made dirty - we will be woken up. + */ schedule(); + } try_to_freeze(); } @@ -862,6 +868,26 @@ void wakeup_flusher_threads(long nr_pages) rcu_read_unlock(); } +/* + * This function is used when the first inode for this bdi is marked dirty. It + * wakes-up the corresponding bdi thread which should then take care of the + * periodic background write-out of dirty inodes. + */ +static void wakeup_bdi_thread(struct backing_dev_info *bdi) +{ + spin_lock(&bdi->wb_lock); + if (bdi->wb.task) + wake_up_process(bdi->wb.task); + else + /* + * When bdi tasks are inactive for long time, they are killed. + * In this case we have to wake-up the forker thread which + * should create and run the bdi thread. 
+ */ + wake_up_process(default_backing_dev_info.wb.task); + spin_unlock(&bdi->wb_lock); +} + static noinline void block_dump___mark_inode_dirty(struct inode *inode) { if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { @@ -914,6 +940,8 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; + struct backing_dev_info *bdi = NULL; + bool wakeup_bdi = false; /* * Don't do this for I_DIRTY_PAGES - that doesn't actually @@ -967,22 +995,31 @@ void __mark_inode_dirty(struct inode *inode, int flags) * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - struct backing_dev_info *bdi = wb->bdi; - - if (bdi_cap_writeback_dirty(bdi) && - !test_bit(BDI_registered, &bdi->state)) { - WARN_ON(1); - printk(KERN_ERR "bdi-%s not registered\n", - bdi->name); + bdi = inode_to_bdi(inode); + + if (bdi_cap_writeback_dirty(bdi)) { + WARN(!test_bit(BDI_registered, &bdi->state), + "bdi-%s not registered\n", bdi->name); + + /* + * If this is the first dirty inode for this + * bdi, we have to wake-up the corresponding + * bdi thread to make sure background + * write-back happens later. + */ + if (!wb_has_dirty_io(&bdi->wb)) + wakeup_bdi = true; } inode->dirtied_when = jiffies; - list_move(&inode->i_list, &wb->b_dirty); + list_move(&inode->i_list, &bdi->wb.b_dirty); } } out: spin_unlock(&inode_lock); + + if (wakeup_bdi) + wakeup_bdi_thread(bdi); } EXPORT_SYMBOL(__mark_inode_dirty); -- cgit v1.2.3 From 6467716a37673e8d47b4984eb19839bdad0a8353 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:22 +0300 Subject: writeback: optimize periodic bdi thread wakeups When the first inode for a bdi is marked dirty, we wake up the bdi thread which should take care of the periodic background write-out.
However, the write-out will actually start only 'dirty_writeback_interval' centisecs later, so we can delay the wake-up. This change was requested by Nick Piggin who pointed out that if we delay the wake-up, we weed out 2 unnecessary context switches, which matters because '__mark_inode_dirty()' is a hot-path function. This patch introduces a new function - 'bdi_wakeup_thread_delayed()', which sets up a timer to wake-up the bdi thread and returns. So the wake-up is delayed. We also delete the timer in bdi threads just before writing-back. And synchronously delete it when unregistering bdi. At the unregister point the bdi does not have any users, so no one can arm it again. Since now we take 'bdi->wb_lock' in the timer, which can execute in softirq context, we have to use 'spin_lock_bh()' for 'bdi->wb_lock'. This patch makes this change as well. This patch also moves the 'bdi_wb_init()' function down in the file to avoid forward-declaration of 'bdi_wakeup_thread_delayed()'. Signed-off-by: Artem Bityutskiy Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 55f6e46e06f..bfa2df2c7ce 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -76,7 +76,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, { trace_writeback_queue(bdi, work); - spin_lock(&bdi->wb_lock); + spin_lock_bh(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); if (bdi->wb.task) { wake_up_process(bdi->wb.task); @@ -88,7 +88,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, trace_writeback_nothread(bdi, work); wake_up_process(default_backing_dev_info.wb.task); } - spin_unlock(&bdi->wb_lock); + spin_unlock_bh(&bdi->wb_lock); } static void @@ -704,13 +704,13 @@ get_next_work_item(struct backing_dev_info *bdi) { struct wb_writeback_work *work = NULL; - spin_lock(&bdi->wb_lock); + spin_lock_bh(&bdi->wb_lock); if
(!list_empty(&bdi->work_list)) { work = list_entry(bdi->work_list.next, struct wb_writeback_work, list); list_del_init(&work->list); } - spin_unlock(&bdi->wb_lock); + spin_unlock_bh(&bdi->wb_lock); return work; } @@ -810,6 +810,12 @@ int bdi_writeback_thread(void *data) trace_writeback_thread_start(bdi); while (!kthread_should_stop()) { + /* + * Remove own delayed wake-up timer, since we are already awake + * and we'll take care of the preriodic write-back. + */ + del_timer(&wb->wakeup_timer); + pages_written = wb_do_writeback(wb, 0); trace_writeback_pages_written(pages_written); @@ -868,26 +874,6 @@ void wakeup_flusher_threads(long nr_pages) rcu_read_unlock(); } -/* - * This function is used when the first inode for this bdi is marked dirty. It - * wakes-up the corresponding bdi thread which should then take care of the - * periodic background write-out of dirty inodes. - */ -static void wakeup_bdi_thread(struct backing_dev_info *bdi) -{ - spin_lock(&bdi->wb_lock); - if (bdi->wb.task) - wake_up_process(bdi->wb.task); - else - /* - * When bdi tasks are inactive for long time, they are killed. - * In this case we have to wake-up the forker thread which - * should create and run the bdi thread. - */ - wake_up_process(default_backing_dev_info.wb.task); - spin_unlock(&bdi->wb_lock); -} - static noinline void block_dump___mark_inode_dirty(struct inode *inode) { if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { @@ -1019,7 +1005,7 @@ out: spin_unlock(&inode_lock); if (wakeup_bdi) - wakeup_bdi_thread(bdi); + bdi_wakeup_thread_delayed(bdi); } EXPORT_SYMBOL(__mark_inode_dirty); -- cgit v1.2.3