diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/bio-integrity.c | 4 | ||||
-rw-r--r-- | block/bio.c | 112 | ||||
-rw-r--r-- | block/blk-cgroup.c | 215 | ||||
-rw-r--r-- | block/blk-cgroup.h | 603 | ||||
-rw-r--r-- | block/blk-core.c | 206 | ||||
-rw-r--r-- | block/blk-exec.c | 10 | ||||
-rw-r--r-- | block/blk-integrity.c | 1 | ||||
-rw-r--r-- | block/blk-merge.c | 3 | ||||
-rw-r--r-- | block/blk-mq-tag.c | 38 | ||||
-rw-r--r-- | block/blk-mq-tag.h | 1 | ||||
-rw-r--r-- | block/blk-mq.c | 160 | ||||
-rw-r--r-- | block/blk-sysfs.c | 3 | ||||
-rw-r--r-- | block/blk-throttle.c | 2 | ||||
-rw-r--r-- | block/blk.h | 5 | ||||
-rw-r--r-- | block/bounce.c | 4 | ||||
-rw-r--r-- | block/cfq-iosched.c | 127 | ||||
-rw-r--r-- | block/elevator.c | 4 | ||||
-rw-r--r-- | block/genhd.c | 1 | ||||
-rw-r--r-- | block/ioctl.c | 37 |
19 files changed, 585 insertions, 951 deletions
diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 5cbd5d9ea61d..0436c21db7f2 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -361,7 +361,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) /* Restore original bio completion handler */ bio->bi_end_io = bip->bip_end_io; - bio_endio_nodec(bio, error); + bio_endio(bio, error); } /** @@ -388,7 +388,7 @@ void bio_integrity_endio(struct bio *bio, int error) */ if (error) { bio->bi_end_io = bip->bip_end_io; - bio_endio_nodec(bio, error); + bio_endio(bio, error); return; } diff --git a/block/bio.c b/block/bio.c index f66a4eae16ee..2a00d349cd68 100644 --- a/block/bio.c +++ b/block/bio.c @@ -270,8 +270,8 @@ void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; - atomic_set(&bio->bi_remaining, 1); - atomic_set(&bio->bi_cnt, 1); + atomic_set(&bio->__bi_remaining, 1); + atomic_set(&bio->__bi_cnt, 1); } EXPORT_SYMBOL(bio_init); @@ -292,8 +292,8 @@ void bio_reset(struct bio *bio) __bio_free(bio); memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags|(1 << BIO_UPTODATE); - atomic_set(&bio->bi_remaining, 1); + bio->bi_flags = flags | (1 << BIO_UPTODATE); + atomic_set(&bio->__bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); @@ -303,6 +303,17 @@ static void bio_chain_endio(struct bio *bio, int error) bio_put(bio); } +/* + * Increment chain count for the bio. Make sure the CHAIN flag update + * is visible before the raised count. + */ +static inline void bio_inc_remaining(struct bio *bio) +{ + bio->bi_flags |= (1 << BIO_CHAIN); + smp_mb__before_atomic(); + atomic_inc(&bio->__bi_remaining); +} + /** * bio_chain - chain bio completions * @bio: the target bio @@ -320,7 +331,7 @@ void bio_chain(struct bio *bio, struct bio *parent) bio->bi_private = parent; bio->bi_end_io = bio_chain_endio; - atomic_inc(&parent->bi_remaining); + bio_inc_remaining(parent); } EXPORT_SYMBOL(bio_chain); @@ -524,13 +535,17 @@ EXPORT_SYMBOL(zero_fill_bio); **/ void bio_put(struct bio *bio) { - BIO_BUG_ON(!atomic_read(&bio->bi_cnt)); - - /* - * last put frees it - */ - if (atomic_dec_and_test(&bio->bi_cnt)) + if (!bio_flagged(bio, BIO_REFFED)) bio_free(bio); + else { + BIO_BUG_ON(!atomic_read(&bio->__bi_cnt)); + + /* + * last put frees it + */ + if (atomic_dec_and_test(&bio->__bi_cnt)) + bio_free(bio); + } } EXPORT_SYMBOL(bio_put); @@ -1741,6 +1756,25 @@ void bio_flush_dcache_pages(struct bio *bi) EXPORT_SYMBOL(bio_flush_dcache_pages); #endif +static inline bool bio_remaining_done(struct bio *bio) +{ + /* + * If we're not chaining, then ->__bi_remaining is always 1 and + * we always end io on the first invocation. + */ + if (!bio_flagged(bio, BIO_CHAIN)) + return true; + + BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); + + if (atomic_dec_and_test(&bio->__bi_remaining)) { + clear_bit(BIO_CHAIN, &bio->bi_flags); + return true; + } + + return false; +} + /** * bio_endio - end I/O on a bio * @bio: bio @@ -1758,15 +1792,13 @@ EXPORT_SYMBOL(bio_flush_dcache_pages); void bio_endio(struct bio *bio, int error) { while (bio) { - BUG_ON(atomic_read(&bio->bi_remaining) <= 0); - if (error) clear_bit(BIO_UPTODATE, &bio->bi_flags); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) error = -EIO; - if (!atomic_dec_and_test(&bio->bi_remaining)) - return; + if (unlikely(!bio_remaining_done(bio))) + break; /* * Need to have a real endio function for chained bios, @@ -1790,21 +1822,6 @@ void bio_endio(struct bio *bio, int error) EXPORT_SYMBOL(bio_endio); /** - * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining - * @bio: bio - * @error: error, if any - * - * For code that has saved and restored bi_end_io; thing hard before using this - * function, probably you should've cloned the entire bio. - **/ -void bio_endio_nodec(struct bio *bio, int error) -{ - atomic_inc(&bio->bi_remaining); - bio_endio(bio, error); -} -EXPORT_SYMBOL(bio_endio_nodec); - -/** * bio_split - split a bio * @bio: bio to split * @sectors: number of sectors to split from the front of @bio @@ -1971,6 +1988,28 @@ struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_ EXPORT_SYMBOL(bioset_create_nobvec); #ifdef CONFIG_BLK_CGROUP + +/** + * bio_associate_blkcg - associate a bio with the specified blkcg + * @bio: target bio + * @blkcg_css: css of the blkcg to associate + * + * Associate @bio with the blkcg specified by @blkcg_css. Block layer will + * treat @bio as if it were issued by a task which belongs to the blkcg. + * + * This function takes an extra reference of @blkcg_css which will be put + * when @bio is released. The caller must own @bio and is responsible for + * synchronizing calls to this function. + */ +int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) +{ + if (unlikely(bio->bi_css)) + return -EBUSY; + css_get(blkcg_css); + bio->bi_css = blkcg_css; + return 0; +} + /** * bio_associate_current - associate a bio with %current * @bio: target bio @@ -1987,26 +2026,17 @@ EXPORT_SYMBOL(bioset_create_nobvec); int bio_associate_current(struct bio *bio) { struct io_context *ioc; - struct cgroup_subsys_state *css; - if (bio->bi_ioc) + if (bio->bi_css) return -EBUSY; ioc = current->io_context; if (!ioc) return -ENOENT; - /* acquire active ref on @ioc and associate */ get_io_context_active(ioc); bio->bi_ioc = ioc; - - /* associate blkcg if exists */ - rcu_read_lock(); - css = task_css(current, blkio_cgrp_id); - if (css && css_tryget_online(css)) - bio->bi_css = css; - rcu_read_unlock(); - + bio->bi_css = task_get_css(current, blkio_cgrp_id); return 0; } diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0ac817b750db..9f97da52d006 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -9,27 +9,33 @@ * * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> * Nauman Rafique <nauman@google.com> + * + * For policy-specific per-blkcg data: + * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> + * Arianna Avanzini <avanzini.arianna@gmail.com> */ #include <linux/ioprio.h> #include <linux/kdev_t.h> #include <linux/module.h> #include <linux/err.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/slab.h> #include <linux/genhd.h> #include <linux/delay.h> #include <linux/atomic.h> -#include "blk-cgroup.h" +#include <linux/blk-cgroup.h> #include "blk.h" #define MAX_KEY_LEN 100 static DEFINE_MUTEX(blkcg_pol_mutex); -struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT, - .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, }; +struct blkcg blkcg_root; EXPORT_SYMBOL_GPL(blkcg_root); +struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; + static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static bool blkcg_policy_enabled(struct request_queue *q, @@ -179,6 +185,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; + struct bdi_writeback_congested *wb_congested; int i, ret; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -190,22 +197,30 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, goto err_free_blkg; } + wb_congested = wb_congested_get_create(&q->backing_dev_info, + blkcg->css.id, GFP_ATOMIC); + if (!wb_congested) { + ret = -ENOMEM; + goto err_put_css; + } + /* allocate */ if (!new_blkg) { new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto err_put_css; + goto err_put_congested; } } blkg = new_blkg; + blkg->wb_congested = wb_congested; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { ret = -EINVAL; - goto err_put_css; + goto err_put_congested; } blkg_get(blkg->parent); } @@ -235,18 +250,15 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg->online = true; spin_unlock(&blkcg->lock); - if (!ret) { - if (blkcg == &blkcg_root) { - q->root_blkg = blkg; - q->root_rl.blkg = blkg; - } + if (!ret) return blkg; - } /* @blkg failed fully initialized, use the usual release path */ blkg_put(blkg); return ERR_PTR(ret); +err_put_congested: + wb_congested_put(wb_congested); err_put_css: css_put(&blkcg->css); err_free_blkg: @@ -340,15 +352,6 @@ static void blkg_destroy(struct blkcg_gq *blkg) rcu_assign_pointer(blkcg->blkg_hint, NULL); /* - * If root blkg is destroyed. Just clear the pointer since root_rl - * does not take reference on root blkg. - */ - if (blkcg == &blkcg_root) { - blkg->q->root_blkg = NULL; - blkg->q->root_rl.blkg = NULL; - } - - /* * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ @@ -402,6 +405,8 @@ void __blkg_release_rcu(struct rcu_head *rcu_head) if (blkg->parent) blkg_put(blkg->parent); + wb_congested_put(blkg->wb_congested); + blkg_free(blkg); } EXPORT_SYMBOL_GPL(__blkg_release_rcu); @@ -809,6 +814,8 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) } spin_unlock_irq(&blkcg->lock); + + wb_blkcg_offline(blkcg); } static void blkcg_css_free(struct cgroup_subsys_state *css) @@ -823,6 +830,8 @@ static struct cgroup_subsys_state * blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { struct blkcg *blkcg; + struct cgroup_subsys_state *ret; + int i; if (!parent_css) { blkcg = &blkcg_root; @@ -830,17 +839,51 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) } blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); - if (!blkcg) - return ERR_PTR(-ENOMEM); + if (!blkcg) { + ret = ERR_PTR(-ENOMEM); + goto free_blkcg; + } + + for (i = 0; i < BLKCG_MAX_POLS ; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkcg_policy_data *cpd; + + /* + * If the policy hasn't been attached yet, wait for it + * to be attached before doing anything else. Otherwise, + * check if the policy requires any specific per-cgroup + * data: if it does, allocate and initialize it. + */ + if (!pol || !pol->cpd_size) + continue; + + BUG_ON(blkcg->pd[i]); + cpd = kzalloc(pol->cpd_size, GFP_KERNEL); + if (!cpd) { + ret = ERR_PTR(-ENOMEM); + goto free_pd_blkcg; + } + blkcg->pd[i] = cpd; + cpd->plid = i; + pol->cpd_init_fn(blkcg); + } - blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; - blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT; done: spin_lock_init(&blkcg->lock); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&blkcg->blkg_list); - +#ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&blkcg->cgwb_list); +#endif return &blkcg->css; + +free_pd_blkcg: + for (i--; i >= 0; i--) + kfree(blkcg->pd[i]); + +free_blkcg: + kfree(blkcg); + return ret; } /** @@ -855,9 +898,45 @@ done: */ int blkcg_init_queue(struct request_queue *q) { - might_sleep(); + struct blkcg_gq *new_blkg, *blkg; + bool preloaded; + int ret; + + new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + if (!new_blkg) + return -ENOMEM; + + preloaded = !radix_tree_preload(GFP_KERNEL); + + /* + * Make sure the root blkg exists and count the existing blkgs. As + * @q is bypassing at this point, blkg_lookup_create() can't be + * used. Open code insertion. + */ + rcu_read_lock(); + spin_lock_irq(q->queue_lock); + blkg = blkg_create(&blkcg_root, q, new_blkg); + spin_unlock_irq(q->queue_lock); + rcu_read_unlock(); + + if (preloaded) + radix_tree_preload_end(); + + if (IS_ERR(blkg)) { + kfree(new_blkg); + return PTR_ERR(blkg); + } + + q->root_blkg = blkg; + q->root_rl.blkg = blkg; - return blk_throtl_init(q); + ret = blk_throtl_init(q); + if (ret) { + spin_lock_irq(q->queue_lock); + blkg_destroy_all(q); + spin_unlock_irq(q->queue_lock); + } + return ret; } /** @@ -958,52 +1037,26 @@ int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { LIST_HEAD(pds); - struct blkcg_gq *blkg, *new_blkg; - struct blkg_policy_data *pd, *n; + LIST_HEAD(cpds); + struct blkcg_gq *blkg; + struct blkg_policy_data *pd, *nd; + struct blkcg_policy_data *cpd, *cnd; int cnt = 0, ret; - bool preloaded; if (blkcg_policy_enabled(q, pol)) return 0; - /* preallocations for root blkg */ - new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); - if (!new_blkg) - return -ENOMEM; - + /* count and allocate policy_data for all existing blkgs */ blk_queue_bypass_start(q); - - preloaded = !radix_tree_preload(GFP_KERNEL); - - /* - * Make sure the root blkg exists and count the existing blkgs. As - * @q is bypassing at this point, blkg_lookup_create() can't be - * used. Open code it. - */ spin_lock_irq(q->queue_lock); - - rcu_read_lock(); - blkg = __blkg_lookup(&blkcg_root, q, false); - if (blkg) - blkg_free(new_blkg); - else - blkg = blkg_create(&blkcg_root, q, new_blkg); - rcu_read_unlock(); - - if (preloaded) - radix_tree_preload_end(); - - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); - goto out_unlock; - } - list_for_each_entry(blkg, &q->blkg_list, q_node) cnt++; - spin_unlock_irq(q->queue_lock); - /* allocate policy_data for all existing blkgs */ + /* + * Allocate per-blkg and per-blkcg policy data + * for all existing blkgs. + */ while (cnt--) { pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); if (!pd) { @@ -1011,26 +1064,50 @@ int blkcg_activate_policy(struct request_queue *q, goto out_free; } list_add_tail(&pd->alloc_node, &pds); + + if (!pol->cpd_size) + continue; + cpd = kzalloc_node(pol->cpd_size, GFP_KERNEL, q->node); + if (!cpd) { + ret = -ENOMEM; + goto out_free; + } + list_add_tail(&cpd->alloc_node, &cpds); } /* - * Install the allocated pds. With @q bypassing, no new blkg + * Install the allocated pds and cpds. With @q bypassing, no new blkg * should have been created while the queue lock was dropped. */ spin_lock_irq(q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { - if (WARN_ON(list_empty(&pds))) { + if (WARN_ON(list_empty(&pds)) || + WARN_ON(pol->cpd_size && list_empty(&cpds))) { /* umm... this shouldn't happen, just abort */ ret = -ENOMEM; goto out_unlock; } + cpd = list_first_entry(&cpds, struct blkcg_policy_data, + alloc_node); + list_del_init(&cpd->alloc_node); pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); list_del_init(&pd->alloc_node); /* grab blkcg lock too while installing @pd on @blkg */ spin_lock(&blkg->blkcg->lock); + if (!pol->cpd_size) + goto no_cpd; + if (!blkg->blkcg->pd[pol->plid]) { + /* Per-policy per-blkcg data */ + blkg->blkcg->pd[pol->plid] = cpd; + cpd->plid = pol->plid; + pol->cpd_init_fn(blkg->blkcg); + } else { /* must free it as it has already been extracted */ + kfree(cpd); + } +no_cpd: blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; @@ -1045,8 +1122,10 @@ out_unlock: spin_unlock_irq(q->queue_lock); out_free: blk_queue_bypass_end(q); - list_for_each_entry_safe(pd, n, &pds, alloc_node) + list_for_each_entry_safe(pd, nd, &pds, alloc_node) kfree(pd); + list_for_each_entry_safe(cpd, cnd, &cpds, alloc_node) + kfree(cpd); return ret; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); @@ -1072,10 +1151,6 @@ void blkcg_deactivate_policy(struct request_queue *q, __clear_bit(pol->plid, q->blkcg_pols); - /* if no policy is left, no need for blkgs - shoot them down */ - if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS)) - blkg_destroy_all(q); - list_for_each_entry(blkg, &q->blkg_list, q_node) { /* grab blkcg lock too while removing @pd from @blkg */ spin_lock(&blkg->blkcg->lock); @@ -1087,6 +1162,8 @@ void blkcg_deactivate_policy(struct request_queue *q, kfree(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; + kfree(blkg->blkcg->pd[pol->plid]); + blkg->blkcg->pd[pol->plid] = NULL; spin_unlock(&blkg->blkcg->lock); } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h deleted file mode 100644 index c567865b5f1d..000000000000 --- a/block/blk-cgroup.h +++ /dev/null @@ -1,603 +0,0 @@ -#ifndef _BLK_CGROUP_H -#define _BLK_CGROUP_H -/* - * Common Block IO controller cgroup interface - * - * Based on ideas and code from CFQ, CFS and BFQ: - * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> - * - * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> - * Paolo Valente <paolo.valente@unimore.it> - * - * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> - * Nauman Rafique <nauman@google.com> - */ - -#include <linux/cgroup.h> -#include <linux/u64_stats_sync.h> -#include <linux/seq_file.h> -#include <linux/radix-tree.h> -#include <linux/blkdev.h> -#include <linux/atomic.h> - -/* Max limits for throttle policy */ -#define THROTL_IOPS_MAX UINT_MAX - -/* CFQ specific, out here for blkcg->cfq_weight */ -#define CFQ_WEIGHT_MIN 10 -#define CFQ_WEIGHT_MAX 1000 -#define CFQ_WEIGHT_DEFAULT 500 - -#ifdef CONFIG_BLK_CGROUP - -enum blkg_rwstat_type { - BLKG_RWSTAT_READ, - BLKG_RWSTAT_WRITE, - BLKG_RWSTAT_SYNC, - BLKG_RWSTAT_ASYNC, - - BLKG_RWSTAT_NR, - BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, -}; - -struct blkcg_gq; - -struct blkcg { - struct cgroup_subsys_state css; - spinlock_t lock; - - struct radix_tree_root blkg_tree; - struct blkcg_gq *blkg_hint; - struct hlist_head blkg_list; - - /* TODO: per-policy storage in blkcg */ - unsigned int cfq_weight; /* belongs to cfq */ - unsigned int cfq_leaf_weight; -}; - -struct blkg_stat { - struct u64_stats_sync syncp; - uint64_t cnt; -}; - -struct blkg_rwstat { - struct u64_stats_sync syncp; - uint64_t cnt[BLKG_RWSTAT_NR]; -}; - -/* - * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a - * request_queue (q). This is used by blkcg policies which need to track - * information per blkcg - q pair. - * - * There can be multiple active blkcg policies and each has its private - * data on each blkg, the size of which is determined by - * blkcg_policy->pd_size. blkcg core allocates and frees such areas - * together with blkg and invokes pd_init/exit_fn() methods. - * - * Such private data must embed struct blkg_policy_data (pd) at the - * beginning and pd_size can't be smaller than pd. - */ -struct blkg_policy_data { - /* the blkg and policy id this per-policy data belongs to */ - struct blkcg_gq *blkg; - int plid; - - /* used during policy activation */ - struct list_head alloc_node; -}; - -/* association between a blk cgroup and a request queue */ -struct blkcg_gq { - /* Pointer to the associated request_queue */ - struct request_queue *q; - struct list_head q_node; - struct hlist_node blkcg_node; - struct blkcg *blkcg; - - /* all non-root blkcg_gq's are guaranteed to have access to parent */ - struct blkcg_gq *parent; - - /* request allocation list for this blkcg-q pair */ - struct request_list rl; - - /* reference count */ - atomic_t refcnt; - - /* is this blkg online? protected by both blkcg and q locks */ - bool online; - - struct blkg_policy_data *pd[BLKCG_MAX_POLS]; - - struct rcu_head rcu_head; -}; - -typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); - -struct blkcg_policy { - int plid; - /* policy specific private data size */ - size_t pd_size; - /* cgroup files for the policy */ - struct cftype *cftypes; - - /* operations */ - blkcg_pol_init_pd_fn *pd_init_fn; - blkcg_pol_online_pd_fn *pd_online_fn; - blkcg_pol_offline_pd_fn *pd_offline_fn; - blkcg_pol_exit_pd_fn *pd_exit_fn; - blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; -}; - -extern struct blkcg blkcg_root; - -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q); -int blkcg_init_queue(struct request_queue *q); -void blkcg_drain_queue(struct request_queue *q); -void blkcg_exit_queue(struct request_queue *q); - -/* Blkio controller policy registration */ -int blkcg_policy_register(struct blkcg_policy *pol); -void blkcg_policy_unregister(struct blkcg_policy *pol); -int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol); -void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol); - -void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, - u64 (*prfill)(struct seq_file *, - struct blkg_policy_data *, int), - const struct blkcg_policy *pol, int data, - bool show_total); -u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); -u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - const struct blkg_rwstat *rwstat); -u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); -u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - int off); - -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off); - -struct blkg_conf_ctx { - struct gendisk *disk; - struct blkcg_gq *blkg; - u64 v; -}; - -int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx); -void blkg_conf_finish(struct blkg_conf_ctx *ctx); - - -static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct blkcg, css) : NULL; -} - -static inline struct blkcg *task_blkcg(struct task_struct *tsk) -{ - return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); -} - -static inline struct blkcg *bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_css) - return css_to_blkcg(bio->bi_css); - return task_blkcg(current); -} - -/** - * blkcg_parent - get the parent of a blkcg - * @blkcg: blkcg of interest - * - * Return the parent blkcg of @blkcg. Can be called anytime. - */ -static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) -{ - return css_to_blkcg(blkcg->css.parent); -} - -/** - * blkg_to_pdata - get policy private data - * @blkg: blkg of interest - * @pol: policy of interest - * - * Return pointer to private data associated with the @blkg-@pol pair. - */ -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) -{ - return blkg ? blkg->pd[pol->plid] : NULL; -} - -/** - * pdata_to_blkg - get blkg associated with policy private data - * @pd: policy private data of interest - * - * @pd is policy private data. Determine the blkg it's associated with. - */ -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) -{ - return pd ? pd->blkg : NULL; -} - -/** - * blkg_path - format cgroup path of blkg - * @blkg: blkg of interest - * @buf: target buffer - * @buflen: target buffer length - * - * Format the path of the cgroup of @blkg into @buf. - */ -static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) -{ - char *p; - - p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); - if (!p) { - strncpy(buf, "<unavailable>", buflen); - return -ENAMETOOLONG; - } - - memmove(buf, p, buf + buflen - p); - return 0; -} - -/** - * blkg_get - get a blkg reference - * @blkg: blkg to get - * - * The caller should be holding an existing reference. - */ -static inline void blkg_get(struct blkcg_gq *blkg) -{ - WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); - atomic_inc(&blkg->refcnt); -} - -void __blkg_release_rcu(struct rcu_head *rcu); - -/** - * blkg_put - put a blkg reference - * @blkg: blkg to put - */ -static inline void blkg_put(struct blkcg_gq *blkg) -{ - WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); - if (atomic_dec_and_test(&blkg->refcnt)) - call_rcu(&blkg->rcu_head, __blkg_release_rcu); -} - -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint); - -/** - * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU - * read locked. If called under either blkcg or queue lock, the iteration - * is guaranteed to include all and only online blkgs. The caller may - * update @pos_css by calling css_rightmost_descendant() to skip subtree. - * @p_blkg is included in the iteration and the first node to be visited. - */ -#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ - css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) - -/** - * blkg_for_each_descendant_post - post-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Similar to blkg_for_each_descendant_pre() but performs post-order - * traversal instead. Synchronization rules are the same. @p_blkg is - * included in the iteration and the last node to be visited. - */ -#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ - css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) - -/** - * blk_get_rl - get request_list to use - * @q: request_queue of interest - * @bio: bio which will be attached to the allocated request (may be %NULL) - * - * The caller wants to allocate a request from @q to use for @bio. Find - * the request_list to use and obtain a reference on it. Should be called - * under queue_lock. This function is guaranteed to return non-%NULL - * request_list. - */ -static inline struct request_list *blk_get_rl(struct request_queue *q, - struct bio *bio) -{ - struct blkcg *blkcg; - struct blkcg_gq *blkg; - - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - - /* bypass blkg lookup and use @q->root_rl directly for root */ - if (blkcg == &blkcg_root) - goto root_rl; - - /* - * Try to use blkg->rl. blkg lookup may fail under memory pressure - * or if either the blkcg or queue is going away. Fall back to - * root_rl in such cases. - */ - blkg = blkg_lookup_create(blkcg, q); - if (unlikely(IS_ERR(blkg))) - goto root_rl; - - blkg_get(blkg); - rcu_read_unlock(); - return &blkg->rl; -root_rl: - rcu_read_unlock(); - return &q->root_rl; -} - -/** - * blk_put_rl - put request_list - * @rl: request_list to put - * - * Put the reference acquired by blk_get_rl(). Should be called under - * queue_lock. - */ -static inline void blk_put_rl(struct request_list *rl) -{ - /* root_rl may not have blkg set */ - if (rl->blkg && rl->blkg->blkcg != &blkcg_root) - blkg_put(rl->blkg); -} - -/** - * blk_rq_set_rl - associate a request with a request_list - * @rq: request of interest - * @rl: target request_list - * - * Associate @rq with @rl so that accounting and freeing can know the - * request_list @rq came from. - */ -static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) -{ - rq->rl = rl; -} - -/** - * blk_rq_rl - return the request_list a request came from - * @rq: request of interest - * - * Return the request_list @rq is allocated from. - */ -static inline struct request_list *blk_rq_rl(struct request *rq) -{ - return rq->rl; -} - -struct request_list *__blk_queue_next_rl(struct request_list *rl, - struct request_queue *q); -/** - * blk_queue_for_each_rl - iterate through all request_lists of a request_queue - * - * Should be used under queue_lock. - */ -#define blk_queue_for_each_rl(rl, q) \ - for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) - -static inline void blkg_stat_init(struct blkg_stat *stat) -{ - u64_stats_init(&stat->syncp); -} - -/** - * blkg_stat_add - add a value to a blkg_stat - * @stat: target blkg_stat - * @val: value to add - * - * Add @val to @stat. The caller is responsible for synchronizing calls to - * this function. - */ -static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) -{ - u64_stats_update_begin(&stat->syncp); - stat->cnt += val; - u64_stats_update_end(&stat->syncp); -} - -/** - * blkg_stat_read - read the current value of a blkg_stat - * @stat: blkg_stat to read - * - * Read the current value of @stat. This function can be called without - * synchroniztion and takes care of u64 atomicity. - */ -static inline uint64_t blkg_stat_read(struct blkg_stat *stat) -{ - unsigned int start; - uint64_t v; - - do { - start = u64_stats_fetch_begin_irq(&stat->syncp); - v = stat->cnt; - } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); - - return v; -} - -/** - * blkg_stat_reset - reset a blkg_stat - * @stat: blkg_stat to reset - */ -static inline void blkg_stat_reset(struct blkg_stat *stat) -{ - stat->cnt = 0; -} - -/** - * blkg_stat_merge - merge a blkg_stat into another - * @to: the destination blkg_stat - * @from: the source - * - * Add @from's count to @to. - */ -static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) -{ - blkg_stat_add(to, blkg_stat_read(from)); -} - -static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) -{ - u64_stats_init(&rwstat->syncp); -} - -/** - * blkg_rwstat_add - add a value to a blkg_rwstat - * @rwstat: target blkg_rwstat - * @rw: mask of REQ_{WRITE|SYNC} - * @val: value to add - * - * Add @val to @rwstat. The counters are chosen according to @rw. The - * caller is responsible for synchronizing calls to this function. - */ -static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, - int rw, uint64_t val) -{ - u64_stats_update_begin(&rwstat->syncp); - - if (rw & REQ_WRITE) - rwstat->cnt[BLKG_RWSTAT_WRITE] += val; - else - rwstat->cnt[BLKG_RWSTAT_READ] += val; - if (rw & REQ_SYNC) - rwstat->cnt[BLKG_RWSTAT_SYNC] += val; - else - rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; - - u64_stats_update_end(&rwstat->syncp); -} - -/** - * blkg_rwstat_read - read the current values of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Read the current snapshot of @rwstat and return it as the return value. - * This function can be called without synchronization and takes care of - * u64 atomicity. - */ -static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) -{ - unsigned int start; - struct blkg_rwstat tmp; - - do { - start = u64_stats_fetch_begin_irq(&rwstat->syncp); - tmp = *rwstat; - } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); - - return tmp; -} - -/** - * blkg_rwstat_total - read the total count of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Return the total count of @rwstat regardless of the IO direction. This - * function can be called without synchronization and takes care of u64 - * atomicity. - */ -static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) -{ - struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); - - return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; -} - -/** - * blkg_rwstat_reset - reset a blkg_rwstat - * @rwstat: blkg_rwstat to reset - */ -static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) -{ - memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); -} - -/** - * blkg_rwstat_merge - merge a blkg_rwstat into another - * @to: the destination blkg_rwstat - * @from: the source - * - * Add @from's counts to @to. - */ -static inline void blkg_rwstat_merge(struct blkg_rwstat *to, - struct blkg_rwstat *from) -{ - struct blkg_rwstat v = blkg_rwstat_read(from); - int i; - - u64_stats_update_begin(&to->syncp); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - to->cnt[i] += v.cnt[i]; - u64_stats_update_end(&to->syncp); -} - -#else /* CONFIG_BLK_CGROUP */ - -struct cgroup; -struct blkcg; - -struct blkg_policy_data { -}; - -struct blkcg_gq { -}; - -struct blkcg_policy { -}; - -static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -static inline int blkcg_init_queue(struct request_queue *q) { return 0; } -static inline void blkcg_drain_queue(struct request_queue *q) { } -static inline void blkcg_exit_queue(struct request_queue *q) { } -static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } -static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } -static inline int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol) { return 0; } -static inline void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol) { } - -static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } - -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) { return NULL; } -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } -static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } -static inline void blkg_get(struct blkcg_gq *blkg) { } -static inline void blkg_put(struct blkcg_gq *blkg) { } - -static inline struct request_list *blk_get_rl(struct request_queue *q, - struct bio *bio) { return &q->root_rl; } -static inline void blk_put_rl(struct request_list *rl) { } -static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } -static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } - -#define blk_queue_for_each_rl(rl, q) \ - for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) - -#endif /* CONFIG_BLK_CGROUP */ -#endif /* _BLK_CGROUP_H */ diff --git a/block/blk-core.c b/block/blk-core.c index 03b5f8d77f37..688ae9482cb8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -32,12 +32,12 @@ #include <linux/delay.h> #include <linux/ratelimit.h> #include <linux/pm_runtime.h> +#include <linux/blk-cgroup.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> #include "blk.h" -#include "blk-cgroup.h" #include "blk-mq.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); @@ -63,6 +63,31 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue; +static void blk_clear_congested(struct request_list *rl, int sync) +{ +#ifdef CONFIG_CGROUP_WRITEBACK + clear_wb_congested(rl->blkg->wb_congested, sync); +#else + /* + * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't + * flip its congestion state for events on other blkcgs. + */ + if (rl == &rl->q->root_rl) + clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync); +#endif +} + +static void blk_set_congested(struct request_list *rl, int sync) +{ +#ifdef CONFIG_CGROUP_WRITEBACK + set_wb_congested(rl->blkg->wb_congested, sync); +#else + /* see blk_clear_congested() */ + if (rl == &rl->q->root_rl) + set_wb_congested(rl->q->backing_dev_info.wb.congested, sync); +#endif +} + void blk_queue_congestion_threshold(struct request_queue *q) { int nr; @@ -117,7 +142,7 @@ EXPORT_SYMBOL(blk_rq_init); static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, int error) { - if (error) + if (error && !(rq->cmd_flags & REQ_CLONE)) clear_bit(BIO_UPTODATE, &bio->bi_flags); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) error = -EIO; @@ -128,7 +153,8 @@ static void req_bio_endio(struct request *rq, struct bio *bio, bio_advance(bio, nbytes); /* don't actually finish bio if it's part of flush sequence */ - if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) + if (bio->bi_iter.bi_size == 0 && + !(rq->cmd_flags & (REQ_FLUSH_SEQ|REQ_CLONE))) bio_endio(bio, error); } @@ -285,6 +311,7 @@ inline void __blk_run_queue_uncond(struct request_queue *q) q->request_fn(q); q->request_fn_active--; } +EXPORT_SYMBOL_GPL(__blk_run_queue_uncond); /** * __blk_run_queue - run a single device queue @@ -621,8 +648,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; - q->backing_dev_info.state = 0; - q->backing_dev_info.capabilities = 0; + q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK; q->backing_dev_info.name = "block"; q->node = node_id; @@ -845,13 +871,8 @@ static void __freed_request(struct request_list *rl, int sync) { struct request_queue *q = rl->q; - /* - * bdi isn't aware of blkcg yet. As all async IOs end up root - * blkcg anyway, just use root blkcg state. - */ - if (rl == &q->root_rl && - rl->count[sync] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, sync); + if (rl->count[sync] < queue_congestion_off_threshold(q)) + blk_clear_congested(rl, sync); if (rl->count[sync] + 1 <= q->nr_requests) { if (waitqueue_active(&rl->wait[sync])) @@ -884,25 +905,25 @@ static void freed_request(struct request_list *rl, unsigned int flags) int blk_update_nr_requests(struct request_queue *q, unsigned int nr) { struct request_list *rl; + int on_thresh, off_thresh; spin_lock_irq(q->queue_lock); q->nr_requests = nr; blk_queue_congestion_threshold(q); + on_thresh = queue_congestion_on_threshold(q); + off_thresh = queue_congestion_off_threshold(q); - /* congestion isn't cgroup aware and follows root blkcg for now */ - rl = &q->root_rl; - - if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_SYNC); - else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_SYNC); + blk_queue_for_each_rl(rl, q) { + if (rl->count[BLK_RW_SYNC] >= on_thresh) + blk_set_congested(rl, BLK_RW_SYNC); + else if (rl->count[BLK_RW_SYNC] < off_thresh) + blk_clear_congested(rl, BLK_RW_SYNC); - if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_ASYNC); - else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_ASYNC); + if (rl->count[BLK_RW_ASYNC] >= on_thresh) + blk_set_congested(rl, BLK_RW_ASYNC); + else if (rl->count[BLK_RW_ASYNC] < off_thresh) + blk_clear_congested(rl, BLK_RW_ASYNC); - blk_queue_for_each_rl(rl, q) { if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { blk_set_rl_full(rl, BLK_RW_SYNC); } else { @@ -1012,12 +1033,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, } } } - /* - * bdi isn't aware of blkcg yet. As all async IOs end up - * root blkcg anyway, just use root blkcg state. - */ - if (rl == &q->root_rl) - blk_set_queue_congested(q, is_sync); + blk_set_congested(rl, is_sync); } /* @@ -1525,7 +1541,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count) + unsigned int *request_count, + struct request **same_queue_rq) { struct blk_plug *plug; struct request *rq; @@ -1545,8 +1562,16 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, list_for_each_entry_reverse(rq, plug_list, queuelist) { int el_ret; - if (rq->q == q) + if (rq->q == q) { (*request_count)++; + /* + * Only blk-mq multiple hardware queues case checks the + * rq in the same queue, there should be only one such + * rq in a queue + **/ + if (same_queue_rq) + *same_queue_rq = rq; + } if (rq->q != q || !blk_rq_merge_ok(rq, bio)) continue; @@ -1611,7 +1636,7 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio) * any locks. */ if (!blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count)) + blk_attempt_plug_merge(q, bio, &request_count, NULL)) return; spin_lock_irq(q->queue_lock); @@ -1718,8 +1743,6 @@ static void handle_bad_sector(struct bio *bio) bio->bi_rw, (unsigned long long)bio_end_sector(bio), (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); - - set_bit(BIO_EOF, &bio->bi_flags); } #ifdef CONFIG_FAIL_MAKE_REQUEST @@ -2904,95 +2927,22 @@ int blk_lld_busy(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_lld_busy); -/** - * blk_rq_unprep_clone - Helper function to free all bios in a cloned request - * @rq: the clone request to be cleaned up - * - * Description: - * Free all bios in @rq for a cloned request. - */ -void blk_rq_unprep_clone(struct request *rq) -{ - struct bio *bio; - - while ((bio = rq->bio) != NULL) { - rq->bio = bio->bi_next; - - bio_put(bio); - } -} -EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); - -/* - * Copy attributes of the original request to the clone request. - * The actual data parts (e.g. ->cmd, ->sense) are not copied. - */ -static void __blk_rq_prep_clone(struct request *dst, struct request *src) +void blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; - dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; + dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK); + dst->cmd_flags |= REQ_NOMERGE | REQ_CLONE; dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); dst->nr_phys_segments = src->nr_phys_segments; dst->ioprio = src->ioprio; dst->extra_len = src->extra_len; -} - -/** - * blk_rq_prep_clone - Helper function to setup clone request - * @rq: the request to be setup - * @rq_src: original request to be cloned - * @bs: bio_set that bios for clone are allocated from - * @gfp_mask: memory allocation mask for bio - * @bio_ctr: setup function to be called for each clone bio. - * Returns %0 for success, non %0 for failure. - * @data: private data to be passed to @bio_ctr - * - * Description: - * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. - * The actual data parts of @rq_src (e.g. ->cmd, ->sense) - * are not copied, and copying such parts is the caller's responsibility. - * Also, pages which the original bios are pointing to are not copied - * and the cloned bios just point same pages. - * So cloned bios must be completed before original bios, which means - * the caller must complete @rq before @rq_src. - */ -int blk_rq_prep_clone(struct request *rq, struct request *rq_src, - struct bio_set *bs, gfp_t gfp_mask, - int (*bio_ctr)(struct bio *, struct bio *, void *), - void *data) -{ - struct bio *bio, *bio_src; - - if (!bs) - bs = fs_bio_set; - - __rq_for_each_bio(bio_src, rq_src) { - bio = bio_clone_fast(bio_src, gfp_mask, bs); - if (!bio) - goto free_and_out; - - if (bio_ctr && bio_ctr(bio, bio_src, data)) - goto free_and_out; - - if (rq->bio) { - rq->biotail->bi_next = bio; - rq->biotail = bio; - } else - rq->bio = rq->biotail = bio; - } - - __blk_rq_prep_clone(rq, rq_src); - - return 0; - -free_and_out: - if (bio) - bio_put(bio); - blk_rq_unprep_clone(rq); - - return -ENOMEM; + dst->bio = src->bio; + dst->biotail = src->biotail; + dst->cmd = src->cmd; + dst->cmd_len = src->cmd_len; + dst->sense = src->sense; } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); @@ -3034,21 +2984,20 @@ void blk_start_plug(struct blk_plug *plug) { struct task_struct *tsk = current; + /* + * If this is a nested plug, don't actually assign it. + */ + if (tsk->plug) + return; + INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); - /* - * If this is a nested plug, don't actually assign it. It will be - * flushed on its own. + * Store ordering should not be needed here, since a potential + * preempt will imply a full memory barrier */ - if (!tsk->plug) { - /* - * Store ordering should not be needed here, since a potential - * preempt will imply a full memory barrier - */ - tsk->plug = plug; - } + tsk->plug = plug; } EXPORT_SYMBOL(blk_start_plug); @@ -3195,10 +3144,11 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) void blk_finish_plug(struct blk_plug *plug) { + if (plug != current->plug) + return; blk_flush_plug_list(plug, false); - if (plug == current->plug) - current->plug = NULL; + current->plug = NULL; } EXPORT_SYMBOL(blk_finish_plug); diff --git a/block/blk-exec.c b/block/blk-exec.c index 9924725fa50d..3fec8a29d0fa 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -53,7 +53,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq_end_io_fn *done) { int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; - bool is_pm_resume; WARN_ON(irqs_disabled()); WARN_ON(rq->cmd_type == REQ_TYPE_FS); @@ -70,12 +69,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, return; } - /* - * need to check this before __blk_run_queue(), because rq can - * be freed before that returns. - */ - is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME; - spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_dying(q))) { @@ -88,9 +81,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, __elv_add_request(q, rq, where); __blk_run_queue(q); - /* the queue is stopped so it won't be run */ - if (is_pm_resume) - __blk_run_queue_uncond(q); spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 79ffb4855af0..f548b64be092 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -21,6 +21,7 @@ */ #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/mempool.h> #include <linux/bio.h> #include <linux/scatterlist.h> diff --git a/block/blk-merge.c b/block/blk-merge.c index fd3fee81c23c..30a0d9f89017 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -589,7 +589,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) !blk_write_same_mergeable(rq->bio, bio)) return false; - if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) { + /* Only check gaps if the bio carries data */ + if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && bio_has_data(bio)) { struct bio_vec *bprev; bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1]; diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index be3290cc0644..9b6e28830b82 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -438,6 +438,39 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx, } } +static void bt_tags_for_each(struct blk_mq_tags *tags, + struct blk_mq_bitmap_tags *bt, unsigned int off, + busy_tag_iter_fn *fn, void *data, bool reserved) +{ + struct request *rq; + int bit, i; + + if (!tags->rqs) + return; + for (i = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + + for (bit = find_first_bit(&bm->word, bm->depth); + bit < bm->depth; + bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { + rq = blk_mq_tag_to_rq(tags, off + bit); + fn(rq, data, reserved); + } + + off += (1 << bt->bits_per_word); + } +} + +void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, + void *priv) +{ + if (tags->nr_reserved_tags) + bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true); + bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, + false); +} +EXPORT_SYMBOL(blk_mq_all_tag_busy_iter); + void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, void *priv) { @@ -580,6 +613,11 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, if (!tags) return NULL; + if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) { + kfree(tags); + return NULL; + } + tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 90767b370308..75893a34237d 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -44,6 +44,7 @@ struct blk_mq_tags { struct list_head page_list; int alloc_policy; + cpumask_var_t cpumask; }; diff --git a/block/blk-mq.c b/block/blk-mq.c index 594eea04266e..f53779692c77 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -89,7 +89,8 @@ static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp) return -EBUSY; ret = wait_event_interruptible(q->mq_freeze_wq, - !q->mq_freeze_depth || blk_queue_dying(q)); + !atomic_read(&q->mq_freeze_depth) || + blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; if (ret) @@ -112,13 +113,10 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref) void blk_mq_freeze_queue_start(struct request_queue *q) { - bool freeze; + int freeze_depth; - spin_lock_irq(q->queue_lock); - freeze = !q->mq_freeze_depth++; - spin_unlock_irq(q->queue_lock); - - if (freeze) { + freeze_depth = atomic_inc_return(&q->mq_freeze_depth); + if (freeze_depth == 1) { percpu_ref_kill(&q->mq_usage_counter); blk_mq_run_hw_queues(q, false); } @@ -143,13 +141,11 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); void blk_mq_unfreeze_queue(struct request_queue *q) { - bool wake; + int freeze_depth; - spin_lock_irq(q->queue_lock); - wake = !--q->mq_freeze_depth; - WARN_ON_ONCE(q->mq_freeze_depth < 0); - spin_unlock_irq(q->queue_lock); - if (wake) { + freeze_depth = atomic_dec_return(&q->mq_freeze_depth); + WARN_ON_ONCE(freeze_depth < 0); + if (!freeze_depth) { percpu_ref_reinit(&q->mq_usage_counter); wake_up_all(&q->mq_freeze_wq); } @@ -1237,6 +1233,38 @@ static struct request *blk_mq_map_request(struct request_queue *q, return rq; } +static int blk_mq_direct_issue_request(struct request *rq) +{ + int ret; + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, + rq->mq_ctx->cpu); + struct blk_mq_queue_data bd = { + .rq = rq, + .list = NULL, + .last = 1 + }; + + /* + * For OK queue, we are done. For error, kill it. Any other + * error (busy), just add it to our list as we previously + * would have done + */ + ret = q->mq_ops->queue_rq(hctx, &bd); + if (ret == BLK_MQ_RQ_QUEUE_OK) + return 0; + else { + __blk_mq_requeue_request(rq); + + if (ret == BLK_MQ_RQ_QUEUE_ERROR) { + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + return 0; + } + return -1; + } +} + /* * Multiple hardware queue variant. This will not use per-process plugs, * but will attempt to bypass the hctx queueing if we can go straight to @@ -1248,6 +1276,9 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); struct blk_map_ctx data; struct request *rq; + unsigned int request_count = 0; + struct blk_plug *plug; + struct request *same_queue_rq = NULL; blk_queue_bounce(q, &bio); @@ -1256,6 +1287,10 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) return; } + if (!is_flush_fua && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) + return; + rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) return; @@ -1266,38 +1301,42 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) goto run_queue; } + plug = current->plug; /* * If the driver supports defer issued based on 'last', then * queue it up like normal since we can potentially save some * CPU this way. */ - if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { - struct blk_mq_queue_data bd = { - .rq = rq, - .list = NULL, - .last = 1 - }; - int ret; + if (((plug && !blk_queue_nomerges(q)) || is_sync) && + !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { + struct request *old_rq = NULL; blk_mq_bio_to_request(rq, bio); /* - * For OK queue, we are done. For error, kill it. Any other - * error (busy), just add it to our list as we previously - * would have done + * we do limited pluging. If bio can be merged, do merge. + * Otherwise the existing request in the plug list will be + * issued. So the plug list will have one request at most */ - ret = q->mq_ops->queue_rq(data.hctx, &bd); - if (ret == BLK_MQ_RQ_QUEUE_OK) - goto done; - else { - __blk_mq_requeue_request(rq); - - if (ret == BLK_MQ_RQ_QUEUE_ERROR) { - rq->errors = -EIO; - blk_mq_end_request(rq, rq->errors); - goto done; + if (plug) { + /* + * The plug list might get flushed before this. If that + * happens, same_queue_rq is invalid and plug list is empty + **/ + if (same_queue_rq && !list_empty(&plug->mq_list)) { + old_rq = same_queue_rq; + list_del_init(&old_rq->queuelist); } - } + list_add_tail(&rq->queuelist, &plug->mq_list); + } else /* is_sync */ + old_rq = rq; + blk_mq_put_ctx(data.ctx); + if (!old_rq) + return; + if (!blk_mq_direct_issue_request(old_rq)) + return; + blk_mq_insert_request(old_rq, false, true, true); + return; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1310,7 +1349,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) run_queue: blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } -done: blk_mq_put_ctx(data.ctx); } @@ -1322,16 +1360,11 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); - unsigned int use_plug, request_count = 0; + struct blk_plug *plug; + unsigned int request_count = 0; struct blk_map_ctx data; struct request *rq; - /* - * If we have multiple hardware queues, just go directly to - * one of those for sync IO. - */ - use_plug = !is_flush_fua && !is_sync; - blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { @@ -1339,8 +1372,8 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) return; } - if (use_plug && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count)) + if (!is_flush_fua && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count, NULL)) return; rq = blk_mq_map_request(q, bio, &data); @@ -1358,21 +1391,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) * utilize that to temporarily store requests until the task is * either done or scheduled away. */ - if (use_plug) { - struct blk_plug *plug = current->plug; - - if (plug) { - blk_mq_bio_to_request(rq, bio); - if (list_empty(&plug->mq_list)) - trace_block_plug(q); - else if (request_count >= BLK_MAX_REQUEST_COUNT) { - blk_flush_plug_list(plug, false); - trace_block_plug(q); - } - list_add_tail(&rq->queuelist, &plug->mq_list); - blk_mq_put_ctx(data.ctx); - return; + plug = current->plug; + if (plug) { + blk_mq_bio_to_request(rq, bio); + if (list_empty(&plug->mq_list)) + trace_block_plug(q); + else if (request_count >= BLK_MAX_REQUEST_COUNT) { + blk_flush_plug_list(plug, false); + trace_block_plug(q); } + list_add_tail(&rq->queuelist, &plug->mq_list); + blk_mq_put_ctx(data.ctx); + return; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1508,7 +1538,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, i++; } } - return tags; fail: @@ -1792,6 +1821,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx = q->mq_ops->map_queue(q, i); cpumask_set_cpu(i, hctx->cpumask); + cpumask_set_cpu(i, hctx->tags->cpumask); ctx->index_hw = hctx->nr_ctx; hctx->ctxs[hctx->nr_ctx++] = ctx; } @@ -2056,7 +2086,7 @@ void blk_mq_free_queue(struct request_queue *q) /* Basically redo blk_mq_init_queue with queue frozen */ static void blk_mq_queue_reinit(struct request_queue *q) { - WARN_ON_ONCE(!q->mq_freeze_depth); + WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth)); blk_mq_sysfs_unregister(q); @@ -2173,6 +2203,12 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) return 0; } +struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags) +{ + return tags->cpumask; +} +EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask); + /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the @@ -2234,8 +2270,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) int i; for (i = 0; i < set->nr_hw_queues; i++) { - if (set->tags[i]) + if (set->tags[i]) { blk_mq_free_rq_map(set, set->tags[i], i); + free_cpumask_var(set->tags[i]->cpumask); + } } kfree(set->tags); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 2b8fd302f677..6264b382d4d1 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -6,11 +6,12 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/blktrace_api.h> #include <linux/blk-mq.h> +#include <linux/blk-cgroup.h> #include "blk.h" -#include "blk-cgroup.h" #include "blk-mq.h" struct queue_sysfs_entry { diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 5b9c6d5c3636..b23193518ac7 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -9,7 +9,7 @@ #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/blktrace_api.h> -#include "blk-cgroup.h" +#include <linux/blk-cgroup.h> #include "blk.h" /* Max dispatch from a group in 1 round */ diff --git a/block/blk.h b/block/blk.h index 43b036185712..026d9594142b 100644 --- a/block/blk.h +++ b/block/blk.h @@ -78,7 +78,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, bool bio_attempt_back_merge(struct request_queue *q, struct request *req, struct bio *bio); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count); + unsigned int *request_count, + struct request **same_queue_rq); void blk_account_io_start(struct request *req, bool new_io); void blk_account_io_completion(struct request *req, unsigned int bytes); @@ -193,8 +194,6 @@ int blk_try_merge(struct request *rq, struct bio *bio); void blk_queue_congestion_threshold(struct request_queue *q); -void __blk_run_queue_uncond(struct request_queue *q); - int blk_dev_init(void); diff --git a/block/bounce.c b/block/bounce.c index ed9dd8067120..b17311227c12 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -13,6 +13,7 @@ #include <linux/pagemap.h> #include <linux/mempool.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> @@ -128,9 +129,6 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) struct bio_vec *bvec, *org_vec; int i; - if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) - set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); - /* * free up bounce indirect pages used */ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5da8e6e9ab4b..c62bb2e650b8 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -14,8 +14,8 @@ #include <linux/rbtree.h> #include <linux/ioprio.h> #include <linux/blktrace_api.h> +#include <linux/blk-cgroup.h> #include "blk.h" -#include "blk-cgroup.h" /* * tunables @@ -67,6 +67,11 @@ static struct kmem_cache *cfq_pool; #define sample_valid(samples) ((samples) > 80) #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) +/* blkio-related constants */ +#define CFQ_WEIGHT_MIN 10 +#define CFQ_WEIGHT_MAX 1000 +#define CFQ_WEIGHT_DEFAULT 500 + struct cfq_ttime { unsigned long last_end_request; @@ -212,6 +217,15 @@ struct cfqg_stats { #endif /* CONFIG_CFQ_GROUP_IOSCHED */ }; +/* Per-cgroup data */ +struct cfq_group_data { + /* must be the first member */ + struct blkcg_policy_data pd; + + unsigned int weight; + unsigned int leaf_weight; +}; + /* This is per cgroup per device grouping structure */ struct cfq_group { /* must be the first member */ @@ -446,16 +460,6 @@ CFQ_CFQQ_FNS(deep); CFQ_CFQQ_FNS(wait_busy); #undef CFQ_CFQQ_FNS -static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) -{ - return pd ? container_of(pd, struct cfq_group, pd) : NULL; -} - -static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) -{ - return pd_to_blkg(&cfqg->pd); -} - #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* cfqg stats flags */ @@ -600,6 +604,22 @@ static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { } #ifdef CONFIG_CFQ_GROUP_IOSCHED +static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct cfq_group, pd) : NULL; +} + +static struct cfq_group_data +*cpd_to_cfqgd(struct blkcg_policy_data *cpd) +{ + return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL; +} + +static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) +{ + return pd_to_blkg(&cfqg->pd); +} + static struct blkcg_policy blkcg_policy_cfq; static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) @@ -607,6 +627,11 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); } +static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg) +{ + return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq)); +} + static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; @@ -1544,13 +1569,28 @@ static void cfqg_stats_init(struct cfqg_stats *stats) #endif } +static void cfq_cpd_init(const struct blkcg *blkcg) +{ + struct cfq_group_data *cgd = + cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]); + + if (blkcg == &blkcg_root) { + cgd->weight = 2 * CFQ_WEIGHT_DEFAULT; + cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; + } else { + cgd->weight = CFQ_WEIGHT_DEFAULT; + cgd->leaf_weight = CFQ_WEIGHT_DEFAULT; + } +} + static void cfq_pd_init(struct blkcg_gq *blkg) { struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg); cfq_init_cfqg_base(cfqg); - cfqg->weight = blkg->blkcg->cfq_weight; - cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight; + cfqg->weight = cgd->weight; + cfqg->leaf_weight = cgd->leaf_weight; cfqg_stats_init(&cfqg->stats); cfqg_stats_init(&cfqg->dead_stats); } @@ -1673,13 +1713,27 @@ static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v) static int cfq_print_weight(struct seq_file *sf, void *v) { - seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight); + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + unsigned int val = 0; + + if (cgd) + val = cgd->weight; + + seq_printf(sf, "%u\n", val); return 0; } static int cfq_print_leaf_weight(struct seq_file *sf, void *v) { - seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight); + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + unsigned int val = 0; + + if (cgd) + val = cgd->leaf_weight; + + seq_printf(sf, "%u\n", val); return 0; } @@ -1690,6 +1744,7 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct cfq_group *cfqg; + struct cfq_group_data *cfqgd; int ret; ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); @@ -1698,17 +1753,22 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, ret = -EINVAL; cfqg = blkg_to_cfqg(ctx.blkg); + cfqgd = blkcg_to_cfqgd(blkcg); + if (!cfqg || !cfqgd) + goto err; + if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { if (!is_leaf_weight) { cfqg->dev_weight = ctx.v; - cfqg->new_weight = ctx.v ?: blkcg->cfq_weight; + cfqg->new_weight = ctx.v ?: cfqgd->weight; } else { cfqg->dev_leaf_weight = ctx.v; - cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight; + cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight; } ret = 0; } +err: blkg_conf_finish(&ctx); return ret ?: nbytes; } @@ -1730,16 +1790,23 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; + struct cfq_group_data *cfqgd; + int ret = 0; if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) return -EINVAL; spin_lock_irq(&blkcg->lock); + cfqgd = blkcg_to_cfqgd(blkcg); + if (!cfqgd) { + ret = -EINVAL; + goto out; + } if (!is_leaf_weight) - blkcg->cfq_weight = val; + cfqgd->weight = val; else - blkcg->cfq_leaf_weight = val; + cfqgd->leaf_weight = val; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct cfq_group *cfqg = blkg_to_cfqg(blkg); @@ -1749,15 +1816,16 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, if (!is_leaf_weight) { if (!cfqg->dev_weight) - cfqg->new_weight = blkcg->cfq_weight; + cfqg->new_weight = cfqgd->weight; } else { if (!cfqg->dev_leaf_weight) - cfqg->new_leaf_weight = blkcg->cfq_leaf_weight; + cfqg->new_leaf_weight = cfqgd->leaf_weight; } } +out: spin_unlock_irq(&blkcg->lock); - return 0; + return ret; } static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, @@ -4477,6 +4545,18 @@ out_free: return ret; } +static void cfq_registered_queue(struct request_queue *q) +{ + struct elevator_queue *e = q->elevator; + struct cfq_data *cfqd = e->elevator_data; + + /* + * Default to IOPS mode with no idling for SSDs + */ + if (blk_queue_nonrot(q)) + cfqd->cfq_slice_idle = 0; +} + /* * sysfs parts below --> */ @@ -4592,6 +4672,7 @@ static struct elevator_type iosched_cfq = { .elevator_may_queue_fn = cfq_may_queue, .elevator_init_fn = cfq_init_queue, .elevator_exit_fn = cfq_exit_queue, + .elevator_registered_fn = cfq_registered_queue, }, .icq_size = sizeof(struct cfq_io_cq), .icq_align = __alignof__(struct cfq_io_cq), @@ -4603,8 +4684,10 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { .pd_size = sizeof(struct cfq_group), + .cpd_size = sizeof(struct cfq_group_data), .cftypes = cfq_blkcg_files, + .cpd_init_fn = cfq_cpd_init, .pd_init_fn = cfq_pd_init, .pd_offline_fn = cfq_pd_offline, .pd_reset_stats_fn = cfq_pd_reset_stats, diff --git a/block/elevator.c b/block/elevator.c index 8985038f398c..84d63943f2de 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -35,11 +35,11 @@ #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/pm_runtime.h> +#include <linux/blk-cgroup.h> #include <trace/events/block.h> #include "blk.h" -#include "blk-cgroup.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -806,6 +806,8 @@ int elv_register_queue(struct request_queue *q) } kobject_uevent(&e->kobj, KOBJ_ADD); e->registered = 1; + if (e->type->ops.elevator_registered_fn) + e->type->ops.elevator_registered_fn(q); } return error; } diff --git a/block/genhd.c b/block/genhd.c index ea982eadaf63..59a1395eedac 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -8,6 +8,7 @@ #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/proc_fs.h> diff --git a/block/ioctl.c b/block/ioctl.c index 7d8befde2aca..8061eba42887 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -150,21 +150,48 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user } } -static int blkdev_reread_part(struct block_device *bdev) +/* + * This is an exported API for the block driver, and will not + * acquire bd_mutex. This API should be used in case that + * caller has held bd_mutex already. + */ +int __blkdev_reread_part(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; - int res; if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (!mutex_trylock(&bdev->bd_mutex)) - return -EBUSY; - res = rescan_partitions(disk, bdev); + + lockdep_assert_held(&bdev->bd_mutex); + + return rescan_partitions(disk, bdev); +} +EXPORT_SYMBOL(__blkdev_reread_part); + +/* + * This is an exported API for the block driver, and will + * try to acquire bd_mutex. If bd_mutex has been held already + * in current context, please call __blkdev_reread_part(). + * + * Make sure the held locks in current context aren't required + * in open()/close() handler and I/O path for avoiding ABBA deadlock: + * - bd_mutex is held before calling block driver's open/close + * handler + * - reading partition table may submit I/O to the block device + */ +int blkdev_reread_part(struct block_device *bdev) +{ + int res; + + mutex_lock(&bdev->bd_mutex); + res = __blkdev_reread_part(bdev); mutex_unlock(&bdev->bd_mutex); + return res; } +EXPORT_SYMBOL(blkdev_reread_part); static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, uint64_t len, int secure) |