diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig | 10 | ||||
-rw-r--r-- | block/Makefile | 1 | ||||
-rw-r--r-- | block/blk-cgroup.c | 138 | ||||
-rw-r--r-- | block/blk-cgroup.h | 2 | ||||
-rw-r--r-- | block/blk-core.c | 479 | ||||
-rw-r--r-- | block/blk-flush.c | 24 | ||||
-rw-r--r-- | block/blk-softirq.c | 10 | ||||
-rw-r--r-- | block/blk-sysfs.c | 22 | ||||
-rw-r--r-- | block/blk-tag.c | 6 | ||||
-rw-r--r-- | block/blk-throttle.c | 110 | ||||
-rw-r--r-- | block/blk.h | 22 | ||||
-rw-r--r-- | block/bsg-lib.c | 298 | ||||
-rw-r--r-- | block/cfq-iosched.c | 21 | ||||
-rw-r--r-- | block/elevator.c | 39 | ||||
-rw-r--r-- | block/genhd.c | 87 | ||||
-rw-r--r-- | block/scsi_ioctl.c | 3 |
16 files changed, 907 insertions, 365 deletions
diff --git a/block/Kconfig b/block/Kconfig index 60be1e0455d..e97934eecec 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -65,6 +65,16 @@ config BLK_DEV_BSG If unsure, say Y. +config BLK_DEV_BSGLIB + bool "Block layer SG support v4 helper lib" + default n + select BLK_DEV_BSG + help + Subsystems will normally enable this if needed. Users will not + normally need to manually enable this. + + If unsure, say N. + config BLK_DEV_INTEGRITY bool "Block layer data integrity support" ---help--- diff --git a/block/Makefile b/block/Makefile index 0fec4b3fab5..514c6e4f427 100644 --- a/block/Makefile +++ b/block/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o +obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bcaf16ee6ad..8f630cec906 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -768,27 +768,16 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, return disk_total; } -static int blkio_check_dev_num(dev_t dev) -{ - int part = 0; - struct gendisk *disk; - - disk = get_gendisk(dev, &part); - if (!disk || part) - return -ENODEV; - - return 0; -} - static int blkio_policy_parse_and_set(char *buf, struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) { + struct gendisk *disk = NULL; char *s[4], *p, *major_s = NULL, *minor_s = NULL; - int ret; - unsigned long major, minor, temp; - int i = 0; + unsigned long major, minor; + int i = 0, ret = -EINVAL; + int part; dev_t dev; - u64 bps, iops; + u64 temp; memset(s, 0, sizeof(s)); @@ -804,43 +793,45 @@ static int blkio_policy_parse_and_set(char *buf, } if (i != 2) - return -EINVAL; + goto out; p = strsep(&s[0], ":"); if (p != NULL) major_s = p; else - return -EINVAL; + goto out; minor_s = s[0]; if (!minor_s) - return -EINVAL; + goto out; - ret = strict_strtoul(major_s, 10, &major); - if (ret) - return -EINVAL; + if (strict_strtoul(major_s, 10, &major)) + goto out; - ret = strict_strtoul(minor_s, 10, &minor); - if (ret) - return -EINVAL; + if (strict_strtoul(minor_s, 10, &minor)) + goto out; dev = MKDEV(major, minor); - ret = blkio_check_dev_num(dev); - if (ret) - return ret; + if (strict_strtoull(s[1], 10, &temp)) + goto out; - newpn->dev = dev; + /* For rule removal, do not check for device presence. */ + if (temp) { + disk = get_gendisk(dev, &part); + if (!disk || part) { + ret = -ENODEV; + goto out; + } + } - if (s[1] == NULL) - return -EINVAL; + newpn->dev = dev; switch (plid) { case BLKIO_POLICY_PROP: - ret = strict_strtoul(s[1], 10, &temp); - if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || - temp > BLKIO_WEIGHT_MAX) - return -EINVAL; + if ((temp < BLKIO_WEIGHT_MIN && temp > 0) || + temp > BLKIO_WEIGHT_MAX) + goto out; newpn->plid = plid; newpn->fileid = fileid; @@ -850,94 +841,114 @@ static int blkio_policy_parse_and_set(char *buf, switch(fileid) { case BLKIO_THROTL_read_bps_device: case BLKIO_THROTL_write_bps_device: - ret = strict_strtoull(s[1], 10, &bps); - if (ret) - return -EINVAL; - newpn->plid = plid; newpn->fileid = fileid; - newpn->val.bps = bps; + newpn->val.bps = temp; break; case BLKIO_THROTL_read_iops_device: case BLKIO_THROTL_write_iops_device: - ret = strict_strtoull(s[1], 10, &iops); - if (ret) - return -EINVAL; - - if (iops > THROTL_IOPS_MAX) - return -EINVAL; + if (temp > THROTL_IOPS_MAX) + goto out; newpn->plid = plid; newpn->fileid = fileid; - newpn->val.iops = (unsigned int)iops; + newpn->val.iops = (unsigned int)temp; break; } break; default: BUG(); } - - return 0; + ret = 0; +out: + put_disk(disk); + return ret; } unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, dev_t dev) { struct blkio_policy_node *pn; + unsigned long flags; + unsigned int weight; + + spin_lock_irqsave(&blkcg->lock, flags); pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); if (pn) - return pn->val.weight; + weight = pn->val.weight; else - return blkcg->weight; + weight = blkcg->weight; + + spin_unlock_irqrestore(&blkcg->lock, flags); + + return weight; } EXPORT_SYMBOL_GPL(blkcg_get_weight); uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) { struct blkio_policy_node *pn; + unsigned long flags; + uint64_t bps = -1; + spin_lock_irqsave(&blkcg->lock, flags); pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, BLKIO_THROTL_read_bps_device); if (pn) - return pn->val.bps; - else - return -1; + bps = pn->val.bps; + spin_unlock_irqrestore(&blkcg->lock, flags); + + return bps; } uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) { struct blkio_policy_node *pn; + unsigned long flags; + uint64_t bps = -1; + + spin_lock_irqsave(&blkcg->lock, flags); pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, BLKIO_THROTL_write_bps_device); if (pn) - return pn->val.bps; - else - return -1; + bps = pn->val.bps; + spin_unlock_irqrestore(&blkcg->lock, flags); + + return bps; } unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) { struct blkio_policy_node *pn; + unsigned long flags; + unsigned int iops = -1; + spin_lock_irqsave(&blkcg->lock, flags); pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, BLKIO_THROTL_read_iops_device); if (pn) - return pn->val.iops; - else - return -1; + iops = pn->val.iops; + spin_unlock_irqrestore(&blkcg->lock, flags); + + return iops; } unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) { struct blkio_policy_node *pn; + unsigned long flags; + unsigned int iops = -1; + + spin_lock_irqsave(&blkcg->lock, flags); pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, BLKIO_THROTL_write_iops_device); if (pn) - return pn->val.iops; - else - return -1; + iops = pn->val.iops; + spin_unlock_irqrestore(&blkcg->lock, flags); + + return iops; } /* Checks whether user asked for deleting a policy rule */ @@ -1090,6 +1101,7 @@ static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, if (blkio_delete_rule_command(newpn)) { blkio_policy_delete_node(pn); + kfree(pn); spin_unlock_irq(&blkcg->lock); goto update_io_group; } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index a71d2904ffb..6f3ace7e792 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -188,7 +188,7 @@ struct blkio_policy_node { union { unsigned int weight; /* - * Rate read/write in terms of byptes per second + * Rate read/write in terms of bytes per second * Whether this rate represents read or write is determined * by file type "fileid". */ diff --git a/block/blk-core.c b/block/blk-core.c index b627558c461..f43c8a5840a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -28,6 +28,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/fault-inject.h> #include <linux/list_sort.h> +#include <linux/delay.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> @@ -38,8 +39,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); -static int __make_request(struct request_queue *q, struct bio *bio); - /* * For the allocated request tables */ @@ -347,31 +346,80 @@ void blk_put_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_put_queue); -/* - * Note: If a driver supplied the queue lock, it should not zap that lock - * unexpectedly as some queue cleanup components like elevator_exit() and - * blk_throtl_exit() need queue lock. +/** + * blk_drain_queue - drain requests from request_queue + * @q: queue to drain + * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV + * + * Drain requests from @q. If @drain_all is set, all requests are drained. + * If not, only ELVPRIV requests are drained. The caller is responsible + * for ensuring that no new requests which need to be drained are queued. + */ +void blk_drain_queue(struct request_queue *q, bool drain_all) +{ + while (true) { + int nr_rqs; + + spin_lock_irq(q->queue_lock); + + elv_drain_elevator(q); + if (drain_all) + blk_throtl_drain(q); + + __blk_run_queue(q); + + if (drain_all) + nr_rqs = q->rq.count[0] + q->rq.count[1]; + else + nr_rqs = q->rq.elvpriv; + + spin_unlock_irq(q->queue_lock); + + if (!nr_rqs) + break; + msleep(10); + } +} + +/** + * blk_cleanup_queue - shutdown a request queue + * @q: request queue to shutdown + * + * Mark @q DEAD, drain all pending requests, destroy and put it. All + * future requests will be failed immediately with -ENODEV. */ void blk_cleanup_queue(struct request_queue *q) { - /* - * We know we have process context here, so we can be a little - * cautious and ensure that pending block actions on this device - * are done before moving on. Going into this function, we should - * not have processes doing IO to this device. - */ - blk_sync_queue(q); + spinlock_t *lock = q->queue_lock; - del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); + /* mark @q DEAD, no new request or merges will be allowed afterwards */ mutex_lock(&q->sysfs_lock); queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); + + spin_lock_irq(lock); + queue_flag_set(QUEUE_FLAG_NOMERGES, q); + queue_flag_set(QUEUE_FLAG_NOXMERGES, q); + queue_flag_set(QUEUE_FLAG_DEAD, q); + + if (q->queue_lock != &q->__queue_lock) + q->queue_lock = &q->__queue_lock; + + spin_unlock_irq(lock); mutex_unlock(&q->sysfs_lock); + /* + * Drain all requests queued before DEAD marking. The caller might + * be trying to tear down @q before its elevator is initialized, in + * which case we don't want to call into draining. + */ if (q->elevator) - elevator_exit(q->elevator); + blk_drain_queue(q, true); - blk_throtl_exit(q); + /* @q won't process any more request, flush async actions */ + del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); + blk_sync_queue(q); + /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); @@ -542,7 +590,7 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, /* * This also sets hw/phys segments, boundary and size */ - blk_queue_make_request(q, __make_request); + blk_queue_make_request(q, blk_queue_bio); q->sg_reserved_size = INT_MAX; @@ -577,7 +625,7 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq) } static struct request * -blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask) +blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask) { struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); @@ -588,12 +636,10 @@ blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask) rq->cmd_flags = flags | REQ_ALLOCED; - if (priv) { - if (unlikely(elv_set_request(q, rq, gfp_mask))) { - mempool_free(rq, q->rq.rq_pool); - return NULL; - } - rq->cmd_flags |= REQ_ELVPRIV; + if ((flags & REQ_ELVPRIV) && + unlikely(elv_set_request(q, rq, gfp_mask))) { + mempool_free(rq, q->rq.rq_pool); + return NULL; } return rq; @@ -652,12 +698,13 @@ static void __freed_request(struct request_queue *q, int sync) * A request has just been released. Account for it, update the full and * congestion status, wake up any waiters. Called under q->queue_lock. */ -static void freed_request(struct request_queue *q, int sync, int priv) +static void freed_request(struct request_queue *q, unsigned int flags) { struct request_list *rl = &q->rq; + int sync = rw_is_sync(flags); rl->count[sync]--; - if (priv) + if (flags & REQ_ELVPRIV) rl->elvpriv--; __freed_request(q, sync); @@ -685,10 +732,19 @@ static bool blk_rq_should_init_elevator(struct bio *bio) return true; } -/* - * Get a free request, queue_lock must be held. - * Returns NULL on failure, with queue_lock held. - * Returns !NULL on success, with queue_lock *not held*. +/** + * get_request - get a free request + * @q: request_queue to allocate request from + * @rw_flags: RW and SYNC flags + * @bio: bio to allocate request for (can be %NULL) + * @gfp_mask: allocation mask + * + * Get a free request from @q. This function may fail under memory + * pressure or if @q is dead. + * + * Must be callled with @q->queue_lock held and, + * Returns %NULL on failure, with @q->queue_lock held. + * Returns !%NULL on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, int rw_flags, struct bio *bio, gfp_t gfp_mask) @@ -697,7 +753,10 @@ static struct request *get_request(struct request_queue *q, int rw_flags, struct request_list *rl = &q->rq; struct io_context *ioc = NULL; const bool is_sync = rw_is_sync(rw_flags) != 0; - int may_queue, priv = 0; + int may_queue; + + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) + return NULL; may_queue = elv_may_queue(q, rw_flags); if (may_queue == ELV_MQUEUE_NO) @@ -741,17 +800,17 @@ static struct request *get_request(struct request_queue *q, int rw_flags, rl->count[is_sync]++; rl->starved[is_sync] = 0; - if (blk_rq_should_init_elevator(bio)) { - priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - if (priv) - rl->elvpriv++; + if (blk_rq_should_init_elevator(bio) && + !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) { + rw_flags |= REQ_ELVPRIV; + rl->elvpriv++; } if (blk_queue_io_stat(q)) rw_flags |= REQ_IO_STAT; spin_unlock_irq(q->queue_lock); - rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); + rq = blk_alloc_request(q, rw_flags, gfp_mask); if (unlikely(!rq)) { /* * Allocation failed presumably due to memory. Undo anything @@ -761,7 +820,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * wait queue, but this is pretty rare. */ spin_lock_irq(q->queue_lock); - freed_request(q, is_sync, priv); + freed_request(q, rw_flags); /* * in the very unlikely event that allocation failed and no @@ -791,11 +850,18 @@ out: return rq; } -/* - * No available requests for this queue, wait for some requests to become - * available. +/** + * get_request_wait - get a free request with retry + * @q: request_queue to allocate request from + * @rw_flags: RW and SYNC flags + * @bio: bio to allocate request for (can be %NULL) * - * Called with q->queue_lock held, and returns with it unlocked. + * Get a free request from @q. This function keeps retrying under memory + * pressure and fails iff @q is dead. + * + * Must be callled with @q->queue_lock held and, + * Returns %NULL on failure, with @q->queue_lock held. + * Returns !%NULL on success, with @q->queue_lock *not held*. */ static struct request *get_request_wait(struct request_queue *q, int rw_flags, struct bio *bio) @@ -809,6 +875,9 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, struct io_context *ioc; struct request_list *rl = &q->rq; + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) + return NULL; + prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, TASK_UNINTERRUPTIBLE); @@ -839,19 +908,15 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { struct request *rq; - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) - return NULL; - BUG_ON(rw != READ && rw != WRITE); spin_lock_irq(q->queue_lock); - if (gfp_mask & __GFP_WAIT) { + if (gfp_mask & __GFP_WAIT) rq = get_request_wait(q, rw, NULL); - } else { + else rq = get_request(q, rw, NULL, gfp_mask); - if (!rq) - spin_unlock_irq(q->queue_lock); - } + if (!rq) + spin_unlock_irq(q->queue_lock); /* q->queue_lock is unlocked at this point */ return rq; @@ -1053,14 +1118,13 @@ void __blk_put_request(struct request_queue *q, struct request *req) * it didn't come out of our reserved rq pools */ if (req->cmd_flags & REQ_ALLOCED) { - int is_sync = rq_is_sync(req) != 0; - int priv = req->cmd_flags & REQ_ELVPRIV; + unsigned int flags = req->cmd_flags; BUG_ON(!list_empty(&req->queuelist)); BUG_ON(!hlist_unhashed(&req->hash)); blk_free_request(q, req); - freed_request(q, is_sync, priv); + freed_request(q, flags); } } EXPORT_SYMBOL_GPL(__blk_put_request); @@ -1162,24 +1226,41 @@ static bool bio_attempt_front_merge(struct request_queue *q, return true; } -/* - * Attempts to merge with the plugged list in the current process. Returns - * true if merge was successful, otherwise false. +/** + * attempt_plug_merge - try to merge with %current's plugged list + * @q: request_queue new bio is being queued at + * @bio: new bio being queued + * @request_count: out parameter for number of traversed plugged requests + * + * Determine whether @bio being queued on @q can be merged with a request + * on %current's plugged list. Returns %true if merge was successful, + * otherwise %false. + * + * This function is called without @q->queue_lock; however, elevator is + * accessed iff there already are requests on the plugged list which in + * turn guarantees validity of the elevator. + * + * Note that, on successful merge, elevator operation + * elevator_bio_merged_fn() will be called without queue lock. Elevator + * must be ready for this. */ -static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q, - struct bio *bio) +static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, + unsigned int *request_count) { struct blk_plug *plug; struct request *rq; bool ret = false; - plug = tsk->plug; + plug = current->plug; if (!plug) goto out; + *request_count = 0; list_for_each_entry_reverse(rq, &plug->list, queuelist) { int el_ret; + (*request_count)++; + if (rq->q != q) continue; @@ -1200,7 +1281,6 @@ out: void init_request_from_bio(struct request *req, struct bio *bio) { - req->cpu = bio->bi_comp_cpu; req->cmd_type = REQ_TYPE_FS; req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK; @@ -1213,12 +1293,13 @@ void init_request_from_bio(struct request *req, struct bio *bio) blk_rq_bio_prep(req->q, req, bio); } -static int __make_request(struct request_queue *q, struct bio *bio) +void blk_queue_bio(struct request_queue *q, struct bio *bio) { const bool sync = !!(bio->bi_rw & REQ_SYNC); struct blk_plug *plug; int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT; struct request *req; + unsigned int request_count = 0; /* * low level driver can indicate that it wants pages above a @@ -1237,8 +1318,8 @@ static int __make_request(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. */ - if (attempt_plug_merge(current, q, bio)) - goto out; + if (attempt_plug_merge(q, bio, &request_count)) + return; spin_lock_irq(q->queue_lock); @@ -1272,6 +1353,10 @@ get_rq: * Returns with the queue unlocked. */ req = get_request_wait(q, rw_flags, bio); + if (unlikely(!req)) { + bio_endio(bio, -ENODEV); /* @q is dead */ + goto out_unlock; + } /* * After dropping the lock and possibly sleeping here, our request @@ -1281,8 +1366,7 @@ get_rq: */ init_request_from_bio(req, bio); - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || - bio_flagged(bio, BIO_CPU_AFFINE)) + if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) req->cpu = raw_smp_processor_id(); plug = current->plug; @@ -1302,11 +1386,10 @@ get_rq: if (__rq->q != q) plug->should_sort = 1; } + if (request_count >= BLK_MAX_REQUEST_COUNT) + blk_flush_plug_list(plug, false); list_add_tail(&req->queuelist, &plug->list); - plug->count++; drive_stat_acct(req, 1); - if (plug->count >= BLK_MAX_REQUEST_COUNT) - blk_flush_plug_list(plug, false); } else { spin_lock_irq(q->queue_lock); add_acct_request(q, req, where); @@ -1314,9 +1397,8 @@ get_rq: out_unlock: spin_unlock_irq(q->queue_lock); } -out: - return 0; } +EXPORT_SYMBOL_GPL(blk_queue_bio); /* for device mapper only */ /* * If bio->bi_dev is a partition, remap the location @@ -1415,165 +1497,135 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) return 0; } -/** - * generic_make_request - hand a buffer to its device driver for I/O - * @bio: The bio describing the location in memory and on the device. - * - * generic_make_request() is used to make I/O requests of block - * devices. It is passed a &struct bio, which describes the I/O that needs - * to be done. - * - * generic_make_request() does not return any status. The - * success/failure status of the request, along with notification of - * completion, is delivered asynchronously through the bio->bi_end_io - * function described (one day) else where. - * - * The caller of generic_make_request must make sure that bi_io_vec - * are set to describe the memory buffer, and that bi_dev and bi_sector are - * set to describe the device address, and the - * bi_end_io and optionally bi_private are set to describe how - * completion notification should be signaled. - * - * generic_make_request and the drivers it calls may use bi_next if this - * bio happens to be merged with someone else, and may change bi_dev and - * bi_sector for remaps as it sees fit. So the values of these fields - * should NOT be depended on after the call to generic_make_request. - */ -static inline void __generic_make_request(struct bio *bio) +static noinline_for_stack bool +generic_make_request_checks(struct bio *bio) { struct request_queue *q; - sector_t old_sector; - int ret, nr_sectors = bio_sectors(bio); - dev_t old_dev; + int nr_sectors = bio_sectors(bio); int err = -EIO; + char b[BDEVNAME_SIZE]; + struct hd_struct *part; might_sleep(); if (bio_check_eod(bio, nr_sectors)) goto end_io; - /* - * Resolve the mapping until finished. (drivers are - * still free to implement/resolve their own stacking - * by explicitly returning 0) - * - * NOTE: we don't repeat the blk_size check for each new device. - * Stacking drivers are expected to know what they are doing. - */ - old_sector = -1; - old_dev = 0; - do { - char b[BDEVNAME_SIZE]; - struct hd_struct *part; - - q = bdev_get_queue(bio->bi_bdev); - if (unlikely(!q)) { - printk(KERN_ERR - "generic_make_request: Trying to access " - "nonexistent block-device %s (%Lu)\n", - bdevname(bio->bi_bdev, b), - (long long) bio->bi_sector); - goto end_io; - } - - if (unlikely(!(bio->bi_rw & REQ_DISCARD) && - nr_sectors > queue_max_hw_sectors(q))) { - printk(KERN_ERR "bio too big device %s (%u > %u)\n", - bdevname(bio->bi_bdev, b), - bio_sectors(bio), - queue_max_hw_sectors(q)); - goto end_io; - } - - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) - goto end_io; - - part = bio->bi_bdev->bd_part; - if (should_fail_request(part, bio->bi_size) || - should_fail_request(&part_to_disk(part)->part0, - bio->bi_size)) - goto end_io; - - /* - * If this device has partitions, remap block n - * of partition p to block n+start(p) of the disk. - */ - blk_partition_remap(bio); + q = bdev_get_queue(bio->bi_bdev); + if (unlikely(!q)) { + printk(KERN_ERR + "generic_make_request: Trying to access " + "nonexistent block-device %s (%Lu)\n", + bdevname(bio->bi_bdev, b), + (long long) bio->bi_sector); + goto end_io; + } - if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) - goto end_io; + if (unlikely(!(bio->bi_rw & REQ_DISCARD) && + nr_sectors > queue_max_hw_sectors(q))) { + printk(KERN_ERR "bio too big device %s (%u > %u)\n", + bdevname(bio->bi_bdev, b), + bio_sectors(bio), + queue_max_hw_sectors(q)); + goto end_io; + } - if (old_sector != -1) - trace_block_bio_remap(q, bio, old_dev, old_sector); + part = bio->bi_bdev->bd_part; + if (should_fail_request(part, bio->bi_size) || + should_fail_request(&part_to_disk(part)->part0, + bio->bi_size)) + goto end_io; - old_sector = bio->bi_sector; - old_dev = bio->bi_bdev->bd_dev; + /* + * If this device has partitions, remap block n + * of partition p to block n+start(p) of the disk. + */ + blk_partition_remap(bio); - if (bio_check_eod(bio, nr_sectors)) - goto end_io; + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) + goto end_io; - /* - * Filter flush bio's early so that make_request based - * drivers without flush support don't have to worry - * about them. - */ - if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { - bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); - if (!nr_sectors) { - err = 0; - goto end_io; - } - } + if (bio_check_eod(bio, nr_sectors)) + goto end_io; - if ((bio->bi_rw & REQ_DISCARD) && - (!blk_queue_discard(q) || - ((bio->bi_rw & REQ_SECURE) && - !blk_queue_secdiscard(q)))) { - err = -EOPNOTSUPP; + /* + * Filter flush bio's early so that make_request based + * drivers without flush support don't have to worry + * about them. + */ + if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { + bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); + if (!nr_sectors) { + err = 0; goto end_io; } + } - if (blk_throtl_bio(q, &bio)) - goto end_io; - - /* - * If bio = NULL, bio has been throttled and will be submitted - * later. - */ - if (!bio) - break; - - trace_block_bio_queue(q, bio); + if ((bio->bi_rw & REQ_DISCARD) && + (!blk_queue_discard(q) || + ((bio->bi_rw & REQ_SECURE) && + !blk_queue_secdiscard(q)))) { + err = -EOPNOTSUPP; + goto end_io; + } - ret = q->make_request_fn(q, bio); - } while (ret); + if (blk_throtl_bio(q, bio)) + return false; /* throttled, will be resubmitted later */ - return; + trace_block_bio_queue(q, bio); + return true; end_io: bio_endio(bio, err); + return false; } -/* - * We only want one ->make_request_fn to be active at a time, - * else stack usage with stacked devices could be a problem. - * So use current->bio_list to keep a list of requests - * submited by a make_request_fn function. - * current->bio_list is also used as a flag to say if - * generic_make_request is currently active in this task or not. - * If it is NULL, then no make_request is active. If it is non-NULL, - * then a make_request is active, and new requests should be added - * at the tail +/** + * generic_make_request - hand a buffer to its device driver for I/O + * @bio: The bio describing the location in memory and on the device. + * + * generic_make_request() is used to make I/O requests of block + * devices. It is passed a &struct bio, which describes the I/O that needs + * to be done. + * + * generic_make_request() does not return any status. The + * success/failure status of the request, along with notification of + * completion, is delivered asynchronously through the bio->bi_end_io + * function described (one day) else where. + * + * The caller of generic_make_request must make sure that bi_io_vec + * are set to describe the memory buffer, and that bi_dev and bi_sector are + * set to describe the device address, and the + * bi_end_io and optionally bi_private are set to describe how + * completion notification should be signaled. + * + * generic_make_request and the drivers it calls may use bi_next if this + * bio happens to be merged with someone else, and may resubmit the bio to + * a lower device by calling into generic_make_request recursively, which + * means the bio should NOT be touched after the call to ->make_request_fn. */ void generic_make_request(struct bio *bio) { struct bio_list bio_list_on_stack; + if (!generic_make_request_checks(bio)) + return; + + /* + * We only want one ->make_request_fn to be active at a time, else + * stack usage with stacked devices could be a problem. So use + * current->bio_list to keep a list of requests submited by a + * make_request_fn function. current->bio_list is also used as a + * flag to say if generic_make_request is currently active in this + * task or not. If it is NULL, then no make_request is active. If + * it is non-NULL, then a make_request is active, and new requests + * should be added at the tail + */ if (current->bio_list) { - /* make_request is active */ bio_list_add(current->bio_list, bio); return; } + /* following loop may be a bit non-obvious, and so deserves some * explanation. * Before entering the loop, bio->bi_next is NULL (as all callers @@ -1581,22 +1633,21 @@ void generic_make_request(struct bio *bio) * We pretend that we have just taken it off a longer list, so * we assign bio_list to a pointer to the bio_list_on_stack, * thus initialising the bio_list of new bios to be - * added. __generic_make_request may indeed add some more bios + * added. ->make_request() may indeed add some more bios * through a recursive call to generic_make_request. If it * did, we find a non-NULL value in bio_list and re-enter the loop * from the top. In this case we really did just take the bio * of the top of the list (no pretending) and so remove it from - * bio_list, and call into __generic_make_request again. - * - * The loop was structured like this to make only one call to - * __generic_make_request (which is important as it is large and - * inlined) and to keep the structure simple. + * bio_list, and call into ->make_request() again. */ BUG_ON(bio->bi_next); bio_list_init(&bio_list_on_stack); current->bio_list = &bio_list_on_stack; do { - __generic_make_request(bio); + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + + q->make_request_fn(q, bio); + bio = bio_list_pop(current->bio_list); } while (bio); current->bio_list = NULL; /* deactivate */ @@ -1702,6 +1753,7 @@ EXPORT_SYMBOL_GPL(blk_rq_check_limits); int blk_insert_cloned_request(struct request_queue *q, struct request *rq) { unsigned long flags; + int where = ELEVATOR_INSERT_BACK; if (blk_rq_check_limits(q, rq)) return -EIO; @@ -1718,7 +1770,12 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) */ BUG_ON(blk_queued_rq(rq)); - add_acct_request(q, rq, ELEVATOR_INSERT_BACK); + if (rq->cmd_flags & (REQ_FLUSH|REQ_FUA)) + where = ELEVATOR_INSERT_FLUSH; + + add_acct_request(q, rq, where); + if (where == ELEVATOR_INSERT_FLUSH) + __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); return 0; @@ -2275,7 +2332,7 @@ static bool blk_end_bidi_request(struct request *rq, int error, * %false - we are done with this request * %true - still buffers pending for this request **/ -static bool __blk_end_bidi_request(struct request *rq, int error, +bool __blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes) { if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) @@ -2622,6 +2679,20 @@ EXPORT_SYMBOL(kblockd_schedule_delayed_work); #define PLUG_MAGIC 0x91827364 +/** + * blk_start_plug - initialize blk_plug and track it inside the task_struct + * @plug: The &struct blk_plug that needs to be initialized + * + * Description: + * Tracking blk_plug inside the task_struct will help with auto-flushing the + * pending I/O should the task end up blocking between blk_start_plug() and + * blk_finish_plug(). This is important from a performance perspective, but + * also ensures that we don't deadlock. For instance, if the task is blocking + * for a memory allocation, memory reclaim could end up wanting to free a + * page belonging to that request that is currently residing in our private + * plug. By flushing the pending I/O when the process goes to sleep, we avoid + * this kind of deadlock. + */ void blk_start_plug(struct blk_plug *plug) { struct task_struct *tsk = current; @@ -2630,7 +2701,6 @@ void blk_start_plug(struct blk_plug *plug) INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->cb_list); plug->should_sort = 0; - plug->count = 0; /* * If this is a nested plug, don't actually assign it. It will be @@ -2714,7 +2784,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) return; list_splice_init(&plug->list, &list); - plug->count = 0; if (plug->should_sort) { list_sort(NULL, &list, plug_rq_cmp); diff --git a/block/blk-flush.c b/block/blk-flush.c index bb21e4c36f7..720ad607ff9 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -95,11 +95,12 @@ static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq) { unsigned int policy = 0; + if (blk_rq_sectors(rq)) + policy |= REQ_FSEQ_DATA; + if (fflags & REQ_FLUSH) { if (rq->cmd_flags & REQ_FLUSH) policy |= REQ_FSEQ_PREFLUSH; - if (blk_rq_sectors(rq)) - policy |= REQ_FSEQ_DATA; if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA)) policy |= REQ_FSEQ_POSTFLUSH; } @@ -122,7 +123,7 @@ static void blk_flush_restore_request(struct request *rq) /* make @rq a normal request */ rq->cmd_flags &= ~REQ_FLUSH_SEQ; - rq->end_io = NULL; + rq->end_io = rq->flush.saved_end_io; } /** @@ -300,9 +301,6 @@ void blk_insert_flush(struct request *rq) unsigned int fflags = q->flush_flags; /* may change, cache */ unsigned int policy = blk_flush_policy(fflags, rq); - BUG_ON(rq->end_io); - BUG_ON(!rq->bio || rq->bio != rq->biotail); - /* * @policy now records what operations need to be done. Adjust * REQ_FLUSH and FUA for the driver. @@ -312,6 +310,19 @@ void blk_insert_flush(struct request *rq) rq->cmd_flags &= ~REQ_FUA; /* + * An empty flush handed down from a stacking driver may + * translate into nothing if the underlying device does not + * advertise a write-back cache. In this case, simply + * complete the request. + */ + if (!policy) { + __blk_end_bidi_request(rq, 0, 0, 0); + return; + } + + BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */ + + /* * If there's data but flush is not necessary, the request can be * processed directly without going through flush machinery. Queue * for normal execution. @@ -329,6 +340,7 @@ void blk_insert_flush(struct request *rq) memset(&rq->flush, 0, sizeof(rq->flush)); INIT_LIST_HEAD(&rq->flush.list); rq->cmd_flags |= REQ_FLUSH_SEQ; + rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ rq->end_io = flush_data_end_io; blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 475fab809a8..1366a89d8e6 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -115,7 +115,7 @@ void __blk_complete_request(struct request *req) /* * Select completion CPU */ - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) { + if (req->cpu != -1) { ccpu = req->cpu; if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) { ccpu = blk_cpu_to_group(ccpu); @@ -124,6 +124,14 @@ void __blk_complete_request(struct request *req) } else ccpu = cpu; + /* + * If current CPU and requested CPU are in the same group, running + * softirq in current CPU. One might concern this is just like + * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is + * running in interrupt handler, and currently I/O controller doesn't + * support multiple interrupts, so current CPU is unique actually. This + * avoids IPI sending from current CPU to the first CPU of a group. + */ if (ccpu == cpu || ccpu == group_cpu) { struct list_head *list; do_local: diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0ee17b5e7fb..e7f9f657f10 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -258,11 +258,13 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) ret = queue_var_store(&val, page, count); spin_lock_irq(q->queue_lock); - if (val) { + if (val == 2) { queue_flag_set(QUEUE_FLAG_SAME_COMP, q); - if (val == 2) - queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); - } else { + queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); + } else if (val == 1) { + queue_flag_set(QUEUE_FLAG_SAME_COMP, q); + queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); + } else if (val == 0) { queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } @@ -455,11 +457,11 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, } /** - * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed - * @kobj: the kobj belonging of the request queue to be released + * blk_release_queue: - release a &struct request_queue when it is no longer needed + * @kobj: the kobj belonging to the request queue to be released * * Description: - * blk_cleanup_queue is the pair to blk_init_queue() or + * blk_release_queue is the pair to blk_init_queue() or * blk_queue_make_request(). It should be called when a request queue is * being released; typically when a block device is being de-registered. * Currently, its primary task it to free all the &struct request @@ -477,12 +479,18 @@ static void blk_release_queue(struct kobject *kobj) blk_sync_queue(q); + if (q->elevator) + elevator_exit(q->elevator); + + blk_throtl_exit(q); + if (rl->rq_pool) mempool_destroy(rl->rq_pool); if (q->queue_tags) __blk_queue_free_tags(q); + blk_throtl_release(q); blk_trace_shutdown(q); bdi_destroy(&q->backing_dev_info); diff --git a/block/blk-tag.c b/block/blk-tag.c index ece65fc4c79..e74d6d13838 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -286,12 +286,14 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq) BUG_ON(tag == -1); - if (unlikely(tag >= bqt->real_max_depth)) + if (unlikely(tag >= bqt->max_depth)) { /* * This can happen after tag depth has been reduced. - * FIXME: how about a warning or info message here? + * But tag shouldn't be larger than real_max_depth. */ + WARN_ON(tag >= bqt->real_max_depth); return; + } list_del_init(&rq->queuelist); rq->cmd_flags &= ~REQ_QUEUED; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index f6a79412050..4553245d931 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -10,6 +10,7 @@ #include <linux/bio.h> #include <linux/blktrace_api.h> #include "blk-cgroup.h" +#include "blk.h" /* Max dispatch from a group in 1 round */ static int throtl_grp_quantum = 8; @@ -302,16 +303,16 @@ throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) return tg; } -/* - * This function returns with queue lock unlocked in case of error, like - * request queue is no more - */ static struct throtl_grp * throtl_get_tg(struct throtl_data *td) { struct throtl_grp *tg = NULL, *__tg = NULL; struct blkio_cgroup *blkcg; struct request_queue *q = td->queue; + /* no throttling for dead queue */ + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) + return NULL; + rcu_read_lock(); blkcg = task_blkio_cgroup(current); tg = throtl_find_tg(td, blkcg); @@ -323,32 +324,22 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td) /* * Need to allocate a group. Allocation of group also needs allocation * of per cpu stats which in-turn takes a mutex() and can block. Hence - * we need to drop rcu lock and queue_lock before we call alloc - * - * Take the request queue reference to make sure queue does not - * go away once we return from allocation. + * we need to drop rcu lock and queue_lock before we call alloc. */ - blk_get_queue(q); rcu_read_unlock(); spin_unlock_irq(q->queue_lock); tg = throtl_alloc_tg(td); - /* - * We might have slept in group allocation. Make sure queue is not - * dead - */ - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { - blk_put_queue(q); - if (tg) - kfree(tg); - - return ERR_PTR(-ENODEV); - } - blk_put_queue(q); /* Group allocated and queue is still alive. take the lock */ spin_lock_irq(q->queue_lock); + /* Make sure @q is still alive */ + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { + kfree(tg); + return NULL; + } + /* * Initialize the new group. After sleeping, read the blkcg again. */ @@ -746,7 +737,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); - bool sync = bio->bi_rw & REQ_SYNC; + bool sync = rw_is_sync(bio->bi_rw); /* Charge the bio to the group */ tg->bytes_disp[rw] += bio->bi_size; @@ -1014,11 +1005,6 @@ static void throtl_release_tgs(struct throtl_data *td) } } -static void throtl_td_free(struct throtl_data *td) -{ - kfree(td); -} - /* * Blk cgroup controller notification saying that blkio_group object is being * delinked as associated cgroup object is going away. That also means that @@ -1123,17 +1109,17 @@ static struct blkio_policy_type blkio_policy_throtl = { .plid = BLKIO_POLICY_THROTL, }; -int blk_throtl_bio(struct request_queue *q, struct bio **biop) +bool blk_throtl_bio(struct request_queue *q, struct bio *bio) { struct throtl_data *td = q->td; struct throtl_grp *tg; - struct bio *bio = *biop; bool rw = bio_data_dir(bio), update_disptime = true; struct blkio_cgroup *blkcg; + bool throttled = false; if (bio->bi_rw & REQ_THROTTLED) { bio->bi_rw &= ~REQ_THROTTLED; - return 0; + goto out; } /* @@ -1150,9 +1136,9 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) if (tg_no_rule_group(tg, rw)) { blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, - rw, bio->bi_rw & REQ_SYNC); + rw, rw_is_sync(bio->bi_rw)); rcu_read_unlock(); - return 0; + goto out; } } rcu_read_unlock(); @@ -1161,18 +1147,10 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) * Either group has not been allocated yet or it is not an unlimited * IO group */ - spin_lock_irq(q->queue_lock); tg = throtl_get_tg(td); - - if (IS_ERR(tg)) { - if (PTR_ERR(tg) == -ENODEV) { - /* - * Queue is gone. No queue lock held here. - */ - return -ENODEV; - } - } + if (unlikely(!tg)) + goto out_unlock; if (tg->nr_queued[rw]) { /* @@ -1200,7 +1178,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) * So keep on trimming slice even if bio is not queued. */ throtl_trim_slice(td, tg, rw); - goto out; + goto out_unlock; } queue_bio: @@ -1212,16 +1190,52 @@ queue_bio: tg->nr_queued[READ], tg->nr_queued[WRITE]); throtl_add_bio_tg(q->td, tg, bio); - *biop = NULL; + throttled = true; if (update_disptime) { tg_update_disptime(td, tg); throtl_schedule_next_dispatch(td); } +out_unlock: + spin_unlock_irq(q->queue_lock); out: + return throttled; +} + +/** + * blk_throtl_drain - drain throttled bios + * @q: request_queue to drain throttled bios for + * + * Dispatch all currently throttled bios on @q through ->make_request_fn(). + */ +void blk_throtl_drain(struct request_queue *q) + __releases(q->queue_lock) __acquires(q->queue_lock) +{ + struct throtl_data *td = q->td; + struct throtl_rb_root *st = &td->tg_service_tree; + struct throtl_grp *tg; + struct bio_list bl; + struct bio *bio; + + WARN_ON_ONCE(!queue_is_locked(q)); + + bio_list_init(&bl); + + while ((tg = throtl_rb_first(st))) { + throtl_dequeue_tg(td, tg); + + while ((bio = bio_list_peek(&tg->bio_lists[READ]))) + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl); + while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl); + } spin_unlock_irq(q->queue_lock); - return 0; + + while ((bio = bio_list_pop(&bl))) + generic_make_request(bio); + + spin_lock_irq(q->queue_lock); } int blk_throtl_init(struct request_queue *q) @@ -1296,7 +1310,11 @@ void blk_throtl_exit(struct request_queue *q) * it. */ throtl_shutdown_wq(q); - throtl_td_free(td); +} + +void blk_throtl_release(struct request_queue *q) +{ + kfree(q->td); } static int __init throtl_init(void) diff --git a/block/blk.h b/block/blk.h index d6586287adc..3f6551b3c92 100644 --- a/block/blk.h +++ b/block/blk.h @@ -15,8 +15,11 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio); +void blk_drain_queue(struct request_queue *q, bool drain_all); void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); +bool __blk_end_bidi_request(struct request *rq, int error, + unsigned int nr_bytes, unsigned int bidi_bytes); void blk_rq_timed_out_timer(unsigned long data); void blk_delete_timer(struct request *); @@ -186,4 +189,21 @@ static inline int blk_do_io_stat(struct request *rq) (rq->cmd_flags & REQ_DISCARD)); } -#endif +#ifdef CONFIG_BLK_DEV_THROTTLING +extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); +extern void blk_throtl_drain(struct request_queue *q); +extern int blk_throtl_init(struct request_queue *q); +extern void blk_throtl_exit(struct request_queue *q); +extern void blk_throtl_release(struct request_queue *q); +#else /* CONFIG_BLK_DEV_THROTTLING */ +static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) +{ + return false; +} +static inline void blk_throtl_drain(struct request_queue *q) { } +static inline int blk_throtl_init(struct request_queue *q) { return 0; } +static inline void blk_throtl_exit(struct request_queue *q) { } +static inline void blk_throtl_release(struct request_queue *q) { } +#endif /* CONFIG_BLK_DEV_THROTTLING */ + +#endif /* BLK_INTERNAL_H */ diff --git a/block/bsg-lib.c b/block/bsg-lib.c new file mode 100644 index 00000000000..6690e6e4103 --- /dev/null +++ b/block/bsg-lib.c @@ -0,0 +1,298 @@ +/* + * BSG helper library + * + * Copyright (C) 2008 James Smart, Emulex Corporation + * Copyright (C) 2011 Red Hat, Inc. All rights reserved. + * Copyright (C) 2011 Mike Christie + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <linux/scatterlist.h> +#include <linux/bsg-lib.h> +#include <linux/module.h> +#include <scsi/scsi_cmnd.h> + +/** + * bsg_destroy_job - routine to teardown/delete a bsg job + * @job: bsg_job that is to be torn down + */ +static void bsg_destroy_job(struct bsg_job *job) +{ + put_device(job->dev); /* release reference for the request */ + + kfree(job->request_payload.sg_list); + kfree(job->reply_payload.sg_list); + kfree(job); +} + +/** + * bsg_job_done - completion routine for bsg requests + * @job: bsg_job that is complete + * @result: job reply result + * @reply_payload_rcv_len: length of payload recvd + * + * The LLD should call this when the bsg job has completed. + */ +void bsg_job_done(struct bsg_job *job, int result, + unsigned int reply_payload_rcv_len) +{ + struct request *req = job->req; + struct request *rsp = req->next_rq; + int err; + + err = job->req->errors = result; + if (err < 0) + /* we're only returning the result field in the reply */ + job->req->sense_len = sizeof(u32); + else + job->req->sense_len = job->reply_len; + /* we assume all request payload was transferred, residual == 0 */ + req->resid_len = 0; + + if (rsp) { + WARN_ON(reply_payload_rcv_len > rsp->resid_len); + + /* set reply (bidi) residual */ + rsp->resid_len -= min(reply_payload_rcv_len, rsp->resid_len); + } + blk_complete_request(req); +} +EXPORT_SYMBOL_GPL(bsg_job_done); + +/** + * bsg_softirq_done - softirq done routine for destroying the bsg requests + * @rq: BSG request that holds the job to be destroyed + */ +static void bsg_softirq_done(struct request *rq) +{ + struct bsg_job *job = rq->special; + + blk_end_request_all(rq, rq->errors); + bsg_destroy_job(job); +} + +static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) +{ + size_t sz = (sizeof(struct scatterlist) * req->nr_phys_segments); + + BUG_ON(!req->nr_phys_segments); + + buf->sg_list = kzalloc(sz, GFP_KERNEL); + if (!buf->sg_list) + return -ENOMEM; + sg_init_table(buf->sg_list, req->nr_phys_segments); + buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list); + buf->payload_len = blk_rq_bytes(req); + return 0; +} + +/** + * bsg_create_job - create the bsg_job structure for the bsg request + * @dev: device that is being sent the bsg request + * @req: BSG request that needs a job structure + */ +static int bsg_create_job(struct device *dev, struct request *req) +{ + struct request *rsp = req->next_rq; + struct request_queue *q = req->q; + struct bsg_job *job; + int ret; + + BUG_ON(req->special); + + job = kzalloc(sizeof(struct bsg_job) + q->bsg_job_size, GFP_KERNEL); + if (!job) + return -ENOMEM; + + req->special = job; + job->req = req; + if (q->bsg_job_size) + job->dd_data = (void *)&job[1]; + job->request = req->cmd; + job->request_len = req->cmd_len; + job->reply = req->sense; + job->reply_len = SCSI_SENSE_BUFFERSIZE; /* Size of sense buffer + * allocated */ + if (req->bio) { + ret = bsg_map_buffer(&job->request_payload, req); + if (ret) + goto failjob_rls_job; + } + if (rsp && rsp->bio) { + ret = bsg_map_buffer(&job->reply_payload, rsp); + if (ret) + goto failjob_rls_rqst_payload; + } + job->dev = dev; + /* take a reference for the request */ + get_device(job->dev); + return 0; + +failjob_rls_rqst_payload: + kfree(job->request_payload.sg_list); +failjob_rls_job: + kfree(job); + return -ENOMEM; +} + +/* + * bsg_goose_queue - restart queue in case it was stopped + * @q: request q to be restarted + */ +void bsg_goose_queue(struct request_queue *q) +{ + if (!q) + return; + + blk_run_queue_async(q); +} +EXPORT_SYMBOL_GPL(bsg_goose_queue); + +/** + * bsg_request_fn - generic handler for bsg requests + * @q: request queue to manage + * + * On error the create_bsg_job function should return a -Exyz error value + * that will be set to the req->errors. + * + * Drivers/subsys should pass this to the queue init function. + */ +void bsg_request_fn(struct request_queue *q) +{ + struct device *dev = q->queuedata; + struct request *req; + struct bsg_job *job; + int ret; + + if (!get_device(dev)) + return; + + while (1) { + req = blk_fetch_request(q); + if (!req) + break; + spin_unlock_irq(q->queue_lock); + + ret = bsg_create_job(dev, req); + if (ret) { + req->errors = ret; + blk_end_request_all(req, ret); + spin_lock_irq(q->queue_lock); + continue; + } + + job = req->special; + ret = q->bsg_job_fn(job); + spin_lock_irq(q->queue_lock); + if (ret) + break; + } + + spin_unlock_irq(q->queue_lock); + put_device(dev); + spin_lock_irq(q->queue_lock); +} +EXPORT_SYMBOL_GPL(bsg_request_fn); + +/** + * bsg_setup_queue - Create and add the bsg hooks so we can receive requests + * @dev: device to attach bsg device to + * @q: request queue setup by caller + * @name: device to give bsg device + * @job_fn: bsg job handler + * @dd_job_size: size of LLD data needed for each job + * + * The caller should have setup the reuqest queue with bsg_request_fn + * as the request_fn. + */ +int bsg_setup_queue(struct device *dev, struct request_queue *q, + char *name, bsg_job_fn *job_fn, int dd_job_size) +{ + int ret; + + q->queuedata = dev; + q->bsg_job_size = dd_job_size; + q->bsg_job_fn = job_fn; + queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q); + blk_queue_softirq_done(q, bsg_softirq_done); + blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); + + ret = bsg_register_queue(q, dev, name, NULL); + if (ret) { + printk(KERN_ERR "%s: bsg interface failed to " + "initialize - register queue\n", dev->kobj.name); + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(bsg_setup_queue); + +/** + * bsg_remove_queue - Deletes the bsg dev from the q + * @q: the request_queue that is to be torn down. + * + * Notes: + * Before unregistering the queue empty any requests that are blocked + */ +void bsg_remove_queue(struct request_queue *q) +{ + struct request *req; /* block request */ + int counts; /* totals for request_list count and starved */ + + if (!q) + return; + + /* Stop taking in new requests */ + spin_lock_irq(q->queue_lock); + blk_stop_queue(q); + + /* drain all requests in the queue */ + while (1) { + /* need the lock to fetch a request + * this may fetch the same reqeust as the previous pass + */ + req = blk_fetch_request(q); + /* save requests in use and starved */ + counts = q->rq.count[0] + q->rq.count[1] + + q->rq.starved[0] + q->rq.starved[1]; + spin_unlock_irq(q->queue_lock); + /* any requests still outstanding? */ + if (counts == 0) + break; + + /* This may be the same req as the previous iteration, + * always send the blk_end_request_all after a prefetch. + * It is not okay to not end the request because the + * prefetch started the request. + */ + if (req) { + /* return -ENXIO to indicate that this queue is + * going away + */ + req->errors = -ENXIO; + blk_end_request_all(req, -ENXIO); + } + + msleep(200); /* allow bsg to possibly finish */ + spin_lock_irq(q->queue_lock); + } + bsg_unregister_queue(q); +} +EXPORT_SYMBOL_GPL(bsg_remove_queue); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 1f96ad6254f..16ace89613b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -130,6 +130,8 @@ struct cfq_queue { unsigned long slice_end; long slice_resid; + /* pending priority requests */ + int prio_pending; /* number of requests that are on the dispatch list or inside driver */ int dispatched; @@ -682,6 +684,9 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, if (rq_is_sync(rq1) != rq_is_sync(rq2)) return rq_is_sync(rq1) ? rq1 : rq2; + if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO) + return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2; + s1 = blk_rq_pos(rq1); s2 = blk_rq_pos(rq2); @@ -1209,6 +1214,9 @@ static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) hlist_del_init(&cfqg->cfqd_node); + BUG_ON(cfqd->nr_blkcg_linked_grps <= 0); + cfqd->nr_blkcg_linked_grps--; + /* * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. @@ -1604,6 +1612,10 @@ static void cfq_remove_request(struct request *rq) cfqq->cfqd->rq_queued--; cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq), rq_is_sync(rq)); + if (rq->cmd_flags & REQ_PRIO) { + WARN_ON(!cfqq->prio_pending); + cfqq->prio_pending--; + } } static int cfq_merge(struct request_queue *q, struct request **req, @@ -3357,6 +3369,13 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, return true; /* + * So both queues are sync. Let the new request get disk time if + * it's a metadata request and the current queue is doing regular IO. + */ + if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending) + return true; + + /* * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. */ if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) @@ -3420,6 +3439,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_io_context *cic = RQ_CIC(rq); cfqd->rq_queued++; + if (rq->cmd_flags & REQ_PRIO) + cfqq->prio_pending++; cfq_update_io_thinktime(cfqd, cfqq, cic); cfq_update_io_seektime(cfqd, cfqq, rq); diff --git a/block/elevator.c b/block/elevator.c index a3b64bc71d8..66343d6917d 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -31,7 +31,6 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/compiler.h> -#include <linux/delay.h> #include <linux/blktrace_api.h> #include <linux/hash.h> #include <linux/uaccess.h> @@ -182,7 +181,7 @@ static void elevator_attach(struct request_queue *q, struct elevator_queue *eq, eq->elevator_data = data; } -static char chosen_elevator[16]; +static char chosen_elevator[ELV_NAME_MAX]; static int __init elevator_setup(char *str) { @@ -606,43 +605,35 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) void elv_drain_elevator(struct request_queue *q) { static int printed; + + lockdep_assert_held(q->queue_lock); + while (q->elevator->ops->elevator_dispatch_fn(q, 1)) ; - if (q->nr_sorted == 0) - return; - if (printed++ < 10) { + if (q->nr_sorted && printed++ < 10) { printk(KERN_ERR "%s: forced dispatching is broken " "(nr_sorted=%u), please report this\n", q->elevator->elevator_type->elevator_name, q->nr_sorted); } } -/* - * Call with queue lock held, interrupts disabled - */ void elv_quiesce_start(struct request_queue *q) { if (!q->elevator) return; + spin_lock_irq(q->queue_lock); queue_flag_set(QUEUE_FLAG_ELVSWITCH, q); + spin_unlock_irq(q->queue_lock); - /* - * make sure we don't have any requests in flight - */ - elv_drain_elevator(q); - while (q->rq.elvpriv) { - __blk_run_queue(q); - spin_unlock_irq(q->queue_lock); - msleep(10); - spin_lock_irq(q->queue_lock); - elv_drain_elevator(q); - } + blk_drain_queue(q, false); } void elv_quiesce_end(struct request_queue *q) { + spin_lock_irq(q->queue_lock); queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); + spin_unlock_irq(q->queue_lock); } void __elv_add_request(struct request_queue *q, struct request *rq, int where) @@ -972,7 +963,6 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) /* * Turn on BYPASS and drain all requests w/ elevator private data */ - spin_lock_irq(q->queue_lock); elv_quiesce_start(q); /* @@ -983,8 +973,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) /* * attach and start new elevator */ + spin_lock_irq(q->queue_lock); elevator_attach(q, e, data); - spin_unlock_irq(q->queue_lock); if (old_elevator->registered) { @@ -999,9 +989,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) * finally exit old elevator and turn off BYPASS. */ elevator_exit(old_elevator); - spin_lock_irq(q->queue_lock); elv_quiesce_end(q); - spin_unlock_irq(q->queue_lock); blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); @@ -1015,10 +1003,7 @@ fail_register: elevator_exit(e); q->elevator = old_elevator; elv_register_queue(q); - - spin_lock_irq(q->queue_lock); - queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); - spin_unlock_irq(q->queue_lock); + elv_quiesce_end(q); return err; } diff --git a/block/genhd.c b/block/genhd.c index 2429ecbbd97..9253839714f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -19,6 +19,7 @@ #include <linux/mutex.h> #include <linux/idr.h> #include <linux/log2.h> +#include <linux/ctype.h> #include "blk.h" @@ -611,6 +612,12 @@ void add_disk(struct gendisk *disk) register_disk(disk); blk_register_queue(disk); + /* + * Take an extra ref on queue which will be put on disk_release() + * so that it sticks around as long as @disk is there. + */ + WARN_ON_ONCE(blk_get_queue(disk->queue)); + retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); @@ -909,6 +916,74 @@ static int __init genhd_device_init(void) subsys_initcall(genhd_device_init); +static ssize_t alias_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + ssize_t ret = 0; + + if (disk->alias) + ret = snprintf(buf, ALIAS_LEN, "%s\n", disk->alias); + return ret; +} + +static ssize_t alias_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + char *alias; + char *envp[] = { NULL, NULL }; + unsigned char c; + int i; + ssize_t ret = count; + + if (!count) + return -EINVAL; + + if (count >= ALIAS_LEN) { + printk(KERN_ERR "alias: alias is too long\n"); + return -EINVAL; + } + + /* Validation check */ + for (i = 0; i < count; i++) { + c = buf[i]; + if (i == count - 1 && c == '\n') + break; + if (!isalnum(c) && c != '_' && c != '-') { + printk(KERN_ERR "alias: invalid alias\n"); + return -EINVAL; + } + } + + if (disk->alias) { + printk(KERN_INFO "alias: %s is already assigned (%s)\n", + disk->disk_name, disk->alias); + return -EINVAL; + } + + alias = kasprintf(GFP_KERNEL, "%s", buf); + if (!alias) + return -ENOMEM; + + if (alias[count - 1] == '\n') + alias[count - 1] = '\0'; + + envp[0] = kasprintf(GFP_KERNEL, "ALIAS=%s", alias); + if (!envp[0]) { + kfree(alias); + return -ENOMEM; + } + + disk->alias = alias; + printk(KERN_INFO "alias: assigned %s to %s\n", alias, disk->disk_name); + + kobject_uevent_env(&dev->kobj, KOBJ_ADD, envp); + + kfree(envp[0]); + return ret; +} + static ssize_t disk_range_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -968,6 +1043,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); } +static DEVICE_ATTR(alias, S_IRUGO|S_IWUSR, alias_show, alias_store); static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); @@ -990,6 +1066,7 @@ static struct device_attribute dev_attr_fail_timeout = #endif static struct attribute *disk_attrs[] = { + &dev_attr_alias.attr, &dev_attr_range.attr, &dev_attr_ext_range.attr, &dev_attr_removable.attr, @@ -1095,6 +1172,8 @@ static void disk_release(struct device *dev) disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); free_part_info(&disk->part0); + if (disk->queue) + blk_put_queue(disk->queue); kfree(disk); } struct class block_class = { @@ -1146,17 +1225,17 @@ static int diskstats_show(struct seq_file *seqf, void *v) cpu = part_stat_lock(); part_round_stats(cpu, hd); part_stat_unlock(); - seq_printf(seqf, "%4d %7d %s %lu %lu %llu " - "%u %lu %lu %llu %u %u %u %u\n", + seq_printf(seqf, "%4d %7d %s %lu %lu %lu " + "%u %lu %lu %lu %u %u %u %u\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), part_stat_read(hd, ios[READ]), part_stat_read(hd, merges[READ]), - (unsigned long long)part_stat_read(hd, sectors[READ]), + part_stat_read(hd, sectors[READ]), jiffies_to_msecs(part_stat_read(hd, ticks[READ])), part_stat_read(hd, ios[WRITE]), part_stat_read(hd, merges[WRITE]), - (unsigned long long)part_stat_read(hd, sectors[WRITE]), + part_stat_read(hd, sectors[WRITE]), jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), part_in_flight(hd), jiffies_to_msecs(part_stat_read(hd, io_ticks)), diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 4f4230b79bb..fbdf0d802ec 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -565,7 +565,7 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod { int err; - if (!q || blk_get_queue(q)) + if (!q) return -ENXIO; switch (cmd) { @@ -686,7 +686,6 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod err = -ENOTTY; } - blk_put_queue(q); return err; } EXPORT_SYMBOL(scsi_cmd_ioctl); |