diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/Makefile | 5 | ||||
-rw-r--r-- | block/blk-barrier.c | 319 | ||||
-rw-r--r-- | block/blk-core.c | 1255 | ||||
-rw-r--r-- | block/blk-exec.c | 105 | ||||
-rw-r--r-- | block/blk-ioc.c | 194 | ||||
-rw-r--r-- | block/blk-map.c | 264 | ||||
-rw-r--r-- | block/blk-settings.c | 402 | ||||
-rw-r--r-- | block/blk.h | 17 |
8 files changed, 1312 insertions, 1249 deletions
diff --git a/block/Makefile b/block/Makefile index fcaae4ae670..2002046d0a9 100644 --- a/block/Makefile +++ b/block/Makefile @@ -2,8 +2,9 @@ # Makefile for the kernel block layer # -obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o ioctl.o \ - genhd.o scsi_ioctl.o +obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ + blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ + blk-exec.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o diff --git a/block/blk-barrier.c b/block/blk-barrier.c new file mode 100644 index 00000000000..5f74fec327d --- /dev/null +++ b/block/blk-barrier.c @@ -0,0 +1,319 @@ +/* + * Functions related to barrier IO handling + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/bio.h> +#include <linux/blkdev.h> + +#include "blk.h" + +/** + * blk_queue_ordered - does this queue support ordered writes + * @q: the request queue + * @ordered: one of QUEUE_ORDERED_* + * @prepare_flush_fn: rq setup helper for cache flush ordered writes + * + * Description: + * For journalled file systems, doing ordered writes on a commit + * block instead of explicitly doing wait_on_buffer (which is bad + * for performance) can be a big win. Block drivers supporting this + * feature should call this function and indicate so. + * + **/ +int blk_queue_ordered(struct request_queue *q, unsigned ordered, + prepare_flush_fn *prepare_flush_fn) +{ + if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) && + prepare_flush_fn == NULL) { + printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n"); + return -EINVAL; + } + + if (ordered != QUEUE_ORDERED_NONE && + ordered != QUEUE_ORDERED_DRAIN && + ordered != QUEUE_ORDERED_DRAIN_FLUSH && + ordered != QUEUE_ORDERED_DRAIN_FUA && + ordered != QUEUE_ORDERED_TAG && + ordered != QUEUE_ORDERED_TAG_FLUSH && + ordered != QUEUE_ORDERED_TAG_FUA) { + printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); + return -EINVAL; + } + + q->ordered = ordered; + q->next_ordered = ordered; + q->prepare_flush_fn = prepare_flush_fn; + + return 0; +} + +EXPORT_SYMBOL(blk_queue_ordered); + +/* + * Cache flushing for ordered writes handling + */ +inline unsigned blk_ordered_cur_seq(struct request_queue *q) +{ + if (!q->ordseq) + return 0; + return 1 << ffz(q->ordseq); +} + +unsigned blk_ordered_req_seq(struct request *rq) +{ + struct request_queue *q = rq->q; + + BUG_ON(q->ordseq == 0); + + if (rq == &q->pre_flush_rq) + return QUEUE_ORDSEQ_PREFLUSH; + if (rq == &q->bar_rq) + return QUEUE_ORDSEQ_BAR; + if (rq == &q->post_flush_rq) + return QUEUE_ORDSEQ_POSTFLUSH; + + /* + * !fs requests don't need to follow barrier ordering. Always + * put them at the front. This fixes the following deadlock. + * + * http://thread.gmane.org/gmane.linux.kernel/537473 + */ + if (!blk_fs_request(rq)) + return QUEUE_ORDSEQ_DRAIN; + + if ((rq->cmd_flags & REQ_ORDERED_COLOR) == + (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) + return QUEUE_ORDSEQ_DRAIN; + else + return QUEUE_ORDSEQ_DONE; +} + +void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) +{ + struct request *rq; + + if (error && !q->orderr) + q->orderr = error; + + BUG_ON(q->ordseq & seq); + q->ordseq |= seq; + + if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) + return; + + /* + * Okay, sequence complete. + */ + q->ordseq = 0; + rq = q->orig_bar_rq; + + if (__blk_end_request(rq, q->orderr, blk_rq_bytes(rq))) + BUG(); +} + +static void pre_flush_end_io(struct request *rq, int error) +{ + elv_completed_request(rq->q, rq); + blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); +} + +static void bar_end_io(struct request *rq, int error) +{ + elv_completed_request(rq->q, rq); + blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); +} + +static void post_flush_end_io(struct request *rq, int error) +{ + elv_completed_request(rq->q, rq); + blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); +} + +static void queue_flush(struct request_queue *q, unsigned which) +{ + struct request *rq; + rq_end_io_fn *end_io; + + if (which == QUEUE_ORDERED_PREFLUSH) { + rq = &q->pre_flush_rq; + end_io = pre_flush_end_io; + } else { + rq = &q->post_flush_rq; + end_io = post_flush_end_io; + } + + rq->cmd_flags = REQ_HARDBARRIER; + rq_init(q, rq); + rq->elevator_private = NULL; + rq->elevator_private2 = NULL; + rq->rq_disk = q->bar_rq.rq_disk; + rq->end_io = end_io; + q->prepare_flush_fn(q, rq); + + elv_insert(q, rq, ELEVATOR_INSERT_FRONT); +} + +static inline struct request *start_ordered(struct request_queue *q, + struct request *rq) +{ + q->orderr = 0; + q->ordered = q->next_ordered; + q->ordseq |= QUEUE_ORDSEQ_STARTED; + + /* + * Prep proxy barrier request. + */ + blkdev_dequeue_request(rq); + q->orig_bar_rq = rq; + rq = &q->bar_rq; + rq->cmd_flags = 0; + rq_init(q, rq); + if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) + rq->cmd_flags |= REQ_RW; + if (q->ordered & QUEUE_ORDERED_FUA) + rq->cmd_flags |= REQ_FUA; + rq->elevator_private = NULL; + rq->elevator_private2 = NULL; + init_request_from_bio(rq, q->orig_bar_rq->bio); + rq->end_io = bar_end_io; + + /* + * Queue ordered sequence. As we stack them at the head, we + * need to queue in reverse order. Note that we rely on that + * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs + * request gets inbetween ordered sequence. If this request is + * an empty barrier, we don't need to do a postflush ever since + * there will be no data written between the pre and post flush. + * Hence a single flush will suffice. + */ + if ((q->ordered & QUEUE_ORDERED_POSTFLUSH) && !blk_empty_barrier(rq)) + queue_flush(q, QUEUE_ORDERED_POSTFLUSH); + else + q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH; + + elv_insert(q, rq, ELEVATOR_INSERT_FRONT); + + if (q->ordered & QUEUE_ORDERED_PREFLUSH) { + queue_flush(q, QUEUE_ORDERED_PREFLUSH); + rq = &q->pre_flush_rq; + } else + q->ordseq |= QUEUE_ORDSEQ_PREFLUSH; + + if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0) + q->ordseq |= QUEUE_ORDSEQ_DRAIN; + else + rq = NULL; + + return rq; +} + +int blk_do_ordered(struct request_queue *q, struct request **rqp) +{ + struct request *rq = *rqp; + const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq); + + if (!q->ordseq) { + if (!is_barrier) + return 1; + + if (q->next_ordered != QUEUE_ORDERED_NONE) { + *rqp = start_ordered(q, rq); + return 1; + } else { + /* + * This can happen when the queue switches to + * ORDERED_NONE while this request is on it. + */ + blkdev_dequeue_request(rq); + if (__blk_end_request(rq, -EOPNOTSUPP, + blk_rq_bytes(rq))) + BUG(); + *rqp = NULL; + return 0; + } + } + + /* + * Ordered sequence in progress + */ + + /* Special requests are not subject to ordering rules. */ + if (!blk_fs_request(rq) && + rq != &q->pre_flush_rq && rq != &q->post_flush_rq) + return 1; + + if (q->ordered & QUEUE_ORDERED_TAG) { + /* Ordered by tag. Blocking the next barrier is enough. */ + if (is_barrier && rq != &q->bar_rq) + *rqp = NULL; + } else { + /* Ordered by draining. Wait for turn. */ + WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); + if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) + *rqp = NULL; + } + + return 1; +} + +static void bio_end_empty_barrier(struct bio *bio, int err) +{ + if (err) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + + complete(bio->bi_private); +} + +/** + * blkdev_issue_flush - queue a flush + * @bdev: blockdev to issue flush for + * @error_sector: error sector + * + * Description: + * Issue a flush for the block device in question. Caller can supply + * room for storing the error offset in case of a flush error, if they + * wish to. Caller must run wait_for_completion() on its own. + */ +int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) +{ + DECLARE_COMPLETION_ONSTACK(wait); + struct request_queue *q; + struct bio *bio; + int ret; + + if (bdev->bd_disk == NULL) + return -ENXIO; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + bio = bio_alloc(GFP_KERNEL, 0); + if (!bio) + return -ENOMEM; + + bio->bi_end_io = bio_end_empty_barrier; + bio->bi_private = &wait; + bio->bi_bdev = bdev; + submit_bio(1 << BIO_RW_BARRIER, bio); + + wait_for_completion(&wait); + + /* + * The driver must store the error location in ->bi_sector, if + * it supports it. For non-stacked drivers, this should be copied + * from rq->sector. + */ + if (error_sector) + *error_sector = bio->bi_sector; + + ret = 0; + if (!bio_flagged(bio, BIO_UPTODATE)) + ret = -EIO; + + bio_put(bio); + return ret; +} + +EXPORT_SYMBOL(blkdev_issue_flush); diff --git a/block/blk-core.c b/block/blk-core.c index 937f9d0b9bd..2c73ed1a813 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -20,7 +20,6 @@ #include <linux/kernel_stat.h> #include <linux/string.h> #include <linux/init.h> -#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ #include <linux/completion.h> #include <linux/slab.h> #include <linux/swap.h> @@ -34,20 +33,9 @@ #include "blk.h" -/* - * for max sense size - */ -#include <scsi/scsi_cmnd.h> - -static void blk_unplug_work(struct work_struct *work); -static void blk_unplug_timeout(unsigned long data); static void drive_stat_acct(struct request *rq, int new_io); -static void init_request_from_bio(struct request *req, struct bio *bio); static int __make_request(struct request_queue *q, struct bio *bio); -static struct io_context *current_io_context(gfp_t gfp_flags, int node); static void blk_recalc_rq_segments(struct request *rq); -static void blk_rq_bio_prep(struct request_queue *q, struct request *rq, - struct bio *bio); /* * For the allocated request tables @@ -60,28 +48,12 @@ struct kmem_cache *request_cachep; struct kmem_cache *blk_requestq_cachep = NULL; /* - * For io context allocations - */ -static struct kmem_cache *iocontext_cachep; - -/* * Controlling structure to kblockd */ static struct workqueue_struct *kblockd_workqueue; -unsigned long blk_max_low_pfn, blk_max_pfn; - -EXPORT_SYMBOL(blk_max_low_pfn); -EXPORT_SYMBOL(blk_max_pfn); - static DEFINE_PER_CPU(struct list_head, blk_cpu_done); -/* Amount of time in which a process may batch requests */ -#define BLK_BATCH_TIME (HZ/50UL) - -/* Number of requests a "batching" process may submit */ -#define BLK_BATCH_REQ 32 - void blk_queue_congestion_threshold(struct request_queue *q) { int nr; @@ -117,113 +89,7 @@ struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) } EXPORT_SYMBOL(blk_get_backing_dev_info); -/** - * blk_queue_prep_rq - set a prepare_request function for queue - * @q: queue - * @pfn: prepare_request function - * - * It's possible for a queue to register a prepare_request callback which - * is invoked before the request is handed to the request_fn. The goal of - * the function is to prepare a request for I/O, it can be used to build a - * cdb from the request data for instance. - * - */ -void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) -{ - q->prep_rq_fn = pfn; -} - -EXPORT_SYMBOL(blk_queue_prep_rq); - -/** - * blk_queue_merge_bvec - set a merge_bvec function for queue - * @q: queue - * @mbfn: merge_bvec_fn - * - * Usually queues have static limitations on the max sectors or segments that - * we can put in a request. Stacking drivers may have some settings that - * are dynamic, and thus we have to query the queue whether it is ok to - * add a new bio_vec to a bio at a given offset or not. If the block device - * has such limitations, it needs to register a merge_bvec_fn to control - * the size of bio's sent to it. Note that a block device *must* allow a - * single page to be added to an empty bio. The block device driver may want - * to use the bio_split() function to deal with these bio's. By default - * no merge_bvec_fn is defined for a queue, and only the fixed limits are - * honored. - */ -void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn) -{ - q->merge_bvec_fn = mbfn; -} - -EXPORT_SYMBOL(blk_queue_merge_bvec); - -void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) -{ - q->softirq_done_fn = fn; -} - -EXPORT_SYMBOL(blk_queue_softirq_done); - -/** - * blk_queue_make_request - define an alternate make_request function for a device - * @q: the request queue for the device to be affected - * @mfn: the alternate make_request function - * - * Description: - * The normal way for &struct bios to be passed to a device - * driver is for them to be collected into requests on a request - * queue, and then to allow the device driver to select requests - * off that queue when it is ready. This works well for many block - * devices. However some block devices (typically virtual devices - * such as md or lvm) do not benefit from the processing on the - * request queue, and are served best by having the requests passed - * directly to them. This can be achieved by providing a function - * to blk_queue_make_request(). - * - * Caveat: - * The driver that does this *must* be able to deal appropriately - * with buffers in "highmemory". This can be accomplished by either calling - * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling - * blk_queue_bounce() to create a buffer in normal memory. - **/ -void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn) -{ - /* - * set defaults - */ - q->nr_requests = BLKDEV_MAX_RQ; - blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); - blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); - q->make_request_fn = mfn; - q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; - q->backing_dev_info.state = 0; - q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; - blk_queue_max_sectors(q, SAFE_MAX_SECTORS); - blk_queue_hardsect_size(q, 512); - blk_queue_dma_alignment(q, 511); - blk_queue_congestion_threshold(q); - q->nr_batching = BLK_BATCH_REQ; - - q->unplug_thresh = 4; /* hmm */ - q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ - if (q->unplug_delay == 0) - q->unplug_delay = 1; - - INIT_WORK(&q->unplug_work, blk_unplug_work); - - q->unplug_timer.function = blk_unplug_timeout; - q->unplug_timer.data = (unsigned long)q; - - /* - * by default assume old behaviour and bounce for any highmem page - */ - blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); -} - -EXPORT_SYMBOL(blk_queue_make_request); - -static void rq_init(struct request_queue *q, struct request *rq) +void rq_init(struct request_queue *q, struct request *rq) { INIT_LIST_HEAD(&rq->queuelist); INIT_LIST_HEAD(&rq->donelist); @@ -247,255 +113,6 @@ static void rq_init(struct request_queue *q, struct request *rq) rq->next_rq = NULL; } -/** - * blk_queue_ordered - does this queue support ordered writes - * @q: the request queue - * @ordered: one of QUEUE_ORDERED_* - * @prepare_flush_fn: rq setup helper for cache flush ordered writes - * - * Description: - * For journalled file systems, doing ordered writes on a commit - * block instead of explicitly doing wait_on_buffer (which is bad - * for performance) can be a big win. Block drivers supporting this - * feature should call this function and indicate so. - * - **/ -int blk_queue_ordered(struct request_queue *q, unsigned ordered, - prepare_flush_fn *prepare_flush_fn) -{ - if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) && - prepare_flush_fn == NULL) { - printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n"); - return -EINVAL; - } - - if (ordered != QUEUE_ORDERED_NONE && - ordered != QUEUE_ORDERED_DRAIN && - ordered != QUEUE_ORDERED_DRAIN_FLUSH && - ordered != QUEUE_ORDERED_DRAIN_FUA && - ordered != QUEUE_ORDERED_TAG && - ordered != QUEUE_ORDERED_TAG_FLUSH && - ordered != QUEUE_ORDERED_TAG_FUA) { - printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); - return -EINVAL; - } - - q->ordered = ordered; - q->next_ordered = ordered; - q->prepare_flush_fn = prepare_flush_fn; - - return 0; -} - -EXPORT_SYMBOL(blk_queue_ordered); - -/* - * Cache flushing for ordered writes handling - */ -inline unsigned blk_ordered_cur_seq(struct request_queue *q) -{ - if (!q->ordseq) - return 0; - return 1 << ffz(q->ordseq); -} - -unsigned blk_ordered_req_seq(struct request *rq) -{ - struct request_queue *q = rq->q; - - BUG_ON(q->ordseq == 0); - - if (rq == &q->pre_flush_rq) - return QUEUE_ORDSEQ_PREFLUSH; - if (rq == &q->bar_rq) - return QUEUE_ORDSEQ_BAR; - if (rq == &q->post_flush_rq) - return QUEUE_ORDSEQ_POSTFLUSH; - - /* - * !fs requests don't need to follow barrier ordering. Always - * put them at the front. This fixes the following deadlock. - * - * http://thread.gmane.org/gmane.linux.kernel/537473 - */ - if (!blk_fs_request(rq)) - return QUEUE_ORDSEQ_DRAIN; - - if ((rq->cmd_flags & REQ_ORDERED_COLOR) == - (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) - return QUEUE_ORDSEQ_DRAIN; - else - return QUEUE_ORDSEQ_DONE; -} - -void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) -{ - struct request *rq; - - if (error && !q->orderr) - q->orderr = error; - - BUG_ON(q->ordseq & seq); - q->ordseq |= seq; - - if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) - return; - - /* - * Okay, sequence complete. - */ - q->ordseq = 0; - rq = q->orig_bar_rq; - - if (__blk_end_request(rq, q->orderr, blk_rq_bytes(rq))) - BUG(); -} - -static void pre_flush_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); -} - -static void bar_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); -} - -static void post_flush_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); -} - -static void queue_flush(struct request_queue *q, unsigned which) -{ - struct request *rq; - rq_end_io_fn *end_io; - - if (which == QUEUE_ORDERED_PREFLUSH) { - rq = &q->pre_flush_rq; - end_io = pre_flush_end_io; - } else { - rq = &q->post_flush_rq; - end_io = post_flush_end_io; - } - - rq->cmd_flags = REQ_HARDBARRIER; - rq_init(q, rq); - rq->elevator_private = NULL; - rq->elevator_private2 = NULL; - rq->rq_disk = q->bar_rq.rq_disk; - rq->end_io = end_io; - q->prepare_flush_fn(q, rq); - - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); -} - -static inline struct request *start_ordered(struct request_queue *q, - struct request *rq) -{ - q->orderr = 0; - q->ordered = q->next_ordered; - q->ordseq |= QUEUE_ORDSEQ_STARTED; - - /* - * Prep proxy barrier request. - */ - blkdev_dequeue_request(rq); - q->orig_bar_rq = rq; - rq = &q->bar_rq; - rq->cmd_flags = 0; - rq_init(q, rq); - if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) - rq->cmd_flags |= REQ_RW; - if (q->ordered & QUEUE_ORDERED_FUA) - rq->cmd_flags |= REQ_FUA; - rq->elevator_private = NULL; - rq->elevator_private2 = NULL; - init_request_from_bio(rq, q->orig_bar_rq->bio); - rq->end_io = bar_end_io; - - /* - * Queue ordered sequence. As we stack them at the head, we - * need to queue in reverse order. Note that we rely on that - * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs - * request gets inbetween ordered sequence. If this request is - * an empty barrier, we don't need to do a postflush ever since - * there will be no data written between the pre and post flush. - * Hence a single flush will suffice. - */ - if ((q->ordered & QUEUE_ORDERED_POSTFLUSH) && !blk_empty_barrier(rq)) - queue_flush(q, QUEUE_ORDERED_POSTFLUSH); - else - q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH; - - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); - - if (q->ordered & QUEUE_ORDERED_PREFLUSH) { - queue_flush(q, QUEUE_ORDERED_PREFLUSH); - rq = &q->pre_flush_rq; - } else - q->ordseq |= QUEUE_ORDSEQ_PREFLUSH; - - if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0) - q->ordseq |= QUEUE_ORDSEQ_DRAIN; - else - rq = NULL; - - return rq; -} - -int blk_do_ordered(struct request_queue *q, struct request **rqp) -{ - struct request *rq = *rqp; - const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq); - - if (!q->ordseq) { - if (!is_barrier) - return 1; - - if (q->next_ordered != QUEUE_ORDERED_NONE) { - *rqp = start_ordered(q, rq); - return 1; - } else { - /* - * This can happen when the queue switches to - * ORDERED_NONE while this request is on it. - */ - blkdev_dequeue_request(rq); - if (__blk_end_request(rq, -EOPNOTSUPP, - blk_rq_bytes(rq))) - BUG(); - *rqp = NULL; - return 0; - } - } - - /* - * Ordered sequence in progress - */ - - /* Special requests are not subject to ordering rules. */ - if (!blk_fs_request(rq) && - rq != &q->pre_flush_rq && rq != &q->post_flush_rq) - return 1; - - if (q->ordered & QUEUE_ORDERED_TAG) { - /* Ordered by tag. Blocking the next barrier is enough. */ - if (is_barrier && rq != &q->bar_rq) - *rqp = NULL; - } else { - /* Ordered by draining. Wait for turn. */ - WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); - if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) - *rqp = NULL; - } - - return 1; -} - static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, int error) { @@ -528,279 +145,6 @@ static void req_bio_endio(struct request *rq, struct bio *bio, } } -/** - * blk_queue_bounce_limit - set bounce buffer limit for queue - * @q: the request queue for the device - * @dma_addr: bus address limit - * - * Description: - * Different hardware can have different requirements as to what pages - * it can do I/O directly to. A low level driver can call - * blk_queue_bounce_limit to have lower memory pages allocated as bounce - * buffers for doing I/O to pages residing above @page. - **/ -void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr) -{ - unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; - int dma = 0; - - q->bounce_gfp = GFP_NOIO; -#if BITS_PER_LONG == 64 - /* Assume anything <= 4GB can be handled by IOMMU. - Actually some IOMMUs can handle everything, but I don't - know of a way to test this here. */ - if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) - dma = 1; - q->bounce_pfn = max_low_pfn; -#else - if (bounce_pfn < blk_max_low_pfn) - dma = 1; - q->bounce_pfn = bounce_pfn; -#endif - if (dma) { - init_emergency_isa_pool(); - q->bounce_gfp = GFP_NOIO | GFP_DMA; - q->bounce_pfn = bounce_pfn; - } -} - -EXPORT_SYMBOL(blk_queue_bounce_limit); - -/** - * blk_queue_max_sectors - set max sectors for a request for this queue - * @q: the request queue for the device - * @max_sectors: max sectors in the usual 512b unit - * - * Description: - * Enables a low level driver to set an upper limit on the size of - * received requests. - **/ -void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) -{ - if ((max_sectors << 9) < PAGE_CACHE_SIZE) { - max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); - printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors); - } - - if (BLK_DEF_MAX_SECTORS > max_sectors) - q->max_hw_sectors = q->max_sectors = max_sectors; - else { - q->max_sectors = BLK_DEF_MAX_SECTORS; - q->max_hw_sectors = max_sectors; - } -} - -EXPORT_SYMBOL(blk_queue_max_sectors); - -/** - * blk_queue_max_phys_segments - set max phys segments for a request for this queue - * @q: the request queue for the device - * @max_segments: max number of segments - * - * Description: - * Enables a low level driver to set an upper limit on the number of - * physical data segments in a request. This would be the largest sized - * scatter list the driver could handle. - **/ -void blk_queue_max_phys_segments(struct request_queue *q, - unsigned short max_segments) -{ - if (!max_segments) { - max_segments = 1; - printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); - } - - q->max_phys_segments = max_segments; -} - -EXPORT_SYMBOL(blk_queue_max_phys_segments); - -/** - * blk_queue_max_hw_segments - set max hw segments for a request for this queue - * @q: the request queue for the device - * @max_segments: max number of segments - * - * Description: - * Enables a low level driver to set an upper limit on the number of - * hw data segments in a request. This would be the largest number of - * address/length pairs the host adapter can actually give as once - * to the device. - **/ -void blk_queue_max_hw_segments(struct request_queue *q, - unsigned short max_segments) -{ - if (!max_segments) { - max_segments = 1; - printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); - } - - q->max_hw_segments = max_segments; -} - -EXPORT_SYMBOL(blk_queue_max_hw_segments); - -/** - * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg - * @q: the request queue for the device - * @max_size: max size of segment in bytes - * - * Description: - * Enables a low level driver to set an upper limit on the size of a - * coalesced segment - **/ -void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) -{ - if (max_size < PAGE_CACHE_SIZE) { - max_size = PAGE_CACHE_SIZE; - printk("%s: set to minimum %d\n", __FUNCTION__, max_size); - } - - q->max_segment_size = max_size; -} - -EXPORT_SYMBOL(blk_queue_max_segment_size); - -/** - * blk_queue_hardsect_size - set hardware sector size for the queue - * @q: the request queue for the device - * @size: the hardware sector size, in bytes - * - * Description: - * This should typically be set to the lowest possible sector size - * that the hardware can operate on (possible without reverting to - * even internal read-modify-write operations). Usually the default - * of 512 covers most hardware. - **/ -void blk_queue_hardsect_size(struct request_queue *q, unsigned short size) -{ - q->hardsect_size = size; -} - -EXPORT_SYMBOL(blk_queue_hardsect_size); - -/* - * Returns the minimum that is _not_ zero, unless both are zero. - */ -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) - -/** - * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers - * @t: the stacking driver (top) - * @b: the underlying device (bottom) - **/ -void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) -{ - /* zero is "infinity" */ - t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors); - t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors); - - t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments); - t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); - t->max_segment_size = min(t->max_segment_size,b->max_segment_size); - t->hardsect_size = max(t->hardsect_size,b->hardsect_size); - if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) - clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags); -} - -EXPORT_SYMBOL(blk_queue_stack_limits); - -/** - * blk_queue_dma_drain - Set up a drain buffer for excess dma. - * - * @q: the request queue for the device - * @buf: physically contiguous buffer - * @size: size of the buffer in bytes - * - * Some devices have excess DMA problems and can't simply discard (or - * zero fill) the unwanted piece of the transfer. They have to have a - * real area of memory to transfer it into. The use case for this is - * ATAPI devices in DMA mode. If the packet command causes a transfer - * bigger than the transfer size some HBAs will lock up if there - * aren't DMA elements to contain the excess transfer. What this API - * does is adjust the queue so that the buf is always appended - * silently to the scatterlist. - * - * Note: This routine adjusts max_hw_segments to make room for - * appending the drain buffer. If you call - * blk_queue_max_hw_segments() or blk_queue_max_phys_segments() after - * calling this routine, you must set the limit to one fewer than your - * device can support otherwise there won't be room for the drain - * buffer. - */ -int blk_queue_dma_drain(struct request_queue *q, void *buf, - unsigned int size) -{ - if (q->max_hw_segments < 2 || q->max_phys_segments < 2) - return -EINVAL; - /* make room for appending the drain */ - --q->max_hw_segments; - --q->max_phys_segments; - q->dma_drain_buffer = buf; - q->dma_drain_size = size; - - return 0; -} - -EXPORT_SYMBOL_GPL(blk_queue_dma_drain); - -/** - * blk_queue_segment_boundary - set boundary rules for segment merging - * @q: the request queue for the device - * @mask: the memory boundary mask - **/ -void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) -{ - if (mask < PAGE_CACHE_SIZE - 1) { - mask = PAGE_CACHE_SIZE - 1; - printk("%s: set to minimum %lx\n", __FUNCTION__, mask); - } - - q->seg_boundary_mask = mask; -} - -EXPORT_SYMBOL(blk_queue_segment_boundary); - -/** - * blk_queue_dma_alignment - set dma length and memory alignment - * @q: the request queue for the device - * @mask: alignment mask - * - * description: - * set required memory and length aligment for direct dma transactions. - * this is used when buiding direct io requests for the queue. - * - **/ -void blk_queue_dma_alignment(struct request_queue *q, int mask) -{ - q->dma_alignment = mask; -} - -EXPORT_SYMBOL(blk_queue_dma_alignment); - -/** - * blk_queue_update_dma_alignment - update dma length and memory alignment - * @q: the request queue for the device - * @mask: alignment mask - * - * description: - * update required memory and length aligment for direct dma transactions. - * If the requested alignment is larger than the current alignment, then - * the current queue alignment is updated to the new value, otherwise it - * is left alone. The design of this is to allow multiple objects - * (driver, device, transport etc) to set their respective - * alignments without having them interfere. - * - **/ -void blk_queue_update_dma_alignment(struct request_queue *q, int mask) -{ - BUG_ON(mask > PAGE_SIZE); - - if (mask > q->dma_alignment) - q->dma_alignment = mask; -} - -EXPORT_SYMBOL(blk_queue_update_dma_alignment); - void blk_dump_rq_flags(struct request *rq, char *msg) { int bit; @@ -1074,8 +418,8 @@ static inline int ll_new_hw_segment(struct request_queue *q, return 1; } -static int ll_back_merge_fn(struct request_queue *q, struct request *req, - struct bio *bio) +int ll_back_merge_fn(struct request_queue *q, struct request *req, + struct bio *bio) { unsigned short max_sectors; int len; @@ -1285,7 +629,7 @@ static void blk_backing_dev_unplug(struct backing_dev_info *bdi, blk_unplug(q); } -static void blk_unplug_work(struct work_struct *work) +void blk_unplug_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, unplug_work); @@ -1296,7 +640,7 @@ static void blk_unplug_work(struct work_struct *work) q->unplug_fn(q); } -static void blk_unplug_timeout(unsigned long data) +void blk_unplug_timeout(unsigned long data) { struct request_queue *q = (struct request_queue *)data; @@ -1961,393 +1305,6 @@ void blk_insert_request(struct request_queue *q, struct request *rq, EXPORT_SYMBOL(blk_insert_request); -static int __blk_rq_unmap_user(struct bio *bio) -{ - int ret = 0; - - if (bio) { - if (bio_flagged(bio, BIO_USER_MAPPED)) - bio_unmap_user(bio); - else - ret = bio_uncopy_user(bio); - } - - return ret; -} - -int blk_rq_append_bio(struct request_queue *q, struct request *rq, - struct bio *bio) -{ - if (!rq->bio) - blk_rq_bio_prep(q, rq, bio); - else if (!ll_back_merge_fn(q, rq, bio)) - return -EINVAL; - else { - rq->biotail->bi_next = bio; - rq->biotail = bio; - - rq->data_len += bio->bi_size; - } - return 0; -} -EXPORT_SYMBOL(blk_rq_append_bio); - -static int __blk_rq_map_user(struct request_queue *q, struct request *rq, - void __user *ubuf, unsigned int len) -{ - unsigned long uaddr; - struct bio *bio, *orig_bio; - int reading, ret; - - reading = rq_data_dir(rq) == READ; - - /* - * if alignment requirement is satisfied, map in user pages for - * direct dma. else, set up kernel bounce buffers - */ - uaddr = (unsigned long) ubuf; - if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q))) - bio = bio_map_user(q, NULL, uaddr, len, reading); - else - bio = bio_copy_user(q, uaddr, len, reading); - - if (IS_ERR(bio)) - return PTR_ERR(bio); - - orig_bio = bio; - blk_queue_bounce(q, &bio); - - /* - * We link the bounce buffer in and could have to traverse it - * later so we have to get a ref to prevent it from being freed - */ - bio_get(bio); - - ret = blk_rq_append_bio(q, rq, bio); - if (!ret) - return bio->bi_size; - - /* if it was boucned we must call the end io function */ - bio_endio(bio, 0); - __blk_rq_unmap_user(orig_bio); - bio_put(bio); - return ret; -} - -/** - * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage - * @q: request queue where request should be inserted - * @rq: request structure to fill - * @ubuf: the user buffer - * @len: length of user data - * - * Description: - * Data will be mapped directly for zero copy io, if possible. Otherwise - * a kernel bounce buffer is used. - * - * A matching blk_rq_unmap_user() must be issued at the end of io, while - * still in process context. - * - * Note: The mapped bio may need to be bounced through blk_queue_bounce() - * before being submitted to the device, as pages mapped may be out of - * reach. It's the callers responsibility to make sure this happens. The - * original bio must be passed back in to blk_rq_unmap_user() for proper - * unmapping. - */ -int blk_rq_map_user(struct request_queue *q, struct request *rq, - void __user *ubuf, unsigned long len) -{ - unsigned long bytes_read = 0; - struct bio *bio = NULL; - int ret; - - if (len > (q->max_hw_sectors << 9)) - return -EINVAL; - if (!len || !ubuf) - return -EINVAL; - - while (bytes_read != len) { - unsigned long map_len, end, start; - - map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE); - end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1) - >> PAGE_SHIFT; - start = (unsigned long)ubuf >> PAGE_SHIFT; - - /* - * A bad offset could cause us to require BIO_MAX_PAGES + 1 - * pages. If this happens we just lower the requested - * mapping len by a page so that we can fit - */ - if (end - start > BIO_MAX_PAGES) - map_len -= PAGE_SIZE; - - ret = __blk_rq_map_user(q, rq, ubuf, map_len); - if (ret < 0) - goto unmap_rq; - if (!bio) - bio = rq->bio; - bytes_read += ret; - ubuf += ret; - } - - rq->buffer = rq->data = NULL; - return 0; -unmap_rq: - blk_rq_unmap_user(bio); - return ret; -} - -EXPORT_SYMBOL(blk_rq_map_user); - -/** - * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage - * @q: request queue where request should be inserted - * @rq: request to map data to - * @iov: pointer to the iovec - * @iov_count: number of elements in the iovec - * @len: I/O byte count - * - * Description: - * Data will be mapped directly for zero copy io, if possible. Otherwise - * a kernel bounce buffer is used. - * - * A matching blk_rq_unmap_user() must be issued at the end of io, while - * still in process context. - * - * Note: The mapped bio may need to be bounced through blk_queue_bounce() - * before being submitted to the device, as pages mapped may be out of - * reach. It's the callers responsibility to make sure this happens. The - * original bio must be passed back in to blk_rq_unmap_user() for proper - * unmapping. - */ -int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, - struct sg_iovec *iov, int iov_count, unsigned int len) -{ - struct bio *bio; - - if (!iov || iov_count <= 0) - return -EINVAL; - - /* we don't allow misaligned data like bio_map_user() does. If the - * user is using sg, they're expected to know the alignment constraints - * and respect them accordingly */ - bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ); - if (IS_ERR(bio)) - return PTR_ERR(bio); - - if (bio->bi_size != len) { - bio_endio(bio, 0); - bio_unmap_user(bio); - return -EINVAL; - } - - bio_get(bio); - blk_rq_bio_prep(q, rq, bio); - rq->buffer = rq->data = NULL; - return 0; -} - -EXPORT_SYMBOL(blk_rq_map_user_iov); - -/** - * blk_rq_unmap_user - unmap a request with user data - * @bio: start of bio list - * - * Description: - * Unmap a rq previously mapped by blk_rq_map_user(). The caller must - * supply the original rq->bio from the blk_rq_map_user() return, since - * the io completion may have changed rq->bio. - */ -int blk_rq_unmap_user(struct bio *bio) -{ - struct bio *mapped_bio; - int ret = 0, ret2; - - while (bio) { - mapped_bio = bio; - if (unlikely(bio_flagged(bio, BIO_BOUNCED))) - mapped_bio = bio->bi_private; - - ret2 = __blk_rq_unmap_user(mapped_bio); - if (ret2 && !ret) - ret = ret2; - - mapped_bio = bio; - bio = bio->bi_next; - bio_put(mapped_bio); - } - - return ret; -} - -EXPORT_SYMBOL(blk_rq_unmap_user); - -/** - * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage - * @q: request queue where request should be inserted - * @rq: request to fill - * @kbuf: the kernel buffer - * @len: length of user data - * @gfp_mask: memory allocation flags - */ -int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, - unsigned int len, gfp_t gfp_mask) -{ - struct bio *bio; - - if (len > (q->max_hw_sectors << 9)) - return -EINVAL; - if (!len || !kbuf) - return -EINVAL; - - bio = bio_map_kern(q, kbuf, len, gfp_mask); - if (IS_ERR(bio)) - return PTR_ERR(bio); - - if (rq_data_dir(rq) == WRITE) - bio->bi_rw |= (1 << BIO_RW); - - blk_rq_bio_prep(q, rq, bio); - blk_queue_bounce(q, &rq->bio); - rq->buffer = rq->data = NULL; - return 0; -} - -EXPORT_SYMBOL(blk_rq_map_kern); - -/** - * blk_execute_rq_nowait - insert a request into queue for execution - * @q: queue to insert the request in - * @bd_disk: matching gendisk - * @rq: request to insert - * @at_head: insert request at head or tail of queue - * @done: I/O completion handler - * - * Description: - * Insert a fully prepared request at the back of the io scheduler queue - * for execution. Don't wait for completion. - */ -void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, - struct request *rq, int at_head, - rq_end_io_fn *done) -{ - int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; - - rq->rq_disk = bd_disk; - rq->cmd_flags |= REQ_NOMERGE; - rq->end_io = done; - WARN_ON(irqs_disabled()); - spin_lock_irq(q->queue_lock); - __elv_add_request(q, rq, where, 1); - __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); -} -EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); - -/** - * blk_execute_rq - insert a request into queue for execution - * @q: queue to insert the request in - * @bd_disk: matching gendisk - * @rq: request to insert - * @at_head: insert request at head or tail of queue - * - * Description: - * Insert a fully prepared request at the back of the io scheduler queue - * for execution and wait for completion. - */ -int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, - struct request *rq, int at_head) -{ - DECLARE_COMPLETION_ONSTACK(wait); - char sense[SCSI_SENSE_BUFFERSIZE]; - int err = 0; - - /* - * we need an extra reference to the request, so we can look at - * it after io completion - */ - rq->ref_count++; - - if (!rq->sense) { - memset(sense, 0, sizeof(sense)); - rq->sense = sense; - rq->sense_len = 0; - } - - rq->end_io_data = &wait; - blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); - wait_for_completion(&wait); - - if (rq->errors) - err = -EIO; - - return err; -} - -EXPORT_SYMBOL(blk_execute_rq); - -static void bio_end_empty_barrier(struct bio *bio, int err) -{ - if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - - complete(bio->bi_private); -} - -/** - * blkdev_issue_flush - queue a flush - * @bdev: blockdev to issue flush for - * @error_sector: error sector - * - * Description: - * Issue a flush for the block device in question. Caller can supply - * room for storing the error offset in case of a flush error, if they - * wish to. Caller must run wait_for_completion() on its own. - */ -int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) -{ - DECLARE_COMPLETION_ONSTACK(wait); - struct request_queue *q; - struct bio *bio; - int ret; - - if (bdev->bd_disk == NULL) - return -ENXIO; - - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - - bio = bio_alloc(GFP_KERNEL, 0); - if (!bio) - return -ENOMEM; - - bio->bi_end_io = bio_end_empty_barrier; - bio->bi_private = &wait; - bio->bi_bdev = bdev; - submit_bio(1 << BIO_RW_BARRIER, bio); - - wait_for_completion(&wait); - - /* - * The driver must store the error location in ->bi_sector, if - * it supports it. For non-stacked drivers, this should be copied - * from rq->sector. - */ - if (error_sector) - *error_sector = bio->bi_sector; - - ret = 0; - if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - - bio_put(bio); - return ret; -} - -EXPORT_SYMBOL(blkdev_issue_flush); - static void drive_stat_acct(struct request *rq, int new_io) { int rw = rq_data_dir(rq); @@ -2459,26 +1416,6 @@ void blk_put_request(struct request *req) EXPORT_SYMBOL(blk_put_request); -/** - * blk_end_sync_rq - executes a completion event on a request - * @rq: request to complete - * @error: end io status of the request - */ -void blk_end_sync_rq(struct request *rq, int error) -{ - struct completion *waiting = rq->end_io_data; - - rq->end_io_data = NULL; - __blk_put_request(rq->q, rq); - - /* - * complete last, if this is a stack request the process (and thus - * the rq pointer) could be invalid right after this complete() - */ - complete(waiting); -} -EXPORT_SYMBOL(blk_end_sync_rq); - /* * Has to be called with the request spinlock acquired */ @@ -2557,7 +1494,7 @@ static inline int attempt_front_merge(struct request_queue *q, return 0; } -static void init_request_from_bio(struct request *req, struct bio *bio) +void init_request_from_bio(struct request *req, struct bio *bio) { req->cmd_type = REQ_TYPE_FS; @@ -3524,8 +2461,8 @@ int blk_end_request_callback(struct request *rq, int error, int nr_bytes, } EXPORT_SYMBOL_GPL(blk_end_request_callback); -static void blk_rq_bio_prep(struct request_queue *q, struct request *rq, - struct bio *bio) +void blk_rq_bio_prep(struct request_queue *q, struct request *rq, + struct bio *bio) { /* first two bits are identical in rq->cmd_flags and bio->bi_rw */ rq->cmd_flags |= (bio->bi_rw & 3); @@ -3571,188 +2508,12 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("blkdev_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); - iocontext_cachep = kmem_cache_create("blkdev_ioc", - sizeof(struct io_context), 0, SLAB_PANIC, NULL); - for_each_possible_cpu(i) INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); register_hotcpu_notifier(&blk_cpu_notifier); - blk_max_low_pfn = max_low_pfn - 1; - blk_max_pfn = max_pfn - 1; - - return 0; -} - -static void cfq_dtor(struct io_context *ioc) -{ - struct cfq_io_context *cic[1]; - int r; - - /* - * We don't have a specific key to lookup with, so use the gang - * lookup to just retrieve the first item stored. The cfq exit - * function will iterate the full tree, so any member will do. - */ - r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1); - if (r > 0) - cic[0]->dtor(ioc); -} - -/* - * IO Context helper functions. put_io_context() returns 1 if there are no - * more users of this io context, 0 otherwise. - */ -int put_io_context(struct io_context *ioc) -{ - if (ioc == NULL) - return 1; - - BUG_ON(atomic_read(&ioc->refcount) == 0); - - if (atomic_dec_and_test(&ioc->refcount)) { - rcu_read_lock(); - if (ioc->aic && ioc->aic->dtor) - ioc->aic->dtor(ioc->aic); - rcu_read_unlock(); - cfq_dtor(ioc); - - kmem_cache_free(iocontext_cachep, ioc); - return 1; - } return 0; } -EXPORT_SYMBOL(put_io_context); - -static void cfq_exit(struct io_context *ioc) -{ - struct cfq_io_context *cic[1]; - int r; - - rcu_read_lock(); - /* - * See comment for cfq_dtor() - */ - r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1); - rcu_read_unlock(); - - if (r > 0) - cic[0]->exit(ioc); -} - -/* Called by the exitting task */ -void exit_io_context(void) -{ - struct io_context *ioc; - - task_lock(current); - ioc = current->io_context; - current->io_context = NULL; - task_unlock(current); - - if (atomic_dec_and_test(&ioc->nr_tasks)) { - if (ioc->aic && ioc->aic->exit) - ioc->aic->exit(ioc->aic); - cfq_exit(ioc); - - put_io_context(ioc); - } -} - -struct io_context *alloc_io_context(gfp_t gfp_flags, int node) -{ - struct io_context *ret; - - ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); - if (ret) { - atomic_set(&ret->refcount, 1); - atomic_set(&ret->nr_tasks, 1); - spin_lock_init(&ret->lock); - ret->ioprio_changed = 0; - ret->ioprio = 0; - ret->last_waited = jiffies; /* doesn't matter... */ - ret->nr_batch_requests = 0; /* because this is 0 */ - ret->aic = NULL; - INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); - ret->ioc_data = NULL; - } - - return ret; -} - -/* - * If the current task has no IO context then create one and initialise it. - * Otherwise, return its existing IO context. - * - * This returned IO context doesn't have a specifically elevated refcount, - * but since the current task itself holds a reference, the context can be - * used in general code, so long as it stays within `current` context. - */ -static struct io_context *current_io_context(gfp_t gfp_flags, int node) -{ - struct task_struct *tsk = current; - struct io_context *ret; - - ret = tsk->io_context; - if (likely(ret)) - return ret; - - ret = alloc_io_context(gfp_flags, node); - if (ret) { - /* make sure set_task_ioprio() sees the settings above */ - smp_wmb(); - tsk->io_context = ret; - } - - return ret; -} - -/* - * If the current task has no IO context then create one and initialise it. - * If it does have a context, take a ref on it. - * - * This is always called in the context of the task which submitted the I/O. - */ -struct io_context *get_io_context(gfp_t gfp_flags, int node) -{ - struct io_context *ret = NULL; - - /* - * Check for unlikely race with exiting task. ioc ref count is - * zero when ioc is being detached. - */ - do { - ret = current_io_context(gfp_flags, node); - if (unlikely(!ret)) - break; - } while (!atomic_inc_not_zero(&ret->refcount)); - - return ret; -} -EXPORT_SYMBOL(get_io_context); - -void copy_io_context(struct io_context **pdst, struct io_context **psrc) -{ - struct io_context *src = *psrc; - struct io_context *dst = *pdst; - - if (src) { - BUG_ON(atomic_read(&src->refcount) == 0); - atomic_inc(&src->refcount); - put_io_context(dst); - *pdst = src; - } -} -EXPORT_SYMBOL(copy_io_context); - -void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) -{ - struct io_context *temp; - temp = *ioc1; - *ioc1 = *ioc2; - *ioc2 = temp; -} -EXPORT_SYMBOL(swap_io_context); diff --git a/block/blk-exec.c b/block/blk-exec.c new file mode 100644 index 00000000000..ebfb44e959a --- /dev/null +++ b/block/blk-exec.c @@ -0,0 +1,105 @@ +/* + * Functions related to setting various queue properties from drivers + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/bio.h> +#include <linux/blkdev.h> + +#include "blk.h" + +/* + * for max sense size + */ +#include <scsi/scsi_cmnd.h> + +/** + * blk_end_sync_rq - executes a completion event on a request + * @rq: request to complete + * @error: end io status of the request + */ +void blk_end_sync_rq(struct request *rq, int error) +{ + struct completion *waiting = rq->end_io_data; + + rq->end_io_data = NULL; + __blk_put_request(rq->q, rq); + + /* + * complete last, if this is a stack request the process (and thus + * the rq pointer) could be invalid right after this complete() + */ + complete(waiting); +} +EXPORT_SYMBOL(blk_end_sync_rq); + +/** + * blk_execute_rq_nowait - insert a request into queue for execution + * @q: queue to insert the request in + * @bd_disk: matching gendisk + * @rq: request to insert + * @at_head: insert request at head or tail of queue + * @done: I/O completion handler + * + * Description: + * Insert a fully prepared request at the back of the io scheduler queue + * for execution. Don't wait for completion. + */ +void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, + struct request *rq, int at_head, + rq_end_io_fn *done) +{ + int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; + + rq->rq_disk = bd_disk; + rq->cmd_flags |= REQ_NOMERGE; + rq->end_io = done; + WARN_ON(irqs_disabled()); + spin_lock_irq(q->queue_lock); + __elv_add_request(q, rq, where, 1); + __generic_unplug_device(q); + spin_unlock_irq(q->queue_lock); +} +EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); + +/** + * blk_execute_rq - insert a request into queue for execution + * @q: queue to insert the request in + * @bd_disk: matching gendisk + * @rq: request to insert + * @at_head: insert request at head or tail of queue + * + * Description: + * Insert a fully prepared request at the back of the io scheduler queue + * for execution and wait for completion. + */ +int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, + struct request *rq, int at_head) +{ + DECLARE_COMPLETION_ONSTACK(wait); + char sense[SCSI_SENSE_BUFFERSIZE]; + int err = 0; + + /* + * we need an extra reference to the request, so we can look at + * it after io completion + */ + rq->ref_count++; + + if (!rq->sense) { + memset(sense, 0, sizeof(sense)); + rq->sense = sense; + rq->sense_len = 0; + } + + rq->end_io_data = &wait; + blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); + wait_for_completion(&wait); + + if (rq->errors) + err = -EIO; + + return err; +} + +EXPORT_SYMBOL(blk_execute_rq); diff --git a/block/blk-ioc.c b/block/blk-ioc.c new file mode 100644 index 00000000000..6d1675508eb --- /dev/null +++ b/block/blk-ioc.c @@ -0,0 +1,194 @@ +/* + * Functions related to io context handling + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ + +#include "blk.h" + +/* + * For io context allocations + */ +static struct kmem_cache *iocontext_cachep; + +static void cfq_dtor(struct io_context *ioc) +{ + struct cfq_io_context *cic[1]; + int r; + + /* + * We don't have a specific key to lookup with, so use the gang + * lookup to just retrieve the first item stored. The cfq exit + * function will iterate the full tree, so any member will do. + */ + r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1); + if (r > 0) + cic[0]->dtor(ioc); +} + +/* + * IO Context helper functions. put_io_context() returns 1 if there are no + * more users of this io context, 0 otherwise. + */ +int put_io_context(struct io_context *ioc) +{ + if (ioc == NULL) + return 1; + + BUG_ON(atomic_read(&ioc->refcount) == 0); + + if (atomic_dec_and_test(&ioc->refcount)) { + rcu_read_lock(); + if (ioc->aic && ioc->aic->dtor) + ioc->aic->dtor(ioc->aic); + rcu_read_unlock(); + cfq_dtor(ioc); + + kmem_cache_free(iocontext_cachep, ioc); + return 1; + } + return 0; +} +EXPORT_SYMBOL(put_io_context); + +static void cfq_exit(struct io_context *ioc) +{ + struct cfq_io_context *cic[1]; + int r; + + rcu_read_lock(); + /* + * See comment for cfq_dtor() + */ + r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1); + rcu_read_unlock(); + + if (r > 0) + cic[0]->exit(ioc); +} + +/* Called by the exitting task */ +void exit_io_context(void) +{ + struct io_context *ioc; + + task_lock(current); + ioc = current->io_context; + current->io_context = NULL; + task_unlock(current); + + if (atomic_dec_and_test(&ioc->nr_tasks)) { + if (ioc->aic && ioc->aic->exit) + ioc->aic->exit(ioc->aic); + cfq_exit(ioc); + + put_io_context(ioc); + } +} + +struct io_context *alloc_io_context(gfp_t gfp_flags, int node) +{ + struct io_context *ret; + + ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); + if (ret) { + atomic_set(&ret->refcount, 1); + atomic_set(&ret->nr_tasks, 1); + spin_lock_init(&ret->lock); + ret->ioprio_changed = 0; + ret->ioprio = 0; + ret->last_waited = jiffies; /* doesn't matter... */ + ret->nr_batch_requests = 0; /* because this is 0 */ + ret->aic = NULL; + INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); + ret->ioc_data = NULL; + } + + return ret; +} + +/* + * If the current task has no IO context then create one and initialise it. + * Otherwise, return its existing IO context. + * + * This returned IO context doesn't have a specifically elevated refcount, + * but since the current task itself holds a reference, the context can be + * used in general code, so long as it stays within `current` context. + */ +struct io_context *current_io_context(gfp_t gfp_flags, int node) +{ + struct task_struct *tsk = current; + struct io_context *ret; + + ret = tsk->io_context; + if (likely(ret)) + return ret; + + ret = alloc_io_context(gfp_flags, node); + if (ret) { + /* make sure set_task_ioprio() sees the settings above */ + smp_wmb(); + tsk->io_context = ret; + } + + return ret; +} + +/* + * If the current task has no IO context then create one and initialise it. + * If it does have a context, take a ref on it. + * + * This is always called in the context of the task which submitted the I/O. + */ +struct io_context *get_io_context(gfp_t gfp_flags, int node) +{ + struct io_context *ret = NULL; + + /* + * Check for unlikely race with exiting task. ioc ref count is + * zero when ioc is being detached. + */ + do { + ret = current_io_context(gfp_flags, node); + if (unlikely(!ret)) + break; + } while (!atomic_inc_not_zero(&ret->refcount)); + + return ret; +} +EXPORT_SYMBOL(get_io_context); + +void copy_io_context(struct io_context **pdst, struct io_context **psrc) +{ + struct io_context *src = *psrc; + struct io_context *dst = *pdst; + + if (src) { + BUG_ON(atomic_read(&src->refcount) == 0); + atomic_inc(&src->refcount); + put_io_context(dst); + *pdst = src; + } +} +EXPORT_SYMBOL(copy_io_context); + +void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) +{ + struct io_context *temp; + temp = *ioc1; + *ioc1 = *ioc2; + *ioc2 = temp; +} +EXPORT_SYMBOL(swap_io_context); + +int __init blk_ioc_init(void) +{ + iocontext_cachep = kmem_cache_create("blkdev_ioc", + sizeof(struct io_context), 0, SLAB_PANIC, NULL); + return 0; +} +subsys_initcall(blk_ioc_init); diff --git a/block/blk-map.c b/block/blk-map.c new file mode 100644 index 00000000000..916cfc96ffa --- /dev/null +++ b/block/blk-map.c @@ -0,0 +1,264 @@ +/* + * Functions related to mapping data to requests + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/bio.h> +#include <linux/blkdev.h> + +#include "blk.h" + +int blk_rq_append_bio(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + if (!rq->bio) + blk_rq_bio_prep(q, rq, bio); + else if (!ll_back_merge_fn(q, rq, bio)) + return -EINVAL; + else { + rq->biotail->bi_next = bio; + rq->biotail = bio; + + rq->data_len += bio->bi_size; + } + return 0; +} +EXPORT_SYMBOL(blk_rq_append_bio); + +static int __blk_rq_unmap_user(struct bio *bio) +{ + int ret = 0; + + if (bio) { + if (bio_flagged(bio, BIO_USER_MAPPED)) + bio_unmap_user(bio); + else + ret = bio_uncopy_user(bio); + } + + return ret; +} + +static int __blk_rq_map_user(struct request_queue *q, struct request *rq, + void __user *ubuf, unsigned int len) +{ + unsigned long uaddr; + struct bio *bio, *orig_bio; + int reading, ret; + + reading = rq_data_dir(rq) == READ; + + /* + * if alignment requirement is satisfied, map in user pages for + * direct dma. else, set up kernel bounce buffers + */ + uaddr = (unsigned long) ubuf; + if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q))) + bio = bio_map_user(q, NULL, uaddr, len, reading); + else + bio = bio_copy_user(q, uaddr, len, reading); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + orig_bio = bio; + blk_queue_bounce(q, &bio); + + /* + * We link the bounce buffer in and could have to traverse it + * later so we have to get a ref to prevent it from being freed + */ + bio_get(bio); + + ret = blk_rq_append_bio(q, rq, bio); + if (!ret) + return bio->bi_size; + + /* if it was boucned we must call the end io function */ + bio_endio(bio, 0); + __blk_rq_unmap_user(orig_bio); + bio_put(bio); + return ret; +} + +/** + * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage + * @q: request queue where request should be inserted + * @rq: request structure to fill + * @ubuf: the user buffer + * @len: length of user data + * + * Description: + * Data will be mapped directly for zero copy io, if possible. Otherwise + * a kernel bounce buffer is used. + * + * A matching blk_rq_unmap_user() must be issued at the end of io, while + * still in process context. + * + * Note: The mapped bio may need to be bounced through blk_queue_bounce() + * before being submitted to the device, as pages mapped may be out of + * reach. It's the callers responsibility to make sure this happens. The + * original bio must be passed back in to blk_rq_unmap_user() for proper + * unmapping. + */ +int blk_rq_map_user(struct request_queue *q, struct request *rq, + void __user *ubuf, unsigned long len) +{ + unsigned long bytes_read = 0; + struct bio *bio = NULL; + int ret; + + if (len > (q->max_hw_sectors << 9)) + return -EINVAL; + if (!len || !ubuf) + return -EINVAL; + + while (bytes_read != len) { + unsigned long map_len, end, start; + + map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE); + end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1) + >> PAGE_SHIFT; + start = (unsigned long)ubuf >> PAGE_SHIFT; + + /* + * A bad offset could cause us to require BIO_MAX_PAGES + 1 + * pages. If this happens we just lower the requested + * mapping len by a page so that we can fit + */ + if (end - start > BIO_MAX_PAGES) + map_len -= PAGE_SIZE; + + ret = __blk_rq_map_user(q, rq, ubuf, map_len); + if (ret < 0) + goto unmap_rq; + if (!bio) + bio = rq->bio; + bytes_read += ret; + ubuf += ret; + } + + rq->buffer = rq->data = NULL; + return 0; +unmap_rq: + blk_rq_unmap_user(bio); + return ret; +} + +EXPORT_SYMBOL(blk_rq_map_user); + +/** + * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage + * @q: request queue where request should be inserted + * @rq: request to map data to + * @iov: pointer to the iovec + * @iov_count: number of elements in the iovec + * @len: I/O byte count + * + * Description: + * Data will be mapped directly for zero copy io, if possible. Otherwise + * a kernel bounce buffer is used. + * + * A matching blk_rq_unmap_user() must be issued at the end of io, while + * still in process context. + * + * Note: The mapped bio may need to be bounced through blk_queue_bounce() + * before being submitted to the device, as pages mapped may be out of + * reach. It's the callers responsibility to make sure this happens. The + * original bio must be passed back in to blk_rq_unmap_user() for proper + * unmapping. + */ +int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, + struct sg_iovec *iov, int iov_count, unsigned int len) +{ + struct bio *bio; + + if (!iov || iov_count <= 0) + return -EINVAL; + + /* we don't allow misaligned data like bio_map_user() does. If the + * user is using sg, they're expected to know the alignment constraints + * and respect them accordingly */ + bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ); + if (IS_ERR(bio)) + return PTR_ERR(bio); + + if (bio->bi_size != len) { + bio_endio(bio, 0); + bio_unmap_user(bio); + return -EINVAL; + } + + bio_get(bio); + blk_rq_bio_prep(q, rq, bio); + rq->buffer = rq->data = NULL; + return 0; +} + +EXPORT_SYMBOL(blk_rq_map_user_iov); + +/** + * blk_rq_unmap_user - unmap a request with user data + * @bio: start of bio list + * + * Description: + * Unmap a rq previously mapped by blk_rq_map_user(). The caller must + * supply the original rq->bio from the blk_rq_map_user() return, since + * the io completion may have changed rq->bio. + */ +int blk_rq_unmap_user(struct bio *bio) +{ + struct bio *mapped_bio; + int ret = 0, ret2; + + while (bio) { + mapped_bio = bio; + if (unlikely(bio_flagged(bio, BIO_BOUNCED))) + mapped_bio = bio->bi_private; + + ret2 = __blk_rq_unmap_user(mapped_bio); + if (ret2 && !ret) + ret = ret2; + + mapped_bio = bio; + bio = bio->bi_next; + bio_put(mapped_bio); + } + + return ret; +} + +EXPORT_SYMBOL(blk_rq_unmap_user); + +/** + * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage + * @q: request queue where request should be inserted + * @rq: request to fill + * @kbuf: the kernel buffer + * @len: length of user data + * @gfp_mask: memory allocation flags + */ +int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, + unsigned int len, gfp_t gfp_mask) +{ + struct bio *bio; + + if (len > (q->max_hw_sectors << 9)) + return -EINVAL; + if (!len || !kbuf) + return -EINVAL; + + bio = bio_map_kern(q, kbuf, len, gfp_mask); + if (IS_ERR(bio)) + return PTR_ERR(bio); + + if (rq_data_dir(rq) == WRITE) + bio->bi_rw |= (1 << BIO_RW); + + blk_rq_bio_prep(q, rq, bio); + blk_queue_bounce(q, &rq->bio); + rq->buffer = rq->data = NULL; + return 0; +} + +EXPORT_SYMBOL(blk_rq_map_kern); diff --git a/block/blk-settings.c b/block/blk-settings.c new file mode 100644 index 00000000000..4df09a1b8f4 --- /dev/null +++ b/block/blk-settings.c @@ -0,0 +1,402 @@ +/* + * Functions related to setting various queue properties from drivers + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ + +#include "blk.h" + +unsigned long blk_max_low_pfn, blk_max_pfn; +EXPORT_SYMBOL(blk_max_low_pfn); +EXPORT_SYMBOL(blk_max_pfn); + +/** + * blk_queue_prep_rq - set a prepare_request function for queue + * @q: queue + * @pfn: prepare_request function + * + * It's possible for a queue to register a prepare_request callback which + * is invoked before the request is handed to the request_fn. The goal of + * the function is to prepare a request for I/O, it can be used to build a + * cdb from the request data for instance. + * + */ +void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) +{ + q->prep_rq_fn = pfn; +} + +EXPORT_SYMBOL(blk_queue_prep_rq); + +/** + * blk_queue_merge_bvec - set a merge_bvec function for queue + * @q: queue + * @mbfn: merge_bvec_fn + * + * Usually queues have static limitations on the max sectors or segments that + * we can put in a request. Stacking drivers may have some settings that + * are dynamic, and thus we have to query the queue whether it is ok to + * add a new bio_vec to a bio at a given offset or not. If the block device + * has such limitations, it needs to register a merge_bvec_fn to control + * the size of bio's sent to it. Note that a block device *must* allow a + * single page to be added to an empty bio. The block device driver may want + * to use the bio_split() function to deal with these bio's. By default + * no merge_bvec_fn is defined for a queue, and only the fixed limits are + * honored. + */ +void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn) +{ + q->merge_bvec_fn = mbfn; +} + +EXPORT_SYMBOL(blk_queue_merge_bvec); + +void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) +{ + q->softirq_done_fn = fn; +} + +EXPORT_SYMBOL(blk_queue_softirq_done); + +/** + * blk_queue_make_request - define an alternate make_request function for a device + * @q: the request queue for the device to be affected + * @mfn: the alternate make_request function + * + * Description: + * The normal way for &struct bios to be passed to a device + * driver is for them to be collected into requests on a request + * queue, and then to allow the device driver to select requests + * off that queue when it is ready. This works well for many block + * devices. However some block devices (typically virtual devices + * such as md or lvm) do not benefit from the processing on the + * request queue, and are served best by having the requests passed + * directly to them. This can be achieved by providing a function + * to blk_queue_make_request(). + * + * Caveat: + * The driver that does this *must* be able to deal appropriately + * with buffers in "highmemory". This can be accomplished by either calling + * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling + * blk_queue_bounce() to create a buffer in normal memory. + **/ +void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn) +{ + /* + * set defaults + */ + q->nr_requests = BLKDEV_MAX_RQ; + blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); + blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); + q->make_request_fn = mfn; + q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; + q->backing_dev_info.state = 0; + q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; + blk_queue_max_sectors(q, SAFE_MAX_SECTORS); + blk_queue_hardsect_size(q, 512); + blk_queue_dma_alignment(q, 511); + blk_queue_congestion_threshold(q); + q->nr_batching = BLK_BATCH_REQ; + + q->unplug_thresh = 4; /* hmm */ + q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ + if (q->unplug_delay == 0) + q->unplug_delay = 1; + + INIT_WORK(&q->unplug_work, blk_unplug_work); + + q->unplug_timer.function = blk_unplug_timeout; + q->unplug_timer.data = (unsigned long)q; + + /* + * by default assume old behaviour and bounce for any highmem page + */ + blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); +} + +EXPORT_SYMBOL(blk_queue_make_request); + +/** + * blk_queue_bounce_limit - set bounce buffer limit for queue + * @q: the request queue for the device + * @dma_addr: bus address limit + * + * Description: + * Different hardware can have different requirements as to what pages + * it can do I/O directly to. A low level driver can call + * blk_queue_bounce_limit to have lower memory pages allocated as bounce + * buffers for doing I/O to pages residing above @page. + **/ +void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr) +{ + unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; + int dma = 0; + + q->bounce_gfp = GFP_NOIO; +#if BITS_PER_LONG == 64 + /* Assume anything <= 4GB can be handled by IOMMU. + Actually some IOMMUs can handle everything, but I don't + know of a way to test this here. */ + if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) + dma = 1; + q->bounce_pfn = max_low_pfn; +#else + if (bounce_pfn < blk_max_low_pfn) + dma = 1; + q->bounce_pfn = bounce_pfn; +#endif + if (dma) { + init_emergency_isa_pool(); + q->bounce_gfp = GFP_NOIO | GFP_DMA; + q->bounce_pfn = bounce_pfn; + } +} + +EXPORT_SYMBOL(blk_queue_bounce_limit); + +/** + * blk_queue_max_sectors - set max sectors for a request for this queue + * @q: the request queue for the device + * @max_sectors: max sectors in the usual 512b unit + * + * Description: + * Enables a low level driver to set an upper limit on the size of + * received requests. + **/ +void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) +{ + if ((max_sectors << 9) < PAGE_CACHE_SIZE) { + max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); + printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors); + } + + if (BLK_DEF_MAX_SECTORS > max_sectors) + q->max_hw_sectors = q->max_sectors = max_sectors; + else { + q->max_sectors = BLK_DEF_MAX_SECTORS; + q->max_hw_sectors = max_sectors; + } +} + +EXPORT_SYMBOL(blk_queue_max_sectors); + +/** + * blk_queue_max_phys_segments - set max phys segments for a request for this queue + * @q: the request queue for the device + * @max_segments: max number of segments + * + * Description: + * Enables a low level driver to set an upper limit on the number of + * physical data segments in a request. This would be the largest sized + * scatter list the driver could handle. + **/ +void blk_queue_max_phys_segments(struct request_queue *q, + unsigned short max_segments) +{ + if (!max_segments) { + max_segments = 1; + printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); + } + + q->max_phys_segments = max_segments; +} + +EXPORT_SYMBOL(blk_queue_max_phys_segments); + +/** + * blk_queue_max_hw_segments - set max hw segments for a request for this queue + * @q: the request queue for the device + * @max_segments: max number of segments + * + * Description: + * Enables a low level driver to set an upper limit on the number of + * hw data segments in a request. This would be the largest number of + * address/length pairs the host adapter can actually give as once + * to the device. + **/ +void blk_queue_max_hw_segments(struct request_queue *q, + unsigned short max_segments) +{ + if (!max_segments) { + max_segments = 1; + printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); + } + + q->max_hw_segments = max_segments; +} + +EXPORT_SYMBOL(blk_queue_max_hw_segments); + +/** + * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg + * @q: the request queue for the device + * @max_size: max size of segment in bytes + * + * Description: + * Enables a low level driver to set an upper limit on the size of a + * coalesced segment + **/ +void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) +{ + if (max_size < PAGE_CACHE_SIZE) { + max_size = PAGE_CACHE_SIZE; + printk("%s: set to minimum %d\n", __FUNCTION__, max_size); + } + + q->max_segment_size = max_size; +} + +EXPORT_SYMBOL(blk_queue_max_segment_size); + +/** + * blk_queue_hardsect_size - set hardware sector size for the queue + * @q: the request queue for the device + * @size: the hardware sector size, in bytes + * + * Description: + * This should typically be set to the lowest possible sector size + * that the hardware can operate on (possible without reverting to + * even internal read-modify-write operations). Usually the default + * of 512 covers most hardware. + **/ +void blk_queue_hardsect_size(struct request_queue *q, unsigned short size) +{ + q->hardsect_size = size; +} + +EXPORT_SYMBOL(blk_queue_hardsect_size); + +/* + * Returns the minimum that is _not_ zero, unless both are zero. + */ +#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) + +/** + * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers + * @t: the stacking driver (top) + * @b: the underlying device (bottom) + **/ +void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) +{ + /* zero is "infinity" */ + t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors); + t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors); + + t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments); + t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); + t->max_segment_size = min(t->max_segment_size,b->max_segment_size); + t->hardsect_size = max(t->hardsect_size,b->hardsect_size); + if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) + clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags); +} + +EXPORT_SYMBOL(blk_queue_stack_limits); + +/** + * blk_queue_dma_drain - Set up a drain buffer for excess dma. + * + * @q: the request queue for the device + * @buf: physically contiguous buffer + * @size: size of the buffer in bytes + * + * Some devices have excess DMA problems and can't simply discard (or + * zero fill) the unwanted piece of the transfer. They have to have a + * real area of memory to transfer it into. The use case for this is + * ATAPI devices in DMA mode. If the packet command causes a transfer + * bigger than the transfer size some HBAs will lock up if there + * aren't DMA elements to contain the excess transfer. What this API + * does is adjust the queue so that the buf is always appended + * silently to the scatterlist. + * + * Note: This routine adjusts max_hw_segments to make room for + * appending the drain buffer. If you call + * blk_queue_max_hw_segments() or blk_queue_max_phys_segments() after + * calling this routine, you must set the limit to one fewer than your + * device can support otherwise there won't be room for the drain + * buffer. + */ +int blk_queue_dma_drain(struct request_queue *q, void *buf, + unsigned int size) +{ + if (q->max_hw_segments < 2 || q->max_phys_segments < 2) + return -EINVAL; + /* make room for appending the drain */ + --q->max_hw_segments; + --q->max_phys_segments; + q->dma_drain_buffer = buf; + q->dma_drain_size = size; + + return 0; +} + +EXPORT_SYMBOL_GPL(blk_queue_dma_drain); + +/** + * blk_queue_segment_boundary - set boundary rules for segment merging + * @q: the request queue for the device + * @mask: the memory boundary mask + **/ +void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) +{ + if (mask < PAGE_CACHE_SIZE - 1) { + mask = PAGE_CACHE_SIZE - 1; + printk("%s: set to minimum %lx\n", __FUNCTION__, mask); + } + + q->seg_boundary_mask = mask; +} + +EXPORT_SYMBOL(blk_queue_segment_boundary); + +/** + * blk_queue_dma_alignment - set dma length and memory alignment + * @q: the request queue for the device + * @mask: alignment mask + * + * description: + * set required memory and length aligment for direct dma transactions. + * this is used when buiding direct io requests for the queue. + * + **/ +void blk_queue_dma_alignment(struct request_queue *q, int mask) +{ + q->dma_alignment = mask; +} + +EXPORT_SYMBOL(blk_queue_dma_alignment); + +/** + * blk_queue_update_dma_alignment - update dma length and memory alignment + * @q: the request queue for the device + * @mask: alignment mask + * + * description: + * update required memory and length aligment for direct dma transactions. + * If the requested alignment is larger than the current alignment, then + * the current queue alignment is updated to the new value, otherwise it + * is left alone. The design of this is to allow multiple objects + * (driver, device, transport etc) to set their respective + * alignments without having them interfere. + * + **/ +void blk_queue_update_dma_alignment(struct request_queue *q, int mask) +{ + BUG_ON(mask > PAGE_SIZE); + + if (mask > q->dma_alignment) + q->dma_alignment = mask; +} + +EXPORT_SYMBOL(blk_queue_update_dma_alignment); + +int __init blk_settings_init(void) +{ + blk_max_low_pfn = max_low_pfn - 1; + blk_max_pfn = max_pfn - 1; + return 0; +} +subsys_initcall(blk_settings_init); diff --git a/block/blk.h b/block/blk.h index d88549df1b0..08339400719 100644 --- a/block/blk.h +++ b/block/blk.h @@ -1,11 +1,28 @@ #ifndef BLK_INTERNAL_H #define BLK_INTERNAL_H +/* Amount of time in which a process may batch requests */ +#define BLK_BATCH_TIME (HZ/50UL) + +/* Number of requests a "batching" process may submit */ +#define BLK_BATCH_REQ 32 + extern struct kmem_cache *blk_requestq_cachep; extern struct kobj_type blk_queue_ktype; +void rq_init(struct request_queue *q, struct request *rq); +void init_request_from_bio(struct request *req, struct bio *bio); +void blk_rq_bio_prep(struct request_queue *q, struct request *rq, + struct bio *bio); +int ll_back_merge_fn(struct request_queue *q, struct request *req, + struct bio *bio); void __blk_queue_free_tags(struct request_queue *q); +void blk_unplug_work(struct work_struct *work); +void blk_unplug_timeout(unsigned long data); + +struct io_context *current_io_context(gfp_t gfp_flags, int node); + void blk_queue_congestion_threshold(struct request_queue *q); /* |