From 6e66b49392419f3fe134e1be583323ef75da1e4b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 9 Mar 2020 21:26:17 -0700 Subject: blk-mq: Keep set->nr_hw_queues and set->map[].nr_queues in sync blk_mq_map_queues() and multiple .map_queues() implementations expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the number of hardware queues. Hence set .nr_queues before calling these functions. This patch fixes the following kernel warning: WARNING: CPU: 0 PID: 2501 at include/linux/cpumask.h:137 Call Trace: blk_mq_run_hw_queue+0x19d/0x350 block/blk-mq.c:1508 blk_mq_run_hw_queues+0x112/0x1a0 block/blk-mq.c:1525 blk_mq_requeue_work+0x502/0x780 block/blk-mq.c:775 process_one_work+0x9af/0x1740 kernel/workqueue.c:2269 worker_thread+0x98/0xe40 kernel/workqueue.c:2415 kthread+0x361/0x430 kernel/kthread.c:255 Fixes: ed76e329d74a ("blk-mq: abstract out queue map") # v5.0 Reported-by: syzbot+d44e1b26ce5c3e77458d@syzkaller.appspotmail.com Signed-off-by: Bart Van Assche Reviewed-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Cc: Johannes Thumshirn Cc: Hannes Reinecke Cc: Ming Lei Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index d92088dec6c3..d4bd9b961726 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3023,6 +3023,14 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) { + /* + * blk_mq_map_queues() and multiple .map_queues() implementations + * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the + * number of hardware queues. 
+ */ + if (set->nr_maps == 1) + set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; + if (set->ops->map_queues && !is_kdump_kernel()) { int i; -- cgit v1.2.3 From d0930bb8f46b8fb4a7d429c0bf1c91b3ed00a7cf Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 9 Mar 2020 21:26:18 -0700 Subject: blk-mq: Fix a recently introduced regression in blk_mq_realloc_hw_ctxs() q->nr_hw_queues must only be updated once it is known that blk_mq_realloc_hw_ctxs() has succeeded. Otherwise it can happen that reallocation fails and that q->nr_hw_queues is larger than the number of allocated hardware queues. This patch fixes the following crash if increasing the number of hardware queues fails: BUG: KASAN: null-ptr-deref in blk_mq_map_swqueue+0x775/0x810 Write of size 8 at addr 0000000000000118 by task check/977 CPU: 3 PID: 977 Comm: check Not tainted 5.6.0-rc1-dbg+ #8 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Call Trace: dump_stack+0xa5/0xe6 __kasan_report.cold+0x65/0x99 kasan_report+0x16/0x20 check_memory_region+0x140/0x1b0 memset+0x28/0x40 blk_mq_map_swqueue+0x775/0x810 blk_mq_update_nr_hw_queues+0x468/0x710 nullb_device_submit_queues_store+0xf7/0x1a0 [null_blk] configfs_write_file+0x1c4/0x250 [configfs] __vfs_write+0x4c/0x90 vfs_write+0x145/0x2c0 ksys_write+0xd7/0x180 __x64_sys_write+0x47/0x50 do_syscall_64+0x6f/0x2f0 entry_SYSCALL_64_after_hwframe+0x49/0xbe Fixes: ac0d6b926e74 ("block: Reduce the amount of memory required per request queue") Signed-off-by: Bart Van Assche Reviewed-by: Ming Lei Cc: Keith Busch Cc: Johannes Thumshirn Cc: Hannes Reinecke Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index d4bd9b961726..37ff8dfb8ab9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2824,7 +2824,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, memcpy(new_hctxs, hctxs, q->nr_hw_queues * sizeof(*hctxs)); q->queue_hw_ctx = 
new_hctxs; - q->nr_hw_queues = set->nr_hw_queues; kfree(hctxs); hctxs = new_hctxs; } -- cgit v1.2.3 From 30a2da7b7e225ef6c87a660419ea04d3cef3f6a7 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Wed, 11 Mar 2020 16:07:50 +0530 Subject: block: Fix use-after-free issue accessing struct io_cq There is a potential race between ioc_release_fn() and ioc_clear_queue() as shown below, due to which below kernel crash is observed. It also can result into use-after-free issue. context#1: context#2: ioc_release_fn() __ioc_clear_queue() gets the same icq ->spin_lock(&ioc->lock); ->spin_lock(&ioc->lock); ->ioc_destroy_icq(icq); ->list_del_init(&icq->q_node); ->call_rcu(&icq->__rcu_head, icq_free_icq_rcu); ->spin_unlock(&ioc->lock); ->ioc_destroy_icq(icq); ->hlist_del_init(&icq->ioc_node); This results into below crash as this memory is now used by icq->__rcu_head in context#1. There is a chance that icq could be free'd as well. 22150.386550: <6> Unable to handle kernel write to read-only memory at virtual address ffffffaa8d31ca50 ... Call trace: 22150.607350: <2> ioc_destroy_icq+0x44/0x110 22150.611202: <2> ioc_clear_queue+0xac/0x148 22150.615056: <2> blk_cleanup_queue+0x11c/0x1a0 22150.619174: <2> __scsi_remove_device+0xdc/0x128 22150.623465: <2> scsi_forget_host+0x2c/0x78 22150.627315: <2> scsi_remove_host+0x7c/0x2a0 22150.631257: <2> usb_stor_disconnect+0x74/0xc8 22150.635371: <2> usb_unbind_interface+0xc8/0x278 22150.639665: <2> device_release_driver_internal+0x198/0x250 22150.644897: <2> device_release_driver+0x24/0x30 22150.649176: <2> bus_remove_device+0xec/0x140 22150.653204: <2> device_del+0x270/0x460 22150.656712: <2> usb_disable_device+0x120/0x390 22150.660918: <2> usb_disconnect+0xf4/0x2e0 22150.664684: <2> hub_event+0xd70/0x17e8 22150.668197: <2> process_one_work+0x210/0x480 22150.672222: <2> worker_thread+0x32c/0x4c8 Fix this by adding a new ICQ_DESTROYED flag in ioc_destroy_icq() to indicate this icq is once marked as destroyed. 
Also, ensure __ioc_clear_queue() is accessing icq within rcu_read_lock/unlock so that icq doesn't get free'd up while it is still using it. Signed-off-by: Sahitya Tummala Co-developed-by: Pradeep P V K Signed-off-by: Pradeep P V K Signed-off-by: Jens Axboe --- block/blk-ioc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 5ed59ac6ae58..9df50fb507ca 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -84,6 +84,7 @@ static void ioc_destroy_icq(struct io_cq *icq) * making it impossible to determine icq_cache. Record it in @icq. */ icq->__rcu_icq_cache = et->icq_cache; + icq->flags |= ICQ_DESTROYED; call_rcu(&icq->__rcu_head, icq_free_icq_rcu); } @@ -212,15 +213,21 @@ static void __ioc_clear_queue(struct list_head *icq_list) { unsigned long flags; + rcu_read_lock(); while (!list_empty(icq_list)) { struct io_cq *icq = list_entry(icq_list->next, struct io_cq, q_node); struct io_context *ioc = icq->ioc; spin_lock_irqsave(&ioc->lock, flags); + if (icq->flags & ICQ_DESTROYED) { + spin_unlock_irqrestore(&ioc->lock, flags); + continue; + } ioc_destroy_icq(icq); spin_unlock_irqrestore(&ioc->lock, flags); } + rcu_read_unlock(); } /** -- cgit v1.2.3 From 0d72031820a710c82fde4ed61352ff2a4217e6dd Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 9 Mar 2020 22:41:33 +0100 Subject: block: fix comment for blk_cloned_rq_check_limits The later description mentions "checked against the new queue limits", so reword the summary line to match it and avoid confusion.
Signed-off-by: Guoqing Jiang Reviewed-by: Chaitanya Kulkarni Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 60dc9552ef8d..e26341aa2e3a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1203,7 +1203,7 @@ EXPORT_SYMBOL(submit_bio); /** * blk_cloned_rq_check_limits - Helper function to check a cloned request - * for new the queue limits + * for the new queue limits * @q: the queue * @rq: the request being checked * -- cgit v1.2.3 From 35ed78b32cbbb6499b82e7a3a6769fa14e4b3c92 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 9 Mar 2020 22:41:34 +0100 Subject: block: use bio_{wouldblock,io}_error in direct_make_request Use bio_wouldblock_error() and bio_io_error() instead of open-coding the status assignment followed by bio_endio(), to simplify the code. Signed-off-by: Guoqing Jiang Reviewed-by: Nikolay Borisov Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index e26341aa2e3a..9a78f62285f9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1121,10 +1121,9 @@ blk_qc_t direct_make_request(struct bio *bio) if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) { if (nowait && !blk_queue_dying(q)) - bio->bi_status = BLK_STS_AGAIN; + bio_wouldblock_error(bio); else - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); + bio_io_error(bio); return BLK_QC_T_NONE; } -- cgit v1.2.3 From fc4cc772102511de90e708e72754800686aa0043 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 9 Mar 2020 22:41:35 +0100 Subject: block: remove redundant setting of QUEUE_FLAG_DYING blk_cleanup_queue() has already called blk_set_queue_dying() to set the flag, so there is no need to set it again.
Signed-off-by: Guoqing Jiang Reviewed-by: Nikolay Borisov Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 9a78f62285f9..74edcadd6747 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -346,7 +346,6 @@ void blk_cleanup_queue(struct request_queue *q) blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); - blk_queue_flag_set(QUEUE_FLAG_DYING, q); /* * Drain all requests queued before DYING marking. Set DEAD flag to -- cgit v1.2.3 From 361301a222193c85bc53dbe64770271a4d4ff7f4 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 9 Mar 2020 22:41:36 +0100 Subject: block: cleanup for _blk/blk_rq_prep_clone Both cmd and sense have been moved to scsi_request, so remove the related comments to avoid confusion. Also, as Bart suggested, move __blk_rq_prep_clone() into its only caller, blk_rq_prep_clone(). Signed-off-by: Guoqing Jiang Signed-off-by: Jens Axboe --- block/blk-core.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 74edcadd6747..abfdcf81a228 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1581,23 +1581,6 @@ void blk_rq_unprep_clone(struct request *rq) } EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); -/* - * Copy attributes of the original request to the clone request. - * The actual data parts (e.g. ->cmd, ->sense) are not copied.
- */ -static void __blk_rq_prep_clone(struct request *dst, struct request *src) -{ - dst->__sector = blk_rq_pos(src); - dst->__data_len = blk_rq_bytes(src); - if (src->rq_flags & RQF_SPECIAL_PAYLOAD) { - dst->rq_flags |= RQF_SPECIAL_PAYLOAD; - dst->special_vec = src->special_vec; - } - dst->nr_phys_segments = src->nr_phys_segments; - dst->ioprio = src->ioprio; - dst->extra_len = src->extra_len; -} - /** * blk_rq_prep_clone - Helper function to setup clone request * @rq: the request to be setup @@ -1610,8 +1593,6 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src) * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. - * The actual data parts of @rq_src (e.g. ->cmd, ->sense) - * are not copied, and copying such parts is the caller's responsibility. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. * So cloned bios must be completed before original bios, which means @@ -1642,7 +1623,16 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, rq->bio = rq->biotail = bio; } - __blk_rq_prep_clone(rq, rq_src); + /* Copy attributes of the original request to the clone request. */ + rq->__sector = blk_rq_pos(rq_src); + rq->__data_len = blk_rq_bytes(rq_src); + if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { + rq->rq_flags |= RQF_SPECIAL_PAYLOAD; + rq->special_vec = rq_src->special_vec; + } + rq->nr_phys_segments = rq_src->nr_phys_segments; + rq->ioprio = rq_src->ioprio; + rq->extra_len = rq_src->extra_len; return 0; -- cgit v1.2.3 From 754a15726f8d82afa87076505ce00a6a5806a48f Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 9 Mar 2020 22:41:37 +0100 Subject: block: remove unneeded argument from blk_alloc_flush_queue Remove 'q' from arguments since it is not used anymore after commit 7e992f847a08e ("block: remove non mq parts from the flush code"). 
Signed-off-by: Guoqing Jiang Reviewed-by: Nikolay Borisov Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-flush.c | 4 ++-- block/blk-mq.c | 3 +-- block/blk.h | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 5cc775bdb06a..7f7f98305115 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -485,8 +485,8 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, } EXPORT_SYMBOL(blkdev_issue_flush); -struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, - int node, int cmd_size, gfp_t flags) +struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, + gfp_t flags) { struct blk_flush_queue *fq; int rq_sz = sizeof(struct request); diff --git a/block/blk-mq.c b/block/blk-mq.c index 37ff8dfb8ab9..5b2e6550e0b6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2409,8 +2409,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); INIT_LIST_HEAD(&hctx->dispatch_wait.entry); - hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size, - gfp); + hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); if (!hctx->fq) goto free_bitmap; diff --git a/block/blk.h b/block/blk.h index 0b8884353f6b..670337b7cfa0 100644 --- a/block/blk.h +++ b/block/blk.h @@ -55,8 +55,8 @@ is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx) return hctx->fq->flush_rq == req; } -struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, - int node, int cmd_size, gfp_t flags); +struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, + gfp_t flags); void blk_free_flush_queue(struct blk_flush_queue *q); void blk_freeze_queue(struct request_queue *q); -- cgit v1.2.3 From ce24f736f2e047d1489dc51f0aa66d5a6c5dfb12 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 9 Mar 2020 
22:41:38 +0100 Subject: block: cleanup comment for blk_flush_complete_seq Remove the comment about return value, since it is not valid after commit 404b8f5a03d84 ("block: cleanup kick/queued handling"). Signed-off-by: Guoqing Jiang Reviewed-by: Nikolay Borisov Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-flush.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 7f7f98305115..843d25683691 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -160,9 +160,6 @@ static void blk_account_io_flush(struct request *rq) * * CONTEXT: * spin_lock_irq(fq->mq_flush_lock) - * - * RETURNS: - * %true if requests were added to the dispatch queue, %false otherwise. */ static void blk_flush_complete_seq(struct request *rq, struct blk_flush_queue *fq, -- cgit v1.2.3 From 88d6041d070028ef31c52845966216004ebba3bb Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Tue, 3 Mar 2020 12:17:00 -0700 Subject: block: sed-opal: Change the check condition for regular session validity This patch changes the check condition for the validity/authentication of the session. 1. The Host Session Number(HSN) in the response should match the HSN for the session. 2. The TPER Session Number(TSN) can never be less than 4096 for a regular session. 
Reference: Section 3.2.2.1 of https://trustedcomputinggroup.org/wp-content/uploads/TCG_Storage_Opal_SSC_Application_Note_1-00_1-00-Final.pdf Section 3.3.7.1.1 of https://trustedcomputinggroup.org/wp-content/uploads/TCG_Storage_Architecture_Core_Spec_v2.01_r1.00.pdf Co-developed-by: Andrzej Jakowski Signed-off-by: Andrzej Jakowski Signed-off-by: Revanth Rajashekar Signed-off-by: Jens Axboe --- block/opal_proto.h | 1 + block/sed-opal.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/opal_proto.h b/block/opal_proto.h index 325cbba2465f..b486b3ec7dc4 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -36,6 +36,7 @@ enum opal_response_token { #define DTAERROR_NO_METHOD_STATUS 0x89 #define GENERIC_HOST_SESSION_NUM 0x41 +#define FIRST_TPER_SESSION_NUM 4096 #define TPER_SYNC_SUPPORTED 0x01 #define MBR_ENABLED_MASK 0x10 diff --git a/block/sed-opal.c b/block/sed-opal.c index 880cc57a5f6b..daafadbb88ca 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1056,7 +1056,7 @@ static int start_opal_session_cont(struct opal_dev *dev) hsn = response_get_u64(&dev->parsed, 4); tsn = response_get_u64(&dev->parsed, 5); - if (hsn == 0 && tsn == 0) { + if (hsn != GENERIC_HOST_SESSION_NUM || tsn < FIRST_TPER_SESSION_NUM) { pr_debug("Couldn't authenticate session\n"); return -EPERM; } -- cgit v1.2.3 From fa800d73c8d0d36b1f5929198371f421b69e610e Mon Sep 17 00:00:00 2001 From: Weiping Zhang Date: Thu, 27 Feb 2020 09:38:46 +0800 Subject: blk-iocost: remove duplicated lines in comments Acked-by: Tejun Heo Signed-off-by: Weiping Zhang Signed-off-by: Jens Axboe --- block/blk-iocost.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 27ca68621137..6a7788f31c22 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -46,9 +46,6 @@ * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate * device-specific coefficients. 
* - * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate - * device-specific coefficients. - * * 2. Control Strategy * * The device virtual time (vtime) is used as the primary control metric. -- cgit v1.2.3 From 11bde986002c0af67eb92d73321d06baefae7128 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 12 Feb 2020 20:40:27 +0300 Subject: block, zoned: fix integer overflow with BLKRESETZONE et al Check for overflow in addition before checking for end-of-block-device. Steps to reproduce: #define _GNU_SOURCE 1 #include #include #include #include typedef unsigned long long __u64; struct blk_zone_range { __u64 sector; __u64 nr_sectors; }; #define BLKRESETZONE _IOW(0x12, 131, struct blk_zone_range) int main(void) { int fd = open("/dev/nullb0", O_RDWR|O_DIRECT); struct blk_zone_range zr = {4096, 0xfffffffffffff000ULL}; ioctl(fd, BLKRESETZONE, &zr); return 0; } BUG: KASAN: null-ptr-deref in submit_bio_wait+0x74/0xe0 Write of size 8 at addr 0000000000000040 by task a.out/1590 CPU: 8 PID: 1590 Comm: a.out Not tainted 5.6.0-rc1-00019-g359c92c02bfa #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190711_202441-buildvm-armv7-10.arm.fedoraproject.org-2.fc31 04/01/2014 Call Trace: dump_stack+0x76/0xa0 __kasan_report.cold+0x5/0x3e kasan_report+0xe/0x20 submit_bio_wait+0x74/0xe0 blkdev_zone_mgmt+0x26f/0x2a0 blkdev_zone_mgmt_ioctl+0x14b/0x1b0 blkdev_ioctl+0xb28/0xe60 block_ioctl+0x69/0x80 ksys_ioctl+0x3af/0xa50 Reviewed-by: Christoph Hellwig Signed-off-by: Alexey Dobriyan (SK hynix) Signed-off-by: Jens Axboe --- block/blk-zoned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 05741c6f618b..6b442ae96499 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -173,7 +173,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, if (!op_is_zone_mgmt(op)) return -EOPNOTSUPP; - if (!nr_sectors || end_sector > capacity) + if (end_sector <= sector || 
end_sector > capacity) /* Out of range */ return -EINVAL; -- cgit v1.2.3 From de6a78b601c529398ad1448e3bffcade1fcf5a70 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 18 Mar 2020 11:43:36 +0800 Subject: block: Prevent hung_check firing during long sync IO submit_bio_wait() can be called from ioctl(BLKSECDISCARD), which may take a long time to complete; as Salman mentioned, a 4K BLKSECDISCARD takes up to 100 seconds on some devices. Any block I/O operation that occurs after the BLKSECDISCARD is submitted will potentially also be affected by the hung task timeouts. Another report is that a task hang can be observed when running mkfs over raid10, which has a small max discard sectors limit because of its chunk size. So prevent hung_check from firing by taking the same approach used in blk_execute_rq(): the wake-up interval is set to half the hung_check timer period, which keeps the overhead low enough. Cc: Salman Qazi Cc: Jesse Barnes Cc: Bart Van Assche Link: https://lkml.org/lkml/2020/2/12/1193 Reported-by: Salman Qazi Reviewed-by: Jesse Barnes Reviewed-by: Salman Qazi Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 94d697217887..0985f3422556 100644 --- a/block/bio.c +++ b/block/bio.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "blk.h" @@ -1019,12 +1020,21 @@ static void submit_bio_wait_endio(struct bio *bio) int submit_bio_wait(struct bio *bio) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map); + unsigned long hang_check; bio->bi_private = &done; bio->bi_end_io = submit_bio_wait_endio; bio->bi_opf |= REQ_SYNC; submit_bio(bio); - wait_for_completion_io(&done); + + /* Prevent hang_check timer from firing at us during very long I/O */ + hang_check = sysctl_hung_task_timeout_secs; + if (hang_check) + while (!wait_for_completion_io_timeout(&done, + hang_check * (HZ/2))) + ; + else +
wait_for_completion_io(&done); return blk_status_to_errno(bio->bi_status); } -- cgit v1.2.3 From e598a72faeb543599bdf0d930df3a71906404e6f Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 13 Mar 2020 05:30:05 +0000 Subject: block/genhd: Notify udev about capacity change Allow block/genhd to notify user space (via udev) about disk size changes using a new helper set_capacity_revalidate_and_notify(), which is a wrapper on top of set_capacity(). set_capacity_revalidate_and_notify() will only notify via udev if the capacity actually changes and neither the current capacity nor the target capacity is zero. Suggested-by: Christoph Hellwig Signed-off-by: Someswarudu Sangaraju Signed-off-by: Balbir Singh Reviewed-by: Bob Liu Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index ff6268970ddc..6a60131baffa 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -46,6 +46,30 @@ static void disk_add_events(struct gendisk *disk); static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); +/* + * Set disk capacity and notify if the size is not currently + * zero and will not be set to zero + */ +void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, + bool revalidate) +{ + sector_t capacity = get_capacity(disk); + + set_capacity(disk, size); + + if (revalidate) + revalidate_disk(disk); + + if (capacity != size && capacity != 0 && size != 0) { + char *envp[] = { "RESIZE=1", NULL }; + + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); + } +} + +EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify); + + void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { if (queue_is_mq(q)) -- cgit v1.2.3 From 2f95fa5c955d0a9987ffdc3a095e2f4e62c5f2a9 Mon Sep 17 00:00:00 2001 From: Zhiqiang Liu Date: Thu, 19 Mar 2020 19:18:13 +0800 Subject:
block, bfq: fix use-after-free in bfq_idle_slice_timer_body In bfq_idle_slice_timer func, bfqq = bfqd->in_service_queue is not in bfqd-lock critical section. The bfqq, which is not equal to NULL in bfq_idle_slice_timer, may be freed after passing to bfq_idle_slice_timer_body. So we will access the freed memory. In addition, considering the bfqq may be in race, we should firstly check whether bfqq is in service before doing something on it in bfq_idle_slice_timer_body func. If the bfqq in race is not in service, it means the bfqq has been expired through __bfq_bfqq_expire func, and wait_request flags has been cleared in __bfq_bfqd_reset_in_service func. So we do not need to re-clear the wait_request of bfqq which is not in service. KASAN log is given as follows: [13058.354613] ================================================================== [13058.354640] BUG: KASAN: use-after-free in bfq_idle_slice_timer+0xac/0x290 [13058.354644] Read of size 8 at addr ffffa02cf3e63f78 by task fork13/19767 [13058.354646] [13058.354655] CPU: 96 PID: 19767 Comm: fork13 [13058.354661] Call trace: [13058.354667] dump_backtrace+0x0/0x310 [13058.354672] show_stack+0x28/0x38 [13058.354681] dump_stack+0xd8/0x108 [13058.354687] print_address_description+0x68/0x2d0 [13058.354690] kasan_report+0x124/0x2e0 [13058.354697] __asan_load8+0x88/0xb0 [13058.354702] bfq_idle_slice_timer+0xac/0x290 [13058.354707] __hrtimer_run_queues+0x298/0x8b8 [13058.354710] hrtimer_interrupt+0x1b8/0x678 [13058.354716] arch_timer_handler_phys+0x4c/0x78 [13058.354722] handle_percpu_devid_irq+0xf0/0x558 [13058.354731] generic_handle_irq+0x50/0x70 [13058.354735] __handle_domain_irq+0x94/0x110 [13058.354739] gic_handle_irq+0x8c/0x1b0 [13058.354742] el1_irq+0xb8/0x140 [13058.354748] do_wp_page+0x260/0xe28 [13058.354752] __handle_mm_fault+0x8ec/0x9b0 [13058.354756] handle_mm_fault+0x280/0x460 [13058.354762] do_page_fault+0x3ec/0x890 [13058.354765] do_mem_abort+0xc0/0x1b0 [13058.354768] el0_da+0x24/0x28 [13058.354770] 
[13058.354773] Allocated by task 19731: [13058.354780] kasan_kmalloc+0xe0/0x190 [13058.354784] kasan_slab_alloc+0x14/0x20 [13058.354788] kmem_cache_alloc_node+0x130/0x440 [13058.354793] bfq_get_queue+0x138/0x858 [13058.354797] bfq_get_bfqq_handle_split+0xd4/0x328 [13058.354801] bfq_init_rq+0x1f4/0x1180 [13058.354806] bfq_insert_requests+0x264/0x1c98 [13058.354811] blk_mq_sched_insert_requests+0x1c4/0x488 [13058.354818] blk_mq_flush_plug_list+0x2d4/0x6e0 [13058.354826] blk_flush_plug_list+0x230/0x548 [13058.354830] blk_finish_plug+0x60/0x80 [13058.354838] read_pages+0xec/0x2c0 [13058.354842] __do_page_cache_readahead+0x374/0x438 [13058.354846] ondemand_readahead+0x24c/0x6b0 [13058.354851] page_cache_sync_readahead+0x17c/0x2f8 [13058.354858] generic_file_buffered_read+0x588/0xc58 [13058.354862] generic_file_read_iter+0x1b4/0x278 [13058.354965] ext4_file_read_iter+0xa8/0x1d8 [ext4] [13058.354972] __vfs_read+0x238/0x320 [13058.354976] vfs_read+0xbc/0x1c0 [13058.354980] ksys_read+0xdc/0x1b8 [13058.354984] __arm64_sys_read+0x50/0x60 [13058.354990] el0_svc_common+0xb4/0x1d8 [13058.354994] el0_svc_handler+0x50/0xa8 [13058.354998] el0_svc+0x8/0xc [13058.354999] [13058.355001] Freed by task 19731: [13058.355007] __kasan_slab_free+0x120/0x228 [13058.355010] kasan_slab_free+0x10/0x18 [13058.355014] kmem_cache_free+0x288/0x3f0 [13058.355018] bfq_put_queue+0x134/0x208 [13058.355022] bfq_exit_icq_bfqq+0x164/0x348 [13058.355026] bfq_exit_icq+0x28/0x40 [13058.355030] ioc_exit_icq+0xa0/0x150 [13058.355035] put_io_context_active+0x250/0x438 [13058.355038] exit_io_context+0xd0/0x138 [13058.355045] do_exit+0x734/0xc58 [13058.355050] do_group_exit+0x78/0x220 [13058.355054] __wake_up_parent+0x0/0x50 [13058.355058] el0_svc_common+0xb4/0x1d8 [13058.355062] el0_svc_handler+0x50/0xa8 [13058.355066] el0_svc+0x8/0xc [13058.355067] [13058.355071] The buggy address belongs to the object at ffffa02cf3e63e70#012 which belongs to the cache bfq_queue of size 464 [13058.355075] The buggy address is 
located 264 bytes inside of#012 464-byte region [ffffa02cf3e63e70, ffffa02cf3e64040) [13058.355077] The buggy address belongs to the page: [13058.355083] page:ffff7e80b3cf9800 count:1 mapcount:0 mapping:ffff802db5c90780 index:0xffffa02cf3e606f0 compound_mapcount: 0 [13058.366175] flags: 0x2ffffe0000008100(slab|head) [13058.370781] raw: 2ffffe0000008100 ffff7e80b53b1408 ffffa02d730c1c90 ffff802db5c90780 [13058.370787] raw: ffffa02cf3e606f0 0000000000370023 00000001ffffffff 0000000000000000 [13058.370789] page dumped because: kasan: bad access detected [13058.370791] [13058.370792] Memory state around the buggy address: [13058.370797] ffffa02cf3e63e00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fb fb [13058.370801] ffffa02cf3e63e80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [13058.370805] >ffffa02cf3e63f00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [13058.370808] ^ [13058.370811] ffffa02cf3e63f80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [13058.370815] ffffa02cf3e64000: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [13058.370817] ================================================================== [13058.370820] Disabling lock debugging due to kernel taint Here, we directly pass the bfqd to bfq_idle_slice_timer_body func. -- V2->V3: rewrite the comment as suggested by Paolo Valente V1->V2: add one comment, and add Fixes and Reported-by tag. 
Fixes: aee69d78d ("block, bfq: introduce the BFQ-v0 I/O scheduler as an extra scheduler") Acked-by: Paolo Valente Reported-by: Wang Wang Signed-off-by: Zhiqiang Liu Signed-off-by: Feilong Lin Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 8c436abfaf14..4a44c7f19435 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6215,20 +6215,28 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) return bfqq; } -static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) +static void +bfq_idle_slice_timer_body(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - struct bfq_data *bfqd = bfqq->bfqd; enum bfqq_expiration reason; unsigned long flags; spin_lock_irqsave(&bfqd->lock, flags); - bfq_clear_bfqq_wait_request(bfqq); + /* + * Considering that bfqq may be in race, we should firstly check + * whether bfqq is in service before doing something on it. If + * the bfqq in race is not in service, it has already been expired + * through __bfq_bfqq_expire func and its wait_request flags has + * been cleared in __bfq_bfqd_reset_in_service func. + */ if (bfqq != bfqd->in_service_queue) { spin_unlock_irqrestore(&bfqd->lock, flags); return; } + bfq_clear_bfqq_wait_request(bfqq); + if (bfq_bfqq_budget_timeout(bfqq)) /* * Also here the queue can be safely expired @@ -6273,7 +6281,7 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) * early. 
*/ if (bfqq) - bfq_idle_slice_timer_body(bfqq); + bfq_idle_slice_timer_body(bfqd, bfqq); return HRTIMER_NORESTART; } -- cgit v1.2.3 From fd1bb3ae54a9a2e0c42709de861c69aa146b8955 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 21 Mar 2020 10:45:18 +0100 Subject: block, bfq: move forward the getting of an extra ref in bfq_bfqq_move Commit ecedd3d7e199 ("block, bfq: get extra ref to prevent a queue from being freed during a group move") gets an extra reference to a bfq_queue before possibly deactivating it (temporarily), in bfq_bfqq_move(). This prevents the bfq_queue from disappearing before being reactivated in its new group. Yet, the bfq_queue may also be expired (i.e., its service may be stopped) before the bfq_queue is deactivated. And also an expiration may lead to a premature freeing. This commit fixes this issue by simply moving forward the getting of the extra reference already introduced by commit ecedd3d7e199 ("block, bfq: get extra ref to prevent a queue from being freed during a group move"). Reported-by: cki-project@redhat.com Tested-by: cki-project@redhat.com Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index f0ff6654af28..9d963ed518d1 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -642,6 +642,12 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_entity *entity = &bfqq->entity; + /* + * Get extra reference to prevent bfqq from being freed in + * next possible expire or deactivate. + */ + bfqq->ref++; + /* If bfqq is empty, then bfq_bfqq_expire also invokes * bfq_del_bfqq_busy, thereby removing bfqq and its entity * from data structures related to current group. 
Otherwise we @@ -652,12 +658,6 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_bfqq_expire(bfqd, bfqd->in_service_queue, false, BFQQE_PREEMPTED); - /* - * get extra reference to prevent bfqq from being freed in - * next possible deactivate - */ - bfqq->ref++; - if (bfq_bfqq_busy(bfqq)) bfq_deactivate_bfqq(bfqd, bfqq, false, false); else if (entity->on_st_or_in_serv) @@ -677,7 +677,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfqd->in_service_queue && !bfqd->rq_in_driver) bfq_schedule_dispatch(bfqd); - /* release extra ref taken above */ + /* release extra ref taken above, bfqq may happen to be freed now */ bfq_put_queue(bfqq); } -- cgit v1.2.3 From c8997736650060594845e42c5d01d3118aec8d25 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 21 Mar 2020 10:45:19 +0100 Subject: block, bfq: turn put_queue into release_process_ref in __bfq_bic_change_cgroup A bfq_put_queue() may be invoked in __bfq_bic_change_cgroup(). The goal of this put is to release a process reference to a bfq_queue. But process-reference releases may trigger also some extra operation, and, to this goal, are handled through bfq_release_process_ref(). So, turn the invocation of bfq_put_queue() into an invocation of bfq_release_process_ref(). 
Tested-by: cki-project@redhat.com Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 5 +---- block/bfq-iosched.c | 2 -- block/bfq-iosched.h | 1 + 3 files changed, 2 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 9d963ed518d1..72c6151ace96 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -714,10 +714,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, if (entity->sched_data != &bfqg->sched_data) { bic_set_bfqq(bic, NULL, 0); - bfq_log_bfqq(bfqd, async_bfqq, - "bic_change_group: %p %d", - async_bfqq, async_bfqq->ref); - bfq_put_queue(async_bfqq); + bfq_release_process_ref(bfqd, async_bfqq); } } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 4a44c7f19435..78ba57efd16b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2716,8 +2716,6 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) } } - -static void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) { /* diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index d1233af9c684..cd224aaf9f52 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -955,6 +955,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, enum bfqq_expiration reason); void bfq_put_queue(struct bfq_queue *bfqq); void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); +void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_schedule_dispatch(struct bfq_data *bfqd); void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -- cgit v1.2.3 From 576682fa52cbd95deb3773449566274f206acc58 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 21 Mar 2020 10:45:20 +0100 Subject: block, bfq: make reparent_leaf_entity actually work only on leaf entities bfq_reparent_leaf_entity() reparents the input leaf entity (a leaf entity represents just a bfq_queue in an entity 
tree). Yet, the input entity is guaranteed to always be a leaf entity only in two-level entity trees. In this respect, because of the error fixed by commit 14afc5936197 ("block, bfq: fix overwrite of bfq_group pointer in bfq_find_set_group()"), all (wrongly collapsed) entity trees happened to actually have only two levels. After the latter commit, this does not hold any longer. This commit fixes this problem by modifying bfq_reparent_leaf_entity(), so that it searches an active leaf entity down the path that stems from the input entity. Such a leaf entity is guaranteed to exist when bfq_reparent_leaf_entity() is invoked. Tested-by: cki-project@redhat.com Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 48 +++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 72c6151ace96..efb89db7ba24 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -815,39 +815,53 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st) /** * bfq_reparent_leaf_entity - move leaf entity to the root_group. * @bfqd: the device data structure with the root group. - * @entity: the entity to move. + * @entity: the entity to move, if entity is a leaf; or the parent entity + * of an active leaf entity to move, if entity is not a leaf. 
*/ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, - struct bfq_entity *entity) + struct bfq_entity *entity, + int ioprio_class) { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_queue *bfqq; + struct bfq_entity *child_entity = entity; + + while (child_entity->my_sched_data) { /* leaf not reached yet */ + struct bfq_sched_data *child_sd = child_entity->my_sched_data; + struct bfq_service_tree *child_st = child_sd->service_tree + + ioprio_class; + struct rb_root *child_active = &child_st->active; + child_entity = bfq_entity_of(rb_first(child_active)); + + if (!child_entity) + child_entity = child_sd->in_service_entity; + } + + bfqq = bfq_entity_to_bfqq(child_entity); bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); } /** - * bfq_reparent_active_entities - move to the root group all active - * entities. + * bfq_reparent_active_queues - move to the root group all active queues. * @bfqd: the device data structure with the root group. * @bfqg: the group to move from. - * @st: the service tree with the entities. + * @st: the service tree to start the search from. 
*/ -static void bfq_reparent_active_entities(struct bfq_data *bfqd, - struct bfq_group *bfqg, - struct bfq_service_tree *st) +static void bfq_reparent_active_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg, + struct bfq_service_tree *st, + int ioprio_class) { struct rb_root *active = &st->active; - struct bfq_entity *entity = NULL; - - if (!RB_EMPTY_ROOT(&st->active)) - entity = bfq_entity_of(rb_first(active)); + struct bfq_entity *entity; - for (; entity ; entity = bfq_entity_of(rb_first(active))) - bfq_reparent_leaf_entity(bfqd, entity); + while ((entity = bfq_entity_of(rb_first(active)))) + bfq_reparent_leaf_entity(bfqd, entity, ioprio_class); if (bfqg->sched_data.in_service_entity) bfq_reparent_leaf_entity(bfqd, - bfqg->sched_data.in_service_entity); + bfqg->sched_data.in_service_entity, + ioprio_class); } /** @@ -898,7 +912,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) * There is no need to put the sync queues, as the * scheduler has taken no reference. */ - bfq_reparent_active_entities(bfqd, bfqg, st); + bfq_reparent_active_queues(bfqd, bfqg, st, i); } __bfq_deactivate_entity(entity, false); -- cgit v1.2.3 From 4d38a87fbb77fb9ff2ff4e914162a8ae6453eff5 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 21 Mar 2020 10:45:21 +0100 Subject: block, bfq: invoke flush_idle_tree after reparent_active_queues in pd_offline In bfq_pd_offline(), the function bfq_flush_idle_tree() is invoked to flush the rb tree that contains all idle entities belonging to the pd (cgroup) being destroyed. In particular, bfq_flush_idle_tree() is invoked before bfq_reparent_active_queues(). Yet the latter may happen to add some entities to the idle tree. It happens if, in some of the calls to bfq_bfqq_move() performed by bfq_reparent_active_queues(), the queue to move is empty and gets expired. This commit simply reverses the invocation order between bfq_flush_idle_tree() and bfq_reparent_active_queues(). 
Tested-by: cki-project@redhat.com Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index efb89db7ba24..68882b9b8f11 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -893,13 +893,6 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { st = bfqg->sched_data.service_tree + i; - /* - * The idle tree may still contain bfq_queues belonging - * to exited task because they never migrated to a different - * cgroup from the one being destroyed now. - */ - bfq_flush_idle_tree(st); - /* * It may happen that some queues are still active * (busy) upon group destruction (if the corresponding @@ -913,6 +906,19 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) * scheduler has taken no reference. */ bfq_reparent_active_queues(bfqd, bfqg, st, i); + + /* + * The idle tree may still contain bfq_queues + * belonging to exited task because they never + * migrated to a different cgroup from the one being + * destroyed now. In addition, even + * bfq_reparent_active_queues() may happen to add some + * entities to the idle tree. It happens if, in some + * of the calls to bfq_bfqq_move() performed by + * bfq_reparent_active_queues(), the queue to move is + * empty and gets expired. + */ + bfq_flush_idle_tree(st); } __bfq_deactivate_entity(entity, false); -- cgit v1.2.3 From d2332c5c040bc49c6e23426106c468cfa500d873 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:10 +0100 Subject: block: remove the blk_lookup_devt export This function is only used by init/do_mounts.c, which can't be modular. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 6a60131baffa..c5d20a48b4de 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1487,7 +1487,6 @@ dev_t blk_lookup_devt(const char *name, int partno) class_dev_iter_exit(&iter); return devt; } -EXPORT_SYMBOL(blk_lookup_devt); struct gendisk *__alloc_disk_node(int minors, int node_id) { -- cgit v1.2.3 From ea3edd4dc23027083fbb4a73b65114d08fe73a76 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:11 +0100 Subject: block: remove __bdevname There is no good reason for __bdevname to exist. Just open code printing the string in the callers. For three of them the format string can be trivially merged into existing printk statements, and in init/do_mounts.c we can at least do the scnprintf once at the start of the function, and unconditional of CONFIG_BLOCK to make the output for tiny configfs a little more helpful. Acked-by: Theodore Ts'o # for ext4 Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partition-generic.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'block') diff --git a/block/partition-generic.c b/block/partition-generic.c index 564fae77711d..98256e6beabb 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -57,20 +57,6 @@ const char *bio_devname(struct bio *bio, char *buf) } EXPORT_SYMBOL(bio_devname); -/* - * There's very little reason to use this, you should really - * have a struct block_device just about everywhere and use - * bdevname() instead. 
- */ -const char *__bdevname(dev_t dev, char *buffer) -{ - scnprintf(buffer, BDEVNAME_SIZE, "unknown-block(%u,%u)", - MAJOR(dev), MINOR(dev)); - return buffer; -} - -EXPORT_SYMBOL(__bdevname); - static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { -- cgit v1.2.3 From 5cbd28e3cef14b43b2a8271d36b75fc61c13bb8a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:12 +0100 Subject: block: move disk_name and related helpers out of partition-generic.c These functions aren't really related to partition support, so move them to a more suitable place. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 6 ++++++ block/genhd.c | 21 +++++++++++++++++++++ block/partition-generic.c | 32 -------------------------------- 3 files changed, 27 insertions(+), 32 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 0985f3422556..209715765a7a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -680,6 +680,12 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) } EXPORT_SYMBOL(bio_clone_fast); +const char *bio_devname(struct bio *bio, char *buf) +{ + return disk_name(bio->bi_disk, bio->bi_partno, buf); +} +EXPORT_SYMBOL(bio_devname); + static inline bool page_is_mergeable(const struct bio_vec *bv, struct page *page, unsigned int len, unsigned int off, bool *same_page) diff --git a/block/genhd.c b/block/genhd.c index c5d20a48b4de..2484348d1850 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -69,6 +69,27 @@ void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify); +/* + * Format the device name of the indicated disk into the supplied buffer and + * return a pointer to that same buffer for convenience.
+ */ +char *disk_name(struct gendisk *hd, int partno, char *buf) +{ + if (!partno) + snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); + else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) + snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); + else + snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); + + return buf; +} + +const char *bdevname(struct block_device *bdev, char *buf) +{ + return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf); +} +EXPORT_SYMBOL(bdevname); void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { diff --git a/block/partition-generic.c b/block/partition-generic.c index 98256e6beabb..6bf5aec2a0dc 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -25,38 +25,6 @@ extern void md_autodetect_dev(dev_t dev); #endif -/* - * disk_name() is used by partition check code and the genhd driver. - * It formats the devicename of the indicated disk into - * the supplied buffer (of size at least 32), and returns - * a pointer to that same buffer (for convenience). 
- */ - -char *disk_name(struct gendisk *hd, int partno, char *buf) -{ - if (!partno) - snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); - else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) - snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); - else - snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); - - return buf; -} - -const char *bdevname(struct block_device *bdev, char *buf) -{ - return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf); -} - -EXPORT_SYMBOL(bdevname); - -const char *bio_devname(struct bio *bio, char *buf) -{ - return disk_name(bio->bi_disk, bio->bi_partno, buf); -} -EXPORT_SYMBOL(bio_devname); - static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { -- cgit v1.2.3 From 3ad5cee5cd000dc05e6c2410b06fc1d818e7b1e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:13 +0100 Subject: block: move sysfs methods shared by disks and partitions to genhd.c Move the sysfs _show methods that are used both on the full disk and partition nodes to genhd.c instead of hiding them in the partitioning code. Also move the declaration for these methods to block/blk.h so that we don't expose them to drivers. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk.h | 11 +++++++ block/genhd.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++- block/partition-generic.c | 76 +------------------------------------------- 3 files changed, 91 insertions(+), 76 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index 670337b7cfa0..43df9dcb3d4e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -214,6 +214,17 @@ static inline void elevator_exit(struct request_queue *q, struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); +ssize_t part_size_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count); + #ifdef CONFIG_FAIL_IO_TIMEOUT int blk_should_fake_timeout(struct request_queue *); ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); diff --git a/block/genhd.c b/block/genhd.c index 2484348d1850..f7d60b620b97 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -4,6 +4,7 @@ */ #include +#include #include #include #include @@ -1199,6 +1200,60 @@ static ssize_t disk_ro_show(struct device *dev, return sprintf(buf, "%d\n", get_disk_ro(disk) ? 
1 : 0); } +ssize_t part_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%llu\n", + (unsigned long long)part_nr_sects_read(p)); +} + +ssize_t part_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q = part_to_disk(p)->queue; + unsigned int inflight; + + inflight = part_in_flight(q, p); + return sprintf(buf, + "%8lu %8lu %8llu %8u " + "%8lu %8lu %8llu %8u " + "%8u %8u %8u " + "%8lu %8lu %8llu %8u " + "%8lu %8u" + "\n", + part_stat_read(p, ios[STAT_READ]), + part_stat_read(p, merges[STAT_READ]), + (unsigned long long)part_stat_read(p, sectors[STAT_READ]), + (unsigned int)part_stat_read_msecs(p, STAT_READ), + part_stat_read(p, ios[STAT_WRITE]), + part_stat_read(p, merges[STAT_WRITE]), + (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), + (unsigned int)part_stat_read_msecs(p, STAT_WRITE), + inflight, + jiffies_to_msecs(part_stat_read(p, io_ticks)), + jiffies_to_msecs(part_stat_read(p, time_in_queue)), + part_stat_read(p, ios[STAT_DISCARD]), + part_stat_read(p, merges[STAT_DISCARD]), + (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), + (unsigned int)part_stat_read_msecs(p, STAT_DISCARD), + part_stat_read(p, ios[STAT_FLUSH]), + (unsigned int)part_stat_read_msecs(p, STAT_FLUSH)); +} + +ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q = part_to_disk(p)->queue; + unsigned int inflight[2]; + + part_in_flight_rw(q, p, inflight); + return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); +} + static ssize_t disk_capability_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1237,10 +1292,33 @@ static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static 
DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); + #ifdef CONFIG_FAIL_MAKE_REQUEST +ssize_t part_fail_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->make_it_fail); +} + +ssize_t part_fail_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct hd_struct *p = dev_to_part(dev); + int i; + + if (count > 0 && sscanf(buf, "%d", &i) > 0) + p->make_it_fail = (i == 0) ? 0 : 1; + + return count; +} + static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); -#endif +#endif /* CONFIG_FAIL_MAKE_REQUEST */ + #ifdef CONFIG_FAIL_IO_TIMEOUT static struct device_attribute dev_attr_fail_timeout = __ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store); diff --git a/block/partition-generic.c b/block/partition-generic.c index 6bf5aec2a0dc..e6fd2226a639 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -18,6 +18,7 @@ #include #include #include +#include "blk.h" #include "partitions/check.h" @@ -41,13 +42,6 @@ static ssize_t part_start_show(struct device *dev, return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); } -ssize_t part_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p)); -} - static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -69,74 +63,6 @@ static ssize_t part_discard_alignment_show(struct device *dev, return sprintf(buf, "%u\n", p->discard_alignment); } -ssize_t part_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; - unsigned int inflight; - - 
inflight = part_in_flight(q, p); - return sprintf(buf, - "%8lu %8lu %8llu %8u " - "%8lu %8lu %8llu %8u " - "%8u %8u %8u " - "%8lu %8lu %8llu %8u " - "%8lu %8u" - "\n", - part_stat_read(p, ios[STAT_READ]), - part_stat_read(p, merges[STAT_READ]), - (unsigned long long)part_stat_read(p, sectors[STAT_READ]), - (unsigned int)part_stat_read_msecs(p, STAT_READ), - part_stat_read(p, ios[STAT_WRITE]), - part_stat_read(p, merges[STAT_WRITE]), - (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), - (unsigned int)part_stat_read_msecs(p, STAT_WRITE), - inflight, - jiffies_to_msecs(part_stat_read(p, io_ticks)), - jiffies_to_msecs(part_stat_read(p, time_in_queue)), - part_stat_read(p, ios[STAT_DISCARD]), - part_stat_read(p, merges[STAT_DISCARD]), - (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), - (unsigned int)part_stat_read_msecs(p, STAT_DISCARD), - part_stat_read(p, ios[STAT_FLUSH]), - (unsigned int)part_stat_read_msecs(p, STAT_FLUSH)); -} - -ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; - unsigned int inflight[2]; - - part_in_flight_rw(q, p, inflight); - return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); -} - -#ifdef CONFIG_FAIL_MAKE_REQUEST -ssize_t part_fail_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->make_it_fail); -} - -ssize_t part_fail_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct hd_struct *p = dev_to_part(dev); - int i; - - if (count > 0 && sscanf(buf, "%d", &i) > 0) - p->make_it_fail = (i == 0) ? 
0 : 1; - - return count; -} -#endif - static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); static DEVICE_ATTR(start, 0444, part_start_show, NULL); static DEVICE_ATTR(size, 0444, part_size_show, NULL); -- cgit v1.2.3 From f17c21c1ecb80e957bafa07d6454836854be7cf2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:14 +0100 Subject: block: remove alloc_part_info and free_part_info There isn't any good reason not to simply open code the allocation and freeing of the partition_meta_info structure. Especially as one of the branches in alloc_part_info is entirely dead code. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partition-generic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/partition-generic.c b/block/partition-generic.c index e6fd2226a639..f2004f3bd6f7 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -249,7 +249,9 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, p->policy = get_disk_ro(disk); if (info) { - struct partition_meta_info *pinfo = alloc_part_info(disk); + struct partition_meta_info *pinfo; + + pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id); if (!pinfo) { err = -ENOMEM; goto out_free_stats; @@ -308,7 +310,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, return p; out_free_info: - free_part_info(p); + kfree(p->info); out_free_stats: free_part_stats(p); out_free: -- cgit v1.2.3 From 1a9fba3a77a5b39d1c9e1611758303f2649474e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:18 +0100 Subject: block: unexport read_dev_sector and put_dev_sector read_dev_sector and put_dev_sector are now only used by the partition parsing code. Remove the export for read_dev_sector and merge it into the only caller. Clean the mess up a bit by using goto labels and the SECTOR_SHIFT constant. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partition-generic.c | 31 +++++++++++++++++++------------ block/partitions/check.h | 14 +++++++------- 2 files changed, 26 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/partition-generic.c b/block/partition-generic.c index f2004f3bd6f7..fef6bacb2bbb 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -484,22 +484,29 @@ out_free_state: return ret; } -unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { - struct address_space *mapping = bdev->bd_inode->i_mapping; + struct address_space *mapping = state->bdev->bd_inode->i_mapping; struct page *page; - page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_SHIFT-9)), NULL); - if (!IS_ERR(page)) { - if (PageError(page)) - goto fail; - p->v = page; - return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << 9); -fail: - put_page(page); + if (n >= get_capacity(state->bdev->bd_disk)) { + state->access_beyond_eod = true; + return NULL; } + + page = read_mapping_page(mapping, + (pgoff_t)(n >> (PAGE_SHIFT - 9)), NULL); + if (IS_ERR(page)) + goto out; + if (PageError(page)) + goto out_put_page; + + p->v = page; + return (unsigned char *)page_address(page) + + ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << SECTOR_SHIFT); +out_put_page: + put_page(page); +out: p->v = NULL; return NULL; } - -EXPORT_SYMBOL(read_dev_sector); diff --git a/block/partitions/check.h b/block/partitions/check.h index 6042f769471a..0fcf80117887 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -28,14 +28,14 @@ void free_partitions(struct parsed_partitions *state); struct parsed_partitions * check_partition(struct gendisk *, struct block_device *); -static inline void *read_part_sector(struct parsed_partitions *state, - sector_t n, Sector *p) +typedef struct { + struct page *v; +} Sector; + 
+void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p); +static inline void put_dev_sector(Sector p) { - if (n >= get_capacity(state->bdev->bd_disk)) { - state->access_beyond_eod = true; - return NULL; - } - return read_dev_sector(state->bdev, n, p); + put_page(p.v); } static inline void -- cgit v1.2.3 From 74cc979c3c7f8328b24651daf15280f07533e735 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:19 +0100 Subject: block: cleanup how md_autodetect_dev is called Add a new include/linux/raid/detect.h header to declare the md_autodetect_dev prototype which can be shared between md and the partition code. Then use IS_BUILTIN to call it instead of the ifdef magic. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partition-generic.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/partition-generic.c b/block/partition-generic.c index fef6bacb2bbb..4d771ae835ed 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -18,14 +18,11 @@ #include #include #include +#include #include "blk.h" #include "partitions/check.h" -#ifdef CONFIG_BLK_DEV_MD -extern void md_autodetect_dev(dev_t dev); -#endif - static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -407,10 +404,10 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, return true; } -#ifdef CONFIG_BLK_DEV_MD - if (state->parts[p].flags & ADDPART_FLAG_RAID) + if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && + (state->parts[p].flags & ADDPART_FLAG_RAID)) md_autodetect_dev(part_to_dev(part)->devt); -#endif + return true; } -- cgit v1.2.3 From ffa9ed647aa4cda79fa8cb9583a566e9d777dd4c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:20 +0100 Subject: block: remove warn_no_part The warn_no_part is initialized to 1 and never changed. Remove it and execute the code keyed off from it unconditionally. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/amiga.c | 10 ++++------ block/partitions/check.c | 7 ++----- block/partitions/check.h | 3 --- 3 files changed, 6 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c index 560936617d9c..7fecc760b78f 100644 --- a/block/partitions/amiga.c +++ b/block/partitions/amiga.c @@ -42,9 +42,8 @@ int amiga_partition(struct parsed_partitions *state) goto rdb_done; data = read_part_sector(state, blk, &sect); if (!data) { - if (warn_no_part) - pr_err("Dev %s: unable to read RDB block %d\n", - bdevname(state->bdev, b), blk); + pr_err("Dev %s: unable to read RDB block %d\n", + bdevname(state->bdev, b), blk); res = -1; goto rdb_done; } @@ -85,9 +84,8 @@ int amiga_partition(struct parsed_partitions *state) blk *= blksize; /* Read in terms partition table understands */ data = read_part_sector(state, blk, &sect); if (!data) { - if (warn_no_part) - pr_err("Dev %s: unable to read partition block %d\n", - bdevname(state->bdev, b), blk); + pr_err("Dev %s: unable to read partition block %d\n", + bdevname(state->bdev, b), blk); res = -1; goto rdb_done; } diff --git a/block/partitions/check.c b/block/partitions/check.c index ffe408fead0c..8fe46881ef63 100644 --- a/block/partitions/check.c +++ b/block/partitions/check.c @@ -37,8 +37,6 @@ #include "sysv68.h" #include "cmdline.h" -int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ - static int (*check_part[])(struct parsed_partitions *) = { /* * Probe partition formats with tables at disk address 0 @@ -186,9 +184,8 @@ check_partition(struct gendisk *hd, struct block_device *bdev) /* The partition is unrecognized.
So report I/O errors if there were any */ res = err; if (res) { - if (warn_no_part) - strlcat(state->pp_buf, - " unable to read partition table\n", PAGE_SIZE); + strlcat(state->pp_buf, + " unable to read partition table\n", PAGE_SIZE); printk(KERN_INFO "%s", state->pp_buf); } diff --git a/block/partitions/check.h b/block/partitions/check.h index 0fcf80117887..19852b494e93 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -50,6 +50,3 @@ put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) strlcat(p->pp_buf, tmp, PAGE_SIZE); } } - -extern int warn_no_part; - -- cgit v1.2.3 From 3f1b95ef81b7dd5e5481347d7b7a7b427b29307a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:21 +0100 Subject: block: declare all partition detection routines in check.h There is no good reason to include one header per partition type in core.c. Instead move the prototypes for the detection routines to check.h, and remove all now empty headers in block/partitions/.
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/acorn.c | 1 - block/partitions/acorn.h | 15 --------------- block/partitions/aix.c | 1 - block/partitions/aix.h | 2 -- block/partitions/amiga.c | 1 - block/partitions/amiga.h | 7 ------- block/partitions/atari.h | 1 - block/partitions/check.c | 16 ---------------- block/partitions/check.h | 22 ++++++++++++++++++++++ block/partitions/cmdline.c | 1 - block/partitions/cmdline.h | 3 --- block/partitions/efi.h | 3 --- block/partitions/ibm.c | 1 - block/partitions/ibm.h | 2 -- block/partitions/karma.h | 3 --- block/partitions/ldm.h | 2 -- block/partitions/mac.h | 1 - block/partitions/msdos.c | 1 - block/partitions/msdos.h | 1 - block/partitions/osf.h | 1 - block/partitions/sgi.h | 1 - block/partitions/sun.h | 1 - block/partitions/sysv68.c | 1 - block/partitions/sysv68.h | 2 -- block/partitions/ultrix.c | 1 - block/partitions/ultrix.h | 6 ------ 26 files changed, 22 insertions(+), 75 deletions(-) delete mode 100644 block/partitions/acorn.h delete mode 100644 block/partitions/aix.h delete mode 100644 block/partitions/amiga.h delete mode 100644 block/partitions/cmdline.h delete mode 100644 block/partitions/ibm.h delete mode 100644 block/partitions/sysv68.h delete mode 100644 block/partitions/ultrix.h (limited to 'block') diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c index 7587700fad4a..c64c57b958bf 100644 --- a/block/partitions/acorn.c +++ b/block/partitions/acorn.c @@ -11,7 +11,6 @@ #include #include "check.h" -#include "acorn.h" /* * Partition types. (Oh for reusability) diff --git a/block/partitions/acorn.h b/block/partitions/acorn.h deleted file mode 100644 index 67b06601ca4c..000000000000 --- a/block/partitions/acorn.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/fs/partitions/acorn.h - * - * Copyright (C) 1996-2001 Russell King. 
- * - * I _hate_ this partitioning mess - why can't we have one defined - * format, and everyone stick to it? - */ - -int adfspart_check_CUMANA(struct parsed_partitions *state); -int adfspart_check_ADFS(struct parsed_partitions *state); -int adfspart_check_ICS(struct parsed_partitions *state); -int adfspart_check_POWERTEC(struct parsed_partitions *state); -int adfspart_check_EESOX(struct parsed_partitions *state); diff --git a/block/partitions/aix.c b/block/partitions/aix.c index 903f3ed175d0..c7b4fd1a4a97 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -6,7 +6,6 @@ */ #include "check.h" -#include "aix.h" struct lvm_rec { char lvm_id[4]; /* "_LVM" */ diff --git a/block/partitions/aix.h b/block/partitions/aix.h deleted file mode 100644 index b4449f0b9f2b..000000000000 --- a/block/partitions/aix.h +++ /dev/null @@ -1,2 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -extern int aix_partition(struct parsed_partitions *state); diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c index 7fecc760b78f..9526491d9aed 100644 --- a/block/partitions/amiga.c +++ b/block/partitions/amiga.c @@ -14,7 +14,6 @@ #include #include "check.h" -#include "amiga.h" static __inline__ u32 checksum_block(__be32 *m, int size) diff --git a/block/partitions/amiga.h b/block/partitions/amiga.h deleted file mode 100644 index 7e63f4d9d969..000000000000 --- a/block/partitions/amiga.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/amiga.h - */ - -int amiga_partition(struct parsed_partitions *state); - diff --git a/block/partitions/atari.h b/block/partitions/atari.h index 01c2b9457394..678202442fd3 100644 --- a/block/partitions/atari.h +++ b/block/partitions/atari.h @@ -34,4 +34,3 @@ struct rootsector u16 checksum; /* checksum for bootable disks */ } __packed; -int atari_partition(struct parsed_partitions *state); diff --git a/block/partitions/check.c b/block/partitions/check.c index 8fe46881ef63..944c478b6f0b 100644 --- 
a/block/partitions/check.c +++ b/block/partitions/check.c @@ -21,22 +21,6 @@ #include "check.h" -#include "acorn.h" -#include "amiga.h" -#include "atari.h" -#include "ldm.h" -#include "mac.h" -#include "msdos.h" -#include "osf.h" -#include "sgi.h" -#include "sun.h" -#include "ibm.h" -#include "ultrix.h" -#include "efi.h" -#include "karma.h" -#include "sysv68.h" -#include "cmdline.h" - static int (*check_part[])(struct parsed_partitions *) = { /* * Probe partition formats with tables at disk address 0 diff --git a/block/partitions/check.h b/block/partitions/check.h index 19852b494e93..23e7adb79617 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -50,3 +50,25 @@ put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) strlcat(p->pp_buf, tmp, PAGE_SIZE); } } + +/* detection routines go here in alphabetical order: */ +int adfspart_check_ADFS(struct parsed_partitions *state); +int adfspart_check_CUMANA(struct parsed_partitions *state); +int adfspart_check_EESOX(struct parsed_partitions *state); +int adfspart_check_ICS(struct parsed_partitions *state); +int adfspart_check_POWERTEC(struct parsed_partitions *state); +int aix_partition(struct parsed_partitions *state); +int amiga_partition(struct parsed_partitions *state); +int atari_partition(struct parsed_partitions *state); +int cmdline_partition(struct parsed_partitions *state); +int efi_partition(struct parsed_partitions *state); +int ibm_partition(struct parsed_partitions *); +int karma_partition(struct parsed_partitions *state); +int ldm_partition(struct parsed_partitions *state); +int mac_partition(struct parsed_partitions *state); +int msdos_partition(struct parsed_partitions *state); +int osf_partition(struct parsed_partitions *state); +int sgi_partition(struct parsed_partitions *state); +int sun_partition(struct parsed_partitions *state); +int sysv68_partition(struct parsed_partitions *state); +int ultrix_partition(struct parsed_partitions *state); diff --git 
a/block/partitions/cmdline.c b/block/partitions/cmdline.c index f1edd5452249..8f545c36cde4 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -18,7 +18,6 @@ #include #include "check.h" -#include "cmdline.h" static char *cmdline; static struct cmdline_parts *bdev_parts; diff --git a/block/partitions/cmdline.h b/block/partitions/cmdline.h deleted file mode 100644 index e64a31636a1f..000000000000 --- a/block/partitions/cmdline.h +++ /dev/null @@ -1,3 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -int cmdline_partition(struct parsed_partitions *state); diff --git a/block/partitions/efi.h b/block/partitions/efi.h index 3e8576157575..907bac5ce8f7 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -113,7 +113,4 @@ typedef struct _legacy_mbr { __le16 signature; } __packed legacy_mbr; -/* Functions */ -extern int efi_partition(struct parsed_partitions *state); - #endif diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index a5d480f807f3..073faa6a69b8 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -15,7 +15,6 @@ #include #include "check.h" -#include "ibm.h" union label_t { diff --git a/block/partitions/ibm.h b/block/partitions/ibm.h deleted file mode 100644 index 8bf13febb2b6..000000000000 --- a/block/partitions/ibm.h +++ /dev/null @@ -1,2 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -int ibm_partition(struct parsed_partitions *); diff --git a/block/partitions/karma.h b/block/partitions/karma.h index 48e074d417fb..1b5eec57ee0c 100644 --- a/block/partitions/karma.h +++ b/block/partitions/karma.h @@ -4,6 +4,3 @@ */ #define KARMA_LABEL_MAGIC 0xAB56 - -int karma_partition(struct parsed_partitions *state); - diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h index 1ca63e97bccc..841580af7f9b 100644 --- a/block/partitions/ldm.h +++ b/block/partitions/ldm.h @@ -193,7 +193,5 @@ struct ldmdb { /* Cache of the database */ struct list_head v_part; }; -int ldm_partition(struct parsed_partitions 
*state); - #endif /* _FS_PT_LDM_H_ */ diff --git a/block/partitions/mac.h b/block/partitions/mac.h index 453ed2964804..0e41c9da7532 100644 --- a/block/partitions/mac.h +++ b/block/partitions/mac.h @@ -42,4 +42,3 @@ struct mac_driver_desc { /* ... more stuff */ }; -int mac_partition(struct parsed_partitions *state); diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 82c44f7df911..c572022f3781 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -24,7 +24,6 @@ #include "check.h" #include "msdos.h" #include "efi.h" -#include "aix.h" /* * Many architectures don't like unaligned accesses, while diff --git a/block/partitions/msdos.h b/block/partitions/msdos.h index fcacfc486092..123e666bb932 100644 --- a/block/partitions/msdos.h +++ b/block/partitions/msdos.h @@ -5,5 +5,4 @@ #define MSDOS_LABEL_MAGIC 0xAA55 -int msdos_partition(struct parsed_partitions *state); diff --git a/block/partitions/osf.h b/block/partitions/osf.h index 4d8088e7ea8c..80a58c382b3f 100644 --- a/block/partitions/osf.h +++ b/block/partitions/osf.h @@ -5,4 +5,3 @@ #define DISKLABELMAGIC (0x82564557UL) -int osf_partition(struct parsed_partitions *state); diff --git a/block/partitions/sgi.h b/block/partitions/sgi.h index a5b77c3987cf..372cdad19fea 100644 --- a/block/partitions/sgi.h +++ b/block/partitions/sgi.h @@ -3,7 +3,6 @@ * fs/partitions/sgi.h */ -extern int sgi_partition(struct parsed_partitions *state); #define SGI_LABEL_MAGIC 0x0be5a941 diff --git a/block/partitions/sun.h b/block/partitions/sun.h index ae1b9eed3fd7..4c8877a5b52d 100644 --- a/block/partitions/sun.h +++ b/block/partitions/sun.h @@ -6,4 +6,3 @@ #define SUN_LABEL_MAGIC 0xDABE #define SUN_VTOC_SANITY 0x600DDEEE -int sun_partition(struct parsed_partitions *state); diff --git a/block/partitions/sysv68.c b/block/partitions/sysv68.c index 92e810826b01..6f6257fd4eb4 100644 --- a/block/partitions/sysv68.c +++ b/block/partitions/sysv68.c @@ -6,7 +6,6 @@ */ #include "check.h" -#include "sysv68.h" /* * 
Volume ID structure: on first 256-bytes sector of disk diff --git a/block/partitions/sysv68.h b/block/partitions/sysv68.h deleted file mode 100644 index 4fb6b8ec78ae..000000000000 --- a/block/partitions/sysv68.h +++ /dev/null @@ -1,2 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -extern int sysv68_partition(struct parsed_partitions *state); diff --git a/block/partitions/ultrix.c b/block/partitions/ultrix.c index ecd0d7346c3d..4aaa81043ca0 100644 --- a/block/partitions/ultrix.c +++ b/block/partitions/ultrix.c @@ -8,7 +8,6 @@ */ #include "check.h" -#include "ultrix.h" int ultrix_partition(struct parsed_partitions *state) { diff --git a/block/partitions/ultrix.h b/block/partitions/ultrix.h deleted file mode 100644 index 9f676cead222..000000000000 --- a/block/partitions/ultrix.h +++ /dev/null @@ -1,6 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/ultrix.h - */ - -int ultrix_partition(struct parsed_partitions *state); -- cgit v1.2.3 From f6d17358dc7eb2259115c0017a1ff9958a59eb2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:22 +0100 Subject: block: remove block/partitions/karma.h Just move the single define to block/partitions/karma.c. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/karma.c | 3 ++- block/partitions/karma.h | 6 ------ 2 files changed, 2 insertions(+), 7 deletions(-) delete mode 100644 block/partitions/karma.h (limited to 'block') diff --git a/block/partitions/karma.c b/block/partitions/karma.c index 59812d705c3d..4d93512f4bd4 100644 --- a/block/partitions/karma.c +++ b/block/partitions/karma.c @@ -8,9 +8,10 @@ */ #include "check.h" -#include "karma.h" #include +#define KARMA_LABEL_MAGIC 0xAB56 + int karma_partition(struct parsed_partitions *state) { int i; diff --git a/block/partitions/karma.h b/block/partitions/karma.h deleted file mode 100644 index 1b5eec57ee0c..000000000000 --- a/block/partitions/karma.h +++ /dev/null @@ -1,6 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/karma.h - */ - -#define KARMA_LABEL_MAGIC 0xAB56 -- cgit v1.2.3 From 3466f63a7cfe55217534e71af866037be2e1909e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:23 +0100 Subject: block: remove block/partitions/osf.h Just move the single define to block/partitions/osf.c. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/osf.c | 2 +- block/partitions/osf.h | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) delete mode 100644 block/partitions/osf.h (limited to 'block') diff --git a/block/partitions/osf.c b/block/partitions/osf.c index 4b873973d6c0..84560d0765ed 100644 --- a/block/partitions/osf.c +++ b/block/partitions/osf.c @@ -9,9 +9,9 @@ */ #include "check.h" -#include "osf.h" #define MAX_OSF_PARTITIONS 18 +#define DISKLABELMAGIC (0x82564557UL) int osf_partition(struct parsed_partitions *state) { diff --git a/block/partitions/osf.h b/block/partitions/osf.h deleted file mode 100644 index 80a58c382b3f..000000000000 --- a/block/partitions/osf.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/osf.h - */ - -#define DISKLABELMAGIC (0x82564557UL) - -- cgit v1.2.3 From 95f77ef35a990e2947cee100f72c0e3ddd1ccd75 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:24 +0100 Subject: block: remove block/partitions/sgi.h Just move the single define to block/partitions/sgi.c. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/sgi.c | 3 ++- block/partitions/sgi.h | 8 -------- 2 files changed, 2 insertions(+), 9 deletions(-) delete mode 100644 block/partitions/sgi.h (limited to 'block') diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c index d7b421c6e530..927cf501603e 100644 --- a/block/partitions/sgi.c +++ b/block/partitions/sgi.c @@ -6,7 +6,8 @@ */ #include "check.h" -#include "sgi.h" + +#define SGI_LABEL_MAGIC 0x0be5a941 struct sgi_disklabel { __be32 magic_mushroom; /* Big fat spliff... 
*/ diff --git a/block/partitions/sgi.h b/block/partitions/sgi.h deleted file mode 100644 index 372cdad19fea..000000000000 --- a/block/partitions/sgi.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/sgi.h - */ - - -#define SGI_LABEL_MAGIC 0x0be5a941 - -- cgit v1.2.3 From cbb5cb3b29f9eb158bd2db39cdc07db6d8087461 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:25 +0100 Subject: block: remove block/partitions/sun.h Just move the two defines to block/partitions/sun.c. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/sun.c | 4 +++- block/partitions/sun.h | 8 -------- 2 files changed, 3 insertions(+), 9 deletions(-) delete mode 100644 block/partitions/sun.h (limited to 'block') diff --git a/block/partitions/sun.c b/block/partitions/sun.c index 90f36724e796..28b44100f2b1 100644 --- a/block/partitions/sun.c +++ b/block/partitions/sun.c @@ -9,7 +9,9 @@ */ #include "check.h" -#include "sun.h" + +#define SUN_LABEL_MAGIC 0xDABE +#define SUN_VTOC_SANITY 0x600DDEEE int sun_partition(struct parsed_partitions *state) { diff --git a/block/partitions/sun.h b/block/partitions/sun.h deleted file mode 100644 index 4c8877a5b52d..000000000000 --- a/block/partitions/sun.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/sun.h - */ - -#define SUN_LABEL_MAGIC 0xDABE -#define SUN_VTOC_SANITY 0x600DDEEE - -- cgit v1.2.3 From 1442f76d4317b420580e11238d20789708c742a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:26 +0100 Subject: block: move struct partition out of genhd.h struct partition is the on-disk format of a MSDOS partition table entry. Move it out of genhd.h into a new msdos_partition.h header and give it a msdos_ prefix to avoid confusion. Also move the magic number from block/partitions/msdos.h to the new header so that it can be used by the SCSI drivers looking at the DOS partition tables. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/ldm.c | 6 +++--- block/partitions/msdos.c | 28 ++++++++++++++-------------- block/partitions/msdos.h | 8 -------- 3 files changed, 17 insertions(+), 25 deletions(-) delete mode 100644 block/partitions/msdos.h (limited to 'block') diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index a2d97ee1908c..6fdfcb40c537 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -14,10 +14,10 @@ #include #include #include +#include #include "ldm.h" #include "check.h" -#include "msdos.h" /* * ldm_debug/info/error/crit - Output an error message @@ -493,7 +493,7 @@ static bool ldm_validate_partition_table(struct parsed_partitions *state) { Sector sect; u8 *data; - struct partition *p; + struct msdos_partition *p; int i; bool result = false; @@ -508,7 +508,7 @@ static bool ldm_validate_partition_table(struct parsed_partitions *state) if (*(__le16*) (data + 0x01FE) != cpu_to_le16 (MSDOS_LABEL_MAGIC)) goto out; - p = (struct partition*)(data + 0x01BE); + p = (struct msdos_partition *)(data + 0x01BE); for (i = 0; i < 4; i++, p++) if (SYS_IND (p) == LDM_PARTITION) { result = true; diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index c572022f3781..88ee5ee7f442 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -20,9 +20,9 @@ * Re-organised Feb 1998 Russell King */ #include +#include #include "check.h" -#include "msdos.h" #include "efi.h" /* @@ -34,17 +34,17 @@ #define SYS_IND(p) get_unaligned(&p->sys_ind) -static inline sector_t nr_sects(struct partition *p) +static inline sector_t nr_sects(struct msdos_partition *p) { return (sector_t)get_unaligned_le32(&p->nr_sects); } -static inline sector_t start_sect(struct partition *p) +static inline sector_t start_sect(struct msdos_partition *p) { return (sector_t)get_unaligned_le32(&p->start_sect); } -static inline int is_extended_partition(struct partition *p) +static inline int 
is_extended_partition(struct msdos_partition *p) { return (SYS_IND(p) == DOS_EXTENDED_PARTITION || SYS_IND(p) == WIN98_EXTENDED_PARTITION || @@ -67,7 +67,7 @@ msdos_magic_present(unsigned char *p) #define AIX_LABEL_MAGIC4 0xC1 static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) { - struct partition *pt = (struct partition *) (p + 0x1be); + struct msdos_partition *pt = (struct msdos_partition *) (p + 0x1be); Sector sect; unsigned char *d; int slot, ret = 0; @@ -121,7 +121,7 @@ static void parse_extended(struct parsed_partitions *state, sector_t first_sector, sector_t first_size, u32 disksig) { - struct partition *p; + struct msdos_partition *p; Sector sect; unsigned char *data; sector_t this_sector, this_size; @@ -145,7 +145,7 @@ static void parse_extended(struct parsed_partitions *state, if (!msdos_magic_present(data + 510)) goto done; - p = (struct partition *) (data + 0x1be); + p = (struct msdos_partition *) (data + 0x1be); /* * Usually, the first entry is the real data partition, @@ -402,14 +402,14 @@ static void parse_minix(struct parsed_partitions *state, #ifdef CONFIG_MINIX_SUBPARTITION Sector sect; unsigned char *data; - struct partition *p; + struct msdos_partition *p; int i; data = read_part_sector(state, offset, &sect); if (!data) return; - p = (struct partition *)(data + 0x1be); + p = (struct msdos_partition *)(data + 0x1be); /* The first sector of a Minix partition can have either * a secondary MBR describing its subpartitions, or @@ -453,7 +453,7 @@ int msdos_partition(struct parsed_partitions *state) sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; Sector sect; unsigned char *data; - struct partition *p; + struct msdos_partition *p; struct fat_boot_sector *fb; int slot; u32 disksig; @@ -487,7 +487,7 @@ int msdos_partition(struct parsed_partitions *state) * partition table. Reject this in case the boot indicator * is not 0 or 0x80. 
*/ - p = (struct partition *) (data + 0x1be); + p = (struct msdos_partition *) (data + 0x1be); for (slot = 1; slot <= 4; slot++, p++) { if (p->boot_ind != 0 && p->boot_ind != 0x80) { /* @@ -509,7 +509,7 @@ int msdos_partition(struct parsed_partitions *state) } #ifdef CONFIG_EFI_PARTITION - p = (struct partition *) (data + 0x1be); + p = (struct msdos_partition *) (data + 0x1be); for (slot = 1 ; slot <= 4 ; slot++, p++) { /* If this is an EFI GPT disk, msdos should ignore it. */ if (SYS_IND(p) == EFI_PMBR_OSTYPE_EFI_GPT) { @@ -518,7 +518,7 @@ int msdos_partition(struct parsed_partitions *state) } } #endif - p = (struct partition *) (data + 0x1be); + p = (struct msdos_partition *) (data + 0x1be); disksig = le32_to_cpup((__le32 *)(data + 0x1b8)); @@ -565,7 +565,7 @@ int msdos_partition(struct parsed_partitions *state) strlcat(state->pp_buf, "\n", PAGE_SIZE); /* second pass - output for each on a separate line */ - p = (struct partition *) (0x1be + data); + p = (struct msdos_partition *) (0x1be + data); for (slot = 1 ; slot <= 4 ; slot++, p++) { unsigned char id = SYS_IND(p); int n; diff --git a/block/partitions/msdos.h b/block/partitions/msdos.h deleted file mode 100644 index 123e666bb932..000000000000 --- a/block/partitions/msdos.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/msdos.h - */ - -#define MSDOS_LABEL_MAGIC 0xAA55 - - -- cgit v1.2.3 From 0226e9ead44b2eb8f2471d24e0b0c5ff60bc322c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:27 +0100 Subject: block: move the *_PARTITION enum out of genhd.h The enum containing the *_PARTITION symbolic names is only relevant for the partition parser. More specifically most values are MSDOS partition table system indicators and thus should go straight into msdos.c. One value is only used by the sun partition parser, and the sun and sgi partition parsers use the same value as the x86 Linux RAID indicator to also indicate RAID autodetection. 
Duplicate them in sun.c and sgi.c given that the different partition types use entirely different values otherwise. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/sgi.c | 4 ++++ block/partitions/sun.c | 5 +++++ 2 files changed, 9 insertions(+) (limited to 'block') diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c index 927cf501603e..4273f1bb0515 100644 --- a/block/partitions/sgi.c +++ b/block/partitions/sgi.c @@ -9,6 +9,10 @@ #define SGI_LABEL_MAGIC 0x0be5a941 +enum { + LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ +}; + struct sgi_disklabel { __be32 magic_mushroom; /* Big fat spliff... */ __be16 root_part_num; /* Root partition number */ diff --git a/block/partitions/sun.c b/block/partitions/sun.c index 28b44100f2b1..47dc53eccf77 100644 --- a/block/partitions/sun.c +++ b/block/partitions/sun.c @@ -13,6 +13,11 @@ #define SUN_LABEL_MAGIC 0xDABE #define SUN_VTOC_SANITY 0x600DDEEE +enum { + SUN_WHOLE_DISK = 5, + LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ +}; + int sun_partition(struct parsed_partitions *state) { int i; -- cgit v1.2.3 From cb0ab52652123c47cb72e665ce9fdd3029dcb175 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:28 +0100 Subject: partitions/msdos: remove LINUX_SWAP_PARTITION Just always use SOLARIS_X86_PARTITION and explain the situation, as that is less confusing than two names for a single value. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/msdos.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 88ee5ee7f442..e44e2f0a02cc 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -77,13 +77,19 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) p[2] == AIX_LABEL_MAGIC3 && p[3] == AIX_LABEL_MAGIC4)) return 0; - /* Assume the partition table is valid if Linux partitions exists */ + + /* + * Assume the partition table is valid if Linux partitions exists. + * Note that old Solaris/x86 partitions use the same indicator as + * Linux swap partitions, so we consider that a Linux partition as + * well. + */ for (slot = 1; slot <= 4; slot++, pt++) { - if (pt->sys_ind == LINUX_SWAP_PARTITION || - pt->sys_ind == LINUX_RAID_PARTITION || - pt->sys_ind == LINUX_DATA_PARTITION || - pt->sys_ind == LINUX_LVM_PARTITION || - is_extended_partition(pt)) + if (pt->sys_ind == SOLARIS_X86_PARTITION || + pt->sys_ind == LINUX_RAID_PARTITION || + pt->sys_ind == LINUX_DATA_PARTITION || + pt->sys_ind == LINUX_LVM_PARTITION || + is_extended_partition(pt)) return 0; } d = read_part_sector(state, 7, &sect); -- cgit v1.2.3 From 3f4fc59c1321b74df27dcd5d77b37989ed93265b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:29 +0100 Subject: block: move the various x86 Unix label formats out of genhd.h All these are just used in block/partitions/msdos.c, so move them out of the genhd.h header included by every driver. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/msdos.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) (limited to 'block') diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index e44e2f0a02cc..8f2fcc080264 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -18,6 +18,12 @@ * Check partition table on IDE disks for common CHS translations * * Re-organised Feb 1998 Russell King + * + * BSD disklabel support by Yossi Gottlieb + * updated by Marc Espie + * + * Unixware slices support by Andrzej Krzysztofowicz + * and Krzysztof G. Baranowski */ #include #include @@ -215,6 +221,30 @@ done: put_dev_sector(sect); } +#define SOLARIS_X86_NUMSLICE 16 +#define SOLARIS_X86_VTOC_SANE (0x600DDEEEUL) + +struct solaris_x86_slice { + __le16 s_tag; /* ID tag of partition */ + __le16 s_flag; /* permission flags */ + __le32 s_start; /* start sector no of partition */ + __le32 s_size; /* # of blocks in partition */ +}; + +struct solaris_x86_vtoc { + unsigned int v_bootinfo[3]; /* info needed by mboot */ + __le32 v_sanity; /* to verify vtoc sanity */ + __le32 v_version; /* layout version */ + char v_volume[8]; /* volume name */ + __le16 v_sectorsz; /* sector size in bytes */ + __le16 v_nparts; /* number of partitions */ + unsigned int v_reserved[10]; /* free space */ + struct solaris_x86_slice + v_slice[SOLARIS_X86_NUMSLICE]; /* slice headers */ + unsigned int timestamp[SOLARIS_X86_NUMSLICE]; /* timestamp */ + char v_asciilabel[128]; /* for compatibility */ +}; + /* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also indicates linux swap. Be careful before believing this is Solaris. 
*/ @@ -270,6 +300,54 @@ static void parse_solaris_x86(struct parsed_partitions *state, #endif } +/* check against BSD src/sys/sys/disklabel.h for consistency */ +#define BSD_DISKMAGIC (0x82564557UL) /* The disk magic number */ +#define BSD_MAXPARTITIONS 16 +#define OPENBSD_MAXPARTITIONS 16 +#define BSD_FS_UNUSED 0 /* disklabel unused partition entry ID */ +struct bsd_disklabel { + __le32 d_magic; /* the magic number */ + __s16 d_type; /* drive type */ + __s16 d_subtype; /* controller/d_type specific */ + char d_typename[16]; /* type name, e.g. "eagle" */ + char d_packname[16]; /* pack identifier */ + __u32 d_secsize; /* # of bytes per sector */ + __u32 d_nsectors; /* # of data sectors per track */ + __u32 d_ntracks; /* # of tracks per cylinder */ + __u32 d_ncylinders; /* # of data cylinders per unit */ + __u32 d_secpercyl; /* # of data sectors per cylinder */ + __u32 d_secperunit; /* # of data sectors per unit */ + __u16 d_sparespertrack; /* # of spare sectors per track */ + __u16 d_sparespercyl; /* # of spare sectors per cylinder */ + __u32 d_acylinders; /* # of alt. cylinders per unit */ + __u16 d_rpm; /* rotational speed */ + __u16 d_interleave; /* hardware sector interleave */ + __u16 d_trackskew; /* sector 0 skew, per track */ + __u16 d_cylskew; /* sector 0 skew, per cylinder */ + __u32 d_headswitch; /* head switch time, usec */ + __u32 d_trkseek; /* track-to-track seek, usec */ + __u32 d_flags; /* generic flags */ +#define NDDATA 5 + __u32 d_drivedata[NDDATA]; /* drive-type specific information */ +#define NSPARE 5 + __u32 d_spare[NSPARE]; /* reserved for future use */ + __le32 d_magic2; /* the magic number (again) */ + __le16 d_checksum; /* xor of data incl. 
partitions */ + + /* filesystem and partition information: */ + __le16 d_npartitions; /* number of partitions in following */ + __le32 d_bbsize; /* size of boot area at sn0, bytes */ + __le32 d_sbsize; /* max size of fs superblock, bytes */ + struct bsd_partition { /* the partition table */ + __le32 p_size; /* number of sectors in partition */ + __le32 p_offset; /* starting sector */ + __le32 p_fsize; /* filesystem basic fragment size */ + __u8 p_fstype; /* filesystem type, see below */ + __u8 p_frag; /* filesystem fragments per block */ + __le16 p_cpg; /* filesystem cylinders per group */ + } d_partitions[BSD_MAXPARTITIONS]; /* actually may be more */ +}; + #if defined(CONFIG_BSD_DISKLABEL) /* * Create devices for BSD partitions listed in a disklabel, under a @@ -354,6 +432,51 @@ static void parse_openbsd(struct parsed_partitions *state, #endif } +#define UNIXWARE_DISKMAGIC (0xCA5E600DUL) /* The disk magic number */ +#define UNIXWARE_DISKMAGIC2 (0x600DDEEEUL) /* The slice table magic nr */ +#define UNIXWARE_NUMSLICE 16 +#define UNIXWARE_FS_UNUSED 0 /* Unused slice entry ID */ + +struct unixware_slice { + __le16 s_label; /* label */ + __le16 s_flags; /* permission flags */ + __le32 start_sect; /* starting sector */ + __le32 nr_sects; /* number of sectors in slice */ +}; + +struct unixware_disklabel { + __le32 d_type; /* drive type */ + __le32 d_magic; /* the magic number */ + __le32 d_version; /* version number */ + char d_serial[12]; /* serial number of the device */ + __le32 d_ncylinders; /* # of data cylinders per device */ + __le32 d_ntracks; /* # of tracks per cylinder */ + __le32 d_nsectors; /* # of data sectors per track */ + __le32 d_secsize; /* # of bytes per sector */ + __le32 d_part_start; /* # of first sector of this partition*/ + __le32 d_unknown1[12]; /* ? 
*/ + __le32 d_alt_tbl; /* byte offset of alternate table */ + __le32 d_alt_len; /* byte length of alternate table */ + __le32 d_phys_cyl; /* # of physical cylinders per device */ + __le32 d_phys_trk; /* # of physical tracks per cylinder */ + __le32 d_phys_sec; /* # of physical sectors per track */ + __le32 d_phys_bytes; /* # of physical bytes per sector */ + __le32 d_unknown2; /* ? */ + __le32 d_unknown3; /* ? */ + __le32 d_pad[8]; /* pad */ + + struct unixware_vtoc { + __le32 v_magic; /* the magic number */ + __le32 v_version; /* version number */ + char v_name[8]; /* volume name */ + __le16 v_nslices; /* # of slices */ + __le16 v_unknown1; /* ? */ + __le32 v_reserved[10]; /* reserved */ + struct unixware_slice + v_slice[UNIXWARE_NUMSLICE]; /* slice headers */ + } vtoc; +}; /* 408 */ + /* * Create devices for Unixware partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. @@ -397,6 +520,8 @@ static void parse_unixware(struct parsed_partitions *state, #endif } +#define MINIX_NR_SUBPARTITIONS 4 + /* * Minix 2.0.0/2.0.2 subpartition support. * Anand Krishnamurthy -- cgit v1.2.3 From 387048bf67eeff8bdf7c2a41b03b48230a88b3d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:30 +0100 Subject: block: merge partition-generic.c and check.c Merge block/partition-generic.c and block/partitions/check.c into a single block/partitions/core.c as the content is closely related and both files are tiny. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/Makefile | 3 +- block/partition-generic.c | 509 ----------------------------------- block/partitions/Makefile | 3 +- block/partitions/check.c | 179 ------------- block/partitions/check.h | 5 - block/partitions/core.c | 659 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 661 insertions(+), 697 deletions(-) delete mode 100644 block/partition-generic.c delete mode 100644 block/partitions/check.c create mode 100644 block/partitions/core.c (limited to 'block') diff --git a/block/Makefile b/block/Makefile index 1a43750f4b01..206b96e9387f 100644 --- a/block/Makefile +++ b/block/Makefile @@ -8,8 +8,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ - genhd.o partition-generic.o ioprio.o \ - badblocks.o partitions/ blk-rq-qos.o + genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o diff --git a/block/partition-generic.c b/block/partition-generic.c deleted file mode 100644 index 4d771ae835ed..000000000000 --- a/block/partition-generic.c +++ /dev/null @@ -1,509 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Code extracted from drivers/block/genhd.c - * Copyright (C) 1991-1998 Linus Torvalds - * Re-organised Feb 1998 Russell King - * - * We now have independent partition support from the - * block drivers, which allows all the partition code to - * be grouped in one location, and it to be mostly self - * contained. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "blk.h" - -#include "partitions/check.h" - -static ssize_t part_partition_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->partno); -} - -static ssize_t part_start_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); -} - -static ssize_t part_ro_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->policy ? 1 : 0); -} - -static ssize_t part_alignment_offset_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); -} - -static ssize_t part_discard_alignment_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%u\n", p->discard_alignment); -} - -static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); -static DEVICE_ATTR(start, 0444, part_start_show, NULL); -static DEVICE_ATTR(size, 0444, part_size_show, NULL); -static DEVICE_ATTR(ro, 0444, part_ro_show, NULL); -static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL); -static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL); -static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); -static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); -#ifdef CONFIG_FAIL_MAKE_REQUEST -static struct device_attribute dev_attr_fail = - __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); -#endif - -static struct attribute *part_attrs[] = { - &dev_attr_partition.attr, - &dev_attr_start.attr, - &dev_attr_size.attr, - &dev_attr_ro.attr, - 
&dev_attr_alignment_offset.attr, - &dev_attr_discard_alignment.attr, - &dev_attr_stat.attr, - &dev_attr_inflight.attr, -#ifdef CONFIG_FAIL_MAKE_REQUEST - &dev_attr_fail.attr, -#endif - NULL -}; - -static struct attribute_group part_attr_group = { - .attrs = part_attrs, -}; - -static const struct attribute_group *part_attr_groups[] = { - &part_attr_group, -#ifdef CONFIG_BLK_DEV_IO_TRACE - &blk_trace_attr_group, -#endif - NULL -}; - -static void part_release(struct device *dev) -{ - struct hd_struct *p = dev_to_part(dev); - blk_free_devt(dev->devt); - hd_free_part(p); - kfree(p); -} - -static int part_uevent(struct device *dev, struct kobj_uevent_env *env) -{ - struct hd_struct *part = dev_to_part(dev); - - add_uevent_var(env, "PARTN=%u", part->partno); - if (part->info && part->info->volname[0]) - add_uevent_var(env, "PARTNAME=%s", part->info->volname); - return 0; -} - -struct device_type part_type = { - .name = "partition", - .groups = part_attr_groups, - .release = part_release, - .uevent = part_uevent, -}; - -static void delete_partition_work_fn(struct work_struct *work) -{ - struct hd_struct *part = container_of(to_rcu_work(work), struct hd_struct, - rcu_work); - - part->start_sect = 0; - part->nr_sects = 0; - part_stat_set_all(part, 0); - put_device(part_to_dev(part)); -} - -void __delete_partition(struct percpu_ref *ref) -{ - struct hd_struct *part = container_of(ref, struct hd_struct, ref); - INIT_RCU_WORK(&part->rcu_work, delete_partition_work_fn); - queue_rcu_work(system_wq, &part->rcu_work); -} - -/* - * Must be called either with bd_mutex held, before a disk can be opened or - * after all disk users are gone. 
- */ -void delete_partition(struct gendisk *disk, int partno) -{ - struct disk_part_tbl *ptbl = - rcu_dereference_protected(disk->part_tbl, 1); - struct hd_struct *part; - - if (partno >= ptbl->len) - return; - - part = rcu_dereference_protected(ptbl->part[partno], 1); - if (!part) - return; - - rcu_assign_pointer(ptbl->part[partno], NULL); - rcu_assign_pointer(ptbl->last_lookup, NULL); - kobject_put(part->holder_dir); - device_del(part_to_dev(part)); - - /* - * Remove gendisk pointer from idr so that it cannot be looked up - * while RCU period before freeing gendisk is running to prevent - * use-after-free issues. Note that the device number stays - * "in-use" until we really free the gendisk. - */ - blk_invalidate_devt(part_devt(part)); - hd_struct_kill(part); -} - -static ssize_t whole_disk_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return 0; -} -static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); - -/* - * Must be called either with bd_mutex held, before a disk can be opened or - * after all disk users are gone. - */ -struct hd_struct *add_partition(struct gendisk *disk, int partno, - sector_t start, sector_t len, int flags, - struct partition_meta_info *info) -{ - struct hd_struct *p; - dev_t devt = MKDEV(0, 0); - struct device *ddev = disk_to_dev(disk); - struct device *pdev; - struct disk_part_tbl *ptbl; - const char *dname; - int err; - - /* - * Partitions are not supported on zoned block devices that are used as - * such. 
- */ - switch (disk->queue->limits.zoned) { - case BLK_ZONED_HM: - pr_warn("%s: partitions not supported on host managed zoned block device\n", - disk->disk_name); - return ERR_PTR(-ENXIO); - case BLK_ZONED_HA: - pr_info("%s: disabling host aware zoned block device support due to partitions\n", - disk->disk_name); - disk->queue->limits.zoned = BLK_ZONED_NONE; - break; - case BLK_ZONED_NONE: - break; - } - - err = disk_expand_part_tbl(disk, partno); - if (err) - return ERR_PTR(err); - ptbl = rcu_dereference_protected(disk->part_tbl, 1); - - if (ptbl->part[partno]) - return ERR_PTR(-EBUSY); - - p = kzalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return ERR_PTR(-EBUSY); - - if (!init_part_stats(p)) { - err = -ENOMEM; - goto out_free; - } - - seqcount_init(&p->nr_sects_seq); - pdev = part_to_dev(p); - - p->start_sect = start; - p->alignment_offset = - queue_limit_alignment_offset(&disk->queue->limits, start); - p->discard_alignment = - queue_limit_discard_alignment(&disk->queue->limits, start); - p->nr_sects = len; - p->partno = partno; - p->policy = get_disk_ro(disk); - - if (info) { - struct partition_meta_info *pinfo; - - pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id); - if (!pinfo) { - err = -ENOMEM; - goto out_free_stats; - } - memcpy(pinfo, info, sizeof(*info)); - p->info = pinfo; - } - - dname = dev_name(ddev); - if (isdigit(dname[strlen(dname) - 1])) - dev_set_name(pdev, "%sp%d", dname, partno); - else - dev_set_name(pdev, "%s%d", dname, partno); - - device_initialize(pdev); - pdev->class = &block_class; - pdev->type = &part_type; - pdev->parent = ddev; - - err = blk_alloc_devt(p, &devt); - if (err) - goto out_free_info; - pdev->devt = devt; - - /* delay uevent until 'holders' subdir is created */ - dev_set_uevent_suppress(pdev, 1); - err = device_add(pdev); - if (err) - goto out_put; - - err = -ENOMEM; - p->holder_dir = kobject_create_and_add("holders", &pdev->kobj); - if (!p->holder_dir) - goto out_del; - - dev_set_uevent_suppress(pdev, 0); - if 
(flags & ADDPART_FLAG_WHOLEDISK) { - err = device_create_file(pdev, &dev_attr_whole_disk); - if (err) - goto out_del; - } - - err = hd_ref_init(p); - if (err) { - if (flags & ADDPART_FLAG_WHOLEDISK) - goto out_remove_file; - goto out_del; - } - - /* everything is up and running, commence */ - rcu_assign_pointer(ptbl->part[partno], p); - - /* suppress uevent if the disk suppresses it */ - if (!dev_get_uevent_suppress(ddev)) - kobject_uevent(&pdev->kobj, KOBJ_ADD); - return p; - -out_free_info: - kfree(p->info); -out_free_stats: - free_part_stats(p); -out_free: - kfree(p); - return ERR_PTR(err); -out_remove_file: - device_remove_file(pdev, &dev_attr_whole_disk); -out_del: - kobject_put(p->holder_dir); - device_del(pdev); -out_put: - put_device(pdev); - return ERR_PTR(err); -} - -static bool disk_unlock_native_capacity(struct gendisk *disk) -{ - const struct block_device_operations *bdops = disk->fops; - - if (bdops->unlock_native_capacity && - !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) { - printk(KERN_CONT "enabling native capacity\n"); - bdops->unlock_native_capacity(disk); - disk->flags |= GENHD_FL_NATIVE_CAPACITY; - return true; - } else { - printk(KERN_CONT "truncated\n"); - return false; - } -} - -int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev) -{ - struct disk_part_iter piter; - struct hd_struct *part; - int res; - - if (!disk_part_scan_enabled(disk)) - return 0; - if (bdev->bd_part_count || bdev->bd_super) - return -EBUSY; - res = invalidate_partition(disk, 0); - if (res) - return res; - - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) - delete_partition(disk, part->partno); - disk_part_iter_exit(&piter); - - return 0; -} - -static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, - struct parsed_partitions *state, int p) -{ - sector_t size = state->parts[p].size; - sector_t from = state->parts[p].from; - struct hd_struct *part; - - if (!size) - return 
true; - - if (from >= get_capacity(disk)) { - printk(KERN_WARNING - "%s: p%d start %llu is beyond EOD, ", - disk->disk_name, p, (unsigned long long) from); - if (disk_unlock_native_capacity(disk)) - return false; - return true; - } - - if (from + size > get_capacity(disk)) { - printk(KERN_WARNING - "%s: p%d size %llu extends beyond EOD, ", - disk->disk_name, p, (unsigned long long) size); - - if (disk_unlock_native_capacity(disk)) - return false; - - /* - * We can not ignore partitions of broken tables created by for - * example camera firmware, but we limit them to the end of the - * disk to avoid creating invalid block devices. - */ - size = get_capacity(disk) - from; - } - - part = add_partition(disk, p, from, size, state->parts[p].flags, - &state->parts[p].info); - if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) { - printk(KERN_ERR " %s: p%d could not be added: %ld\n", - disk->disk_name, p, -PTR_ERR(part)); - return true; - } - - if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && - (state->parts[p].flags & ADDPART_FLAG_RAID)) - md_autodetect_dev(part_to_dev(part)->devt); - - return true; -} - -int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) -{ - struct parsed_partitions *state; - int ret = -EAGAIN, p, highest; - - if (!disk_part_scan_enabled(disk)) - return 0; - - state = check_partition(disk, bdev); - if (!state) - return 0; - if (IS_ERR(state)) { - /* - * I/O error reading the partition table. If we tried to read - * beyond EOD, retry after unlocking the native capacity. - */ - if (PTR_ERR(state) == -ENOSPC) { - printk(KERN_WARNING "%s: partition table beyond EOD, ", - disk->disk_name); - if (disk_unlock_native_capacity(disk)) - return -EAGAIN; - } - return -EIO; - } - - /* - * Partitions are not supported on host managed zoned block devices. 
- */ - if (disk->queue->limits.zoned == BLK_ZONED_HM) { - pr_warn("%s: ignoring partition table on host managed zoned block device\n", - disk->disk_name); - ret = 0; - goto out_free_state; - } - - /* - * If we read beyond EOD, try unlocking native capacity even if the - * partition table was successfully read as we could be missing some - * partitions. - */ - if (state->access_beyond_eod) { - printk(KERN_WARNING - "%s: partition table partially beyond EOD, ", - disk->disk_name); - if (disk_unlock_native_capacity(disk)) - goto out_free_state; - } - - /* tell userspace that the media / partition table may have changed */ - kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); - - /* - * Detect the highest partition number and preallocate disk->part_tbl. - * This is an optimization and not strictly necessary. - */ - for (p = 1, highest = 0; p < state->limit; p++) - if (state->parts[p].size) - highest = p; - disk_expand_part_tbl(disk, highest); - - for (p = 1; p < state->limit; p++) - if (!blk_add_partition(disk, bdev, state, p)) - goto out_free_state; - - ret = 0; -out_free_state: - free_partitions(state); - return ret; -} - -void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) -{ - struct address_space *mapping = state->bdev->bd_inode->i_mapping; - struct page *page; - - if (n >= get_capacity(state->bdev->bd_disk)) { - state->access_beyond_eod = true; - return NULL; - } - - page = read_mapping_page(mapping, - (pgoff_t)(n >> (PAGE_SHIFT - 9)), NULL); - if (IS_ERR(page)) - goto out; - if (PageError(page)) - goto out_put_page; - - p->v = page; - return (unsigned char *)page_address(page) + - ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << SECTOR_SHIFT); -out_put_page: - put_page(page); -out: - p->v = NULL; - return NULL; -} diff --git a/block/partitions/Makefile b/block/partitions/Makefile index 2f276b677c81..a7f05cdb02a8 100644 --- a/block/partitions/Makefile +++ b/block/partitions/Makefile @@ -3,8 +3,7 @@ # Makefile for the linux kernel. 
# -obj-$(CONFIG_BLOCK) := check.o - +obj-$(CONFIG_BLOCK) += core.o obj-$(CONFIG_ACORN_PARTITION) += acorn.o obj-$(CONFIG_AMIGA_PARTITION) += amiga.o obj-$(CONFIG_ATARI_PARTITION) += atari.o diff --git a/block/partitions/check.c b/block/partitions/check.c deleted file mode 100644 index 944c478b6f0b..000000000000 --- a/block/partitions/check.c +++ /dev/null @@ -1,179 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * fs/partitions/check.c - * - * Code extracted from drivers/block/genhd.c - * Copyright (C) 1991-1998 Linus Torvalds - * Re-organised Feb 1998 Russell King - * - * We now have independent partition support from the - * block drivers, which allows all the partition code to - * be grouped in one location, and it to be mostly self - * contained. - * - * Added needed MAJORS for new pairs, {hdi,hdj}, {hdk,hdl} - */ - -#include -#include -#include -#include - -#include "check.h" - -static int (*check_part[])(struct parsed_partitions *) = { - /* - * Probe partition formats with tables at disk address 0 - * that also have an ADFS boot block at 0xdc0. - */ -#ifdef CONFIG_ACORN_PARTITION_ICS - adfspart_check_ICS, -#endif -#ifdef CONFIG_ACORN_PARTITION_POWERTEC - adfspart_check_POWERTEC, -#endif -#ifdef CONFIG_ACORN_PARTITION_EESOX - adfspart_check_EESOX, -#endif - - /* - * Now move on to formats that only have partition info at - * disk address 0xdc0. Since these may also have stale - * PC/BIOS partition tables, they need to come before - * the msdos entry. 
- */ -#ifdef CONFIG_ACORN_PARTITION_CUMANA - adfspart_check_CUMANA, -#endif -#ifdef CONFIG_ACORN_PARTITION_ADFS - adfspart_check_ADFS, -#endif - -#ifdef CONFIG_CMDLINE_PARTITION - cmdline_partition, -#endif -#ifdef CONFIG_EFI_PARTITION - efi_partition, /* this must come before msdos */ -#endif -#ifdef CONFIG_SGI_PARTITION - sgi_partition, -#endif -#ifdef CONFIG_LDM_PARTITION - ldm_partition, /* this must come before msdos */ -#endif -#ifdef CONFIG_MSDOS_PARTITION - msdos_partition, -#endif -#ifdef CONFIG_OSF_PARTITION - osf_partition, -#endif -#ifdef CONFIG_SUN_PARTITION - sun_partition, -#endif -#ifdef CONFIG_AMIGA_PARTITION - amiga_partition, -#endif -#ifdef CONFIG_ATARI_PARTITION - atari_partition, -#endif -#ifdef CONFIG_MAC_PARTITION - mac_partition, -#endif -#ifdef CONFIG_ULTRIX_PARTITION - ultrix_partition, -#endif -#ifdef CONFIG_IBM_PARTITION - ibm_partition, -#endif -#ifdef CONFIG_KARMA_PARTITION - karma_partition, -#endif -#ifdef CONFIG_SYSV68_PARTITION - sysv68_partition, -#endif - NULL -}; - -static struct parsed_partitions *allocate_partitions(struct gendisk *hd) -{ - struct parsed_partitions *state; - int nr; - - state = kzalloc(sizeof(*state), GFP_KERNEL); - if (!state) - return NULL; - - nr = disk_max_parts(hd); - state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); - if (!state->parts) { - kfree(state); - return NULL; - } - - state->limit = nr; - - return state; -} - -void free_partitions(struct parsed_partitions *state) -{ - vfree(state->parts); - kfree(state); -} - -struct parsed_partitions * -check_partition(struct gendisk *hd, struct block_device *bdev) -{ - struct parsed_partitions *state; - int i, res, err; - - state = allocate_partitions(hd); - if (!state) - return NULL; - state->pp_buf = (char *)__get_free_page(GFP_KERNEL); - if (!state->pp_buf) { - free_partitions(state); - return NULL; - } - state->pp_buf[0] = '\0'; - - state->bdev = bdev; - disk_name(hd, 0, state->name); - snprintf(state->pp_buf, PAGE_SIZE, " %s:", 
state->name); - if (isdigit(state->name[strlen(state->name)-1])) - sprintf(state->name, "p"); - - i = res = err = 0; - while (!res && check_part[i]) { - memset(state->parts, 0, state->limit * sizeof(state->parts[0])); - res = check_part[i++](state); - if (res < 0) { - /* We have hit an I/O error which we don't report now. - * But record it, and let the others do their job. - */ - err = res; - res = 0; - } - - } - if (res > 0) { - printk(KERN_INFO "%s", state->pp_buf); - - free_page((unsigned long)state->pp_buf); - return state; - } - if (state->access_beyond_eod) - err = -ENOSPC; - if (err) - /* The partition is unrecognized. So report I/O errors if there were any */ - res = err; - if (res) { - strlcat(state->pp_buf, - " unable to read partition table\n", PAGE_SIZE); - printk(KERN_INFO "%s", state->pp_buf); - } - - free_page((unsigned long)state->pp_buf); - free_partitions(state); - return ERR_PTR(res); -} diff --git a/block/partitions/check.h b/block/partitions/check.h index 23e7adb79617..f845355489ec 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -23,11 +23,6 @@ struct parsed_partitions { char *pp_buf; }; -void free_partitions(struct parsed_partitions *state); - -struct parsed_partitions * -check_partition(struct gendisk *, struct block_device *); - typedef struct { struct page *v; } Sector; diff --git a/block/partitions/core.c b/block/partitions/core.c new file mode 100644 index 000000000000..b442bc209b86 --- /dev/null +++ b/block/partitions/core.c @@ -0,0 +1,659 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ +#include +#include +#include +#include +#include +#include +#include +#include "../blk.h" +#include "check.h" + +static int (*check_part[])(struct parsed_partitions *) = { + /* + * Probe partition formats with tables at disk address 0 + * that also have an ADFS boot block at 0xdc0. 
+ */ +#ifdef CONFIG_ACORN_PARTITION_ICS + adfspart_check_ICS, +#endif +#ifdef CONFIG_ACORN_PARTITION_POWERTEC + adfspart_check_POWERTEC, +#endif +#ifdef CONFIG_ACORN_PARTITION_EESOX + adfspart_check_EESOX, +#endif + + /* + * Now move on to formats that only have partition info at + * disk address 0xdc0. Since these may also have stale + * PC/BIOS partition tables, they need to come before + * the msdos entry. + */ +#ifdef CONFIG_ACORN_PARTITION_CUMANA + adfspart_check_CUMANA, +#endif +#ifdef CONFIG_ACORN_PARTITION_ADFS + adfspart_check_ADFS, +#endif + +#ifdef CONFIG_CMDLINE_PARTITION + cmdline_partition, +#endif +#ifdef CONFIG_EFI_PARTITION + efi_partition, /* this must come before msdos */ +#endif +#ifdef CONFIG_SGI_PARTITION + sgi_partition, +#endif +#ifdef CONFIG_LDM_PARTITION + ldm_partition, /* this must come before msdos */ +#endif +#ifdef CONFIG_MSDOS_PARTITION + msdos_partition, +#endif +#ifdef CONFIG_OSF_PARTITION + osf_partition, +#endif +#ifdef CONFIG_SUN_PARTITION + sun_partition, +#endif +#ifdef CONFIG_AMIGA_PARTITION + amiga_partition, +#endif +#ifdef CONFIG_ATARI_PARTITION + atari_partition, +#endif +#ifdef CONFIG_MAC_PARTITION + mac_partition, +#endif +#ifdef CONFIG_ULTRIX_PARTITION + ultrix_partition, +#endif +#ifdef CONFIG_IBM_PARTITION + ibm_partition, +#endif +#ifdef CONFIG_KARMA_PARTITION + karma_partition, +#endif +#ifdef CONFIG_SYSV68_PARTITION + sysv68_partition, +#endif + NULL +}; + +static struct parsed_partitions *allocate_partitions(struct gendisk *hd) +{ + struct parsed_partitions *state; + int nr; + + state = kzalloc(sizeof(*state), GFP_KERNEL); + if (!state) + return NULL; + + nr = disk_max_parts(hd); + state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); + if (!state->parts) { + kfree(state); + return NULL; + } + + state->limit = nr; + + return state; +} + +static void free_partitions(struct parsed_partitions *state) +{ + vfree(state->parts); + kfree(state); +} + +static struct parsed_partitions *check_partition(struct 
gendisk *hd, + struct block_device *bdev) +{ + struct parsed_partitions *state; + int i, res, err; + + state = allocate_partitions(hd); + if (!state) + return NULL; + state->pp_buf = (char *)__get_free_page(GFP_KERNEL); + if (!state->pp_buf) { + free_partitions(state); + return NULL; + } + state->pp_buf[0] = '\0'; + + state->bdev = bdev; + disk_name(hd, 0, state->name); + snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); + if (isdigit(state->name[strlen(state->name)-1])) + sprintf(state->name, "p"); + + i = res = err = 0; + while (!res && check_part[i]) { + memset(state->parts, 0, state->limit * sizeof(state->parts[0])); + res = check_part[i++](state); + if (res < 0) { + /* + * We have hit an I/O error which we don't report now. + * But record it, and let the others do their job. + */ + err = res; + res = 0; + } + + } + if (res > 0) { + printk(KERN_INFO "%s", state->pp_buf); + + free_page((unsigned long)state->pp_buf); + return state; + } + if (state->access_beyond_eod) + err = -ENOSPC; + /* + * The partition is unrecognized. So report I/O errors if there were any + */ + if (err) + res = err; + if (res) { + strlcat(state->pp_buf, + " unable to read partition table\n", PAGE_SIZE); + printk(KERN_INFO "%s", state->pp_buf); + } + + free_page((unsigned long)state->pp_buf); + free_partitions(state); + return ERR_PTR(res); +} + +static ssize_t part_partition_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->partno); +} + +static ssize_t part_start_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); +} + +static ssize_t part_ro_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%d\n", p->policy ? 
1 : 0); +} + +static ssize_t part_alignment_offset_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); +} + +static ssize_t part_discard_alignment_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%u\n", p->discard_alignment); +} + +static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); +static DEVICE_ATTR(start, 0444, part_start_show, NULL); +static DEVICE_ATTR(size, 0444, part_size_show, NULL); +static DEVICE_ATTR(ro, 0444, part_ro_show, NULL); +static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL); +static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL); +static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); +static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); +#ifdef CONFIG_FAIL_MAKE_REQUEST +static struct device_attribute dev_attr_fail = + __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); +#endif + +static struct attribute *part_attrs[] = { + &dev_attr_partition.attr, + &dev_attr_start.attr, + &dev_attr_size.attr, + &dev_attr_ro.attr, + &dev_attr_alignment_offset.attr, + &dev_attr_discard_alignment.attr, + &dev_attr_stat.attr, + &dev_attr_inflight.attr, +#ifdef CONFIG_FAIL_MAKE_REQUEST + &dev_attr_fail.attr, +#endif + NULL +}; + +static struct attribute_group part_attr_group = { + .attrs = part_attrs, +}; + +static const struct attribute_group *part_attr_groups[] = { + &part_attr_group, +#ifdef CONFIG_BLK_DEV_IO_TRACE + &blk_trace_attr_group, +#endif + NULL +}; + +static void part_release(struct device *dev) +{ + struct hd_struct *p = dev_to_part(dev); + blk_free_devt(dev->devt); + hd_free_part(p); + kfree(p); +} + +static int part_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct hd_struct *part = dev_to_part(dev); + + add_uevent_var(env, 
"PARTN=%u", part->partno); + if (part->info && part->info->volname[0]) + add_uevent_var(env, "PARTNAME=%s", part->info->volname); + return 0; +} + +struct device_type part_type = { + .name = "partition", + .groups = part_attr_groups, + .release = part_release, + .uevent = part_uevent, +}; + +static void delete_partition_work_fn(struct work_struct *work) +{ + struct hd_struct *part = container_of(to_rcu_work(work), struct hd_struct, + rcu_work); + + part->start_sect = 0; + part->nr_sects = 0; + part_stat_set_all(part, 0); + put_device(part_to_dev(part)); +} + +void __delete_partition(struct percpu_ref *ref) +{ + struct hd_struct *part = container_of(ref, struct hd_struct, ref); + INIT_RCU_WORK(&part->rcu_work, delete_partition_work_fn); + queue_rcu_work(system_wq, &part->rcu_work); +} + +/* + * Must be called either with bd_mutex held, before a disk can be opened or + * after all disk users are gone. + */ +void delete_partition(struct gendisk *disk, int partno) +{ + struct disk_part_tbl *ptbl = + rcu_dereference_protected(disk->part_tbl, 1); + struct hd_struct *part; + + if (partno >= ptbl->len) + return; + + part = rcu_dereference_protected(ptbl->part[partno], 1); + if (!part) + return; + + rcu_assign_pointer(ptbl->part[partno], NULL); + rcu_assign_pointer(ptbl->last_lookup, NULL); + kobject_put(part->holder_dir); + device_del(part_to_dev(part)); + + /* + * Remove gendisk pointer from idr so that it cannot be looked up + * while RCU period before freeing gendisk is running to prevent + * use-after-free issues. Note that the device number stays + * "in-use" until we really free the gendisk. + */ + blk_invalidate_devt(part_devt(part)); + hd_struct_kill(part); +} + +static ssize_t whole_disk_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return 0; +} +static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); + +/* + * Must be called either with bd_mutex held, before a disk can be opened or + * after all disk users are gone. 
+ */ +struct hd_struct *add_partition(struct gendisk *disk, int partno, + sector_t start, sector_t len, int flags, + struct partition_meta_info *info) +{ + struct hd_struct *p; + dev_t devt = MKDEV(0, 0); + struct device *ddev = disk_to_dev(disk); + struct device *pdev; + struct disk_part_tbl *ptbl; + const char *dname; + int err; + + /* + * Partitions are not supported on zoned block devices that are used as + * such. + */ + switch (disk->queue->limits.zoned) { + case BLK_ZONED_HM: + pr_warn("%s: partitions not supported on host managed zoned block device\n", + disk->disk_name); + return ERR_PTR(-ENXIO); + case BLK_ZONED_HA: + pr_info("%s: disabling host aware zoned block device support due to partitions\n", + disk->disk_name); + disk->queue->limits.zoned = BLK_ZONED_NONE; + break; + case BLK_ZONED_NONE: + break; + } + + err = disk_expand_part_tbl(disk, partno); + if (err) + return ERR_PTR(err); + ptbl = rcu_dereference_protected(disk->part_tbl, 1); + + if (ptbl->part[partno]) + return ERR_PTR(-EBUSY); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return ERR_PTR(-EBUSY); + + if (!init_part_stats(p)) { + err = -ENOMEM; + goto out_free; + } + + seqcount_init(&p->nr_sects_seq); + pdev = part_to_dev(p); + + p->start_sect = start; + p->alignment_offset = + queue_limit_alignment_offset(&disk->queue->limits, start); + p->discard_alignment = + queue_limit_discard_alignment(&disk->queue->limits, start); + p->nr_sects = len; + p->partno = partno; + p->policy = get_disk_ro(disk); + + if (info) { + struct partition_meta_info *pinfo; + + pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id); + if (!pinfo) { + err = -ENOMEM; + goto out_free_stats; + } + memcpy(pinfo, info, sizeof(*info)); + p->info = pinfo; + } + + dname = dev_name(ddev); + if (isdigit(dname[strlen(dname) - 1])) + dev_set_name(pdev, "%sp%d", dname, partno); + else + dev_set_name(pdev, "%s%d", dname, partno); + + device_initialize(pdev); + pdev->class = &block_class; + pdev->type = &part_type; + 
pdev->parent = ddev; + + err = blk_alloc_devt(p, &devt); + if (err) + goto out_free_info; + pdev->devt = devt; + + /* delay uevent until 'holders' subdir is created */ + dev_set_uevent_suppress(pdev, 1); + err = device_add(pdev); + if (err) + goto out_put; + + err = -ENOMEM; + p->holder_dir = kobject_create_and_add("holders", &pdev->kobj); + if (!p->holder_dir) + goto out_del; + + dev_set_uevent_suppress(pdev, 0); + if (flags & ADDPART_FLAG_WHOLEDISK) { + err = device_create_file(pdev, &dev_attr_whole_disk); + if (err) + goto out_del; + } + + err = hd_ref_init(p); + if (err) { + if (flags & ADDPART_FLAG_WHOLEDISK) + goto out_remove_file; + goto out_del; + } + + /* everything is up and running, commence */ + rcu_assign_pointer(ptbl->part[partno], p); + + /* suppress uevent if the disk suppresses it */ + if (!dev_get_uevent_suppress(ddev)) + kobject_uevent(&pdev->kobj, KOBJ_ADD); + return p; + +out_free_info: + kfree(p->info); +out_free_stats: + free_part_stats(p); +out_free: + kfree(p); + return ERR_PTR(err); +out_remove_file: + device_remove_file(pdev, &dev_attr_whole_disk); +out_del: + kobject_put(p->holder_dir); + device_del(pdev); +out_put: + put_device(pdev); + return ERR_PTR(err); +} + +static bool disk_unlock_native_capacity(struct gendisk *disk) +{ + const struct block_device_operations *bdops = disk->fops; + + if (bdops->unlock_native_capacity && + !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) { + printk(KERN_CONT "enabling native capacity\n"); + bdops->unlock_native_capacity(disk); + disk->flags |= GENHD_FL_NATIVE_CAPACITY; + return true; + } else { + printk(KERN_CONT "truncated\n"); + return false; + } +} + +int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev) +{ + struct disk_part_iter piter; + struct hd_struct *part; + int res; + + if (!disk_part_scan_enabled(disk)) + return 0; + if (bdev->bd_part_count || bdev->bd_super) + return -EBUSY; + res = invalidate_partition(disk, 0); + if (res) + return res; + + disk_part_iter_init(&piter, 
disk, DISK_PITER_INCL_EMPTY); + while ((part = disk_part_iter_next(&piter))) + delete_partition(disk, part->partno); + disk_part_iter_exit(&piter); + + return 0; +} + +static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, + struct parsed_partitions *state, int p) +{ + sector_t size = state->parts[p].size; + sector_t from = state->parts[p].from; + struct hd_struct *part; + + if (!size) + return true; + + if (from >= get_capacity(disk)) { + printk(KERN_WARNING + "%s: p%d start %llu is beyond EOD, ", + disk->disk_name, p, (unsigned long long) from); + if (disk_unlock_native_capacity(disk)) + return false; + return true; + } + + if (from + size > get_capacity(disk)) { + printk(KERN_WARNING + "%s: p%d size %llu extends beyond EOD, ", + disk->disk_name, p, (unsigned long long) size); + + if (disk_unlock_native_capacity(disk)) + return false; + + /* + * We can not ignore partitions of broken tables created by for + * example camera firmware, but we limit them to the end of the + * disk to avoid creating invalid block devices. + */ + size = get_capacity(disk) - from; + } + + part = add_partition(disk, p, from, size, state->parts[p].flags, + &state->parts[p].info); + if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) { + printk(KERN_ERR " %s: p%d could not be added: %ld\n", + disk->disk_name, p, -PTR_ERR(part)); + return true; + } + + if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && + (state->parts[p].flags & ADDPART_FLAG_RAID)) + md_autodetect_dev(part_to_dev(part)->devt); + + return true; +} + +int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) +{ + struct parsed_partitions *state; + int ret = -EAGAIN, p, highest; + + if (!disk_part_scan_enabled(disk)) + return 0; + + state = check_partition(disk, bdev); + if (!state) + return 0; + if (IS_ERR(state)) { + /* + * I/O error reading the partition table. If we tried to read + * beyond EOD, retry after unlocking the native capacity. 
+ */ + if (PTR_ERR(state) == -ENOSPC) { + printk(KERN_WARNING "%s: partition table beyond EOD, ", + disk->disk_name); + if (disk_unlock_native_capacity(disk)) + return -EAGAIN; + } + return -EIO; + } + + /* + * Partitions are not supported on host managed zoned block devices. + */ + if (disk->queue->limits.zoned == BLK_ZONED_HM) { + pr_warn("%s: ignoring partition table on host managed zoned block device\n", + disk->disk_name); + ret = 0; + goto out_free_state; + } + + /* + * If we read beyond EOD, try unlocking native capacity even if the + * partition table was successfully read as we could be missing some + * partitions. + */ + if (state->access_beyond_eod) { + printk(KERN_WARNING + "%s: partition table partially beyond EOD, ", + disk->disk_name); + if (disk_unlock_native_capacity(disk)) + goto out_free_state; + } + + /* tell userspace that the media / partition table may have changed */ + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + + /* + * Detect the highest partition number and preallocate disk->part_tbl. + * This is an optimization and not strictly necessary. 
+ */ + for (p = 1, highest = 0; p < state->limit; p++) + if (state->parts[p].size) + highest = p; + disk_expand_part_tbl(disk, highest); + + for (p = 1; p < state->limit; p++) + if (!blk_add_partition(disk, bdev, state, p)) + goto out_free_state; + + ret = 0; +out_free_state: + free_partitions(state); + return ret; +} + +void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) +{ + struct address_space *mapping = state->bdev->bd_inode->i_mapping; + struct page *page; + + if (n >= get_capacity(state->bdev->bd_disk)) { + state->access_beyond_eod = true; + return NULL; + } + + page = read_mapping_page(mapping, + (pgoff_t)(n >> (PAGE_SHIFT - 9)), NULL); + if (IS_ERR(page)) + goto out; + if (PageError(page)) + goto out_put_page; + + p->v = page; + return (unsigned char *)page_address(page) + + ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << SECTOR_SHIFT); +out_put_page: + put_page(page); +out: + p->v = NULL; + return NULL; +} -- cgit v1.2.3 From 2b8bd423614c595540eaadcfbc702afe8e155e50 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 25 Mar 2020 16:07:04 +0300 Subject: block/diskstats: more accurate approximation of io_ticks for slow disks Currently io_ticks is approximated by adding one at each start and end of requests if jiffies counter has changed. This works perfectly for requests shorter than a jiffy or if one of requests starts/ends at each jiffy. If disk executes just one request at a time and they are longer than two jiffies then only first and last jiffies will be accounted. Fix is simple: at the end of request add up into io_ticks jiffies passed since last update rather than just one jiffy. Example: common HDD executes random read 4k requests around 12ms. 
fio --name=test --filename=/dev/sdb --rw=randread --direct=1 --runtime=30 & iostat -x 10 sdb Note changes of iostat's "%util" 8,43% -> 99,99% before/after patch: Before: Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util sdb 0,00 0,00 82,60 0,00 330,40 0,00 8,00 0,96 12,09 12,09 0,00 1,02 8,43 After: Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util sdb 0,00 0,00 82,50 0,00 330,00 0,00 8,00 1,00 12,10 12,10 0,00 12,12 99,99 Now io_ticks does not lose time between start and end of requests, but for queue-depth > 1 some I/O time between adjacent starts might be lost. For load estimation "%util" is not as useful as average queue length, but it clearly shows how often disk queue is completely empty. Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting") Signed-off-by: Konstantin Khlebnikov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 8 ++++---- block/blk-core.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 209715765a7a..68f65ef2ceba 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1768,14 +1768,14 @@ defer: schedule_work(&bio_dirty_work); } -void update_io_ticks(struct hd_struct *part, unsigned long now) +void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) { unsigned long stamp; again: stamp = READ_ONCE(part->stamp); if (unlikely(stamp != now)) { if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) { - __part_stat_add(part, io_ticks, 1); + __part_stat_add(part, io_ticks, end ?
now - stamp : 1); } } if (part->partno) { @@ -1791,7 +1791,7 @@ void generic_start_io_acct(struct request_queue *q, int op, part_stat_lock(); - update_io_ticks(part, jiffies); + update_io_ticks(part, jiffies, false); part_stat_inc(part, ios[sgrp]); part_stat_add(part, sectors[sgrp], sectors); part_inc_in_flight(q, part, op_is_write(op)); @@ -1809,7 +1809,7 @@ void generic_end_io_acct(struct request_queue *q, int req_op, part_stat_lock(); - update_io_ticks(part, now); + update_io_ticks(part, now, true); part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); part_stat_add(part, time_in_queue, duration); part_dec_in_flight(q, part, op_is_write(req_op)); diff --git a/block/blk-core.c b/block/blk-core.c index abfdcf81a228..4401b30a1751 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1337,7 +1337,7 @@ void blk_account_io_done(struct request *req, u64 now) part_stat_lock(); part = req->part; - update_io_ticks(part, jiffies); + update_io_ticks(part, jiffies, true); part_stat_inc(part, ios[sgrp]); part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); part_stat_add(part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns)); @@ -1379,7 +1379,7 @@ void blk_account_io_start(struct request *rq, bool new_io) rq->part = part; } - update_io_ticks(part, jiffies); + update_io_ticks(part, jiffies, false); part_stat_unlock(); } -- cgit v1.2.3 From ea18e0f0a63af9064db3d4065d90fa743ae0991b Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 25 Mar 2020 16:07:06 +0300 Subject: block/diskstats: accumulate all per-cpu counters in one pass Reading /proc/diskstats iterates over all cpus for summing each field. It's faster to sum all fields in one pass. 
Hammering /proc/diskstats with fio shows 2x performance improvement: fio --name=test --numjobs=$JOBS --filename=/proc/diskstats \ --size=1k --bs=1k --fallocate=none --create_on_open=1 \ --time_based=1 --runtime=10 --invalidate=0 --group_report JOBS=1 JOBS=10 Before: 7k iops 64k iops After: 18k iops 120k iops Also this way code is more compact: add/remove: 1/0 grow/shrink: 0/2 up/down: 194/-1540 (-1346) Function old new delta part_stat_read_all - 194 +194 diskstats_show 1344 631 -713 part_stat_show 1219 392 -827 Total: Before=14966947, After=14965601, chg -0.01% Signed-off-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- block/genhd.c | 102 ++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 70 insertions(+), 32 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index f7d60b620b97..9eb981f7e5a4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -92,6 +92,34 @@ const char *bdevname(struct block_device *bdev, char *buf) } EXPORT_SYMBOL(bdevname); +#ifdef CONFIG_SMP +static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) +{ + int cpu; + + memset(stat, 0, sizeof(struct disk_stats)); + for_each_possible_cpu(cpu) { + struct disk_stats *ptr = per_cpu_ptr(part->dkstats, cpu); + int group; + + for (group = 0; group < NR_STAT_GROUPS; group++) { + stat->nsecs[group] += ptr->nsecs[group]; + stat->sectors[group] += ptr->sectors[group]; + stat->ios[group] += ptr->ios[group]; + stat->merges[group] += ptr->merges[group]; + } + + stat->io_ticks += ptr->io_ticks; + stat->time_in_queue += ptr->time_in_queue; + } +} +#else /* CONFIG_SMP */ +static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) +{ + memcpy(stat, &part->dkstats, sizeof(struct disk_stats)); +} +#endif /* CONFIG_SMP */ + void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { if (queue_is_mq(q)) @@ -1214,9 +1242,12 @@ ssize_t part_stat_show(struct device *dev, { struct hd_struct *p = 
dev_to_part(dev); struct request_queue *q = part_to_disk(p)->queue; + struct disk_stats stat; unsigned int inflight; + part_stat_read_all(p, &stat); inflight = part_in_flight(q, p); + return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " @@ -1224,23 +1255,23 @@ ssize_t part_stat_show(struct device *dev, "%8lu %8lu %8llu %8u " "%8lu %8u" "\n", - part_stat_read(p, ios[STAT_READ]), - part_stat_read(p, merges[STAT_READ]), - (unsigned long long)part_stat_read(p, sectors[STAT_READ]), - (unsigned int)part_stat_read_msecs(p, STAT_READ), - part_stat_read(p, ios[STAT_WRITE]), - part_stat_read(p, merges[STAT_WRITE]), - (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), - (unsigned int)part_stat_read_msecs(p, STAT_WRITE), + stat.ios[STAT_READ], + stat.merges[STAT_READ], + (unsigned long long)stat.sectors[STAT_READ], + (unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC), + stat.ios[STAT_WRITE], + stat.merges[STAT_WRITE], + (unsigned long long)stat.sectors[STAT_WRITE], + (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC), inflight, - jiffies_to_msecs(part_stat_read(p, io_ticks)), - jiffies_to_msecs(part_stat_read(p, time_in_queue)), - part_stat_read(p, ios[STAT_DISCARD]), - part_stat_read(p, merges[STAT_DISCARD]), - (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), - (unsigned int)part_stat_read_msecs(p, STAT_DISCARD), - part_stat_read(p, ios[STAT_FLUSH]), - (unsigned int)part_stat_read_msecs(p, STAT_FLUSH)); + jiffies_to_msecs(stat.io_ticks), + jiffies_to_msecs(stat.time_in_queue), + stat.ios[STAT_DISCARD], + stat.merges[STAT_DISCARD], + (unsigned long long)stat.sectors[STAT_DISCARD], + (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC), + stat.ios[STAT_FLUSH], + (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); } ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, @@ -1492,6 +1523,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) struct hd_struct *hd; char 
buf[BDEVNAME_SIZE]; unsigned int inflight; + struct disk_stats stat; /* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) @@ -1503,7 +1535,9 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { + part_stat_read_all(hd, &stat); inflight = part_in_flight(gp->queue, hd); + seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " "%lu %lu %lu %u " @@ -1513,23 +1547,27 @@ static int diskstats_show(struct seq_file *seqf, void *v) "\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), - part_stat_read(hd, ios[STAT_READ]), - part_stat_read(hd, merges[STAT_READ]), - part_stat_read(hd, sectors[STAT_READ]), - (unsigned int)part_stat_read_msecs(hd, STAT_READ), - part_stat_read(hd, ios[STAT_WRITE]), - part_stat_read(hd, merges[STAT_WRITE]), - part_stat_read(hd, sectors[STAT_WRITE]), - (unsigned int)part_stat_read_msecs(hd, STAT_WRITE), + stat.ios[STAT_READ], + stat.merges[STAT_READ], + stat.sectors[STAT_READ], + (unsigned int)div_u64(stat.nsecs[STAT_READ], + NSEC_PER_MSEC), + stat.ios[STAT_WRITE], + stat.merges[STAT_WRITE], + stat.sectors[STAT_WRITE], + (unsigned int)div_u64(stat.nsecs[STAT_WRITE], + NSEC_PER_MSEC), inflight, - jiffies_to_msecs(part_stat_read(hd, io_ticks)), - jiffies_to_msecs(part_stat_read(hd, time_in_queue)), - part_stat_read(hd, ios[STAT_DISCARD]), - part_stat_read(hd, merges[STAT_DISCARD]), - part_stat_read(hd, sectors[STAT_DISCARD]), - (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD), - part_stat_read(hd, ios[STAT_FLUSH]), - (unsigned int)part_stat_read_msecs(hd, STAT_FLUSH) + jiffies_to_msecs(stat.io_ticks), + jiffies_to_msecs(stat.time_in_queue), + stat.ios[STAT_DISCARD], + stat.merges[STAT_DISCARD], + stat.sectors[STAT_DISCARD], + (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], + NSEC_PER_MSEC), + stat.ios[STAT_FLUSH], + (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC) ); } 
disk_part_iter_exit(&piter); -- cgit v1.2.3 From 8cd5b8fc00716fb71f6b32d594b38a8f286d6c20 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 25 Mar 2020 16:07:08 +0300 Subject: block/diskstats: replace time_in_queue with sum of request times Column "time_in_queue" in diskstats is supposed to show total waiting time of all requests. I.e. value should be equal to the sum of times from other columns. But this is not true, because column "time_in_queue" is counted separately in jiffies rather than in nanoseconds as other times. This patch removes redundant counter for "time_in_queue" and shows total time of read, write, discard and flush requests. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- block/bio.c | 1 - block/blk-core.c | 1 - block/genhd.c | 13 ++++++++++--- 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 68f65ef2ceba..bc9152977bf0 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1811,7 +1811,6 @@ void generic_end_io_acct(struct request_queue *q, int req_op, update_io_ticks(part, now, true); part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); - part_stat_add(part, time_in_queue, duration); part_dec_in_flight(q, part, op_is_write(req_op)); part_stat_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index 4401b30a1751..eaf6cb3887e6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1340,7 +1340,6 @@ void blk_account_io_done(struct request *req, u64 now) update_io_ticks(part, jiffies, true); part_stat_inc(part, ios[sgrp]); part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); - part_stat_add(part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns)); part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); diff --git a/block/genhd.c b/block/genhd.c index 9eb981f7e5a4..792356e922a1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -110,7 +110,6 @@ static void part_stat_read_all(struct hd_struct *part, struct 
disk_stats *stat) } stat->io_ticks += ptr->io_ticks; - stat->time_in_queue += ptr->time_in_queue; } } #else /* CONFIG_SMP */ @@ -1265,7 +1264,11 @@ ssize_t part_stat_show(struct device *dev, (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC), inflight, jiffies_to_msecs(stat.io_ticks), - jiffies_to_msecs(stat.time_in_queue), + (unsigned int)div_u64(stat.nsecs[STAT_READ] + + stat.nsecs[STAT_WRITE] + + stat.nsecs[STAT_DISCARD] + + stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC), stat.ios[STAT_DISCARD], stat.merges[STAT_DISCARD], (unsigned long long)stat.sectors[STAT_DISCARD], @@ -1559,7 +1562,11 @@ static int diskstats_show(struct seq_file *seqf, void *v) NSEC_PER_MSEC), inflight, jiffies_to_msecs(stat.io_ticks), - jiffies_to_msecs(stat.time_in_queue), + (unsigned int)div_u64(stat.nsecs[STAT_READ] + + stat.nsecs[STAT_WRITE] + + stat.nsecs[STAT_DISCARD] + + stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC), stat.ios[STAT_DISCARD], stat.merges[STAT_DISCARD], stat.sectors[STAT_DISCARD], -- cgit v1.2.3 From c92a41031a6d57395889b5c87cea359220a24d2a Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 25 Mar 2020 00:24:44 +0900 Subject: block: factor out requeue handling from dispatch code Factor out the requeue handling from the dispatch code, this will make subsequent addition of different requeueing schemes easier. 
Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 5b2e6550e0b6..745ec592a513 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1178,6 +1178,23 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ +static void blk_mq_handle_dev_resource(struct request *rq, + struct list_head *list) +{ + struct request *next = + list_first_entry_or_null(list, struct request, queuelist); + + /* + * If an I/O scheduler has been configured and we got a driver tag for + * the next request already, free it. + */ + if (next) + blk_mq_put_driver_tag(next); + + list_add(&rq->queuelist, list); + __blk_mq_requeue_request(rq); +} + /* * Returns true if we did some work AND can potentially do more. */ @@ -1245,17 +1262,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, ret = q->mq_ops->queue_rq(hctx, &bd); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { - /* - * If an I/O scheduler has been configured and we got a - * driver tag for the next request already, free it - * again. 
- */ - if (!list_empty(list)) { - nxt = list_first_entry(list, struct request, queuelist); - blk_mq_put_driver_tag(nxt); - } - list_add(&rq->queuelist, list); - __blk_mq_requeue_request(rq); + blk_mq_handle_dev_resource(rq, list); break; } -- cgit v1.2.3 From 31eb6186797c149665fd44f3847bdf8d539efa59 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:35 +0100 Subject: block: mark block_depr static Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 792356e922a1..7dafd7504493 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -27,7 +27,7 @@ #include "blk.h" static DEFINE_MUTEX(block_class_lock); -struct kobject *block_depr; +static struct kobject *block_depr; /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) -- cgit v1.2.3 From 6005771c17db56b6a9acc12bd084134191560e18 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:36 +0100 Subject: block: mark part_in_flight and part_in_flight_rw static Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 7dafd7504493..5f9df331822a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -139,7 +139,8 @@ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) part_stat_local_dec(&part_to_disk(part)->part0, in_flight[rw]); } -unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part) +static unsigned int part_in_flight(struct request_queue *q, + struct hd_struct *part) { int cpu; unsigned int inflight; @@ -159,8 +160,8 @@ unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part) return inflight; } -void part_in_flight_rw(struct request_queue *q, struct hd_struct 
*part, - unsigned int inflight[2]) +static void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]) { int cpu; -- cgit v1.2.3 From 572e7fc85b7ac3bfbf4d4ec7d2c6e7ad3507f57a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:37 +0100 Subject: block: unexport disk_get_part disk_get_part is not used by any modular code. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 5f9df331822a..0ee74b7e01f4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -217,7 +217,6 @@ struct hd_struct *disk_get_part(struct gendisk *disk, int partno) return part; } -EXPORT_SYMBOL_GPL(disk_get_part); /** * disk_part_iter_init - initialize partition iterator -- cgit v1.2.3 From a7818aedda7101f2270d5495aa4c4114b13510bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:38 +0100 Subject: block: unexport disk_map_sector_rcu disk_map_sector_rcu is not used by any modular code. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 0ee74b7e01f4..1e4855c8265a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -372,7 +372,6 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) } return &disk->part0; } -EXPORT_SYMBOL_GPL(disk_map_sector_rcu); /* * Can be deleted altogether. Later. -- cgit v1.2.3 From 1b4d4dbdaeb7087122a39d3fb9ae32487e001b6c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:39 +0100 Subject: block: unexport get_gendisk get_gendisk is not used by any modular code. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 1e4855c8265a..6323cc789efa 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -980,7 +980,6 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) } return disk; } -EXPORT_SYMBOL(get_gendisk); /** * bdget_disk - do bdget() by gendisk and partition number -- cgit v1.2.3 From 29125ed624eeb3ac2eb7bca313a8de29c1c84dcd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:40 +0100 Subject: block: move guard_bio_eod to bio.c This is bio layer functionality and not related to buffer heads. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index bc9152977bf0..11e6aac35092 100644 --- a/block/bio.c +++ b/block/bio.c @@ -588,6 +588,49 @@ void bio_truncate(struct bio *bio, unsigned new_size) bio->bi_iter.bi_size = new_size; } +/** + * guard_bio_eod - truncate a BIO to fit the block device + * @bio: bio to truncate + * + * This allows us to do IO even on the odd last sectors of a device, even if the + * block size is some multiple of the physical sector size. + * + * We'll just truncate the bio to the size of the device, and clear the end of + * the buffer head manually. Truly out-of-range accesses will turn into actual + * I/O errors, this only handles the "we need to be able to do I/O at the final + * sector" case. 
+ */ +void guard_bio_eod(struct bio *bio) +{ + sector_t maxsector; + struct hd_struct *part; + + rcu_read_lock(); + part = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (part) + maxsector = part_nr_sects_read(part); + else + maxsector = get_capacity(bio->bi_disk); + rcu_read_unlock(); + + if (!maxsector) + return; + + /* + * If the *whole* IO is past the end of the device, + * let it through, and the IO layer will turn it into + * an EIO. + */ + if (unlikely(bio->bi_iter.bi_sector >= maxsector)) + return; + + maxsector -= bio->bi_iter.bi_sector; + if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) + return; + + bio_truncate(bio, maxsector << 9); +} + /** * bio_put - release a reference to a bio * @bio: bio to release reference to -- cgit v1.2.3 From 581e26004a09c50e5017caadc850ea17e374a5ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:41 +0100 Subject: block: move block layer internals out of include/linux/genhd.h None of this needs to be exposed to drivers. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk.h | 116 +++++++++++++++++++++++++++++++++++++++++++++++ block/ioctl.c | 1 + block/partitions/check.h | 1 + block/partitions/core.c | 1 - 4 files changed, 118 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index 43df9dcb3d4e..ac20f972842e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -149,6 +149,9 @@ static inline bool integrity_req_gap_front_merge(struct request *req, return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], bip_next->bip_vec[0].bv_offset); } + +void blk_integrity_add(struct gendisk *); +void blk_integrity_del(struct gendisk *); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline bool integrity_req_gap_back_merge(struct request *req, struct bio *next) @@ -171,6 +174,12 @@ static inline bool bio_integrity_endio(struct bio *bio) static inline void bio_integrity_free(struct bio *bio) { } +static inline void blk_integrity_add(struct gendisk *disk) +{ +} +static inline void blk_integrity_del(struct gendisk *disk) +{ +} #endif /* CONFIG_BLK_DEV_INTEGRITY */ unsigned long blk_rq_timeout(unsigned long timeout); @@ -365,4 +374,111 @@ void blk_queue_free_zone_bitmaps(struct request_queue *q); static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} #endif +void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, + int rw); +void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, + int rw); +void update_io_ticks(struct hd_struct *part, unsigned long now, bool end); +struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); + +int blk_alloc_devt(struct hd_struct *part, dev_t *devt); +void blk_free_devt(dev_t devt); +void blk_invalidate_devt(dev_t devt); +char *disk_name(struct gendisk *hd, int partno, char *buf); +#define ADDPART_FLAG_NONE 0 +#define ADDPART_FLAG_RAID 1 +#define ADDPART_FLAG_WHOLEDISK 2 +struct hd_struct *__must_check add_partition(struct gendisk *disk, 
int partno, + sector_t start, sector_t len, int flags, + struct partition_meta_info *info); +void __delete_partition(struct percpu_ref *ref); +void delete_partition(struct gendisk *disk, int partno); +int disk_expand_part_tbl(struct gendisk *disk, int target); + +static inline int hd_ref_init(struct hd_struct *part) +{ + if (percpu_ref_init(&part->ref, __delete_partition, 0, + GFP_KERNEL)) + return -ENOMEM; + return 0; +} + +static inline void hd_struct_get(struct hd_struct *part) +{ + percpu_ref_get(&part->ref); +} + +static inline int hd_struct_try_get(struct hd_struct *part) +{ + return percpu_ref_tryget_live(&part->ref); +} + +static inline void hd_struct_put(struct hd_struct *part) +{ + percpu_ref_put(&part->ref); +} + +static inline void hd_struct_kill(struct hd_struct *part) +{ + percpu_ref_kill(&part->ref); +} + +static inline void hd_free_part(struct hd_struct *part) +{ + free_part_stats(part); + kfree(part->info); + percpu_ref_exit(&part->ref); +} + +/* + * Any access of part->nr_sects which is not protected by partition + * bd_mutex or gendisk bdev bd_mutex, should be done using this + * accessor function. + * + * Code written along the lines of i_size_read() and i_size_write(). + * CONFIG_PREEMPTION case optimizes the case of UP kernel with preemption + * on. 
+ */ +static inline sector_t part_nr_sects_read(struct hd_struct *part) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + sector_t nr_sects; + unsigned seq; + do { + seq = read_seqcount_begin(&part->nr_sects_seq); + nr_sects = part->nr_sects; + } while (read_seqcount_retry(&part->nr_sects_seq, seq)); + return nr_sects; +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + sector_t nr_sects; + + preempt_disable(); + nr_sects = part->nr_sects; + preempt_enable(); + return nr_sects; +#else + return part->nr_sects; +#endif +} + +/* + * Should be called with mutex lock held (typically bd_mutex) of partition + * to provide mutual exlusion among writers otherwise seqcount might be + * left in wrong state leaving the readers spinning infinitely. + */ +static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&part->nr_sects_seq); + part->nr_sects = size; + write_seqcount_end(&part->nr_sects_seq); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + part->nr_sects = size; + preempt_enable(); +#else + part->nr_sects = size; +#endif +} + #endif /* BLK_INTERNAL_H */ diff --git a/block/ioctl.c b/block/ioctl.c index 127194b9f9bd..6e827de1a4c4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -11,6 +11,7 @@ #include #include #include +#include "blk.h" static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition __user *upart, int op) diff --git a/block/partitions/check.h b/block/partitions/check.h index f845355489ec..c577e9ee67f0 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -2,6 +2,7 @@ #include #include #include +#include "../blk.h" /* * add_gd_partition adds a partitions details to the devices partition diff --git a/block/partitions/core.c b/block/partitions/core.c index b442bc209b86..b79c4513629b 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -10,7 +10,6 @@ #include #include #include -#include 
"../blk.h" #include "check.h" static int (*check_part[])(struct parsed_partitions *) = { -- cgit v1.2.3 From c6a564ffadc9105880329710164ee493f0de103c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:42 +0100 Subject: block: move the part_stat* helpers from genhd.h to a new header These macros are just used by a few files. Move them out of genhd.h, which is included everywhere into a new standalone header. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk.h | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index ac20f972842e..d9673164a145 100644 --- a/block/blk.h +++ b/block/blk.h @@ -4,6 +4,7 @@ #include #include +#include #include #include "blk-mq.h" #include "blk-mq-sched.h" -- cgit v1.2.3 From 348e114bbd4dce430eae70f01a04c8fc259b4cf1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2020 09:07:17 +0100 Subject: block: move the ->devnode callback to struct block_device_operations There really isn't any good reason to stash a method directly into struct gendisk. Move it together with the other block device operations. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 6323cc789efa..14cf395a1479 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1497,8 +1497,8 @@ static char *block_devnode(struct device *dev, umode_t *mode, { struct gendisk *disk = dev_to_disk(dev); - if (disk->devnode) - return disk->devnode(disk, mode); + if (disk->fops->devnode) + return disk->fops->devnode(disk, mode); return NULL; } -- cgit v1.2.3 From 2f227bb99934a4faa6dfe2cda2594bce8897a323 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2020 09:30:08 +0100 Subject: block: add a blk_mq_init_queue_data helper This allows a driver to pass a queuedata member before ->init_hctx is called. 
null_blk currently open codes this logic, but I'd rather have it in the core to ease future maintenance. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 745ec592a513..216bf62e88b6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2724,13 +2724,15 @@ void blk_mq_release(struct request_queue *q) blk_mq_sysfs_deinit(q); } -struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) +struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, + void *queuedata) { struct request_queue *uninit_q, *q; uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); if (!uninit_q) return ERR_PTR(-ENOMEM); + uninit_q->queuedata = queuedata; /* * Initialize the queue without an elevator. device_add_disk() will do @@ -2742,6 +2744,12 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) return q; } +EXPORT_SYMBOL_GPL(blk_mq_init_queue_data); + +struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) +{ + return blk_mq_init_queue_data(set, NULL); +} EXPORT_SYMBOL(blk_mq_init_queue); /* -- cgit v1.2.3 From 3d745ea5b095a3985129e162900b7e6c22518a9d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2020 09:30:11 +0100 Subject: block: simplify queue allocation Current make_request based drivers use either blk_alloc_queue_node or blk_alloc_queue to allocate a queue, and then set up the make_request_fn function pointer and a few parameters using the blk_queue_make_request helper. Simplify this by passing the make_request pointer to blk_alloc_queue, and while at it merge the _node variant into the main helper by always passing a node_id, and remove the superfluous gfp_mask parameter. A lower-level __blk_alloc_queue is kept for the blk-mq case.
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-core.c | 39 +++++++++++++++++++++++---------------- block/blk-mq.c | 8 ++------ block/blk-settings.c | 36 ------------------------------------ block/blk.h | 2 ++ 5 files changed, 28 insertions(+), 59 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a229b94d5390..c15a26096038 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1010,7 +1010,7 @@ unlock: * blkcg_init_queue - initialize blkcg part of request queue * @q: request_queue to initialize * - * Called from blk_alloc_queue_node(). Responsible for initializing blkcg + * Called from __blk_alloc_queue(). Responsible for initializing blkcg * part of new request_queue @q. * * RETURNS: diff --git a/block/blk-core.c b/block/blk-core.c index eaf6cb3887e6..18b8c09d093e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -388,12 +388,6 @@ void blk_cleanup_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_cleanup_queue); -struct request_queue *blk_alloc_queue(gfp_t gfp_mask) -{ - return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE); -} -EXPORT_SYMBOL(blk_alloc_queue); - /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer @@ -470,24 +464,19 @@ static void blk_timeout_work(struct work_struct *work) { } -/** - * blk_alloc_queue_node - allocate a request queue - * @gfp_mask: memory allocation flags - * @node_id: NUMA node to allocate memory from - */ -struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) +struct request_queue *__blk_alloc_queue(int node_id) { struct request_queue *q; int ret; q = kmem_cache_alloc_node(blk_requestq_cachep, - gfp_mask | __GFP_ZERO, node_id); + GFP_KERNEL | __GFP_ZERO, node_id); if (!q) return NULL; q->last_merge = NULL; - q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); + q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); if (q->id < 0) goto fail_q; @@ -495,7 +484,7 @@ 
struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (ret) goto fail_id; - q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id); + q->backing_dev_info = bdi_alloc_node(GFP_KERNEL, node_id); if (!q->backing_dev_info) goto fail_split; @@ -541,6 +530,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (blkcg_init_queue(q)) goto fail_ref; + blk_queue_dma_alignment(q, 511); + blk_set_default_limits(&q->limits); + return q; fail_ref: @@ -557,7 +549,22 @@ fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; } -EXPORT_SYMBOL(blk_alloc_queue_node); + +struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id) +{ + struct request_queue *q; + + if (WARN_ON_ONCE(!make_request)) + return -EINVAL; + + q = __blk_alloc_queue(node_id); + if (!q) + return NULL; + q->make_request_fn = make_request; + q->nr_requests = BLKDEV_MAX_RQ; + return q; +} +EXPORT_SYMBOL(blk_alloc_queue); bool blk_get_queue(struct request_queue *q) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 216bf62e88b6..f6291ceedee4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2729,7 +2729,7 @@ struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, { struct request_queue *uninit_q, *q; - uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); + uninit_q = __blk_alloc_queue(set->numa_node); if (!uninit_q) return ERR_PTR(-ENOMEM); uninit_q->queuedata = queuedata; @@ -2939,11 +2939,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); - blk_queue_make_request(q, blk_mq_make_request); - - /* - * Do this after blk_queue_make_request() overrides it... 
- */ + q->make_request_fn = blk_mq_make_request; q->nr_requests = set->queue_depth; /* diff --git a/block/blk-settings.c b/block/blk-settings.c index c8eda2e7b91e..126d216a2db6 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -86,42 +86,6 @@ void blk_set_stacking_limits(struct queue_limits *lim) } EXPORT_SYMBOL(blk_set_stacking_limits); -/** - * blk_queue_make_request - define an alternate make_request function for a device - * @q: the request queue for the device to be affected - * @mfn: the alternate make_request function - * - * Description: - * The normal way for &struct bios to be passed to a device - * driver is for them to be collected into requests on a request - * queue, and then to allow the device driver to select requests - * off that queue when it is ready. This works well for many block - * devices. However some block devices (typically virtual devices - * such as md or lvm) do not benefit from the processing on the - * request queue, and are served best by having the requests passed - * directly to them. This can be achieved by providing a function - * to blk_queue_make_request(). - * - * Caveat: - * The driver that does this *must* be able to deal appropriately - * with buffers in "highmemory". This can be accomplished by either calling - * kmap_atomic() to get a temporary kernel mapping, or by calling - * blk_queue_bounce() to create a buffer in normal memory. 
- **/ -void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) -{ - /* - * set defaults - */ - q->nr_requests = BLKDEV_MAX_RQ; - - q->make_request_fn = mfn; - blk_queue_dma_alignment(q, 511); - - blk_set_default_limits(&q->limits); -} -EXPORT_SYMBOL(blk_queue_make_request); - /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device diff --git a/block/blk.h b/block/blk.h index d9673164a145..491e52fc0aa6 100644 --- a/block/blk.h +++ b/block/blk.h @@ -482,4 +482,6 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) #endif } +struct request_queue *__blk_alloc_queue(int node_id); + #endif /* BLK_INTERNAL_H */ -- cgit v1.2.3 From f01b411f41f91fc3196eae4317cf8b4d872830a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2020 09:30:12 +0100 Subject: Revert "blkdev: check for valid request queue before issuing flush" This reverts commit f10d9f617a65905c556c3b37c9b9646ae7d04ed7. We can't have queues without a make_request_fn any more (and the loop device uses blk-mq these days anyway..). Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-flush.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 843d25683691..c7f396e3d5e2 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -454,15 +454,6 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, if (!q) return -ENXIO; - /* - * some block devices may not have their queue correctly set up here - * (e.g. loop device without a backing file) and so issuing a flush - * here will panic. Ensure there is a request function before issuing - * the flush. 
- */ - if (!q->make_request_fn) - return -ENXIO; - bio = bio_alloc(gfp_mask, 0); bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; -- cgit v1.2.3 From 130879f1ee0e25b0391b8c78b3baac6fe41f4d38 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2020 18:48:37 +0100 Subject: block: move bio_map_* to blk-map.c The bio_map_* helpers are just the low-level helpers for the blk_rq_map_* APIs. Move them together for better logical grouping, as no there isn't much overlap with other code in bio.c. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 510 +------------------------------------------------------- block/blk-map.c | 508 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ block/blk.h | 4 + 3 files changed, 513 insertions(+), 509 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 11e6aac35092..21cbaa6a1c20 100644 --- a/block/bio.c +++ b/block/bio.c @@ -780,7 +780,7 @@ static bool bio_try_merge_pc_page(struct request_queue *q, struct bio *bio, * * This should only be used by passthrough bios. 
*/ -static int __bio_add_pc_page(struct request_queue *q, struct bio *bio, +int __bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, bool *same_page) { @@ -1194,90 +1194,6 @@ void bio_list_copy_data(struct bio *dst, struct bio *src) } EXPORT_SYMBOL(bio_list_copy_data); -struct bio_map_data { - int is_our_pages; - struct iov_iter iter; - struct iovec iov[]; -}; - -static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, - gfp_t gfp_mask) -{ - struct bio_map_data *bmd; - if (data->nr_segs > UIO_MAXIOV) - return NULL; - - bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask); - if (!bmd) - return NULL; - memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); - bmd->iter = *data; - bmd->iter.iov = bmd->iov; - return bmd; -} - -/** - * bio_copy_from_iter - copy all pages from iov_iter to bio - * @bio: The &struct bio which describes the I/O as destination - * @iter: iov_iter as source - * - * Copy all pages from iov_iter to bio. - * Returns 0 on success, or error on failure. - */ -static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter) -{ - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - ssize_t ret; - - ret = copy_page_from_iter(bvec->bv_page, - bvec->bv_offset, - bvec->bv_len, - iter); - - if (!iov_iter_count(iter)) - break; - - if (ret < bvec->bv_len) - return -EFAULT; - } - - return 0; -} - -/** - * bio_copy_to_iter - copy all pages from bio to iov_iter - * @bio: The &struct bio which describes the I/O as source - * @iter: iov_iter as destination - * - * Copy all pages from bio to iov_iter. - * Returns 0 on success, or error on failure. 
- */ -static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) -{ - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - ssize_t ret; - - ret = copy_page_to_iter(bvec->bv_page, - bvec->bv_offset, - bvec->bv_len, - &iter); - - if (!iov_iter_count(&iter)) - break; - - if (ret < bvec->bv_len) - return -EFAULT; - } - - return 0; -} - void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; @@ -1288,430 +1204,6 @@ void bio_free_pages(struct bio *bio) } EXPORT_SYMBOL(bio_free_pages); -/** - * bio_uncopy_user - finish previously mapped bio - * @bio: bio being terminated - * - * Free pages allocated from bio_copy_user_iov() and write back data - * to user space in case of a read. - */ -int bio_uncopy_user(struct bio *bio) -{ - struct bio_map_data *bmd = bio->bi_private; - int ret = 0; - - if (!bio_flagged(bio, BIO_NULL_MAPPED)) { - /* - * if we're in a workqueue, the request is orphaned, so - * don't copy into a random user address space, just free - * and return -EINTR so user space doesn't expect any data. - */ - if (!current->mm) - ret = -EINTR; - else if (bio_data_dir(bio) == READ) - ret = bio_copy_to_iter(bio, bmd->iter); - if (bmd->is_our_pages) - bio_free_pages(bio); - } - kfree(bmd); - bio_put(bio); - return ret; -} - -/** - * bio_copy_user_iov - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @iter: iovec iterator - * @gfp_mask: memory allocation flags - * - * Prepares and returns a bio for indirect user io, bouncing data - * to/from kernel pages as necessary. Must be paired with - * call bio_uncopy_user() on io completion. 
- */ -struct bio *bio_copy_user_iov(struct request_queue *q, - struct rq_map_data *map_data, - struct iov_iter *iter, - gfp_t gfp_mask) -{ - struct bio_map_data *bmd; - struct page *page; - struct bio *bio; - int i = 0, ret; - int nr_pages; - unsigned int len = iter->count; - unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0; - - bmd = bio_alloc_map_data(iter, gfp_mask); - if (!bmd) - return ERR_PTR(-ENOMEM); - - /* - * We need to do a deep copy of the iov_iter including the iovecs. - * The caller provided iov might point to an on-stack or otherwise - * shortlived one. - */ - bmd->is_our_pages = map_data ? 0 : 1; - - nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); - if (nr_pages > BIO_MAX_PAGES) - nr_pages = BIO_MAX_PAGES; - - ret = -ENOMEM; - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - goto out_bmd; - - ret = 0; - - if (map_data) { - nr_pages = 1 << map_data->page_order; - i = map_data->offset / PAGE_SIZE; - } - while (len) { - unsigned int bytes = PAGE_SIZE; - - bytes -= offset; - - if (bytes > len) - bytes = len; - - if (map_data) { - if (i == map_data->nr_entries * nr_pages) { - ret = -ENOMEM; - break; - } - - page = map_data->pages[i / nr_pages]; - page += (i % nr_pages); - - i++; - } else { - page = alloc_page(q->bounce_gfp | gfp_mask); - if (!page) { - ret = -ENOMEM; - break; - } - } - - if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { - if (!map_data) - __free_page(page); - break; - } - - len -= bytes; - offset = 0; - } - - if (ret) - goto cleanup; - - if (map_data) - map_data->offset += bio->bi_iter.bi_size; - - /* - * success - */ - if ((iov_iter_rw(iter) == WRITE && (!map_data || !map_data->null_mapped)) || - (map_data && map_data->from_user)) { - ret = bio_copy_from_iter(bio, iter); - if (ret) - goto cleanup; - } else { - if (bmd->is_our_pages) - zero_fill_bio(bio); - iov_iter_advance(iter, bio->bi_iter.bi_size); - } - - bio->bi_private = bmd; - if (map_data && map_data->null_mapped) - bio_set_flag(bio, 
BIO_NULL_MAPPED); - return bio; -cleanup: - if (!map_data) - bio_free_pages(bio); - bio_put(bio); -out_bmd: - kfree(bmd); - return ERR_PTR(ret); -} - -/** - * bio_map_user_iov - map user iovec into bio - * @q: the struct request_queue for the bio - * @iter: iovec iterator - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_user_iov(struct request_queue *q, - struct iov_iter *iter, - gfp_t gfp_mask) -{ - int j; - struct bio *bio; - int ret; - - if (!iov_iter_count(iter)) - return ERR_PTR(-EINVAL); - - bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); - if (!bio) - return ERR_PTR(-ENOMEM); - - while (iov_iter_count(iter)) { - struct page **pages; - ssize_t bytes; - size_t offs, added = 0; - int npages; - - bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs); - if (unlikely(bytes <= 0)) { - ret = bytes ? bytes : -EFAULT; - goto out_unmap; - } - - npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); - - if (unlikely(offs & queue_dma_alignment(q))) { - ret = -EINVAL; - j = 0; - } else { - for (j = 0; j < npages; j++) { - struct page *page = pages[j]; - unsigned int n = PAGE_SIZE - offs; - bool same_page = false; - - if (n > bytes) - n = bytes; - - if (!__bio_add_pc_page(q, bio, page, n, offs, - &same_page)) { - if (same_page) - put_page(page); - break; - } - - added += n; - bytes -= n; - offs = 0; - } - iov_iter_advance(iter, added); - } - /* - * release the pages we didn't map into the bio, if any - */ - while (j < npages) - put_page(pages[j++]); - kvfree(pages); - /* couldn't stuff something into bio? */ - if (bytes) - break; - } - - bio_set_flag(bio, BIO_USER_MAPPED); - - /* - * subtle -- if bio_map_user_iov() ended up bouncing a bio, - * it would normally disappear when its bi_end_io is run. 
- * however, we need it for the unmap, so grab an extra - * reference to it - */ - bio_get(bio); - return bio; - - out_unmap: - bio_release_pages(bio, false); - bio_put(bio); - return ERR_PTR(ret); -} - -/** - * bio_unmap_user - unmap a bio - * @bio: the bio being unmapped - * - * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from - * process context. - * - * bio_unmap_user() may sleep. - */ -void bio_unmap_user(struct bio *bio) -{ - bio_release_pages(bio, bio_data_dir(bio) == READ); - bio_put(bio); - bio_put(bio); -} - -static void bio_invalidate_vmalloc_pages(struct bio *bio) -{ -#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE - if (bio->bi_private && !op_is_write(bio_op(bio))) { - unsigned long i, len = 0; - - for (i = 0; i < bio->bi_vcnt; i++) - len += bio->bi_io_vec[i].bv_len; - invalidate_kernel_vmap_range(bio->bi_private, len); - } -#endif -} - -static void bio_map_kern_endio(struct bio *bio) -{ - bio_invalidate_vmalloc_pages(bio); - bio_put(bio); -} - -/** - * bio_map_kern - map kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to map - * @len: length in bytes - * @gfp_mask: allocation flags for bio allocation - * - * Map the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. 
- */ -struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, - gfp_t gfp_mask) -{ - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int nr_pages = end - start; - bool is_vmalloc = is_vmalloc_addr(data); - struct page *page; - int offset, i; - struct bio *bio; - - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - if (is_vmalloc) { - flush_kernel_vmap_range(data, len); - bio->bi_private = data; - } - - offset = offset_in_page(kaddr); - for (i = 0; i < nr_pages; i++) { - unsigned int bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - if (!is_vmalloc) - page = virt_to_page(data); - else - page = vmalloc_to_page(data); - if (bio_add_pc_page(q, bio, page, bytes, - offset) < bytes) { - /* we don't support partial mappings */ - bio_put(bio); - return ERR_PTR(-EINVAL); - } - - data += bytes; - len -= bytes; - offset = 0; - } - - bio->bi_end_io = bio_map_kern_endio; - return bio; -} - -static void bio_copy_kern_endio(struct bio *bio) -{ - bio_free_pages(bio); - bio_put(bio); -} - -static void bio_copy_kern_endio_read(struct bio *bio) -{ - char *p = bio->bi_private; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - memcpy(p, page_address(bvec->bv_page), bvec->bv_len); - p += bvec->bv_len; - } - - bio_copy_kern_endio(bio); -} - -/** - * bio_copy_kern - copy kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to copy - * @len: length in bytes - * @gfp_mask: allocation flags for bio and page allocation - * @reading: data direction is READ - * - * copy the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. 
- */ -struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, - gfp_t gfp_mask, int reading) -{ - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - struct bio *bio; - void *p = data; - int nr_pages = 0; - - /* - * Overflow, abort - */ - if (end < start) - return ERR_PTR(-EINVAL); - - nr_pages = end - start; - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - while (len) { - struct page *page; - unsigned int bytes = PAGE_SIZE; - - if (bytes > len) - bytes = len; - - page = alloc_page(q->bounce_gfp | gfp_mask); - if (!page) - goto cleanup; - - if (!reading) - memcpy(page_address(page), p, bytes); - - if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) - break; - - len -= bytes; - p += bytes; - } - - if (reading) { - bio->bi_end_io = bio_copy_kern_endio_read; - bio->bi_private = data; - } else { - bio->bi_end_io = bio_copy_kern_endio; - } - - return bio; - -cleanup: - bio_free_pages(bio); - bio_put(bio); - return ERR_PTR(-ENOMEM); -} - /* * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions * for performing direct-IO in BIOs. 
diff --git a/block/blk-map.c b/block/blk-map.c index b0790268ed9d..b72c361911a4 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -11,6 +11,514 @@ #include "blk.h" +struct bio_map_data { + int is_our_pages; + struct iov_iter iter; + struct iovec iov[]; +}; + +static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, + gfp_t gfp_mask) +{ + struct bio_map_data *bmd; + + if (data->nr_segs > UIO_MAXIOV) + return NULL; + + bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask); + if (!bmd) + return NULL; + memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); + bmd->iter = *data; + bmd->iter.iov = bmd->iov; + return bmd; +} + +/** + * bio_copy_from_iter - copy all pages from iov_iter to bio + * @bio: The &struct bio which describes the I/O as destination + * @iter: iov_iter as source + * + * Copy all pages from iov_iter to bio. + * Returns 0 on success, or error on failure. + */ +static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter) +{ + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + ssize_t ret; + + ret = copy_page_from_iter(bvec->bv_page, + bvec->bv_offset, + bvec->bv_len, + iter); + + if (!iov_iter_count(iter)) + break; + + if (ret < bvec->bv_len) + return -EFAULT; + } + + return 0; +} + +/** + * bio_copy_to_iter - copy all pages from bio to iov_iter + * @bio: The &struct bio which describes the I/O as source + * @iter: iov_iter as destination + * + * Copy all pages from bio to iov_iter. + * Returns 0 on success, or error on failure. 
+ */ +static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) +{ + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + ssize_t ret; + + ret = copy_page_to_iter(bvec->bv_page, + bvec->bv_offset, + bvec->bv_len, + &iter); + + if (!iov_iter_count(&iter)) + break; + + if (ret < bvec->bv_len) + return -EFAULT; + } + + return 0; +} + +/** + * bio_uncopy_user - finish previously mapped bio + * @bio: bio being terminated + * + * Free pages allocated from bio_copy_user_iov() and write back data + * to user space in case of a read. + */ +static int bio_uncopy_user(struct bio *bio) +{ + struct bio_map_data *bmd = bio->bi_private; + int ret = 0; + + if (!bio_flagged(bio, BIO_NULL_MAPPED)) { + /* + * if we're in a workqueue, the request is orphaned, so + * don't copy into a random user address space, just free + * and return -EINTR so user space doesn't expect any data. + */ + if (!current->mm) + ret = -EINTR; + else if (bio_data_dir(bio) == READ) + ret = bio_copy_to_iter(bio, bmd->iter); + if (bmd->is_our_pages) + bio_free_pages(bio); + } + kfree(bmd); + bio_put(bio); + return ret; +} + +/** + * bio_copy_user_iov - copy user data to bio + * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) + * @iter: iovec iterator + * @gfp_mask: memory allocation flags + * + * Prepares and returns a bio for indirect user io, bouncing data + * to/from kernel pages as necessary. Must be paired with + * call bio_uncopy_user() on io completion. + */ +static struct bio *bio_copy_user_iov(struct request_queue *q, + struct rq_map_data *map_data, struct iov_iter *iter, + gfp_t gfp_mask) +{ + struct bio_map_data *bmd; + struct page *page; + struct bio *bio; + int i = 0, ret; + int nr_pages; + unsigned int len = iter->count; + unsigned int offset = map_data ? 
offset_in_page(map_data->offset) : 0; + + bmd = bio_alloc_map_data(iter, gfp_mask); + if (!bmd) + return ERR_PTR(-ENOMEM); + + /* + * We need to do a deep copy of the iov_iter including the iovecs. + * The caller provided iov might point to an on-stack or otherwise + * shortlived one. + */ + bmd->is_our_pages = map_data ? 0 : 1; + + nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); + if (nr_pages > BIO_MAX_PAGES) + nr_pages = BIO_MAX_PAGES; + + ret = -ENOMEM; + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + goto out_bmd; + + ret = 0; + + if (map_data) { + nr_pages = 1 << map_data->page_order; + i = map_data->offset / PAGE_SIZE; + } + while (len) { + unsigned int bytes = PAGE_SIZE; + + bytes -= offset; + + if (bytes > len) + bytes = len; + + if (map_data) { + if (i == map_data->nr_entries * nr_pages) { + ret = -ENOMEM; + break; + } + + page = map_data->pages[i / nr_pages]; + page += (i % nr_pages); + + i++; + } else { + page = alloc_page(q->bounce_gfp | gfp_mask); + if (!page) { + ret = -ENOMEM; + break; + } + } + + if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { + if (!map_data) + __free_page(page); + break; + } + + len -= bytes; + offset = 0; + } + + if (ret) + goto cleanup; + + if (map_data) + map_data->offset += bio->bi_iter.bi_size; + + /* + * success + */ + if ((iov_iter_rw(iter) == WRITE && + (!map_data || !map_data->null_mapped)) || + (map_data && map_data->from_user)) { + ret = bio_copy_from_iter(bio, iter); + if (ret) + goto cleanup; + } else { + if (bmd->is_our_pages) + zero_fill_bio(bio); + iov_iter_advance(iter, bio->bi_iter.bi_size); + } + + bio->bi_private = bmd; + if (map_data && map_data->null_mapped) + bio_set_flag(bio, BIO_NULL_MAPPED); + return bio; +cleanup: + if (!map_data) + bio_free_pages(bio); + bio_put(bio); +out_bmd: + kfree(bmd); + return ERR_PTR(ret); +} + +/** + * bio_map_user_iov - map user iovec into bio + * @q: the struct request_queue for the bio + * @iter: iovec iterator + * @gfp_mask: memory allocation flags + * 
+ * Map the user space address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +static struct bio *bio_map_user_iov(struct request_queue *q, + struct iov_iter *iter, gfp_t gfp_mask) +{ + int j; + struct bio *bio; + int ret; + + if (!iov_iter_count(iter)) + return ERR_PTR(-EINVAL); + + bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); + if (!bio) + return ERR_PTR(-ENOMEM); + + while (iov_iter_count(iter)) { + struct page **pages; + ssize_t bytes; + size_t offs, added = 0; + int npages; + + bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs); + if (unlikely(bytes <= 0)) { + ret = bytes ? bytes : -EFAULT; + goto out_unmap; + } + + npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); + + if (unlikely(offs & queue_dma_alignment(q))) { + ret = -EINVAL; + j = 0; + } else { + for (j = 0; j < npages; j++) { + struct page *page = pages[j]; + unsigned int n = PAGE_SIZE - offs; + bool same_page = false; + + if (n > bytes) + n = bytes; + + if (!__bio_add_pc_page(q, bio, page, n, offs, + &same_page)) { + if (same_page) + put_page(page); + break; + } + + added += n; + bytes -= n; + offs = 0; + } + iov_iter_advance(iter, added); + } + /* + * release the pages we didn't map into the bio, if any + */ + while (j < npages) + put_page(pages[j++]); + kvfree(pages); + /* couldn't stuff something into bio? */ + if (bytes) + break; + } + + bio_set_flag(bio, BIO_USER_MAPPED); + + /* + * subtle -- if bio_map_user_iov() ended up bouncing a bio, + * it would normally disappear when its bi_end_io is run. + * however, we need it for the unmap, so grab an extra + * reference to it + */ + bio_get(bio); + return bio; + + out_unmap: + bio_release_pages(bio, false); + bio_put(bio); + return ERR_PTR(ret); +} + +/** + * bio_unmap_user - unmap a bio + * @bio: the bio being unmapped + * + * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from + * process context. + * + * bio_unmap_user() may sleep. 
+ */ +static void bio_unmap_user(struct bio *bio) +{ + bio_release_pages(bio, bio_data_dir(bio) == READ); + bio_put(bio); + bio_put(bio); +} + +static void bio_invalidate_vmalloc_pages(struct bio *bio) +{ +#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE + if (bio->bi_private && !op_is_write(bio_op(bio))) { + unsigned long i, len = 0; + + for (i = 0; i < bio->bi_vcnt; i++) + len += bio->bi_io_vec[i].bv_len; + invalidate_kernel_vmap_range(bio->bi_private, len); + } +#endif +} + +static void bio_map_kern_endio(struct bio *bio) +{ + bio_invalidate_vmalloc_pages(bio); + bio_put(bio); +} + +/** + * bio_map_kern - map kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to map + * @len: length in bytes + * @gfp_mask: allocation flags for bio allocation + * + * Map the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +static struct bio *bio_map_kern(struct request_queue *q, void *data, + unsigned int len, gfp_t gfp_mask) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + const int nr_pages = end - start; + bool is_vmalloc = is_vmalloc_addr(data); + struct page *page; + int offset, i; + struct bio *bio; + + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + if (is_vmalloc) { + flush_kernel_vmap_range(data, len); + bio->bi_private = data; + } + + offset = offset_in_page(kaddr); + for (i = 0; i < nr_pages; i++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + if (!is_vmalloc) + page = virt_to_page(data); + else + page = vmalloc_to_page(data); + if (bio_add_pc_page(q, bio, page, bytes, + offset) < bytes) { + /* we don't support partial mappings */ + bio_put(bio); + return ERR_PTR(-EINVAL); + } + + data += bytes; + len -= bytes; + offset = 0; + } + + bio->bi_end_io = 
bio_map_kern_endio; + return bio; +} + +static void bio_copy_kern_endio(struct bio *bio) +{ + bio_free_pages(bio); + bio_put(bio); +} + +static void bio_copy_kern_endio_read(struct bio *bio) +{ + char *p = bio->bi_private; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + memcpy(p, page_address(bvec->bv_page), bvec->bv_len); + p += bvec->bv_len; + } + + bio_copy_kern_endio(bio); +} + +/** + * bio_copy_kern - copy kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to copy + * @len: length in bytes + * @gfp_mask: allocation flags for bio and page allocation + * @reading: data direction is READ + * + * copy the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +static struct bio *bio_copy_kern(struct request_queue *q, void *data, + unsigned int len, gfp_t gfp_mask, int reading) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + struct bio *bio; + void *p = data; + int nr_pages = 0; + + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + + nr_pages = end - start; + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + while (len) { + struct page *page; + unsigned int bytes = PAGE_SIZE; + + if (bytes > len) + bytes = len; + + page = alloc_page(q->bounce_gfp | gfp_mask); + if (!page) + goto cleanup; + + if (!reading) + memcpy(page_address(page), p, bytes); + + if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) + break; + + len -= bytes; + p += bytes; + } + + if (reading) { + bio->bi_end_io = bio_copy_kern_endio_read; + bio->bi_private = data; + } else { + bio->bi_end_io = bio_copy_kern_endio; + } + + return bio; + +cleanup: + bio_free_pages(bio); + bio_put(bio); + return ERR_PTR(-ENOMEM); +} + /* * Append a bio to a passthrough request. 
Only works if the bio can be merged * into the request based on the driver constraints. diff --git a/block/blk.h b/block/blk.h index 491e52fc0aa6..0a94ec68af32 100644 --- a/block/blk.h +++ b/block/blk.h @@ -484,4 +484,8 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) struct request_queue *__blk_alloc_queue(int node_id); +int __bio_add_pc_page(struct request_queue *q, struct bio *bio, + struct page *page, unsigned int len, unsigned int offset, + bool *same_page); + #endif /* BLK_INTERNAL_H */ -- cgit v1.2.3 From 654a3667df364f778b9b5bcdfb32e545aceb6a51 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 29 Mar 2020 10:08:26 -0600 Subject: block: return NULL in blk_alloc_queue() on error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes the following warning: block/blk-core.c: In function ‘blk_alloc_queue’: block/blk-core.c:558:10: warning: returning ‘int’ from a function with return type ‘struct request_queue *’ makes pointer from integer without a cast [-Wint-conversion] return -EINVAL; Fixes: 3d745ea5b095a ("block: simplify queue allocation") Reviewed-by: Christoph Hellwig Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 18b8c09d093e..7e4a1da0715e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -555,7 +555,7 @@ struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id) struct request_queue *q; if (WARN_ON_ONCE(!make_request)) - return -EINVAL; + return NULL; q = __blk_alloc_queue(node_id); if (!q) -- cgit v1.2.3