diff options
101 files changed, 1683 insertions, 844 deletions
diff --git a/block/Kconfig b/block/Kconfig index 86122e459fe0..f1364d1c0d93 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -5,6 +5,7 @@ menuconfig BLOCK bool "Enable the block layer" if EXPERT default y + select FS_IOMAP select SBITMAP help Provide block layer support for the kernel. @@ -183,6 +184,8 @@ config BLK_DEBUG_FS_ZONED config BLK_SED_OPAL bool "Logic for interfacing with Opal enabled SEDs" + depends on KEYS + select PSERIES_PLPKS if PPC_PSERIES help Builds Logic for interfacing with Opal enabled controllers. Enabling this option enables users to setup/unlock/lock diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4533eb491661..ec8ac8cf6e1b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -123,20 +123,38 @@ void bio_integrity_free(struct bio *bio) int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct bio_integrity_payload *bip = bio_integrity(bio); - if (bip->bip_vcnt >= bip->bip_max_vcnt) { - printk(KERN_ERR "%s: bip_vec full\n", __func__); + if (((bip->bip_iter.bi_size + len) >> SECTOR_SHIFT) > + queue_max_hw_sectors(q)) return 0; - } - if (bip->bip_vcnt && - bvec_gap_to_prev(&bdev_get_queue(bio->bi_bdev)->limits, - &bip->bip_vec[bip->bip_vcnt - 1], offset)) - return 0; + if (bip->bip_vcnt > 0) { + struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1]; + bool same_page = false; + + if (bvec_try_merge_hw_page(q, bv, page, len, offset, + &same_page)) { + bip->bip_iter.bi_size += len; + return len; + } + + if (bip->bip_vcnt >= + min(bip->bip_max_vcnt, queue_max_integrity_segments(q))) + return 0; + + /* + * If the queue doesn't support SG gaps and adding this segment + * would create a gap, disallow it. + */ + if (bvec_gap_to_prev(&q->limits, bv, offset)) + return 0; + } bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset); bip->bip_vcnt++; + bip->bip_iter.bi_size += len; return len; } @@ -199,8 +217,6 @@ bool bio_integrity_prep(struct bio *bio) unsigned long start, end; unsigned int len, nr_pages; unsigned int bytes, offset, i; - unsigned int intervals; - blk_status_t status; if (!bi) return true; @@ -224,12 +240,10 @@ bool bio_integrity_prep(struct bio *bio) !(bi->flags & BLK_INTEGRITY_GENERATE)) return true; } - intervals = bio_integrity_intervals(bi, bio_sectors(bio)); /* Allocate kernel buffer for protection data */ - len = intervals * bi->tuple_size; + len = bio_integrity_bytes(bi, bio_sectors(bio)); buf = kmalloc(len, GFP_NOIO); - status = BLK_STS_RESOURCE; if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); goto err_end_io; @@ -244,12 +258,10 @@ bool bio_integrity_prep(struct bio *bio) if (IS_ERR(bip)) { printk(KERN_ERR "could not allocate data integrity bioset\n"); kfree(buf); - status = BLK_STS_RESOURCE; goto err_end_io; } bip->bip_flags |= BIP_BLOCK_INTEGRITY; - bip->bip_iter.bi_size = len; bip_set_seed(bip, bio->bi_iter.bi_sector); if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM) @@ -257,28 +269,18 @@ bool bio_integrity_prep(struct bio *bio) /* Map it */ offset = offset_in_page(buf); - for (i = 0 ; i < nr_pages ; i++) { - int ret; + for (i = 0; i < nr_pages && len > 0; i++) { bytes = PAGE_SIZE - offset; - if (len <= 0) - break; - if (bytes > len) bytes = len; - ret = bio_integrity_add_page(bio, virt_to_page(buf), - bytes, offset); - - if (ret == 0) { + if (bio_integrity_add_page(bio, virt_to_page(buf), + bytes, offset) < bytes) { printk(KERN_ERR "could not attach integrity payload\n"); - status = BLK_STS_RESOURCE; goto err_end_io; } - if (ret < bytes) - break; - buf += bytes; len -= bytes; offset = 0; @@ -294,10 +296,9 @@ bool bio_integrity_prep(struct bio *bio) return true; err_end_io: - bio->bi_status = status; + bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return false; - } EXPORT_SYMBOL(bio_integrity_prep); diff --git a/block/bio.c b/block/bio.c index 8672179213b9..816d412c06e9 100644 --- a/block/bio.c +++ b/block/bio.c @@ -606,15 +606,15 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) } EXPORT_SYMBOL(bio_kmalloc); -void zero_fill_bio(struct bio *bio) +void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { struct bio_vec bv; struct bvec_iter iter; - bio_for_each_segment(bv, bio, iter) + __bio_for_each_segment(bv, bio, iter, start) memzero_bvec(&bv); } -EXPORT_SYMBOL(zero_fill_bio); +EXPORT_SYMBOL(zero_fill_bio_iter); /** * bio_truncate - truncate the bio to small size of @new_size @@ -903,9 +903,8 @@ static inline bool bio_full(struct bio *bio, unsigned len) return false; } -static inline bool page_is_mergeable(const struct bio_vec *bv, - struct page *page, unsigned int len, unsigned int off, - bool *same_page) +static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, + unsigned int len, unsigned int off, bool *same_page) { size_t bv_end = bv->bv_offset + bv->bv_len; phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; @@ -919,49 +918,15 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, return false; *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); - if (*same_page) - return true; - else if (IS_ENABLED(CONFIG_KMSAN)) - return false; - return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); -} - -/** - * __bio_try_merge_page - try appending data to an existing bvec. - * @bio: destination bio - * @page: start page to add - * @len: length of the data to add - * @off: offset of the data relative to @page - * @same_page: return if the segment has been merged inside the same page - * - * Try to add the data at @page + @off to the last bvec of @bio. This is a - * useful optimisation for file systems with a block size smaller than the - * page size. - * - * Warn if (@len, @off) crosses pages in case that @same_page is true. - * - * Return %true on success or %false on failure. - */ -static bool __bio_try_merge_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int off, bool *same_page) -{ - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) - return false; - - if (bio->bi_vcnt > 0) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; - - if (page_is_mergeable(bv, page, len, off, same_page)) { - if (bio->bi_iter.bi_size > UINT_MAX - len) { - *same_page = false; - return false; - } - bv->bv_len += len; - bio->bi_iter.bi_size += len; - return true; - } + if (!*same_page) { + if (IS_ENABLED(CONFIG_KMSAN)) + return false; + if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE) + return false; } - return false; + + bv->bv_len += len; + return true; } /* @@ -969,11 +934,10 @@ static bool __bio_try_merge_page(struct bio *bio, struct page *page, * size limit. This is not for normal read/write bios, but for passthrough * or Zone Append operations that we can't split. */ -static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio, - struct page *page, unsigned len, - unsigned offset, bool *same_page) +bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, + struct page *page, unsigned len, unsigned offset, + bool *same_page) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset; phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; @@ -982,7 +946,7 @@ static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio, return false; if (bv->bv_len + len > queue_max_segment_size(q)) return false; - return __bio_try_merge_page(bio, page, len, offset, same_page); + return bvec_try_merge_page(bv, page, len, offset, same_page); } /** @@ -1002,33 +966,33 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page) { - struct bio_vec *bvec; - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; - if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) + if (((bio->bi_iter.bi_size + len) >> SECTOR_SHIFT) > max_sectors) return 0; if (bio->bi_vcnt > 0) { - if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page)) + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (bvec_try_merge_hw_page(q, bv, page, len, offset, + same_page)) { + bio->bi_iter.bi_size += len; return len; + } + + if (bio->bi_vcnt >= + min(bio->bi_max_vecs, queue_max_segments(q))) + return 0; /* * If the queue doesn't support SG gaps and adding this segment * would create a gap, disallow it. */ - bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if (bvec_gap_to_prev(&q->limits, bvec, offset)) + if (bvec_gap_to_prev(&q->limits, bv, offset)) return 0; } - if (bio_full(bio, len)) - return 0; - - if (bio->bi_vcnt >= queue_max_segments(q)) - return 0; - bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset); bio->bi_vcnt++; bio->bi_iter.bi_size += len; @@ -1129,11 +1093,21 @@ int bio_add_page(struct bio *bio, struct page *page, { bool same_page = false; - if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { - if (bio_full(bio, len)) - return 0; - __bio_add_page(bio, page, len, offset); + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return 0; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return 0; + + if (bio->bi_vcnt > 0 && + bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], + page, len, offset, &same_page)) { + bio->bi_iter.bi_size += len; + return len; } + + if (bio->bi_vcnt >= bio->bi_max_vecs) + return 0; + __bio_add_page(bio, page, len, offset); return len; } EXPORT_SYMBOL(bio_add_page); @@ -1207,13 +1181,18 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, { bool same_page = false; - if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { - __bio_add_page(bio, page, len, offset); + if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len)) + return -EIO; + + if (bio->bi_vcnt > 0 && + bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], + page, len, offset, &same_page)) { + bio->bi_iter.bi_size += len; + if (same_page) + bio_release_page(bio, page); return 0; } - - if (same_page) - bio_release_page(bio, page); + __bio_add_page(bio, page, len, offset); return 0; } @@ -1252,7 +1231,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) struct page **pages = (struct page **)bv; ssize_t size, left; unsigned len, i = 0; - size_t offset, trim; + size_t offset; int ret = 0; /* @@ -1281,10 +1260,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); - trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); - iov_iter_revert(iter, trim); + if (bio->bi_bdev) { + size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); + iov_iter_revert(iter, trim); + size -= trim; + } - size -= trim; if (unlikely(!size)) { ret = -EFAULT; goto out; @@ -1337,6 +1318,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { int ret = 0; + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return -EIO; + if (iov_iter_is_bvec(iter)) { bio_iov_bvec_set(bio, iter); iov_iter_advance(iter, bio->bi_iter.bi_size); @@ -1490,6 +1474,7 @@ void bio_set_pages_dirty(struct bio *bio) set_page_dirty_lock(bvec->bv_page); } } +EXPORT_SYMBOL_GPL(bio_set_pages_dirty); /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. @@ -1549,6 +1534,7 @@ defer: spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } +EXPORT_SYMBOL_GPL(bio_check_pages_dirty); static inline bool bio_remaining_done(struct bio *bio) { diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 9faafcd10e17..4a42ea2972ad 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1511,7 +1511,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) retry: spin_lock_irq(&q->queue_lock); - /* blkg_list is pushed at the head, reverse walk to allocate parents first */ + /* blkg_list is pushed at the head, reverse walk to initialize parents first */ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; @@ -1549,21 +1549,20 @@ retry: goto enomem; } - blkg->pd[pol->plid] = pd; + spin_lock(&blkg->blkcg->lock); + pd->blkg = blkg; pd->plid = pol->plid; - pd->online = false; - } + blkg->pd[pol->plid] = pd; - /* all allocated, init in the same order */ - if (pol->pd_init_fn) - list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) - pol->pd_init_fn(blkg->pd[pol->plid]); + if (pol->pd_init_fn) + pol->pd_init_fn(pd); - list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { if (pol->pd_online_fn) - pol->pd_online_fn(blkg->pd[pol->plid]); - blkg->pd[pol->plid]->online = true; + pol->pd_online_fn(pd); + pd->online = true; + + spin_unlock(&blkg->blkcg->lock); } __set_bit(pol->plid, q->blkcg_pols); @@ -1580,14 +1579,19 @@ out: return ret; enomem: - /* alloc failed, nothing's initialized yet, free everything */ + /* alloc failed, take down everything */ spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; + struct blkg_policy_data *pd; spin_lock(&blkcg->lock); - if (blkg->pd[pol->plid]) { - pol->pd_free_fn(blkg->pd[pol->plid]); + pd = blkg->pd[pol->plid]; + if (pd) { + if (pd->online && pol->pd_offline_fn) + pol->pd_offline_fn(pd); + pd->online = false; + pol->pd_free_fn(pd); blkg->pd[pol->plid] = NULL; } spin_unlock(&blkcg->lock); diff --git a/block/blk-core.c b/block/blk-core.c index 9866468c72a2..9d51e9894ece 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -208,6 +208,7 @@ const char *blk_status_to_str(blk_status_t status) return "<null>"; return blk_errors[idx].name; } +EXPORT_SYMBOL_GPL(blk_status_to_str); /** * blk_sync_queue - cancel any pending callbacks on a queue diff --git a/block/blk-flush.c b/block/blk-flush.c index 8220517c2d67..e73dc22d05c1 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -183,13 +183,13 @@ static void blk_flush_complete_seq(struct request *rq, /* queue for flush */ if (list_empty(pending)) fq->flush_pending_since = jiffies; - list_move_tail(&rq->flush.list, pending); + list_move_tail(&rq->queuelist, pending); break; case REQ_FSEQ_DATA: - list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); + fq->flush_data_in_flight++; spin_lock(&q->requeue_lock); - list_add(&rq->queuelist, &q->requeue_list); + list_move(&rq->queuelist, &q->requeue_list); spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); break; @@ -201,7 +201,7 @@ static void blk_flush_complete_seq(struct request *rq, * flush data request completion path. Restore @rq for * normal completion and end it. */ - list_del_init(&rq->flush.list); + list_del_init(&rq->queuelist); blk_flush_restore_request(rq); blk_mq_end_request(rq, error); break; @@ -257,7 +257,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq, fq->flush_running_idx ^= 1; /* and push the waiting requests to the next stage */ - list_for_each_entry_safe(rq, n, running, flush.list) { + list_for_each_entry_safe(rq, n, running, queuelist) { unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); @@ -291,7 +291,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, { struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; struct request *first_rq = - list_first_entry(pending, struct request, flush.list); + list_first_entry(pending, struct request, queuelist); struct request *flush_rq = fq->flush_rq; /* C1 described at the top of this file */ @@ -299,7 +299,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, return; /* C2 and C3 */ - if (!list_empty(&fq->flush_data_in_flight) && + if (fq->flush_data_in_flight && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return; @@ -374,6 +374,12 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, * the comment in flush_end_io(). */ spin_lock_irqsave(&fq->mq_flush_lock, flags); + fq->flush_data_in_flight--; + /* + * May have been corrupted by rq->rq_next reuse, we need to + * re-initialize rq->queuelist before reusing it here. + */ + INIT_LIST_HEAD(&rq->queuelist); blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); @@ -384,7 +390,6 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, static void blk_rq_init_flush(struct request *rq) { rq->flush.seq = 0; - INIT_LIST_HEAD(&rq->flush.list); rq->rq_flags |= RQF_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ rq->end_io = mq_flush_data_end_io; @@ -443,9 +448,9 @@ bool blk_insert_flush(struct request *rq) * the post flush, and then just pass the command on. */ blk_rq_init_flush(rq); - rq->flush.seq |= REQ_FSEQ_POSTFLUSH; + rq->flush.seq |= REQ_FSEQ_PREFLUSH; spin_lock_irq(&fq->mq_flush_lock); - list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); + fq->flush_data_in_flight++; spin_unlock_irq(&fq->mq_flush_lock); return false; default: @@ -496,7 +501,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, INIT_LIST_HEAD(&fq->flush_queue[0]); INIT_LIST_HEAD(&fq->flush_queue[1]); - INIT_LIST_HEAD(&fq->flush_data_in_flight); return fq; diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index fd5fec989e39..c1a6aba1d59e 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -824,29 +824,6 @@ static void iolatency_clear_scaling(struct blkcg_gq *blkg) } } -static int blk_iolatency_try_init(struct blkg_conf_ctx *ctx) -{ - static DEFINE_MUTEX(init_mutex); - int ret; - - ret = blkg_conf_open_bdev(ctx); - if (ret) - return ret; - - /* - * blk_iolatency_init() may fail after rq_qos_add() succeeds which can - * confuse iolat_rq_qos() test. Make the test and init atomic. - */ - mutex_lock(&init_mutex); - - if (!iolat_rq_qos(ctx->bdev->bd_queue)) - ret = blk_iolatency_init(ctx->bdev->bd_disk); - - mutex_unlock(&init_mutex); - - return ret; -} - static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -861,7 +838,17 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, blkg_conf_init(&ctx, buf); - ret = blk_iolatency_try_init(&ctx); + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto out; + + /* + * blk_iolatency_init() may fail after rq_qos_add() succeeds which can + * confuse iolat_rq_qos() test. Make the test and init atomic. + */ + lockdep_assert_held(&ctx.bdev->bd_queue->rq_qos_mutex); + if (!iolat_rq_qos(ctx.bdev->bd_queue)) + ret = blk_iolatency_init(ctx.bdev->bd_disk); if (ret) goto out; diff --git a/block/blk-mq.c b/block/blk-mq.c index 953f08354c8c..ec922c6bccbe 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -43,6 +43,7 @@ #include "blk-ioprio.h" static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); +static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); static void blk_mq_request_bypass_insert(struct request *rq, @@ -1174,15 +1175,11 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) static void blk_mq_complete_send_ipi(struct request *rq) { - struct llist_head *list; unsigned int cpu; cpu = rq->mq_ctx->cpu; - list = &per_cpu(blk_cpu_done, cpu); - if (llist_add(&rq->ipi_list, list)) { - INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq); - smp_call_function_single_async(cpu, &rq->csd); - } + if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu))) + smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu)); } static void blk_mq_raise_softirq(struct request *rq) @@ -1343,7 +1340,7 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head) } blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); @@ -2242,6 +2239,8 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) */ WARN_ON_ONCE(!async && in_interrupt()); + might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); + /* * When queue is quiesced, we may be switching io scheduler, or * updating nr_hw_queues, or other things, and we can't run queue @@ -2257,8 +2256,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) if (!need_run) return; - if (async || (hctx->flags & BLK_MQ_F_BLOCKING) || - !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { + if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { blk_mq_delay_run_hw_queue(hctx, 0); return; } @@ -2393,7 +2391,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL(blk_mq_start_hw_queue); @@ -2423,7 +2421,8 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) unsigned long i; queue_for_each_hw_ctx(q, hctx, i) - blk_mq_start_stopped_hw_queue(hctx, async); + blk_mq_start_stopped_hw_queue(hctx, async || + (hctx->flags & BLK_MQ_F_BLOCKING)); } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); @@ -2481,6 +2480,8 @@ static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); trace_block_rq_insert(rq); + if (rq->cmd_flags & REQ_NOWAIT) + run_queue_async = true; } spin_lock(&ctx->lock); @@ -2641,7 +2642,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) { blk_mq_insert_request(rq, 0); - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT); return; } @@ -4402,9 +4403,13 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, int new_nr_hw_queues) { struct blk_mq_tags **new_tags; + int i; - if (set->nr_hw_queues >= new_nr_hw_queues) + if (set->nr_hw_queues >= new_nr_hw_queues) { + for (i = new_nr_hw_queues; i < set->nr_hw_queues; i++) + __blk_mq_free_map_and_rqs(set, i); goto done; + } new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); @@ -4416,6 +4421,16 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, sizeof(*set->tags)); kfree(set->tags); set->tags = new_tags; + + for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) { + if (!__blk_mq_alloc_map_and_rqs(set, i)) { + while (--i >= set->nr_hw_queues) + __blk_mq_free_map_and_rqs(set, i); + return -ENOMEM; + } + cond_resched(); + } + done: set->nr_hw_queues = new_nr_hw_queues; return 0; @@ -4749,7 +4764,6 @@ fallback: __blk_mq_free_map_and_rqs(set, i); set->nr_hw_queues = prev_nr_hw_queues; - blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); goto fallback; } blk_mq_map_swqueue(q); @@ -4853,6 +4867,9 @@ static int __init blk_mq_init(void) for_each_possible_cpu(i) init_llist_head(&per_cpu(blk_cpu_done, i)); + for_each_possible_cpu(i) + INIT_CSD(&per_cpu(blk_cpu_csd, i), + __blk_mq_complete_request_remote, NULL); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, diff --git a/block/blk-settings.c b/block/blk-settings.c index 4dd59059b788..0046b447268f 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -830,10 +830,13 @@ EXPORT_SYMBOL(blk_set_queue_depth); */ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) { - if (wc) + if (wc) { + blk_queue_flag_set(QUEUE_FLAG_HW_WC, q); blk_queue_flag_set(QUEUE_FLAG_WC, q); - else + } else { + blk_queue_flag_clear(QUEUE_FLAG_HW_WC, q); blk_queue_flag_clear(QUEUE_FLAG_WC, q); + } if (fua) blk_queue_flag_set(QUEUE_FLAG_FUA, q); else diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index afc797fb0dfc..63e481262336 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -449,21 +449,16 @@ static ssize_t queue_wc_show(struct request_queue *q, char *page) static ssize_t queue_wc_store(struct request_queue *q, const char *page, size_t count) { - int set = -1; - - if (!strncmp(page, "write back", 10)) - set = 1; - else if (!strncmp(page, "write through", 13) || - !strncmp(page, "none", 4)) - set = 0; - - if (set == -1) - return -EINVAL; - - if (set) + if (!strncmp(page, "write back", 10)) { + if (!test_bit(QUEUE_FLAG_HW_WC, &q->queue_flags)) + return -EINVAL; blk_queue_flag_set(QUEUE_FLAG_WC, q); - else + } else if (!strncmp(page, "write through", 13) || + !strncmp(page, "none", 4)) { blk_queue_flag_clear(QUEUE_FLAG_WC, q); + } else { + return -EINVAL; + } return count; } diff --git a/block/blk.h b/block/blk.h index 608c5dcc516b..08a358bc0919 100644 --- a/block/blk.h +++ b/block/blk.h @@ -15,15 +15,14 @@ struct elevator_type; extern struct dentry *blk_debugfs_root; struct blk_flush_queue { + spinlock_t mq_flush_lock; unsigned int flush_pending_idx:1; unsigned int flush_running_idx:1; blk_status_t rq_status; unsigned long flush_pending_since; struct list_head flush_queue[2]; - struct list_head flush_data_in_flight; + unsigned long flush_data_in_flight; struct request *flush_rq; - - spinlock_t mq_flush_lock; }; bool is_flush_rq(struct request *req); @@ -76,6 +75,10 @@ struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask); void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); +bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, + struct page *page, unsigned len, unsigned offset, + bool *same_page); + static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { @@ -251,7 +254,6 @@ static inline void bio_integrity_free(struct bio *bio) unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); -const char *blk_status_to_str(blk_status_t status); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); diff --git a/block/fops.c b/block/fops.c index 838ffada5341..a24a624d3bf7 100644 --- a/block/fops.c +++ b/block/fops.c @@ -15,6 +15,7 @@ #include <linux/falloc.h> #include <linux/suspend.h> #include <linux/fs.h> +#include <linux/iomap.h> #include <linux/module.h> #include "blk.h" @@ -23,15 +24,6 @@ static inline struct inode *bdev_file_inode(struct file *file) return file->f_mapping->host; } -static int blkdev_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - bh->b_bdev = I_BDEV(inode); - bh->b_blocknr = iblock; - set_buffer_mapped(bh); - return 0; -} - static blk_opf_t dio_bio_write_op(struct kiocb *iocb) { blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; @@ -387,6 +379,37 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); } +static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, struct iomap *srcmap) +{ + struct block_device *bdev = I_BDEV(inode); + loff_t isize = i_size_read(inode); + + iomap->bdev = bdev; + iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); + if (iomap->offset >= isize) + return -EIO; + iomap->type = IOMAP_MAPPED; + iomap->addr = iomap->offset; + iomap->length = isize - iomap->offset; + iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */ + return 0; +} + +static const struct iomap_ops blkdev_iomap_ops = { + .iomap_begin = blkdev_iomap_begin, +}; + +#ifdef CONFIG_BUFFER_HEAD +static int blkdev_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + bh->b_bdev = I_BDEV(inode); + bh->b_blocknr = iblock; + set_buffer_mapped(bh); + return 0; +} + static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, blkdev_get_block, wbc); @@ -429,10 +452,58 @@ const struct address_space_operations def_blk_aops = { .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, - .direct_IO = blkdev_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; +#else /* CONFIG_BUFFER_HEAD */ +static int blkdev_read_folio(struct file *file, struct folio *folio) +{ + return iomap_read_folio(folio, &blkdev_iomap_ops); +} + +static void blkdev_readahead(struct readahead_control *rac) +{ + iomap_readahead(rac, &blkdev_iomap_ops); +} + +static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc, + struct inode *inode, loff_t offset) +{ + loff_t isize = i_size_read(inode); + + if (WARN_ON_ONCE(offset >= isize)) + return -EIO; + if (offset >= wpc->iomap.offset && + offset < wpc->iomap.offset + wpc->iomap.length) + return 0; + return blkdev_iomap_begin(inode, offset, isize - offset, + IOMAP_WRITE, &wpc->iomap, NULL); +} + +static const struct iomap_writeback_ops blkdev_writeback_ops = { + .map_blocks = blkdev_map_blocks, +}; + +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct iomap_writepage_ctx wpc = { }; + + return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops); +} + +const struct address_space_operations def_blk_aops = { + .dirty_folio = filemap_dirty_folio, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, + .read_folio = blkdev_read_folio, + .readahead = blkdev_readahead, + .writepages = blkdev_writepages, + .is_partially_uptodate = iomap_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, + .migrate_folio = filemap_migrate_folio, +}; +#endif /* CONFIG_BUFFER_HEAD */ /* * for a block special file file_inode(file)->i_size is zero @@ -506,7 +577,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) * during an unstable branch. */ filp->f_flags |= O_LARGEFILE; - filp->f_mode |= FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; /* * Use the file private data to store the holder for exclusive openes. @@ -534,6 +605,35 @@ static int blkdev_release(struct inode *inode, struct file *filp) return 0; } +static ssize_t +blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + size_t count = iov_iter_count(from); + ssize_t written; + + written = kiocb_invalidate_pages(iocb, count); + if (written) { + if (written == -EBUSY) + return 0; + return written; + } + + written = blkdev_direct_IO(iocb, from); + if (written > 0) { + kiocb_invalidate_post_direct_write(iocb, count); + iocb->ki_pos += written; + count -= written; + } + if (written != -EIOCBQUEUED) + iov_iter_revert(from, count - iov_iter_count(from)); + return written; +} + +static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) +{ + return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops); +} + /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. @@ -543,7 +643,8 @@ static int blkdev_release(struct inode *inode, struct file *filp) */ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); + struct file *file = iocb->ki_filp; + struct block_device *bdev = I_BDEV(file->f_mapping->host); struct inode *bd_inode = bdev->bd_inode; loff_t size = bdev_nr_bytes(bdev); size_t shorted = 0; @@ -570,7 +671,23 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) iov_iter_truncate(from, size); } - ret = __generic_file_write_iter(iocb, from); + ret = file_remove_privs(file); + if (ret) + return ret; + + ret = file_update_time(file); + if (ret) + return ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = blkdev_direct_write(iocb, from); + if (ret >= 0 && iov_iter_count(from)) + ret = direct_write_fallback(iocb, from, ret, + blkdev_buffered_write(iocb, from)); + } else { + ret = blkdev_buffered_write(iocb, from); + } + if (ret > 0) ret = generic_write_sync(iocb, ret); iov_iter_reexpand(from, iov_iter_count(from) + shorted); diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 02a916ba62ee..f958e79277b8 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -646,8 +646,9 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; + unsigned int shift = tags->bitmap_tags.sb.shift; - dd->async_depth = max(1UL, 3 * q->nr_requests / 4); + dd->async_depth = max(1U, 3 * (1U << shift) / 4); sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth); } diff --git a/block/opal_proto.h b/block/opal_proto.h index a4e56845dd82..dec7ce3a3edb 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -225,6 +225,10 @@ enum opal_parameter { OPAL_SUM_SET_LIST = 0x060000, }; +enum opal_revertlsp { + OPAL_KEEP_GLOBAL_RANGE_KEY = 0x060000, +}; + /* Packets derived from: * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 * Secion: 3.2.3 ComPackets, Packets & Subpackets diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 1af610f0ba8c..c03bc105e575 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -81,8 +81,7 @@ static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) length = min_t(int, next - partdef, sizeof(new_subpart->name) - 1); - strncpy(new_subpart->name, partdef, length); - new_subpart->name[length] = '\0'; + strscpy(new_subpart->name, partdef, length); partdef = ++next; } else @@ -140,8 +139,7 @@ static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) } length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); - strncpy(newparts->name, bdevdef, length); - newparts->name[length] = '\0'; + strscpy(newparts->name, bdevdef, length); newparts->nr_subparts = 0; next_subpart = &newparts->subpart; @@ -153,8 +151,7 @@ static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) length = (!next) ? (sizeof(buf) - 1) : min_t(int, next - bdevdef, sizeof(buf) - 1); - strncpy(buf, bdevdef, length); - buf[length] = '\0'; + strscpy(buf, bdevdef, length); ret = parse_subpart(next_subpart, buf); if (ret) @@ -267,8 +264,7 @@ static int add_part(int slot, struct cmdline_subpart *subpart, label_min = min_t(int, sizeof(info->volname) - 1, sizeof(subpart->name)); - strncpy(info->volname, subpart->name, label_min); - info->volname[label_min] = '\0'; + strscpy(info->volname, subpart->name, label_min); snprintf(tmp, sizeof(tmp), "(%s)", info->volname); strlcat(state->pp_buf, tmp, PAGE_SIZE); diff --git a/block/sed-opal.c b/block/sed-opal.c index c18339446ef3..6d7f25d1711b 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -20,6 +20,9 @@ #include <linux/sed-opal.h> #include <linux/string.h> #include <linux/kdev_t.h> +#include <linux/key.h> +#include <linux/key-type.h> +#include <keys/user-type.h> #include "opal_proto.h" @@ -29,6 +32,8 @@ /* Number of bytes needed by cmd_finalize. */ #define CMD_FINALIZE_BYTES_NEEDED 7 +static struct key *sed_opal_keyring; + struct opal_step { int (*fn)(struct opal_dev *dev, void *data); void *data; @@ -269,6 +274,101 @@ static void print_buffer(const u8 *ptr, u32 length) #endif } +/* + * Allocate/update a SED Opal key and add it to the SED Opal keyring. + */ +static int update_sed_opal_key(const char *desc, u_char *key_data, int keylen) +{ + key_ref_t kr; + + if (!sed_opal_keyring) + return -ENOKEY; + + kr = key_create_or_update(make_key_ref(sed_opal_keyring, true), "user", + desc, (const void *)key_data, keylen, + KEY_USR_VIEW | KEY_USR_SEARCH | KEY_USR_WRITE, + KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_BUILT_IN | + KEY_ALLOC_BYPASS_RESTRICTION); + if (IS_ERR(kr)) { + pr_err("Error adding SED key (%ld)\n", PTR_ERR(kr)); + return PTR_ERR(kr); + } + + return 0; +} + +/* + * Read a SED Opal key from the SED Opal keyring. + */ +static int read_sed_opal_key(const char *key_name, u_char *buffer, int buflen) +{ + int ret; + key_ref_t kref; + struct key *key; + + if (!sed_opal_keyring) + return -ENOKEY; + + kref = keyring_search(make_key_ref(sed_opal_keyring, true), + &key_type_user, key_name, true); + + if (IS_ERR(kref)) + ret = PTR_ERR(kref); + + key = key_ref_to_ptr(kref); + down_read(&key->sem); + ret = key_validate(key); + if (ret == 0) { + if (buflen > key->datalen) + buflen = key->datalen; + + ret = key->type->read(key, (char *)buffer, buflen); + } + up_read(&key->sem); + + key_ref_put(kref); + + return ret; +} + +static int opal_get_key(struct opal_dev *dev, struct opal_key *key) +{ + int ret = 0; + + switch (key->key_type) { + case OPAL_INCLUDED: + /* the key is ready to use */ + break; + case OPAL_KEYRING: + /* the key is in the keyring */ + ret = read_sed_opal_key(OPAL_AUTH_KEY, key->key, OPAL_KEY_MAX); + if (ret > 0) { + if (ret > U8_MAX) { + ret = -ENOSPC; + goto error; + } + key->key_len = ret; + key->key_type = OPAL_INCLUDED; + } + break; + default: + ret = -EINVAL; + break; + } + if (ret < 0) + goto error; + + /* must have a PEK by now or it's an error */ + if (key->key_type != OPAL_INCLUDED || key->key_len == 0) { + ret = -EINVAL; + goto error; + } + return 0; +error: + pr_debug("Error getting password: %d\n", ret); + return ret; +} + static bool check_tper(const void *data) { const struct d0_tper_features *tper = data; @@ -463,8 +563,11 @@ out_error: return error; } -static int opal_discovery0_end(struct opal_dev *dev) +static int opal_discovery0_end(struct opal_dev *dev, void *data) { + struct opal_discovery *discv_out = data; /* may be NULL */ + u8 __user *buf_out; + u64 len_out; bool found_com_id = false, supported = true, single_user = false; const struct d0_header *hdr = (struct d0_header *)dev->resp; const u8 *epos = dev->resp, *cpos = dev->resp; @@ -480,6 +583,15 @@ static int opal_discovery0_end(struct opal_dev *dev) return -EFAULT; } + if (discv_out) { + buf_out = (u8 __user *)(uintptr_t)discv_out->data; + len_out = min_t(u64, discv_out->size, hlen); + if (buf_out && copy_to_user(buf_out, dev->resp, len_out)) + return -EFAULT; + + discv_out->size = hlen; /* actual size of data */ + } + epos += hlen; /* end of buffer */ cpos += sizeof(*hdr); /* current position on buffer */ @@ -565,13 +677,13 @@ static int opal_discovery0(struct opal_dev *dev, void *data) if (ret) return ret; - return opal_discovery0_end(dev); + return opal_discovery0_end(dev, data); } static int opal_discovery0_step(struct opal_dev *dev) { const struct opal_step discovery0_step = { - opal_discovery0, + opal_discovery0, NULL }; return execute_step(dev, &discovery0_step, 0); @@ -1757,6 +1869,26 @@ static int internal_activate_user(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } +static int revert_lsp(struct opal_dev *dev, void *data) +{ + struct opal_revert_lsp *rev = data; + int err; + + err = cmd_start(dev, opaluid[OPAL_THISSP_UID], + opalmethod[OPAL_REVERTSP]); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_KEEP_GLOBAL_RANGE_KEY); + add_token_u8(&err, dev, (rev->options & OPAL_PRESERVE) ? + OPAL_TRUE : OPAL_FALSE); + add_token_u8(&err, dev, OPAL_ENDNAME); + if (err) { + pr_debug("Error building REVERT SP command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + static int erase_locking_range(struct opal_dev *dev, void *data) { struct opal_session_info *session = data; @@ -2427,6 +2559,9 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev, }; int ret; + ret = opal_get_key(dev, &opal_session->opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps)); @@ -2435,6 +2570,42 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev, return ret; } +static int opal_get_discv(struct opal_dev *dev, struct opal_discovery *discv) +{ + const struct opal_step discovery0_step = { + opal_discovery0, discv + }; + int ret = 0; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_step(dev, &discovery0_step, 0); + mutex_unlock(&dev->dev_lock); + if (ret) + return ret; + return discv->size; /* modified to actual length of data */ +} + +static int opal_revertlsp(struct opal_dev *dev, struct opal_revert_lsp *rev) +{ + /* controller will terminate session */ + const struct opal_step steps[] = { + { start_admin1LSP_opal_session, &rev->key }, + { revert_lsp, rev } + }; + int ret; + + ret = opal_get_key(dev, &rev->key); + if (ret) + return ret; + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, steps, ARRAY_SIZE(steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + static int opal_erase_locking_range(struct opal_dev *dev, struct opal_session_info *opal_session) { @@ -2445,6 +2616,9 @@ static int opal_erase_locking_range(struct opal_dev *dev, }; int ret; + ret = opal_get_key(dev, &opal_session->opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps)); @@ -2473,6 +2647,9 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev, opal_mbr->enable_disable != OPAL_MBR_DISABLE) return -EINVAL; + ret = opal_get_key(dev, &opal_mbr->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); @@ -2498,6 +2675,9 @@ static int opal_set_mbr_done(struct opal_dev *dev, mbr_done->done_flag != OPAL_MBR_NOT_DONE) return -EINVAL; + ret = opal_get_key(dev, &mbr_done->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); @@ -2519,6 +2699,9 @@ static int opal_write_shadow_mbr(struct opal_dev *dev, if (info->size == 0) return 0; + ret = opal_get_key(dev, &info->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); @@ -2576,6 +2759,9 @@ static int opal_add_user_to_lr(struct opal_dev *dev, return -EINVAL; } + ret = opal_get_key(dev, &lk_unlk->session.opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, steps, ARRAY_SIZE(steps)); @@ -2598,6 +2784,10 @@ static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal, bool psi int ret; + ret = opal_get_key(dev, opal); + + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); if (psid) @@ -2698,6 +2888,9 @@ static int opal_lock_unlock(struct opal_dev *dev, if (lk_unlk->session.who > OPAL_USER9) return -EINVAL; + ret = opal_get_key(dev, &lk_unlk->session.opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); opal_lock_check_for_saved_key(dev, lk_unlk); ret = __opal_lock_unlock(dev, lk_unlk); @@ -2721,6 +2914,9 @@ static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal) if (!dev) return -ENODEV; + ret = opal_get_key(dev, opal); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, owner_steps, ARRAY_SIZE(owner_steps)); @@ -2743,6 +2939,9 @@ static int opal_activate_lsp(struct opal_dev *dev, if (!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS) return -EINVAL; + ret = opal_get_key(dev, &opal_lr_act->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps)); @@ -2761,6 +2960,9 @@ static int opal_setup_locking_range(struct opal_dev *dev, }; int ret; + ret = opal_get_key(dev, &opal_lrs->session.opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); @@ -2814,6 +3016,14 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps)); mutex_unlock(&dev->dev_lock); + if (ret) + return ret; + + /* update keyring with new password */ + ret = update_sed_opal_key(OPAL_AUTH_KEY, + opal_pw->new_user_pw.opal_key.key, + opal_pw->new_user_pw.opal_key.key_len); + return ret; } @@ -2834,6 +3044,9 @@ static int opal_activate_user(struct opal_dev *dev, return -EINVAL; } + ret = opal_get_key(dev, &opal_session->opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, act_steps, ARRAY_SIZE(act_steps)); @@ -2920,6 +3133,9 @@ static int opal_generic_read_write_table(struct opal_dev *dev, { int ret, bit_set; + ret = opal_get_key(dev, &rw_tbl->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); @@ -2988,9 +3204,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (!dev) - return -ENOTSUPP; + return -EOPNOTSUPP; if (!(dev->flags & OPAL_FL_SUPPORTED)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (cmd & IOC_IN) { p = memdup_user(arg, _IOC_SIZE(cmd)); @@ -3056,6 +3272,13 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_GET_GEOMETRY: ret = opal_get_geometry(dev, arg); break; + case IOC_OPAL_REVERT_LSP: + ret = opal_revertlsp(dev, p); + break; + case IOC_OPAL_DISCOVERY: + ret = opal_get_discv(dev, p); + break; + default: break; } @@ -3065,3 +3288,22 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) return ret; } EXPORT_SYMBOL_GPL(sed_ioctl); + +static int __init sed_opal_init(void) +{ + struct key *kr; + + kr = keyring_alloc(".sed_opal", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | + KEY_USR_READ | KEY_USR_SEARCH | KEY_USR_WRITE, + KEY_ALLOC_NOT_IN_QUOTA, + NULL, NULL); + if (IS_ERR(kr)) + return PTR_ERR(kr); + + sed_opal_keyring = kr; + + return 0; +} +late_initcall(sed_opal_init); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 42e0159bb258..df1cd0f718b8 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -2334,6 +2334,7 @@ static struct genl_family nbd_genl_family __ro_after_init = { .mcgrps = nbd_mcast_grps, .n_mcgrps = ARRAY_SIZE(nbd_mcast_grps), }; +MODULE_ALIAS_GENL_FAMILY(NBD_GENL_FAMILY_NAME); static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply) { diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index dc43a63b3469..c2bc85826358 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -1277,7 +1277,7 @@ static struct macio_driver swim3_driver = }; -int swim3_init(void) +static int swim3_init(void) { macio_register_driver(&swim3_driver); return 0; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 21d2e71c5514..630ddfe6657b 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -56,16 +56,21 @@ | UBLK_F_USER_RECOVERY_REISSUE \ | UBLK_F_UNPRIVILEGED_DEV \ | UBLK_F_CMD_IOCTL_ENCODE \ - | UBLK_F_USER_COPY) + | UBLK_F_USER_COPY \ + | UBLK_F_ZONED) /* All UBLK_PARAM_TYPE_* should be included here */ -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \ - UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT) +#define UBLK_PARAM_TYPE_ALL \ + (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \ + UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED) struct ublk_rq_data { struct llist_node node; struct kref ref; + __u64 sector; + __u32 operation; + __u32 nr_zones; }; struct ublk_uring_cmd_pdu { @@ -185,6 +190,266 @@ struct ublk_params_header { __u32 types; }; +static inline unsigned int ublk_req_build_flags(struct request *req); +static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, + int tag); + +static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_USER_COPY; +} + +static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_ZONED; +} + +static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_ZONED; +} + +#ifdef CONFIG_BLK_DEV_ZONED + +static int ublk_get_nr_zones(const struct ublk_device *ub) +{ + const struct ublk_param_basic *p = &ub->params.basic; + + /* Zone size is a power of 2 */ + return p->dev_sectors >> ilog2(p->chunk_sectors); +} + +static int ublk_revalidate_disk_zones(struct ublk_device *ub) +{ + return blk_revalidate_disk_zones(ub->ub_disk, NULL); +} + +static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) +{ + const struct ublk_param_zoned *p = &ub->params.zoned; + int nr_zones; + + if (!ublk_dev_is_zoned(ub)) + return -EINVAL; + + if (!p->max_zone_append_sectors) + return -EINVAL; + + nr_zones = ublk_get_nr_zones(ub); + + if (p->max_active_zones > nr_zones) + return -EINVAL; + + if (p->max_open_zones > nr_zones) + return -EINVAL; + + return 0; +} + +static int ublk_dev_param_zoned_apply(struct ublk_device *ub) +{ + const struct ublk_param_zoned *p = &ub->params.zoned; + + disk_set_zoned(ub->ub_disk, BLK_ZONED_HM); + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue); + blk_queue_required_elevator_features(ub->ub_disk->queue, + ELEVATOR_F_ZBD_SEQ_WRITE); + disk_set_max_active_zones(ub->ub_disk, p->max_active_zones); + disk_set_max_open_zones(ub->ub_disk, p->max_open_zones); + blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors); + + ub->ub_disk->nr_zones = ublk_get_nr_zones(ub); + + return 0; +} + +/* Based on virtblk_alloc_report_buffer */ +static void *ublk_alloc_report_buffer(struct ublk_device *ublk, + unsigned int nr_zones, size_t *buflen) +{ + struct request_queue *q = ublk->ub_disk->queue; + size_t bufsize; + void *buf; + + nr_zones = min_t(unsigned int, nr_zones, + ublk->ub_disk->nr_zones); + + bufsize = nr_zones * sizeof(struct blk_zone); + bufsize = + min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT); + + while (bufsize >= sizeof(struct blk_zone)) { + buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY); + if (buf) { + *buflen = bufsize; + return buf; + } + bufsize >>= 1; + } + + *buflen = 0; + return NULL; +} + +static int ublk_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct ublk_device *ub = disk->private_data; + unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors; + unsigned int first_zone = sector >> ilog2(zone_size_sectors); + unsigned int done_zones = 0; + unsigned int max_zones_per_request; + int ret; + struct blk_zone *buffer; + size_t buffer_length; + + nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone, + nr_zones); + + buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length); + if (!buffer) + return -ENOMEM; + + max_zones_per_request = buffer_length / sizeof(struct blk_zone); + + while (done_zones < nr_zones) { + unsigned int remaining_zones = nr_zones - done_zones; + unsigned int zones_in_request = + min_t(unsigned int, remaining_zones, max_zones_per_request); + struct request *req; + struct ublk_rq_data *pdu; + blk_status_t status; + + memset(buffer, 0, buffer_length); + + req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + pdu = blk_mq_rq_to_pdu(req); + pdu->operation = UBLK_IO_OP_REPORT_ZONES; + pdu->sector = sector; + pdu->nr_zones = zones_in_request; + + ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length, + GFP_KERNEL); + if (ret) { + blk_mq_free_request(req); + goto out; + } + + status = blk_execute_rq(req, 0); + ret = blk_status_to_errno(status); + blk_mq_free_request(req); + if (ret) + goto out; + + for (unsigned int i = 0; i < zones_in_request; i++) { + struct blk_zone *zone = buffer + i; + + /* A zero length zone means no more zones in this response */ + if (!zone->len) + break; + + ret = cb(zone, i, data); + if (ret) + goto out; + + done_zones++; + sector += zone_size_sectors; + + } + } + + ret = done_zones; + +out: + kvfree(buffer); + return ret; +} + +static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, + struct request *req) +{ + struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); + struct ublk_io *io = &ubq->ios[req->tag]; + struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req); + u32 ublk_op; + + switch (req_op(req)) { + case REQ_OP_ZONE_OPEN: + ublk_op = UBLK_IO_OP_ZONE_OPEN; + break; + case REQ_OP_ZONE_CLOSE: + ublk_op = UBLK_IO_OP_ZONE_CLOSE; + break; + case REQ_OP_ZONE_FINISH: + ublk_op = UBLK_IO_OP_ZONE_FINISH; + break; + case REQ_OP_ZONE_RESET: + ublk_op = UBLK_IO_OP_ZONE_RESET; + break; + case REQ_OP_ZONE_APPEND: + ublk_op = UBLK_IO_OP_ZONE_APPEND; + break; + case REQ_OP_ZONE_RESET_ALL: + ublk_op = UBLK_IO_OP_ZONE_RESET_ALL; + break; + case REQ_OP_DRV_IN: + ublk_op = pdu->operation; + switch (ublk_op) { + case UBLK_IO_OP_REPORT_ZONES: + iod->op_flags = ublk_op | ublk_req_build_flags(req); + iod->nr_zones = pdu->nr_zones; + iod->start_sector = pdu->sector; + return BLK_STS_OK; + default: + return BLK_STS_IOERR; + } + case REQ_OP_DRV_OUT: + /* We do not support drv_out */ + return BLK_STS_NOTSUPP; + default: + return BLK_STS_IOERR; + } + + iod->op_flags = ublk_op | ublk_req_build_flags(req); + iod->nr_sectors = blk_rq_sectors(req); + iod->start_sector = blk_rq_pos(req); + iod->addr = io->addr; + + return BLK_STS_OK; +} + +#else + +#define ublk_report_zones (NULL) + +static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) +{ + return -EOPNOTSUPP; +} + +static int ublk_dev_param_zoned_apply(struct ublk_device *ub) +{ + return -EOPNOTSUPP; +} + +static int ublk_revalidate_disk_zones(struct ublk_device *ub) +{ + return 0; +} + +static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, + struct request *req) +{ + return BLK_STS_NOTSUPP; +} + +#endif + static inline void __ublk_complete_rq(struct request *req); static void ublk_complete_rq(struct kref *ref); @@ -281,6 +546,9 @@ static int ublk_validate_params(const struct ublk_device *ub) if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9)) return -EINVAL; + + if (ublk_dev_is_zoned(ub) && !p->chunk_sectors) + return -EINVAL; } else return -EINVAL; @@ -299,6 +567,11 @@ static int ublk_validate_params(const struct ublk_device *ub) if (ub->params.types & UBLK_PARAM_TYPE_DEVT) return -EINVAL; + if (ub->params.types & UBLK_PARAM_TYPE_ZONED) + return ublk_dev_param_zoned_validate(ub); + else if (ublk_dev_is_zoned(ub)) + return -EINVAL; + return 0; } @@ -312,6 +585,9 @@ static int ublk_apply_params(struct ublk_device *ub) if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) ublk_dev_param_discard_apply(ub); + if (ub->params.types & UBLK_PARAM_TYPE_ZONED) + return ublk_dev_param_zoned_apply(ub); + return 0; } @@ -482,6 +758,7 @@ static const struct block_device_operations ub_fops = { .owner = THIS_MODULE, .open = ublk_open, .free_disk = ublk_free_disk, + .report_zones = ublk_report_zones, }; #define UBLK_MAX_PIN_PAGES 32 @@ -596,7 +873,8 @@ static inline bool ublk_need_map_req(const struct request *req) static inline bool ublk_need_unmap_req(const struct request *req) { - return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ; + return ublk_rq_has_data(req) && + (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN); } static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, @@ -680,8 +958,13 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) { struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); struct ublk_io *io = &ubq->ios[req->tag]; + enum req_op op = req_op(req); u32 ublk_op; + if (!ublk_queue_is_zoned(ubq) && + (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND)) + return BLK_STS_IOERR; + switch (req_op(req)) { case REQ_OP_READ: ublk_op = UBLK_IO_OP_READ; @@ -699,6 +982,8 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) ublk_op = UBLK_IO_OP_WRITE_ZEROES; break; default: + if (ublk_queue_is_zoned(ubq)) + return ublk_setup_iod_zoned(ubq, req); return BLK_STS_IOERR; } @@ -751,7 +1036,8 @@ static inline void __ublk_complete_rq(struct request *req) * * Both the two needn't unmap. */ - if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE && + req_op(req) != REQ_OP_DRV_IN) goto exit; /* for READ request, writing data in iod->addr to rq buffers */ @@ -1114,8 +1400,13 @@ static void ublk_commit_completion(struct ublk_device *ub, /* find the io request and complete */ req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); + if (WARN_ON_ONCE(unlikely(!req))) + return; - if (req && likely(!blk_should_fake_timeout(req->q))) + if (req_op(req) == REQ_OP_ZONE_APPEND) + req->__sector = ub_cmd->zone_append_lba; + + if (likely(!blk_should_fake_timeout(req->q))) ublk_put_req_ref(ubq, req); } @@ -1414,11 +1705,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) goto out; - if (ublk_support_user_copy(ubq) && ub_cmd->addr) { - ret = -EINVAL; - goto out; - } - ret = ublk_check_cmd_op(cmd_op); if (ret) goto out; @@ -1445,6 +1731,10 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, */ if (!ub_cmd->addr && !ublk_need_get_data(ubq)) goto out; + } else if (ub_cmd->addr) { + /* User copy requires addr to be unset */ + ret = -EINVAL; + goto out; } ublk_fill_io_cmd(io, cmd, ub_cmd->addr); @@ -1464,7 +1754,15 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || req_op(req) == REQ_OP_READ)) goto out; + } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) { + /* + * User copy requires addr to be unset when command is + * not zone append + */ + ret = -EINVAL; + goto out; } + ublk_fill_io_cmd(io, cmd, ub_cmd->addr); ublk_commit_completion(ub, ub_cmd); break; @@ -1537,11 +1835,14 @@ static inline bool ublk_check_ubuf_dir(const struct request *req, int ubuf_dir) { /* copy ubuf to request pages */ - if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE) + if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) && + ubuf_dir == ITER_SOURCE) return true; /* copy request pages to ubuf */ - if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST) + if ((req_op(req) == REQ_OP_WRITE || + req_op(req) == REQ_OP_ZONE_APPEND) && + ubuf_dir == ITER_DEST) return true; return false; @@ -1881,17 +2182,24 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) get_device(&ub->cdev_dev); ub->dev_info.state = UBLK_S_DEV_LIVE; + + if (ublk_dev_is_zoned(ub)) { + ret = ublk_revalidate_disk_zones(ub); + if (ret) + goto out_put_cdev; + } + ret = add_disk(disk); + if (ret) + goto out_put_cdev; + + set_bit(UB_STATE_USED, &ub->state); + +out_put_cdev: if (ret) { - /* - * Has to drop the reference since ->free_disk won't be - * called in case of add_disk failure. - */ ub->dev_info.state = UBLK_S_DEV_DEAD; ublk_put_device(ub); - goto out_put_disk; } - set_bit(UB_STATE_USED, &ub->state); out_put_disk: if (ret) put_disk(disk); @@ -2038,9 +2346,16 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) UBLK_F_URING_CMD_COMP_IN_TASK; /* GET_DATA isn't needed any more with USER_COPY */ - if (ub->dev_info.flags & UBLK_F_USER_COPY) + if (ublk_dev_is_user_copy(ub)) ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; + /* Zoned storage support requires user copy feature */ + if (ublk_dev_is_zoned(ub) && + (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) { + ret = -EINVAL; + goto out_free_dev_number; + } + /* We are not ready to support zero copy */ ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; @@ -2433,14 +2748,9 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, if (header->len < header->dev_path_len) return -EINVAL; - dev_path = kmalloc(header->dev_path_len + 1, GFP_KERNEL); - if (!dev_path) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(dev_path, argp, header->dev_path_len)) - goto exit; - dev_path[header->dev_path_len] = 0; + dev_path = memdup_user_nul(argp, header->dev_path_len); + if (IS_ERR(dev_path)) + return PTR_ERR(dev_path); ret = -EINVAL; switch (_IOC_NR(cmd->cmd_op)) { diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index b0a22e99bade..2a8b081bce7d 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -15,6 +15,7 @@ if MD config BLK_DEV_MD tristate "RAID support" select BLOCK_HOLDER_DEPRECATED if SYSFS + select BUFFER_HEAD # BLOCK_LEGACY_AUTOLOAD requirement should be removed # after relevant mdadm enhancements - to make "names=yes" # the default - are widely available. @@ -50,6 +51,16 @@ config MD_AUTODETECT If unsure, say Y. +config MD_BITMAP_FILE + bool "MD bitmap file support (deprecated)" + default y + help + If you say Y here, support for write intent bitmaps in files on an + external file system is enabled. This is an alternative to the internal + bitmaps near the MD superblock, and very problematic code that abuses + various kernel APIs and can only work with files on a file system not + actually sitting on the MD device. + config MD_LINEAR tristate "Linear (append) mode (deprecated)" depends on BLK_DEV_MD diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 1dc6227d353e..f2662c21a6df 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1160,7 +1160,6 @@ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio) tag_len = io->cc->on_disk_tag_size * (bio_sectors(bio) >> io->cc->sector_shift); - bip->bip_iter.bi_size = tag_len; bip->bip_iter.bi_sector = io->cc->start + io->sector; ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata), diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index becdb689190e..5f9991765f27 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3723,7 +3723,6 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); } } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 1ff712889a3b..6f9ff14971f9 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -139,29 +139,26 @@ static void md_bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page */ /* IO operations when bitmap is stored near all superblocks */ + +/* choose a good rdev and read the page from there */ static int read_sb_page(struct mddev *mddev, loff_t offset, - struct page *page, - unsigned long index, int size) + struct page *page, unsigned long index, int size) { - /* choose a good rdev and read the page from there */ + sector_t sector = mddev->bitmap_info.offset + offset + + index * (PAGE_SIZE / SECTOR_SIZE); struct md_rdev *rdev; - sector_t target; rdev_for_each(rdev, mddev) { - if (! test_bit(In_sync, &rdev->flags) - || test_bit(Faulty, &rdev->flags) - || test_bit(Bitmap_sync, &rdev->flags)) - continue; + u32 iosize = roundup(size, bdev_logical_block_size(rdev->bdev)); - target = offset + index * (PAGE_SIZE/512); + if (!test_bit(In_sync, &rdev->flags) || + test_bit(Faulty, &rdev->flags) || + test_bit(Bitmap_sync, &rdev->flags)) + continue; - if (sync_page_io(rdev, target, - roundup(size, bdev_logical_block_size(rdev->bdev)), - page, REQ_OP_READ, true)) { - page->index = index; + if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ, true)) return 0; - } } return -EIO; } @@ -225,18 +222,19 @@ static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size, } static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, - struct page *page) + unsigned long pg_index, struct page *page) { struct block_device *bdev; struct mddev *mddev = bitmap->mddev; struct bitmap_storage *store = &bitmap->storage; loff_t sboff, offset = mddev->bitmap_info.offset; - sector_t ps, doff; + sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE; unsigned int size = PAGE_SIZE; unsigned int opt_size = PAGE_SIZE; + sector_t doff; bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; - if (page->index == store->file_pages - 1) { + if (pg_index == store->file_pages - 1) { unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1); if (last_page_size == 0) @@ -245,7 +243,6 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, opt_size = optimal_io_size(bdev, last_page_size, size); } - ps = page->index * PAGE_SIZE / SECTOR_SIZE; sboff = rdev->sb_start + offset; doff = rdev->data_offset; @@ -279,55 +276,41 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, return 0; } -static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) +static void write_sb_page(struct bitmap *bitmap, unsigned long pg_index, + struct page *page, bool wait) { - struct md_rdev *rdev; struct mddev *mddev = bitmap->mddev; - int ret; do { - rdev = NULL; + struct md_rdev *rdev = NULL; + while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { - ret = __write_sb_page(rdev, bitmap, page); - if (ret) - return ret; + if (__write_sb_page(rdev, bitmap, pg_index, page) < 0) { + set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); + return; + } } } while (wait && md_super_wait(mddev) < 0); - - return 0; } static void md_bitmap_file_kick(struct bitmap *bitmap); -/* - * write out a page to a file - */ -static void write_page(struct bitmap *bitmap, struct page *page, int wait) -{ - struct buffer_head *bh; - - if (bitmap->storage.file == NULL) { - switch (write_sb_page(bitmap, page, wait)) { - case -EINVAL: - set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); - } - } else { - - bh = page_buffers(page); - while (bh && bh->b_blocknr) { - atomic_inc(&bitmap->pending_writes); - set_buffer_locked(bh); - set_buffer_mapped(bh); - submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); - bh = bh->b_this_page; - } +#ifdef CONFIG_MD_BITMAP_FILE +static void write_file_page(struct bitmap *bitmap, struct page *page, int wait) +{ + struct buffer_head *bh = page_buffers(page); - if (wait) - wait_event(bitmap->write_wait, - atomic_read(&bitmap->pending_writes)==0); + while (bh && bh->b_blocknr) { + atomic_inc(&bitmap->pending_writes); + set_buffer_locked(bh); + set_buffer_mapped(bh); + submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); + bh = bh->b_this_page; } - if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) - md_bitmap_file_kick(bitmap); + + if (wait) + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes) == 0); } static void end_bitmap_write(struct buffer_head *bh, int uptodate) @@ -364,10 +347,8 @@ static void free_buffers(struct page *page) * This usage is similar to how swap files are handled, and allows us * to write to a file with no concerns of memory allocation failing. */ -static int read_page(struct file *file, unsigned long index, - struct bitmap *bitmap, - unsigned long count, - struct page *page) +static int read_file_page(struct file *file, unsigned long index, + struct bitmap *bitmap, unsigned long count, struct page *page) { int ret = 0; struct inode *inode = file_inode(file); @@ -415,7 +396,6 @@ static int read_page(struct file *file, unsigned long index, blk_cur++; bh = bh->b_this_page; } - page->index = index; wait_event(bitmap->write_wait, atomic_read(&bitmap->pending_writes)==0); @@ -429,12 +409,46 @@ out: ret); return ret; } +#else /* CONFIG_MD_BITMAP_FILE */ +static void write_file_page(struct bitmap *bitmap, struct page *page, int wait) +{ +} +static int read_file_page(struct file *file, unsigned long index, + struct bitmap *bitmap, unsigned long count, struct page *page) +{ + return -EIO; +} +static void free_buffers(struct page *page) +{ + put_page(page); +} +#endif /* CONFIG_MD_BITMAP_FILE */ /* * bitmap file superblock operations */ /* + * write out a page to a file + */ +static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index, + bool wait) +{ + struct bitmap_storage *store = &bitmap->storage; + struct page *page = store->filemap[pg_index]; + + if (mddev_is_clustered(bitmap->mddev)) { + pg_index += bitmap->cluster_slot * + DIV_ROUND_UP(store->bytes, PAGE_SIZE); + } + + if (store->file) + write_file_page(bitmap, page, wait); + else + write_sb_page(bitmap, pg_index, page, wait); +} + +/* * md_bitmap_wait_writes() should be called before writing any bitmap * blocks, to ensure previous writes, particularly from * md_bitmap_daemon_work(), have completed. @@ -488,7 +502,12 @@ void md_bitmap_update_sb(struct bitmap *bitmap) sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> bitmap_info.space); kunmap_atomic(sb); - write_page(bitmap, bitmap->storage.sb_page, 1); + + if (bitmap->storage.file) + write_file_page(bitmap, bitmap->storage.sb_page, 1); + else + write_sb_page(bitmap, bitmap->storage.sb_index, + bitmap->storage.sb_page, 1); } EXPORT_SYMBOL(md_bitmap_update_sb); @@ -540,7 +559,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap) bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (bitmap->storage.sb_page == NULL) return -ENOMEM; - bitmap->storage.sb_page->index = 0; + bitmap->storage.sb_index = 0; sb = kmap_atomic(bitmap->storage.sb_page); @@ -601,7 +620,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap) unsigned long sectors_reserved = 0; int err = -EINVAL; struct page *sb_page; - loff_t offset = bitmap->mddev->bitmap_info.offset; + loff_t offset = 0; if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) { chunksize = 128 * 1024 * 1024; @@ -628,7 +647,7 @@ re_read: bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t); /* to 4k blocks */ bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); - offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3)); + offset = bitmap->cluster_slot * (bm_blocks << 3); pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, bitmap->cluster_slot, offset); } @@ -637,13 +656,11 @@ re_read: loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; - err = read_page(bitmap->storage.file, 0, + err = read_file_page(bitmap->storage.file, 0, bitmap, bytes, sb_page); } else { - err = read_sb_page(bitmap->mddev, - offset, - sb_page, - 0, sizeof(bitmap_super_t)); + err = read_sb_page(bitmap->mddev, offset, sb_page, 0, + sizeof(bitmap_super_t)); } if (err) return err; @@ -819,7 +836,7 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store, if (store->sb_page) { store->filemap[0] = store->sb_page; pnum = 1; - store->sb_page->index = offset; + store->sb_index = offset; } for ( ; pnum < num_pages; pnum++) { @@ -828,7 +845,6 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store, store->file_pages = pnum; return -ENOMEM; } - store->filemap[pnum]->index = pnum + offset; } store->file_pages = pnum; @@ -847,14 +863,10 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store, static void md_bitmap_file_unmap(struct bitmap_storage *store) { - struct page **map, *sb_page; - int pages; - struct file *file; - - file = store->file; - map = store->filemap; - pages = store->file_pages; - sb_page = store->sb_page; + struct file *file = store->file; + struct page *sb_page = store->sb_page; + struct page **map = store->filemap; + int pages = store->file_pages; while (pages--) if (map[pages] != sb_page) /* 0 is sb_page, release it below */ @@ -879,21 +891,13 @@ static void md_bitmap_file_unmap(struct bitmap_storage *store) */ static void md_bitmap_file_kick(struct bitmap *bitmap) { - char *path, *ptr = NULL; - if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { md_bitmap_update_sb(bitmap); if (bitmap->storage.file) { - path = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (path) - ptr = file_path(bitmap->storage.file, - path, PAGE_SIZE); + pr_warn("%s: kicking failed bitmap file %pD4 from array!\n", + bmname(bitmap), bitmap->storage.file); - pr_warn("%s: kicking failed bitmap file %s from array!\n", - bmname(bitmap), IS_ERR(ptr) ? "" : ptr); - - kfree(path); } else pr_warn("%s: disabling internal bitmap due to errors\n", bmname(bitmap)); @@ -945,6 +949,7 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) void *kaddr; unsigned long chunk = block >> bitmap->counts.chunkshift; struct bitmap_storage *store = &bitmap->storage; + unsigned long index = file_page_index(store, chunk); unsigned long node_offset = 0; if (mddev_is_clustered(bitmap->mddev)) @@ -962,9 +967,9 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) else set_bit_le(bit, kaddr); kunmap_atomic(kaddr); - pr_debug("set file bit %lu page %lu\n", bit, page->index); + pr_debug("set file bit %lu page %lu\n", bit, index); /* record page number so it gets flushed to disk when unplug occurs */ - set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY); + set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY); } static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) @@ -974,6 +979,7 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) void *paddr; unsigned long chunk = block >> bitmap->counts.chunkshift; struct bitmap_storage *store = &bitmap->storage; + unsigned long index = file_page_index(store, chunk); unsigned long node_offset = 0; if (mddev_is_clustered(bitmap->mddev)) @@ -989,8 +995,8 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) else clear_bit_le(bit, paddr); kunmap_atomic(paddr); - if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) { - set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING); + if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) { + set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING); bitmap->allclean = 0; } } @@ -1042,7 +1048,7 @@ void md_bitmap_unplug(struct bitmap *bitmap) "md bitmap_unplug"); } clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); - write_page(bitmap, bitmap->storage.filemap[i], 0); + filemap_write_page(bitmap, i, false); writing = 1; } } @@ -1084,33 +1090,31 @@ void md_bitmap_unplug_async(struct bitmap *bitmap) EXPORT_SYMBOL(md_bitmap_unplug_async); static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); -/* * bitmap_init_from_disk -- called at bitmap_create time to initialize - * the in-memory bitmap from the on-disk bitmap -- also, sets up the - * memory mapping of the bitmap file - * Special cases: - * if there's no bitmap file, or if the bitmap file had been - * previously kicked from the array, we mark all the bits as - * 1's in order to cause a full resync. + +/* + * Initialize the in-memory bitmap from the on-disk bitmap and set up the memory + * mapping of the bitmap file. + * + * Special case: If there's no bitmap file, or if the bitmap file had been + * previously kicked from the array, we mark all the bits as 1's in order to + * cause a full resync. * * We ignore all bits for sectors that end earlier than 'start'. - * This is used when reading an out-of-date bitmap... + * This is used when reading an out-of-date bitmap. */ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) { - unsigned long i, chunks, index, oldindex, bit, node_offset = 0; - struct page *page = NULL; - unsigned long bit_cnt = 0; - struct file *file; - unsigned long offset; - int outofdate; - int ret = -ENOSPC; - void *paddr; + bool outofdate = test_bit(BITMAP_STALE, &bitmap->flags); + struct mddev *mddev = bitmap->mddev; + unsigned long chunks = bitmap->counts.chunks; struct bitmap_storage *store = &bitmap->storage; + struct file *file = store->file; + unsigned long node_offset = 0; + unsigned long bit_cnt = 0; + unsigned long i; + int ret; - chunks = bitmap->counts.chunks; - file = store->file; - - if (!file && !bitmap->mddev->bitmap_info.offset) { + if (!file && !mddev->bitmap_info.offset) { /* No permanent bitmap - fill with '1s'. */ store->filemap = NULL; store->file_pages = 0; @@ -1125,77 +1129,79 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) return 0; } - outofdate = test_bit(BITMAP_STALE, &bitmap->flags); - if (outofdate) - pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap)); - if (file && i_size_read(file->f_mapping->host) < store->bytes) { pr_warn("%s: bitmap file too short %lu < %lu\n", bmname(bitmap), (unsigned long) i_size_read(file->f_mapping->host), store->bytes); + ret = -ENOSPC; goto err; } - oldindex = ~0L; - offset = 0; - if (!bitmap->mddev->bitmap_info.external) - offset = sizeof(bitmap_super_t); - - if (mddev_is_clustered(bitmap->mddev)) + if (mddev_is_clustered(mddev)) node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE)); - for (i = 0; i < chunks; i++) { - int b; - index = file_page_index(&bitmap->storage, i); - bit = file_page_offset(&bitmap->storage, i); - if (index != oldindex) { /* this is a new page, read it in */ - int count; - /* unmap the old page, we're done with it */ - if (index == store->file_pages-1) - count = store->bytes - index * PAGE_SIZE; - else - count = PAGE_SIZE; - page = store->filemap[index]; - if (file) - ret = read_page(file, index, bitmap, - count, page); - else - ret = read_sb_page( - bitmap->mddev, - bitmap->mddev->bitmap_info.offset, - page, - index + node_offset, count); + for (i = 0; i < store->file_pages; i++) { + struct page *page = store->filemap[i]; + int count; - if (ret) - goto err; + /* unmap the old page, we're done with it */ + if (i == store->file_pages - 1) + count = store->bytes - i * PAGE_SIZE; + else + count = PAGE_SIZE; - oldindex = index; + if (file) + ret = read_file_page(file, i, bitmap, count, page); + else + ret = read_sb_page(mddev, 0, page, i + node_offset, + count); + if (ret) + goto err; + } - if (outofdate) { - /* - * if bitmap is out of date, dirty the - * whole page and write it out - */ - paddr = kmap_atomic(page); - memset(paddr + offset, 0xff, - PAGE_SIZE - offset); - kunmap_atomic(paddr); - write_page(bitmap, page, 1); + if (outofdate) { + pr_warn("%s: bitmap file is out of date, doing full recovery\n", + bmname(bitmap)); + for (i = 0; i < store->file_pages; i++) { + struct page *page = store->filemap[i]; + unsigned long offset = 0; + void *paddr; + + if (i == 0 && !mddev->bitmap_info.external) + offset = sizeof(bitmap_super_t); + + /* + * If the bitmap is out of date, dirty the whole page + * and write it out + */ + paddr = kmap_atomic(page); + memset(paddr + offset, 0xff, PAGE_SIZE - offset); + kunmap_atomic(paddr); + + filemap_write_page(bitmap, i, true); + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) { ret = -EIO; - if (test_bit(BITMAP_WRITE_ERROR, - &bitmap->flags)) - goto err; + goto err; } } + } + + for (i = 0; i < chunks; i++) { + struct page *page = filemap_get_page(&bitmap->storage, i); + unsigned long bit = file_page_offset(&bitmap->storage, i); + void *paddr; + bool was_set; + paddr = kmap_atomic(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) - b = test_bit(bit, paddr); + was_set = test_bit(bit, paddr); else - b = test_bit_le(bit, paddr); + was_set = test_bit_le(bit, paddr); kunmap_atomic(paddr); - if (b) { + + if (was_set) { /* if the disk bit is set, set the memory bit */ int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift >= start); @@ -1204,7 +1210,6 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) needed); bit_cnt++; } - offset = 0; } pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n", @@ -1396,9 +1401,8 @@ void md_bitmap_daemon_work(struct mddev *mddev) break; if (bitmap->storage.filemap && test_and_clear_page_attr(bitmap, j, - BITMAP_PAGE_NEEDWRITE)) { - write_page(bitmap, bitmap->storage.filemap[j], 0); - } + BITMAP_PAGE_NEEDWRITE)) + filemap_write_page(bitmap, j, false); } done: @@ -2542,6 +2546,10 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (backlog > COUNTER_MAX) return -EINVAL; + rv = mddev_lock(mddev); + if (rv) + return rv; + /* * Without write mostly device, it doesn't make sense to set * backlog for max_write_behind. @@ -2555,6 +2563,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (!has_write_mostly) { pr_warn_ratelimited("%s: can't set backlog, no write mostly device available\n", mdname(mddev)); + mddev_unlock(mddev); return -EINVAL; } @@ -2565,13 +2574,13 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) mddev_destroy_serial_pool(mddev, NULL, false); } else if (backlog && !mddev->serial_info_pool) { /* serial_info_pool is needed since backlog is not zero */ - struct md_rdev *rdev; - rdev_for_each(rdev, mddev) mddev_create_serial_pool(mddev, rdev, false); } if (old_mwb != backlog) md_bitmap_update_sb(mddev->bitmap); + + mddev_unlock(mddev); return len; } diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 8a3788c9bfef..bb9eb418780a 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -201,6 +201,7 @@ struct bitmap { struct file *file; /* backing disk file */ struct page *sb_page; /* cached copy of the bitmap * file superblock */ + unsigned long sb_index; struct page **filemap; /* list of cache pages for * the file */ unsigned long *filemap_attr; /* attributes associated diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 3d9fd74233df..1e26eb223349 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -952,8 +952,8 @@ static int join(struct mddev *mddev, int nodes) return 0; err: set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); - md_unregister_thread(&cinfo->recovery_thread); - md_unregister_thread(&cinfo->recv_thread); + md_unregister_thread(mddev, &cinfo->recovery_thread); + md_unregister_thread(mddev, &cinfo->recv_thread); lockres_free(cinfo->message_lockres); lockres_free(cinfo->token_lockres); lockres_free(cinfo->ack_lockres); @@ -1015,8 +1015,8 @@ static int leave(struct mddev *mddev) resync_bitmap(mddev); set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); - md_unregister_thread(&cinfo->recovery_thread); - md_unregister_thread(&cinfo->recv_thread); + md_unregister_thread(mddev, &cinfo->recovery_thread); + md_unregister_thread(mddev, &cinfo->recv_thread); lockres_free(cinfo->message_lockres); lockres_free(cinfo->token_lockres); lockres_free(cinfo->ack_lockres); diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c index 50ad818978a4..a039e8e20f55 100644 --- a/drivers/md/md-faulty.c +++ b/drivers/md/md-faulty.c @@ -204,6 +204,8 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio) failit = 1; } } + + md_account_bio(mddev, &bio); if (failit) { struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO, &mddev->bio_set); diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 4eb72b9dd933..71ac99646827 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -238,6 +238,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) bio = split; } + md_account_bio(mddev, &bio); bio_set_dev(bio, tmp_dev->rdev->bdev); bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - start_sector + data_offset; diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 92c45be203d7..d22276870283 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -107,6 +107,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) && md_flush_request(mddev, bio)) return true; + md_account_bio(mddev, &bio); mp_bh = mempool_alloc(&conf->pool, GFP_NOIO); mp_bh->master_bio = bio; diff --git a/drivers/md/md.c b/drivers/md/md.c index 78be7811a89f..0fe7ab6e8ab9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -453,7 +453,6 @@ void mddev_suspend(struct mddev *mddev) mddev->pers->prepare_suspend(mddev); wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); - mddev->pers->quiesce(mddev, 1); clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); @@ -465,14 +464,15 @@ EXPORT_SYMBOL_GPL(mddev_suspend); void mddev_resume(struct mddev *mddev) { - /* entred the memalloc scope from mddev_suspend() */ - memalloc_noio_restore(mddev->noio_flag); lockdep_assert_held(&mddev->reconfig_mutex); if (--mddev->suspended) return; + + /* entred the memalloc scope from mddev_suspend() */ + memalloc_noio_restore(mddev->noio_flag); + percpu_ref_resurrect(&mddev->active_io); wake_up(&mddev->sb_wait); - mddev->pers->quiesce(mddev, 0); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -643,6 +643,7 @@ void mddev_init(struct mddev *mddev) { mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); + mutex_init(&mddev->sync_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); @@ -650,6 +651,7 @@ void mddev_init(struct mddev *mddev) timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); + atomic_set(&mddev->sync_seq, 0); spin_lock_init(&mddev->lock); atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); @@ -2304,7 +2306,7 @@ int md_integrity_register(struct mddev *mddev) pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || (mddev->level != 1 && mddev->level != 10 && - bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) { + bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { /* * No need to handle the failure of bioset_integrity_create, * because the function is called by md_run() -> pers->run(), @@ -4747,6 +4749,62 @@ action_show(struct mddev *mddev, char *page) return sprintf(page, "%s\n", type); } +static void stop_sync_thread(struct mddev *mddev) +{ + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return; + + if (mddev_lock(mddev)) + return; + + /* + * Check again in case MD_RECOVERY_RUNNING is cleared before lock is + * held. + */ + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + mddev_unlock(mddev); + return; + } + + if (work_pending(&mddev->del_work)) + flush_workqueue(md_misc_wq); + + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + /* + * Thread might be blocked waiting for metadata update which will now + * never happen + */ + md_wakeup_thread_directly(mddev->sync_thread); + + mddev_unlock(mddev); +} + +static void idle_sync_thread(struct mddev *mddev) +{ + int sync_seq = atomic_read(&mddev->sync_seq); + + mutex_lock(&mddev->sync_mutex); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev); + + wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) || + !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + + mutex_unlock(&mddev->sync_mutex); +} + +static void frozen_sync_thread(struct mddev *mddev) +{ + mutex_lock(&mddev->sync_mutex); + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev); + + wait_event(resync_wait, mddev->sync_thread == NULL && + !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + + mutex_unlock(&mddev->sync_mutex); +} + static ssize_t action_store(struct mddev *mddev, const char *page, size_t len) { @@ -4754,35 +4812,11 @@ action_store(struct mddev *mddev, const char *page, size_t len) return -EINVAL; - if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { - if (cmd_match(page, "frozen")) - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - else - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && - mddev_lock(mddev) == 0) { - if (work_pending(&mddev->del_work)) - flush_workqueue(md_misc_wq); - if (mddev->sync_thread) { - sector_t save_rp = mddev->reshape_position; - - mddev_unlock(mddev); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(&mddev->sync_thread); - mddev_lock_nointr(mddev); - /* - * set RECOVERY_INTR again and restore reshape - * position in case others changed them after - * got lock, eg, reshape_position_store and - * md_check_recovery. - */ - mddev->reshape_position = save_rp; - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); - } - mddev_unlock(mddev); - } - } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + if (cmd_match(page, "idle")) + idle_sync_thread(mddev); + else if (cmd_match(page, "frozen")) + frozen_sync_thread(mddev); + else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; else if (cmd_match(page, "resync")) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); @@ -5842,6 +5876,13 @@ int md_run(struct mddev *mddev) goto exit_bio_set; } + if (!bioset_initialized(&mddev->io_clone_set)) { + err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, + offsetof(struct md_io_clone, bio_clone), 0); + if (err) + goto exit_sync_set; + } + spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); if (!pers || !try_module_get(pers->owner)) { @@ -6019,6 +6060,8 @@ bitmap_abort: module_put(pers->owner); md_bitmap_destroy(mddev); abort: + bioset_exit(&mddev->io_clone_set); +exit_sync_set: bioset_exit(&mddev->sync_set); exit_bio_set: bioset_exit(&mddev->bio_set); @@ -6176,7 +6219,6 @@ static void __md_stop_writes(struct mddev *mddev) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); } @@ -6216,7 +6258,7 @@ static void mddev_detach(struct mddev *mddev) mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } - md_unregister_thread(&mddev->thread); + md_unregister_thread(mddev, &mddev->thread); if (mddev->queue) blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ } @@ -6243,6 +6285,7 @@ static void __md_stop(struct mddev *mddev) percpu_ref_exit(&mddev->active_io); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); + bioset_exit(&mddev->io_clone_set); } void md_stop(struct mddev *mddev) @@ -7012,6 +7055,15 @@ static int set_bitmap_file(struct mddev *mddev, int fd) if (mddev->bitmap || mddev->bitmap_info.file) return -EEXIST; /* cannot add when bitmap is present */ + + if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) { + pr_warn("%s: bitmap files not supported by this kernel\n", + mdname(mddev)); + return -EINVAL; + } + pr_warn("%s: using deprecated bitmap file support\n", + mdname(mddev)); + f = fget(fd); if (f == NULL) { @@ -7940,9 +7992,10 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *), } EXPORT_SYMBOL(md_register_thread); -void md_unregister_thread(struct md_thread __rcu **threadp) +void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) { - struct md_thread *thread = rcu_dereference_protected(*threadp, true); + struct md_thread *thread = rcu_dereference_protected(*threadp, + lockdep_is_held(&mddev->reconfig_mutex)); if (!thread) return; @@ -8601,62 +8654,44 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, } EXPORT_SYMBOL_GPL(md_submit_discard_bio); -int acct_bioset_init(struct mddev *mddev) +static void md_end_clone_io(struct bio *bio) { - int err = 0; - - if (!bioset_initialized(&mddev->io_acct_set)) - err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE, - offsetof(struct md_io_acct, bio_clone), 0); - return err; -} -EXPORT_SYMBOL_GPL(acct_bioset_init); - -void acct_bioset_exit(struct mddev *mddev) -{ - bioset_exit(&mddev->io_acct_set); -} -EXPORT_SYMBOL_GPL(acct_bioset_exit); - -static void md_end_io_acct(struct bio *bio) -{ - struct md_io_acct *md_io_acct = bio->bi_private; - struct bio *orig_bio = md_io_acct->orig_bio; - struct mddev *mddev = md_io_acct->mddev; + struct md_io_clone *md_io_clone = bio->bi_private; + struct bio *orig_bio = md_io_clone->orig_bio; + struct mddev *mddev = md_io_clone->mddev; orig_bio->bi_status = bio->bi_status; - bio_end_io_acct(orig_bio, md_io_acct->start_time); + if (md_io_clone->start_time) + bio_end_io_acct(orig_bio, md_io_clone->start_time); + bio_put(bio); bio_endio(orig_bio); - percpu_ref_put(&mddev->active_io); } -/* - * Used by personalities that don't already clone the bio and thus can't - * easily add the timestamp to their extended bio structure. - */ -void md_account_bio(struct mddev *mddev, struct bio **bio) +static void md_clone_bio(struct mddev *mddev, struct bio **bio) { struct block_device *bdev = (*bio)->bi_bdev; - struct md_io_acct *md_io_acct; - struct bio *clone; - - if (!blk_queue_io_stat(bdev->bd_disk->queue)) - return; + struct md_io_clone *md_io_clone; + struct bio *clone = + bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); + + md_io_clone = container_of(clone, struct md_io_clone, bio_clone); + md_io_clone->orig_bio = *bio; + md_io_clone->mddev = mddev; + if (blk_queue_io_stat(bdev->bd_disk->queue)) + md_io_clone->start_time = bio_start_io_acct(*bio); + + clone->bi_end_io = md_end_clone_io; + clone->bi_private = md_io_clone; + *bio = clone; +} +void md_account_bio(struct mddev *mddev, struct bio **bio) +{ percpu_ref_get(&mddev->active_io); - - clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set); - md_io_acct = container_of(clone, struct md_io_acct, bio_clone); - md_io_acct->orig_bio = *bio; - md_io_acct->start_time = bio_start_io_acct(*bio); - md_io_acct->mddev = mddev; - - clone->bi_end_io = md_end_io_acct; - clone->bi_private = md_io_acct; - *bio = clone; + md_clone_bio(mddev, bio); } EXPORT_SYMBOL_GPL(md_account_bio); @@ -9329,7 +9364,6 @@ void md_check_recovery(struct mddev *mddev) * ->spare_active and clear saved_raid_disk */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -9358,17 +9392,24 @@ void md_check_recovery(struct mddev *mddev) if (mddev->sb_flags) md_update_sb(mddev, 0); - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && - !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { - /* resync/recovery still happening */ - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - goto unlock; - } - if (mddev->sync_thread) { - md_unregister_thread(&mddev->sync_thread); + /* + * Never start a new sync thread if MD_RECOVERY_RUNNING is + * still set. + */ + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { + /* resync/recovery still happening */ + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + goto unlock; + } + + if (WARN_ON_ONCE(!mddev->sync_thread)) + goto unlock; + md_reap_sync_thread(mddev); goto unlock; } + /* Set RUNNING before clearing NEEDED to avoid * any transients in the value of "sync_action". */ @@ -9445,7 +9486,10 @@ void md_reap_sync_thread(struct mddev *mddev) sector_t old_dev_sectors = mddev->dev_sectors; bool is_reshaped = false; - /* sync_thread should be unregistered, collect result */ + /* resync has finished, collect result */ + md_unregister_thread(mddev, &mddev->sync_thread); + atomic_inc(&mddev->sync_seq); + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && mddev->degraded != mddev->raid_disks) { @@ -9490,7 +9534,6 @@ void md_reap_sync_thread(struct mddev *mddev) if (mddev_is_clustered(mddev) && is_reshaped && !test_bit(MD_CLOSING, &mddev->flags)) md_cluster_ops->update_size(mddev, old_dev_sectors); - wake_up(&resync_wait); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_completed); @@ -9498,6 +9541,7 @@ void md_reap_sync_thread(struct mddev *mddev) md_new_event(); if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); + wake_up(&resync_wait); } EXPORT_SYMBOL(md_reap_sync_thread); diff --git a/drivers/md/md.h b/drivers/md/md.h index 1aef86bf3fc3..9bcb77bca963 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -510,7 +510,7 @@ struct mddev { struct bio_set sync_set; /* for sync operations like * metadata and bitmap writes */ - struct bio_set io_acct_set; /* for raid0 and raid5 io accounting */ + struct bio_set io_clone_set; /* Generic flush handling. * The last to finish preflush schedules a worker to submit @@ -535,6 +535,11 @@ struct mddev { */ struct list_head deleting; + /* Used to synchronize idle and frozen for action_store() */ + struct mutex sync_mutex; + /* The sequence number for sync thread */ + atomic_t sync_seq; + bool has_superblocks:1; bool fail_last_dev:1; bool serialize_policy:1; @@ -731,7 +736,7 @@ struct md_thread { void *private; }; -struct md_io_acct { +struct md_io_clone { struct mddev *mddev; struct bio *orig_bio; unsigned long start_time; @@ -756,7 +761,7 @@ extern struct md_thread *md_register_thread( void (*run)(struct md_thread *thread), struct mddev *mddev, const char *name); -extern void md_unregister_thread(struct md_thread __rcu **threadp); +extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp); extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); @@ -769,8 +774,6 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio, sector_t start, sector_t size); -int acct_bioset_init(struct mddev *mddev); -void acct_bioset_exit(struct mddev *mddev); void md_account_bio(struct mddev *mddev, struct bio **bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index d1ac73fcd852..c50a7abda744 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -377,7 +377,6 @@ static void raid0_free(struct mddev *mddev, void *priv) struct r0conf *conf = priv; free_conf(mddev, conf); - acct_bioset_exit(mddev); } static int raid0_run(struct mddev *mddev) @@ -392,16 +391,11 @@ static int raid0_run(struct mddev *mddev) if (md_check_no_bitmap(mddev)) return -EINVAL; - if (acct_bioset_init(mddev)) { - pr_err("md/raid0:%s: alloc acct bioset failed.\n", mdname(mddev)); - return -ENOMEM; - } - /* if private is not null, we are here after takeover */ if (mddev->private == NULL) { ret = create_strip_zones(mddev, &conf); if (ret < 0) - goto exit_acct_set; + return ret; mddev->private = conf; } conf = mddev->private; @@ -432,15 +426,9 @@ static int raid0_run(struct mddev *mddev) ret = md_integrity_register(mddev); if (ret) - goto free; + free_conf(mddev, conf); return ret; - -free: - free_conf(mddev, conf); -exit_acct_set: - acct_bioset_exit(mddev); - return ret; } /* @@ -557,54 +545,20 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) bio_endio(bio); } -static bool raid0_make_request(struct mddev *mddev, struct bio *bio) +static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio) { struct r0conf *conf = mddev->private; struct strip_zone *zone; struct md_rdev *tmp_dev; - sector_t bio_sector; - sector_t sector; - sector_t orig_sector; - unsigned chunk_sects; - unsigned sectors; - - if (unlikely(bio->bi_opf & REQ_PREFLUSH) - && md_flush_request(mddev, bio)) - return true; - - if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) { - raid0_handle_discard(mddev, bio); - return true; - } - - bio_sector = bio->bi_iter.bi_sector; - sector = bio_sector; - chunk_sects = mddev->chunk_sectors; - - sectors = chunk_sects - - (likely(is_power_of_2(chunk_sects)) - ? (sector & (chunk_sects-1)) - : sector_div(sector, chunk_sects)); - - /* Restore due to sector_div */ - sector = bio_sector; - - if (sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, sectors, GFP_NOIO, - &mddev->bio_set); - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; - } + sector_t bio_sector = bio->bi_iter.bi_sector; + sector_t sector = bio_sector; - if (bio->bi_pool != &mddev->bio_set) - md_account_bio(mddev, &bio); + md_account_bio(mddev, &bio); - orig_sector = sector; zone = find_zone(mddev->private, §or); switch (conf->layout) { case RAID0_ORIG_LAYOUT: - tmp_dev = map_sector(mddev, zone, orig_sector, §or); + tmp_dev = map_sector(mddev, zone, bio_sector, §or); break; case RAID0_ALT_MULTIZONE_LAYOUT: tmp_dev = map_sector(mddev, zone, sector, §or); @@ -612,13 +566,13 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) default: WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev)); bio_io_error(bio); - return true; + return; } if (unlikely(is_rdev_broken(tmp_dev))) { bio_io_error(bio); md_error(mddev, tmp_dev); - return true; + return; } bio_set_dev(bio, tmp_dev->bdev); @@ -630,6 +584,40 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) bio_sector); mddev_check_write_zeroes(mddev, bio); submit_bio_noacct(bio); +} + +static bool raid0_make_request(struct mddev *mddev, struct bio *bio) +{ + sector_t sector; + unsigned chunk_sects; + unsigned sectors; + + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) + return true; + + if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) { + raid0_handle_discard(mddev, bio); + return true; + } + + sector = bio->bi_iter.bi_sector; + chunk_sects = mddev->chunk_sectors; + + sectors = chunk_sects - + (likely(is_power_of_2(chunk_sects)) + ? (sector & (chunk_sects-1)) + : sector_div(sector, chunk_sects)); + + if (sectors < bio_sectors(bio)) { + struct bio *split = bio_split(bio, sectors, GFP_NOIO, + &mddev->bio_set); + bio_chain(split, bio); + raid0_map_submit_bio(mddev, bio); + bio = split; + } + + raid0_map_submit_bio(mddev, bio); return true; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index dd25832eb045..4b30a1742162 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -304,8 +304,6 @@ static void call_bio_endio(struct r1bio *r1_bio) if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) bio->bi_status = BLK_STS_IOERR; - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - bio_end_io_acct(bio, r1_bio->start_time); bio_endio(bio); } @@ -313,6 +311,7 @@ static void raid_end_bio_io(struct r1bio *r1_bio) { struct bio *bio = r1_bio->master_bio; struct r1conf *conf = r1_bio->mddev->private; + sector_t sector = r1_bio->sector; /* if nobody has done the final endio yet, do it now */ if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { @@ -323,13 +322,13 @@ static void raid_end_bio_io(struct r1bio *r1_bio) call_bio_endio(r1_bio); } + + free_r1bio(r1_bio); /* * Wake up any possible resync thread that waits for the device * to go idle. All I/Os, even write-behind writes, are done. */ - allow_barrier(conf, r1_bio->sector); - - free_r1bio(r1_bio); + allow_barrier(conf, sector); } /* @@ -791,11 +790,17 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect return best_disk; } +static void wake_up_barrier(struct r1conf *conf) +{ + if (wq_has_sleeper(&conf->wait_barrier)) + wake_up(&conf->wait_barrier); +} + static void flush_bio_list(struct r1conf *conf, struct bio *bio) { /* flush any pending bitmap writes to disk before proceeding w/ I/O */ raid1_prepare_flush_writes(conf->mddev->bitmap); - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; @@ -972,7 +977,7 @@ static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait) * In case freeze_array() is waiting for * get_unqueued_pending() == extra */ - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); /* Wait for the barrier in same barrier unit bucket to drop. */ /* Return false when nowait flag is set */ @@ -1015,7 +1020,7 @@ static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowa * In case freeze_array() is waiting for * get_unqueued_pending() == extra */ - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); /* Wait for array to be unfrozen */ /* Return false when nowait flag is set */ @@ -1044,7 +1049,7 @@ static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait) static void _allow_barrier(struct r1conf *conf, int idx) { atomic_dec(&conf->nr_pending[idx]); - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); } static void allow_barrier(struct r1conf *conf, sector_t sector_nr) @@ -1173,7 +1178,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); md_wakeup_thread(mddev->thread); kfree(plug); return; @@ -1303,10 +1308,10 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, } r1_bio->read_disk = rdisk; - - if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - r1_bio->start_time = bio_start_io_acct(bio); - + if (!r1bio_existed) { + md_account_bio(mddev, &bio); + r1_bio->master_bio = bio; + } read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp, &mddev->bio_set); @@ -1373,6 +1378,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, return; } + retry_write: r1_bio = alloc_r1bio(mddev, bio); r1_bio->sectors = max_write_sectors; @@ -1388,7 +1394,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, */ disks = conf->raid_disks * 2; - retry_write: blocked_rdev = NULL; rcu_read_lock(); max_sectors = r1_bio->sectors; @@ -1468,7 +1473,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (j = 0; j < i; j++) if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); - r1_bio->state = 0; + free_r1bio(r1_bio); allow_barrier(conf, bio->bi_iter.bi_sector); if (bio->bi_opf & REQ_NOWAIT) { @@ -1500,8 +1505,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio->sectors = max_sectors; } - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - r1_bio->start_time = bio_start_io_acct(bio); + md_account_bio(mddev, &bio); + r1_bio->master_bio = bio; atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0); @@ -1518,8 +1523,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, * Not if there are too many, or cannot * allocate memory, or a reader on WriteMostly * is waiting for behind writes to flush */ - if (bitmap && - test_bit(WriteMostly, &rdev->flags) && + if (bitmap && write_behind && (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && !waitqueue_active(&bitmap->behind_wait)) { @@ -1576,7 +1580,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio_write_done(r1_bio); /* In case raid1d snuck in to freeze_array */ - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); } static bool raid1_make_request(struct mddev *mddev, struct bio *bio) @@ -1766,7 +1770,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r1conf *conf = mddev->private; int err = -EEXIST; - int mirror = 0; + int mirror = 0, repl_slot = -1; struct raid1_info *p; int first = 0; int last = conf->raid_disks - 1; @@ -1809,17 +1813,21 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } if (test_bit(WantReplacement, &p->rdev->flags) && - p[conf->raid_disks].rdev == NULL) { - /* Add this device as a replacement */ - clear_bit(In_sync, &rdev->flags); - set_bit(Replacement, &rdev->flags); - rdev->raid_disk = mirror; - err = 0; - conf->fullsync = 1; - rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); - break; - } + p[conf->raid_disks].rdev == NULL && repl_slot < 0) + repl_slot = mirror; } + + if (err && repl_slot >= 0) { + /* Add this device as a replacement */ + p = conf->mirrors + repl_slot; + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = repl_slot; + err = 0; + conf->fullsync = 1; + rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); + } + print_conf(conf); return err; } @@ -1829,6 +1837,10 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct r1conf *conf = mddev->private; int err = 0; int number = rdev->raid_disk; + + if (unlikely(number >= conf->raid_disks)) + goto abort; + struct raid1_info *p = conf->mirrors + number; if (rdev != p->rdev) @@ -2299,7 +2311,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, d++; if (d == conf->raid_disks * 2) d = 0; - } while (!success && d != read_disk); + } while (d != read_disk); if (!success) { /* Cannot read from anywhere - mark it bad */ @@ -2498,6 +2510,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) struct mddev *mddev = conf->mddev; struct bio *bio; struct md_rdev *rdev; + sector_t sector; clear_bit(R1BIO_ReadError, &r1_bio->state); /* we got a read error. Maybe the drive is bad. Maybe just @@ -2527,12 +2540,13 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) } rdev_dec_pending(rdev, conf->mddev); - allow_barrier(conf, r1_bio->sector); + sector = r1_bio->sector; bio = r1_bio->master_bio; /* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */ r1_bio->state = 0; raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio); + allow_barrier(conf, sector); } static void raid1d(struct md_thread *thread) @@ -3144,7 +3158,7 @@ static int raid1_run(struct mddev *mddev) * RAID1 needs at least one disk in active */ if (conf->raid_disks - mddev->degraded < 1) { - md_unregister_thread(&conf->thread); + md_unregister_thread(mddev, &conf->thread); ret = -EINVAL; goto abort; } @@ -3171,7 +3185,7 @@ static int raid1_run(struct mddev *mddev) ret = md_integrity_register(mddev); if (ret) { - md_unregister_thread(&mddev->thread); + md_unregister_thread(mddev, &mddev->thread); goto abort; } return 0; diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 468f189da7a0..14d4211a123a 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -157,7 +157,6 @@ struct r1bio { sector_t sector; int sectors; unsigned long state; - unsigned long start_time; struct mddev *mddev; /* * original bio going to /dev/mdx diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5051149e27bb..023413120851 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -325,8 +325,6 @@ static void raid_end_bio_io(struct r10bio *r10_bio) if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) bio->bi_status = BLK_STS_IOERR; - if (r10_bio->start_time) - bio_end_io_acct(bio, r10_bio->start_time); bio_endio(bio); /* * Wake up any possible resync thread that waits for the device @@ -1172,7 +1170,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf, } static void raid10_read_request(struct mddev *mddev, struct bio *bio, - struct r10bio *r10_bio) + struct r10bio *r10_bio, bool io_accounting) { struct r10conf *conf = mddev->private; struct bio *read_bio; @@ -1243,9 +1241,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, } slot = r10_bio->read_slot; - if (!r10_bio->start_time && - blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - r10_bio->start_time = bio_start_io_acct(bio); + if (io_accounting) { + md_account_bio(mddev, &bio); + r10_bio->master_bio = bio; + } read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set); r10_bio->devs[slot].bio = read_bio; @@ -1322,6 +1321,25 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, } } +static struct md_rdev *dereference_rdev_and_rrdev(struct raid10_info *mirror, + struct md_rdev **prrdev) +{ + struct md_rdev *rdev, *rrdev; + + rrdev = rcu_dereference(mirror->replacement); + /* + * Read replacement first to prevent reading both rdev and + * replacement as NULL during replacement replace rdev. + */ + smp_mb(); + rdev = rcu_dereference(mirror->rdev); + if (rdev == rrdev) + rrdev = NULL; + + *prrdev = rrdev; + return rdev; +} + static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) { int i; @@ -1332,11 +1350,9 @@ retry_wait: blocked_rdev = NULL; rcu_read_lock(); for (i = 0; i < conf->copies; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[i].replacement); - if (rdev == rrdev) - rrdev = NULL; + struct md_rdev *rdev, *rrdev; + + rdev = dereference_rdev_and_rrdev(&conf->mirrors[i], &rrdev); if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { atomic_inc(&rdev->nr_pending); blocked_rdev = rdev; @@ -1465,15 +1481,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, int d = r10_bio->devs[i].devnum; struct md_rdev *rdev, *rrdev; - rrdev = rcu_dereference(conf->mirrors[d].replacement); - /* - * Read replacement first to prevent reading both rdev and - * replacement as NULL during replacement replace rdev. - */ - smp_mb(); - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev == rrdev) - rrdev = NULL; + rdev = dereference_rdev_and_rrdev(&conf->mirrors[d], &rrdev); if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) @@ -1543,8 +1551,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->master_bio = bio; } - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - r10_bio->start_time = bio_start_io_acct(bio); + md_account_bio(mddev, &bio); + r10_bio->master_bio = bio; atomic_set(&r10_bio->remaining, 1); md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); @@ -1571,12 +1579,11 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; r10_bio->read_slot = -1; - r10_bio->start_time = 0; memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks); if (bio_data_dir(bio) == READ) - raid10_read_request(mddev, bio, r10_bio); + raid10_read_request(mddev, bio, r10_bio, true); else raid10_write_request(mddev, bio, r10_bio); } @@ -1780,10 +1787,9 @@ retry_discard: */ rcu_read_lock(); for (disk = 0; disk < geo->raid_disks; disk++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[disk].replacement); + struct md_rdev *rdev, *rrdev; + rdev = dereference_rdev_and_rrdev(&conf->mirrors[disk], &rrdev); r10_bio->devs[disk].bio = NULL; r10_bio->devs[disk].repl_bio = NULL; @@ -2720,10 +2726,10 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) { int sect = 0; /* Offset from r10_bio->sector */ - int sectors = r10_bio->sectors; + int sectors = r10_bio->sectors, slot = r10_bio->read_slot; struct md_rdev *rdev; int max_read_errors = atomic_read(&mddev->max_corr_read_errors); - int d = r10_bio->devs[r10_bio->read_slot].devnum; + int d = r10_bio->devs[slot].devnum; /* still own a reference to this rdev, so it cannot * have been cleared recently. @@ -2744,13 +2750,13 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 pr_notice("md/raid10:%s: %pg: Failing raid device\n", mdname(mddev), rdev->bdev); md_error(mddev, rdev); - r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; + r10_bio->devs[slot].bio = IO_BLOCKED; return; } while(sectors) { int s = sectors; - int sl = r10_bio->read_slot; + int sl = slot; int success = 0; int start; @@ -2785,7 +2791,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 sl++; if (sl == conf->copies) sl = 0; - } while (!success && sl != r10_bio->read_slot); + } while (sl != slot); rcu_read_unlock(); if (!success) { @@ -2793,16 +2799,16 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 * as bad on the first device to discourage future * reads. */ - int dn = r10_bio->devs[r10_bio->read_slot].devnum; + int dn = r10_bio->devs[slot].devnum; rdev = conf->mirrors[dn].rdev; if (!rdev_set_badblocks( rdev, - r10_bio->devs[r10_bio->read_slot].addr + r10_bio->devs[slot].addr + sect, s, 0)) { md_error(mddev, rdev); - r10_bio->devs[r10_bio->read_slot].bio + r10_bio->devs[slot].bio = IO_BLOCKED; } break; @@ -2811,7 +2817,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 start = sl; /* write it back and re-read */ rcu_read_lock(); - while (sl != r10_bio->read_slot) { + while (sl != slot) { if (sl==0) sl = conf->copies; sl--; @@ -2845,7 +2851,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 rcu_read_lock(); } sl = start; - while (sl != r10_bio->read_slot) { + while (sl != slot) { if (sl==0) sl = conf->copies; sl--; @@ -2985,7 +2991,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) rdev_dec_pending(rdev, mddev); r10_bio->state = 0; - raid10_read_request(mddev, r10_bio->master_bio, r10_bio); + raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false); /* * allow_barrier after re-submit to ensure no sync io * can be issued while regular io pending. @@ -4314,7 +4320,7 @@ static int raid10_run(struct mddev *mddev) return 0; out_free_conf: - md_unregister_thread(&mddev->thread); + md_unregister_thread(mddev, &mddev->thread); raid10_free_conf(conf); mddev->private = NULL; out: @@ -4411,7 +4417,6 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) rdev->new_raid_disk = rdev->raid_disk * 2; rdev->sectors = size; } - WRITE_ONCE(conf->barrier, 1); } return conf; diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 63e48b11b552..2e75e88d0802 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -123,7 +123,6 @@ struct r10bio { sector_t sector; /* virtual sector number */ int sectors; unsigned long state; - unsigned long start_time; struct mddev *mddev; /* * original bio going to /dev/mdx diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 47ba7d9e81e1..518b7cfa78b9 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1260,14 +1260,13 @@ static void r5l_log_flush_endio(struct bio *bio) if (bio->bi_status) md_error(log->rdev->mddev, log->rdev); + bio_uninit(bio); spin_lock_irqsave(&log->io_list_lock, flags); list_for_each_entry(io, &log->flushing_ios, log_sibling) r5l_io_run_stripes(io); list_splice_tail_init(&log->flushing_ios, &log->finished_ios); spin_unlock_irqrestore(&log->io_list_lock, flags); - - bio_uninit(bio); } /* @@ -3168,12 +3167,15 @@ void r5l_exit_log(struct r5conf *conf) { struct r5l_log *log = conf->log; - /* Ensure disable_writeback_work wakes up and exits */ - wake_up(&conf->mddev->sb_wait); - flush_work(&log->disable_writeback_work); - md_unregister_thread(&log->reclaim_thread); + md_unregister_thread(conf->mddev, &log->reclaim_thread); + /* + * 'reconfig_mutex' is held by caller, set 'confg->log' to NULL to + * ensure disable_writeback_work wakes up and exits. + */ conf->log = NULL; + wake_up(&conf->mddev->sb_wait); + flush_work(&log->disable_writeback_work); mempool_exit(&log->meta_pool); bioset_exit(&log->bs); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 85b3004594e0..4cb9c608ee19 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5468,26 +5468,17 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf, */ static void raid5_align_endio(struct bio *bi) { - struct md_io_acct *md_io_acct = bi->bi_private; - struct bio *raid_bi = md_io_acct->orig_bio; - struct mddev *mddev; - struct r5conf *conf; - struct md_rdev *rdev; + struct bio *raid_bi = bi->bi_private; + struct md_rdev *rdev = (void *)raid_bi->bi_next; + struct mddev *mddev = rdev->mddev; + struct r5conf *conf = mddev->private; blk_status_t error = bi->bi_status; - unsigned long start_time = md_io_acct->start_time; bio_put(bi); - - rdev = (void*)raid_bi->bi_next; raid_bi->bi_next = NULL; - mddev = rdev->mddev; - conf = mddev->private; - rdev_dec_pending(rdev, conf->mddev); if (!error) { - if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue)) - bio_end_io_acct(raid_bi, start_time); bio_endio(raid_bi); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_quiescent); @@ -5506,7 +5497,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) struct md_rdev *rdev; sector_t sector, end_sector, first_bad; int bad_sectors, dd_idx; - struct md_io_acct *md_io_acct; bool did_inc; if (!in_chunk_boundary(mddev, raid_bio)) { @@ -5543,16 +5533,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) return 0; } - align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO, - &mddev->io_acct_set); - md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone); + md_account_bio(mddev, &raid_bio); raid_bio->bi_next = (void *)rdev; - if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue)) - md_io_acct->start_time = bio_start_io_acct(raid_bio); - md_io_acct->orig_bio = raid_bio; + align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO, + &mddev->bio_set); align_bio->bi_end_io = raid5_align_endio; - align_bio->bi_private = md_io_acct; + align_bio->bi_private = raid_bio; align_bio->bi_iter.bi_sector = sector; /* No reshape active, so we can trust rdev->data_offset */ @@ -7787,19 +7774,12 @@ static int raid5_run(struct mddev *mddev) struct md_rdev *rdev; struct md_rdev *journal_dev = NULL; sector_t reshape_offset = 0; - int i, ret = 0; + int i; long long min_offset_diff = 0; int first = 1; - if (acct_bioset_init(mddev)) { - pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev)); + if (mddev_init_writes_pending(mddev) < 0) return -ENOMEM; - } - - if (mddev_init_writes_pending(mddev) < 0) { - ret = -ENOMEM; - goto exit_acct_set; - } if (mddev->recovery_cp != MaxSector) pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", @@ -7830,8 +7810,7 @@ static int raid5_run(struct mddev *mddev) (mddev->bitmap_info.offset || mddev->bitmap_info.file)) { pr_notice("md/raid:%s: array cannot have both journal and bitmap\n", mdname(mddev)); - ret = -EINVAL; - goto exit_acct_set; + return -EINVAL; } if (mddev->reshape_position != MaxSector) { @@ -7856,15 +7835,13 @@ static int raid5_run(struct mddev *mddev) if (journal_dev) { pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", mdname(mddev)); - ret = -EINVAL; - goto exit_acct_set; + return -EINVAL; } if (mddev->new_level != mddev->level) { pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", mdname(mddev)); - ret = -EINVAL; - goto exit_acct_set; + return -EINVAL; } old_disks = mddev->raid_disks - mddev->delta_disks; /* reshape_position must be on a new-stripe boundary, and one @@ -7880,8 +7857,7 @@ static int raid5_run(struct mddev *mddev) if (sector_div(here_new, chunk_sectors * new_data_disks)) { pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", mdname(mddev)); - ret = -EINVAL; - goto exit_acct_set; + return -EINVAL; } reshape_offset = here_new * chunk_sectors; /* here_new is the stripe we will write to */ @@ -7903,8 +7879,7 @@ static int raid5_run(struct mddev *mddev) else if (mddev->ro == 0) { pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", mdname(mddev)); - ret = -EINVAL; - goto exit_acct_set; + return -EINVAL; } } else if (mddev->reshape_backwards ? (here_new * chunk_sectors + min_offset_diff <= @@ -7914,8 +7889,7 @@ static int raid5_run(struct mddev *mddev) /* Reading from the same stripe as writing to - bad */ pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", mdname(mddev)); - ret = -EINVAL; - goto exit_acct_set; + return -EINVAL; } pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); /* OK, we should be able to continue; */ @@ -7939,10 +7913,8 @@ static int raid5_run(struct mddev *mddev) else conf = mddev->private; - if (IS_ERR(conf)) { - ret = PTR_ERR(conf); - goto exit_acct_set; - } + if (IS_ERR(conf)) + return PTR_ERR(conf); if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { if (!journal_dev) { @@ -8135,15 +8107,12 @@ static int raid5_run(struct mddev *mddev) return 0; abort: - md_unregister_thread(&mddev->thread); + md_unregister_thread(mddev, &mddev->thread); print_raid5_conf(conf); free_conf(conf); mddev->private = NULL; pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); - ret = -EIO; -exit_acct_set: - acct_bioset_exit(mddev); - return ret; + return -EIO; } static void raid5_free(struct mddev *mddev, void *priv) @@ -8151,7 +8120,6 @@ static void raid5_free(struct mddev *mddev, void *priv) struct r5conf *conf = priv; free_conf(conf); - acct_bioset_exit(mddev); mddev->to_remove = &raid5_attrs_group; } diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index d39f3219358b..d8ff796fd5f2 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -118,7 +118,6 @@ static void *nvme_add_user_metadata(struct request *req, void __user *ubuf, goto out_free_meta; } - bip->bip_iter.bi_size = len; bip->bip_iter.bi_sector = seed; ret = bio_integrity_add_page(bio, virt_to_page(buf), len, offset_in_page(buf)); diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 2733e0158585..468833675cc9 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -206,12 +206,11 @@ static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio, return PTR_ERR(bip); } - bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); /* virtual start sector must be in integrity interval units */ bip_set_seed(bip, bio->bi_iter.bi_sector >> (bi->interval_exp - SECTOR_SHIFT)); - resid = bip->bip_iter.bi_size; + resid = bio_integrity_bytes(bi, bio_sectors(bio)); while (resid > 0 && sg_miter_next(miter)) { len = min_t(size_t, miter->length, resid); rc = bio_integrity_add_page(bio, miter->page, len, diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index ad9afae49544..59176946ab56 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -300,11 +300,6 @@ void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd) cmd->budget_token = -1; } -static void scsi_kick_queue(struct request_queue *q) -{ - blk_mq_run_hw_queues(q, false); -} - /* * Kick the queue of SCSI device @sdev if @sdev != current_sdev. Called with * interrupts disabled. @@ -340,7 +335,8 @@ static void scsi_single_lun_run(struct scsi_device *current_sdev) * but in most cases, we will be first. Ideally, each LU on the * target would get some limited time or requests on the target. */ - scsi_kick_queue(current_sdev->request_queue); + blk_mq_run_hw_queues(current_sdev->request_queue, + shost->queuecommand_may_block); spin_lock_irqsave(shost->host_lock, flags); if (!starget->starget_sdev_user) @@ -427,7 +423,7 @@ static void scsi_starved_list_run(struct Scsi_Host *shost) continue; spin_unlock_irqrestore(shost->host_lock, flags); - scsi_kick_queue(slq); + blk_mq_run_hw_queues(slq, false); blk_put_queue(slq); spin_lock_irqsave(shost->host_lock, flags); @@ -452,8 +448,8 @@ static void scsi_run_queue(struct request_queue *q) if (!list_empty(&sdev->host->starved_list)) scsi_starved_list_run(sdev->host); + /* Note: blk_mq_kick_requeue_list() runs the queue asynchronously. */ blk_mq_kick_requeue_list(q); - blk_mq_run_hw_queues(q, false); } void scsi_requeue_run_queue(struct work_struct *work) diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 3d1b511ea284..a7050f63b7cc 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -689,7 +689,6 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio, return PTR_ERR(bip); } - bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); /* virtual start sector must be in integrity interval units */ bip_set_seed(bip, bio->bi_iter.bi_sector >> (bi->interval_exp - SECTOR_SHIFT)); @@ -697,7 +696,7 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio, pr_debug("IBLOCK BIP Size: %u Sector: %llu\n", bip->bip_iter.bi_size, (unsigned long long)bip->bip_iter.bi_sector); - resid = bip->bip_iter.bi_size; + resid = bio_integrity_bytes(bi, bio_sectors(bio)); while (resid > 0 && sg_miter_next(miter)) { len = min_t(size_t, miter->length, resid); diff --git a/fs/Kconfig b/fs/Kconfig index 4f8bd14df0df..aa7e03cc1941 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -18,8 +18,12 @@ config VALIDATE_FS_PARSER config FS_IOMAP bool +config BUFFER_HEAD + bool + # old blockdev_direct_IO implementation. Use iomap for new code instead config LEGACY_DIRECT_IO + depends on BUFFER_HEAD bool if BLOCK diff --git a/fs/Makefile b/fs/Makefile index e513aaee0603..f9541f40be4e 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -17,7 +17,7 @@ obj-y := open.o read_write.o file_table.o super.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ kernel_read_file.o mnt_idmapping.o remap_range.o -obj-$(CONFIG_BLOCK) += buffer.o mpage.o +obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o obj-$(CONFIG_PROC_FS) += proc_namespace.o obj-$(CONFIG_LEGACY_DIRECT_IO) += direct-io.o obj-y += notify/ diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig index 44738fed6625..1b97058f0c4a 100644 --- a/fs/adfs/Kconfig +++ b/fs/adfs/Kconfig @@ -2,6 +2,7 @@ config ADFS_FS tristate "ADFS file system support" depends on BLOCK + select BUFFER_HEAD help The Acorn Disc Filing System is the standard file system of the RiscOS operating system which runs on Acorn's ARM-based Risc PC diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig index 962b86374e1c..1ae432d266c3 100644 --- a/fs/affs/Kconfig +++ b/fs/affs/Kconfig @@ -2,6 +2,7 @@ config AFFS_FS tristate "Amiga FFS file system support" depends on BLOCK + select BUFFER_HEAD select LEGACY_DIRECT_IO help The Fast File System (FFS) is the common file system used on hard diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig index 9550b6462b81..5fcfc4024ffe 100644 --- a/fs/befs/Kconfig +++ b/fs/befs/Kconfig @@ -2,6 +2,7 @@ config BEFS_FS tristate "BeOS file system (BeFS) support (read only)" depends on BLOCK + select BUFFER_HEAD select NLS help The BeOS File System (BeFS) is the native file system of Be, Inc's diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig index 3a757805b585..8e7ef866b62a 100644 --- a/fs/bfs/Kconfig +++ b/fs/bfs/Kconfig @@ -2,6 +2,7 @@ config BFS_FS tristate "BFS file system support" depends on BLOCK + select BUFFER_HEAD help Boot File System (BFS) is a file system used under SCO UnixWare to allow the bootloader access to the kernel image and other important diff --git a/fs/buffer.c b/fs/buffer.c index 0f17c36922e6..2379564e5aea 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -563,12 +563,6 @@ repeat: return err; } -void emergency_thaw_bdev(struct super_block *sb) -{ - while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) - printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev); -} - /** * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig index 2df1bac8b375..0833e533df9d 100644 --- a/fs/efs/Kconfig +++ b/fs/efs/Kconfig @@ -2,6 +2,7 @@ config EFS_FS tristate "EFS file system support (read only)" depends on BLOCK + select BUFFER_HEAD help EFS is an older file system used for non-ISO9660 CD-ROMs and hard disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer diff --git a/fs/exfat/Kconfig b/fs/exfat/Kconfig index 147edeb04469..cbeca8e44d9b 100644 --- a/fs/exfat/Kconfig +++ b/fs/exfat/Kconfig @@ -2,6 +2,7 @@ config EXFAT_FS tristate "exFAT filesystem support" + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 77393fda99af..74d98965902e 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config EXT2_FS tristate "Second extended fs support" + select BUFFER_HEAD select FS_IOMAP select LEGACY_DIRECT_IO help diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 86699c8cab28..e20d59221fc0 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -28,6 +28,7 @@ config EXT3_FS_SECURITY config EXT4_FS tristate "The Extended 4 (ext4) filesystem" + select BUFFER_HEAD select JBD2 select CRC16 select CRYPTO diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d3f581ced672..89737d5a1614 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -6138,7 +6138,7 @@ retry_alloc: if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_alloc; out_ret: - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); out: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 03ef087537c7..68a1e23e1557 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -2,6 +2,7 @@ config F2FS_FS tristate "F2FS filesystem support" depends on BLOCK + select BUFFER_HEAD select NLS select CRYPTO select CRYPTO_CRC32 diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 35886a52edfb..ce9d567cd5fe 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -159,7 +159,7 @@ out_sem: sb_end_pagefault(inode->i_sb); err: - return block_page_mkwrite_return(err); + return vmf_fs_error(err); } static const struct vm_operations_struct f2fs_file_vm_ops = { diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index afe83b4e7172..25fae1c83725 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config FAT_FS tristate + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig index 0e2fc08f7de4..912107ebea6f 100644 --- a/fs/freevxfs/Kconfig +++ b/fs/freevxfs/Kconfig @@ -2,6 +2,7 @@ config VXFS_FS tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)" depends on BLOCK + select BUFFER_HEAD help FreeVxFS is a file system driver that support the VERITAS VxFS(TM) file system format. VERITAS VxFS(TM) is the standard file system diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 03c966840422..be7f87a8e11a 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config GFS2_FS tristate "GFS2 file system support" + select BUFFER_HEAD select FS_POSIX_ACL select CRC32 select LIBCRC32C diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 766186c80682..34cd57c6b68d 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -432,7 +432,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); err = gfs2_glock_nq(&gh); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_uninit; } @@ -474,7 +474,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) err = gfs2_rindex_update(sdp); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_unlock; } @@ -482,12 +482,12 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) ap.target = data_blocks + ind_blocks; err = gfs2_quota_lock_check(ip, &ap); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_unlock; } err = gfs2_inplace_reserve(ip, &ap); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_quota_unlock; } @@ -500,7 +500,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) } err = gfs2_trans_begin(sdp, rblocks, 0); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_trans_fail; } @@ -508,7 +508,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) if (gfs2_is_stuffed(ip)) { err = gfs2_unstuff_dinode(ip); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_trans_end; } } @@ -524,7 +524,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) err = gfs2_allocate_page_backing(page, length); if (err) - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); out_page_locked: if (ret != VM_FAULT_LOCKED) @@ -558,7 +558,7 @@ static vm_fault_t gfs2_fault(struct vm_fault *vmf) gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); err = gfs2_glock_nq(&gh); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_uninit; } ret = filemap_fault(vmf); diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig index d985066006d5..5ea5cd8ecea9 100644 --- a/fs/hfs/Kconfig +++ b/fs/hfs/Kconfig @@ -2,6 +2,7 @@ config HFS_FS tristate "Apple Macintosh file system support" depends on BLOCK + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig index 8034e7827a69..8ce4a33a9ac7 100644 --- a/fs/hfsplus/Kconfig +++ b/fs/hfsplus/Kconfig @@ -2,6 +2,7 @@ config HFSPLUS_FS tristate "Apple Extended HFS file system support" depends on BLOCK + select BUFFER_HEAD select NLS select NLS_UTF8 select LEGACY_DIRECT_IO diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig index ec975f466877..ac1e9318e65a 100644 --- a/fs/hpfs/Kconfig +++ b/fs/hpfs/Kconfig @@ -2,6 +2,7 @@ config HPFS_FS tristate "OS/2 HPFS file system support" depends on BLOCK + select BUFFER_HEAD select FS_IOMAP help OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS diff --git a/fs/internal.h b/fs/internal.h index 74d3b161dd2c..d64ae03998cc 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -23,16 +23,10 @@ struct mnt_idmap; */ #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); - -void emergency_thaw_bdev(struct super_block *sb); #else static inline void bdev_cache_init(void) { } -static inline int emergency_thaw_bdev(struct super_block *sb) -{ - return 0; -} #endif /* CONFIG_BLOCK */ /* diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 283fb96f6609..ae8673ce08b1 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1436,7 +1436,7 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) return VM_FAULT_LOCKED; out_unlock: folio_unlock(folio); - return block_page_mkwrite_return(ret); + return vmf_fs_error(ret); } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig index 08ffd37b9bb8..51434f2a471b 100644 --- a/fs/isofs/Kconfig +++ b/fs/isofs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config ISO9660_FS tristate "ISO 9660 CDROM file system support" + select BUFFER_HEAD help This is the standard file system used on CD-ROMs. It was previously known as "High Sierra File System" and is called "hsfs" on other diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig index 51e856f0e4b8..17488440eef1 100644 --- a/fs/jfs/Kconfig +++ b/fs/jfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config JFS_FS tristate "JFS filesystem support" + select BUFFER_HEAD select NLS select CRC32 select LEGACY_DIRECT_IO diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig index de2003974ff0..90ddfad2a75e 100644 --- a/fs/minix/Kconfig +++ b/fs/minix/Kconfig @@ -2,6 +2,7 @@ config MINIX_FS tristate "Minix file system support" depends on BLOCK + select BUFFER_HEAD help Minix is a simple operating system used in many classes about OS's. The minix file system (method to organize files on a hard disk diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 7d59567465e1..7dae168e346e 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NILFS2_FS tristate "NILFS2 file system support" + select BUFFER_HEAD select CRC32 select LEGACY_DIRECT_IO help diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index a9eb3487efb2..740ce26d1e76 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -108,7 +108,7 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf) wait_for_stable_page(page); out: sb_end_pagefault(inode->i_sb); - return block_page_mkwrite_return(ret); + return vmf_fs_error(ret); } static const struct vm_operations_struct nilfs_file_vm_ops = { diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig index f93e69a61283..7b2509741735 100644 --- a/fs/ntfs/Kconfig +++ b/fs/ntfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NTFS_FS tristate "NTFS file system support" + select BUFFER_HEAD select NLS help NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003. diff --git a/fs/ntfs3/Kconfig b/fs/ntfs3/Kconfig index 96cc236f7f7b..cdfdf51e55d7 100644 --- a/fs/ntfs3/Kconfig +++ b/fs/ntfs3/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NTFS3_FS tristate "NTFS Read-Write file system support" + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 3123da7cfb30..2514d36cbe01 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -2,6 +2,7 @@ config OCFS2_FS tristate "OCFS2 file system support" depends on INET && SYSFS && CONFIGFS_FS + select BUFFER_HEAD select JBD2 select CRC32 select QUOTA diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig index 42b2ec35a05b..8470f6c3e64e 100644 --- a/fs/omfs/Kconfig +++ b/fs/omfs/Kconfig @@ -2,6 +2,7 @@ config OMFS_FS tristate "SonicBlue Optimized MPEG File System support" depends on BLOCK + select BUFFER_HEAD select CRC_ITU_T help This is the proprietary file system used by the Rio Karma music diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig index 45b5b98376c4..a2eb826e76c6 100644 --- a/fs/qnx4/Kconfig +++ b/fs/qnx4/Kconfig @@ -2,6 +2,7 @@ config QNX4FS_FS tristate "QNX4 file system support (read only)" depends on BLOCK + select BUFFER_HEAD help This is the file system used by the real-time operating systems QNX 4 and QNX 6 (the latter is also called QNX RTP). diff --git a/fs/qnx6/Kconfig b/fs/qnx6/Kconfig index 6a9d6bce1586..8e865d72204e 100644 --- a/fs/qnx6/Kconfig +++ b/fs/qnx6/Kconfig @@ -2,6 +2,7 @@ config QNX6FS_FS tristate "QNX6 file system support (read only)" depends on BLOCK && CRC32 + select BUFFER_HEAD help This is the file system used by the real-time operating systems QNX 6 (also called QNX RTP). diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig index 4d22ecfe0fab..0e6fe26458fe 100644 --- a/fs/reiserfs/Kconfig +++ b/fs/reiserfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config REISERFS_FS tristate "Reiserfs support (deprecated)" + select BUFFER_HEAD select CRC32 select LEGACY_DIRECT_IO help diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig index 8eb87008b55a..f24a96a331af 100644 --- a/fs/romfs/Kconfig +++ b/fs/romfs/Kconfig @@ -57,6 +57,7 @@ endchoice config ROMFS_ON_BLOCK bool default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH + select BUFFER_HEAD config ROMFS_ON_MTD bool diff --git a/fs/super.c b/fs/super.c index 1db67a6e138c..bd8dcfc822c3 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1209,7 +1209,9 @@ static void do_thaw_all_callback(struct super_block *sb) bool born = super_lock_excl(sb); if (born && sb->s_root) { - emergency_thaw_bdev(sb); + if (IS_ENABLED(CONFIG_BLOCK)) + while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) + pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); } else { super_unlock_excl(sb); diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig index b4e23e03fbeb..67b3f90afbfd 100644 --- a/fs/sysv/Kconfig +++ b/fs/sysv/Kconfig @@ -2,6 +2,7 @@ config SYSV_FS tristate "System V/Xenix/V7/Coherent file system support" depends on BLOCK + select BUFFER_HEAD help SCO, Xenix and Coherent are commercial Unix systems for Intel machines, and Version 7 was used on the DEC PDP-11. Saying Y diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig index 82e8bfa2dfd9..8f7ce30d47fd 100644 --- a/fs/udf/Kconfig +++ b/fs/udf/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config UDF_FS tristate "UDF file system support" + select BUFFER_HEAD select CRC_ITU_T select NLS select LEGACY_DIRECT_IO diff --git a/fs/udf/file.c b/fs/udf/file.c index 0292d75e60cc..0ceac4b5937c 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -65,7 +65,7 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) err = __block_write_begin(page, 0, end, udf_get_block); if (err) { unlock_page(page); - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_unlock; } diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig index 6d30adb6b890..9301e7ecd092 100644 --- a/fs/ufs/Kconfig +++ b/fs/ufs/Kconfig @@ -2,6 +2,7 @@ config UFS_FS tristate "UFS file system support (read only)" depends on BLOCK + select BUFFER_HEAD help BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, OpenBSD and NeXTstep) use a file system called UFS. Some System V diff --git a/include/linux/bio.h b/include/linux/bio.h index e8767f165bad..41d417ee1349 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -493,7 +493,12 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); void guard_bio_eod(struct bio *bio); -void zero_fill_bio(struct bio *bio); +void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); + +static inline void zero_fill_bio(struct bio *bio) +{ + zero_fill_bio_iter(bio, bio->bi_iter); +} static inline void bio_release_pages(struct bio *bio, bool mark_dirty) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 495ca198775f..958ed7e89b30 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -178,14 +178,10 @@ struct request { struct { unsigned int seq; - struct list_head list; rq_end_io_fn *saved_end_io; } flush; - union { - struct __call_single_data csd; - u64 fifo_time; - }; + u64 fifo_time; /* * completion callback. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 83ce87354e9a..eef450f25982 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -538,6 +538,7 @@ struct request_queue { #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ #define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ +#define QUEUE_FLAG_HW_WC 18 /* Write back caching supported */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ #define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ @@ -846,6 +847,7 @@ extern const char *blk_op_str(enum req_op op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); +const char *blk_status_to_str(blk_status_t status); /* only poll the hardware once, don't continue until a completion was found */ #define BLK_POLL_ONESHOT (1 << 0) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 06566aee94ca..4ede47649a81 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -16,8 +16,6 @@ #include <linux/wait.h> #include <linux/atomic.h> -#ifdef CONFIG_BLOCK - enum bh_state_bits { BH_Uptodate, /* Contains valid data */ BH_Dirty, /* Is dirty */ @@ -196,7 +194,6 @@ void mark_buffer_write_io_error(struct buffer_head *bh); void touch_buffer(struct buffer_head *bh); void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset); -bool try_to_free_buffers(struct folio *); struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, bool retry); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, @@ -211,10 +208,6 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate); /* Things to do with buffers at mapping->private_list */ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); -int inode_has_buffers(struct inode *); -void invalidate_inode_buffers(struct inode *); -int remove_inode_buffers(struct inode *inode); -int sync_mapping_buffers(struct address_space *mapping); int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end, bool datasync); int generic_buffers_fsync(struct file *file, loff_t start, loff_t end, @@ -238,9 +231,6 @@ void __bforget(struct buffer_head *); void __breadahead(struct block_device *, sector_t block, unsigned int size); struct buffer_head *__bread_gfp(struct block_device *, sector_t block, unsigned size, gfp_t gfp); -void invalidate_bh_lrus(void); -void invalidate_bh_lrus_cpu(void); -bool has_bh_in_lru(int cpu, void *dummy); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); void unlock_buffer(struct buffer_head *bh); @@ -256,8 +246,6 @@ int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait); void __bh_read_batch(int nr, struct buffer_head *bhs[], blk_opf_t op_flags, bool force_lock); -extern int buffer_heads_over_limit; - /* * Generic address_space_operations implementations for buffer_head-backed * address_spaces. @@ -289,18 +277,6 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size); void block_commit_write(struct page *page, unsigned int from, unsigned int to); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); -/* Convert errno to return value from ->page_mkwrite() call */ -static inline vm_fault_t block_page_mkwrite_return(int err) -{ - if (err == 0) - return VM_FAULT_LOCKED; - if (err == -EFAULT || err == -EAGAIN) - return VM_FAULT_NOPAGE; - if (err == -ENOMEM) - return VM_FAULT_OOM; - /* -ENOSPC, -EDQUOT, -EIO ... */ - return VM_FAULT_SIGBUS; -} sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); int block_truncate_page(struct address_space *, loff_t, get_block_t *); @@ -314,8 +290,6 @@ extern int buffer_migrate_folio_norefs(struct address_space *, #define buffer_migrate_folio_norefs NULL #endif -void buffer_init(void); - /* * inline definitions */ @@ -475,7 +449,20 @@ __bread(struct block_device *bdev, sector_t block, unsigned size) bool block_dirty_folio(struct address_space *mapping, struct folio *folio); -#else /* CONFIG_BLOCK */ +#ifdef CONFIG_BUFFER_HEAD + +void buffer_init(void); +bool try_to_free_buffers(struct folio *folio); +int inode_has_buffers(struct inode *inode); +void invalidate_inode_buffers(struct inode *inode); +int remove_inode_buffers(struct inode *inode); +int sync_mapping_buffers(struct address_space *mapping); +void invalidate_bh_lrus(void); +void invalidate_bh_lrus_cpu(void); +bool has_bh_in_lru(int cpu, void *dummy); +extern int buffer_heads_over_limit; + +#else /* CONFIG_BUFFER_HEAD */ static inline void buffer_init(void) {} static inline bool try_to_free_buffers(struct folio *folio) { return true; } @@ -483,9 +470,10 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } +static inline void invalidate_bh_lrus(void) {} static inline void invalidate_bh_lrus_cpu(void) {} static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; } #define buffer_heads_over_limit 0 -#endif /* CONFIG_BLOCK */ +#endif /* CONFIG_BUFFER_HEAD */ #endif /* _LINUX_BUFFER_HEAD_H */ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index fdc6e64f49d6..96dd0acbba44 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -58,7 +58,11 @@ struct vm_fault; #define IOMAP_F_DIRTY (1U << 1) #define IOMAP_F_SHARED (1U << 2) #define IOMAP_F_MERGED (1U << 3) +#ifdef CONFIG_BUFFER_HEAD #define IOMAP_F_BUFFER_HEAD (1U << 4) +#else +#define IOMAP_F_BUFFER_HEAD 0 +#endif /* CONFIG_BUFFER_HEAD */ #define IOMAP_F_XATTR (1U << 5) /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 53efddc4d178..20e6d1dde412 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3475,6 +3475,24 @@ static inline vm_fault_t vmf_error(int err) return VM_FAULT_SIGBUS; } +/* + * Convert errno to return value for ->page_mkwrite() calls. + * + * This should eventually be merged with vmf_error() above, but will need a + * careful audit of all vmf_error() callers. + */ +static inline vm_fault_t vmf_fs_error(int err) +{ + if (err == 0) + return VM_FAULT_LOCKED; + if (err == -EFAULT || err == -EAGAIN) + return VM_FAULT_NOPAGE; + if (err == -ENOMEM) + return VM_FAULT_OOM; + /* -ENOSPC, -EDQUOT, -EIO ... */ + return VM_FAULT_SIGBUS; +} + struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags); diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index bbae1e52ab4f..2ac50822554e 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -25,6 +25,9 @@ bool opal_unlock_from_suspend(struct opal_dev *dev); struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv); int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr); +#define OPAL_AUTH_KEY "opal-boot-pin" +#define OPAL_AUTH_KEY_PREV "opal-boot-pin-prev" + static inline bool is_sed_ioctl(unsigned int cmd) { switch (cmd) { @@ -47,6 +50,8 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_GET_STATUS: case IOC_OPAL_GET_LR_STATUS: case IOC_OPAL_GET_GEOMETRY: + case IOC_OPAL_DISCOVERY: + case IOC_OPAL_REVERT_LSP: return true; } return false; diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 40e60c33cc6f..0e128ad51460 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -12,6 +12,7 @@ #define RWBS_LEN 8 +#ifdef CONFIG_BUFFER_HEAD DECLARE_EVENT_CLASS(block_buffer, TP_PROTO(struct buffer_head *bh), @@ -61,6 +62,7 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer, TP_ARGS(bh) ); +#endif /* CONFIG_BUFFER_HEAD */ /** * block_rq_requeue - place block IO request back on a queue diff --git a/include/trace/events/kyber.h b/include/trace/events/kyber.h index bf7533f171ff..9d44781efc1c 100644 --- a/include/trace/events/kyber.h +++ b/include/trace/events/kyber.h @@ -31,8 +31,8 @@ TRACE_EVENT(kyber_latency, TP_fast_assign( __entry->dev = dev; - strlcpy(__entry->domain, domain, sizeof(__entry->domain)); - strlcpy(__entry->type, type, sizeof(__entry->type)); + strscpy(__entry->domain, domain, sizeof(__entry->domain)); + strscpy(__entry->type, type, sizeof(__entry->type)); __entry->percentile = percentile; __entry->numerator = numerator; __entry->denominator = denominator; @@ -59,7 +59,7 @@ TRACE_EVENT(kyber_adjust, TP_fast_assign( __entry->dev = dev; - strlcpy(__entry->domain, domain, sizeof(__entry->domain)); + strscpy(__entry->domain, domain, sizeof(__entry->domain)); __entry->depth = depth; ), @@ -81,7 +81,7 @@ TRACE_EVENT(kyber_throttled, TP_fast_assign( __entry->dev = dev; - strlcpy(__entry->domain, domain, sizeof(__entry->domain)); + strscpy(__entry->domain, domain, sizeof(__entry->domain)); ), TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h index 9c66e59d859c..4661f0d27062 100644 --- a/include/trace/events/wbt.h +++ b/include/trace/events/wbt.h @@ -33,7 +33,7 @@ TRACE_EVENT(wbt_stat, ), TP_fast_assign( - strlcpy(__entry->name, bdi_dev_name(bdi), + strscpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->rmean = stat[0].mean; __entry->rmin = stat[0].min; @@ -68,7 +68,7 @@ TRACE_EVENT(wbt_lat, ), TP_fast_assign( - strlcpy(__entry->name, bdi_dev_name(bdi), + strscpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->lat = div_u64(lat, 1000); ), @@ -105,7 +105,7 @@ TRACE_EVENT(wbt_step, ), TP_fast_assign( - strlcpy(__entry->name, bdi_dev_name(bdi), + strscpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->msg = msg; __entry->step = step; @@ -141,7 +141,7 @@ TRACE_EVENT(wbt_timer, ), TP_fast_assign( - strlcpy(__entry->name, bdi_dev_name(bdi), + strscpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->status = status; __entry->step = step; diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index 99440b2e8c35..bee2bdb0eedb 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -107,20 +107,21 @@ enum { /* * Return an I/O priority value based on a class, a level and a hint. */ -static __always_inline __u16 ioprio_value(int class, int level, int hint) +static __always_inline __u16 ioprio_value(int prioclass, int priolevel, + int priohint) { - if (IOPRIO_BAD_VALUE(class, IOPRIO_NR_CLASSES) || - IOPRIO_BAD_VALUE(level, IOPRIO_NR_LEVELS) || - IOPRIO_BAD_VALUE(hint, IOPRIO_NR_HINTS)) + if (IOPRIO_BAD_VALUE(prioclass, IOPRIO_NR_CLASSES) || + IOPRIO_BAD_VALUE(priolevel, IOPRIO_NR_LEVELS) || + IOPRIO_BAD_VALUE(priohint, IOPRIO_NR_HINTS)) return IOPRIO_CLASS_INVALID << IOPRIO_CLASS_SHIFT; - return (class << IOPRIO_CLASS_SHIFT) | - (hint << IOPRIO_HINT_SHIFT) | level; + return (prioclass << IOPRIO_CLASS_SHIFT) | + (priohint << IOPRIO_HINT_SHIFT) | priolevel; } -#define IOPRIO_PRIO_VALUE(class, level) \ - ioprio_value(class, level, IOPRIO_HINT_NONE) -#define IOPRIO_PRIO_VALUE_HINT(class, level, hint) \ - ioprio_value(class, level, hint) +#define IOPRIO_PRIO_VALUE(prioclass, priolevel) \ + ioprio_value(prioclass, priolevel, IOPRIO_HINT_NONE) +#define IOPRIO_PRIO_VALUE_HINT(prioclass, priolevel, priohint) \ + ioprio_value(prioclass, priolevel, priohint) #endif /* _UAPI_LINUX_IOPRIO_H */ diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index dc2efd345133..d3994b7716bc 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -49,13 +49,23 @@ enum opal_lock_flags { OPAL_SAVE_FOR_LOCK = 0x01, }; +enum opal_key_type { + OPAL_INCLUDED = 0, /* key[] is the key */ + OPAL_KEYRING, /* key is in keyring */ +}; + struct opal_key { __u8 lr; __u8 key_len; - __u8 __align[6]; + __u8 key_type; + __u8 __align[5]; __u8 key[OPAL_KEY_MAX]; }; +enum opal_revert_lsp_opts { + OPAL_PRESERVE = 0x01, +}; + struct opal_lr_act { struct opal_key key; __u32 sum; @@ -173,6 +183,17 @@ struct opal_geometry { __u8 __align[3]; }; +struct opal_discovery { + __u64 data; + __u64 size; +}; + +struct opal_revert_lsp { + struct opal_key key; + __u32 options; + __u32 __pad; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) @@ -192,5 +213,7 @@ struct opal_geometry { #define IOC_OPAL_GET_STATUS _IOR('p', 236, struct opal_status) #define IOC_OPAL_GET_LR_STATUS _IOW('p', 237, struct opal_lr_status) #define IOC_OPAL_GET_GEOMETRY _IOR('p', 238, struct opal_geometry) +#define IOC_OPAL_DISCOVERY _IOW('p', 239, struct opal_discovery) +#define IOC_OPAL_REVERT_LSP _IOW('p', 240, struct opal_revert_lsp) #endif /* _UAPI_SED_OPAL_H */ diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 4b8558db90e1..b9cfc5c96268 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -176,6 +176,12 @@ /* Copy between request and user buffer by pread()/pwrite() */ #define UBLK_F_USER_COPY (1UL << 7) +/* + * User space sets this flag when setting up the device to request zoned storage support. Kernel may + * deny the request by returning an error. + */ +#define UBLK_F_ZONED (1ULL << 8) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 @@ -232,9 +238,27 @@ struct ublksrv_ctrl_dev_info { #define UBLK_IO_OP_READ 0 #define UBLK_IO_OP_WRITE 1 #define UBLK_IO_OP_FLUSH 2 -#define UBLK_IO_OP_DISCARD 3 -#define UBLK_IO_OP_WRITE_SAME 4 -#define UBLK_IO_OP_WRITE_ZEROES 5 +#define UBLK_IO_OP_DISCARD 3 +#define UBLK_IO_OP_WRITE_SAME 4 +#define UBLK_IO_OP_WRITE_ZEROES 5 +#define UBLK_IO_OP_ZONE_OPEN 10 +#define UBLK_IO_OP_ZONE_CLOSE 11 +#define UBLK_IO_OP_ZONE_FINISH 12 +#define UBLK_IO_OP_ZONE_APPEND 13 +#define UBLK_IO_OP_ZONE_RESET_ALL 14 +#define UBLK_IO_OP_ZONE_RESET 15 +/* + * Construct a zone report. The report request is carried in `struct + * ublksrv_io_desc`. The `start_sector` field must be the first sector of a zone + * and shall indicate the first zone of the report. The `nr_zones` shall + * indicate how many zones should be reported at most. The report shall be + * delivered as a `struct blk_zone` array. To report fewer zones than requested, + * zero the last entry of the returned array. + * + * Related definitions(blk_zone, blk_zone_cond, blk_zone_type, ...) in + * include/uapi/linux/blkzoned.h are part of ublk UAPI. + */ +#define UBLK_IO_OP_REPORT_ZONES 18 #define UBLK_IO_F_FAILFAST_DEV (1U << 8) #define UBLK_IO_F_FAILFAST_TRANSPORT (1U << 9) @@ -255,7 +279,10 @@ struct ublksrv_io_desc { /* op: bit 0-7, flags: bit 8-31 */ __u32 op_flags; - __u32 nr_sectors; + union { + __u32 nr_sectors; + __u32 nr_zones; /* for UBLK_IO_OP_REPORT_ZONES */ + }; /* start sector for this io */ __u64 start_sector; @@ -284,11 +311,21 @@ struct ublksrv_io_cmd { /* io result, it is valid for COMMIT* command only */ __s32 result; - /* - * userspace buffer address in ublksrv daemon process, valid for - * FETCH* command only - */ - __u64 addr; + union { + /* + * userspace buffer address in ublksrv daemon process, valid for + * FETCH* command only + * + * `addr` should not be used when UBLK_F_USER_COPY is enabled, + * because userspace handles data copy by pread()/pwrite() over + * /dev/ublkcN. But in case of UBLK_F_ZONED, this union is + * re-used to pass back the allocated LBA for + * UBLK_IO_OP_ZONE_APPEND which actually depends on + * UBLK_F_USER_COPY + */ + __u64 addr; + __u64 zone_append_lba; + }; }; struct ublk_param_basic { @@ -331,6 +368,13 @@ struct ublk_param_devt { __u32 disk_minor; }; +struct ublk_param_zoned { + __u32 max_open_zones; + __u32 max_active_zones; + __u32 max_zone_append_sectors; + __u8 reserved[20]; +}; + struct ublk_params { /* * Total length of parameters, userspace has to set 'len' for both @@ -342,11 +386,13 @@ struct ublk_params { #define UBLK_PARAM_TYPE_BASIC (1 << 0) #define UBLK_PARAM_TYPE_DISCARD (1 << 1) #define UBLK_PARAM_TYPE_DEVT (1 << 2) +#define UBLK_PARAM_TYPE_ZONED (1 << 3) __u32 types; /* types of parameter included */ struct ublk_param_basic basic; struct ublk_param_discard discard; struct ublk_param_devt devt; + struct ublk_param_zoned zoned; }; #endif diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c index f02e10fa6238..3be03793237c 100644 --- a/lib/raid6/mktables.c +++ b/lib/raid6/mktables.c @@ -56,7 +56,9 @@ int main(int argc, char *argv[]) uint8_t v; uint8_t exptbl[256], invtbl[256]; + printf("#ifdef __KERNEL__\n"); printf("#include <linux/export.h>\n"); + printf("#endif\n"); printf("#include <linux/raid/pq.h>\n"); /* Compute multiplication table */ diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c index e49d519de6cb..a7c1b2bbe40d 100644 --- a/lib/raid6/recov.c +++ b/lib/raid6/recov.c @@ -13,7 +13,6 @@ * the syndrome.) */ -#include <linux/export.h> #include <linux/raid/pq.h> /* Recover two failed data blocks. */ diff --git a/lib/raid6/test/.gitignore b/lib/raid6/test/.gitignore new file mode 100644 index 000000000000..1b68a77f348f --- /dev/null +++ b/lib/raid6/test/.gitignore @@ -0,0 +1,3 @@ +/int.uc +/neon.uc +/raid6test diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 4fb7700a741b..1f693ea3b980 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -6,14 +6,15 @@ pound := \# -CC = gcc -OPTFLAGS = -O2 # Adjust as desired -CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS) -LD = ld -AWK = awk -f -AR = ar -RANLIB = ranlib -OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o +# Adjust as desired +CC = gcc +OPTFLAGS = -O2 +CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS) +LD = ld +AWK = awk -f +AR = ar +RANLIB = ranlib +OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o ARCH := $(shell uname -m 2>/dev/null | sed -e /s/i.86/i386/) ifeq ($(ARCH),i386) @@ -34,24 +35,25 @@ ifeq ($(ARCH),aarch64) HAS_NEON = yes endif +ifeq ($(findstring ppc,$(ARCH)),ppc) + CFLAGS += -I../../../arch/powerpc/include + HAS_ALTIVEC := $(shell printf '$(pound)include <altivec.h>\nvector int a;\n' |\ + gcc -c -x c - >/dev/null && rm ./-.o && echo yes) +endif + ifeq ($(IS_X86),yes) OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o CFLAGS += -DCONFIG_X86 - CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \ - gcc -c -x assembler - >/dev/null 2>&1 && \ - rm ./-.o && echo -DCONFIG_AS_AVX512=1) + CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \ + gcc -c -x assembler - >/dev/null 2>&1 && \ + rm ./-.o && echo -DCONFIG_AS_AVX512=1) else ifeq ($(HAS_NEON),yes) OBJS += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 -else - HAS_ALTIVEC := $(shell printf '$(pound)include <altivec.h>\nvector int a;\n' |\ - gcc -c -x c - >/dev/null && rm ./-.o && echo yes) - ifeq ($(HAS_ALTIVEC),yes) - CFLAGS += -I../../../arch/powerpc/include - CFLAGS += -DCONFIG_ALTIVEC - OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \ - vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o - endif +else ifeq ($(HAS_ALTIVEC),yes) + CFLAGS += -DCONFIG_ALTIVEC + OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \ + vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o endif .c.o: @@ -63,12 +65,12 @@ endif %.uc: ../%.uc cp -f $< $@ -all: raid6.a raid6test +all: raid6.a raid6test raid6.a: $(OBJS) - rm -f $@ - $(AR) cq $@ $^ - $(RANLIB) $@ + rm -f $@ + $(AR) cq $@ $^ + $(RANLIB) $@ raid6test: test.c raid6.a $(CC) $(CFLAGS) -o raid6test $^ diff --git a/mm/migrate.c b/mm/migrate.c index e21d5a7e7447..78c9bd505b62 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -684,7 +684,7 @@ int migrate_folio(struct address_space *mapping, struct folio *dst, } EXPORT_SYMBOL(migrate_folio); -#ifdef CONFIG_BLOCK +#ifdef CONFIG_BUFFER_HEAD /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, enum migrate_mode mode) @@ -837,7 +837,7 @@ int buffer_migrate_folio_norefs(struct address_space *mapping, return __buffer_migrate_folio(mapping, dst, src, mode, true); } EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs); -#endif +#endif /* CONFIG_BUFFER_HEAD */ int filemap_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) diff --git a/tools/cgroup/iocost_monitor.py b/tools/cgroup/iocost_monitor.py index 0dbbc67400fc..933c750b319b 100644 --- a/tools/cgroup/iocost_monitor.py +++ b/tools/cgroup/iocost_monitor.py @@ -100,6 +100,7 @@ class IocStat: self.period_at = ioc.period_at.value_() / 1_000_000 self.vperiod_at = ioc.period_at_vtime.value_() / VTIME_PER_SEC self.vrate_pct = ioc.vtime_base_rate.value_() * 100 / VTIME_PER_USEC + self.ivrate_pct = ioc.vtime_rate.counter.value_() * 100 / VTIME_PER_USEC self.busy_level = ioc.busy_level.value_() self.autop_idx = ioc.autop_idx.value_() self.user_cost_model = ioc.user_cost_model.value_() @@ -119,7 +120,9 @@ class IocStat: 'period_at' : self.period_at, 'period_vtime_at' : self.vperiod_at, 'busy_level' : self.busy_level, - 'vrate_pct' : self.vrate_pct, } + 'vrate_pct' : self.vrate_pct, + 'ivrate_pct' : self.ivrate_pct, + } def table_preamble_str(self): state = ('RUN' if self.running else 'IDLE') if self.enabled else 'OFF' @@ -127,7 +130,7 @@ class IocStat: f'per={self.period_ms}ms ' \ f'cur_per={self.period_at:.3f}:v{self.vperiod_at:.3f} ' \ f'busy={self.busy_level:+3} ' \ - f'vrate={self.vrate_pct:6.2f}% ' \ + f'vrate={self.vrate_pct:6.2f}%:{self.ivrate_pct:6.2f}% ' \ f'params={self.autop_name}' if self.user_cost_model or self.user_qos_params: output += f'({"C" if self.user_cost_model else ""}{"Q" if self.user_qos_params else ""})' @@ -135,7 +138,7 @@ class IocStat: def table_header_str(self): return f'{"":25} active {"weight":>9} {"hweight%":>13} {"inflt%":>6} ' \ - f'{"debt":>7} {"delay":>7} {"usage%"}' + f'{"usage%":>6} {"wait":>7} {"debt":>7} {"delay":>7}' class IocgStat: def __init__(self, iocg): @@ -161,6 +164,8 @@ class IocgStat: self.usage = (100 * iocg.usage_delta_us.value_() / ioc.period_us.value_()) if self.active else 0 + self.wait_ms = (iocg.stat.wait_us.value_() - + iocg.last_stat.wait_us.value_()) / 1000 self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000 if blkg.use_delay.counter.value_() != 0: self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000 @@ -177,9 +182,10 @@ class IocgStat: 'hweight_active_pct' : self.hwa_pct, 'hweight_inuse_pct' : self.hwi_pct, 'inflight_pct' : self.inflight_pct, + 'usage_pct' : self.usage, + 'wait_ms' : self.wait_ms, 'debt_ms' : self.debt_ms, 'delay_ms' : self.delay_ms, - 'usage_pct' : self.usage, 'address' : self.address } return out @@ -189,9 +195,10 @@ class IocgStat: f'{round(self.inuse):5}/{round(self.active):5} ' \ f'{self.hwi_pct:6.2f}/{self.hwa_pct:6.2f} ' \ f'{self.inflight_pct:6.2f} ' \ + f'{min(self.usage, 999):6.2f} ' \ + f'{self.wait_ms:7.2f} ' \ f'{self.debt_ms:7.2f} ' \ - f'{self.delay_ms:7.2f} '\ - f'{min(self.usage, 999):6.2f}' + f'{self.delay_ms:7.2f}' out = out.rstrip(':') return out @@ -221,7 +228,7 @@ ioc = None for i, ptr in radix_tree_for_each(blkcg_root.blkg_tree.address_of_()): blkg = drgn.Object(prog, 'struct blkcg_gq', address=ptr) try: - if devname == blkg.q.kobj.parent.name.string_().decode('utf-8'): + if devname == blkg.q.mq_kobj.parent.name.string_().decode('utf-8'): q_id = blkg.q.id.value_() if blkg.pd[plid]: root_iocg = container_of(blkg.pd[plid], 'struct ioc_gq', 'pd') |