diff options
-rw-r--r-- | Documentation/core-api/kernel-api.rst | 3 | ||||
-rw-r--r-- | Documentation/filesystems/api-summary.rst | 3 | ||||
-rw-r--r-- | MAINTAINERS | 3 | ||||
-rw-r--r-- | block/Makefile | 2 | ||||
-rw-r--r-- | block/bdev.c (renamed from fs/block_dev.c) | 641 | ||||
-rw-r--r-- | block/blk-mq.c | 14 | ||||
-rw-r--r-- | block/blk-throttle.c | 1 | ||||
-rw-r--r-- | block/blk.h | 2 | ||||
-rw-r--r-- | block/fops.c | 640 | ||||
-rw-r--r-- | block/genhd.c | 9 | ||||
-rw-r--r-- | drivers/block/n64cart.c | 4 | ||||
-rw-r--r-- | drivers/nvme/host/core.c | 68 | ||||
-rw-r--r-- | drivers/nvme/host/multipath.c | 19 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 10 | ||||
-rw-r--r-- | drivers/nvme/host/tcp.c | 22 | ||||
-rw-r--r-- | drivers/nvme/target/admin-cmd.c | 2 | ||||
-rw-r--r-- | drivers/nvme/target/configfs.c | 5 | ||||
-rw-r--r-- | drivers/nvme/target/core.c | 10 | ||||
-rw-r--r-- | drivers/nvme/target/nvmet.h | 11 | ||||
-rw-r--r-- | drivers/nvme/target/passthru.c | 14 | ||||
-rw-r--r-- | fs/Makefile | 2 | ||||
-rw-r--r-- | fs/internal.h | 2 |
22 files changed, 805 insertions, 682 deletions
diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 2a7444e3a4c2..2e7186805148 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -315,6 +315,9 @@ Block Devices .. kernel-doc:: block/genhd.c :export: +.. kernel-doc:: block/bdev.c + :export: + Char devices ============ diff --git a/Documentation/filesystems/api-summary.rst b/Documentation/filesystems/api-summary.rst index 7e5c04c98619..98db2ea5fa12 100644 --- a/Documentation/filesystems/api-summary.rst +++ b/Documentation/filesystems/api-summary.rst @@ -71,9 +71,6 @@ Other Functions .. kernel-doc:: fs/fs-writeback.c :export: -.. kernel-doc:: fs/block_dev.c - :export: - .. kernel-doc:: fs/anon_inodes.c :export: diff --git a/MAINTAINERS b/MAINTAINERS index 51be43b2495b..7d46f03e5037 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3313,7 +3313,6 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git F: block/ F: drivers/block/ -F: fs/block_dev.c F: include/linux/blk* F: kernel/trace/blktrace.c F: lib/sbitmap.c @@ -13409,7 +13408,7 @@ F: include/linux/nvme-fc.h NVM EXPRESS TARGET DRIVER M: Christoph Hellwig <hch@lst.de> M: Sagi Grimberg <sagi@grimberg.me> -M: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com> +M: Chaitanya Kulkarni <kch@nvidia.com> L: linux-nvme@lists.infradead.org S: Supported W: http://git.infradead.org/nvme.git diff --git a/block/Makefile b/block/Makefile index 6cf40276d3cf..41aa1ba69c90 100644 --- a/block/Makefile +++ b/block/Makefile @@ -3,7 +3,7 @@ # Makefile for the kernel block layer # -obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \ +obj-$(CONFIG_BLOCK) := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ diff --git a/fs/block_dev.c b/block/bdev.c index 45df6cbccf12..cf2780cb44a7 100644 --- a/fs/block_dev.c +++ b/block/bdev.c @@ -7,12 +7,10 @@ #include <linux/init.h> #include <linux/mm.h> -#include <linux/fcntl.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/major.h> #include <linux/device_cgroup.h> -#include <linux/highmem.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/module.h> @@ -20,30 +18,22 @@ #include <linux/magic.h> #include <linux/buffer_head.h> #include <linux/swap.h> -#include <linux/pagevec.h> #include <linux/writeback.h> -#include <linux/mpage.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/uio.h> #include <linux/namei.h> -#include <linux/log2.h> #include <linux/cleancache.h> -#include <linux/task_io_accounting_ops.h> -#include <linux/falloc.h> #include <linux/part_stat.h> #include <linux/uaccess.h> -#include <linux/suspend.h> -#include "internal.h" -#include "../block/blk.h" +#include "../fs/internal.h" +#include "blk.h" struct bdev_inode { struct block_device bdev; struct inode vfs_inode; }; -static const struct address_space_operations def_blk_aops; - static inline struct bdev_inode *BDEV_I(struct inode *inode) { return container_of(inode, struct bdev_inode, vfs_inode); @@ -194,332 +184,6 @@ int sb_min_blocksize(struct super_block *sb, int size) EXPORT_SYMBOL(sb_min_blocksize); -static int -blkdev_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - bh->b_bdev = I_BDEV(inode); - bh->b_blocknr = iblock; - set_buffer_mapped(bh); - return 0; -} - -static struct inode *bdev_file_inode(struct file *file) -{ - return file->f_mapping->host; -} - -static unsigned int dio_bio_write_op(struct kiocb *iocb) -{ - unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; - - /* avoid the need for a I/O completion work item */ - if (iocb->ki_flags & IOCB_DSYNC) - op |= REQ_FUA; - return op; -} - -#define DIO_INLINE_BIO_VECS 4 - -static void blkdev_bio_end_io_simple(struct bio *bio) -{ - struct task_struct *waiter = bio->bi_private; - - WRITE_ONCE(bio->bi_private, NULL); - blk_wake_io_task(waiter); -} - -static ssize_t -__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, - unsigned int nr_pages) -{ - struct file *file = iocb->ki_filp; - struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; - loff_t pos = iocb->ki_pos; - bool should_dirty = false; - struct bio bio; - ssize_t ret; - blk_qc_t qc; - - if ((pos | iov_iter_alignment(iter)) & - (bdev_logical_block_size(bdev) - 1)) - return -EINVAL; - - if (nr_pages <= DIO_INLINE_BIO_VECS) - vecs = inline_vecs; - else { - vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec), - GFP_KERNEL); - if (!vecs) - return -ENOMEM; - } - - bio_init(&bio, vecs, nr_pages); - bio_set_dev(&bio, bdev); - bio.bi_iter.bi_sector = pos >> 9; - bio.bi_write_hint = iocb->ki_hint; - bio.bi_private = current; - bio.bi_end_io = blkdev_bio_end_io_simple; - bio.bi_ioprio = iocb->ki_ioprio; - - ret = bio_iov_iter_get_pages(&bio, iter); - if (unlikely(ret)) - goto out; - ret = bio.bi_iter.bi_size; - - if (iov_iter_rw(iter) == READ) { - bio.bi_opf = REQ_OP_READ; - if (iter_is_iovec(iter)) - should_dirty = true; - } else { - bio.bi_opf = dio_bio_write_op(iocb); - task_io_account_write(ret); - } - if (iocb->ki_flags & IOCB_NOWAIT) - bio.bi_opf |= REQ_NOWAIT; - if (iocb->ki_flags & IOCB_HIPRI) - bio_set_polled(&bio, iocb); - - qc = submit_bio(&bio); - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!READ_ONCE(bio.bi_private)) - break; - if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc, true)) - blk_io_schedule(); - } - __set_current_state(TASK_RUNNING); - - bio_release_pages(&bio, should_dirty); - if (unlikely(bio.bi_status)) - ret = blk_status_to_errno(bio.bi_status); - -out: - if (vecs != inline_vecs) - kfree(vecs); - - bio_uninit(&bio); - - return ret; -} - -struct blkdev_dio { - union { - struct kiocb *iocb; - struct task_struct *waiter; - }; - size_t size; - atomic_t ref; - bool multi_bio : 1; - bool should_dirty : 1; - bool is_sync : 1; - struct bio bio; -}; - -static struct bio_set blkdev_dio_pool; - -static int blkdev_iopoll(struct kiocb *kiocb, bool wait) -{ - struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host); - struct request_queue *q = bdev_get_queue(bdev); - - return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait); -} - -static void blkdev_bio_end_io(struct bio *bio) -{ - struct blkdev_dio *dio = bio->bi_private; - bool should_dirty = dio->should_dirty; - - if (bio->bi_status && !dio->bio.bi_status) - dio->bio.bi_status = bio->bi_status; - - if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { - if (!dio->is_sync) { - struct kiocb *iocb = dio->iocb; - ssize_t ret; - - if (likely(!dio->bio.bi_status)) { - ret = dio->size; - iocb->ki_pos += ret; - } else { - ret = blk_status_to_errno(dio->bio.bi_status); - } - - dio->iocb->ki_complete(iocb, ret, 0); - if (dio->multi_bio) - bio_put(&dio->bio); - } else { - struct task_struct *waiter = dio->waiter; - - WRITE_ONCE(dio->waiter, NULL); - blk_wake_io_task(waiter); - } - } - - if (should_dirty) { - bio_check_pages_dirty(bio); - } else { - bio_release_pages(bio, false); - bio_put(bio); - } -} - -static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - unsigned int nr_pages) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = bdev_file_inode(file); - struct block_device *bdev = I_BDEV(inode); - struct blk_plug plug; - struct blkdev_dio *dio; - struct bio *bio; - bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; - bool is_read = (iov_iter_rw(iter) == READ), is_sync; - loff_t pos = iocb->ki_pos; - blk_qc_t qc = BLK_QC_T_NONE; - int ret = 0; - - if ((pos | iov_iter_alignment(iter)) & - (bdev_logical_block_size(bdev) - 1)) - return -EINVAL; - - bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); - - dio = container_of(bio, struct blkdev_dio, bio); - dio->is_sync = is_sync = is_sync_kiocb(iocb); - if (dio->is_sync) { - dio->waiter = current; - bio_get(bio); - } else { - dio->iocb = iocb; - } - - dio->size = 0; - dio->multi_bio = false; - dio->should_dirty = is_read && iter_is_iovec(iter); - - /* - * Don't plug for HIPRI/polled IO, as those should go straight - * to issue - */ - if (!is_poll) - blk_start_plug(&plug); - - for (;;) { - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector = pos >> 9; - bio->bi_write_hint = iocb->ki_hint; - bio->bi_private = dio; - bio->bi_end_io = blkdev_bio_end_io; - bio->bi_ioprio = iocb->ki_ioprio; - - ret = bio_iov_iter_get_pages(bio, iter); - if (unlikely(ret)) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); - break; - } - - if (is_read) { - bio->bi_opf = REQ_OP_READ; - if (dio->should_dirty) - bio_set_pages_dirty(bio); - } else { - bio->bi_opf = dio_bio_write_op(iocb); - task_io_account_write(bio->bi_iter.bi_size); - } - if (iocb->ki_flags & IOCB_NOWAIT) - bio->bi_opf |= REQ_NOWAIT; - - dio->size += bio->bi_iter.bi_size; - pos += bio->bi_iter.bi_size; - - nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); - if (!nr_pages) { - bool polled = false; - - if (iocb->ki_flags & IOCB_HIPRI) { - bio_set_polled(bio, iocb); - polled = true; - } - - qc = submit_bio(bio); - - if (polled) - WRITE_ONCE(iocb->ki_cookie, qc); - break; - } - - if (!dio->multi_bio) { - /* - * AIO needs an extra reference to ensure the dio - * structure which is embedded into the first bio - * stays around. - */ - if (!is_sync) - bio_get(bio); - dio->multi_bio = true; - atomic_set(&dio->ref, 2); - } else { - atomic_inc(&dio->ref); - } - - submit_bio(bio); - bio = bio_alloc(GFP_KERNEL, nr_pages); - } - - if (!is_poll) - blk_finish_plug(&plug); - - if (!is_sync) - return -EIOCBQUEUED; - - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!READ_ONCE(dio->waiter)) - break; - - if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc, true)) - blk_io_schedule(); - } - __set_current_state(TASK_RUNNING); - - if (!ret) - ret = blk_status_to_errno(dio->bio.bi_status); - if (likely(!ret)) - ret = dio->size; - - bio_put(&dio->bio); - return ret; -} - -static ssize_t -blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - unsigned int nr_pages; - - if (!iov_iter_count(iter)) - return 0; - - nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); - if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS) - return __blkdev_direct_IO_simple(iocb, iter, nr_pages); - - return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); -} - -static __init int blkdev_init(void) -{ - return bioset_init(&blkdev_dio_pool, 4, - offsetof(struct blkdev_dio, bio), - BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE); -} -module_init(blkdev_init); - int __sync_blockdev(struct block_device *bdev, int wait) { if (!bdev) @@ -637,81 +301,6 @@ out: } EXPORT_SYMBOL(thaw_bdev); -static int blkdev_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, blkdev_get_block, wbc); -} - -static int blkdev_readpage(struct file * file, struct page * page) -{ - return block_read_full_page(page, blkdev_get_block); -} - -static void blkdev_readahead(struct readahead_control *rac) -{ - mpage_readahead(rac, blkdev_get_block); -} - -static int blkdev_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - return block_write_begin(mapping, pos, len, flags, pagep, - blkdev_get_block); -} - -static int blkdev_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - int ret; - ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - unlock_page(page); - put_page(page); - - return ret; -} - -/* - * private llseek: - * for a block special file file_inode(file)->i_size is zero - * so we compute the size by hand (just as in block_read/write above) - */ -static loff_t block_llseek(struct file *file, loff_t offset, int whence) -{ - struct inode *bd_inode = bdev_file_inode(file); - loff_t retval; - - inode_lock(bd_inode); - retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); - inode_unlock(bd_inode); - return retval; -} - -static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, - int datasync) -{ - struct inode *bd_inode = bdev_file_inode(filp); - struct block_device *bdev = I_BDEV(bd_inode); - int error; - - error = file_write_and_wait_range(filp, start, end); - if (error) - return error; - - /* - * There is no need to serialise calls to blkdev_issue_flush with - * i_mutex and doing so causes performance issues with concurrent - * O_SYNC writers to a block device. - */ - error = blkdev_issue_flush(bdev); - if (error == -EOPNOTSUPP) - error = 0; - - return error; -} - /** * bdev_read_page() - Start reading a page from a block device * @bdev: The device to read the page from @@ -1305,35 +894,6 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, } EXPORT_SYMBOL(blkdev_get_by_path); -static int blkdev_open(struct inode * inode, struct file * filp) -{ - struct block_device *bdev; - - /* - * Preserve backwards compatibility and allow large file access - * even if userspace doesn't ask for it explicitly. Some mkfs - * binary needs it. We might want to drop this workaround - * during an unstable branch. - */ - filp->f_flags |= O_LARGEFILE; - - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; - - if (filp->f_flags & O_NDELAY) - filp->f_mode |= FMODE_NDELAY; - if (filp->f_flags & O_EXCL) - filp->f_mode |= FMODE_EXCL; - if ((filp->f_flags & O_ACCMODE) == 3) - filp->f_mode |= FMODE_WRITE_IOCTL; - - bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - filp->f_mapping = bdev->bd_inode->i_mapping; - filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); - return 0; -} - void blkdev_put(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; @@ -1397,203 +957,6 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) } EXPORT_SYMBOL(blkdev_put); -static int blkdev_close(struct inode * inode, struct file * filp) -{ - struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); - blkdev_put(bdev, filp->f_mode); - return 0; -} - -static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) -{ - struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - fmode_t mode = file->f_mode; - - /* - * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have - * to updated it before every ioctl. - */ - if (file->f_flags & O_NDELAY) - mode |= FMODE_NDELAY; - else - mode &= ~FMODE_NDELAY; - - return blkdev_ioctl(bdev, mode, cmd, arg); -} - -/* - * Write data to the block device. Only intended for the block device itself - * and the raw driver which basically is a fake block device. - * - * Does not take i_mutex for the write and thus is not for general purpose - * use. - */ -static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *bd_inode = bdev_file_inode(file); - loff_t size = i_size_read(bd_inode); - struct blk_plug plug; - size_t shorted = 0; - ssize_t ret; - - if (bdev_read_only(I_BDEV(bd_inode))) - return -EPERM; - - if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) - return -ETXTBSY; - - if (!iov_iter_count(from)) - return 0; - - if (iocb->ki_pos >= size) - return -ENOSPC; - - if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) - return -EOPNOTSUPP; - - size -= iocb->ki_pos; - if (iov_iter_count(from) > size) { - shorted = iov_iter_count(from) - size; - iov_iter_truncate(from, size); - } - - blk_start_plug(&plug); - ret = __generic_file_write_iter(iocb, from); - if (ret > 0) - ret = generic_write_sync(iocb, ret); - iov_iter_reexpand(from, iov_iter_count(from) + shorted); - blk_finish_plug(&plug); - return ret; -} - -static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ - struct file *file = iocb->ki_filp; - struct inode *bd_inode = bdev_file_inode(file); - loff_t size = i_size_read(bd_inode); - loff_t pos = iocb->ki_pos; - size_t shorted = 0; - ssize_t ret; - - if (pos >= size) - return 0; - - size -= pos; - if (iov_iter_count(to) > size) { - shorted = iov_iter_count(to) - size; - iov_iter_truncate(to, size); - } - - ret = generic_file_read_iter(iocb, to); - iov_iter_reexpand(to, iov_iter_count(to) + shorted); - return ret; -} - -static int blkdev_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - return generic_writepages(mapping, wbc); -} - -static const struct address_space_operations def_blk_aops = { - .set_page_dirty = __set_page_dirty_buffers, - .readpage = blkdev_readpage, - .readahead = blkdev_readahead, - .writepage = blkdev_writepage, - .write_begin = blkdev_write_begin, - .write_end = blkdev_write_end, - .writepages = blkdev_writepages, - .direct_IO = blkdev_direct_IO, - .migratepage = buffer_migrate_page_norefs, - .is_dirty_writeback = buffer_check_dirty_writeback, -}; - -#define BLKDEV_FALLOC_FL_SUPPORTED \ - (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ - FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE) - -static long blkdev_fallocate(struct file *file, int mode, loff_t start, - loff_t len) -{ - struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - loff_t end = start + len - 1; - loff_t isize; - int error; - - /* Fail if we don't recognize the flags. */ - if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) - return -EOPNOTSUPP; - - /* Don't go off the end of the device. */ - isize = i_size_read(bdev->bd_inode); - if (start >= isize) - return -EINVAL; - if (end >= isize) { - if (mode & FALLOC_FL_KEEP_SIZE) { - len = isize - start; - end = start + len - 1; - } else - return -EINVAL; - } - - /* - * Don't allow IO that isn't aligned to logical block size. - */ - if ((start | len) & (bdev_logical_block_size(bdev) - 1)) - return -EINVAL; - - /* Invalidate the page cache, including dirty pages. */ - error = truncate_bdev_range(bdev, file->f_mode, start, end); - if (error) - return error; - - switch (mode) { - case FALLOC_FL_ZERO_RANGE: - case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: - error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, - GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); - break; - case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: - error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, - GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); - break; - case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: - error = blkdev_issue_discard(bdev, start >> 9, len >> 9, - GFP_KERNEL, 0); - break; - default: - return -EOPNOTSUPP; - } - if (error) - return error; - - /* - * Invalidate the page cache again; if someone wandered in and dirtied - * a page, we just discard it - userspace has no way of knowing whether - * the write happened before or after discard completing... - */ - return truncate_bdev_range(bdev, file->f_mode, start, end); -} - -const struct file_operations def_blk_fops = { - .open = blkdev_open, - .release = blkdev_close, - .llseek = block_llseek, - .read_iter = blkdev_read_iter, - .write_iter = blkdev_write_iter, - .iopoll = blkdev_iopoll, - .mmap = generic_file_mmap, - .fsync = blkdev_fsync, - .unlocked_ioctl = block_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = compat_blkdev_ioctl, -#endif - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, - .fallocate = blkdev_fallocate, -}; - /** * lookup_bdev - lookup a struct block_device by name * @pathname: special file representing the block device diff --git a/block/blk-mq.c b/block/blk-mq.c index 65d3a63aecc6..108a352051be 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2135,6 +2135,18 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) } } +/* + * Allow 4x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple + * queues. This is important for md arrays to benefit from merging + * requests. + */ +static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) +{ + if (plug->multiple_queues) + return BLK_MAX_REQUEST_COUNT * 4; + return BLK_MAX_REQUEST_COUNT; +} + /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. @@ -2231,7 +2243,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) else last = list_entry_rq(plug->mq_list.prev); - if (request_count >= BLK_MAX_REQUEST_COUNT || (last && + if (request_count >= blk_plug_max_rq_count(plug) || (last && blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { blk_flush_plug_list(plug, false); trace_block_plug(q); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 55c49015e533..7c4e7993ba97 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2458,6 +2458,7 @@ int blk_throtl_init(struct request_queue *q) void blk_throtl_exit(struct request_queue *q) { BUG_ON(!q->td); + del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); blkcg_deactivate_policy(q, &blkcg_policy_throtl); free_percpu(q->td->latency_buckets[READ]); diff --git a/block/blk.h b/block/blk.h index 8c96b0c90c48..7d2a0ba7ed21 100644 --- a/block/blk.h +++ b/block/blk.h @@ -373,4 +373,6 @@ static inline void bio_clear_hipri(struct bio *bio) bio->bi_opf &= ~REQ_HIPRI; } +extern const struct address_space_operations def_blk_aops; + #endif /* BLK_INTERNAL_H */ diff --git a/block/fops.c b/block/fops.c new file mode 100644 index 000000000000..ffce6f6c68dd --- /dev/null +++ b/block/fops.c @@ -0,0 +1,640 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE + * Copyright (C) 2016 - 2020 Christoph Hellwig + */ +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include <linux/uio.h> +#include <linux/namei.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/falloc.h> +#include <linux/suspend.h> +#include "blk.h" + +static struct inode *bdev_file_inode(struct file *file) +{ + return file->f_mapping->host; +} + +static int blkdev_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + bh->b_bdev = I_BDEV(inode); + bh->b_blocknr = iblock; + set_buffer_mapped(bh); + return 0; +} + +static unsigned int dio_bio_write_op(struct kiocb *iocb) +{ + unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; + + /* avoid the need for a I/O completion work item */ + if (iocb->ki_flags & IOCB_DSYNC) + op |= REQ_FUA; + return op; +} + +#define DIO_INLINE_BIO_VECS 4 + +static void blkdev_bio_end_io_simple(struct bio *bio) +{ + struct task_struct *waiter = bio->bi_private; + + WRITE_ONCE(bio->bi_private, NULL); + blk_wake_io_task(waiter); +} + +static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, + struct iov_iter *iter, unsigned int nr_pages) +{ + struct file *file = iocb->ki_filp; + struct block_device *bdev = I_BDEV(bdev_file_inode(file)); + struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; + loff_t pos = iocb->ki_pos; + bool should_dirty = false; + struct bio bio; + ssize_t ret; + blk_qc_t qc; + + if ((pos | iov_iter_alignment(iter)) & + (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + + if (nr_pages <= DIO_INLINE_BIO_VECS) + vecs = inline_vecs; + else { + vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec), + GFP_KERNEL); + if (!vecs) + return -ENOMEM; + } + + bio_init(&bio, vecs, nr_pages); + bio_set_dev(&bio, bdev); + bio.bi_iter.bi_sector = pos >> 9; + bio.bi_write_hint = iocb->ki_hint; + bio.bi_private = current; + bio.bi_end_io = blkdev_bio_end_io_simple; + bio.bi_ioprio = iocb->ki_ioprio; + + ret = bio_iov_iter_get_pages(&bio, iter); + if (unlikely(ret)) + goto out; + ret = bio.bi_iter.bi_size; + + if (iov_iter_rw(iter) == READ) { + bio.bi_opf = REQ_OP_READ; + if (iter_is_iovec(iter)) + should_dirty = true; + } else { + bio.bi_opf = dio_bio_write_op(iocb); + task_io_account_write(ret); + } + if (iocb->ki_flags & IOCB_NOWAIT) + bio.bi_opf |= REQ_NOWAIT; + if (iocb->ki_flags & IOCB_HIPRI) + bio_set_polled(&bio, iocb); + + qc = submit_bio(&bio); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(bio.bi_private)) + break; + if (!(iocb->ki_flags & IOCB_HIPRI) || + !blk_poll(bdev_get_queue(bdev), qc, true)) + blk_io_schedule(); + } + __set_current_state(TASK_RUNNING); + + bio_release_pages(&bio, should_dirty); + if (unlikely(bio.bi_status)) + ret = blk_status_to_errno(bio.bi_status); + +out: + if (vecs != inline_vecs) + kfree(vecs); + + bio_uninit(&bio); + + return ret; +} + +struct blkdev_dio { + union { + struct kiocb *iocb; + struct task_struct *waiter; + }; + size_t size; + atomic_t ref; + bool multi_bio : 1; + bool should_dirty : 1; + bool is_sync : 1; + struct bio bio; +}; + +static struct bio_set blkdev_dio_pool; + +static int blkdev_iopoll(struct kiocb *kiocb, bool wait) +{ + struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host); + struct request_queue *q = bdev_get_queue(bdev); + + return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait); +} + +static void blkdev_bio_end_io(struct bio *bio) +{ + struct blkdev_dio *dio = bio->bi_private; + bool should_dirty = dio->should_dirty; + + if (bio->bi_status && !dio->bio.bi_status) + dio->bio.bi_status = bio->bi_status; + + if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { + if (!dio->is_sync) { + struct kiocb *iocb = dio->iocb; + ssize_t ret; + + if (likely(!dio->bio.bi_status)) { + ret = dio->size; + iocb->ki_pos += ret; + } else { + ret = blk_status_to_errno(dio->bio.bi_status); + } + + dio->iocb->ki_complete(iocb, ret, 0); + if (dio->multi_bio) + bio_put(&dio->bio); + } else { + struct task_struct *waiter = dio->waiter; + + WRITE_ONCE(dio->waiter, NULL); + blk_wake_io_task(waiter); + } + } + + if (should_dirty) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } +} + +static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + unsigned int nr_pages) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = bdev_file_inode(file); + struct block_device *bdev = I_BDEV(inode); + struct blk_plug plug; + struct blkdev_dio *dio; + struct bio *bio; + bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; + bool is_read = (iov_iter_rw(iter) == READ), is_sync; + loff_t pos = iocb->ki_pos; + blk_qc_t qc = BLK_QC_T_NONE; + int ret = 0; + + if ((pos | iov_iter_alignment(iter)) & + (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + + bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); + + dio = container_of(bio, struct blkdev_dio, bio); + dio->is_sync = is_sync = is_sync_kiocb(iocb); + if (dio->is_sync) { + dio->waiter = current; + bio_get(bio); + } else { + dio->iocb = iocb; + } + + dio->size = 0; + dio->multi_bio = false; + dio->should_dirty = is_read && iter_is_iovec(iter); + + /* + * Don't plug for HIPRI/polled IO, as those should go straight + * to issue + */ + if (!is_poll) + blk_start_plug(&plug); + + for (;;) { + bio_set_dev(bio, bdev); + bio->bi_iter.bi_sector = pos >> 9; + bio->bi_write_hint = iocb->ki_hint; + bio->bi_private = dio; + bio->bi_end_io = blkdev_bio_end_io; + bio->bi_ioprio = iocb->ki_ioprio; + + ret = bio_iov_iter_get_pages(bio, iter); + if (unlikely(ret)) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + break; + } + + if (is_read) { + bio->bi_opf = REQ_OP_READ; + if (dio->should_dirty) + bio_set_pages_dirty(bio); + } else { + bio->bi_opf = dio_bio_write_op(iocb); + task_io_account_write(bio->bi_iter.bi_size); + } + if (iocb->ki_flags & IOCB_NOWAIT) + bio->bi_opf |= REQ_NOWAIT; + + dio->size += bio->bi_iter.bi_size; + pos += bio->bi_iter.bi_size; + + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); + if (!nr_pages) { + bool polled = false; + + if (iocb->ki_flags & IOCB_HIPRI) { + bio_set_polled(bio, iocb); + polled = true; + } + + qc = submit_bio(bio); + + if (polled) + WRITE_ONCE(iocb->ki_cookie, qc); + break; + } + + if (!dio->multi_bio) { + /* + * AIO needs an extra reference to ensure the dio + * structure which is embedded into the first bio + * stays around. + */ + if (!is_sync) + bio_get(bio); + dio->multi_bio = true; + atomic_set(&dio->ref, 2); + } else { + atomic_inc(&dio->ref); + } + + submit_bio(bio); + bio = bio_alloc(GFP_KERNEL, nr_pages); + } + + if (!is_poll) + blk_finish_plug(&plug); + + if (!is_sync) + return -EIOCBQUEUED; + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->waiter)) + break; + + if (!(iocb->ki_flags & IOCB_HIPRI) || + !blk_poll(bdev_get_queue(bdev), qc, true)) + blk_io_schedule(); + } + __set_current_state(TASK_RUNNING); + + if (!ret) + ret = blk_status_to_errno(dio->bio.bi_status); + if (likely(!ret)) + ret = dio->size; + + bio_put(&dio->bio); + return ret; +} + +static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + unsigned int nr_pages; + + if (!iov_iter_count(iter)) + return 0; + + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); + if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS) + return __blkdev_direct_IO_simple(iocb, iter, nr_pages); + + return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); +} + +static int blkdev_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, blkdev_get_block, wbc); +} + +static int blkdev_readpage(struct file * file, struct page * page) +{ + return block_read_full_page(page, blkdev_get_block); +} + +static void blkdev_readahead(struct readahead_control *rac) +{ + mpage_readahead(rac, blkdev_get_block); +} + +static int blkdev_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, struct page **pagep, + void **fsdata) +{ + return block_write_begin(mapping, pos, len, flags, pagep, + blkdev_get_block); +} + +static int blkdev_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, struct page *page, + void *fsdata) +{ + int ret; + ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + unlock_page(page); + put_page(page); + + return ret; +} + +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return generic_writepages(mapping, wbc); +} + +const struct address_space_operations def_blk_aops = { + .set_page_dirty = __set_page_dirty_buffers, + .readpage = blkdev_readpage, + .readahead = blkdev_readahead, + .writepage = blkdev_writepage, + .write_begin = blkdev_write_begin, + .write_end = blkdev_write_end, + .writepages = blkdev_writepages, + .direct_IO = blkdev_direct_IO, + .migratepage = buffer_migrate_page_norefs, + .is_dirty_writeback = buffer_check_dirty_writeback, +}; + +/* + * for a block special file file_inode(file)->i_size is zero + * so we compute the size by hand (just as in block_read/write above) + */ +static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *bd_inode = bdev_file_inode(file); + loff_t retval; + + inode_lock(bd_inode); + retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); + inode_unlock(bd_inode); + return retval; +} + +static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct inode *bd_inode = bdev_file_inode(filp); + struct block_device *bdev = I_BDEV(bd_inode); + int error; + + error = file_write_and_wait_range(filp, start, end); + if (error) + return error; + + /* + * There is no need to serialise calls to blkdev_issue_flush with + * i_mutex and doing so causes performance issues with concurrent + * O_SYNC writers to a block device. + */ + error = blkdev_issue_flush(bdev); + if (error == -EOPNOTSUPP) + error = 0; + + return error; +} + +static int blkdev_open(struct inode *inode, struct file *filp) +{ + struct block_device *bdev; + + /* + * Preserve backwards compatibility and allow large file access + * even if userspace doesn't ask for it explicitly. Some mkfs + * binary needs it. We might want to drop this workaround + * during an unstable branch. + */ + filp->f_flags |= O_LARGEFILE; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + + if (filp->f_flags & O_NDELAY) + filp->f_mode |= FMODE_NDELAY; + if (filp->f_flags & O_EXCL) + filp->f_mode |= FMODE_EXCL; + if ((filp->f_flags & O_ACCMODE) == 3) + filp->f_mode |= FMODE_WRITE_IOCTL; + + bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + filp->f_mapping = bdev->bd_inode->i_mapping; + filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); + return 0; +} + +static int blkdev_close(struct inode *inode, struct file *filp) +{ + struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); + + blkdev_put(bdev, filp->f_mode); + return 0; +} + +static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + struct block_device *bdev = I_BDEV(bdev_file_inode(file)); + fmode_t mode = file->f_mode; + + /* + * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have + * to updated it before every ioctl. + */ + if (file->f_flags & O_NDELAY) + mode |= FMODE_NDELAY; + else + mode &= ~FMODE_NDELAY; + + return blkdev_ioctl(bdev, mode, cmd, arg); +} + +/* + * Write data to the block device. Only intended for the block device itself + * and the raw driver which basically is a fake block device. + * + * Does not take i_mutex for the write and thus is not for general purpose + * use. + */ +static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *bd_inode = bdev_file_inode(file); + loff_t size = i_size_read(bd_inode); + struct blk_plug plug; + size_t shorted = 0; + ssize_t ret; + + if (bdev_read_only(I_BDEV(bd_inode))) + return -EPERM; + + if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) + return -ETXTBSY; + + if (!iov_iter_count(from)) + return 0; + + if (iocb->ki_pos >= size) + return -ENOSPC; + + if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) + return -EOPNOTSUPP; + + size -= iocb->ki_pos; + if (iov_iter_count(from) > size) { + shorted = iov_iter_count(from) - size; + iov_iter_truncate(from, size); + } + + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + iov_iter_reexpand(from, iov_iter_count(from) + shorted); + blk_finish_plug(&plug); + return ret; +} + +static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct inode *bd_inode = bdev_file_inode(file); + loff_t size = i_size_read(bd_inode); + loff_t pos = iocb->ki_pos; + size_t shorted = 0; + ssize_t ret; + + if (pos >= size) + return 0; + + size -= pos; + if (iov_iter_count(to) > size) { + shorted = iov_iter_count(to) - size; + iov_iter_truncate(to, size); + } + + ret = generic_file_read_iter(iocb, to); + iov_iter_reexpand(to, iov_iter_count(to) + shorted); + return ret; +} + +#define BLKDEV_FALLOC_FL_SUPPORTED \ + (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ + FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE) + +static long blkdev_fallocate(struct file *file, int mode, loff_t start, + loff_t len) +{ + struct block_device *bdev = I_BDEV(bdev_file_inode(file)); + loff_t end = start + len - 1; + loff_t isize; + int error; + + /* Fail if we don't recognize the flags. */ + if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) + return -EOPNOTSUPP; + + /* Don't go off the end of the device. */ + isize = i_size_read(bdev->bd_inode); + if (start >= isize) + return -EINVAL; + if (end >= isize) { + if (mode & FALLOC_FL_KEEP_SIZE) { + len = isize - start; + end = start + len - 1; + } else + return -EINVAL; + } + + /* + * Don't allow IO that isn't aligned to logical block size. + */ + if ((start | len) & (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + + /* Invalidate the page cache, including dirty pages. */ + error = truncate_bdev_range(bdev, file->f_mode, start, end); + if (error) + return error; + + switch (mode) { + case FALLOC_FL_ZERO_RANGE: + case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: + error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, + GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); + break; + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: + error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, + GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); + break; + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: + error = blkdev_issue_discard(bdev, start >> 9, len >> 9, + GFP_KERNEL, 0); + break; + default: + return -EOPNOTSUPP; + } + if (error) + return error; + + /* + * Invalidate the page cache again; if someone wandered in and dirtied + * a page, we just discard it - userspace has no way of knowing whether + * the write happened before or after discard completing... + */ + return truncate_bdev_range(bdev, file->f_mode, start, end); +} + +const struct file_operations def_blk_fops = { + .open = blkdev_open, + .release = blkdev_close, + .llseek = blkdev_llseek, + .read_iter = blkdev_read_iter, + .write_iter = blkdev_write_iter, + .iopoll = blkdev_iopoll, + .mmap = generic_file_mmap, + .fsync = blkdev_fsync, + .unlocked_ioctl = block_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_blkdev_ioctl, +#endif + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .fallocate = blkdev_fallocate, +}; + +static __init int blkdev_init(void) +{ + return bioset_init(&blkdev_dio_pool, 4, + offsetof(struct blkdev_dio, bio), + BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE); +} +module_init(blkdev_init); diff --git a/block/genhd.c b/block/genhd.c index 567549a011d1..7b6e5e1cf956 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -183,6 +183,7 @@ static struct blk_major_name { void (*probe)(dev_t devt); } *major_names[BLKDEV_MAJOR_HASH_SIZE]; static DEFINE_MUTEX(major_names_lock); +static DEFINE_SPINLOCK(major_names_spinlock); /* index in the above - for now: assume no multimajor ranges */ static inline int major_to_index(unsigned major) @@ -195,11 +196,11 @@ void blkdev_show(struct seq_file *seqf, off_t offset) { struct blk_major_name *dp; - mutex_lock(&major_names_lock); + spin_lock(&major_names_spinlock); for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next) if (dp->major == offset) seq_printf(seqf, "%3d %s\n", dp->major, dp->name); - mutex_unlock(&major_names_lock); + spin_unlock(&major_names_spinlock); } #endif /* CONFIG_PROC_FS */ @@ -271,6 +272,7 @@ int __register_blkdev(unsigned int major, const char *name, p->next = NULL; index = major_to_index(major); + spin_lock(&major_names_spinlock); for (n = &major_names[index]; *n; n = &(*n)->next) { if ((*n)->major == major) break; @@ -279,6 +281,7 @@ int __register_blkdev(unsigned int major, const char *name, *n = p; else ret = -EBUSY; + spin_unlock(&major_names_spinlock); if (ret < 0) { printk("register_blkdev: cannot get major %u for %s\n", @@ -298,6 +301,7 @@ void unregister_blkdev(unsigned int major, const char *name) int index = major_to_index(major); mutex_lock(&major_names_lock); + spin_lock(&major_names_spinlock); for (n = &major_names[index]; *n; n = &(*n)->next) if ((*n)->major == major) break; @@ -307,6 +311,7 @@ void unregister_blkdev(unsigned int major, const char *name) p = *n; *n = p->next; } + spin_unlock(&major_names_spinlock); mutex_unlock(&major_names_lock); kfree(p); } diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index c84be0028f63..26798da661bd 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -129,8 +129,8 @@ static int __init n64cart_probe(struct platform_device *pdev) } reg_base = devm_platform_ioremap_resource(pdev, 0); - if (!reg_base) - return -EINVAL; + if (IS_ERR(reg_base)) + return PTR_ERR(reg_base); disk = blk_alloc_disk(NUMA_NO_NODE); if (!disk) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8679a108f571..7efb31b87f37 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -116,6 +116,8 @@ static struct class *nvme_ns_chr_class; static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); +static void nvme_update_keep_alive(struct nvme_ctrl *ctrl, + struct nvme_command *cmd); /* * Prepare a queue for teardown. @@ -1152,7 +1154,8 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return effects; } -static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) +static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, + struct nvme_command *cmd, int status) { if (effects & NVME_CMD_EFFECTS_CSE_MASK) { nvme_unfreeze(ctrl); @@ -1167,6 +1170,26 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) nvme_queue_scan(ctrl); flush_work(&ctrl->scan_work); } + + switch (cmd->common.opcode) { + case nvme_admin_set_features: + switch (le32_to_cpu(cmd->common.cdw10) & 0xFF) { + case NVME_FEAT_KATO: + /* + * Keep alive commands interval on the host should be + * updated when KATO is modified by Set Features + * commands. + */ + if (!status) + nvme_update_keep_alive(ctrl, cmd); + break; + default: + break; + } + break; + default: + break; + } } int nvme_execute_passthru_rq(struct request *rq) @@ -1181,7 +1204,7 @@ int nvme_execute_passthru_rq(struct request *rq) effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); ret = nvme_execute_rq(disk, rq, false); if (effects) /* nothing to be done for zero cmd effects */ - nvme_passthru_end(ctrl, effects); + nvme_passthru_end(ctrl, effects, cmd, ret); return ret; } @@ -1269,6 +1292,21 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_stop_keep_alive); +static void nvme_update_keep_alive(struct nvme_ctrl *ctrl, + struct nvme_command *cmd) +{ + unsigned int new_kato = + DIV_ROUND_UP(le32_to_cpu(cmd->common.cdw11), 1000); + + dev_info(ctrl->device, + "keep alive interval updated from %u ms to %u ms\n", + ctrl->kato * 1000 / 2, new_kato * 1000 / 2); + + nvme_stop_keep_alive(ctrl); + ctrl->kato = new_kato; + nvme_start_keep_alive(ctrl); +} + /* * In NVMe 1.0 the CNS field was just a binary controller or namespace * flag, thus sending any new CNS opcodes has a big chance of not working. @@ -1302,11 +1340,6 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) return error; } -static bool nvme_multi_css(struct nvme_ctrl *ctrl) -{ - return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI; -} - static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, struct nvme_ns_id_desc *cur, bool *csi_seen) { @@ -1874,6 +1907,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) goto out_unfreeze; } + set_bit(NVME_NS_READY, &ns->flags); blk_mq_unfreeze_queue(ns->disk->queue); if (blk_queue_is_zoned(ns->queue)) { @@ -1885,6 +1919,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) if (nvme_ns_head_multipath(ns->head)) { blk_mq_freeze_queue(ns->head->disk->queue); nvme_update_disk_info(ns->head->disk, ns, id); + nvme_mpath_revalidate_paths(ns); blk_stack_limits(&ns->head->disk->queue->limits, &ns->queue->limits, 0); disk_update_readahead(ns->head->disk); @@ -3763,7 +3798,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, nvme_get_ctrl(ctrl); - device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups); + if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups)) + goto out_cleanup_ns_from_list; + if (!nvme_ns_head_multipath(ns->head)) nvme_add_ns_cdev(ns); @@ -3773,6 +3810,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, return; + out_cleanup_ns_from_list: + nvme_put_ctrl(ctrl); + down_write(&ctrl->namespaces_rwsem); + list_del_init(&ns->list); + up_write(&ctrl->namespaces_rwsem); out_unlink_ns: mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); @@ -3795,6 +3837,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) return; + clear_bit(NVME_NS_READY, &ns->flags); set_capacity(ns->disk, 0); nvme_fault_inject_fini(&ns->fault_inject); @@ -3802,9 +3845,12 @@ static void nvme_ns_remove(struct nvme_ns *ns) list_del_rcu(&ns->siblings); mutex_unlock(&ns->ctrl->subsys->lock); - synchronize_rcu(); /* guarantee not available in head->list */ - nvme_mpath_clear_current_path(ns); - synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ + /* guarantee not available in head->list */ + synchronize_rcu(); + + /* wait for concurrent submissions */ + if (nvme_mpath_clear_current_path(ns)) + synchronize_srcu(&ns->head->srcu); if (!nvme_ns_head_multipath(ns->head)) nvme_cdev_del(&ns->cdev, &ns->cdev_device); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 37ce3e8b1db2..5d7bc58a27bd 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -147,6 +147,21 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) mutex_unlock(&ctrl->scan_lock); } +void nvme_mpath_revalidate_paths(struct nvme_ns *ns) +{ + struct nvme_ns_head *head = ns->head; + sector_t capacity = get_capacity(head->disk); + int node; + + list_for_each_entry_rcu(ns, &head->list, siblings) { + if (capacity != get_capacity(ns->disk)) + clear_bit(NVME_NS_READY, &ns->flags); + } + + for_each_node(node) + rcu_assign_pointer(head->current_path[node], NULL); +} + static bool nvme_path_is_disabled(struct nvme_ns *ns) { /* @@ -158,7 +173,7 @@ static bool nvme_path_is_disabled(struct nvme_ns *ns) ns->ctrl->state != NVME_CTRL_DELETING) return true; if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) || - test_bit(NVME_NS_REMOVING, &ns->flags)) + !test_bit(NVME_NS_READY, &ns->flags)) return true; return false; } @@ -465,6 +480,8 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) ctrl->subsys->instance, head->instance); blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue); + blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue); + /* set to a default value of 512 until the disk is validated */ blk_queue_logical_block_size(head->disk->queue, 512); blk_set_stacking_limits(&head->disk->queue->limits); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a2e1f298b217..9871c0c9374c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -456,6 +456,7 @@ struct nvme_ns { #define NVME_NS_DEAD 1 #define NVME_NS_ANA_PENDING 2 #define NVME_NS_FORCE_RO 3 +#define NVME_NS_READY 4 struct cdev cdev; struct device cdev_device; @@ -748,6 +749,7 @@ void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl); void nvme_mpath_uninit(struct nvme_ctrl *ctrl); void nvme_mpath_stop(struct nvme_ctrl *ctrl); bool nvme_mpath_clear_current_path(struct nvme_ns *ns); +void nvme_mpath_revalidate_paths(struct nvme_ns *ns); void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl); void nvme_mpath_shutdown_disk(struct nvme_ns_head *head); @@ -795,6 +797,9 @@ static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns) { return false; } +static inline void nvme_mpath_revalidate_paths(struct nvme_ns *ns) +{ +} static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) { } @@ -887,4 +892,9 @@ struct nvme_ctrl *nvme_ctrl_from_file(struct file *file); struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid); void nvme_put_ns(struct nvme_ns *ns); +static inline bool nvme_multi_css(struct nvme_ctrl *ctrl) +{ + return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI; +} + #endif /* _NVME_H */ diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 645025620154..e2ab12f3f51c 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -45,6 +45,7 @@ struct nvme_tcp_request { u32 pdu_len; u32 pdu_sent; u16 ttag; + __le16 status; struct list_head entry; struct llist_node lentry; __le32 ddgst; @@ -485,6 +486,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl) static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, struct nvme_completion *cqe) { + struct nvme_tcp_request *req; struct request *rq; rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id); @@ -496,7 +498,11 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, return -EINVAL; } - if (!nvme_try_complete_req(rq, cqe->status, cqe->result)) + req = blk_mq_rq_to_pdu(rq); + if (req->status == cpu_to_le16(NVME_SC_SUCCESS)) + req->status = cqe->status; + + if (!nvme_try_complete_req(rq, req->status, cqe->result)) nvme_complete_rq(rq); queue->nr_cqe++; @@ -758,7 +764,8 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; } else { if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { - nvme_tcp_end_request(rq, NVME_SC_SUCCESS); + nvme_tcp_end_request(rq, + le16_to_cpu(req->status)); queue->nr_cqe++; } nvme_tcp_init_recv_ctx(queue); @@ -788,18 +795,24 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, return 0; if (queue->recv_ddgst != queue->exp_ddgst) { + struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue), + pdu->command_id); + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + + req->status = cpu_to_le16(NVME_SC_DATA_XFER_ERROR); + dev_err(queue->ctrl->ctrl.device, "data digest error: recv %#x expected %#x\n", le32_to_cpu(queue->recv_ddgst), le32_to_cpu(queue->exp_ddgst)); - return -EIO; } if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); - nvme_tcp_end_request(rq, NVME_SC_SUCCESS); + nvme_tcp_end_request(rq, le16_to_cpu(req->status)); queue->nr_cqe++; } @@ -2293,6 +2306,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, return ret; req->state = NVME_TCP_SEND_CMD_PDU; + req->status = cpu_to_le16(NVME_SC_SUCCESS); req->offset = 0; req->data_sent = 0; req->pdu_len = 0; diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 0cb98f2bbc8c..aa6d84d8848e 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -1015,7 +1015,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) if (unlikely(ret)) return ret; - if (nvmet_req_passthru_ctrl(req)) + if (nvmet_is_passthru_req(req)) return nvmet_parse_passthru_admin_cmd(req); switch (cmd->common.opcode) { diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 273555127188..d784f3c200b4 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1028,7 +1028,7 @@ nvmet_subsys_attr_version_store_locked(struct nvmet_subsys *subsys, } /* passthru subsystems use the underlying controller's version */ - if (nvmet_passthru_ctrl(subsys)) + if (nvmet_is_passthru_subsys(subsys)) return -EINVAL; ret = sscanf(page, "%d.%d.%d\n", &major, &minor, &tertiary); @@ -1067,7 +1067,8 @@ static ssize_t nvmet_subsys_attr_serial_show(struct config_item *item, { struct nvmet_subsys *subsys = to_subsys(item); - return snprintf(page, PAGE_SIZE, "%s\n", subsys->serial); + return snprintf(page, PAGE_SIZE, "%*s\n", + NVMET_SN_MAX_SIZE, subsys->serial); } static ssize_t diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 66d05eecc2a9..b8425fa34300 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -553,7 +553,7 @@ int nvmet_ns_enable(struct nvmet_ns *ns) mutex_lock(&subsys->lock); ret = 0; - if (nvmet_passthru_ctrl(subsys)) { + if (nvmet_is_passthru_subsys(subsys)) { pr_info("cannot enable both passthru and regular namespaces for a single subsystem"); goto out_unlock; } @@ -869,7 +869,7 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) if (unlikely(ret)) return ret; - if (nvmet_req_passthru_ctrl(req)) + if (nvmet_is_passthru_req(req)) return nvmet_parse_passthru_io_cmd(req); ret = nvmet_req_find_ns(req); @@ -1206,6 +1206,9 @@ static void nvmet_init_cap(struct nvmet_ctrl *ctrl) ctrl->cap |= (15ULL << 24); /* maximum queue entries supported: */ ctrl->cap |= NVMET_QUEUE_SIZE - 1; + + if (nvmet_is_passthru_subsys(ctrl->subsys)) + nvmet_passthrough_override_cap(ctrl); } struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn, @@ -1363,8 +1366,6 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, goto out_put_subsystem; mutex_init(&ctrl->lock); - nvmet_init_cap(ctrl); - ctrl->port = req->port; INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work); @@ -1378,6 +1379,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, kref_init(&ctrl->ref); ctrl->subsys = subsys; + nvmet_init_cap(ctrl); WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL); ctrl->changed_ns_list = kmalloc_array(NVME_MAX_CHANGED_NAMESPACES, diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 06dd3d537f07..7143c7fa7464 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -582,7 +582,7 @@ int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys); void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys); u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req); u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req); -static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys) +static inline bool nvmet_is_passthru_subsys(struct nvmet_subsys *subsys) { return subsys->passthru_ctrl; } @@ -601,18 +601,19 @@ static inline u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req) { return 0; } -static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys) +static inline bool nvmet_is_passthru_subsys(struct nvmet_subsys *subsys) { return NULL; } #endif /* CONFIG_NVME_TARGET_PASSTHRU */ -static inline struct nvme_ctrl * -nvmet_req_passthru_ctrl(struct nvmet_req *req) +static inline bool nvmet_is_passthru_req(struct nvmet_req *req) { - return nvmet_passthru_ctrl(nvmet_req_subsys(req)); + return nvmet_is_passthru_subsys(nvmet_req_subsys(req)); } +void nvmet_passthrough_override_cap(struct nvmet_ctrl *ctrl); + u16 errno_to_nvme_status(struct nvmet_req *req, int errno); u16 nvmet_report_invalid_opcode(struct nvmet_req *req); diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 225cd1ffbe45..f0efb3537989 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -20,6 +20,16 @@ MODULE_IMPORT_NS(NVME_TARGET_PASSTHRU); */ static DEFINE_XARRAY(passthru_subsystems); +void nvmet_passthrough_override_cap(struct nvmet_ctrl *ctrl) +{ + /* + * Multiple command set support can only be declared if the underlying + * controller actually supports it. + */ + if (!nvme_multi_css(ctrl->subsys->passthru_ctrl)) + ctrl->cap &= ~(1ULL << 43); +} + static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; @@ -218,7 +228,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) static void nvmet_passthru_execute_cmd(struct nvmet_req *req) { - struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req); + struct nvme_ctrl *ctrl = nvmet_req_subsys(req)->passthru_ctrl; struct request_queue *q = ctrl->admin_q; struct nvme_ns *ns = NULL; struct request *rq = NULL; @@ -299,7 +309,7 @@ out: */ static void nvmet_passthru_set_host_behaviour(struct nvmet_req *req) { - struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req); + struct nvme_ctrl *ctrl = nvmet_req_subsys(req)->passthru_ctrl; struct nvme_feat_host_behavior *host; u16 status = NVME_SC_INTERNAL; int ret; diff --git a/fs/Makefile b/fs/Makefile index 2f21300851ae..9e712aa5e254 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -17,7 +17,7 @@ obj-y := open.o read_write.o file_table.o super.o \ kernel_read_file.o remap_range.o ifeq ($(CONFIG_BLOCK),y) -obj-y += buffer.o block_dev.o direct-io.o mpage.o +obj-y += buffer.o direct-io.o mpage.o else obj-y += no-block.o endif diff --git a/fs/internal.h b/fs/internal.h index 68a2ae029a27..3cd065c8a66b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -18,7 +18,7 @@ struct user_namespace; struct pipe_inode_info; /* - * block_dev.c + * block/bdev.c */ #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); |