Diffstat (limited to 'block/io.c')
-rw-r--r-- | block/io.c | 1634
1 file changed, 780 insertions, 854 deletions
diff --git a/block/io.c b/block/io.c index a7dbf85b19..420944d80d 100644 --- a/block/io.c +++ b/block/io.c @@ -27,118 +27,54 @@ #include "sysemu/block-backend.h" #include "block/blockjob.h" #include "block/block_int.h" -#include "block/throttle-groups.h" #include "qemu/cutils.h" #include "qapi/error.h" #include "qemu/error-report.h" #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write); +static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child, + int64_t offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write); static void coroutine_fn bdrv_co_do_rw(void *opaque); -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); +static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, BdrvRequestFlags flags); -/* throttling disk I/O limits */ -void bdrv_set_io_limits(BlockDriverState *bs, - ThrottleConfig *cfg) +static void bdrv_parent_drained_begin(BlockDriverState *bs) { - int i; - - throttle_group_config(bs, cfg); - - for (i = 0; i < 2; i++) { - qemu_co_enter_next(&bs->throttled_reqs[i]); - } -} - -/* this function drain all the throttled IOs */ -static bool bdrv_start_throttled_reqs(BlockDriverState *bs) -{ - bool drained = false; - bool enabled = bs->io_limits_enabled; - int i; - - bs->io_limits_enabled = false; + BdrvChild *c; - for (i = 0; i < 2; i++) { - while (qemu_co_enter_next(&bs->throttled_reqs[i])) { - drained = true; + QLIST_FOREACH(c, &bs->parents, next_parent) { + if (c->role->drained_begin) { + c->role->drained_begin(c); } } - - bs->io_limits_enabled = enabled; - - return drained; -} - -void bdrv_io_limits_disable(BlockDriverState *bs) -{ - bs->io_limits_enabled = false; - bdrv_start_throttled_reqs(bs); - throttle_group_unregister_bs(bs); -} - -/* should be called before bdrv_set_io_limits if a limit is set */ -void bdrv_io_limits_enable(BlockDriverState *bs, const char *group) -{ - assert(!bs->io_limits_enabled); - throttle_group_register_bs(bs, group); - bs->io_limits_enabled = true; } -void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group) +static void bdrv_parent_drained_end(BlockDriverState *bs) { - /* this bs is not part of any group */ - if (!bs->throttle_state) { - return; - } + BdrvChild *c; - /* this bs is a part of the same group than the one we want */ - if (!g_strcmp0(throttle_group_get_name(bs), group)) { - return; + QLIST_FOREACH(c, &bs->parents, next_parent) { + if (c->role->drained_end) { + c->role->drained_end(c); + } } - - /* need to change the group this bs belong to */ - bdrv_io_limits_disable(bs); - bdrv_io_limits_enable(bs, group); } -void 
bdrv_setup_io_funcs(BlockDriver *bdrv) +static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src) { - /* Block drivers without coroutine functions need emulation */ - if (!bdrv->bdrv_co_readv) { - bdrv->bdrv_co_readv = bdrv_co_readv_em; - bdrv->bdrv_co_writev = bdrv_co_writev_em; - - /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if - * the block driver lacks aio we need to emulate that too. - */ - if (!bdrv->bdrv_aio_readv) { - /* add AIO emulation layer */ - bdrv->bdrv_aio_readv = bdrv_aio_readv_em; - bdrv->bdrv_aio_writev = bdrv_aio_writev_em; - } - } + dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer); + dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer); + dst->opt_mem_alignment = MAX(dst->opt_mem_alignment, + src->opt_mem_alignment); + dst->min_mem_alignment = MAX(dst->min_mem_alignment, + src->min_mem_alignment); + dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov); } void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) @@ -152,6 +88,9 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) return; } + /* Default alignment based on whether driver has byte interface */ + bs->bl.request_alignment = drv->bdrv_co_preadv ? 1 : 512; + /* Take some limits from the children as a default */ if (bs->file) { bdrv_refresh_limits(bs->file->bs, &local_err); @@ -159,11 +98,7 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) error_propagate(errp, local_err); return; } - bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length; - bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length; - bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment; - bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment; - bs->bl.max_iov = bs->file->bs->bl.max_iov; + bdrv_merge_limits(&bs->bl, &bs->file->bs->bl); } else { bs->bl.min_mem_alignment = 512; bs->bl.opt_mem_alignment = getpagesize(); @@ -178,21 +113,7 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) error_propagate(errp, local_err); return; } - bs->bl.opt_transfer_length = - MAX(bs->bl.opt_transfer_length, - bs->backing->bs->bl.opt_transfer_length); - bs->bl.max_transfer_length = - MIN_NON_ZERO(bs->bl.max_transfer_length, - bs->backing->bs->bl.max_transfer_length); - bs->bl.opt_mem_alignment = - MAX(bs->bl.opt_mem_alignment, - bs->backing->bs->bl.opt_mem_alignment); - bs->bl.min_mem_alignment = - MAX(bs->bl.min_mem_alignment, - bs->backing->bs->bl.min_mem_alignment); - bs->bl.max_iov = - MIN(bs->bl.max_iov, - bs->backing->bs->bl.max_iov); + bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl); } /* Then let the driver override it */ @@ -225,12 +146,6 @@ bool bdrv_requests_pending(BlockDriverState *bs) if (!QLIST_EMPTY(&bs->tracked_requests)) { return true; } - if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { - return true; - } - if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { - return true; - } QLIST_FOREACH(child, &bs->children, next) { if (bdrv_requests_pending(child->bs)) { @@ -260,18 +175,29 @@ typedef struct { bool done; } BdrvCoDrainData; +static void bdrv_drain_poll(BlockDriverState *bs) +{ + bool busy = true; + + while (busy) { + /* Keep iterating */ + busy = bdrv_requests_pending(bs); + busy |= aio_poll(bdrv_get_aio_context(bs), busy); + } +} + static void bdrv_co_drain_bh_cb(void *opaque) { BdrvCoDrainData *data = opaque; Coroutine *co = data->co; qemu_bh_delete(data->bh); - bdrv_drain(data->bs); + bdrv_drain_poll(data->bs); data->done = true; - qemu_coroutine_enter(co, NULL); + 
qemu_coroutine_enter(co); } -void coroutine_fn bdrv_co_drain(BlockDriverState *bs) +static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs) { BdrvCoDrainData data; @@ -294,6 +220,34 @@ void coroutine_fn bdrv_co_drain(BlockDriverState *bs) assert(data.done); } +void bdrv_drained_begin(BlockDriverState *bs) +{ + if (!bs->quiesce_counter++) { + aio_disable_external(bdrv_get_aio_context(bs)); + bdrv_parent_drained_begin(bs); + } + + bdrv_io_unplugged_begin(bs); + bdrv_drain_recurse(bs); + if (qemu_in_coroutine()) { + bdrv_co_yield_to_drain(bs); + } else { + bdrv_drain_poll(bs); + } + bdrv_io_unplugged_end(bs); +} + +void bdrv_drained_end(BlockDriverState *bs) +{ + assert(bs->quiesce_counter > 0); + if (--bs->quiesce_counter > 0) { + return; + } + + bdrv_parent_drained_end(bs); + aio_enable_external(bdrv_get_aio_context(bs)); +} + /* * Wait for pending requests to complete on a single BlockDriverState subtree, * and suspend block driver's internal I/O until next request arrives. @@ -305,21 +259,17 @@ void coroutine_fn bdrv_co_drain(BlockDriverState *bs) * not depend on events in other AioContexts. In that case, use * bdrv_drain_all() instead. */ -void bdrv_drain(BlockDriverState *bs) +void coroutine_fn bdrv_co_drain(BlockDriverState *bs) { - bool busy = true; + assert(qemu_in_coroutine()); + bdrv_drained_begin(bs); + bdrv_drained_end(bs); +} - bdrv_drain_recurse(bs); - if (qemu_in_coroutine()) { - bdrv_co_drain(bs); - return; - } - while (busy) { - /* Keep iterating */ - bdrv_flush_io_queue(bs); - busy = bdrv_requests_pending(bs); - busy |= aio_poll(bdrv_get_aio_context(bs), busy); - } +void bdrv_drain(BlockDriverState *bs) +{ + bdrv_drained_begin(bs); + bdrv_drained_end(bs); } /* @@ -332,16 +282,25 @@ void bdrv_drain_all(void) { /* Always run first iteration so any pending completion BHs run */ bool busy = true; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator it; + BlockJob *job = NULL; GSList *aio_ctxs = NULL, *ctx; - while ((bs = bdrv_next(bs))) { + while ((job = block_job_next(job))) { + AioContext *aio_context = blk_get_aio_context(job->blk); + + aio_context_acquire(aio_context); + block_job_pause(job); + aio_context_release(aio_context); + } + + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); - if (bs->job) { - block_job_pause(bs->job); - } + bdrv_parent_drained_begin(bs); + bdrv_io_unplugged_begin(bs); bdrv_drain_recurse(bs); aio_context_release(aio_context); @@ -361,12 +320,10 @@ void bdrv_drain_all(void) for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { AioContext *aio_context = ctx->data; - bs = NULL; aio_context_acquire(aio_context); - while ((bs = bdrv_next(bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { if (aio_context == bdrv_get_aio_context(bs)) { - bdrv_flush_io_queue(bs); if (bdrv_requests_pending(bs)) { busy = true; aio_poll(aio_context, busy); @@ -378,17 +335,24 @@ void bdrv_drain_all(void) } } - bs = NULL; - while ((bs = bdrv_next(bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); - if (bs->job) { - block_job_resume(bs->job); - } + bdrv_io_unplugged_end(bs); + bdrv_parent_drained_end(bs); aio_context_release(aio_context); } g_slist_free(aio_ctxs); + + job = NULL; + while ((job = block_job_next(job))) { + AioContext *aio_context = blk_get_aio_context(job->blk); + + aio_context_acquire(aio_context); + block_job_resume(job); + 
aio_context_release(aio_context); + } } /** @@ -447,12 +411,12 @@ static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) } /** - * Round a region to cluster boundaries + * Round a region to cluster boundaries (sector-based) */ -void bdrv_round_to_clusters(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - int64_t *cluster_sector_num, - int *cluster_nb_sectors) +void bdrv_round_sectors_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors) { BlockDriverInfo bdi; @@ -467,6 +431,26 @@ void bdrv_round_to_clusters(BlockDriverState *bs, } } +/** + * Round a region to cluster boundaries + */ +void bdrv_round_to_clusters(BlockDriverState *bs, + int64_t offset, unsigned int bytes, + int64_t *cluster_offset, + unsigned int *cluster_bytes) +{ + BlockDriverInfo bdi; + + if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { + *cluster_offset = offset; + *cluster_bytes = bytes; + } else { + int64_t c = bdi.cluster_size; + *cluster_offset = QEMU_ALIGN_DOWN(offset, c); + *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c); + } +} + static int bdrv_get_cluster_size(BlockDriverState *bs) { BlockDriverInfo bdi; @@ -474,7 +458,7 @@ static int bdrv_get_cluster_size(BlockDriverState *bs) ret = bdrv_get_info(bs, &bdi); if (ret < 0 || bdi.cluster_size == 0) { - return bs->request_alignment; + return bs->bl.request_alignment; } else { return bdi.cluster_size; } @@ -568,7 +552,7 @@ static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, } typedef struct RwCo { - BlockDriverState *bs; + BdrvChild *child; int64_t offset; QEMUIOVector *qiov; bool is_write; @@ -581,26 +565,26 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque) RwCo *rwco = opaque; if (!rwco->is_write) { - rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); + rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); } else { - rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); + rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); } } /* * Process a vectored synchronous request using coroutines */ -static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, +static int bdrv_prwv_co(BdrvChild *child, int64_t offset, QEMUIOVector *qiov, bool is_write, BdrvRequestFlags flags) { Coroutine *co; RwCo rwco = { - .bs = bs, + .child = child, .offset = offset, .qiov = qiov, .is_write = is_write, @@ -608,25 +592,14 @@ static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, .flags = flags, }; - /** - * In sync call context, when the vcpu is blocked, this throttling timer - * will not fire; so the I/O throttling function has to be disabled here - * if it has been enabled. 
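The per-field limit copying removed in the hunks above is centralized in the new bdrv_merge_limits() helper: preferences and alignments take the maximum of parent and child, while hard limits take the minimum, with zero meaning "unset". A standalone sketch of those semantics (the struct and sample values are illustrative; MIN_NON_ZERO matches the definition in qemu/osdep.h):

    #include <inttypes.h>
    #include <stdio.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))
    #define MIN(a, b) ((a) < (b) ? (a) : (b))
    /* As in qemu/osdep.h: a value of 0 means "no limit set" */
    #define MIN_NON_ZERO(a, b) \
        ((a) == 0 ? (b) : ((b) == 0 ? (a) : (MIN((a), (b)))))

    typedef struct {
        uint32_t opt_transfer;  /* preference: merged with MAX */
        uint32_t max_transfer;  /* hard limit: merged with MIN_NON_ZERO */
        int max_iov;            /* hard limit: merged with MIN_NON_ZERO */
    } Limits;

    /* Mirrors the shape of the new bdrv_merge_limits() */
    static void merge_limits(Limits *dst, const Limits *src)
    {
        dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
        dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
        dst->max_iov      = MIN_NON_ZERO(dst->max_iov, src->max_iov);
    }

    int main(void)
    {
        Limits top   = { .opt_transfer = 4096,  .max_transfer = 0,
                         .max_iov = 1024 };
        Limits child = { .opt_transfer = 65536, .max_transfer = 8 << 20,
                         .max_iov = 0 };

        merge_limits(&top, &child);
        printf("opt=%" PRIu32 " max=%" PRIu32 " iov=%d\n",
               top.opt_transfer, top.max_transfer, top.max_iov);
        /* prints: opt=65536 max=8388608 iov=1024 */
        return 0;
    }

Zero acting as "unlimited" is why a plain MIN would be wrong here: a parent with no stated max_transfer must not clamp the child to zero.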
- */ - if (bs->io_limits_enabled) { - fprintf(stderr, "Disabling I/O throttling on '%s' due " - "to synchronous I/O.\n", bdrv_get_device_name(bs)); - bdrv_io_limits_disable(bs); - } - if (qemu_in_coroutine()) { /* Fast-path if already in coroutine context */ bdrv_rw_co_entry(&rwco); } else { - AioContext *aio_context = bdrv_get_aio_context(bs); + AioContext *aio_context = bdrv_get_aio_context(child->bs); - co = qemu_coroutine_create(bdrv_rw_co_entry); - qemu_coroutine_enter(co, &rwco); + co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco); + qemu_coroutine_enter(co); while (rwco.ret == NOT_DONE) { aio_poll(aio_context, true); } @@ -637,7 +610,7 @@ static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, /* * Process a synchronous request using coroutines */ -static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, +static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf, int nb_sectors, bool is_write, BdrvRequestFlags flags) { QEMUIOVector qiov; @@ -651,15 +624,15 @@ static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, } qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, + return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS, &qiov, is_write, flags); } /* return < 0 if error. See bdrv_write() for the return codes */ -int bdrv_read(BlockDriverState *bs, int64_t sector_num, +int bdrv_read(BdrvChild *child, int64_t sector_num, uint8_t *buf, int nb_sectors) { - return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); + return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0); } /* Return < 0 if error. Important errors are: @@ -668,30 +641,39 @@ int bdrv_read(BlockDriverState *bs, int64_t sector_num, -EINVAL Invalid sector number or nb_sectors -EACCES Trying to write a read-only device */ -int bdrv_write(BlockDriverState *bs, int64_t sector_num, +int bdrv_write(BdrvChild *child, int64_t sector_num, const uint8_t *buf, int nb_sectors) { - return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); + return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0); } -int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) +int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, + int count, BdrvRequestFlags flags) { - return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, - BDRV_REQ_ZERO_WRITE | flags); + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = NULL, + .iov_len = count, + }; + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_prwv_co(child, offset, &qiov, true, + BDRV_REQ_ZERO_WRITE | flags); } /* - * Completely zero out a block device with the help of bdrv_write_zeroes. + * Completely zero out a block device with the help of bdrv_pwrite_zeroes. * The operation is sped up by checking the block status and only writing * zeroes to the device if they currently do not return zeroes. Optional - * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). + * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP, + * BDRV_REQ_FUA). * * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 
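bdrv_make_zero() keeps its sector-based status loop but now converts to byte arguments at the bdrv_pwrite_zeroes() call site. A minimal sketch of that shift-based conversion and the per-iteration cap (the zeroing call is a stub and the image size is illustrative):

    #include <limits.h>
    #include <inttypes.h>
    #include <stdio.h>

    #define BDRV_SECTOR_BITS 9
    /* QEMU also caps this by SIZE_MAX >> BDRV_SECTOR_BITS;
     * the two are equal on 64-bit hosts */
    #define BDRV_REQUEST_MAX_SECTORS (INT_MAX >> BDRV_SECTOR_BITS)

    /* Stub standing in for bdrv_pwrite_zeroes(child, offset, count, flags) */
    static int pwrite_zeroes(int64_t offset, int count)
    {
        printf("zero %d bytes at offset %" PRId64 "\n", count, offset);
        return 0;
    }

    int main(void)
    {
        int64_t target_sectors = 5000000;  /* illustrative image size */
        int64_t sector_num = 0;

        while (sector_num < target_sectors) {
            int64_t left = target_sectors - sector_num;
            int n = left < BDRV_REQUEST_MAX_SECTORS ? (int)left
                                                    : BDRV_REQUEST_MAX_SECTORS;

            /* The patch's conversion: sectors << BDRV_SECTOR_BITS is bytes */
            pwrite_zeroes(sector_num << BDRV_SECTOR_BITS,
                          n << BDRV_SECTOR_BITS);
            sector_num += n;
        }
        return 0;
    }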
*/ -int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) +int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags) { int64_t target_sectors, ret, nb_sectors, sector_num = 0; + BlockDriverState *bs = child->bs; BlockDriverState *file; int n; @@ -715,7 +697,8 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) sector_num += n; continue; } - ret = bdrv_write_zeroes(bs, sector_num, n, flags); + ret = bdrv_pwrite_zeroes(child, sector_num << BDRV_SECTOR_BITS, + n << BDRV_SECTOR_BITS, flags); if (ret < 0) { error_report("error writing zeroes at sector %" PRId64 ": %s", sector_num, strerror(-ret)); @@ -725,33 +708,39 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) } } -int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) +int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov) +{ + int ret; + + ret = bdrv_prwv_co(child, offset, qiov, false, 0); + if (ret < 0) { + return ret; + } + + return qiov->size; +} + +int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes) { QEMUIOVector qiov; struct iovec iov = { .iov_base = (void *)buf, .iov_len = bytes, }; - int ret; if (bytes < 0) { return -EINVAL; } qemu_iovec_init_external(&qiov, &iov, 1); - ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); - if (ret < 0) { - return ret; - } - - return bytes; + return bdrv_preadv(child, offset, &qiov); } -int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) +int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov) { int ret; - ret = bdrv_prwv_co(bs, offset, qiov, true, 0); + ret = bdrv_prwv_co(child, offset, qiov, true, 0); if (ret < 0) { return ret; } @@ -759,8 +748,7 @@ int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) return qiov->size; } -int bdrv_pwrite(BlockDriverState *bs, int64_t offset, - const void *buf, int bytes) +int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes) { QEMUIOVector qiov; struct iovec iov = { @@ -773,7 +761,7 @@ int bdrv_pwrite(BlockDriverState *bs, int64_t offset, } qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_pwritev(bs, offset, &qiov); + return bdrv_pwritev(child, offset, &qiov); } /* @@ -782,17 +770,17 @@ int bdrv_pwrite(BlockDriverState *bs, int64_t offset, * * Returns 0 on success, -errno in error cases. 
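bdrv_pread() and bdrv_pwrite() are reduced to thin wrappers that pack the caller's buffer into a one-element iovec and defer to the vectored functions. The same wrapping pattern, shown here against the POSIX preadv()/pwritev() syscalls rather than QEMU's internals (the temp-file path and payload are illustrative; error handling is trimmed for brevity):

    #define _GNU_SOURCE      /* for preadv/pwritev on glibc */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/uio.h>

    /* Pack (buf, bytes) into a one-element iovec, as bdrv_pread() now does
     * with qemu_iovec_init_external(), then call the vectored primitive. */
    static ssize_t pread_wrapped(int fd, void *buf, size_t bytes, off_t offset)
    {
        struct iovec iov = { .iov_base = buf, .iov_len = bytes };
        return preadv(fd, &iov, 1, offset);
    }

    int main(void)
    {
        char out[] = "some payload";
        char in[sizeof(out)] = "";
        int fd = open("/tmp/wrap-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

        struct iovec iov = { .iov_base = out, .iov_len = sizeof(out) };
        pwritev(fd, &iov, 1, 0);               /* vectored write at offset 0 */
        pread_wrapped(fd, in, sizeof(in), 0);  /* wrapped vectored read */
        printf("%s\n", in);

        close(fd);
        return 0;
    }

Keeping one vectored code path and wrapping the flat-buffer entry points on top of it is the design choice the patch applies throughout this file.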
*/ -int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, - const void *buf, int count) +int bdrv_pwrite_sync(BdrvChild *child, int64_t offset, + const void *buf, int count) { int ret; - ret = bdrv_pwrite(bs, offset, buf, count); + ret = bdrv_pwrite(child, offset, buf, count); if (ret < 0) { return ret; } - ret = bdrv_flush(bs); + ret = bdrv_flush(child->bs); if (ret < 0) { return ret; } @@ -800,8 +788,117 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, return 0; } +typedef struct CoroutineIOCompletion { + Coroutine *coroutine; + int ret; +} CoroutineIOCompletion; + +static void bdrv_co_io_em_complete(void *opaque, int ret) +{ + CoroutineIOCompletion *co = opaque; + + co->ret = ret; + qemu_coroutine_enter(co->coroutine); +} + +static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + int64_t sector_num; + unsigned int nb_sectors; + + assert(!(flags & ~BDRV_REQ_MASK)); + + if (drv->bdrv_co_preadv) { + return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags); + } + + sector_num = offset >> BDRV_SECTOR_BITS; + nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS); + + if (drv->bdrv_co_readv) { + return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, + bdrv_co_io_em_complete, &co); + if (acb == NULL) { + return -EIO; + } else { + qemu_coroutine_yield(); + return co.ret; + } + } +} + +static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + int64_t sector_num; + unsigned int nb_sectors; + int ret; + + assert(!(flags & ~BDRV_REQ_MASK)); + + if (drv->bdrv_co_pwritev) { + ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, + flags & bs->supported_write_flags); + flags &= ~bs->supported_write_flags; + goto emulate_flags; + } + + sector_num = offset >> BDRV_SECTOR_BITS; + nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS); + + if (drv->bdrv_co_writev_flags) { + ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov, + flags & bs->supported_write_flags); + flags &= ~bs->supported_write_flags; + } else if (drv->bdrv_co_writev) { + assert(!bs->supported_write_flags); + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); + } else { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, + bdrv_co_io_em_complete, &co); + if (acb == NULL) { + ret = -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } + +emulate_flags: + if (ret == 0 && (flags & BDRV_REQ_FUA)) { + ret = bdrv_co_flush(bs); + } + + return ret; +} + static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) + int64_t offset, unsigned int bytes, QEMUIOVector *qiov) { /* Perform I/O through a temporary buffer so that users who scribble over * their read buffer while the operation is in progress do not end up @@ -813,21 
+910,20 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, BlockDriver *drv = bs->drv; struct iovec iov; QEMUIOVector bounce_qiov; - int64_t cluster_sector_num; - int cluster_nb_sectors; + int64_t cluster_offset; + unsigned int cluster_bytes; size_t skip_bytes; int ret; /* Cover entire cluster so no additional backing file I/O is required when * allocating cluster in the image file. */ - bdrv_round_to_clusters(bs, sector_num, nb_sectors, - &cluster_sector_num, &cluster_nb_sectors); + bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes); - trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, - cluster_sector_num, cluster_nb_sectors); + trace_bdrv_co_do_copy_on_readv(bs, offset, bytes, + cluster_offset, cluster_bytes); - iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; + iov.iov_len = cluster_bytes; iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); if (bounce_buffer == NULL) { ret = -ENOMEM; @@ -836,22 +932,24 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, qemu_iovec_init_external(&bounce_qiov, &iov, 1); - ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); + ret = bdrv_driver_preadv(bs, cluster_offset, cluster_bytes, + &bounce_qiov, 0); if (ret < 0) { goto err; } - if (drv->bdrv_co_write_zeroes && + if (drv->bdrv_co_pwrite_zeroes && buffer_is_zero(bounce_buffer, iov.iov_len)) { - ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, - cluster_nb_sectors, 0); + /* FIXME: Should we (perhaps conditionally) be setting + * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy + * that still correctly reads as zero? */ + ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, cluster_bytes, 0); } else { /* This does not change the data on the disk, it is not necessary * to flush even in cache=writethrough mode. */ - ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); + ret = bdrv_driver_pwritev(bs, cluster_offset, cluster_bytes, + &bounce_qiov, 0); } if (ret < 0) { @@ -862,9 +960,8 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, goto err; } - skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; - qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, - nb_sectors * BDRV_SECTOR_SIZE); + skip_bytes = offset - cluster_offset; + qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, bytes); err: qemu_vfree(bounce_buffer); @@ -873,23 +970,31 @@ err: /* * Forwards an already correctly aligned request to the BlockDriver. This - * handles copy on read and zeroing after EOF; any other features must be - * implemented by the caller. + * handles copy on read, zeroing after EOF, and fragmentation of large + * reads; any other features must be implemented by the caller. 
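The copy-on-read path above now works in bytes: it widens the request to cluster boundaries, reads the whole cluster into a bounce buffer, and copies back only the window the guest asked for (skip_bytes = offset - cluster_offset). A toy version of that windowing, with the actual read replaced by a fill pattern (sizes and offsets are illustrative; the alignment macros match qemu/osdep.h):

    #include <inttypes.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define QEMU_ALIGN_DOWN(n, m) ((n) / (m) * (m))
    #define QEMU_ALIGN_UP(n, m)   QEMU_ALIGN_DOWN((n) + (m) - 1, (m))

    int main(void)
    {
        const int64_t cluster_size = 65536;
        int64_t offset = 71680;          /* guest asks for 512 bytes here */
        unsigned int bytes = 512;

        /* Widen to whole clusters so the allocating write needs no extra
         * backing-file I/O, as in bdrv_co_do_copy_on_readv() */
        int64_t cluster_offset = QEMU_ALIGN_DOWN(offset, cluster_size);
        unsigned int cluster_bytes =
            QEMU_ALIGN_UP(offset - cluster_offset + bytes, cluster_size);

        unsigned char *bounce_buffer = malloc(cluster_bytes);
        unsigned char *dest = malloc(bytes);
        memset(bounce_buffer, 0xaa, cluster_bytes);  /* stands in for the read */

        /* Copy back only the window the caller asked for */
        size_t skip_bytes = offset - cluster_offset;
        memcpy(dest, bounce_buffer + skip_bytes, bytes);

        printf("cluster [%" PRId64 ", +%u), window: skip=%zu len=%u\n",
               cluster_offset, cluster_bytes, skip_bytes, bytes);

        free(dest);
        free(bounce_buffer);
        return 0;
    }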
*/ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, int64_t align, QEMUIOVector *qiov, int flags) { - BlockDriver *drv = bs->drv; - int ret; - - int64_t sector_num = offset >> BDRV_SECTOR_BITS; - unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + int64_t total_bytes, max_bytes; + int ret = 0; + uint64_t bytes_remaining = bytes; + int max_transfer; - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(is_power_of_2(align)); + assert((offset & (align - 1)) == 0); + assert((bytes & (align - 1)) == 0); assert(!qiov || bytes == qiov->size); assert((bs->open_flags & BDRV_O_NO_IO) == 0); + max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), + align); + + /* TODO: We would need a per-BDS .supported_read_flags and + * potential fallback support, if we ever implement any read flags + * to pass through to drivers. For now, there aren't any + * passthrough flags. */ + assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ))); /* Handle Copy on Read and associated serialisation */ if (flags & BDRV_REQ_COPY_ON_READ) { @@ -906,76 +1011,77 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, } if (flags & BDRV_REQ_COPY_ON_READ) { + int64_t start_sector = offset >> BDRV_SECTOR_BITS; + int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); + unsigned int nb_sectors = end_sector - start_sector; int pnum; - ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); + ret = bdrv_is_allocated(bs, start_sector, nb_sectors, &pnum); if (ret < 0) { goto out; } if (!ret || pnum != nb_sectors) { - ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); + ret = bdrv_co_do_copy_on_readv(bs, offset, bytes, qiov); goto out; } } - /* Forward the request to the BlockDriver */ - if (!bs->zero_beyond_eof) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); - } else { - /* Read zeros after EOF */ - int64_t total_sectors, max_nb_sectors; + /* Forward the request to the BlockDriver, possibly fragmenting it */ + total_bytes = bdrv_getlength(bs); + if (total_bytes < 0) { + ret = total_bytes; + goto out; + } - total_sectors = bdrv_nb_sectors(bs); - if (total_sectors < 0) { - ret = total_sectors; - goto out; - } + max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); + if (bytes <= max_bytes && bytes <= max_transfer) { + ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0); + goto out; + } + + while (bytes_remaining) { + int num; - max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), - align >> BDRV_SECTOR_BITS); - if (nb_sectors < max_nb_sectors) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); - } else if (max_nb_sectors > 0) { + if (max_bytes) { QEMUIOVector local_qiov; + num = MIN(bytes_remaining, MIN(max_bytes, max_transfer)); + assert(num); qemu_iovec_init(&local_qiov, qiov->niov); - qemu_iovec_concat(&local_qiov, qiov, 0, - max_nb_sectors * BDRV_SECTOR_SIZE); - - ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, - &local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num); + ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, + num, &local_qiov, 0); + max_bytes -= num; qemu_iovec_destroy(&local_qiov); } else { - ret = 0; + num = bytes_remaining; + ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0, + bytes_remaining); } - - /* Reading beyond end of file is supposed to produce zeroes */ - if (ret == 0 && total_sectors < sector_num 
+ nb_sectors) { - uint64_t offset = MAX(0, total_sectors - sector_num); - uint64_t bytes = (sector_num + nb_sectors - offset) * - BDRV_SECTOR_SIZE; - qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); + if (ret < 0) { + goto out; } + bytes_remaining -= num; } out: - return ret; + return ret < 0 ? ret : 0; } /* * Handle a read request in coroutine context */ -int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, +int coroutine_fn bdrv_co_preadv(BdrvChild *child, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { + BlockDriverState *bs = child->bs; BlockDriver *drv = bs->drv; BdrvTrackedRequest req; - /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ - uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint64_t align = bs->bl.request_alignment; uint8_t *head_buf = NULL; uint8_t *tail_buf = NULL; QEMUIOVector local_qiov; @@ -996,11 +1102,6 @@ int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, flags |= BDRV_REQ_COPY_ON_READ; } - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - throttle_group_co_io_limits_intercept(bs, bytes, false); - } - /* Align read if necessary by padding qiov */ if (offset & (align - 1)) { head_buf = qemu_blockalign(bs, align); @@ -1041,7 +1142,7 @@ int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, return ret; } -static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, +static int coroutine_fn bdrv_co_do_readv(BdrvChild *child, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, BdrvRequestFlags flags) { @@ -1049,67 +1150,56 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, return -EINVAL; } - return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); + return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); } -int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) +int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) { - trace_bdrv_co_readv(bs, sector_num, nb_sectors); + trace_bdrv_co_readv(child->bs, sector_num, nb_sectors); - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); + return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0); } -int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors); +/* Maximum buffer for write zeroes fallback, in bytes */ +#define MAX_WRITE_ZEROES_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS) - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, - BDRV_REQ_NO_SERIALISING); -} - -int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); - - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, - BDRV_REQ_COPY_ON_READ); -} - -#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 - -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, BdrvRequestFlags flags) { BlockDriver *drv = bs->drv; QEMUIOVector qiov; struct iovec iov = {0}; int ret = 0; + bool need_flush = false; + int head = 0; + int tail = 0; + + int max_write_zeroes = 
MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX); + int alignment = MAX(bs->bl.pwrite_zeroes_alignment, + bs->bl.request_alignment); - int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, - BDRV_REQUEST_MAX_SECTORS); + assert(alignment % bs->bl.request_alignment == 0); + head = offset % alignment; + tail = (offset + count) % alignment; + max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment); + assert(max_write_zeroes >= bs->bl.request_alignment); - while (nb_sectors > 0 && !ret) { - int num = nb_sectors; + while (count > 0 && !ret) { + int num = count; /* Align request. Block drivers can expect the "bulk" of the request - * to be aligned. + * to be aligned, and that unaligned requests do not cross cluster + * boundaries. */ - if (bs->bl.write_zeroes_alignment - && num > bs->bl.write_zeroes_alignment) { - if (sector_num % bs->bl.write_zeroes_alignment != 0) { - /* Make a small request up to the first aligned sector. */ - num = bs->bl.write_zeroes_alignment; - num -= sector_num % bs->bl.write_zeroes_alignment; - } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { - /* Shorten the request to the last aligned sector. num cannot - * underflow because num > bs->bl.write_zeroes_alignment. - */ - num -= (sector_num + num) % bs->bl.write_zeroes_alignment; - } + if (head) { + /* Make a small request up to the first aligned sector. */ + num = MIN(count, alignment - head); + head = 0; + } else if (tail && num > alignment) { + /* Shorten the request to the last aligned sector. */ + num -= tail; } /* limit request size */ @@ -1119,64 +1209,90 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, ret = -ENOTSUP; /* First try the efficient write zeroes operation */ - if (drv->bdrv_co_write_zeroes) { - ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); + if (drv->bdrv_co_pwrite_zeroes) { + ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num, + flags & bs->supported_zero_flags); + if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) && + !(bs->supported_zero_flags & BDRV_REQ_FUA)) { + need_flush = true; + } + } else { + assert(!bs->supported_zero_flags); } if (ret == -ENOTSUP) { /* Fall back to bounce buffer if write zeroes is unsupported */ - int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, + int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_WRITE_ZEROES_BOUNCE_BUFFER); - num = MIN(num, max_xfer_len); - iov.iov_len = num * BDRV_SECTOR_SIZE; + BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE; + + if ((flags & BDRV_REQ_FUA) && + !(bs->supported_write_flags & BDRV_REQ_FUA)) { + /* No need for bdrv_driver_pwrite() to do a fallback + * flush on each chunk; use just one at the end */ + write_flags &= ~BDRV_REQ_FUA; + need_flush = true; + } + num = MIN(num, max_transfer); + iov.iov_len = num; if (iov.iov_base == NULL) { - iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); + iov.iov_base = qemu_try_blockalign(bs, num); if (iov.iov_base == NULL) { ret = -ENOMEM; goto fail; } - memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); + memset(iov.iov_base, 0, num); } qemu_iovec_init_external(&qiov, &iov, 1); - ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); + ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags); /* Keep bounce buffer around if it is big enough for all * all future requests. 
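The loop above fragments a write-zeroes request into an unaligned head, an aligned bulk, and an unaligned tail, so drivers mostly see aligned zeroing that cannot cross cluster boundaries. A standalone sketch of just that splitting (alignment and request values are illustrative; the max_write_zeroes cap and the driver call are omitted):

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
        int alignment = 4096;
        long long offset = 1000;
        int count = 20000;

        int head = offset % alignment;
        int tail = (offset + count) % alignment;

        while (count > 0) {
            int num = count;

            if (head) {
                /* Small request up to the first aligned boundary */
                num = MIN(count, alignment - head);
                head = 0;
            } else if (tail && num > alignment) {
                /* Shorten the request to the last aligned boundary */
                num -= tail;
            }

            printf("zero %d bytes at %lld%s\n", num, offset,
                   num % alignment ? " (unaligned)" : "");
            offset += num;
            count -= num;
        }
        return 0;
    }

For offset 1000 and count 20000 this emits a 3096-byte head, a 16384-byte aligned bulk, and a 520-byte tail.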
*/ - if (num < max_xfer_len) { + if (num < max_transfer) { qemu_vfree(iov.iov_base); iov.iov_base = NULL; } } - sector_num += num; - nb_sectors -= num; + offset += num; + count -= num; } fail: + if (ret == 0 && need_flush) { + ret = bdrv_co_flush(bs); + } qemu_vfree(iov.iov_base); return ret; } /* - * Forwards an already correctly aligned write request to the BlockDriver. + * Forwards an already correctly aligned write request to the BlockDriver, + * after possibly fragmenting it. */ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, - QEMUIOVector *qiov, int flags) + int64_t align, QEMUIOVector *qiov, int flags) { BlockDriver *drv = bs->drv; bool waited; int ret; - int64_t sector_num = offset >> BDRV_SECTOR_BITS; - unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + int64_t start_sector = offset >> BDRV_SECTOR_BITS; + int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); + uint64_t bytes_remaining = bytes; + int max_transfer; - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(is_power_of_2(align)); + assert((offset & (align - 1)) == 0); + assert((bytes & (align - 1)) == 0); assert(!qiov || bytes == qiov->size); assert((bs->open_flags & BDRV_O_NO_IO) == 0); + assert(!(flags & ~BDRV_REQ_MASK)); + max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), + align); waited = wait_serialising_requests(req); assert(!waited || !req->serialising); @@ -1186,7 +1302,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && - !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && + !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && qemu_iovec_is_zero(qiov)) { flags |= BDRV_REQ_ZERO_WRITE; if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { @@ -1198,32 +1314,48 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, /* Do nothing, write notifier decided to fail this request */ } else if (flags & BDRV_REQ_ZERO_WRITE) { bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); - ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); - } else if (drv->bdrv_co_writev_flags) { + ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags); + } else if (bytes <= max_transfer) { bdrv_debug_event(bs, BLKDBG_PWRITEV); - ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov, - flags); + ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags); } else { - assert(drv->supported_write_flags == 0); bdrv_debug_event(bs, BLKDBG_PWRITEV); - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); - } - bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); + while (bytes_remaining) { + int num = MIN(bytes_remaining, max_transfer); + QEMUIOVector local_qiov; + int local_flags = flags; + + assert(num); + if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && + !(bs->supported_write_flags & BDRV_REQ_FUA)) { + /* If FUA is going to be emulated by flush, we only + * need to flush on the last iteration */ + local_flags &= ~BDRV_REQ_FUA; + } + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num); - if (ret == 0 && (flags & BDRV_REQ_FUA) && - !(drv->supported_write_flags & BDRV_REQ_FUA)) - { - ret = bdrv_co_flush(bs); + ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, + num, &local_qiov, 
local_flags); + qemu_iovec_destroy(&local_qiov); + if (ret < 0) { + break; + } + bytes_remaining -= num; + } } + bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); - bdrv_set_dirty(bs, sector_num, nb_sectors); + ++bs->write_gen; + bdrv_set_dirty(bs, start_sector, end_sector - start_sector); if (bs->wr_highest_offset < offset + bytes) { bs->wr_highest_offset = offset + bytes; } if (ret >= 0) { - bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); + bs->total_sectors = MAX(bs->total_sectors, end_sector); + ret = 0; } return ret; @@ -1238,7 +1370,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, uint8_t *buf = NULL; QEMUIOVector local_qiov; struct iovec iov; - uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint64_t align = bs->bl.request_alignment; unsigned int head_padding_bytes, tail_padding_bytes; int ret = 0; @@ -1271,7 +1403,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, memset(buf + head_padding_bytes, 0, zero_bytes); ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, - &local_qiov, + align, &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); if (ret < 0) { goto fail; @@ -1284,7 +1416,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, if (bytes >= align) { /* Write the aligned part in the middle. */ uint64_t aligned_bytes = bytes & ~(align - 1); - ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, + ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, align, NULL, flags); if (ret < 0) { goto fail; @@ -1308,7 +1440,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); memset(buf, 0, bytes); - ret = bdrv_aligned_pwritev(bs, req, offset, align, + ret = bdrv_aligned_pwritev(bs, req, offset, align, align, &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); } fail: @@ -1320,13 +1452,13 @@ fail: /* * Handle a write request in coroutine context */ -int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, +int coroutine_fn bdrv_co_pwritev(BdrvChild *child, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { + BlockDriverState *bs = child->bs; BdrvTrackedRequest req; - /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ - uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint64_t align = bs->bl.request_alignment; uint8_t *head_buf = NULL; uint8_t *tail_buf = NULL; QEMUIOVector local_qiov; @@ -1346,11 +1478,6 @@ int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, return ret; } - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - throttle_group_co_io_limits_intercept(bs, bytes, true); - } - /* * Align write if necessary by performing a read-modify-write cycle. * Pad qiov with the read parts and be sure to have a tracked request not @@ -1392,6 +1519,14 @@ int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, bytes += offset & (align - 1); offset = offset & ~(align - 1); + + /* We have read the tail already if the request is smaller + * than one aligned block. + */ + if (bytes < align) { + qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes); + bytes = align; + } } if ((offset + bytes) & (align - 1)) { @@ -1431,7 +1566,7 @@ int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, bytes = ROUND_UP(bytes, align); } - ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, + ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, align, use_local_qiov ? 
&local_qiov : qiov, flags); @@ -1447,7 +1582,7 @@ out: return ret; } -static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, +static int coroutine_fn bdrv_co_do_writev(BdrvChild *child, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, BdrvRequestFlags flags) { @@ -1455,30 +1590,29 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, return -EINVAL; } - return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); + return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); } -int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, +int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - trace_bdrv_co_writev(bs, sector_num, nb_sectors); + trace_bdrv_co_writev(child->bs, sector_num, nb_sectors); - return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); + return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0); } -int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BdrvRequestFlags flags) +int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, + int count, BdrvRequestFlags flags) { - trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); + trace_bdrv_co_pwrite_zeroes(child->bs, offset, count, flags); - if (!(bs->open_flags & BDRV_O_UNMAP)) { + if (!(child->bs->open_flags & BDRV_O_UNMAP)) { flags &= ~BDRV_REQ_MAY_UNMAP; } - return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, - BDRV_REQ_ZERO_WRITE | flags); + return bdrv_co_pwritev(child, offset, count, NULL, + BDRV_REQ_ZERO_WRITE | flags); } typedef struct BdrvCoGetBlockStatusData { @@ -1663,8 +1797,9 @@ int64_t bdrv_get_block_status_above(BlockDriverState *bs, } else { AioContext *aio_context = bdrv_get_aio_context(bs); - co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry); - qemu_coroutine_enter(co, &data); + co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry, + &data); + qemu_coroutine_enter(co); while (!data.done) { aio_poll(aio_context, true); } @@ -1766,273 +1901,134 @@ int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); } -int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, - int64_t pos, int size) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *) buf, - .iov_len = size, - }; +typedef struct BdrvVmstateCo { + BlockDriverState *bs; + QEMUIOVector *qiov; + int64_t pos; + bool is_read; + int ret; +} BdrvVmstateCo; - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_writev_vmstate(bs, &qiov, pos); -} - -int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) +static int coroutine_fn +bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, + bool is_read) { BlockDriver *drv = bs->drv; if (!drv) { return -ENOMEDIUM; - } else if (drv->bdrv_save_vmstate) { - return drv->bdrv_save_vmstate(bs, qiov, pos); + } else if (drv->bdrv_load_vmstate) { + return is_read ? 
drv->bdrv_load_vmstate(bs, qiov, pos) + : drv->bdrv_save_vmstate(bs, qiov, pos); } else if (bs->file) { - return bdrv_writev_vmstate(bs->file->bs, qiov, pos); + return bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read); } return -ENOTSUP; } -int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, - int64_t pos, int size) -{ - BlockDriver *drv = bs->drv; - if (!drv) - return -ENOMEDIUM; - if (drv->bdrv_load_vmstate) - return drv->bdrv_load_vmstate(bs, buf, pos, size); - if (bs->file) - return bdrv_load_vmstate(bs->file->bs, buf, pos, size); - return -ENOTSUP; -} - -/**************************************************************/ -/* async I/Os */ - -BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, - cb, opaque, false); -} - -BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque) { - trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, - cb, opaque, true); + BdrvVmstateCo *co = opaque; + co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read); } -BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque) +static inline int +bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, + bool is_read) { - trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, - BDRV_REQ_ZERO_WRITE | flags, - cb, opaque, true); -} - - -typedef struct MultiwriteCB { - int error; - int num_requests; - int num_callbacks; - struct { - BlockCompletionFunc *cb; - void *opaque; - QEMUIOVector *free_qiov; - } callbacks[]; -} MultiwriteCB; - -static void multiwrite_user_cb(MultiwriteCB *mcb) -{ - int i; + if (qemu_in_coroutine()) { + return bdrv_co_rw_vmstate(bs, qiov, pos, is_read); + } else { + BdrvVmstateCo data = { + .bs = bs, + .qiov = qiov, + .pos = pos, + .is_read = is_read, + .ret = -EINPROGRESS, + }; + Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data); - for (i = 0; i < mcb->num_callbacks; i++) { - mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); - if (mcb->callbacks[i].free_qiov) { - qemu_iovec_destroy(mcb->callbacks[i].free_qiov); + qemu_coroutine_enter(co); + while (data.ret == -EINPROGRESS) { + aio_poll(bdrv_get_aio_context(bs), true); } - g_free(mcb->callbacks[i].free_qiov); + return data.ret; } } -static void multiwrite_cb(void *opaque, int ret) +int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, + int64_t pos, int size) { - MultiwriteCB *mcb = opaque; + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = size, + }; + int ret; - trace_multiwrite_cb(mcb, ret); + qemu_iovec_init_external(&qiov, &iov, 1); - if (ret < 0 && !mcb->error) { - mcb->error = ret; + ret = bdrv_writev_vmstate(bs, &qiov, pos); + if (ret < 0) { + return ret; } - mcb->num_requests--; - if (mcb->num_requests == 0) { - multiwrite_user_cb(mcb); - g_free(mcb); - } + return size; } -static int multiwrite_req_compare(const void *a, const void *b) +int bdrv_writev_vmstate(BlockDriverState *bs, 
QEMUIOVector *qiov, int64_t pos) { - const BlockRequest *req1 = a, *req2 = b; - - /* - * Note that we can't simply subtract req2->sector from req1->sector - * here as that could overflow the return value. - */ - if (req1->sector > req2->sector) { - return 1; - } else if (req1->sector < req2->sector) { - return -1; - } else { - return 0; - } + return bdrv_rw_vmstate(bs, qiov, pos, false); } -/* - * Takes a bunch of requests and tries to merge them. Returns the number of - * requests that remain after merging. - */ -static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, - int num_reqs, MultiwriteCB *mcb) +int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size) { - int i, outidx; - - // Sort requests by start sector - qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); - - // Check if adjacent requests touch the same clusters. If so, combine them, - // filling up gaps with zero sectors. - outidx = 0; - for (i = 1; i < num_reqs; i++) { - int merge = 0; - int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; - - // Handle exactly sequential writes and overlapping writes. - if (reqs[i].sector <= oldreq_last) { - merge = 1; - } - - if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > - bs->bl.max_iov) { - merge = 0; - } - - if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + - reqs[i].nb_sectors > bs->bl.max_transfer_length) { - merge = 0; - } - - if (merge) { - size_t size; - QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); - qemu_iovec_init(qiov, - reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); - - // Add the first request to the merged one. If the requests are - // overlapping, drop the last sectors of the first request. - size = (reqs[i].sector - reqs[outidx].sector) << 9; - qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); - - // We should need to add any zeros between the two requests - assert (reqs[i].sector <= oldreq_last); - - // Add the second request - qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); - - // Add tail of first request, if necessary - if (qiov->size < reqs[outidx].qiov->size) { - qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, - reqs[outidx].qiov->size - qiov->size); - } - - reqs[outidx].nb_sectors = qiov->size >> 9; - reqs[outidx].qiov = qiov; - - mcb->callbacks[i].free_qiov = reqs[outidx].qiov; - } else { - outidx++; - reqs[outidx].sector = reqs[i].sector; - reqs[outidx].nb_sectors = reqs[i].nb_sectors; - reqs[outidx].qiov = reqs[i].qiov; - } - } + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = buf, + .iov_len = size, + }; + int ret; - if (bs->blk) { - block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE, - num_reqs - outidx - 1); + qemu_iovec_init_external(&qiov, &iov, 1); + ret = bdrv_readv_vmstate(bs, &qiov, pos); + if (ret < 0) { + return ret; } - return outidx + 1; + return size; } -/* - * Submit multiple AIO write requests at once. - * - * On success, the function returns 0 and all requests in the reqs array have - * been submitted. In error case this function returns -1, and any of the - * requests may or may not be submitted yet. In particular, this means that the - * callback will be called for some of the requests, for others it won't. The - * caller must check the error field of the BlockRequest to wait for the right - * callbacks (if error != 0, no callback will be called). - * - * The implementation may modify the contents of the reqs array, e.g. to merge - * requests. 
However, the fields opaque and error are left unmodified as they - * are used to signal failure for a single request to the caller. - */ -int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) +int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) { - MultiwriteCB *mcb; - int i; - - /* don't submit writes if we don't have a medium */ - if (bs->drv == NULL) { - for (i = 0; i < num_reqs; i++) { - reqs[i].error = -ENOMEDIUM; - } - return -1; - } - - if (num_reqs == 0) { - return 0; - } - - // Create MultiwriteCB structure - mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); - mcb->num_requests = 0; - mcb->num_callbacks = num_reqs; + return bdrv_rw_vmstate(bs, qiov, pos, true); +} - for (i = 0; i < num_reqs; i++) { - mcb->callbacks[i].cb = reqs[i].cb; - mcb->callbacks[i].opaque = reqs[i].opaque; - } +/**************************************************************/ +/* async I/Os */ - // Check for mergable requests - num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); +BlockAIOCB *bdrv_aio_readv(BdrvChild *child, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_readv(child->bs, sector_num, nb_sectors, opaque); - trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); + assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size); + return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov, + 0, cb, opaque, false); +} - /* Run the aio requests. */ - mcb->num_requests = num_reqs; - for (i = 0; i < num_reqs; i++) { - bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, - reqs[i].nb_sectors, reqs[i].flags, - multiwrite_cb, mcb, - true); - } +BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_writev(child->bs, sector_num, nb_sectors, opaque); - return 0; + assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size); + return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov, + 0, cb, opaque, true); } void bdrv_aio_cancel(BlockAIOCB *acb) @@ -2064,82 +2060,30 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb) /**************************************************************/ /* async block device emulation */ -typedef struct BlockAIOCBSync { - BlockAIOCB common; - QEMUBH *bh; - int ret; - /* vector translation state */ - QEMUIOVector *qiov; - uint8_t *bounce; - int is_write; -} BlockAIOCBSync; - -static const AIOCBInfo bdrv_em_aiocb_info = { - .aiocb_size = sizeof(BlockAIOCBSync), -}; - -static void bdrv_aio_bh_cb(void *opaque) -{ - BlockAIOCBSync *acb = opaque; - - if (!acb->is_write && acb->ret >= 0) { - qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); - } - qemu_vfree(acb->bounce); - acb->common.cb(acb->common.opaque, acb->ret); - qemu_bh_delete(acb->bh); - acb->bh = NULL; - qemu_aio_unref(acb); -} - -static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque, - int is_write) - -{ - BlockAIOCBSync *acb; - - acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); - acb->is_write = is_write; - acb->qiov = qiov; - acb->bounce = qemu_try_blockalign(bs, qiov->size); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); - - if (acb->bounce == NULL) { - acb->ret = -ENOMEM; - } else if (is_write) { - qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); - acb->ret = bs->drv->bdrv_write(bs, 
sector_num, acb->bounce, nb_sectors); - } else { - acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); - } - - qemu_bh_schedule(acb->bh); - - return &acb->common; -} - -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); -} - -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); -} +typedef struct BlockRequest { + union { + /* Used during read, write, trim */ + struct { + int64_t offset; + int bytes; + int flags; + QEMUIOVector *qiov; + }; + /* Used during ioctl */ + struct { + int req; + void *buf; + }; + }; + BlockCompletionFunc *cb; + void *opaque; + int error; +} BlockRequest; typedef struct BlockAIOCBCoroutine { BlockAIOCB common; + BdrvChild *child; BlockRequest req; bool is_write; bool need_bh; @@ -2183,42 +2127,40 @@ static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) static void coroutine_fn bdrv_co_do_rw(void *opaque) { BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; if (!acb->is_write) { - acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + acb->req.error = bdrv_co_preadv(acb->child, acb->req.offset, + acb->req.qiov->size, acb->req.qiov, acb->req.flags); } else { - acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + acb->req.error = bdrv_co_pwritev(acb->child, acb->req.offset, + acb->req.qiov->size, acb->req.qiov, acb->req.flags); } bdrv_co_complete(acb); } -static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write) +static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child, + int64_t offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write) { Coroutine *co; BlockAIOCBCoroutine *acb; - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, child->bs, cb, opaque); + acb->child = child; acb->need_bh = true; acb->req.error = -EINPROGRESS; - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; + acb->req.offset = offset; acb->req.qiov = qiov; acb->req.flags = flags; acb->is_write = is_write; - co = qemu_coroutine_create(bdrv_co_do_rw); - qemu_coroutine_enter(co, acb); + co = qemu_coroutine_create(bdrv_co_do_rw, acb); + qemu_coroutine_enter(co); bdrv_co_maybe_schedule_bh(acb); return &acb->common; @@ -2245,38 +2187,37 @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, acb->need_bh = true; acb->req.error = -EINPROGRESS; - co = qemu_coroutine_create(bdrv_aio_flush_co_entry); - qemu_coroutine_enter(co, acb); + co = qemu_coroutine_create(bdrv_aio_flush_co_entry, acb); + qemu_coroutine_enter(co); bdrv_co_maybe_schedule_bh(acb); return &acb->common; } -static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) +static void coroutine_fn bdrv_aio_pdiscard_co_entry(void *opaque) { BlockAIOCBCoroutine *acb = opaque; BlockDriverState *bs = acb->common.bs; - acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); + acb->req.error = bdrv_co_pdiscard(bs, 
     bdrv_co_complete(acb);
 }
 
-BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors,
-        BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *bdrv_aio_pdiscard(BlockDriverState *bs, int64_t offset, int count,
+                              BlockCompletionFunc *cb, void *opaque)
 {
     Coroutine *co;
     BlockAIOCBCoroutine *acb;
 
-    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
+    trace_bdrv_aio_pdiscard(bs, offset, count, opaque);
 
     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
     acb->need_bh = true;
     acb->req.error = -EINPROGRESS;
-    acb->req.sector = sector_num;
-    acb->req.nb_sectors = nb_sectors;
-    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
-    qemu_coroutine_enter(co, acb);
+    acb->req.offset = offset;
+    acb->req.bytes = count;
+    co = qemu_coroutine_create(bdrv_aio_pdiscard_co_entry, acb);
+    qemu_coroutine_enter(co);
 
     bdrv_co_maybe_schedule_bh(acb);
     return &acb->common;
@@ -2314,62 +2255,15 @@ void qemu_aio_unref(void *p)
 /**************************************************************/
 /* Coroutine block device emulation */
 
-typedef struct CoroutineIOCompletion {
-    Coroutine *coroutine;
+typedef struct FlushCo {
+    BlockDriverState *bs;
     int ret;
-} CoroutineIOCompletion;
-
-static void bdrv_co_io_em_complete(void *opaque, int ret)
-{
-    CoroutineIOCompletion *co = opaque;
-
-    co->ret = ret;
-    qemu_coroutine_enter(co->coroutine, NULL);
-}
-
-static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
-                                      int nb_sectors, QEMUIOVector *iov,
-                                      bool is_write)
-{
-    CoroutineIOCompletion co = {
-        .coroutine = qemu_coroutine_self(),
-    };
-    BlockAIOCB *acb;
-
-    if (is_write) {
-        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
-                                       bdrv_co_io_em_complete, &co);
-    } else {
-        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
-                                      bdrv_co_io_em_complete, &co);
-    }
-
-    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
-    if (!acb) {
-        return -EIO;
-    }
-    qemu_coroutine_yield();
-
-    return co.ret;
-}
-
-static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors,
-        QEMUIOVector *iov)
-{
-    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
-}
+} FlushCo;
 
-static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors,
-        QEMUIOVector *iov)
-{
-    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
-}
 
 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
 {
-    RwCo *rwco = opaque;
+    FlushCo *rwco = opaque;
 
     rwco->ret = bdrv_co_flush(rwco->bs);
 }
@@ -2386,6 +2280,15 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 
     tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
 
+    int current_gen = bs->write_gen;
+
+    /* Wait until any previous flushes are completed */
+    while (bs->active_flush_req != NULL) {
+        qemu_co_queue_wait(&bs->flush_queue);
+    }
+
+    bs->active_flush_req = &req;
+
     /* Write back all layers by calling one driver function */
     if (bs->drv->bdrv_co_flush) {
         ret = bs->drv->bdrv_co_flush(bs);
@@ -2406,6 +2309,11 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
         goto flush_parent;
     }
 
+    /* Check if we really need to flush anything */
+    if (bs->flushed_gen == current_gen) {
+        goto flush_parent;
+    }
+
     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
     if (bs->drv->bdrv_co_flush_to_disk) {
         ret = bs->drv->bdrv_co_flush_to_disk(bs);
@@ -2436,6 +2344,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
          */
         ret = 0;
     }
+
     if (ret < 0) {
         goto out;
     }
@@ -2446,6 +2355,12 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 flush_parent:
     ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
 out:
+    /* Notify any pending flushes that we have completed */
+    bs->flushed_gen = current_gen;
+    bs->active_flush_req = NULL;
+    /* Return value is ignored - it's ok if wait queue is empty */
+    qemu_co_queue_next(&bs->flush_queue);
+
     tracked_request_end(&req);
     return ret;
 }
@@ -2453,51 +2368,52 @@ out:
 int bdrv_flush(BlockDriverState *bs)
 {
     Coroutine *co;
-    RwCo rwco = {
+    FlushCo flush_co = {
         .bs = bs,
         .ret = NOT_DONE,
     };
 
     if (qemu_in_coroutine()) {
         /* Fast-path if already in coroutine context */
-        bdrv_flush_co_entry(&rwco);
+        bdrv_flush_co_entry(&flush_co);
     } else {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
-        co = qemu_coroutine_create(bdrv_flush_co_entry);
-        qemu_coroutine_enter(co, &rwco);
-        while (rwco.ret == NOT_DONE) {
+        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
+        qemu_coroutine_enter(co);
+        while (flush_co.ret == NOT_DONE) {
             aio_poll(aio_context, true);
         }
     }
 
-    return rwco.ret;
+    return flush_co.ret;
 }
 
 typedef struct DiscardCo {
     BlockDriverState *bs;
-    int64_t sector_num;
-    int nb_sectors;
+    int64_t offset;
+    int count;
     int ret;
 } DiscardCo;
-static void coroutine_fn bdrv_discard_co_entry(void *opaque)
+static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
 {
     DiscardCo *rwco = opaque;
 
-    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
+    rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->count);
 }
 
-int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
-                                 int nb_sectors)
+int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
+                                  int count)
 {
     BdrvTrackedRequest req;
-    int max_discard, ret;
+    int max_pdiscard, ret;
+    int head, align;
 
     if (!bs->drv) {
         return -ENOMEDIUM;
     }
 
-    ret = bdrv_check_request(bs, sector_num, nb_sectors);
+    ret = bdrv_check_byte_request(bs, offset, count);
     if (ret < 0) {
         return ret;
     } else if (bs->read_only) {
@@ -2510,44 +2426,49 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
         return 0;
     }
 
-    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
+    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
         return 0;
     }
 
-    tracked_request_begin(&req, bs, sector_num, nb_sectors,
-                          BDRV_TRACKED_DISCARD);
-    bdrv_set_dirty(bs, sector_num, nb_sectors);
+    /* Discard is advisory, so ignore any unaligned head or tail */
+    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
+    assert(align % bs->bl.request_alignment == 0);
+    head = offset % align;
+    if (head) {
+        head = MIN(count, align - head);
+        count -= head;
+        offset += head;
+    }
+    count = QEMU_ALIGN_DOWN(count, align);
+    if (!count) {
+        return 0;
+    }
 
-    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
-    while (nb_sectors > 0) {
-        int ret;
-        int num = nb_sectors;
-
-        /* align request */
-        if (bs->bl.discard_alignment &&
-            num >= bs->bl.discard_alignment &&
-            sector_num % bs->bl.discard_alignment) {
-            if (num > bs->bl.discard_alignment) {
-                num = bs->bl.discard_alignment;
-            }
-            num -= sector_num % bs->bl.discard_alignment;
-        }
+    tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD);
 
-        /* limit request size */
-        if (num > max_discard) {
-            num = max_discard;
-        }
+    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
+    if (ret < 0) {
+        goto out;
+    }
 
-        if (bs->drv->bdrv_co_discard) {
-            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
+    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
+                                   align);
+    assert(max_pdiscard);
+
+    while (count > 0) {
+        int ret;
+        int num = MIN(count, max_pdiscard);
+
+        if (bs->drv->bdrv_co_pdiscard) {
+            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
         } else {
             BlockAIOCB *acb;
             CoroutineIOCompletion co = {
                 .coroutine = qemu_coroutine_self(),
             };
 
-            acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
-                                            bdrv_co_io_em_complete, &co);
+            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
+                                             bdrv_co_io_em_complete, &co);
             if (acb == NULL) {
                 ret = -EIO;
                 goto out;
@@ -2560,33 +2481,36 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
             goto out;
         }
 
-        sector_num += num;
-        nb_sectors -= num;
+        offset += num;
+        count -= num;
     }
     ret = 0;
 out:
+    ++bs->write_gen;
+    bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
+                   req.bytes >> BDRV_SECTOR_BITS);
     tracked_request_end(&req);
     return ret;
 }
 
-int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count)
 {
     Coroutine *co;
     DiscardCo rwco = {
         .bs = bs,
-        .sector_num = sector_num,
-        .nb_sectors = nb_sectors,
+        .offset = offset,
+        .count = count,
         .ret = NOT_DONE,
     };
 
     if (qemu_in_coroutine()) {
         /* Fast-path if already in coroutine context */
-        bdrv_discard_co_entry(&rwco);
+        bdrv_pdiscard_co_entry(&rwco);
     } else {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
-        co = qemu_coroutine_create(bdrv_discard_co_entry);
-        qemu_coroutine_enter(co, &rwco);
+        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
+        qemu_coroutine_enter(co);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
@@ -2595,19 +2519,6 @@ int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
     return rwco.ret;
 }
 
-typedef struct {
-    CoroutineIOCompletion *co;
-    QEMUBH *bh;
-} BdrvIoctlCompletionData;
-
-static void bdrv_ioctl_bh_cb(void *opaque)
-{
-    BdrvIoctlCompletionData *data = opaque;
-
-    bdrv_co_io_em_complete(data->co, -ENOTSUP);
-    qemu_bh_delete(data->bh);
-}
-
 static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
 {
     BlockDriver *drv = bs->drv;
@@ -2625,11 +2536,8 @@ static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
 
     acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
     if (!acb) {
-        BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1);
-        data->bh = aio_bh_new(bdrv_get_aio_context(bs),
-                              bdrv_ioctl_bh_cb, data);
-        data->co = &co;
-        qemu_bh_schedule(data->bh);
+        co.ret = -ENOTSUP;
+        goto out;
     }
     qemu_coroutine_yield();
 out:
@@ -2664,9 +2572,9 @@ int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
         /* Fast-path if already in coroutine context */
         bdrv_co_ioctl_entry(&data);
     } else {
-        Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);
+        Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry, &data);
 
-        qemu_coroutine_enter(co, &data);
+        qemu_coroutine_enter(co);
         while (data.ret == -EINPROGRESS) {
             aio_poll(bdrv_get_aio_context(bs), true);
         }
@@ -2694,8 +2602,8 @@ BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
     acb->req.error = -EINPROGRESS;
     acb->req.req = req;
     acb->req.buf = buf;
-    co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
-    qemu_coroutine_enter(co, acb);
+    co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry, acb);
+    qemu_coroutine_enter(co);
 
     bdrv_co_maybe_schedule_bh(acb);
     return &acb->common;
@@ -2763,48 +2671,66 @@ void bdrv_add_before_write_notifier(BlockDriverState *bs,
 
 void bdrv_io_plug(BlockDriverState *bs)
 {
-    BlockDriver *drv = bs->drv;
-    if (drv && drv->bdrv_io_plug) {
-        drv->bdrv_io_plug(bs);
-    } else if (bs->file) {
-        bdrv_io_plug(bs->file->bs);
+    BdrvChild *child;
+
+    QLIST_FOREACH(child, &bs->children, next) {
+        bdrv_io_plug(child->bs);
+    }
+
+    if (bs->io_plugged++ == 0 && bs->io_plug_disabled == 0) {
+        BlockDriver *drv = bs->drv;
+        if (drv && drv->bdrv_io_plug) {
+            drv->bdrv_io_plug(bs);
+        }
     }
 }
 
 void bdrv_io_unplug(BlockDriverState *bs)
 {
-    BlockDriver *drv = bs->drv;
-    if (drv && drv->bdrv_io_unplug) {
-        drv->bdrv_io_unplug(bs);
-    } else if (bs->file) {
-        bdrv_io_unplug(bs->file->bs);
+    BdrvChild *child;
+
+    assert(bs->io_plugged);
+    if (--bs->io_plugged == 0 && bs->io_plug_disabled == 0) {
+        BlockDriver *drv = bs->drv;
+        if (drv && drv->bdrv_io_unplug) {
+            drv->bdrv_io_unplug(bs);
+        }
     }
-}
 
-void bdrv_flush_io_queue(BlockDriverState *bs)
-{
-    BlockDriver *drv = bs->drv;
-    if (drv && drv->bdrv_flush_io_queue) {
-        drv->bdrv_flush_io_queue(bs);
-    } else if (bs->file) {
-        bdrv_flush_io_queue(bs->file->bs);
+    QLIST_FOREACH(child, &bs->children, next) {
+        bdrv_io_unplug(child->bs);
     }
-    bdrv_start_throttled_reqs(bs);
 }
 
-void bdrv_drained_begin(BlockDriverState *bs)
+void bdrv_io_unplugged_begin(BlockDriverState *bs)
 {
-    if (!bs->quiesce_counter++) {
-        aio_disable_external(bdrv_get_aio_context(bs));
+    BdrvChild *child;
+
+    if (bs->io_plug_disabled++ == 0 && bs->io_plugged > 0) {
+        BlockDriver *drv = bs->drv;
+        if (drv && drv->bdrv_io_unplug) {
+            drv->bdrv_io_unplug(bs);
+        }
+    }
+
+    QLIST_FOREACH(child, &bs->children, next) {
+        bdrv_io_unplugged_begin(child->bs);
     }
-    bdrv_drain(bs);
 }
 
-void bdrv_drained_end(BlockDriverState *bs)
+void bdrv_io_unplugged_end(BlockDriverState *bs)
 {
-    assert(bs->quiesce_counter > 0);
-    if (--bs->quiesce_counter > 0) {
-        return;
+    BdrvChild *child;
+
+    assert(bs->io_plug_disabled);
+    QLIST_FOREACH(child, &bs->children, next) {
+        bdrv_io_unplugged_end(child->bs);
+    }
+
+    if (--bs->io_plug_disabled == 0 && bs->io_plugged > 0) {
+        BlockDriver *drv = bs->drv;
+        if (drv && drv->bdrv_io_plug) {
+            drv->bdrv_io_plug(bs);
+        }
    }
-    aio_enable_external(bdrv_get_aio_context(bs));
 }
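
The converted bdrv_aio_readv()/bdrv_aio_writev() in this patch are thin shims over the byte-based path: the byte offset is simply sector_num << BDRV_SECTOR_BITS, and the new assertion requires the caller's iovec to cover exactly nb_sectors worth of data. A minimal standalone sketch of that unit check; SECTOR_BITS, preadv_bytes() and readv_sectors() are illustrative stand-ins, not QEMU names:

#include <assert.h>
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define SECTOR_BITS 9                    /* stand-in for BDRV_SECTOR_BITS */
#define SECTOR_SIZE (1 << SECTOR_BITS)   /* 512 bytes */

/* Hypothetical byte-based backend that the sector shim forwards to. */
static void preadv_bytes(int64_t offset, size_t bytes)
{
    printf("read %zu bytes at offset %" PRId64 "\n", bytes, offset);
}

/* Sector-based wrapper in the style of the converted bdrv_aio_readv():
 * the supplied buffer must cover exactly nb_sectors of data. */
static void readv_sectors(int64_t sector_num, int nb_sectors, size_t buf_bytes)
{
    assert((size_t)nb_sectors << SECTOR_BITS == buf_bytes);
    preadv_bytes(sector_num << SECTOR_BITS, buf_bytes);
}

int main(void)
{
    readv_sectors(2, 4, 4 * SECTOR_SIZE); /* 2048 bytes at byte offset 1024 */
    return 0;
}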
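
Discard is advisory, so the new bdrv_co_pdiscard() shrinks an unaligned request to the largest aligned region it contains instead of failing. A self-contained sketch of that head/tail arithmetic, with ALIGN_DOWN standing in for QEMU_ALIGN_DOWN and made-up request values:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define ALIGN_DOWN(n, m) ((n) / (m) * (m)) /* stand-in for QEMU_ALIGN_DOWN */
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Mirrors the trimming in the new bdrv_co_pdiscard(): keep only the
 * largest aligned region inside [offset, offset + count). */
static void trim_discard(int64_t *offset, int *count, int align)
{
    int head = *offset % align;
    if (head) {
        head = MIN(*count, align - head);
        *count -= head;
        *offset += head;
    }
    *count = ALIGN_DOWN(*count, align);
}

int main(void)
{
    int64_t offset = 1536;  /* assumed: 512 bytes past a 1 KiB boundary */
    int count = 4096;
    trim_discard(&offset, &count, 1024);
    /* prints offset=2048 count=3072: the unaligned head and tail dropped */
    printf("offset=%" PRId64 " count=%d\n", offset, count);
    return 0;
}

With align = 1024, the request for bytes [1536, 5632) shrinks to [2048, 5120); a request that spans no whole aligned chunk ends up with count == 0 and, as in the patch, becomes a no-op.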
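
The write_gen/flushed_gen pair threaded through bdrv_co_flush() lets the flush-to-disk step be skipped when nothing was written since the previous flush. A single-threaded sketch of the generation idea; the real code additionally serialises concurrent flushes on bs->flush_queue, which this omits:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative device state; QEMU keeps these fields on BlockDriverState. */
typedef struct Disk {
    unsigned write_gen;    /* bumped on every completed write or discard */
    unsigned flushed_gen;  /* last generation known to be on stable storage */
} Disk;

static void disk_write(Disk *d) { d->write_gen++; }

/* Returns true if an actual flush to disk was needed. */
static bool disk_flush(Disk *d)
{
    unsigned current_gen = d->write_gen;  /* sample before flushing */
    if (d->flushed_gen == current_gen) {
        return false;                     /* nothing new since last flush */
    }
    /* ... issue the real flush here ... */
    d->flushed_gen = current_gen;
    return true;
}

int main(void)
{
    Disk d = {0, 0};
    disk_write(&d);
    printf("%d\n", disk_flush(&d));  /* 1: data to flush */
    printf("%d\n", disk_flush(&d));  /* 0: skipped, generation unchanged */
    return 0;
}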
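
bdrv_io_plug()/bdrv_io_unplug() now nest through a counter and recurse over every child instead of delegating to bs->file alone, while bdrv_io_unplugged_begin()/end() can temporarily force a plugged node back to unplugged. A sketch of the counting rule for a single node; Node is a toy stand-in for BlockDriverState:

#include <assert.h>
#include <stdio.h>

typedef struct Node {
    int io_plugged;        /* nesting depth of plug requests */
    int io_plug_disabled;  /* nesting depth of unplugged_begin sections */
} Node;

/* The driver callback only fires on the 0 -> 1 transition, and only
 * while plugging is not disabled - mirroring the patch's condition
 * "bs->io_plugged++ == 0 && bs->io_plug_disabled == 0". */
static void io_plug(Node *n)
{
    if (n->io_plugged++ == 0 && n->io_plug_disabled == 0) {
        printf("driver plug\n");
    }
}

static void io_unplug(Node *n)
{
    assert(n->io_plugged);
    if (--n->io_plugged == 0 && n->io_plug_disabled == 0) {
        printf("driver unplug\n");
    }
}

int main(void)
{
    Node n = {0, 0};
    io_plug(&n);    /* driver plug   */
    io_plug(&n);    /* nested: no-op */
    io_unplug(&n);  /* nested: no-op */
    io_unplug(&n);  /* driver unplug */
    return 0;
}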