author     hyokeun <hyokeun.jeon@samsung.com>  2016-09-06 14:09:22 +0900
committer  hyokeun <hyokeun.jeon@samsung.com>  2016-09-06 14:09:22 +0900
commit     bd54c25035217800f3b1d39f6472d599cd602d5a
tree       299417fe96f546225439ff92b27ac3e55909a970 /block
parent     186efde2677c31fb40d154a81a5f3731eab52414
Imported Upstream version 2.7.0 (tag: upstream/2.7.0)
Diffstat (limited to 'block')
56 files changed, 4940 insertions, 3544 deletions
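
Most of the changes below follow two mechanical conversions that run through the whole QEMU 2.7 block layer: I/O moves from sector-addressed BlockDriverState functions to byte-addressed BlockBackend functions, and filter drivers issue child I/O against the BdrvChild rather than its BlockDriverState. A minimal sketch of the two call shapes (an illustrative fragment, not standalone code; bs, blk and qiov stand for a BlockDriverState, a BlockBackend and a QEMUIOVector):

    /* old: sector-based, BlockDriverState-level */
    ret = bdrv_co_readv(bs, sector_num, nb_sectors, &qiov);

    /* new: byte-based, BlockBackend-level; sectors convert to bytes
     * via BDRV_SECTOR_BITS (sector_num << 9 == byte offset) */
    ret = blk_co_preadv(blk, sector_num << BDRV_SECTOR_BITS,
                        nb_sectors << BDRV_SECTOR_BITS, &qiov, 0);
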
diff --git a/block/Makefile.objs b/block/Makefile.objs
index 44a541622..2593a2f8a 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -9,7 +9,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o
 block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
 block-obj-$(CONFIG_POSIX) += raw-posix.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
-block-obj-y += null.o mirror.o io.o
+block-obj-y += null.o mirror.o commit.o io.o
 block-obj-y += throttle-groups.o
 
 block-obj-y += nbd.o nbd-client.o sheepdog.o
@@ -26,7 +26,6 @@ block-obj-y += write-threshold.o
 block-obj-y += crypto.o
 
 common-obj-y += stream.o
-common-obj-y += commit.o
 common-obj-y += backup.o
 
 iscsi.o-cflags     := $(LIBISCSI_CFLAGS)
diff --git a/block/archipelago.c b/block/archipelago.c
index b9f5e69d4..37b8aca78 100644
--- a/block/archipelago.c
+++ b/block/archipelago.c
@@ -974,11 +974,9 @@ err_exit2:
 
 static int64_t qemu_archipelago_getlength(BlockDriverState *bs)
 {
-    int64_t ret;
     BDRVArchipelagoState *s = bs->opaque;
 
-    ret = archipelago_volume_info(s);
-    return ret;
+    return archipelago_volume_info(s);
 }
 
 static int qemu_archipelago_truncate(BlockDriverState *bs, int64_t offset)
diff --git a/block/backup.c b/block/backup.c
index 491fd1406..2c0532314 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -36,7 +36,7 @@ typedef struct CowRequest {
 
 typedef struct BackupBlockJob {
     BlockJob common;
-    BlockDriverState *target;
+    BlockBackend *target;
     /* bitmap for sync=incremental */
     BdrvDirtyBitmap *sync_bitmap;
     MirrorSyncMode sync_mode;
@@ -47,6 +47,7 @@ typedef struct BackupBlockJob {
     uint64_t sectors_read;
     unsigned long *done_bitmap;
     int64_t cluster_size;
+    NotifierWithReturn before_write;
     QLIST_HEAD(, CowRequest) inflight_reqs;
 } BackupBlockJob;
 
@@ -93,12 +94,12 @@ static void cow_request_end(CowRequest *req)
     qemu_co_queue_restart_all(&req->wait_queue);
 }
 
-static int coroutine_fn backup_do_cow(BlockDriverState *bs,
+static int coroutine_fn backup_do_cow(BackupBlockJob *job,
                                       int64_t sector_num, int nb_sectors,
                                       bool *error_is_read,
                                       bool is_write_notifier)
 {
-    BackupBlockJob *job = (BackupBlockJob *)bs->job;
+    BlockBackend *blk = job->common.blk;
     CowRequest cow_request;
     struct iovec iov;
     QEMUIOVector bounce_qiov;
@@ -131,20 +132,15 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
                                       start * sectors_per_cluster);
         if (!bounce_buffer) {
-            bounce_buffer = qemu_blockalign(bs, job->cluster_size);
+            bounce_buffer = blk_blockalign(blk, job->cluster_size);
         }
         iov.iov_base = bounce_buffer;
         iov.iov_len = n * BDRV_SECTOR_SIZE;
         qemu_iovec_init_external(&bounce_qiov, &iov, 1);
 
-        if (is_write_notifier) {
-            ret = bdrv_co_readv_no_serialising(bs,
-                                               start * sectors_per_cluster,
-                                               n, &bounce_qiov);
-        } else {
-            ret = bdrv_co_readv(bs, start * sectors_per_cluster, n,
-                                &bounce_qiov);
-        }
+        ret = blk_co_preadv(blk, start * job->cluster_size,
+                            bounce_qiov.size, &bounce_qiov,
+                            is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0);
         if (ret < 0) {
             trace_backup_do_cow_read_fail(job, start, ret);
             if (error_is_read) {
@@ -154,13 +150,11 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
         }
 
         if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
-            ret = bdrv_co_write_zeroes(job->target,
-                                       start * sectors_per_cluster,
-                                       n, BDRV_REQ_MAY_UNMAP);
+            ret = blk_co_pwrite_zeroes(job->target, start * job->cluster_size,
+                                       bounce_qiov.size, BDRV_REQ_MAY_UNMAP);
         } else {
-            ret = bdrv_co_writev(job->target,
-                                 start * sectors_per_cluster, n,
-                                 &bounce_qiov);
+            ret = blk_co_pwritev(job->target, start * job->cluster_size,
+                                 bounce_qiov.size, &bounce_qiov, 0);
         }
         if (ret < 0) {
             trace_backup_do_cow_write_fail(job, start, ret);
@@ -197,14 +191,16 @@ static int coroutine_fn backup_before_write_notify(
         NotifierWithReturn *notifier,
         void *opaque)
 {
+    BackupBlockJob *job = container_of(notifier, BackupBlockJob, before_write);
     BdrvTrackedRequest *req = opaque;
     int64_t sector_num = req->offset >> BDRV_SECTOR_BITS;
     int nb_sectors = req->bytes >> BDRV_SECTOR_BITS;
 
+    assert(req->bs == blk_bs(job->common.blk));
     assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
     assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 
-    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true);
+    return backup_do_cow(job, sector_num, nb_sectors, NULL, true);
 }
 
 static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -218,19 +214,10 @@ static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
     ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
 }
 
-static void backup_iostatus_reset(BlockJob *job)
-{
-    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
-
-    if (s->target->blk) {
-        blk_iostatus_reset(s->target->blk);
-    }
-}
-
 static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
 {
     BdrvDirtyBitmap *bm;
-    BlockDriverState *bs = job->common.bs;
+    BlockDriverState *bs = blk_bs(job->common.blk);
 
     if (ret < 0 || block_job_is_cancelled(&job->common)) {
         /* Merge the successor back into the parent, delete nothing. */
@@ -259,24 +246,31 @@ static void backup_abort(BlockJob *job)
     }
 }
 
+static void backup_attached_aio_context(BlockJob *job, AioContext *aio_context)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+
+    blk_set_aio_context(s->target, aio_context);
+}
+
 static const BlockJobDriver backup_job_driver = {
-    .instance_size  = sizeof(BackupBlockJob),
-    .job_type       = BLOCK_JOB_TYPE_BACKUP,
-    .set_speed      = backup_set_speed,
-    .iostatus_reset = backup_iostatus_reset,
-    .commit         = backup_commit,
-    .abort          = backup_abort,
+    .instance_size          = sizeof(BackupBlockJob),
+    .job_type               = BLOCK_JOB_TYPE_BACKUP,
+    .set_speed              = backup_set_speed,
+    .commit                 = backup_commit,
+    .abort                  = backup_abort,
+    .attached_aio_context   = backup_attached_aio_context,
 };
 
 static BlockErrorAction backup_error_action(BackupBlockJob *job,
                                             bool read, int error)
 {
     if (read) {
-        return block_job_error_action(&job->common, job->common.bs,
-                                      job->on_source_error, true, error);
+        return block_job_error_action(&job->common, job->on_source_error,
+                                      true, error);
     } else {
-        return block_job_error_action(&job->common, job->target,
-                                      job->on_target_error, false, error);
+        return block_job_error_action(&job->common, job->on_target_error,
+                                      false, error);
     }
 }
 
@@ -289,7 +283,7 @@ static void backup_complete(BlockJob *job, void *opaque)
     BackupBlockJob *s = container_of(job, BackupBlockJob, common);
     BackupCompleteData *data = opaque;
 
-    bdrv_unref(s->target);
+    blk_unref(s->target);
 
     block_job_completed(job, data->ret);
     g_free(data);
@@ -331,7 +325,6 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
     int64_t end;
     int64_t last_cluster = -1;
    int64_t sectors_per_cluster = cluster_size_sectors(job);
-    BlockDriverState *bs = job->common.bs;
     HBitmapIter hbi;
 
     granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
@@ -353,7 +346,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
             if (yield_and_check(job)) {
                 return ret;
             }
-            ret = backup_do_cow(bs, cluster * sectors_per_cluster,
+            ret = backup_do_cow(job, cluster * sectors_per_cluster,
                                 sectors_per_cluster, &error_is_read,
                                 false);
             if ((ret < 0) &&
@@ -386,12 +379,8 @@ static void coroutine_fn backup_run(void *opaque)
 {
     BackupBlockJob *job = opaque;
     BackupCompleteData *data;
-    BlockDriverState *bs = job->common.bs;
-    BlockDriverState *target = job->target;
-    BlockdevOnError on_target_error = job->on_target_error;
-    NotifierWithReturn before_write = {
-        .notify = backup_before_write_notify,
-    };
+    BlockDriverState *bs = blk_bs(job->common.blk);
+    BlockBackend *target = job->target;
     int64_t start, end;
     int64_t sectors_per_cluster = cluster_size_sectors(job);
     int ret = 0;
@@ -404,20 +393,14 @@ static void coroutine_fn backup_run(void *opaque)
 
     job->done_bitmap = bitmap_new(end);
 
-    if (target->blk) {
-        blk_set_on_error(target->blk, on_target_error, on_target_error);
-        blk_iostatus_enable(target->blk);
-    }
-
-    bdrv_add_before_write_notifier(bs, &before_write);
+    job->before_write.notify = backup_before_write_notify;
+    bdrv_add_before_write_notifier(bs, &job->before_write);
 
     if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
         while (!block_job_is_cancelled(&job->common)) {
             /* Yield until the job is cancelled.  We just let our before_write
              * notify callback service CoW requests. */
-            job->common.busy = false;
-            qemu_coroutine_yield();
-            job->common.busy = true;
+            block_job_yield(&job->common);
         }
     } else if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
         ret = backup_run_incremental(job);
@@ -461,7 +444,7 @@ static void coroutine_fn backup_run(void *opaque)
                 }
             }
             /* FULL sync mode we copy the whole drive. */
-            ret = backup_do_cow(bs, start * sectors_per_cluster,
+            ret = backup_do_cow(job, start * sectors_per_cluster,
                                 sectors_per_cluster, &error_is_read,
                                 false);
             if (ret < 0) {
                 /* Depending on error action, fail now or retry cluster */
@@ -477,26 +460,23 @@ static void coroutine_fn backup_run(void *opaque)
         }
     }
 
-    notifier_with_return_remove(&before_write);
+    notifier_with_return_remove(&job->before_write);
 
     /* wait until pending backup_do_cow() calls have completed */
     qemu_co_rwlock_wrlock(&job->flush_rwlock);
     qemu_co_rwlock_unlock(&job->flush_rwlock);
     g_free(job->done_bitmap);
 
-    if (target->blk) {
-        blk_iostatus_disable(target->blk);
-    }
-    bdrv_op_unblock_all(target, job->common.blocker);
+    bdrv_op_unblock_all(blk_bs(target), job->common.blocker);
 
     data = g_malloc(sizeof(*data));
     data->ret = ret;
     block_job_defer_to_main_loop(&job->common, backup_complete, data);
 }
 
-void backup_start(BlockDriverState *bs, BlockDriverState *target,
-                  int64_t speed, MirrorSyncMode sync_mode,
-                  BdrvDirtyBitmap *sync_bitmap,
+void backup_start(const char *job_id, BlockDriverState *bs,
+                  BlockDriverState *target, int64_t speed,
+                  MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
                   BlockCompletionFunc *cb, void *opaque,
@@ -504,24 +484,17 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
 {
     int64_t len;
     BlockDriverInfo bdi;
+    BackupBlockJob *job = NULL;
     int ret;
 
     assert(bs);
     assert(target);
-    assert(cb);
 
     if (bs == target) {
         error_setg(errp, "Source and target cannot be the same");
         return;
     }
 
-    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
-         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
-        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
-        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
-        return;
-    }
-
     if (!bdrv_is_inserted(bs)) {
         error_setg(errp, "Device is not inserted: %s",
                    bdrv_get_device_name(bs));
@@ -568,15 +541,17 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
         goto error;
     }
 
-    BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed,
-                                           cb, opaque, errp);
+    job = block_job_create(job_id, &backup_job_driver, bs, speed,
+                           cb, opaque, errp);
     if (!job) {
         goto error;
     }
 
+    job->target = blk_new();
+    blk_insert_bs(job->target, target);
+
     job->on_source_error = on_source_error;
     job->on_target_error = on_target_error;
-    job->target = target;
     job->sync_mode = sync_mode;
     job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
                        sync_bitmap : NULL;
@@ -584,7 +559,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
     /* If there is no backing file on the target, we cannot rely on COW if our
      * backup cluster size is smaller than the target cluster size. Even for
      * targets with a backing file, try to avoid COW if possible. */
-    ret = bdrv_get_info(job->target, &bdi);
+    ret = bdrv_get_info(target, &bdi);
     if (ret < 0 && !target->backing) {
         error_setg_errno(errp, -ret,
             "Couldn't determine the cluster size of the target image, "
@@ -601,13 +576,17 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
     bdrv_op_block_all(target, job->common.blocker);
     job->common.len = len;
-    job->common.co = qemu_coroutine_create(backup_run);
+    job->common.co = qemu_coroutine_create(backup_run, job);
     block_job_txn_add_job(txn, &job->common);
-    qemu_coroutine_enter(job->common.co, job);
+    qemu_coroutine_enter(job->common.co);
     return;
 
 error:
     if (sync_bitmap) {
         bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
     }
+    if (job) {
+        blk_unref(job->target);
+        block_job_unref(&job->common);
+    }
 }
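
backup.c now embeds the before_write notifier in the job struct so the callback can recover the job with container_of instead of going through bs->job. A self-contained sketch of that pattern (struct and field names here are illustrative stand-ins, not the QEMU ones):

    #include <stddef.h>

    struct notifier { int (*notify)(struct notifier *n, void *data); };

    struct job {
        int id;
        struct notifier before_write;   /* embedded, as in BackupBlockJob */
    };

    /* same idea as QEMU's container_of() from qemu/osdep.h */
    #define container_of(ptr, type, member) \
        ((type *) ((char *) (ptr) - offsetof(type, member)))

    static int on_write(struct notifier *n, void *data)
    {
        /* recover the enclosing job from the embedded notifier field */
        struct job *j = container_of(n, struct job, before_write);
        (void) data;
        return j->id;
    }
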
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 20d25bda6..d5db16681 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -37,6 +37,10 @@ typedef struct BDRVBlkdebugState {
     int state;
     int new_state;
+    int align;
+
+    /* For blkdebug_refresh_filename() */
+    char *config_file;
 
     QLIST_HEAD(, BlkdebugRule) rules[BLKDBG__MAX];
     QSIMPLEQ_HEAD(, BlkdebugRule) active_rules;
@@ -350,7 +354,6 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
     BDRVBlkdebugState *s = bs->opaque;
     QemuOpts *opts;
     Error *local_err = NULL;
-    const char *config;
     uint64_t align;
     int ret;
 
@@ -363,8 +366,8 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     /* Read rules from config file or command line options */
-    config = qemu_opt_get(opts, "config");
-    ret = read_config(s, config, options, errp);
+    s->config_file = g_strdup(qemu_opt_get(opts, "config"));
+    ret = read_config(s, s->config_file, options, errp);
     if (ret) {
         goto out;
     }
@@ -382,10 +385,10 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     /* Set request alignment */
-    align = qemu_opt_get_size(opts, "align", bs->request_alignment);
-    if (align > 0 && align < INT_MAX && !(align & (align - 1))) {
-        bs->request_alignment = align;
-    } else {
+    align = qemu_opt_get_size(opts, "align", 0);
+    if (align < INT_MAX && is_power_of_2(align)) {
+        s->align = align;
+    } else if (align) {
         error_setg(errp, "Invalid alignment");
         ret = -EINVAL;
         goto fail_unref;
@@ -397,6 +400,9 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
 fail_unref:
     bdrv_unref_child(bs, bs->file);
 out:
+    if (ret < 0) {
+        g_free(s->config_file);
+    }
     qemu_opts_del(opts);
     return ret;
 }
@@ -456,7 +462,7 @@ static BlockAIOCB *blkdebug_aio_readv(BlockDriverState *bs,
         return inject_error(bs, cb, opaque, rule);
     }
 
-    return bdrv_aio_readv(bs->file->bs, sector_num, qiov, nb_sectors,
+    return bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors,
                           cb, opaque);
 }
 
@@ -479,7 +485,7 @@ static BlockAIOCB *blkdebug_aio_writev(BlockDriverState *bs,
         return inject_error(bs, cb, opaque, rule);
     }
 
-    return bdrv_aio_writev(bs->file->bs, sector_num, qiov, nb_sectors,
+    return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors,
                            cb, opaque);
 }
 
@@ -514,6 +520,8 @@ static void blkdebug_close(BlockDriverState *bs)
             remove_rule(rule);
         }
     }
+
+    g_free(s->config_file);
 }
 
 static void suspend_request(BlockDriverState *bs, BlkdebugRule *rule)
@@ -620,7 +628,7 @@ static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag)
 
     QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, next) {
         if (!strcmp(r->tag, tag)) {
-            qemu_coroutine_enter(r->co, NULL);
+            qemu_coroutine_enter(r->co);
             return 0;
         }
     }
@@ -646,7 +654,7 @@ static int blkdebug_debug_remove_breakpoint(BlockDriverState *bs,
     }
     QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, r_next) {
         if (!strcmp(r->tag, tag)) {
-            qemu_coroutine_enter(r->co, NULL);
+            qemu_coroutine_enter(r->co);
             ret = 0;
         }
     }
@@ -678,6 +686,7 @@ static int blkdebug_truncate(BlockDriverState *bs, int64_t offset)
 
 static void blkdebug_refresh_filename(BlockDriverState *bs, QDict *options)
 {
+    BDRVBlkdebugState *s = bs->opaque;
     QDict *opts;
     const QDictEntry *e;
     bool force_json = false;
@@ -699,8 +708,7 @@ static void blkdebug_refresh_filename(BlockDriverState *bs, QDict *options)
 
     if (!force_json && bs->file->bs->exact_filename[0]) {
         snprintf(bs->exact_filename, sizeof(bs->exact_filename),
-                 "blkdebug:%s:%s",
-                 qdict_get_try_str(options, "config") ?: "",
+                 "blkdebug:%s:%s", s->config_file ?: "",
                  bs->file->bs->exact_filename);
     }
 
@@ -720,6 +728,15 @@ static void blkdebug_refresh_filename(BlockDriverState *bs, QDict *options)
     bs->full_open_options = opts;
 }
 
+static void blkdebug_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+    BDRVBlkdebugState *s = bs->opaque;
+
+    if (s->align) {
+        bs->bl.request_alignment = s->align;
+    }
+}
+
 static int blkdebug_reopen_prepare(BDRVReopenState *reopen_state,
                                    BlockReopenQueue *queue, Error **errp)
 {
@@ -738,6 +755,7 @@ static BlockDriver bdrv_blkdebug = {
     .bdrv_getlength         = blkdebug_getlength,
     .bdrv_truncate          = blkdebug_truncate,
     .bdrv_refresh_filename  = blkdebug_refresh_filename,
+    .bdrv_refresh_limits    = blkdebug_refresh_limits,
 
     .bdrv_aio_readv         = blkdebug_aio_readv,
     .bdrv_aio_writev        = blkdebug_aio_writev,
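
Several files in this import repeat one mechanical change to the coroutine API: the entry argument moves from qemu_coroutine_enter() to qemu_coroutine_create(). A sketch of the before/after shape (a fragment, not standalone code; co_fn and arg are placeholders):

    /* before QEMU 2.7 */
    co = qemu_coroutine_create(co_fn);
    qemu_coroutine_enter(co, arg);

    /* from 2.7 on: the argument is bound at creation time */
    co = qemu_coroutine_create(co_fn, arg);
    qemu_coroutine_enter(co);
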
diff --git a/block/blkreplay.c b/block/blkreplay.c
index 42f1813af..30f9d5ff6 100755
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -65,7 +65,7 @@ static int64_t blkreplay_getlength(BlockDriverState *bs)
 static void blkreplay_bh_cb(void *opaque)
 {
     Request *req = opaque;
-    qemu_coroutine_enter(req->co, NULL);
+    qemu_coroutine_enter(req->co);
     qemu_bh_delete(req->bh);
     g_free(req);
 }
@@ -81,44 +81,44 @@ static void block_request_create(uint64_t reqid, BlockDriverState *bs,
     replay_block_event(req->bh, reqid);
 }
 
-static int coroutine_fn blkreplay_co_readv(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+static int coroutine_fn blkreplay_co_preadv(BlockDriverState *bs,
+    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
     uint64_t reqid = request_id++;
-    int ret = bdrv_co_readv(bs->file->bs, sector_num, nb_sectors, qiov);
+    int ret = bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
     block_request_create(reqid, bs, qemu_coroutine_self());
     qemu_coroutine_yield();
 
     return ret;
 }
 
-static int coroutine_fn blkreplay_co_writev(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+static int coroutine_fn blkreplay_co_pwritev(BlockDriverState *bs,
+    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
     uint64_t reqid = request_id++;
-    int ret = bdrv_co_writev(bs->file->bs, sector_num, nb_sectors, qiov);
+    int ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
     block_request_create(reqid, bs, qemu_coroutine_self());
     qemu_coroutine_yield();
 
     return ret;
 }
 
-static int coroutine_fn blkreplay_co_write_zeroes(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs,
+    int64_t offset, int count, BdrvRequestFlags flags)
 {
     uint64_t reqid = request_id++;
-    int ret = bdrv_co_write_zeroes(bs->file->bs, sector_num, nb_sectors, flags);
+    int ret = bdrv_co_pwrite_zeroes(bs->file, offset, count, flags);
     block_request_create(reqid, bs, qemu_coroutine_self());
     qemu_coroutine_yield();
 
     return ret;
 }
 
-static int coroutine_fn blkreplay_co_discard(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors)
+static int coroutine_fn blkreplay_co_pdiscard(BlockDriverState *bs,
+    int64_t offset, int count)
 {
     uint64_t reqid = request_id++;
-    int ret = bdrv_co_discard(bs->file->bs, sector_num, nb_sectors);
+    int ret = bdrv_co_pdiscard(bs->file->bs, offset, count);
     block_request_create(reqid, bs, qemu_coroutine_self());
     qemu_coroutine_yield();
 
@@ -144,11 +144,11 @@ static BlockDriver bdrv_blkreplay = {
     .bdrv_close             = blkreplay_close,
     .bdrv_getlength         = blkreplay_getlength,
 
-    .bdrv_co_readv          = blkreplay_co_readv,
-    .bdrv_co_writev         = blkreplay_co_writev,
+    .bdrv_co_preadv         = blkreplay_co_preadv,
+    .bdrv_co_pwritev        = blkreplay_co_pwritev,
 
-    .bdrv_co_write_zeroes   = blkreplay_co_write_zeroes,
-    .bdrv_co_discard        = blkreplay_co_discard,
+    .bdrv_co_pwrite_zeroes  = blkreplay_co_pwrite_zeroes,
+    .bdrv_co_pdiscard       = blkreplay_co_pdiscard,
 
     .bdrv_co_flush          = blkreplay_co_flush,
 };
diff --git a/block/blkverify.c b/block/blkverify.c
index 9414b7a84..da62d7596 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -247,9 +247,9 @@ static BlockAIOCB *blkverify_aio_readv(BlockDriverState *bs,
     qemu_iovec_init(&acb->raw_qiov, acb->qiov->niov);
     qemu_iovec_clone(&acb->raw_qiov, qiov, acb->buf);
 
-    bdrv_aio_readv(s->test_file->bs, sector_num, qiov, nb_sectors,
+    bdrv_aio_readv(s->test_file, sector_num, qiov, nb_sectors,
                    blkverify_aio_cb, acb);
-    bdrv_aio_readv(bs->file->bs, sector_num, &acb->raw_qiov, nb_sectors,
+    bdrv_aio_readv(bs->file, sector_num, &acb->raw_qiov, nb_sectors,
                    blkverify_aio_cb, acb);
     return &acb->common;
 }
@@ -262,9 +262,9 @@ static BlockAIOCB *blkverify_aio_writev(BlockDriverState *bs,
     BlkverifyAIOCB *acb = blkverify_aio_get(bs, true, sector_num, qiov,
                                             nb_sectors, cb, opaque);
 
-    bdrv_aio_writev(s->test_file->bs, sector_num, qiov, nb_sectors,
+    bdrv_aio_writev(s->test_file, sector_num, qiov, nb_sectors,
                     blkverify_aio_cb, acb);
-    bdrv_aio_writev(bs->file->bs, sector_num, qiov, nb_sectors,
+    bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors,
                     blkverify_aio_cb, acb);
     return &acb->common;
 }
@@ -293,22 +293,6 @@ static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs,
     return bdrv_recurse_is_first_non_filter(s->test_file->bs, candidate);
 }
 
-/* Propagate AioContext changes to ->test_file */
-static void blkverify_detach_aio_context(BlockDriverState *bs)
-{
-    BDRVBlkverifyState *s = bs->opaque;
-
-    bdrv_detach_aio_context(s->test_file->bs);
-}
-
-static void blkverify_attach_aio_context(BlockDriverState *bs,
-                                         AioContext *new_context)
-{
-    BDRVBlkverifyState *s = bs->opaque;
-
-    bdrv_attach_aio_context(s->test_file->bs, new_context);
-}
-
 static void blkverify_refresh_filename(BlockDriverState *bs, QDict *options)
 {
     BDRVBlkverifyState *s = bs->opaque;
@@ -356,9 +340,6 @@ static BlockDriver bdrv_blkverify = {
     .bdrv_aio_writev                  = blkverify_aio_writev,
     .bdrv_aio_flush                   = blkverify_aio_flush,
 
-    .bdrv_attach_aio_context          = blkverify_attach_aio_context,
-    .bdrv_detach_aio_context          = blkverify_detach_aio_context,
-
     .is_filter                        = true,
     .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter,
 };
diff --git a/block/block-backend.c b/block/block-backend.c
index 16c9d5e0f..effa03892 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1,7 +1,7 @@
 /*
  * QEMU Block backends
  *
- * Copyright (C) 2014 Red Hat, Inc.
+ * Copyright (C) 2014-2016 Red Hat, Inc.
  *
  * Authors:
  *  Markus Armbruster <armbru@redhat.com>,
@@ -19,6 +19,7 @@
 #include "sysemu/sysemu.h"
 #include "qapi-event.h"
 #include "qemu/id.h"
+#include "trace.h"
 
 /* Number of coroutines to reserve per attached device model */
 #define COROUTINE_POOL_RESERVATION 64
@@ -34,6 +35,7 @@ struct BlockBackend {
     DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
     QTAILQ_ENTRY(BlockBackend) link;         /* for block_backends */
     QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
+    BlockBackendPublic public;
 
     void *dev;                  /* attached device model, if any */
     /* TODO change to DeviceState when all users are qdevified */
@@ -74,6 +76,7 @@ static const AIOCBInfo block_backend_aiocb_info = {
 };
 
 static void drive_info_del(DriveInfo *dinfo);
+static BlockBackend *bdrv_first_blk(BlockDriverState *bs);
 
 /* All BlockBackends */
 static QTAILQ_HEAD(, BlockBackend) block_backends =
@@ -90,9 +93,26 @@ static void blk_root_inherit_options(int *child_flags, QDict *child_options,
     /* We're not supposed to call this function for root nodes */
     abort();
 }
+static void blk_root_drained_begin(BdrvChild *child);
+static void blk_root_drained_end(BdrvChild *child);
+
+static void blk_root_change_media(BdrvChild *child, bool load);
+static void blk_root_resize(BdrvChild *child);
+
+static const char *blk_root_get_name(BdrvChild *child)
+{
+    return blk_name(child->opaque);
+}
 
 static const BdrvChildRole child_root = {
-    .inherit_options = blk_root_inherit_options,
+    .inherit_options    = blk_root_inherit_options,
+
+    .change_media       = blk_root_change_media,
+    .resize             = blk_root_resize,
+    .get_name           = blk_root_get_name,
+
+    .drained_begin      = blk_root_drained_begin,
+    .drained_end        = blk_root_drained_end,
 };
 
 /*
@@ -100,40 +120,26 @@ static const BdrvChildRole child_root = {
  * Store an error through @errp on failure, unless it's null.
  * Return the new BlockBackend on success, null on failure.
  */
-BlockBackend *blk_new(Error **errp)
+BlockBackend *blk_new(void)
 {
     BlockBackend *blk;
 
     blk = g_new0(BlockBackend, 1);
     blk->refcnt = 1;
-    notifier_list_init(&blk->remove_bs_notifiers);
-    notifier_list_init(&blk->insert_bs_notifiers);
-    QTAILQ_INSERT_TAIL(&block_backends, blk, link);
-    return blk;
-}
+    blk_set_enable_write_cache(blk, true);
 
-/*
- * Create a new BlockBackend with a new BlockDriverState attached.
- * Otherwise just like blk_new(), which see.
- */
-BlockBackend *blk_new_with_bs(Error **errp)
-{
-    BlockBackend *blk;
-    BlockDriverState *bs;
+    qemu_co_queue_init(&blk->public.throttled_reqs[0]);
+    qemu_co_queue_init(&blk->public.throttled_reqs[1]);
 
-    blk = blk_new(errp);
-    if (!blk) {
-        return NULL;
-    }
+    notifier_list_init(&blk->remove_bs_notifiers);
+    notifier_list_init(&blk->insert_bs_notifiers);
 
-    bs = bdrv_new_root();
-    blk->root = bdrv_root_attach_child(bs, "root", &child_root);
-    bs->blk = blk;
+    QTAILQ_INSERT_TAIL(&block_backends, blk, link);
     return blk;
 }
 
 /*
- * Calls blk_new_with_bs() and then calls bdrv_open() on the BlockDriverState.
+ * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
  *
  * Just as with bdrv_open(), after having called this function the reference to
  * @options belongs to the block layer (even on failure).
@@ -148,21 +154,16 @@ BlockBackend *blk_new_open(const char *filename, const char *reference,
                            QDict *options, int flags, Error **errp)
 {
     BlockBackend *blk;
-    int ret;
-
-    blk = blk_new_with_bs(errp);
-    if (!blk) {
-        QDECREF(options);
-        return NULL;
-    }
+    BlockDriverState *bs;
 
-    ret = bdrv_open(&blk->root->bs, filename, reference, options, flags, errp);
-    if (ret < 0) {
+    blk = blk_new();
+    bs = bdrv_open(filename, reference, options, flags, errp);
+    if (!bs) {
         blk_unref(blk);
         return NULL;
     }
 
-    blk_set_enable_write_cache(blk, true);
+    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk);
 
     return blk;
 }
@@ -177,10 +178,6 @@ static void blk_delete(BlockBackend *blk)
     }
     assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
     assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
-    if (blk->root_state.throttle_state) {
-        g_free(blk->root_state.throttle_group);
-        throttle_group_unref(blk->root_state.throttle_state);
-    }
     QTAILQ_REMOVE(&block_backends, blk, link);
     drive_info_del(blk->legacy_dinfo);
     block_acct_cleanup(&blk->stats);
@@ -267,28 +264,45 @@ BlockBackend *blk_next(BlockBackend *blk)
            : QTAILQ_FIRST(&monitor_block_backends);
 }
 
-/*
- * Iterates over all BlockDriverStates which are attached to a BlockBackend.
- * This function is for use by bdrv_next().
- *
- * @bs must be NULL or a BDS that is attached to a BB.
- */
-BlockDriverState *blk_next_root_bs(BlockDriverState *bs)
+/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
+ * the monitor or attached to a BlockBackend */
+BlockDriverState *bdrv_next(BdrvNextIterator *it)
 {
-    BlockBackend *blk;
+    BlockDriverState *bs;
 
-    if (bs) {
-        assert(bs->blk);
-        blk = bs->blk;
-    } else {
-        blk = NULL;
+    /* First, return all root nodes of BlockBackends. In order to avoid
+     * returning a BDS twice when multiple BBs refer to it, we only return it
+     * if the BB is the first one in the parent list of the BDS. */
+    if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
+        do {
+            it->blk = blk_all_next(it->blk);
+            bs = it->blk ? blk_bs(it->blk) : NULL;
+        } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));
+
+        if (bs) {
+            return bs;
+        }
+        it->phase = BDRV_NEXT_MONITOR_OWNED;
     }
 
+    /* Then return the monitor-owned BDSes without a BB attached. Ignore all
+     * BDSes that are attached to a BlockBackend here; they have been handled
+     * by the above block already */
     do {
-        blk = blk_all_next(blk);
-    } while (blk && !blk->root);
+        it->bs = bdrv_next_monitor_owned(it->bs);
+        bs = it->bs;
+    } while (bs && bdrv_has_blk(bs));
+
+    return bs;
+}
+
+BlockDriverState *bdrv_first(BdrvNextIterator *it)
+{
+    *it = (BdrvNextIterator) {
+        .phase = BDRV_NEXT_BACKEND_ROOTS,
+    };
 
-    return blk ? blk->root->bs : NULL;
+    return bdrv_next(it);
 }
 
 /*
@@ -375,6 +389,26 @@ BlockDriverState *blk_bs(BlockBackend *blk)
     return blk->root ? blk->root->bs : NULL;
 }
 
+static BlockBackend *bdrv_first_blk(BlockDriverState *bs)
+{
+    BdrvChild *child;
+    QLIST_FOREACH(child, &bs->parents, next_parent) {
+        if (child->role == &child_root) {
+            return child->opaque;
+        }
+    }
+
+    return NULL;
+}
+
+/*
+ * Returns true if @bs has an associated BlockBackend.
+ */
+bool bdrv_has_blk(BlockDriverState *bs)
+{
+    return bdrv_first_blk(bs) != NULL;
+}
+
 /*
  * Return @blk's DriveInfo if any, else null.
 */
@@ -411,17 +445,33 @@ BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
 }
 
 /*
+ * Returns a pointer to the publicly accessible fields of @blk.
+ */
+BlockBackendPublic *blk_get_public(BlockBackend *blk)
+{
+    return &blk->public;
+}
+
+/*
+ * Returns a BlockBackend given the associated @public fields.
+ */
+BlockBackend *blk_by_public(BlockBackendPublic *public)
+{
+    return container_of(public, BlockBackend, public);
+}
+
+/*
 * Disassociates the currently associated BlockDriverState from @blk.
 */
 void blk_remove_bs(BlockBackend *blk)
 {
-    assert(blk->root->bs->blk == blk);
-
     notifier_list_notify(&blk->remove_bs_notifiers, blk);
+    if (blk->public.throttle_state) {
+        throttle_timers_detach_aio_context(&blk->public.throttle_timers);
+    }
 
     blk_update_root_state(blk);
 
-    blk->root->bs->blk = NULL;
     bdrv_root_unref_child(blk->root);
     blk->root = NULL;
 }
@@ -431,12 +481,14 @@ void blk_remove_bs(BlockBackend *blk)
 */
 void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs)
 {
-    assert(!blk->root && !bs->blk);
     bdrv_ref(bs);
-    blk->root = bdrv_root_attach_child(bs, "root", &child_root);
-    bs->blk = blk;
+    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk);
 
     notifier_list_notify(&blk->insert_bs_notifiers, blk);
+    if (blk->public.throttle_state) {
+        throttle_timers_attach_aio_context(
+            &blk->public.throttle_timers, bdrv_get_aio_context(bs));
+    }
 }
 
 /*
@@ -525,6 +577,11 @@ void blk_dev_change_media_cb(BlockBackend *blk, bool load)
     }
 }
 
+static void blk_root_change_media(BdrvChild *child, bool load)
+{
+    blk_dev_change_media_cb(child->opaque, load);
+}
+
 /*
 * Does @blk's attached device model have removable media?
 * %true if no device model is attached.
@@ -579,8 +636,10 @@ bool blk_dev_is_medium_locked(BlockBackend *blk)
 /*
 * Notify @blk's attached device model of a backend size change.
 */
-void blk_dev_resize_cb(BlockBackend *blk)
+static void blk_root_resize(BdrvChild *child)
 {
+    BlockBackend *blk = child->opaque;
+
     if (blk->dev_ops && blk->dev_ops->resize_cb) {
         blk->dev_ops->resize_cb(blk->dev_opaque);
     }
@@ -683,34 +742,50 @@ static int blk_check_request(BlockBackend *blk, int64_t sector_num,
                                   nb_sectors * BDRV_SECTOR_SIZE);
 }
 
-static int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
-                                      unsigned int bytes, QEMUIOVector *qiov,
-                                      BdrvRequestFlags flags)
+int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
+                               unsigned int bytes, QEMUIOVector *qiov,
+                               BdrvRequestFlags flags)
 {
-    int ret = blk_check_byte_request(blk, offset, bytes);
+    int ret;
+
+    trace_blk_co_preadv(blk, blk_bs(blk), offset, bytes, flags);
+
+    ret = blk_check_byte_request(blk, offset, bytes);
     if (ret < 0) {
         return ret;
     }
 
-    return bdrv_co_do_preadv(blk_bs(blk), offset, bytes, qiov, flags);
+    /* throttling disk I/O */
+    if (blk->public.throttle_state) {
+        throttle_group_co_io_limits_intercept(blk, bytes, false);
+    }
+
+    return bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
 }
 
-static int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
-                                       unsigned int bytes, QEMUIOVector *qiov,
-                                       BdrvRequestFlags flags)
+int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
+                                unsigned int bytes, QEMUIOVector *qiov,
+                                BdrvRequestFlags flags)
 {
     int ret;
 
+    trace_blk_co_pwritev(blk, blk_bs(blk), offset, bytes, flags);
+
     ret = blk_check_byte_request(blk, offset, bytes);
     if (ret < 0) {
         return ret;
     }
 
+    /* throttling disk I/O */
+    if (blk->public.throttle_state) {
+        throttle_group_co_io_limits_intercept(blk, bytes, true);
+    }
+
     if (!blk->enable_write_cache) {
         flags |= BDRV_REQ_FUA;
     }
 
-    return bdrv_co_do_pwritev(blk_bs(blk), offset, bytes, qiov, flags);
+    return bdrv_co_pwritev(blk->root, offset, bytes, qiov, flags);
 }
 
 typedef struct BlkRwCo {
@@ -761,8 +836,8 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
         .ret    = NOT_DONE,
     };
 
-    co = qemu_coroutine_create(co_entry);
-    qemu_coroutine_enter(co, &rwco);
+    co = qemu_coroutine_create(co_entry, &rwco);
+    qemu_coroutine_enter(co);
 
     aio_context = blk_get_aio_context(blk);
     while (rwco.ret == NOT_DONE) {
@@ -772,55 +847,32 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
     return rwco.ret;
 }
 
-static int blk_rw(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
-                  int nb_sectors, CoroutineEntry co_entry,
-                  BdrvRequestFlags flags)
+int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf,
+                          int count)
 {
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return -EINVAL;
-    }
-
-    return blk_prw(blk, sector_num << BDRV_SECTOR_BITS, buf,
-                   nb_sectors << BDRV_SECTOR_BITS, co_entry, flags);
-}
-
-int blk_read(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
-             int nb_sectors)
-{
-    return blk_rw(blk, sector_num, buf, nb_sectors, blk_read_entry, 0);
-}
-
-int blk_read_unthrottled(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
-                         int nb_sectors)
-{
-    BlockDriverState *bs = blk_bs(blk);
-    bool enabled;
     int ret;
 
-    ret = blk_check_request(blk, sector_num, nb_sectors);
+    ret = blk_check_byte_request(blk, offset, count);
     if (ret < 0) {
         return ret;
     }
 
-    enabled = bs->io_limits_enabled;
-    bs->io_limits_enabled = false;
-    ret = blk_read(blk, sector_num, buf, nb_sectors);
-    bs->io_limits_enabled = enabled;
+    blk_root_drained_begin(blk->root);
+    ret = blk_pread(blk, offset, buf, count);
+    blk_root_drained_end(blk->root);
     return ret;
 }
 
-int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf,
-              int nb_sectors)
+int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                      int count, BdrvRequestFlags flags)
 {
-    return blk_rw(blk, sector_num, (uint8_t*) buf, nb_sectors,
-                  blk_write_entry, 0);
+    return blk_prw(blk, offset, NULL, count, blk_write_entry,
+                   flags | BDRV_REQ_ZERO_WRITE);
 }
 
-int blk_write_zeroes(BlockBackend *blk, int64_t sector_num,
-                     int nb_sectors, BdrvRequestFlags flags)
+int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
 {
-    return blk_rw(blk, sector_num, NULL, nb_sectors, blk_write_entry,
-                  flags | BDRV_REQ_ZERO_WRITE);
+    return bdrv_make_zero(blk->root, flags);
 }
 
 static void error_callback_bh(void *opaque)
@@ -898,8 +950,8 @@ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
     acb->bh = NULL;
     acb->has_returned = false;
 
-    co = qemu_coroutine_create(co_entry);
-    qemu_coroutine_enter(co, acb);
+    co = qemu_coroutine_create(co_entry, acb);
+    qemu_coroutine_enter(co);
 
     acb->has_returned = true;
     if (acb->rwco.ret != NOT_DONE) {
@@ -932,18 +984,12 @@ static void blk_aio_write_entry(void *opaque)
     blk_aio_complete(acb);
 }
 
-BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num,
-                                 int nb_sectors, BdrvRequestFlags flags,
-                                 BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                                  int count, BdrvRequestFlags flags,
+                                  BlockCompletionFunc *cb, void *opaque)
 {
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return blk_abort_aio_request(blk, cb, opaque, -EINVAL);
-    }
-
-    return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS,
-                        nb_sectors << BDRV_SECTOR_BITS, NULL,
-                        blk_aio_write_entry, flags | BDRV_REQ_ZERO_WRITE,
-                        cb, opaque);
+    return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
+                        flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
 }
 
 int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
@@ -955,9 +1001,11 @@ int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
     return count;
 }
 
-int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count)
+int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count,
+               BdrvRequestFlags flags)
 {
-    int ret = blk_prw(blk, offset, (void*) buf, count, blk_write_entry, 0);
+    int ret = blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
+                      flags);
     if (ret < 0) {
         return ret;
     }
@@ -991,30 +1039,20 @@ int64_t blk_nb_sectors(BlockBackend *blk)
     return bdrv_nb_sectors(blk_bs(blk));
 }
 
-BlockAIOCB *blk_aio_readv(BlockBackend *blk, int64_t sector_num,
-                          QEMUIOVector *iov, int nb_sectors,
-                          BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
+                           QEMUIOVector *qiov, BdrvRequestFlags flags,
+                           BlockCompletionFunc *cb, void *opaque)
 {
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return blk_abort_aio_request(blk, cb, opaque, -EINVAL);
-    }
-
-    assert(nb_sectors << BDRV_SECTOR_BITS == iov->size);
-    return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, iov->size, iov,
-                        blk_aio_read_entry, 0, cb, opaque);
+    return blk_aio_prwv(blk, offset, qiov->size, qiov,
+                        blk_aio_read_entry, flags, cb, opaque);
 }
 
-BlockAIOCB *blk_aio_writev(BlockBackend *blk, int64_t sector_num,
-                           QEMUIOVector *iov, int nb_sectors,
-                           BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
+                            QEMUIOVector *qiov, BdrvRequestFlags flags,
+                            BlockCompletionFunc *cb, void *opaque)
 {
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return blk_abort_aio_request(blk, cb, opaque, -EINVAL);
-    }
-
-    assert(nb_sectors << BDRV_SECTOR_BITS == iov->size);
-    return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, iov->size, iov,
-                        blk_aio_write_entry, 0, cb, opaque);
+    return blk_aio_prwv(blk, offset, qiov->size, qiov,
+                        blk_aio_write_entry, flags, cb, opaque);
 }
 
 BlockAIOCB *blk_aio_flush(BlockBackend *blk,
@@ -1027,16 +1065,16 @@ BlockAIOCB *blk_aio_flush(BlockBackend *blk,
     return bdrv_aio_flush(blk_bs(blk), cb, opaque);
 }
 
-BlockAIOCB *blk_aio_discard(BlockBackend *blk,
-                            int64_t sector_num, int nb_sectors,
-                            BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk,
+                             int64_t offset, int count,
+                             BlockCompletionFunc *cb, void *opaque)
 {
-    int ret = blk_check_request(blk, sector_num, nb_sectors);
+    int ret = blk_check_byte_request(blk, offset, count);
     if (ret < 0) {
         return blk_abort_aio_request(blk, cb, opaque, ret);
     }
 
-    return bdrv_aio_discard(blk_bs(blk), sector_num, nb_sectors, cb, opaque);
+    return bdrv_aio_pdiscard(blk_bs(blk), offset, count, cb, opaque);
 }
 
 void blk_aio_cancel(BlockAIOCB *acb)
@@ -1049,20 +1087,6 @@ void blk_aio_cancel_async(BlockAIOCB *acb)
     bdrv_aio_cancel_async(acb);
 }
 
-int blk_aio_multiwrite(BlockBackend *blk, BlockRequest *reqs, int num_reqs)
-{
-    int i, ret;
-
-    for (i = 0; i < num_reqs; i++) {
-        ret = blk_check_request(blk, reqs[i].sector, reqs[i].nb_sectors);
-        if (ret < 0) {
-            return ret;
-        }
-    }
-
-    return bdrv_aio_multiwrite(blk_bs(blk), reqs, num_reqs);
-}
-
 int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
 {
     if (!blk_is_available(blk)) {
@@ -1082,14 +1106,14 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
     return bdrv_aio_ioctl(blk_bs(blk), req, buf, cb, opaque);
 }
 
-int blk_co_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors)
+int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int count)
 {
-    int ret = blk_check_request(blk, sector_num, nb_sectors);
+    int ret = blk_check_byte_request(blk, offset, count);
     if (ret < 0) {
         return ret;
     }
 
-    return bdrv_co_discard(blk_bs(blk), sector_num, nb_sectors);
+    return bdrv_co_pdiscard(blk_bs(blk), offset, count);
 }
 
 int blk_co_flush(BlockBackend *blk)
@@ -1149,6 +1173,7 @@ BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
         return BLOCK_ERROR_ACTION_REPORT;
     case BLOCKDEV_ON_ERROR_IGNORE:
         return BLOCK_ERROR_ACTION_IGNORE;
+    case BLOCKDEV_ON_ERROR_AUTO:
     default:
         abort();
     }
@@ -1284,15 +1309,16 @@ int blk_get_flags(BlockBackend *blk)
     }
 }
 
-int blk_get_max_transfer_length(BlockBackend *blk)
+/* Returns the maximum transfer length, in bytes; guaranteed nonzero */
+uint32_t blk_get_max_transfer(BlockBackend *blk)
 {
     BlockDriverState *bs = blk_bs(blk);
+    uint32_t max = 0;
 
     if (bs) {
-        return bs->bl.max_transfer_length;
-    } else {
-        return 0;
+        max = bs->bl.max_transfer;
     }
+    return MIN_NON_ZERO(max, INT_MAX);
 }
 
 int blk_get_max_iov(BlockBackend *blk)
@@ -1375,7 +1401,14 @@ void blk_set_aio_context(BlockBackend *blk, AioContext *new_context)
     BlockDriverState *bs = blk_bs(blk);
 
     if (bs) {
+        if (blk->public.throttle_state) {
+            throttle_timers_detach_aio_context(&blk->public.throttle_timers);
+        }
         bdrv_set_aio_context(bs, new_context);
+        if (blk->public.throttle_state) {
+            throttle_timers_attach_aio_context(&blk->public.throttle_timers,
+                                               new_context);
+        }
     }
 }
 
@@ -1444,15 +1477,10 @@ void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
     return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
 }
 
-int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t sector_num,
-                                     int nb_sectors, BdrvRequestFlags flags)
+int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                                      int count, BdrvRequestFlags flags)
 {
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return -EINVAL;
-    }
-
-    return blk_co_pwritev(blk, sector_num << BDRV_SECTOR_BITS,
-                          nb_sectors << BDRV_SECTOR_BITS, NULL,
+    return blk_co_pwritev(blk, offset, count, NULL,
                           flags | BDRV_REQ_ZERO_WRITE);
 }
 
@@ -1476,14 +1504,14 @@ int blk_truncate(BlockBackend *blk, int64_t offset)
     return bdrv_truncate(blk_bs(blk), offset);
 }
 
-int blk_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors)
+int blk_pdiscard(BlockBackend *blk, int64_t offset, int count)
 {
-    int ret = blk_check_request(blk, sector_num, nb_sectors);
+    int ret = blk_check_byte_request(blk, offset, count);
     if (ret < 0) {
         return ret;
     }
 
-    return bdrv_discard(blk_bs(blk), sector_num, nb_sectors);
+    return bdrv_pdiscard(blk_bs(blk), offset, count);
 }
 
 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
@@ -1545,19 +1573,6 @@ void blk_update_root_state(BlockBackend *blk)
     blk->root_state.open_flags    = blk->root->bs->open_flags;
     blk->root_state.read_only     = blk->root->bs->read_only;
     blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes;
-
-    if (blk->root_state.throttle_group) {
-        g_free(blk->root_state.throttle_group);
-        throttle_group_unref(blk->root_state.throttle_state);
-    }
-    if (blk->root->bs->throttle_state) {
-        const char *name = throttle_group_get_name(blk->root->bs);
-        blk->root_state.throttle_group = g_strdup(name);
-        blk->root_state.throttle_state = throttle_group_incref(name);
-    } else {
-        blk->root_state.throttle_group = NULL;
-        blk->root_state.throttle_state = NULL;
-    }
 }
 
 /*
@@ -1568,9 +1583,6 @@ void blk_update_root_state(BlockBackend *blk)
 void blk_apply_root_state(BlockBackend *blk, BlockDriverState *bs)
 {
     bs->detect_zeroes = blk->root_state.detect_zeroes;
-    if (blk->root_state.throttle_group) {
-        bdrv_io_limits_enable(bs, blk->root_state.throttle_group);
-    }
 }
 
 /*
@@ -1633,3 +1645,62 @@ int blk_flush_all(void)
 
     return result;
 }
+
+
+/* throttling disk I/O limits */
+void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
+{
+    throttle_group_config(blk, cfg);
+}
+
+void blk_io_limits_disable(BlockBackend *blk)
+{
+    assert(blk->public.throttle_state);
+    bdrv_drained_begin(blk_bs(blk));
+    throttle_group_unregister_blk(blk);
+    bdrv_drained_end(blk_bs(blk));
+}
+
+/* should be called before blk_set_io_limits if a limit is set */
+void blk_io_limits_enable(BlockBackend *blk, const char *group)
+{
+    assert(!blk->public.throttle_state);
+    throttle_group_register_blk(blk, group);
+}
+
+void blk_io_limits_update_group(BlockBackend *blk, const char *group)
+{
+    /* this BB is not part of any group */
+    if (!blk->public.throttle_state) {
+        return;
+    }
+
+    /* this BB is a part of the same group than the one we want */
+    if (!g_strcmp0(throttle_group_get_name(blk), group)) {
+        return;
+    }
+
+    /* need to change the group this bs belong to */
+    blk_io_limits_disable(blk);
+    blk_io_limits_enable(blk, group);
+}
+
+static void blk_root_drained_begin(BdrvChild *child)
+{
+    BlockBackend *blk = child->opaque;
+
+    /* Note that blk->root may not be accessible here yet if we are just
+     * attaching to a BlockDriverState that is drained. Use child instead. */
+
+    if (blk->public.io_limits_disabled++ == 0) {
+        throttle_group_restart_blk(blk);
+    }
+}
+
+static void blk_root_drained_end(BdrvChild *child)
+{
+    BlockBackend *blk = child->opaque;
+
+    assert(blk->public.io_limits_disabled);
+    --blk->public.io_limits_disabled;
+}
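
block-backend.c replaces blk_next_root_bs() with a two-phase iterator. Typical use of the new interface, as suggested by the bdrv_first()/bdrv_next() pair added above (the loop body is illustrative; the types come from the QEMU headers):

    BdrvNextIterator it;
    BlockDriverState *bs;

    /* visit every top-level BDS exactly once: first the root nodes of all
     * BlockBackends, then monitor-owned nodes without a BB attached */
    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        /* ... operate on bs ... */
    }
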
diff --git a/block/bochs.c b/block/bochs.c
index af8b7abdf..8c9652ebe 100644
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -27,6 +27,7 @@
 #include "qemu-common.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"
 
 /**************************************************************/
 
@@ -103,9 +104,9 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,
     struct bochs_header bochs;
     int ret;
 
-    bs->read_only = 1; // no write support yet
+    bs->read_only = true; /* no write support yet */
 
-    ret = bdrv_pread(bs->file->bs, 0, &bochs, sizeof(bochs));
+    ret = bdrv_pread(bs->file, 0, &bochs, sizeof(bochs));
     if (ret < 0) {
         return ret;
     }
@@ -139,7 +140,7 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,
         return -ENOMEM;
     }
 
-    ret = bdrv_pread(bs->file->bs, le32_to_cpu(bochs.header), s->catalog_bitmap,
+    ret = bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap,
                      s->catalog_size * 4);
     if (ret < 0) {
         goto fail;
@@ -187,6 +188,11 @@ fail:
     return ret;
 }
 
+static void bochs_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+    bs->bl.request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O */
+}
+
 static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
 {
     BDRVBochsState *s = bs->opaque;
@@ -208,7 +214,7 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
         (s->extent_blocks + s->bitmap_blocks));
 
     /* read in bitmap for current extent */
-    ret = bdrv_pread(bs->file->bs, bitmap_offset + (extent_offset / 8),
+    ret = bdrv_pread(bs->file, bitmap_offset + (extent_offset / 8),
                      &bitmap_entry, 1);
     if (ret < 0) {
         return ret;
@@ -221,38 +227,52 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
     return bitmap_offset + (512 * (s->bitmap_blocks + extent_offset));
 }
 
-static int bochs_read(BlockDriverState *bs, int64_t sector_num,
-                      uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+bochs_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                QEMUIOVector *qiov, int flags)
 {
+    BDRVBochsState *s = bs->opaque;
+    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
+    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+    uint64_t bytes_done = 0;
+    QEMUIOVector local_qiov;
     int ret;
 
+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+    qemu_iovec_init(&local_qiov, qiov->niov);
+    qemu_co_mutex_lock(&s->lock);
+
     while (nb_sectors > 0) {
         int64_t block_offset = seek_to_sector(bs, sector_num);
         if (block_offset < 0) {
-            return block_offset;
-        } else if (block_offset > 0) {
-            ret = bdrv_pread(bs->file->bs, block_offset, buf, 512);
+            ret = block_offset;
+            goto fail;
+        }
+
+        qemu_iovec_reset(&local_qiov);
+        qemu_iovec_concat(&local_qiov, qiov, bytes_done, 512);
+
+        if (block_offset > 0) {
+            ret = bdrv_co_preadv(bs->file, block_offset, 512,
+                                 &local_qiov, 0);
             if (ret < 0) {
-                return ret;
+                goto fail;
             }
         } else {
-            memset(buf, 0, 512);
+            qemu_iovec_memset(&local_qiov, 0, 0, 512);
         }
         nb_sectors--;
         sector_num++;
-        buf += 512;
+        bytes_done += 512;
     }
-    return 0;
-}
 
-static coroutine_fn int bochs_co_read(BlockDriverState *bs, int64_t sector_num,
-                                      uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVBochsState *s = bs->opaque;
-    qemu_co_mutex_lock(&s->lock);
-    ret = bochs_read(bs, sector_num, buf, nb_sectors);
+    ret = 0;
+fail:
     qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&local_qiov);
+
     return ret;
 }
 
@@ -267,7 +287,8 @@ static BlockDriver bdrv_bochs = {
     .instance_size	= sizeof(BDRVBochsState),
     .bdrv_probe		= bochs_probe,
     .bdrv_open		= bochs_open,
-    .bdrv_read		= bochs_co_read,
+    .bdrv_refresh_limits = bochs_refresh_limits,
+    .bdrv_co_preadv = bochs_co_preadv,
     .bdrv_close		= bochs_close,
 };
diff --git a/block/cloop.c b/block/cloop.c
index a84f14019..7b75f7ef7 100644
--- a/block/cloop.c
+++ b/block/cloop.c
@@ -26,6 +26,7 @@
 #include "qemu-common.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"
 #include <zlib.h>
 
 /* Maximum compressed block size */
@@ -65,10 +66,10 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
     uint32_t offsets_size, max_compressed_block_size = 1, i;
     int ret;
 
-    bs->read_only = 1;
+    bs->read_only = true;
 
     /* read header */
-    ret = bdrv_pread(bs->file->bs, 128, &s->block_size, 4);
+    ret = bdrv_pread(bs->file, 128, &s->block_size, 4);
     if (ret < 0) {
         return ret;
     }
@@ -94,7 +95,7 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
         return -EINVAL;
     }
 
-    ret = bdrv_pread(bs->file->bs, 128 + 4, &s->n_blocks, 4);
+    ret = bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4);
     if (ret < 0) {
         return ret;
     }
@@ -125,7 +126,7 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
         return -ENOMEM;
     }
 
-    ret = bdrv_pread(bs->file->bs, 128 + 4 + 4, s->offsets, offsets_size);
+    ret = bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size);
     if (ret < 0) {
         goto fail;
     }
@@ -197,6 +198,11 @@ fail:
     return ret;
 }
 
+static void cloop_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+    bs->bl.request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O */
+}
+
 static inline int cloop_read_block(BlockDriverState *bs, int block_num)
 {
     BDRVCloopState *s = bs->opaque;
@@ -205,7 +211,7 @@ static inline int cloop_read_block(BlockDriverState *bs, int block_num)
         int ret;
         uint32_t bytes = s->offsets[block_num + 1] - s->offsets[block_num];
 
-        ret = bdrv_pread(bs->file->bs, s->offsets[block_num],
+        ret = bdrv_pread(bs->file, s->offsets[block_num],
                          s->compressed_block, bytes);
         if (ret != bytes) {
             return -1;
@@ -229,33 +235,38 @@ static inline int cloop_read_block(BlockDriverState *bs, int block_num)
     return 0;
 }
 
-static int cloop_read(BlockDriverState *bs, int64_t sector_num,
-                      uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+cloop_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                QEMUIOVector *qiov, int flags)
 {
     BDRVCloopState *s = bs->opaque;
-    int i;
+    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
+    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+    int ret, i;
+
+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+    qemu_co_mutex_lock(&s->lock);
 
     for (i = 0; i < nb_sectors; i++) {
+        void *data;
         uint32_t sector_offset_in_block =
             ((sector_num + i) % s->sectors_per_block),
             block_num = (sector_num + i) / s->sectors_per_block;
+
         if (cloop_read_block(bs, block_num) != 0) {
-            return -1;
+            ret = -EIO;
+            goto fail;
         }
-        memcpy(buf + i * 512,
-               s->uncompressed_block + sector_offset_in_block * 512, 512);
+
+        data = s->uncompressed_block + sector_offset_in_block * 512;
+        qemu_iovec_from_buf(qiov, i * 512, data, 512);
     }
-    return 0;
-}
 
-static coroutine_fn int cloop_co_read(BlockDriverState *bs, int64_t sector_num,
-                                      uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVCloopState *s = bs->opaque;
-    qemu_co_mutex_lock(&s->lock);
-    ret = cloop_read(bs, sector_num, buf, nb_sectors);
+    ret = 0;
+fail:
     qemu_co_mutex_unlock(&s->lock);
+
     return ret;
 }
 
@@ -273,7 +284,8 @@ static BlockDriver bdrv_cloop = {
     .instance_size  = sizeof(BDRVCloopState),
     .bdrv_probe     = cloop_probe,
     .bdrv_open      = cloop_open,
-    .bdrv_read      = cloop_co_read,
+    .bdrv_refresh_limits = cloop_refresh_limits,
+    .bdrv_co_preadv = cloop_co_preadv,
     .bdrv_close     = cloop_close,
 };
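
bochs and cloop both convert .bdrv_read into .bdrv_co_preadv by copying each decoded sector into the caller's scatter/gather list instead of a flat buffer. The core of that pattern, assuming a 512-byte sector held in a local buffer named block (a simplified stand-in for s->uncompressed_block):

    /* copy one 512-byte sector into the guest's QEMUIOVector, at byte
     * position i * 512 within the request */
    qemu_iovec_from_buf(qiov, i * 512,
                        block + sector_offset_in_block * 512, 512);
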
g_free(s->backing_file_str); + blk_unref(s->top); + blk_unref(s->base); block_job_completed(&s->common, ret); g_free(data); } @@ -102,42 +112,39 @@ static void coroutine_fn commit_run(void *opaque) { CommitBlockJob *s = opaque; CommitCompleteData *data; - BlockDriverState *top = s->top; - BlockDriverState *base = s->base; int64_t sector_num, end; + uint64_t delay_ns = 0; int ret = 0; int n = 0; void *buf = NULL; int bytes_written = 0; int64_t base_len; - ret = s->common.len = bdrv_getlength(top); + ret = s->common.len = blk_getlength(s->top); if (s->common.len < 0) { goto out; } - ret = base_len = bdrv_getlength(base); + ret = base_len = blk_getlength(s->base); if (base_len < 0) { goto out; } if (base_len < s->common.len) { - ret = bdrv_truncate(base, s->common.len); + ret = blk_truncate(s->base, s->common.len); if (ret) { goto out; } } end = s->common.len >> BDRV_SECTOR_BITS; - buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE); + buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE); for (sector_num = 0; sector_num < end; sector_num += n) { - uint64_t delay_ns = 0; bool copy; -wait: /* Note that even when no rate limit is applied we need to yield * with no pending I/O here so that bdrv_drain_all() returns. */ @@ -146,25 +153,20 @@ wait: break; } /* Copy if allocated above the base */ - ret = bdrv_is_allocated_above(top, base, sector_num, + ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base), + sector_num, COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n); copy = (ret == 1); trace_commit_one_iteration(s, sector_num, n, ret); if (copy) { - if (s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, n); - if (delay_ns > 0) { - goto wait; - } - } - ret = commit_populate(top, base, sector_num, n, buf); + ret = commit_populate(s->top, s->base, sector_num, n, buf); bytes_written += n * BDRV_SECTOR_SIZE; } if (ret < 0) { - if (s->on_error == BLOCKDEV_ON_ERROR_STOP || - s->on_error == BLOCKDEV_ON_ERROR_REPORT|| - (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == -ENOSPC)) { + BlockErrorAction action = + block_job_error_action(&s->common, false, s->on_error, -ret); + if (action == BLOCK_ERROR_ACTION_REPORT) { goto out; } else { n = 0; @@ -173,6 +175,10 @@ wait: } /* Publish progress */ s->common.offset += n * BDRV_SECTOR_SIZE; + + if (copy && s->common.speed) { + delay_ns = ratelimit_calculate_delay(&s->limit, n); + } } ret = 0; @@ -202,8 +208,8 @@ static const BlockJobDriver commit_job_driver = { .set_speed = commit_set_speed, }; -void commit_start(BlockDriverState *bs, BlockDriverState *base, - BlockDriverState *top, int64_t speed, +void commit_start(const char *job_id, BlockDriverState *bs, + BlockDriverState *base, BlockDriverState *top, int64_t speed, BlockdevOnError on_error, BlockCompletionFunc *cb, void *opaque, const char *backing_file_str, Error **errp) { @@ -214,13 +220,6 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, BlockDriverState *overlay_bs; Error *local_err = NULL; - if ((on_error == BLOCKDEV_ON_ERROR_STOP || - on_error == BLOCKDEV_ON_ERROR_ENOSPC) && - (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { - error_setg(errp, "Invalid parameter combination"); - return; - } - assert(top != bs); if (top == base) { error_setg(errp, "Invalid files for merge: top and base are the same"); @@ -234,6 +233,12 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, return; } + s = block_job_create(job_id, &commit_job_driver, bs, speed, + cb, opaque, errp); + if (!s) { + return; + } + orig_base_flags = bdrv_get_flags(base); orig_overlay_flags = 
bdrv_get_flags(overlay_bs); @@ -250,18 +255,18 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, bdrv_reopen_multiple(reopen_queue, &local_err); if (local_err != NULL) { error_propagate(errp, local_err); + block_job_unref(&s->common); return; } } - s = block_job_create(&commit_job_driver, bs, speed, cb, opaque, errp); - if (!s) { - return; - } + s->base = blk_new(); + blk_insert_bs(s->base, base); + + s->top = blk_new(); + blk_insert_bs(s->top, top); - s->base = base; - s->top = top; s->active = bs; s->base_flags = orig_base_flags; @@ -270,8 +275,129 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, s->backing_file_str = g_strdup(backing_file_str); s->on_error = on_error; - s->common.co = qemu_coroutine_create(commit_run); + s->common.co = qemu_coroutine_create(commit_run, s); trace_commit_start(bs, base, top, s, s->common.co, opaque); - qemu_coroutine_enter(s->common.co, s); + qemu_coroutine_enter(s->common.co); +} + + +#define COMMIT_BUF_SECTORS 2048 + +/* commit COW file into the raw image */ +int bdrv_commit(BlockDriverState *bs) +{ + BlockBackend *src, *backing; + BlockDriver *drv = bs->drv; + int64_t sector, total_sectors, length, backing_length; + int n, ro, open_flags; + int ret = 0; + uint8_t *buf = NULL; + + if (!drv) + return -ENOMEDIUM; + + if (!bs->backing) { + return -ENOTSUP; + } + + if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) || + bdrv_op_is_blocked(bs->backing->bs, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) { + return -EBUSY; + } + + ro = bs->backing->bs->read_only; + open_flags = bs->backing->bs->open_flags; + + if (ro) { + if (bdrv_reopen(bs->backing->bs, open_flags | BDRV_O_RDWR, NULL)) { + return -EACCES; + } + } + + src = blk_new(); + blk_insert_bs(src, bs); + + backing = blk_new(); + blk_insert_bs(backing, bs->backing->bs); + + length = blk_getlength(src); + if (length < 0) { + ret = length; + goto ro_cleanup; + } + + backing_length = blk_getlength(backing); + if (backing_length < 0) { + ret = backing_length; + goto ro_cleanup; + } + + /* If our top snapshot is larger than the backing file image, + * grow the backing file image if possible. If not possible, + * we must return an error */ + if (length > backing_length) { + ret = blk_truncate(backing, length); + if (ret < 0) { + goto ro_cleanup; + } + } + + total_sectors = length >> BDRV_SECTOR_BITS; + + /* blk_try_blockalign() for src will choose an alignment that works for + * backing as well, so no need to compare the alignment manually. */ + buf = blk_try_blockalign(src, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE); + if (buf == NULL) { + ret = -ENOMEM; + goto ro_cleanup; + } + + for (sector = 0; sector < total_sectors; sector += n) { + ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n); + if (ret < 0) { + goto ro_cleanup; + } + if (ret) { + ret = blk_pread(src, sector * BDRV_SECTOR_SIZE, buf, + n * BDRV_SECTOR_SIZE); + if (ret < 0) { + goto ro_cleanup; + } + + ret = blk_pwrite(backing, sector * BDRV_SECTOR_SIZE, buf, + n * BDRV_SECTOR_SIZE, 0); + if (ret < 0) { + goto ro_cleanup; + } + } + } + + if (drv->bdrv_make_empty) { + ret = drv->bdrv_make_empty(bs); + if (ret < 0) { + goto ro_cleanup; + } + blk_flush(src); + } + + /* + * Make sure all data we wrote to the backing device is actually + * stable on disk. 
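 * (Illustrative aside: this flush is also ordered before ro_cleanup below,
 * so the copied data reaches the backing file before it may be reopened
 * read-only again.)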
+ */ + blk_flush(backing); + + ret = 0; +ro_cleanup: + qemu_vfree(buf); + + blk_unref(src); + blk_unref(backing); + + if (ro) { + /* ignoring error return here */ + bdrv_reopen(bs->backing->bs, open_flags & ~BDRV_O_RDWR, NULL); + } + + return ret; } diff --git a/block/crypto.c b/block/crypto.c index 1903e84fb..7f61e1268 100644 --- a/block/crypto.c +++ b/block/crypto.c @@ -64,7 +64,7 @@ static ssize_t block_crypto_read_func(QCryptoBlock *block, BlockDriverState *bs = opaque; ssize_t ret; - ret = bdrv_pread(bs->file->bs, offset, buf, buflen); + ret = bdrv_pread(bs->file, offset, buf, buflen); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read encryption header"); return ret; @@ -91,7 +91,7 @@ static ssize_t block_crypto_write_func(QCryptoBlock *block, struct BlockCryptoCreateData *data = opaque; ssize_t ret; - ret = blk_pwrite(data->blk, offset, buf, buflen); + ret = blk_pwrite(data->blk, offset, buf, buflen, 0); if (ret < 0) { error_setg_errno(errp, -ret, "Could not write encryption header"); return ret; @@ -193,18 +193,16 @@ block_crypto_open_opts_init(QCryptoBlockFormat format, QemuOpts *opts, Error **errp) { - OptsVisitor *ov; + Visitor *v; QCryptoBlockOpenOptions *ret = NULL; Error *local_err = NULL; - Error *end_err = NULL; ret = g_new0(QCryptoBlockOpenOptions, 1); ret->format = format; - ov = opts_visitor_new(opts); + v = opts_visitor_new(opts); - visit_start_struct(opts_get_visitor(ov), - NULL, NULL, 0, &local_err); + visit_start_struct(v, NULL, NULL, 0, &local_err); if (local_err) { goto out; } @@ -212,16 +210,18 @@ block_crypto_open_opts_init(QCryptoBlockFormat format, switch (format) { case Q_CRYPTO_BLOCK_FORMAT_LUKS: visit_type_QCryptoBlockOptionsLUKS_members( - opts_get_visitor(ov), &ret->u.luks, &local_err); + v, &ret->u.luks, &local_err); break; default: error_setg(&local_err, "Unsupported block format %d", format); break; } + if (!local_err) { + visit_check_struct(v, &local_err); + } - visit_end_struct(opts_get_visitor(ov), &end_err); - error_propagate(&local_err, end_err); + visit_end_struct(v, NULL); out: if (local_err) { @@ -229,7 +229,7 @@ block_crypto_open_opts_init(QCryptoBlockFormat format, qapi_free_QCryptoBlockOpenOptions(ret); ret = NULL; } - opts_visitor_cleanup(ov); + visit_free(v); return ret; } @@ -239,18 +239,16 @@ block_crypto_create_opts_init(QCryptoBlockFormat format, QemuOpts *opts, Error **errp) { - OptsVisitor *ov; + Visitor *v; QCryptoBlockCreateOptions *ret = NULL; Error *local_err = NULL; - Error *end_err = NULL; ret = g_new0(QCryptoBlockCreateOptions, 1); ret->format = format; - ov = opts_visitor_new(opts); + v = opts_visitor_new(opts); - visit_start_struct(opts_get_visitor(ov), - NULL, NULL, 0, &local_err); + visit_start_struct(v, NULL, NULL, 0, &local_err); if (local_err) { goto out; } @@ -258,16 +256,18 @@ block_crypto_create_opts_init(QCryptoBlockFormat format, switch (format) { case Q_CRYPTO_BLOCK_FORMAT_LUKS: visit_type_QCryptoBlockCreateOptionsLUKS_members( - opts_get_visitor(ov), &ret->u.luks, &local_err); + v, &ret->u.luks, &local_err); break; default: error_setg(&local_err, "Unsupported block format %d", format); break; } + if (!local_err) { + visit_check_struct(v, &local_err); + } - visit_end_struct(opts_get_visitor(ov), &end_err); - error_propagate(&local_err, end_err); + visit_end_struct(v, NULL); out: if (local_err) { @@ -275,7 +275,7 @@ block_crypto_create_opts_init(QCryptoBlockFormat format, qapi_free_QCryptoBlockCreateOptions(ret); ret = NULL; } - opts_visitor_cleanup(ov); + visit_free(v); return ret; } @@ -320,8 +320,8 @@ 
static int block_crypto_open_generic(QCryptoBlockFormat format, goto cleanup; } - bs->encrypted = 1; - bs->valid_key = 1; + bs->encrypted = true; + bs->valid_key = true; ret = 0; cleanup: @@ -426,7 +426,7 @@ block_crypto_co_readv(BlockDriverState *bs, int64_t sector_num, qemu_iovec_reset(&hd_qiov); qemu_iovec_add(&hd_qiov, cipher_data, cur_nr_sectors * 512); - ret = bdrv_co_readv(bs->file->bs, + ret = bdrv_co_readv(bs->file, payload_offset + sector_num, cur_nr_sectors, &hd_qiov); if (ret < 0) { @@ -505,7 +505,7 @@ block_crypto_co_writev(BlockDriverState *bs, int64_t sector_num, qemu_iovec_reset(&hd_qiov); qemu_iovec_add(&hd_qiov, cipher_data, cur_nr_sectors * 512); - ret = bdrv_co_writev(bs->file->bs, + ret = bdrv_co_writev(bs->file, payload_offset + sector_num, cur_nr_sectors, &hd_qiov); if (ret < 0) { @@ -563,6 +563,53 @@ static int block_crypto_create_luks(const char *filename, filename, opts, errp); } +static int block_crypto_get_info_luks(BlockDriverState *bs, + BlockDriverInfo *bdi) +{ + BlockDriverInfo subbdi; + int ret; + + ret = bdrv_get_info(bs->file->bs, &subbdi); + if (ret != 0) { + return ret; + } + + bdi->unallocated_blocks_are_zero = false; + bdi->can_write_zeroes_with_unmap = false; + bdi->cluster_size = subbdi.cluster_size; + + return 0; +} + +static ImageInfoSpecific * +block_crypto_get_specific_info_luks(BlockDriverState *bs) +{ + BlockCrypto *crypto = bs->opaque; + ImageInfoSpecific *spec_info; + QCryptoBlockInfo *info; + + info = qcrypto_block_get_info(crypto->block, NULL); + if (!info) { + return NULL; + } + if (info->format != Q_CRYPTO_BLOCK_FORMAT_LUKS) { + qapi_free_QCryptoBlockInfo(info); + return NULL; + } + + spec_info = g_new(ImageInfoSpecific, 1); + spec_info->type = IMAGE_INFO_SPECIFIC_KIND_LUKS; + spec_info->u.luks.data = g_new(QCryptoBlockInfoLUKS, 1); + *spec_info->u.luks.data = info->u.luks; + + /* Blank out pointers we've just stolen to avoid double free */ + memset(&info->u.luks, 0, sizeof(info->u.luks)); + + qapi_free_QCryptoBlockInfo(info); + + return spec_info; +} + BlockDriver bdrv_crypto_luks = { .format_name = "luks", .instance_size = sizeof(BlockCrypto), @@ -576,6 +623,8 @@ BlockDriver bdrv_crypto_luks = { .bdrv_co_readv = block_crypto_co_readv, .bdrv_co_writev = block_crypto_co_writev, .bdrv_getlength = block_crypto_getlength, + .bdrv_get_info = block_crypto_get_info_luks, + .bdrv_get_specific_info = block_crypto_get_specific_info_luks, }; static void block_crypto_init(void) diff --git a/block/curl.c b/block/curl.c index 5a8f8b623..426fb4d67 100644 --- a/block/curl.c +++ b/block/curl.c @@ -36,10 +36,16 @@ // #define DEBUG_VERBOSE #ifdef DEBUG_CURL -#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0) +#define DEBUG_CURL_PRINT 1 #else -#define DPRINTF(fmt, ...) do { } while (0) +#define DEBUG_CURL_PRINT 0 #endif +#define DPRINTF(fmt, ...) 
\ + do { \ + if (DEBUG_CURL_PRINT) { \ + fprintf(stderr, fmt, ## __VA_ARGS__); \ + } \ + } while (0) #if LIBCURL_VERSION_NUM >= 0x071000 /* The multi interface timer callback was introduced in 7.16.0 */ @@ -163,7 +169,7 @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, state->sock_fd = fd; s = state->s; - DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, fd); + DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, (int)fd); switch (action) { case CURL_POLL_IN: aio_set_fd_handler(s->aio_context, fd, false, diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c index 4902ca557..f2bfdcfde 100644 --- a/block/dirty-bitmap.c +++ b/block/dirty-bitmap.c @@ -326,14 +326,14 @@ void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) } void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, - int64_t cur_sector, int nr_sectors) + int64_t cur_sector, int64_t nr_sectors) { assert(bdrv_dirty_bitmap_enabled(bitmap)); hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); } void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, - int64_t cur_sector, int nr_sectors) + int64_t cur_sector, int64_t nr_sectors) { assert(bdrv_dirty_bitmap_enabled(bitmap)); hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); @@ -361,7 +361,7 @@ void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in) } void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors) + int64_t nr_sectors) { BdrvDirtyBitmap *bitmap; QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { diff --git a/block/dmg.c b/block/dmg.c index a496eb7c9..b0ed89baa 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -32,7 +32,6 @@ #ifdef CONFIG_BZIP2 #include <bzlib.h> #endif -#include <glib.h> enum { /* Limit chunk sizes to prevent unreasonable amounts of memory being used @@ -87,7 +86,7 @@ static int read_uint64(BlockDriverState *bs, int64_t offset, uint64_t *result) uint64_t buffer; int ret; - ret = bdrv_pread(bs->file->bs, offset, &buffer, 8); + ret = bdrv_pread(bs->file, offset, &buffer, 8); if (ret < 0) { return ret; } @@ -101,7 +100,7 @@ static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result) uint32_t buffer; int ret; - ret = bdrv_pread(bs->file->bs, offset, &buffer, 4); + ret = bdrv_pread(bs->file, offset, &buffer, 4); if (ret < 0) { return ret; } @@ -154,8 +153,9 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, } } -static int64_t dmg_find_koly_offset(BlockDriverState *file_bs, Error **errp) +static int64_t dmg_find_koly_offset(BdrvChild *file, Error **errp) { + BlockDriverState *file_bs = file->bs; int64_t length; int64_t offset = 0; uint8_t buffer[515]; @@ -179,7 +179,7 @@ static int64_t dmg_find_koly_offset(BlockDriverState *file_bs, Error **errp) offset = length - 511 - 512; } length = length < 515 ? 
length : 515; - ret = bdrv_pread(file_bs, offset, buffer, length); + ret = bdrv_pread(file, offset, buffer, length); if (ret < 0) { error_setg_errno(errp, -ret, "Failed while reading UDIF trailer"); return ret; @@ -356,7 +356,7 @@ static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds, offset += 4; buffer = g_realloc(buffer, count); - ret = bdrv_pread(bs->file->bs, offset, buffer, count); + ret = bdrv_pread(bs->file, offset, buffer, count); if (ret < 0) { goto fail; } @@ -393,7 +393,7 @@ static int dmg_read_plist_xml(BlockDriverState *bs, DmgHeaderState *ds, buffer = g_malloc(info_length + 1); buffer[info_length] = '\0'; - ret = bdrv_pread(bs->file->bs, info_begin, buffer, info_length); + ret = bdrv_pread(bs->file, info_begin, buffer, info_length); if (ret != info_length) { ret = -EINVAL; goto fail; @@ -439,7 +439,8 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, int64_t offset; int ret; - bs->read_only = 1; + bs->read_only = true; + s->n_chunks = 0; s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL; /* used by dmg_read_mish_block to keep track of the current I/O position */ @@ -448,7 +449,7 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, ds.max_sectors_per_chunk = 1; /* locate the UDIF trailer */ - offset = dmg_find_koly_offset(bs->file->bs, errp); + offset = dmg_find_koly_offset(bs->file, errp); if (offset < 0) { ret = offset; goto fail; @@ -546,6 +547,11 @@ fail: return ret; } +static void dmg_refresh_limits(BlockDriverState *bs, Error **errp) +{ + bs->bl.request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O */ +} + static inline int is_sector_in_chunk(BDRVDMGState* s, uint32_t chunk_num, uint64_t sector_num) { @@ -594,7 +600,7 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) case 0x80000005: { /* zlib compressed */ /* we need to buffer, because only the chunk as whole can be * inflated. */ - ret = bdrv_pread(bs->file->bs, s->offsets[chunk], + ret = bdrv_pread(bs->file, s->offsets[chunk], s->compressed_chunk, s->lengths[chunk]); if (ret != s->lengths[chunk]) { return -1; @@ -618,7 +624,7 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) case 0x80000006: /* bzip2 compressed */ /* we need to buffer, because only the chunk as whole can be * inflated. 
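 * (Illustrative aside: the driver serves whole 512-byte sectors out of
 * these buffered chunks, which is why the dmg_refresh_limits() added above
 * pins bs->bl.request_alignment to BDRV_SECTOR_SIZE; with the byte-based
 * .bdrv_co_preadv interface the default alignment would otherwise be
 * 1 byte, as the block/io.c change below shows.)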
*/ - ret = bdrv_pread(bs->file->bs, s->offsets[chunk], + ret = bdrv_pread(bs->file, s->offsets[chunk], s->compressed_chunk, s->lengths[chunk]); if (ret != s->lengths[chunk]) { return -1; @@ -643,7 +649,7 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) break; #endif /* CONFIG_BZIP2 */ case 1: /* copy */ - ret = bdrv_pread(bs->file->bs, s->offsets[chunk], + ret = bdrv_pread(bs->file, s->offsets[chunk], s->uncompressed_chunk, s->lengths[chunk]); if (ret != s->lengths[chunk]) { return -1; @@ -659,38 +665,42 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) return 0; } -static int dmg_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int coroutine_fn +dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVDMGState *s = bs->opaque; - int i; + uint64_t sector_num = offset >> BDRV_SECTOR_BITS; + int nb_sectors = bytes >> BDRV_SECTOR_BITS; + int ret, i; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + + qemu_co_mutex_lock(&s->lock); for (i = 0; i < nb_sectors; i++) { uint32_t sector_offset_in_chunk; + void *data; + if (dmg_read_chunk(bs, sector_num + i) != 0) { - return -1; + ret = -EIO; + goto fail; } /* Special case: current chunk is all zeroes. Do not perform a memcpy as * s->uncompressed_chunk may be too small to cover the large all-zeroes * section. dmg_read_chunk is called to find s->current_chunk */ if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */ - memset(buf + i * 512, 0, 512); + qemu_iovec_memset(qiov, i * 512, 0, 512); continue; } sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk]; - memcpy(buf + i * 512, - s->uncompressed_chunk + sector_offset_in_chunk * 512, 512); + data = s->uncompressed_chunk + sector_offset_in_chunk * 512; + qemu_iovec_from_buf(qiov, i * 512, data, 512); } - return 0; -} -static coroutine_fn int dmg_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVDMGState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = dmg_read(bs, sector_num, buf, nb_sectors); + ret = 0; +fail: qemu_co_mutex_unlock(&s->lock); return ret; } @@ -715,7 +725,8 @@ static BlockDriver bdrv_dmg = { .instance_size = sizeof(BDRVDMGState), .bdrv_probe = dmg_probe, .bdrv_open = dmg_open, - .bdrv_read = dmg_co_read, + .bdrv_refresh_limits = dmg_refresh_limits, + .bdrv_co_preadv = dmg_co_preadv, .bdrv_close = dmg_close, }; diff --git a/block/gluster.c b/block/gluster.c index a8aaacf64..01b479fbb 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -11,7 +11,27 @@ #include <glusterfs/api/glfs.h> #include "block/block_int.h" #include "qapi/error.h" +#include "qapi/qmp/qerror.h" #include "qemu/uri.h" +#include "qemu/error-report.h" + +#define GLUSTER_OPT_FILENAME "filename" +#define GLUSTER_OPT_VOLUME "volume" +#define GLUSTER_OPT_PATH "path" +#define GLUSTER_OPT_TYPE "type" +#define GLUSTER_OPT_SERVER_PATTERN "server." 
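/* Illustrative aside (not part of the patch): GLUSTER_OPT_SERVER_PATTERN
 * is the key prefix under which qemu_gluster_parse_json() below expects
 * the flattened 'server' array, i.e. command-line keys of the form
 *
 *     file.server.0.type=tcp
 *     file.server.0.host=1.2.3.4
 *     file.server.0.port=24007
 *     file.server.1.type=unix
 *     file.server.1.socket=/var/run/glusterd.socket
 *
 * (host, port and socket values are examples only); qdict_array_entries()
 * counts these indexed entries to learn how many servers were given.
 */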
+#define GLUSTER_OPT_HOST "host" +#define GLUSTER_OPT_PORT "port" +#define GLUSTER_OPT_TO "to" +#define GLUSTER_OPT_IPV4 "ipv4" +#define GLUSTER_OPT_IPV6 "ipv6" +#define GLUSTER_OPT_SOCKET "socket" +#define GLUSTER_OPT_DEBUG "debug" +#define GLUSTER_DEFAULT_PORT 24007 +#define GLUSTER_DEBUG_DEFAULT 4 +#define GLUSTER_DEBUG_MAX 9 + +#define GERR_INDEX_HINT "hint: check in 'server' array index '%d'\n" typedef struct GlusterAIOCB { int64_t size; @@ -24,28 +44,145 @@ typedef struct GlusterAIOCB { typedef struct BDRVGlusterState { struct glfs *glfs; struct glfs_fd *fd; + bool supports_seek_data; + int debug_level; } BDRVGlusterState; -typedef struct GlusterConf { - char *server; - int port; - char *volname; - char *image; - char *transport; -} GlusterConf; +typedef struct BDRVGlusterReopenState { + struct glfs *glfs; + struct glfs_fd *fd; +} BDRVGlusterReopenState; -static void qemu_gluster_gconf_free(GlusterConf *gconf) -{ - if (gconf) { - g_free(gconf->server); - g_free(gconf->volname); - g_free(gconf->image); - g_free(gconf->transport); - g_free(gconf); + +static QemuOptsList qemu_gluster_create_opts = { .name = "qemu-gluster-create-opts", .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head), .desc = { { .name = BLOCK_OPT_SIZE, .type = QEMU_OPT_SIZE, .help = "Virtual disk size" }, { .name = BLOCK_OPT_PREALLOC, .type = QEMU_OPT_STRING, .help = "Preallocation mode (allowed values: off, full)" }, { .name = GLUSTER_OPT_DEBUG, .type = QEMU_OPT_NUMBER, .help = "Gluster log level, valid range is 0-9", }, { /* end of list */ } } -} +}; -static int parse_volume_options(GlusterConf *gconf, char *path) +static QemuOptsList runtime_opts = { .name = "gluster", .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), .desc = { { .name = GLUSTER_OPT_FILENAME, .type = QEMU_OPT_STRING, .help = "URL to the gluster image", }, { .name = GLUSTER_OPT_DEBUG, .type = QEMU_OPT_NUMBER, .help = "Gluster log level, valid range is 0-9", }, { /* end of list */ } }, }; +static QemuOptsList runtime_json_opts = { .name = "gluster_json", .head = QTAILQ_HEAD_INITIALIZER(runtime_json_opts.head), .desc = { { .name = GLUSTER_OPT_VOLUME, .type = QEMU_OPT_STRING, .help = "name of gluster volume where VM image resides", }, { .name = GLUSTER_OPT_PATH, .type = QEMU_OPT_STRING, .help = "absolute path to image file in gluster volume", }, { .name = GLUSTER_OPT_DEBUG, .type = QEMU_OPT_NUMBER, .help = "Gluster log level, valid range is 0-9", }, { /* end of list */ } }, }; +static QemuOptsList runtime_type_opts = { .name = "gluster_type", .head = QTAILQ_HEAD_INITIALIZER(runtime_type_opts.head), .desc = { { .name = GLUSTER_OPT_TYPE, .type = QEMU_OPT_STRING, .help = "tcp|unix", }, { /* end of list */ } }, }; +static QemuOptsList runtime_unix_opts = { .name = "gluster_unix", .head = QTAILQ_HEAD_INITIALIZER(runtime_unix_opts.head), .desc = { { .name = GLUSTER_OPT_SOCKET, .type = QEMU_OPT_STRING, .help = "socket file path", }, { /* end of list */ } }, }; +static QemuOptsList runtime_tcp_opts = { .name = "gluster_tcp", .head = QTAILQ_HEAD_INITIALIZER(runtime_tcp_opts.head), .desc = { { .name = GLUSTER_OPT_TYPE, .type = QEMU_OPT_STRING, .help = "tcp|unix", }, { .name = GLUSTER_OPT_HOST, .type = QEMU_OPT_STRING, .help = "host address (hostname/ipv4/ipv6 addresses)", }, { .name = GLUSTER_OPT_PORT, .type = QEMU_OPT_NUMBER, .help = "port number on which 
glusterd is listening (default 24007)", + }, + { + .name = "to", + .type = QEMU_OPT_NUMBER, + .help = "max port number, not supported by gluster", + }, + { + .name = "ipv4", + .type = QEMU_OPT_BOOL, + .help = "ipv4 bool value, not supported by gluster", + }, + { + .name = "ipv6", + .type = QEMU_OPT_BOOL, + .help = "ipv6 bool value, not supported by gluster", + }, + { /* end of list */ } + }, +}; + +static int parse_volume_options(BlockdevOptionsGluster *gconf, char *path) { char *p, *q; @@ -59,31 +196,29 @@ static int parse_volume_options(GlusterConf *gconf, char *path) if (*p == '\0') { return -EINVAL; } - gconf->volname = g_strndup(q, p - q); + gconf->volume = g_strndup(q, p - q); - /* image */ + /* path */ p += strspn(p, "/"); if (*p == '\0') { return -EINVAL; } - gconf->image = g_strdup(p); + gconf->path = g_strdup(p); return 0; } /* - * file=gluster[+transport]://[server[:port]]/volname/image[?socket=...] + * file=gluster[+transport]://[host[:port]]/volume/path[?socket=...] * * 'gluster' is the protocol. * * 'transport' specifies the transport type used to connect to gluster * management daemon (glusterd). Valid transport types are - * tcp, unix and rdma. If a transport type isn't specified, then tcp - * type is assumed. + * tcp or unix. If a transport type isn't specified, then tcp type is assumed. * - * 'server' specifies the server where the volume file specification for - * the given volume resides. This can be either hostname, ipv4 address - * or ipv6 address. ipv6 address needs to be within square brackets [ ]. - * If transport type is 'unix', then 'server' field should not be specified. + * 'host' specifies the host where the volume file specification for + * the given volume resides. This can be either hostname or ipv4 address. + * If transport type is 'unix', then 'host' field should not be specified. * The 'socket' field needs to be populated with the path to unix domain * socket. * @@ -92,23 +227,22 @@ static int parse_volume_options(GlusterConf *gconf, char *path) * default port. If the transport type is unix, then 'port' should not be * specified. * - * 'volname' is the name of the gluster volume which contains the VM image. + * 'volume' is the name of the gluster volume which contains the VM image. * - * 'image' is the path to the actual VM image that resides on gluster volume. + * 'path' is the path to the actual VM image that resides on gluster volume. 
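 *
 * Note that 'gluster+rdma' is still parsed for backward compatibility, but
 * it falls back to tcp with a warning (see the comment above
 * bdrv_gluster_rdma at the end of this file).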
* * Examples: * * file=gluster://1.2.3.4/testvol/a.img * file=gluster+tcp://1.2.3.4/testvol/a.img * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img - * file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img - * file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img - * file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img + * file=gluster+tcp://host.domain.com:24007/testvol/dir/a.img * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket - * file=gluster+rdma://1.2.3.4:24007/testvol/a.img */ -static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename) +static int qemu_gluster_parse_uri(BlockdevOptionsGluster *gconf, + const char *filename) { + GlusterServer *gsconf; URI *uri; QueryParams *qp = NULL; bool is_unix = false; @@ -119,16 +253,21 @@ static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename) return -EINVAL; } + gconf->server = g_new0(GlusterServerList, 1); + gconf->server->value = gsconf = g_new0(GlusterServer, 1); + /* transport */ if (!uri->scheme || !strcmp(uri->scheme, "gluster")) { - gconf->transport = g_strdup("tcp"); + gsconf->type = GLUSTER_TRANSPORT_TCP; } else if (!strcmp(uri->scheme, "gluster+tcp")) { - gconf->transport = g_strdup("tcp"); + gsconf->type = GLUSTER_TRANSPORT_TCP; } else if (!strcmp(uri->scheme, "gluster+unix")) { - gconf->transport = g_strdup("unix"); + gsconf->type = GLUSTER_TRANSPORT_UNIX; is_unix = true; } else if (!strcmp(uri->scheme, "gluster+rdma")) { - gconf->transport = g_strdup("rdma"); + gsconf->type = GLUSTER_TRANSPORT_TCP; + error_report("Warning: rdma feature is not supported, falling " + "back to tcp"); } else { ret = -EINVAL; goto out; @@ -154,10 +293,14 @@ static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename) ret = -EINVAL; goto out; } - gconf->server = g_strdup(qp->p[0].value); + gsconf->u.q_unix.path = g_strdup(qp->p[0].value); } else { - gconf->server = g_strdup(uri->server ? uri->server : "localhost"); - gconf->port = uri->port; + gsconf->u.tcp.host = g_strdup(uri->server ? 
uri->server : "localhost"); + if (uri->port) { + gsconf->u.tcp.port = g_strdup_printf("%d", uri->port); + } else { + gsconf->u.tcp.port = g_strdup_printf("%d", GLUSTER_DEFAULT_PORT); + } } out: @@ -168,52 +311,62 @@ out: return ret; } -static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename, - Error **errp) +static struct glfs *qemu_gluster_glfs_init(BlockdevOptionsGluster *gconf, + Error **errp) { - struct glfs *glfs = NULL; + struct glfs *glfs; int ret; int old_errno; + GlusterServerList *server; - ret = qemu_gluster_parseuri(gconf, filename); - if (ret < 0) { - error_setg(errp, "Usage: file=gluster[+transport]://[server[:port]]/" - "volname/image[?socket=...]"); - errno = -ret; - goto out; - } - - glfs = glfs_new(gconf->volname); + glfs = glfs_new(gconf->volume); if (!glfs) { goto out; } - ret = glfs_set_volfile_server(glfs, gconf->transport, gconf->server, - gconf->port); - if (ret < 0) { - goto out; + for (server = gconf->server; server; server = server->next) { + if (server->value->type == GLUSTER_TRANSPORT_UNIX) { + ret = glfs_set_volfile_server(glfs, + GlusterTransport_lookup[server->value->type], + server->value->u.q_unix.path, 0); + } else { + ret = glfs_set_volfile_server(glfs, + GlusterTransport_lookup[server->value->type], + server->value->u.tcp.host, + atoi(server->value->u.tcp.port)); + } + + if (ret < 0) { + goto out; + } } - /* - * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when - * GlusterFS makes GF_LOG_* macros available to libgfapi users. - */ - ret = glfs_set_logging(glfs, "-", 4); + ret = glfs_set_logging(glfs, "-", gconf->debug_level); if (ret < 0) { goto out; } ret = glfs_init(glfs); if (ret) { - error_setg_errno(errp, errno, - "Gluster connection failed for server=%s port=%d " - "volume=%s image=%s transport=%s", gconf->server, - gconf->port, gconf->volname, gconf->image, - gconf->transport); + error_setg(errp, "Gluster connection for volume %s, path %s failed" + " to connect", gconf->volume, gconf->path); + for (server = gconf->server; server; server = server->next) { + if (server->value->type == GLUSTER_TRANSPORT_UNIX) { + error_append_hint(errp, "hint: failed on socket %s ", + server->value->u.q_unix.path); + } else { + error_append_hint(errp, "hint: failed on host %s and port %s ", + server->value->u.tcp.host, + server->value->u.tcp.port); + } + } + + error_append_hint(errp, "Please refer to gluster logs for more info\n"); /* glfs_init sometimes doesn't set errno although docs suggest that */ - if (errno == 0) + if (errno == 0) { errno = EINVAL; + } goto out; } @@ -228,13 +381,233 @@ out: return NULL; } +static int qapi_enum_parse(const char *opt) +{ + int i; + + if (!opt) { + return GLUSTER_TRANSPORT__MAX; + } + + for (i = 0; i < GLUSTER_TRANSPORT__MAX; i++) { + if (!strcmp(opt, GlusterTransport_lookup[i])) { + return i; + } + } + + return i; +} + +/* + * Convert the json formatted command line into qapi. 
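 * A sketch of such a command line (volume, path and server values are
 * illustrative only):
 *
 *     -drive driver=qcow2,file.driver=gluster,file.volume=testvol,
 *            file.path=/path/a.qcow2,file.server.0.type=tcp,
 *            file.server.0.host=1.2.3.4,file.server.0.port=24007
 *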
+*/ +static int qemu_gluster_parse_json(BlockdevOptionsGluster *gconf, + QDict *options, Error **errp) +{ + QemuOpts *opts; + GlusterServer *gsconf; + GlusterServerList *curr = NULL; + QDict *backing_options = NULL; + Error *local_err = NULL; + char *str = NULL; + const char *ptr; + size_t num_servers; + int i; + + /* create opts info from runtime_json_opts list */ + opts = qemu_opts_create(&runtime_json_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + goto out; + } + + num_servers = qdict_array_entries(options, GLUSTER_OPT_SERVER_PATTERN); + if (num_servers < 1) { + error_setg(&local_err, QERR_MISSING_PARAMETER, "server"); + goto out; + } + + ptr = qemu_opt_get(opts, GLUSTER_OPT_VOLUME); + if (!ptr) { + error_setg(&local_err, QERR_MISSING_PARAMETER, GLUSTER_OPT_VOLUME); + goto out; + } + gconf->volume = g_strdup(ptr); + + ptr = qemu_opt_get(opts, GLUSTER_OPT_PATH); + if (!ptr) { + error_setg(&local_err, QERR_MISSING_PARAMETER, GLUSTER_OPT_PATH); + goto out; + } + gconf->path = g_strdup(ptr); + qemu_opts_del(opts); + + for (i = 0; i < num_servers; i++) { + str = g_strdup_printf(GLUSTER_OPT_SERVER_PATTERN"%d.", i); + qdict_extract_subqdict(options, &backing_options, str); + + /* create opts info from runtime_type_opts list */ + opts = qemu_opts_create(&runtime_type_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, backing_options, &local_err); + if (local_err) { + goto out; + } + + ptr = qemu_opt_get(opts, GLUSTER_OPT_TYPE); + gsconf = g_new0(GlusterServer, 1); + gsconf->type = qapi_enum_parse(ptr); + if (!ptr) { + error_setg(&local_err, QERR_MISSING_PARAMETER, GLUSTER_OPT_TYPE); + error_append_hint(&local_err, GERR_INDEX_HINT, i); + goto out; + + } + if (gsconf->type == GLUSTER_TRANSPORT__MAX) { + error_setg(&local_err, QERR_INVALID_PARAMETER_VALUE, + GLUSTER_OPT_TYPE, "tcp or unix"); + error_append_hint(&local_err, GERR_INDEX_HINT, i); + goto out; + } + qemu_opts_del(opts); + + if (gsconf->type == GLUSTER_TRANSPORT_TCP) { + /* create opts info from runtime_tcp_opts list */ + opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, backing_options, &local_err); + if (local_err) { + goto out; + } + + ptr = qemu_opt_get(opts, GLUSTER_OPT_HOST); + if (!ptr) { + error_setg(&local_err, QERR_MISSING_PARAMETER, + GLUSTER_OPT_HOST); + error_append_hint(&local_err, GERR_INDEX_HINT, i); + goto out; + } + gsconf->u.tcp.host = g_strdup(ptr); + ptr = qemu_opt_get(opts, GLUSTER_OPT_PORT); + if (!ptr) { + error_setg(&local_err, QERR_MISSING_PARAMETER, + GLUSTER_OPT_PORT); + error_append_hint(&local_err, GERR_INDEX_HINT, i); + goto out; + } + gsconf->u.tcp.port = g_strdup(ptr); + + /* defend for unsupported fields in InetSocketAddress, + * i.e. 
@ipv4, @ipv6 and @to + */ + ptr = qemu_opt_get(opts, GLUSTER_OPT_TO); + if (ptr) { + gsconf->u.tcp.has_to = true; + } + ptr = qemu_opt_get(opts, GLUSTER_OPT_IPV4); + if (ptr) { + gsconf->u.tcp.has_ipv4 = true; + } + ptr = qemu_opt_get(opts, GLUSTER_OPT_IPV6); + if (ptr) { + gsconf->u.tcp.has_ipv6 = true; + } + if (gsconf->u.tcp.has_to) { + error_setg(&local_err, "Parameter 'to' not supported"); + goto out; + } + if (gsconf->u.tcp.has_ipv4 || gsconf->u.tcp.has_ipv6) { + error_setg(&local_err, "Parameters 'ipv4/ipv6' not supported"); + goto out; + } + qemu_opts_del(opts); + } else { + /* create opts info from runtime_unix_opts list */ + opts = qemu_opts_create(&runtime_unix_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, backing_options, &local_err); + if (local_err) { + goto out; + } + + ptr = qemu_opt_get(opts, GLUSTER_OPT_SOCKET); + if (!ptr) { + error_setg(&local_err, QERR_MISSING_PARAMETER, + GLUSTER_OPT_SOCKET); + error_append_hint(&local_err, GERR_INDEX_HINT, i); + goto out; + } + gsconf->u.q_unix.path = g_strdup(ptr); + qemu_opts_del(opts); + } + + if (gconf->server == NULL) { + gconf->server = g_new0(GlusterServerList, 1); + gconf->server->value = gsconf; + curr = gconf->server; + } else { + curr->next = g_new0(GlusterServerList, 1); + curr->next->value = gsconf; + curr = curr->next; + } + + qdict_del(backing_options, str); + g_free(str); + str = NULL; + } + + return 0; + +out: + error_propagate(errp, local_err); + qemu_opts_del(opts); + if (str) { + qdict_del(backing_options, str); + g_free(str); + } + errno = EINVAL; + return -errno; +} + +static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf, + const char *filename, + QDict *options, Error **errp) +{ + int ret; + if (filename) { + ret = qemu_gluster_parse_uri(gconf, filename); + if (ret < 0) { + error_setg(errp, "invalid URI"); + error_append_hint(errp, "Usage: file=gluster[+transport]://" + "[host[:port]]/volume/path[?socket=...]\n"); + errno = -ret; + return NULL; + } + } else { + ret = qemu_gluster_parse_json(gconf, options, errp); + if (ret < 0) { + error_append_hint(errp, "Usage: " + "-drive driver=qcow2,file.driver=gluster," + "file.volume=testvol,file.path=/path/a.qcow2" + "[,file.debug=9],file.server.0.type=tcp," + "file.server.0.host=1.2.3.4," + "file.server.0.port=24007," + "file.server.1.type=unix," + "file.server.1.socket=/var/run/glusterd.socket ..." + "\n"); + errno = -ret; + return NULL; + } + + } + + return qemu_gluster_glfs_init(gconf, errp); +} + static void qemu_gluster_complete_aio(void *opaque) { GlusterAIOCB *acb = (GlusterAIOCB *)opaque; qemu_bh_delete(acb->bh); acb->bh = NULL; - qemu_coroutine_enter(acb->coroutine, NULL); + qemu_coroutine_enter(acb->coroutine); } /* @@ -256,20 +629,6 @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) qemu_bh_schedule(acb->bh); } -/* TODO Convert to fine grained options */ -static QemuOptsList runtime_opts = { - .name = "gluster", - .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), - .desc = { - { - .name = "filename", - .type = QEMU_OPT_STRING, - .help = "URL to the gluster image", - }, - { /* end of list */ } - }, -}; - static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags) { assert(open_flags != NULL); @@ -287,13 +646,35 @@ } } +/* + * Do SEEK_DATA/HOLE to detect if it is functional. Older broken versions of + * gfapi incorrectly return the current offset when SEEK_DATA/HOLE is used. 
+ * - Corrected versions return -1 and set errno to EINVAL. + * - Versions that support SEEK_DATA/HOLE correctly, will return -1 and set + * errno to ENXIO when SEEK_DATA is called with a position of EOF. + */ +static bool qemu_gluster_test_seek(struct glfs_fd *fd) +{ + off_t ret, eof; + + eof = glfs_lseek(fd, 0, SEEK_END); + if (eof < 0) { + /* this should never occur */ + return false; + } + + /* this should always fail with ENXIO if SEEK_DATA is supported */ + ret = glfs_lseek(fd, eof, SEEK_DATA); + return (ret < 0) && (errno == ENXIO); +} + static int qemu_gluster_open(BlockDriverState *bs, QDict *options, int bdrv_flags, Error **errp) { BDRVGlusterState *s = bs->opaque; int open_flags = 0; int ret = 0; - GlusterConf *gconf = g_new0(GlusterConf, 1); + BlockdevOptionsGluster *gconf = NULL; QemuOpts *opts; Error *local_err = NULL; const char *filename; @@ -306,9 +687,20 @@ static int qemu_gluster_open(BlockDriverState *bs, QDict *options, goto out; } - filename = qemu_opt_get(opts, "filename"); + filename = qemu_opt_get(opts, GLUSTER_OPT_FILENAME); - s->glfs = qemu_gluster_init(gconf, filename, errp); + s->debug_level = qemu_opt_get_number(opts, GLUSTER_OPT_DEBUG, + GLUSTER_DEBUG_DEFAULT); + if (s->debug_level < 0) { + s->debug_level = 0; + } else if (s->debug_level > GLUSTER_DEBUG_MAX) { + s->debug_level = GLUSTER_DEBUG_MAX; + } + + gconf = g_new0(BlockdevOptionsGluster, 1); + gconf->debug_level = s->debug_level; + gconf->has_debug_level = true; + s->glfs = qemu_gluster_init(gconf, filename, options, errp); if (!s->glfs) { ret = -errno; goto out; @@ -333,14 +725,16 @@ static int qemu_gluster_open(BlockDriverState *bs, QDict *options, qemu_gluster_parse_flags(bdrv_flags, &open_flags); - s->fd = glfs_open(s->glfs, gconf->image, open_flags); + s->fd = glfs_open(s->glfs, gconf->path, open_flags); if (!s->fd) { ret = -errno; } + s->supports_seek_data = qemu_gluster_test_seek(s->fd); + out: qemu_opts_del(opts); - qemu_gluster_gconf_free(gconf); + qapi_free_BlockdevOptionsGluster(gconf); if (!ret) { return ret; } @@ -353,31 +747,29 @@ out: return ret; } -typedef struct BDRVGlusterReopenState { - struct glfs *glfs; - struct glfs_fd *fd; -} BDRVGlusterReopenState; - - static int qemu_gluster_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue, Error **errp) { int ret = 0; + BDRVGlusterState *s; BDRVGlusterReopenState *reop_s; - GlusterConf *gconf = NULL; + BlockdevOptionsGluster *gconf; int open_flags = 0; assert(state != NULL); assert(state->bs != NULL); + s = state->bs->opaque; + state->opaque = g_new0(BDRVGlusterReopenState, 1); reop_s = state->opaque; qemu_gluster_parse_flags(state->flags, &open_flags); - gconf = g_new0(GlusterConf, 1); - - reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, errp); + gconf = g_new0(BlockdevOptionsGluster, 1); + gconf->debug_level = s->debug_level; + gconf->has_debug_level = true; + reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, NULL, errp); if (reop_s->glfs == NULL) { ret = -errno; goto exit; @@ -393,7 +785,7 @@ static int qemu_gluster_reopen_prepare(BDRVReopenState *state, } #endif - reop_s->fd = glfs_open(reop_s->glfs, gconf->image, open_flags); + reop_s->fd = glfs_open(reop_s->glfs, gconf->path, open_flags); if (reop_s->fd == NULL) { /* reops->glfs will be cleaned up in _abort */ ret = -errno; @@ -402,7 +794,7 @@ static int qemu_gluster_reopen_prepare(BDRVReopenState *state, exit: /* state->opaque will be freed in either the _abort or _commit */ - qemu_gluster_gconf_free(gconf); + qapi_free_BlockdevOptionsGluster(gconf); 
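/* Illustrative aside (not part of the patch): qemu_gluster_test_seek()
 * above is a one-time capability probe, in rough outline:
 *
 *     eof = glfs_lseek(fd, 0, SEEK_END);
 *     ret = glfs_lseek(fd, eof, SEEK_DATA);
 *     supported = (ret < 0 && errno == ENXIO);
 *
 * The result is cached in s->supports_seek_data at open time and later
 * lets find_allocation() return -ENOTSUP instead of trusting a broken
 * SEEK_DATA implementation.
 */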
return ret; } @@ -454,14 +846,14 @@ static void qemu_gluster_reopen_abort(BDRVReopenState *state) } #ifdef CONFIG_GLUSTERFS_ZEROFILL -static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, + int size, + BdrvRequestFlags flags) { int ret; GlusterAIOCB acb; BDRVGlusterState *s = bs->opaque; - off_t size = nb_sectors * BDRV_SECTOR_SIZE; - off_t offset = sector_num * BDRV_SECTOR_SIZE; acb.size = size; acb.ret = 0; @@ -483,7 +875,7 @@ static inline bool gluster_supports_zerofill(void) } static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, - int64_t size) + int64_t size) { return glfs_zerofill(fd, offset, size); } @@ -495,7 +887,7 @@ static inline bool gluster_supports_zerofill(void) } static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, - int64_t size) + int64_t size) { return 0; } @@ -504,15 +896,25 @@ static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, static int qemu_gluster_create(const char *filename, QemuOpts *opts, Error **errp) { + BlockdevOptionsGluster *gconf; struct glfs *glfs; struct glfs_fd *fd; int ret = 0; int prealloc = 0; int64_t total_size = 0; char *tmp = NULL; - GlusterConf *gconf = g_new0(GlusterConf, 1); - glfs = qemu_gluster_init(gconf, filename, errp); + gconf = g_new0(BlockdevOptionsGluster, 1); + gconf->debug_level = qemu_opt_get_number_del(opts, GLUSTER_OPT_DEBUG, + GLUSTER_DEBUG_DEFAULT); + if (gconf->debug_level < 0) { + gconf->debug_level = 0; + } else if (gconf->debug_level > GLUSTER_DEBUG_MAX) { + gconf->debug_level = GLUSTER_DEBUG_MAX; + } + gconf->has_debug_level = true; + + glfs = qemu_gluster_init(gconf, filename, NULL, errp); if (!glfs) { ret = -errno; goto out; @@ -524,19 +926,17 @@ static int qemu_gluster_create(const char *filename, tmp = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); if (!tmp || !strcmp(tmp, "off")) { prealloc = 0; - } else if (!strcmp(tmp, "full") && - gluster_supports_zerofill()) { + } else if (!strcmp(tmp, "full") && gluster_supports_zerofill()) { prealloc = 1; } else { error_setg(errp, "Invalid preallocation mode: '%s'" - " or GlusterFS doesn't support zerofill API", - tmp); + " or GlusterFS doesn't support zerofill API", tmp); ret = -EINVAL; goto out; } - fd = glfs_creat(glfs, gconf->image, - O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR); + fd = glfs_creat(glfs, gconf->path, + O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR); if (!fd) { ret = -errno; } else { @@ -554,7 +954,7 @@ static int qemu_gluster_create(const char *filename, } out: g_free(tmp); - qemu_gluster_gconf_free(gconf); + qapi_free_BlockdevOptionsGluster(gconf); if (glfs) { glfs_fini(glfs); } @@ -562,7 +962,8 @@ out: } static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write) + int64_t sector_num, int nb_sectors, + QEMUIOVector *qiov, int write) { int ret; GlusterAIOCB acb; @@ -577,10 +978,10 @@ static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs, if (write) { ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0, - gluster_finish_aiocb, &acb); + gluster_finish_aiocb, &acb); } else { ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0, - gluster_finish_aiocb, &acb); + gluster_finish_aiocb, &acb); } if (ret < 0) { @@ -605,13 +1006,17 @@ static int qemu_gluster_truncate(BlockDriverState *bs, 
int64_t offset) } static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) + int64_t sector_num, + int nb_sectors, + QEMUIOVector *qiov) { return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0); } static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) + int64_t sector_num, + int nb_sectors, + QEMUIOVector *qiov) { return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1); } @@ -672,14 +1077,12 @@ error: } #ifdef CONFIG_GLUSTERFS_DISCARD -static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) +static coroutine_fn int qemu_gluster_co_pdiscard(BlockDriverState *bs, + int64_t offset, int size) { int ret; GlusterAIOCB acb; BDRVGlusterState *s = bs->opaque; - size_t size = nb_sectors * BDRV_SECTOR_SIZE; - off_t offset = sector_num * BDRV_SECTOR_SIZE; acb.size = 0; acb.ret = 0; @@ -729,29 +1132,164 @@ static int qemu_gluster_has_zero_init(BlockDriverState *bs) return 0; } -static QemuOptsList qemu_gluster_create_opts = { - .name = "qemu-gluster-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_PREALLOC, - .type = QEMU_OPT_STRING, - .help = "Preallocation mode (allowed values: off, full)" - }, - { /* end of list */ } +/* + * Find allocation range in @bs around offset @start. + * May change underlying file descriptor's file offset. + * If @start is not in a hole, store @start in @data, and the + * beginning of the next hole in @hole, and return 0. + * If @start is in a non-trailing hole, store @start in @hole and the + * beginning of the next non-hole in @data, and return 0. + * If @start is in a trailing hole or beyond EOF, return -ENXIO. + * If we can't find out, return a negative errno other than -ENXIO. + * + * (Shamefully copied from raw-posix.c, only minuscule adaptations.) + */ +static int find_allocation(BlockDriverState *bs, off_t start, + off_t *data, off_t *hole) +{ + BDRVGlusterState *s = bs->opaque; + off_t offs; + + if (!s->supports_seek_data) { + return -ENOTSUP; } -}; + + /* + * SEEK_DATA cases: + * D1. offs == start: start is in data + * D2. offs > start: start is in a hole, next data at offs + * D3. offs < 0, errno = ENXIO: either start is in a trailing hole + * or start is beyond EOF + * If the latter happens, the file has been truncated behind + * our back since we opened it. All bets are off then. + * Treating like a trailing hole is simplest. + * D4. offs < 0, errno != ENXIO: we learned nothing + */ + offs = glfs_lseek(s->fd, start, SEEK_DATA); + if (offs < 0) { + return -errno; /* D3 or D4 */ + } + assert(offs >= start); + + if (offs > start) { + /* D2: in hole, next data at offs */ + *hole = start; + *data = offs; + return 0; + } + + /* D1: in data, end not yet known */ + + /* + * SEEK_HOLE cases: + * H1. offs == start: start is in a hole + * If this happens here, a hole has been dug behind our back + * since the previous lseek(). + * H2. offs > start: either start is in data, next hole at offs, + * or start is in trailing hole, EOF at offs + * Linux treats trailing holes like any other hole: offs == + * start. Solaris seeks to EOF instead: offs > start (blech). + * If that happens here, a hole has been dug behind our back + * since the previous lseek(). + * H3. 
offs < 0, errno = ENXIO: start is beyond EOF + * If this happens, the file has been truncated behind our + * back since we opened it. Treat it like a trailing hole. + * H4. offs < 0, errno != ENXIO: we learned nothing + * Pretend we know nothing at all, i.e. "forget" about D1. + */ + offs = glfs_lseek(s->fd, start, SEEK_HOLE); + if (offs < 0) { + return -errno; /* D1 and (H3 or H4) */ + } + assert(offs >= start); + + if (offs > start) { + /* + * D1 and H2: either in data, next hole at offs, or it was in + * data but is now in a trailing hole. In the latter case, + * all bets are off. Treating it as if it there was data all + * the way to EOF is safe, so simply do that. + */ + *data = start; + *hole = offs; + return 0; + } + + /* D1 and H1 */ + return -EBUSY; +} + +/* + * Returns the allocation status of the specified sectors. + * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. + * + * (Based on raw_co_get_block_status() from raw-posix.c.) + */ +static int64_t coroutine_fn qemu_gluster_co_get_block_status( + BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum, + BlockDriverState **file) +{ + BDRVGlusterState *s = bs->opaque; + off_t start, data = 0, hole = 0; + int64_t total_size; + int ret = -EINVAL; + + if (!s->fd) { + return ret; + } + + start = sector_num * BDRV_SECTOR_SIZE; + total_size = bdrv_getlength(bs); + if (total_size < 0) { + return total_size; + } else if (start >= total_size) { + *pnum = 0; + return 0; + } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) { + nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE); + } + + ret = find_allocation(bs, start, &data, &hole); + if (ret == -ENXIO) { + /* Trailing hole */ + *pnum = nb_sectors; + ret = BDRV_BLOCK_ZERO; + } else if (ret < 0) { + /* No info available, so pretend there are no holes */ + *pnum = nb_sectors; + ret = BDRV_BLOCK_DATA; + } else if (data == start) { + /* On a data extent, compute sectors to the end of the extent, + * possibly including a partial sector at EOF. */ + *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE)); + ret = BDRV_BLOCK_DATA; + } else { + /* On a hole, compute sectors to the beginning of the next extent. 
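 * For example (illustrative numbers): with start == hole at byte 0 and
 * the next data extent at byte 4096, *pnum becomes MIN(nb_sectors, 8)
 * sectors, reported as BDRV_BLOCK_ZERO.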
*/ + assert(hole == start); + *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); + ret = BDRV_BLOCK_ZERO; + } + + *file = bs; + + return ret | BDRV_BLOCK_OFFSET_VALID | start; +} + static BlockDriver bdrv_gluster = { .format_name = "gluster", .protocol_name = "gluster", .instance_size = sizeof(BDRVGlusterState), - .bdrv_needs_filename = true, + .bdrv_needs_filename = false, .bdrv_file_open = qemu_gluster_open, .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, .bdrv_reopen_commit = qemu_gluster_reopen_commit, @@ -766,11 +1304,12 @@ static BlockDriver bdrv_gluster = { .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_co_discard = qemu_gluster_co_discard, + .bdrv_co_pdiscard = qemu_gluster_co_pdiscard, #endif #ifdef CONFIG_GLUSTERFS_ZEROFILL - .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, + .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, #endif + .bdrv_co_get_block_status = qemu_gluster_co_get_block_status, .create_opts = &qemu_gluster_create_opts, }; @@ -778,7 +1317,7 @@ static BlockDriver bdrv_gluster_tcp = { .format_name = "gluster", .protocol_name = "gluster+tcp", .instance_size = sizeof(BDRVGlusterState), - .bdrv_needs_filename = true, + .bdrv_needs_filename = false, .bdrv_file_open = qemu_gluster_open, .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, .bdrv_reopen_commit = qemu_gluster_reopen_commit, @@ -793,11 +1332,12 @@ static BlockDriver bdrv_gluster_tcp = { .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_co_discard = qemu_gluster_co_discard, + .bdrv_co_pdiscard = qemu_gluster_co_pdiscard, #endif #ifdef CONFIG_GLUSTERFS_ZEROFILL - .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, + .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, #endif + .bdrv_co_get_block_status = qemu_gluster_co_get_block_status, .create_opts = &qemu_gluster_create_opts, }; @@ -820,14 +1360,21 @@ static BlockDriver bdrv_gluster_unix = { .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_co_discard = qemu_gluster_co_discard, + .bdrv_co_pdiscard = qemu_gluster_co_pdiscard, #endif #ifdef CONFIG_GLUSTERFS_ZEROFILL - .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, + .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, #endif + .bdrv_co_get_block_status = qemu_gluster_co_get_block_status, .create_opts = &qemu_gluster_create_opts, }; +/* rdma is deprecated (actually never supported for volfile fetch). + * Let's maintain it for the protocol compatibility, to make sure things + * won't break immediately. For now, gluster+rdma will fall back to gluster+tcp + * protocol with a warning. 
+ * TODO: remove gluster+rdma interface support + */ static BlockDriver bdrv_gluster_rdma = { .format_name = "gluster", .protocol_name = "gluster+rdma", @@ -847,11 +1394,12 @@ static BlockDriver bdrv_gluster_rdma = { .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_co_discard = qemu_gluster_co_discard, + .bdrv_co_pdiscard = qemu_gluster_co_pdiscard, #endif #ifdef CONFIG_GLUSTERFS_ZEROFILL - .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, + .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, #endif + .bdrv_co_get_block_status = qemu_gluster_co_get_block_status, .create_opts = &qemu_gluster_create_opts, }; diff --git a/block/io.c b/block/io.c index a7dbf85b1..420944d80 100644 --- a/block/io.c +++ b/block/io.c @@ -27,118 +27,54 @@ #include "sysemu/block-backend.h" #include "block/blockjob.h" #include "block/block_int.h" -#include "block/throttle-groups.h" #include "qemu/cutils.h" #include "qapi/error.h" #include "qemu/error-report.h" #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write); +static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child, + int64_t offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write); static void coroutine_fn bdrv_co_do_rw(void *opaque); -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); +static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, BdrvRequestFlags flags); -/* throttling disk I/O limits */ -void bdrv_set_io_limits(BlockDriverState *bs, - ThrottleConfig *cfg) +static void bdrv_parent_drained_begin(BlockDriverState *bs) { - int i; - - throttle_group_config(bs, cfg); - - for (i = 0; i < 2; i++) { - qemu_co_enter_next(&bs->throttled_reqs[i]); - } -} - -/* this function drain all the throttled IOs */ -static bool bdrv_start_throttled_reqs(BlockDriverState *bs) -{ - bool drained = false; - bool enabled = bs->io_limits_enabled; - int i; - - bs->io_limits_enabled = false; + BdrvChild *c; - for (i = 0; i < 2; i++) { - while (qemu_co_enter_next(&bs->throttled_reqs[i])) { - drained = true; + QLIST_FOREACH(c, &bs->parents, next_parent) { + if (c->role->drained_begin) { + c->role->drained_begin(c); } } - - bs->io_limits_enabled = enabled; - - return drained; -} - -void bdrv_io_limits_disable(BlockDriverState *bs) -{ - bs->io_limits_enabled = false; - bdrv_start_throttled_reqs(bs); - throttle_group_unregister_bs(bs); -} - -/* should be called before bdrv_set_io_limits if a limit is set */ -void bdrv_io_limits_enable(BlockDriverState *bs, const char *group) -{ - 
assert(!bs->io_limits_enabled); - throttle_group_register_bs(bs, group); - bs->io_limits_enabled = true; } -void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group) +static void bdrv_parent_drained_end(BlockDriverState *bs) { - /* this bs is not part of any group */ - if (!bs->throttle_state) { - return; - } + BdrvChild *c; - /* this bs is a part of the same group than the one we want */ - if (!g_strcmp0(throttle_group_get_name(bs), group)) { - return; + QLIST_FOREACH(c, &bs->parents, next_parent) { + if (c->role->drained_end) { + c->role->drained_end(c); + } } - - /* need to change the group this bs belong to */ - bdrv_io_limits_disable(bs); - bdrv_io_limits_enable(bs, group); } -void bdrv_setup_io_funcs(BlockDriver *bdrv) +static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src) { - /* Block drivers without coroutine functions need emulation */ - if (!bdrv->bdrv_co_readv) { - bdrv->bdrv_co_readv = bdrv_co_readv_em; - bdrv->bdrv_co_writev = bdrv_co_writev_em; - - /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if - * the block driver lacks aio we need to emulate that too. - */ - if (!bdrv->bdrv_aio_readv) { - /* add AIO emulation layer */ - bdrv->bdrv_aio_readv = bdrv_aio_readv_em; - bdrv->bdrv_aio_writev = bdrv_aio_writev_em; - } - } + dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer); + dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer); + dst->opt_mem_alignment = MAX(dst->opt_mem_alignment, + src->opt_mem_alignment); + dst->min_mem_alignment = MAX(dst->min_mem_alignment, + src->min_mem_alignment); + dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov); } void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) @@ -152,6 +88,9 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) return; } + /* Default alignment based on whether driver has byte interface */ + bs->bl.request_alignment = drv->bdrv_co_preadv ? 
1 : 512; + /* Take some limits from the children as a default */ if (bs->file) { bdrv_refresh_limits(bs->file->bs, &local_err); @@ -159,11 +98,7 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) error_propagate(errp, local_err); return; } - bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length; - bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length; - bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment; - bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment; - bs->bl.max_iov = bs->file->bs->bl.max_iov; + bdrv_merge_limits(&bs->bl, &bs->file->bs->bl); } else { bs->bl.min_mem_alignment = 512; bs->bl.opt_mem_alignment = getpagesize(); @@ -178,21 +113,7 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) error_propagate(errp, local_err); return; } - bs->bl.opt_transfer_length = - MAX(bs->bl.opt_transfer_length, - bs->backing->bs->bl.opt_transfer_length); - bs->bl.max_transfer_length = - MIN_NON_ZERO(bs->bl.max_transfer_length, - bs->backing->bs->bl.max_transfer_length); - bs->bl.opt_mem_alignment = - MAX(bs->bl.opt_mem_alignment, - bs->backing->bs->bl.opt_mem_alignment); - bs->bl.min_mem_alignment = - MAX(bs->bl.min_mem_alignment, - bs->backing->bs->bl.min_mem_alignment); - bs->bl.max_iov = - MIN(bs->bl.max_iov, - bs->backing->bs->bl.max_iov); + bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl); } /* Then let the driver override it */ @@ -225,12 +146,6 @@ bool bdrv_requests_pending(BlockDriverState *bs) if (!QLIST_EMPTY(&bs->tracked_requests)) { return true; } - if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { - return true; - } - if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { - return true; - } QLIST_FOREACH(child, &bs->children, next) { if (bdrv_requests_pending(child->bs)) { @@ -260,18 +175,29 @@ typedef struct { bool done; } BdrvCoDrainData; +static void bdrv_drain_poll(BlockDriverState *bs) +{ + bool busy = true; + + while (busy) { + /* Keep iterating */ + busy = bdrv_requests_pending(bs); + busy |= aio_poll(bdrv_get_aio_context(bs), busy); + } +} + static void bdrv_co_drain_bh_cb(void *opaque) { BdrvCoDrainData *data = opaque; Coroutine *co = data->co; qemu_bh_delete(data->bh); - bdrv_drain(data->bs); + bdrv_drain_poll(data->bs); data->done = true; - qemu_coroutine_enter(co, NULL); + qemu_coroutine_enter(co); } -void coroutine_fn bdrv_co_drain(BlockDriverState *bs) +static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs) { BdrvCoDrainData data; @@ -294,6 +220,34 @@ void coroutine_fn bdrv_co_drain(BlockDriverState *bs) assert(data.done); } +void bdrv_drained_begin(BlockDriverState *bs) +{ + if (!bs->quiesce_counter++) { + aio_disable_external(bdrv_get_aio_context(bs)); + bdrv_parent_drained_begin(bs); + } + + bdrv_io_unplugged_begin(bs); + bdrv_drain_recurse(bs); + if (qemu_in_coroutine()) { + bdrv_co_yield_to_drain(bs); + } else { + bdrv_drain_poll(bs); + } + bdrv_io_unplugged_end(bs); +} + +void bdrv_drained_end(BlockDriverState *bs) +{ + assert(bs->quiesce_counter > 0); + if (--bs->quiesce_counter > 0) { + return; + } + + bdrv_parent_drained_end(bs); + aio_enable_external(bdrv_get_aio_context(bs)); +} + /* * Wait for pending requests to complete on a single BlockDriverState subtree, * and suspend block driver's internal I/O until next request arrives. @@ -305,21 +259,17 @@ void coroutine_fn bdrv_co_drain(BlockDriverState *bs) * not depend on events in other AioContexts. In that case, use * bdrv_drain_all() instead. 
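 *
 * A drained section can also be opened explicitly; bdrv_drain() below is
 * simply the two calls back to back:
 *
 *     bdrv_drained_begin(bs);
 *     ... no new external I/O is dispatched here ...
 *     bdrv_drained_end(bs);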
*/ -void bdrv_drain(BlockDriverState *bs) +void coroutine_fn bdrv_co_drain(BlockDriverState *bs) { - bool busy = true; + assert(qemu_in_coroutine()); + bdrv_drained_begin(bs); + bdrv_drained_end(bs); +} - bdrv_drain_recurse(bs); - if (qemu_in_coroutine()) { - bdrv_co_drain(bs); - return; - } - while (busy) { - /* Keep iterating */ - bdrv_flush_io_queue(bs); - busy = bdrv_requests_pending(bs); - busy |= aio_poll(bdrv_get_aio_context(bs), busy); - } +void bdrv_drain(BlockDriverState *bs) +{ + bdrv_drained_begin(bs); + bdrv_drained_end(bs); } /* @@ -332,16 +282,25 @@ void bdrv_drain_all(void) { /* Always run first iteration so any pending completion BHs run */ bool busy = true; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator it; + BlockJob *job = NULL; GSList *aio_ctxs = NULL, *ctx; - while ((bs = bdrv_next(bs))) { + while ((job = block_job_next(job))) { + AioContext *aio_context = blk_get_aio_context(job->blk); + + aio_context_acquire(aio_context); + block_job_pause(job); + aio_context_release(aio_context); + } + + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); - if (bs->job) { - block_job_pause(bs->job); - } + bdrv_parent_drained_begin(bs); + bdrv_io_unplugged_begin(bs); bdrv_drain_recurse(bs); aio_context_release(aio_context); @@ -361,12 +320,10 @@ void bdrv_drain_all(void) for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { AioContext *aio_context = ctx->data; - bs = NULL; aio_context_acquire(aio_context); - while ((bs = bdrv_next(bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { if (aio_context == bdrv_get_aio_context(bs)) { - bdrv_flush_io_queue(bs); if (bdrv_requests_pending(bs)) { busy = true; aio_poll(aio_context, busy); @@ -378,17 +335,24 @@ void bdrv_drain_all(void) } } - bs = NULL; - while ((bs = bdrv_next(bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); - if (bs->job) { - block_job_resume(bs->job); - } + bdrv_io_unplugged_end(bs); + bdrv_parent_drained_end(bs); aio_context_release(aio_context); } g_slist_free(aio_ctxs); + + job = NULL; + while ((job = block_job_next(job))) { + AioContext *aio_context = blk_get_aio_context(job->blk); + + aio_context_acquire(aio_context); + block_job_resume(job); + aio_context_release(aio_context); + } } /** @@ -447,12 +411,12 @@ static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) } /** - * Round a region to cluster boundaries + * Round a region to cluster boundaries (sector-based) */ -void bdrv_round_to_clusters(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - int64_t *cluster_sector_num, - int *cluster_nb_sectors) +void bdrv_round_sectors_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors) { BlockDriverInfo bdi; @@ -467,6 +431,26 @@ void bdrv_round_to_clusters(BlockDriverState *bs, } } +/** + * Round a region to cluster boundaries + */ +void bdrv_round_to_clusters(BlockDriverState *bs, + int64_t offset, unsigned int bytes, + int64_t *cluster_offset, + unsigned int *cluster_bytes) +{ + BlockDriverInfo bdi; + + if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { + *cluster_offset = offset; + *cluster_bytes = bytes; + } else { + int64_t c = bdi.cluster_size; + *cluster_offset = QEMU_ALIGN_DOWN(offset, c); + *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c); + } +} + static int 
bdrv_get_cluster_size(BlockDriverState *bs) { BlockDriverInfo bdi; @@ -474,7 +458,7 @@ static int bdrv_get_cluster_size(BlockDriverState *bs) ret = bdrv_get_info(bs, &bdi); if (ret < 0 || bdi.cluster_size == 0) { - return bs->request_alignment; + return bs->bl.request_alignment; } else { return bdi.cluster_size; } @@ -568,7 +552,7 @@ static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, } typedef struct RwCo { - BlockDriverState *bs; + BdrvChild *child; int64_t offset; QEMUIOVector *qiov; bool is_write; @@ -581,26 +565,26 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque) RwCo *rwco = opaque; if (!rwco->is_write) { - rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); + rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); } else { - rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); + rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); } } /* * Process a vectored synchronous request using coroutines */ -static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, +static int bdrv_prwv_co(BdrvChild *child, int64_t offset, QEMUIOVector *qiov, bool is_write, BdrvRequestFlags flags) { Coroutine *co; RwCo rwco = { - .bs = bs, + .child = child, .offset = offset, .qiov = qiov, .is_write = is_write, @@ -608,25 +592,14 @@ static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, .flags = flags, }; - /** - * In sync call context, when the vcpu is blocked, this throttling timer - * will not fire; so the I/O throttling function has to be disabled here - * if it has been enabled. - */ - if (bs->io_limits_enabled) { - fprintf(stderr, "Disabling I/O throttling on '%s' due " - "to synchronous I/O.\n", bdrv_get_device_name(bs)); - bdrv_io_limits_disable(bs); - } - if (qemu_in_coroutine()) { /* Fast-path if already in coroutine context */ bdrv_rw_co_entry(&rwco); } else { - AioContext *aio_context = bdrv_get_aio_context(bs); + AioContext *aio_context = bdrv_get_aio_context(child->bs); - co = qemu_coroutine_create(bdrv_rw_co_entry); - qemu_coroutine_enter(co, &rwco); + co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco); + qemu_coroutine_enter(co); while (rwco.ret == NOT_DONE) { aio_poll(aio_context, true); } @@ -637,7 +610,7 @@ static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, /* * Process a synchronous request using coroutines */ -static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, +static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf, int nb_sectors, bool is_write, BdrvRequestFlags flags) { QEMUIOVector qiov; @@ -651,15 +624,15 @@ static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, } qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, + return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS, &qiov, is_write, flags); } /* return < 0 if error. See bdrv_write() for the return codes */ -int bdrv_read(BlockDriverState *bs, int64_t sector_num, +int bdrv_read(BdrvChild *child, int64_t sector_num, uint8_t *buf, int nb_sectors) { - return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); + return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0); } /* Return < 0 if error. 
Important errors are: @@ -668,30 +641,39 @@ int bdrv_read(BlockDriverState *bs, int64_t sector_num, -EINVAL Invalid sector number or nb_sectors -EACCES Trying to write a read-only device */ -int bdrv_write(BlockDriverState *bs, int64_t sector_num, +int bdrv_write(BdrvChild *child, int64_t sector_num, const uint8_t *buf, int nb_sectors) { - return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); + return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0); } -int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) +int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, + int count, BdrvRequestFlags flags) { - return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, - BDRV_REQ_ZERO_WRITE | flags); + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = NULL, + .iov_len = count, + }; + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_prwv_co(child, offset, &qiov, true, + BDRV_REQ_ZERO_WRITE | flags); } /* - * Completely zero out a block device with the help of bdrv_write_zeroes. + * Completely zero out a block device with the help of bdrv_pwrite_zeroes. * The operation is sped up by checking the block status and only writing * zeroes to the device if they currently do not return zeroes. Optional - * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). + * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP, + * BDRV_REQ_FUA). * * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). */ -int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) +int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags) { int64_t target_sectors, ret, nb_sectors, sector_num = 0; + BlockDriverState *bs = child->bs; BlockDriverState *file; int n; @@ -715,7 +697,8 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) sector_num += n; continue; } - ret = bdrv_write_zeroes(bs, sector_num, n, flags); + ret = bdrv_pwrite_zeroes(child, sector_num << BDRV_SECTOR_BITS, + n << BDRV_SECTOR_BITS, flags); if (ret < 0) { error_report("error writing zeroes at sector %" PRId64 ": %s", sector_num, strerror(-ret)); @@ -725,33 +708,39 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) } } -int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) +int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov) +{ + int ret; + + ret = bdrv_prwv_co(child, offset, qiov, false, 0); + if (ret < 0) { + return ret; + } + + return qiov->size; +} + +int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes) { QEMUIOVector qiov; struct iovec iov = { .iov_base = (void *)buf, .iov_len = bytes, }; - int ret; if (bytes < 0) { return -EINVAL; } qemu_iovec_init_external(&qiov, &iov, 1); - ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); - if (ret < 0) { - return ret; - } - - return bytes; + return bdrv_preadv(child, offset, &qiov); } -int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) +int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov) { int ret; - ret = bdrv_prwv_co(bs, offset, qiov, true, 0); + ret = bdrv_prwv_co(child, offset, qiov, true, 0); if (ret < 0) { return ret; } @@ -759,8 +748,7 @@ int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) return qiov->size; } -int bdrv_pwrite(BlockDriverState *bs, int64_t offset, - const void *buf, int bytes) +int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes) { QEMUIOVector qiov; struct iovec 
iov = { @@ -773,7 +761,7 @@ int bdrv_pwrite(BlockDriverState *bs, int64_t offset, } qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_pwritev(bs, offset, &qiov); + return bdrv_pwritev(child, offset, &qiov); } /* @@ -782,17 +770,17 @@ int bdrv_pwrite(BlockDriverState *bs, int64_t offset, * * Returns 0 on success, -errno in error cases. */ -int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, - const void *buf, int count) +int bdrv_pwrite_sync(BdrvChild *child, int64_t offset, + const void *buf, int count) { int ret; - ret = bdrv_pwrite(bs, offset, buf, count); + ret = bdrv_pwrite(child, offset, buf, count); if (ret < 0) { return ret; } - ret = bdrv_flush(bs); + ret = bdrv_flush(child->bs); if (ret < 0) { return ret; } @@ -800,8 +788,117 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, return 0; } +typedef struct CoroutineIOCompletion { + Coroutine *coroutine; + int ret; +} CoroutineIOCompletion; + +static void bdrv_co_io_em_complete(void *opaque, int ret) +{ + CoroutineIOCompletion *co = opaque; + + co->ret = ret; + qemu_coroutine_enter(co->coroutine); +} + +static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + int64_t sector_num; + unsigned int nb_sectors; + + assert(!(flags & ~BDRV_REQ_MASK)); + + if (drv->bdrv_co_preadv) { + return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags); + } + + sector_num = offset >> BDRV_SECTOR_BITS; + nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS); + + if (drv->bdrv_co_readv) { + return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, + bdrv_co_io_em_complete, &co); + if (acb == NULL) { + return -EIO; + } else { + qemu_coroutine_yield(); + return co.ret; + } + } +} + +static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + int64_t sector_num; + unsigned int nb_sectors; + int ret; + + assert(!(flags & ~BDRV_REQ_MASK)); + + if (drv->bdrv_co_pwritev) { + ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, + flags & bs->supported_write_flags); + flags &= ~bs->supported_write_flags; + goto emulate_flags; + } + + sector_num = offset >> BDRV_SECTOR_BITS; + nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS); + + if (drv->bdrv_co_writev_flags) { + ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov, + flags & bs->supported_write_flags); + flags &= ~bs->supported_write_flags; + } else if (drv->bdrv_co_writev) { + assert(!bs->supported_write_flags); + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); + } else { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, + bdrv_co_io_em_complete, &co); + if (acb == NULL) { + ret = -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } + +emulate_flags: + if (ret == 0 && (flags & BDRV_REQ_FUA)) { + ret = bdrv_co_flush(bs); + } + + return ret; +} + 
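The bdrv_driver_pwritev() hunk above introduces the flag-passthrough pattern this series relies on: hand the driver only the flags it declares in supported_write_flags, strip those from the local copy, and emulate whatever is left over, here BDRV_REQ_FUA by issuing a flush once the write has succeeded. The sketch below is a minimal standalone illustration of that fallback logic, not QEMU code; the Dev type and its write/flush callbacks are hypothetical stand-ins for the driver interface.

#include <stdio.h>

/* Hypothetical stand-in for BDRV_REQ_FUA. */
enum { REQ_FUA = 1 << 0 };

typedef struct Dev Dev;
struct Dev {
    unsigned supported_write_flags;      /* flags the backend honours natively */
    int (*write)(Dev *d, unsigned flags);
    int (*flush)(Dev *d);
};

/* Same shape as the emulate_flags path above: forward only the supported
 * flags, then emulate the leftovers (FUA -> explicit flush) on success. */
static int dev_pwritev(Dev *d, unsigned flags)
{
    int ret = d->write(d, flags & d->supported_write_flags);

    flags &= ~d->supported_write_flags;
    if (ret == 0 && (flags & REQ_FUA)) {
        ret = d->flush(d);               /* no native FUA: flush instead */
    }
    return ret;
}

static int stub_write(Dev *d, unsigned flags)
{
    (void)d;
    printf("write, flags=%#x\n", flags);
    return 0;
}

static int stub_flush(Dev *d)
{
    (void)d;
    printf("flush\n");
    return 0;
}

int main(void)
{
    Dev no_fua = { 0, stub_write, stub_flush };
    return dev_pwritev(&no_fua, REQ_FUA);   /* prints the write, then a flush */
}

With a backend that sets REQ_FUA in supported_write_flags, the flag is forwarded as-is and the flush branch never runs, which is how bs->supported_write_flags short-circuits the emulation in the hunk above.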
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) + int64_t offset, unsigned int bytes, QEMUIOVector *qiov) { /* Perform I/O through a temporary buffer so that users who scribble over * their read buffer while the operation is in progress do not end up @@ -813,21 +910,20 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, BlockDriver *drv = bs->drv; struct iovec iov; QEMUIOVector bounce_qiov; - int64_t cluster_sector_num; - int cluster_nb_sectors; + int64_t cluster_offset; + unsigned int cluster_bytes; size_t skip_bytes; int ret; /* Cover entire cluster so no additional backing file I/O is required when * allocating cluster in the image file. */ - bdrv_round_to_clusters(bs, sector_num, nb_sectors, - &cluster_sector_num, &cluster_nb_sectors); + bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes); - trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, - cluster_sector_num, cluster_nb_sectors); + trace_bdrv_co_do_copy_on_readv(bs, offset, bytes, + cluster_offset, cluster_bytes); - iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; + iov.iov_len = cluster_bytes; iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); if (bounce_buffer == NULL) { ret = -ENOMEM; @@ -836,22 +932,24 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, qemu_iovec_init_external(&bounce_qiov, &iov, 1); - ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); + ret = bdrv_driver_preadv(bs, cluster_offset, cluster_bytes, + &bounce_qiov, 0); if (ret < 0) { goto err; } - if (drv->bdrv_co_write_zeroes && + if (drv->bdrv_co_pwrite_zeroes && buffer_is_zero(bounce_buffer, iov.iov_len)) { - ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, - cluster_nb_sectors, 0); + /* FIXME: Should we (perhaps conditionally) be setting + * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy + * that still correctly reads as zero? */ + ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, cluster_bytes, 0); } else { /* This does not change the data on the disk, it is not necessary * to flush even in cache=writethrough mode. */ - ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); + ret = bdrv_driver_pwritev(bs, cluster_offset, cluster_bytes, + &bounce_qiov, 0); } if (ret < 0) { @@ -862,9 +960,8 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, goto err; } - skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; - qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, - nb_sectors * BDRV_SECTOR_SIZE); + skip_bytes = offset - cluster_offset; + qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, bytes); err: qemu_vfree(bounce_buffer); @@ -873,23 +970,31 @@ err: /* * Forwards an already correctly aligned request to the BlockDriver. This - * handles copy on read and zeroing after EOF; any other features must be - * implemented by the caller. + * handles copy on read, zeroing after EOF, and fragmentation of large + * reads; any other features must be implemented by the caller. 
*/ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, int64_t align, QEMUIOVector *qiov, int flags) { - BlockDriver *drv = bs->drv; - int ret; - - int64_t sector_num = offset >> BDRV_SECTOR_BITS; - unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + int64_t total_bytes, max_bytes; + int ret = 0; + uint64_t bytes_remaining = bytes; + int max_transfer; - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(is_power_of_2(align)); + assert((offset & (align - 1)) == 0); + assert((bytes & (align - 1)) == 0); assert(!qiov || bytes == qiov->size); assert((bs->open_flags & BDRV_O_NO_IO) == 0); + max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), + align); + + /* TODO: We would need a per-BDS .supported_read_flags and + * potential fallback support, if we ever implement any read flags + * to pass through to drivers. For now, there aren't any + * passthrough flags. */ + assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ))); /* Handle Copy on Read and associated serialisation */ if (flags & BDRV_REQ_COPY_ON_READ) { @@ -906,76 +1011,77 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, } if (flags & BDRV_REQ_COPY_ON_READ) { + int64_t start_sector = offset >> BDRV_SECTOR_BITS; + int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); + unsigned int nb_sectors = end_sector - start_sector; int pnum; - ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); + ret = bdrv_is_allocated(bs, start_sector, nb_sectors, &pnum); if (ret < 0) { goto out; } if (!ret || pnum != nb_sectors) { - ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); + ret = bdrv_co_do_copy_on_readv(bs, offset, bytes, qiov); goto out; } } - /* Forward the request to the BlockDriver */ - if (!bs->zero_beyond_eof) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); - } else { - /* Read zeros after EOF */ - int64_t total_sectors, max_nb_sectors; + /* Forward the request to the BlockDriver, possibly fragmenting it */ + total_bytes = bdrv_getlength(bs); + if (total_bytes < 0) { + ret = total_bytes; + goto out; + } - total_sectors = bdrv_nb_sectors(bs); - if (total_sectors < 0) { - ret = total_sectors; - goto out; - } + max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); + if (bytes <= max_bytes && bytes <= max_transfer) { + ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0); + goto out; + } + + while (bytes_remaining) { + int num; - max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), - align >> BDRV_SECTOR_BITS); - if (nb_sectors < max_nb_sectors) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); - } else if (max_nb_sectors > 0) { + if (max_bytes) { QEMUIOVector local_qiov; + num = MIN(bytes_remaining, MIN(max_bytes, max_transfer)); + assert(num); qemu_iovec_init(&local_qiov, qiov->niov); - qemu_iovec_concat(&local_qiov, qiov, 0, - max_nb_sectors * BDRV_SECTOR_SIZE); - - ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, - &local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num); + ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, + num, &local_qiov, 0); + max_bytes -= num; qemu_iovec_destroy(&local_qiov); } else { - ret = 0; + num = bytes_remaining; + ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0, + bytes_remaining); } - - /* Reading beyond end of file is supposed to produce zeroes */ - if (ret == 0 && total_sectors < sector_num 
+ nb_sectors) { - uint64_t offset = MAX(0, total_sectors - sector_num); - uint64_t bytes = (sector_num + nb_sectors - offset) * - BDRV_SECTOR_SIZE; - qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); + if (ret < 0) { + goto out; } + bytes_remaining -= num; } out: - return ret; + return ret < 0 ? ret : 0; } /* * Handle a read request in coroutine context */ -int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, +int coroutine_fn bdrv_co_preadv(BdrvChild *child, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { + BlockDriverState *bs = child->bs; BlockDriver *drv = bs->drv; BdrvTrackedRequest req; - /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ - uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint64_t align = bs->bl.request_alignment; uint8_t *head_buf = NULL; uint8_t *tail_buf = NULL; QEMUIOVector local_qiov; @@ -996,11 +1102,6 @@ int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, flags |= BDRV_REQ_COPY_ON_READ; } - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - throttle_group_co_io_limits_intercept(bs, bytes, false); - } - /* Align read if necessary by padding qiov */ if (offset & (align - 1)) { head_buf = qemu_blockalign(bs, align); @@ -1041,7 +1142,7 @@ int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, return ret; } -static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, +static int coroutine_fn bdrv_co_do_readv(BdrvChild *child, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, BdrvRequestFlags flags) { @@ -1049,67 +1150,56 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, return -EINVAL; } - return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); + return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); } -int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) +int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) { - trace_bdrv_co_readv(bs, sector_num, nb_sectors); + trace_bdrv_co_readv(child->bs, sector_num, nb_sectors); - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); + return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0); } -int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors); +/* Maximum buffer for write zeroes fallback, in bytes */ +#define MAX_WRITE_ZEROES_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS) - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, - BDRV_REQ_NO_SERIALISING); -} - -int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); - - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, - BDRV_REQ_COPY_ON_READ); -} - -#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 - -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, BdrvRequestFlags flags) { BlockDriver *drv = bs->drv; QEMUIOVector qiov; struct iovec iov = {0}; int ret = 0; + bool need_flush = false; + int head = 0; + int tail = 0; + + int max_write_zeroes = 
MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX); + int alignment = MAX(bs->bl.pwrite_zeroes_alignment, + bs->bl.request_alignment); - int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, - BDRV_REQUEST_MAX_SECTORS); + assert(alignment % bs->bl.request_alignment == 0); + head = offset % alignment; + tail = (offset + count) % alignment; + max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment); + assert(max_write_zeroes >= bs->bl.request_alignment); - while (nb_sectors > 0 && !ret) { - int num = nb_sectors; + while (count > 0 && !ret) { + int num = count; /* Align request. Block drivers can expect the "bulk" of the request - * to be aligned. + * to be aligned, and that unaligned requests do not cross cluster + * boundaries. */ - if (bs->bl.write_zeroes_alignment - && num > bs->bl.write_zeroes_alignment) { - if (sector_num % bs->bl.write_zeroes_alignment != 0) { - /* Make a small request up to the first aligned sector. */ - num = bs->bl.write_zeroes_alignment; - num -= sector_num % bs->bl.write_zeroes_alignment; - } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { - /* Shorten the request to the last aligned sector. num cannot - * underflow because num > bs->bl.write_zeroes_alignment. - */ - num -= (sector_num + num) % bs->bl.write_zeroes_alignment; - } + if (head) { + /* Make a small request up to the first aligned sector. */ + num = MIN(count, alignment - head); + head = 0; + } else if (tail && num > alignment) { + /* Shorten the request to the last aligned sector. */ + num -= tail; } /* limit request size */ @@ -1119,64 +1209,90 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, ret = -ENOTSUP; /* First try the efficient write zeroes operation */ - if (drv->bdrv_co_write_zeroes) { - ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); + if (drv->bdrv_co_pwrite_zeroes) { + ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num, + flags & bs->supported_zero_flags); + if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) && + !(bs->supported_zero_flags & BDRV_REQ_FUA)) { + need_flush = true; + } + } else { + assert(!bs->supported_zero_flags); } if (ret == -ENOTSUP) { /* Fall back to bounce buffer if write zeroes is unsupported */ - int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, + int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_WRITE_ZEROES_BOUNCE_BUFFER); - num = MIN(num, max_xfer_len); - iov.iov_len = num * BDRV_SECTOR_SIZE; + BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE; + + if ((flags & BDRV_REQ_FUA) && + !(bs->supported_write_flags & BDRV_REQ_FUA)) { + /* No need for bdrv_driver_pwritev() to do a fallback + * flush on each chunk; use just one at the end */ + write_flags &= ~BDRV_REQ_FUA; + need_flush = true; + } + num = MIN(num, max_transfer); + iov.iov_len = num; if (iov.iov_base == NULL) { - iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); + iov.iov_base = qemu_try_blockalign(bs, num); if (iov.iov_base == NULL) { ret = -ENOMEM; goto fail; } - memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); + memset(iov.iov_base, 0, num); } qemu_iovec_init_external(&qiov, &iov, 1); - ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); + ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags); /* Keep bounce buffer around if it is big enough for all * future requests. 
*/ - if (num < max_xfer_len) { + if (num < max_transfer) { qemu_vfree(iov.iov_base); iov.iov_base = NULL; } } - sector_num += num; - nb_sectors -= num; + offset += num; + count -= num; } fail: + if (ret == 0 && need_flush) { + ret = bdrv_co_flush(bs); + } qemu_vfree(iov.iov_base); return ret; } /* - * Forwards an already correctly aligned write request to the BlockDriver. + * Forwards an already correctly aligned write request to the BlockDriver, + * after possibly fragmenting it. */ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, - QEMUIOVector *qiov, int flags) + int64_t align, QEMUIOVector *qiov, int flags) { BlockDriver *drv = bs->drv; bool waited; int ret; - int64_t sector_num = offset >> BDRV_SECTOR_BITS; - unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + int64_t start_sector = offset >> BDRV_SECTOR_BITS; + int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); + uint64_t bytes_remaining = bytes; + int max_transfer; - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(is_power_of_2(align)); + assert((offset & (align - 1)) == 0); + assert((bytes & (align - 1)) == 0); assert(!qiov || bytes == qiov->size); assert((bs->open_flags & BDRV_O_NO_IO) == 0); + assert(!(flags & ~BDRV_REQ_MASK)); + max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), + align); waited = wait_serialising_requests(req); assert(!waited || !req->serialising); @@ -1186,7 +1302,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && - !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && + !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && qemu_iovec_is_zero(qiov)) { flags |= BDRV_REQ_ZERO_WRITE; if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { @@ -1198,32 +1314,48 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, /* Do nothing, write notifier decided to fail this request */ } else if (flags & BDRV_REQ_ZERO_WRITE) { bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); - ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); - } else if (drv->bdrv_co_writev_flags) { + ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags); + } else if (bytes <= max_transfer) { bdrv_debug_event(bs, BLKDBG_PWRITEV); - ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov, - flags); + ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags); } else { - assert(drv->supported_write_flags == 0); bdrv_debug_event(bs, BLKDBG_PWRITEV); - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); - } - bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); + while (bytes_remaining) { + int num = MIN(bytes_remaining, max_transfer); + QEMUIOVector local_qiov; + int local_flags = flags; + + assert(num); + if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && + !(bs->supported_write_flags & BDRV_REQ_FUA)) { + /* If FUA is going to be emulated by flush, we only + * need to flush on the last iteration */ + local_flags &= ~BDRV_REQ_FUA; + } + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num); - if (ret == 0 && (flags & BDRV_REQ_FUA) && - !(drv->supported_write_flags & BDRV_REQ_FUA)) - { - ret = bdrv_co_flush(bs); + ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, + num, &local_qiov, 
local_flags); + qemu_iovec_destroy(&local_qiov); + if (ret < 0) { + break; + } + bytes_remaining -= num; + } } + bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); - bdrv_set_dirty(bs, sector_num, nb_sectors); + ++bs->write_gen; + bdrv_set_dirty(bs, start_sector, end_sector - start_sector); if (bs->wr_highest_offset < offset + bytes) { bs->wr_highest_offset = offset + bytes; } if (ret >= 0) { - bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); + bs->total_sectors = MAX(bs->total_sectors, end_sector); + ret = 0; } return ret; @@ -1238,7 +1370,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, uint8_t *buf = NULL; QEMUIOVector local_qiov; struct iovec iov; - uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint64_t align = bs->bl.request_alignment; unsigned int head_padding_bytes, tail_padding_bytes; int ret = 0; @@ -1271,7 +1403,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, memset(buf + head_padding_bytes, 0, zero_bytes); ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, - &local_qiov, + align, &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); if (ret < 0) { goto fail; @@ -1284,7 +1416,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, if (bytes >= align) { /* Write the aligned part in the middle. */ uint64_t aligned_bytes = bytes & ~(align - 1); - ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, + ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, align, NULL, flags); if (ret < 0) { goto fail; @@ -1308,7 +1440,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); memset(buf, 0, bytes); - ret = bdrv_aligned_pwritev(bs, req, offset, align, + ret = bdrv_aligned_pwritev(bs, req, offset, align, align, &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); } fail: @@ -1320,13 +1452,13 @@ fail: /* * Handle a write request in coroutine context */ -int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, +int coroutine_fn bdrv_co_pwritev(BdrvChild *child, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { + BlockDriverState *bs = child->bs; BdrvTrackedRequest req; - /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ - uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint64_t align = bs->bl.request_alignment; uint8_t *head_buf = NULL; uint8_t *tail_buf = NULL; QEMUIOVector local_qiov; @@ -1346,11 +1478,6 @@ int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, return ret; } - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - throttle_group_co_io_limits_intercept(bs, bytes, true); - } - /* * Align write if necessary by performing a read-modify-write cycle. * Pad qiov with the read parts and be sure to have a tracked request not @@ -1392,6 +1519,14 @@ int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, bytes += offset & (align - 1); offset = offset & ~(align - 1); + + /* We have read the tail already if the request is smaller + * than one aligned block. + */ + if (bytes < align) { + qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes); + bytes = align; + } } if ((offset + bytes) & (align - 1)) { @@ -1431,7 +1566,7 @@ int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, bytes = ROUND_UP(bytes, align); } - ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, + ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, align, use_local_qiov ? 
&local_qiov : qiov, flags); @@ -1447,7 +1582,7 @@ out: return ret; } -static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, +static int coroutine_fn bdrv_co_do_writev(BdrvChild *child, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, BdrvRequestFlags flags) { @@ -1455,30 +1590,29 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, return -EINVAL; } - return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); + return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); } -int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, +int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - trace_bdrv_co_writev(bs, sector_num, nb_sectors); + trace_bdrv_co_writev(child->bs, sector_num, nb_sectors); - return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); + return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0); } -int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BdrvRequestFlags flags) +int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, + int count, BdrvRequestFlags flags) { - trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); + trace_bdrv_co_pwrite_zeroes(child->bs, offset, count, flags); - if (!(bs->open_flags & BDRV_O_UNMAP)) { + if (!(child->bs->open_flags & BDRV_O_UNMAP)) { flags &= ~BDRV_REQ_MAY_UNMAP; } - return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, - BDRV_REQ_ZERO_WRITE | flags); + return bdrv_co_pwritev(child, offset, count, NULL, + BDRV_REQ_ZERO_WRITE | flags); } typedef struct BdrvCoGetBlockStatusData { @@ -1663,8 +1797,9 @@ int64_t bdrv_get_block_status_above(BlockDriverState *bs, } else { AioContext *aio_context = bdrv_get_aio_context(bs); - co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry); - qemu_coroutine_enter(co, &data); + co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry, + &data); + qemu_coroutine_enter(co); while (!data.done) { aio_poll(aio_context, true); } @@ -1766,273 +1901,134 @@ int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); } -int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, - int64_t pos, int size) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *) buf, - .iov_len = size, - }; +typedef struct BdrvVmstateCo { + BlockDriverState *bs; + QEMUIOVector *qiov; + int64_t pos; + bool is_read; + int ret; +} BdrvVmstateCo; - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_writev_vmstate(bs, &qiov, pos); -} - -int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) +static int coroutine_fn +bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, + bool is_read) { BlockDriver *drv = bs->drv; if (!drv) { return -ENOMEDIUM; - } else if (drv->bdrv_save_vmstate) { - return drv->bdrv_save_vmstate(bs, qiov, pos); + } else if (drv->bdrv_load_vmstate) { + return is_read ? 
drv->bdrv_load_vmstate(bs, qiov, pos) + : drv->bdrv_save_vmstate(bs, qiov, pos); } else if (bs->file) { - return bdrv_writev_vmstate(bs->file->bs, qiov, pos); + return bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read); } return -ENOTSUP; } -int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, - int64_t pos, int size) -{ - BlockDriver *drv = bs->drv; - if (!drv) - return -ENOMEDIUM; - if (drv->bdrv_load_vmstate) - return drv->bdrv_load_vmstate(bs, buf, pos, size); - if (bs->file) - return bdrv_load_vmstate(bs->file->bs, buf, pos, size); - return -ENOTSUP; -} - -/**************************************************************/ -/* async I/Os */ - -BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, - cb, opaque, false); -} - -BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque) { - trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, - cb, opaque, true); + BdrvVmstateCo *co = opaque; + co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read); } -BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque) +static inline int +bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, + bool is_read) { - trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, - BDRV_REQ_ZERO_WRITE | flags, - cb, opaque, true); -} - - -typedef struct MultiwriteCB { - int error; - int num_requests; - int num_callbacks; - struct { - BlockCompletionFunc *cb; - void *opaque; - QEMUIOVector *free_qiov; - } callbacks[]; -} MultiwriteCB; - -static void multiwrite_user_cb(MultiwriteCB *mcb) -{ - int i; + if (qemu_in_coroutine()) { + return bdrv_co_rw_vmstate(bs, qiov, pos, is_read); + } else { + BdrvVmstateCo data = { + .bs = bs, + .qiov = qiov, + .pos = pos, + .is_read = is_read, + .ret = -EINPROGRESS, + }; + Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data); - for (i = 0; i < mcb->num_callbacks; i++) { - mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); - if (mcb->callbacks[i].free_qiov) { - qemu_iovec_destroy(mcb->callbacks[i].free_qiov); + qemu_coroutine_enter(co); + while (data.ret == -EINPROGRESS) { + aio_poll(bdrv_get_aio_context(bs), true); } - g_free(mcb->callbacks[i].free_qiov); + return data.ret; } } -static void multiwrite_cb(void *opaque, int ret) +int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, + int64_t pos, int size) { - MultiwriteCB *mcb = opaque; + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = size, + }; + int ret; - trace_multiwrite_cb(mcb, ret); + qemu_iovec_init_external(&qiov, &iov, 1); - if (ret < 0 && !mcb->error) { - mcb->error = ret; + ret = bdrv_writev_vmstate(bs, &qiov, pos); + if (ret < 0) { + return ret; } - mcb->num_requests--; - if (mcb->num_requests == 0) { - multiwrite_user_cb(mcb); - g_free(mcb); - } + return size; } -static int multiwrite_req_compare(const void *a, const void *b) +int bdrv_writev_vmstate(BlockDriverState *bs, 
QEMUIOVector *qiov, int64_t pos) { - const BlockRequest *req1 = a, *req2 = b; - - /* - * Note that we can't simply subtract req2->sector from req1->sector - * here as that could overflow the return value. - */ - if (req1->sector > req2->sector) { - return 1; - } else if (req1->sector < req2->sector) { - return -1; - } else { - return 0; - } + return bdrv_rw_vmstate(bs, qiov, pos, false); } -/* - * Takes a bunch of requests and tries to merge them. Returns the number of - * requests that remain after merging. - */ -static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, - int num_reqs, MultiwriteCB *mcb) +int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size) { - int i, outidx; - - // Sort requests by start sector - qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); - - // Check if adjacent requests touch the same clusters. If so, combine them, - // filling up gaps with zero sectors. - outidx = 0; - for (i = 1; i < num_reqs; i++) { - int merge = 0; - int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; - - // Handle exactly sequential writes and overlapping writes. - if (reqs[i].sector <= oldreq_last) { - merge = 1; - } - - if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > - bs->bl.max_iov) { - merge = 0; - } - - if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + - reqs[i].nb_sectors > bs->bl.max_transfer_length) { - merge = 0; - } - - if (merge) { - size_t size; - QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); - qemu_iovec_init(qiov, - reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); - - // Add the first request to the merged one. If the requests are - // overlapping, drop the last sectors of the first request. - size = (reqs[i].sector - reqs[outidx].sector) << 9; - qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); - - // We shouldn't need to add any zeros between the two requests - assert (reqs[i].sector <= oldreq_last); - - // Add the second request - qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); - - // Add tail of first request, if necessary - if (qiov->size < reqs[outidx].qiov->size) { - qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, - reqs[outidx].qiov->size - qiov->size); - } - - reqs[outidx].nb_sectors = qiov->size >> 9; - reqs[outidx].qiov = qiov; - - mcb->callbacks[i].free_qiov = reqs[outidx].qiov; - } else { - outidx++; - reqs[outidx].sector = reqs[i].sector; - reqs[outidx].nb_sectors = reqs[i].nb_sectors; - reqs[outidx].qiov = reqs[i].qiov; - } - } + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = buf, + .iov_len = size, + }; + int ret; - if (bs->blk) { - block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE, - num_reqs - outidx - 1); + qemu_iovec_init_external(&qiov, &iov, 1); + ret = bdrv_readv_vmstate(bs, &qiov, pos); + if (ret < 0) { + return ret; } - return outidx + 1; + return size; } -/* - * Submit multiple AIO write requests at once. - * - * On success, the function returns 0 and all requests in the reqs array have - * been submitted. In error case this function returns -1, and any of the - * requests may or may not be submitted yet. In particular, this means that the - * callback will be called for some of the requests, for others it won't. The - * caller must check the error field of the BlockRequest to wait for the right - * callbacks (if error != 0, no callback will be called). - * - * The implementation may modify the contents of the reqs array, e.g. to merge - * requests. 
However, the fields opaque and error are left unmodified as they - * are used to signal failure for a single request to the caller. - */ -int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) +int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) { - MultiwriteCB *mcb; - int i; - - /* don't submit writes if we don't have a medium */ - if (bs->drv == NULL) { - for (i = 0; i < num_reqs; i++) { - reqs[i].error = -ENOMEDIUM; - } - return -1; - } - - if (num_reqs == 0) { - return 0; - } - - // Create MultiwriteCB structure - mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); - mcb->num_requests = 0; - mcb->num_callbacks = num_reqs; + return bdrv_rw_vmstate(bs, qiov, pos, true); +} - for (i = 0; i < num_reqs; i++) { - mcb->callbacks[i].cb = reqs[i].cb; - mcb->callbacks[i].opaque = reqs[i].opaque; - } +/**************************************************************/ +/* async I/Os */ - // Check for mergable requests - num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); +BlockAIOCB *bdrv_aio_readv(BdrvChild *child, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_readv(child->bs, sector_num, nb_sectors, opaque); - trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); + assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size); + return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov, + 0, cb, opaque, false); +} - /* Run the aio requests. */ - mcb->num_requests = num_reqs; - for (i = 0; i < num_reqs; i++) { - bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, - reqs[i].nb_sectors, reqs[i].flags, - multiwrite_cb, mcb, - true); - } +BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_writev(child->bs, sector_num, nb_sectors, opaque); - return 0; + assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size); + return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov, + 0, cb, opaque, true); } void bdrv_aio_cancel(BlockAIOCB *acb) @@ -2064,82 +2060,30 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb) /**************************************************************/ /* async block device emulation */ -typedef struct BlockAIOCBSync { - BlockAIOCB common; - QEMUBH *bh; - int ret; - /* vector translation state */ - QEMUIOVector *qiov; - uint8_t *bounce; - int is_write; -} BlockAIOCBSync; - -static const AIOCBInfo bdrv_em_aiocb_info = { - .aiocb_size = sizeof(BlockAIOCBSync), -}; - -static void bdrv_aio_bh_cb(void *opaque) -{ - BlockAIOCBSync *acb = opaque; - - if (!acb->is_write && acb->ret >= 0) { - qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); - } - qemu_vfree(acb->bounce); - acb->common.cb(acb->common.opaque, acb->ret); - qemu_bh_delete(acb->bh); - acb->bh = NULL; - qemu_aio_unref(acb); -} - -static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque, - int is_write) - -{ - BlockAIOCBSync *acb; - - acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); - acb->is_write = is_write; - acb->qiov = qiov; - acb->bounce = qemu_try_blockalign(bs, qiov->size); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); - - if (acb->bounce == NULL) { - acb->ret = -ENOMEM; - } else if (is_write) { - qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); - acb->ret = bs->drv->bdrv_write(bs, 
sector_num, acb->bounce, nb_sectors); - } else { - acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); - } - - qemu_bh_schedule(acb->bh); - - return &acb->common; -} - -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); -} - -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); -} +typedef struct BlockRequest { + union { + /* Used during read, write, trim */ + struct { + int64_t offset; + int bytes; + int flags; + QEMUIOVector *qiov; + }; + /* Used during ioctl */ + struct { + int req; + void *buf; + }; + }; + BlockCompletionFunc *cb; + void *opaque; + int error; +} BlockRequest; typedef struct BlockAIOCBCoroutine { BlockAIOCB common; + BdrvChild *child; BlockRequest req; bool is_write; bool need_bh; @@ -2183,42 +2127,40 @@ static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) static void coroutine_fn bdrv_co_do_rw(void *opaque) { BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; if (!acb->is_write) { - acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + acb->req.error = bdrv_co_preadv(acb->child, acb->req.offset, + acb->req.qiov->size, acb->req.qiov, acb->req.flags); } else { - acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + acb->req.error = bdrv_co_pwritev(acb->child, acb->req.offset, + acb->req.qiov->size, acb->req.qiov, acb->req.flags); } bdrv_co_complete(acb); } -static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write) +static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child, + int64_t offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write) { Coroutine *co; BlockAIOCBCoroutine *acb; - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, child->bs, cb, opaque); + acb->child = child; acb->need_bh = true; acb->req.error = -EINPROGRESS; - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; + acb->req.offset = offset; acb->req.qiov = qiov; acb->req.flags = flags; acb->is_write = is_write; - co = qemu_coroutine_create(bdrv_co_do_rw); - qemu_coroutine_enter(co, acb); + co = qemu_coroutine_create(bdrv_co_do_rw, acb); + qemu_coroutine_enter(co); bdrv_co_maybe_schedule_bh(acb); return &acb->common; @@ -2245,38 +2187,37 @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, acb->need_bh = true; acb->req.error = -EINPROGRESS; - co = qemu_coroutine_create(bdrv_aio_flush_co_entry); - qemu_coroutine_enter(co, acb); + co = qemu_coroutine_create(bdrv_aio_flush_co_entry, acb); + qemu_coroutine_enter(co); bdrv_co_maybe_schedule_bh(acb); return &acb->common; } -static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) +static void coroutine_fn bdrv_aio_pdiscard_co_entry(void *opaque) { BlockAIOCBCoroutine *acb = opaque; BlockDriverState *bs = acb->common.bs; - acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); + acb->req.error = bdrv_co_pdiscard(bs, 
acb->req.offset, acb->req.bytes); bdrv_co_complete(acb); } -BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +BlockAIOCB *bdrv_aio_pdiscard(BlockDriverState *bs, int64_t offset, int count, + BlockCompletionFunc *cb, void *opaque) { Coroutine *co; BlockAIOCBCoroutine *acb; - trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); + trace_bdrv_aio_pdiscard(bs, offset, count, opaque); acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); acb->need_bh = true; acb->req.error = -EINPROGRESS; - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; - co = qemu_coroutine_create(bdrv_aio_discard_co_entry); - qemu_coroutine_enter(co, acb); + acb->req.offset = offset; + acb->req.bytes = count; + co = qemu_coroutine_create(bdrv_aio_pdiscard_co_entry, acb); + qemu_coroutine_enter(co); bdrv_co_maybe_schedule_bh(acb); return &acb->common; @@ -2314,62 +2255,15 @@ void qemu_aio_unref(void *p) /**************************************************************/ /* Coroutine block device emulation */ -typedef struct CoroutineIOCompletion { - Coroutine *coroutine; +typedef struct FlushCo { + BlockDriverState *bs; int ret; -} CoroutineIOCompletion; - -static void bdrv_co_io_em_complete(void *opaque, int ret) -{ - CoroutineIOCompletion *co = opaque; - - co->ret = ret; - qemu_coroutine_enter(co->coroutine, NULL); -} - -static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *iov, - bool is_write) -{ - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - BlockAIOCB *acb; - - if (is_write) { - acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); - } else { - acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); - } - - trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); - if (!acb) { - return -EIO; - } - qemu_coroutine_yield(); - - return co.ret; -} - -static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); -} +} FlushCo; -static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); -} static void coroutine_fn bdrv_flush_co_entry(void *opaque) { - RwCo *rwco = opaque; + FlushCo *rwco = opaque; rwco->ret = bdrv_co_flush(rwco->bs); } @@ -2386,6 +2280,15 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH); + int current_gen = bs->write_gen; + + /* Wait until any previous flushes are completed */ + while (bs->active_flush_req != NULL) { + qemu_co_queue_wait(&bs->flush_queue); + } + + bs->active_flush_req = &req; + /* Write back all layers by calling one driver function */ if (bs->drv->bdrv_co_flush) { ret = bs->drv->bdrv_co_flush(bs); @@ -2406,6 +2309,11 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) goto flush_parent; } + /* Check if we really need to flush anything */ + if (bs->flushed_gen == current_gen) { + goto flush_parent; + } + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); if (bs->drv->bdrv_co_flush_to_disk) { ret = bs->drv->bdrv_co_flush_to_disk(bs); @@ -2436,6 +2344,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) */ ret = 0; } + if (ret < 0) { goto out; } @@ -2446,6 +2355,12 @@ int coroutine_fn 
bdrv_co_flush(BlockDriverState *bs) flush_parent: ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0; out: + /* Notify any pending flushes that we have completed */ + bs->flushed_gen = current_gen; + bs->active_flush_req = NULL; + /* Return value is ignored - it's ok if wait queue is empty */ + qemu_co_queue_next(&bs->flush_queue); + tracked_request_end(&req); return ret; } @@ -2453,51 +2368,52 @@ out: int bdrv_flush(BlockDriverState *bs) { Coroutine *co; - RwCo rwco = { + FlushCo flush_co = { .bs = bs, .ret = NOT_DONE, }; if (qemu_in_coroutine()) { /* Fast-path if already in coroutine context */ - bdrv_flush_co_entry(&rwco); + bdrv_flush_co_entry(&flush_co); } else { AioContext *aio_context = bdrv_get_aio_context(bs); - co = qemu_coroutine_create(bdrv_flush_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { + co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co); + qemu_coroutine_enter(co); + while (flush_co.ret == NOT_DONE) { aio_poll(aio_context, true); } } - return rwco.ret; + return flush_co.ret; } typedef struct DiscardCo { BlockDriverState *bs; - int64_t sector_num; - int nb_sectors; + int64_t offset; + int count; int ret; } DiscardCo; -static void coroutine_fn bdrv_discard_co_entry(void *opaque) +static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque) { DiscardCo *rwco = opaque; - rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); + rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->count); } -int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) +int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, + int count) { BdrvTrackedRequest req; - int max_discard, ret; + int max_pdiscard, ret; + int head, align; if (!bs->drv) { return -ENOMEDIUM; } - ret = bdrv_check_request(bs, sector_num, nb_sectors); + ret = bdrv_check_byte_request(bs, offset, count); if (ret < 0) { return ret; } else if (bs->read_only) { @@ -2510,44 +2426,49 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, return 0; } - if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { + if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) { return 0; } - tracked_request_begin(&req, bs, sector_num, nb_sectors, - BDRV_TRACKED_DISCARD); - bdrv_set_dirty(bs, sector_num, nb_sectors); + /* Discard is advisory, so ignore any unaligned head or tail */ + align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); + assert(align % bs->bl.request_alignment == 0); + head = offset % align; + if (head) { + head = MIN(count, align - head); + count -= head; + offset += head; + } + count = QEMU_ALIGN_DOWN(count, align); + if (!count) { + return 0; + } - max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); - while (nb_sectors > 0) { - int ret; - int num = nb_sectors; - - /* align request */ - if (bs->bl.discard_alignment && - num >= bs->bl.discard_alignment && - sector_num % bs->bl.discard_alignment) { - if (num > bs->bl.discard_alignment) { - num = bs->bl.discard_alignment; - } - num -= sector_num % bs->bl.discard_alignment; - } + tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD); - /* limit request size */ - if (num > max_discard) { - num = max_discard; - } + ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req); + if (ret < 0) { + goto out; + } - if (bs->drv->bdrv_co_discard) { - ret = bs->drv->bdrv_co_discard(bs, sector_num, num); + max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX), + 
align); + assert(max_pdiscard); + + while (count > 0) { + int ret; + int num = MIN(count, max_pdiscard); + + if (bs->drv->bdrv_co_pdiscard) { + ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); } else { BlockAIOCB *acb; CoroutineIOCompletion co = { .coroutine = qemu_coroutine_self(), }; - acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, - bdrv_co_io_em_complete, &co); + acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num, + bdrv_co_io_em_complete, &co); if (acb == NULL) { ret = -EIO; goto out; @@ -2560,33 +2481,36 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, goto out; } - sector_num += num; - nb_sectors -= num; + offset += num; + count -= num; } ret = 0; out: + ++bs->write_gen; + bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS, + req.bytes >> BDRV_SECTOR_BITS); tracked_request_end(&req); return ret; } -int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count) { Coroutine *co; DiscardCo rwco = { .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, + .offset = offset, + .count = count, .ret = NOT_DONE, }; if (qemu_in_coroutine()) { /* Fast-path if already in coroutine context */ - bdrv_discard_co_entry(&rwco); + bdrv_pdiscard_co_entry(&rwco); } else { AioContext *aio_context = bdrv_get_aio_context(bs); - co = qemu_coroutine_create(bdrv_discard_co_entry); - qemu_coroutine_enter(co, &rwco); + co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco); + qemu_coroutine_enter(co); while (rwco.ret == NOT_DONE) { aio_poll(aio_context, true); } @@ -2595,19 +2519,6 @@ int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) return rwco.ret; } -typedef struct { - CoroutineIOCompletion *co; - QEMUBH *bh; -} BdrvIoctlCompletionData; - -static void bdrv_ioctl_bh_cb(void *opaque) -{ - BdrvIoctlCompletionData *data = opaque; - - bdrv_co_io_em_complete(data->co, -ENOTSUP); - qemu_bh_delete(data->bh); -} - static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf) { BlockDriver *drv = bs->drv; @@ -2625,11 +2536,8 @@ static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf) acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); if (!acb) { - BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1); - data->bh = aio_bh_new(bdrv_get_aio_context(bs), - bdrv_ioctl_bh_cb, data); - data->co = &co; - qemu_bh_schedule(data->bh); + co.ret = -ENOTSUP; + goto out; } qemu_coroutine_yield(); out: @@ -2664,9 +2572,9 @@ int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) /* Fast-path if already in coroutine context */ bdrv_co_ioctl_entry(&data); } else { - Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry); + Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry, &data); - qemu_coroutine_enter(co, &data); + qemu_coroutine_enter(co); while (data.ret == -EINPROGRESS) { aio_poll(bdrv_get_aio_context(bs), true); } @@ -2694,8 +2602,8 @@ BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, acb->req.error = -EINPROGRESS; acb->req.req = req; acb->req.buf = buf; - co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry); - qemu_coroutine_enter(co, acb); + co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry, acb); + qemu_coroutine_enter(co); bdrv_co_maybe_schedule_bh(acb); return &acb->common; @@ -2763,48 +2671,66 @@ void bdrv_add_before_write_notifier(BlockDriverState *bs, void bdrv_io_plug(BlockDriverState *bs) { - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_io_plug) { - drv->bdrv_io_plug(bs); - } else 
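Both bdrv_flush and the renamed bdrv_pdiscard above share one synchronous-wrapper shape: run the coroutine entry directly if the caller is already in coroutine context, otherwise create a coroutine and poll the AioContext until the NOT_DONE sentinel is overwritten. A toy model of that shape, with plain functions standing in for the real coroutine and aio_poll machinery:

```c
#include <stdbool.h>
#include <stdio.h>

#define NOT_DONE 0x7fffffff            /* in-flight sentinel, as in the patch */

typedef struct { int ret; } FlushCo;

/* stand-in for the coroutine entry point (cf. bdrv_flush_co_entry) */
static void flush_entry(FlushCo *co)
{
    co->ret = 0;                       /* pretend the flush succeeded */
}

/* stand-in for one aio_poll() pass that eventually completes the op */
static void poll_once(FlushCo *co)
{
    flush_entry(co);
}

static int my_flush(bool in_coroutine)
{
    FlushCo co = { .ret = NOT_DONE };

    if (in_coroutine) {
        flush_entry(&co);              /* fast path: call directly */
    } else {
        while (co.ret == NOT_DONE) {   /* slow path: drive the event loop */
            poll_once(&co);
        }
    }
    return co.ret;
}

int main(void)
{
    printf("flush returned %d\n", my_flush(false));
    return 0;
}
```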
if (bs->file) { - bdrv_io_plug(bs->file->bs); + BdrvChild *child; + + QLIST_FOREACH(child, &bs->children, next) { + bdrv_io_plug(child->bs); + } + + if (bs->io_plugged++ == 0 && bs->io_plug_disabled == 0) { + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_plug) { + drv->bdrv_io_plug(bs); + } } } void bdrv_io_unplug(BlockDriverState *bs) { - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_io_unplug) { - drv->bdrv_io_unplug(bs); - } else if (bs->file) { - bdrv_io_unplug(bs->file->bs); + BdrvChild *child; + + assert(bs->io_plugged); + if (--bs->io_plugged == 0 && bs->io_plug_disabled == 0) { + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_unplug) { + drv->bdrv_io_unplug(bs); + } } -} -void bdrv_flush_io_queue(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_flush_io_queue) { - drv->bdrv_flush_io_queue(bs); - } else if (bs->file) { - bdrv_flush_io_queue(bs->file->bs); + QLIST_FOREACH(child, &bs->children, next) { + bdrv_io_unplug(child->bs); } - bdrv_start_throttled_reqs(bs); } -void bdrv_drained_begin(BlockDriverState *bs) +void bdrv_io_unplugged_begin(BlockDriverState *bs) { - if (!bs->quiesce_counter++) { - aio_disable_external(bdrv_get_aio_context(bs)); + BdrvChild *child; + + if (bs->io_plug_disabled++ == 0 && bs->io_plugged > 0) { + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_unplug) { + drv->bdrv_io_unplug(bs); + } + } + + QLIST_FOREACH(child, &bs->children, next) { + bdrv_io_unplugged_begin(child->bs); } - bdrv_drain(bs); } -void bdrv_drained_end(BlockDriverState *bs) +void bdrv_io_unplugged_end(BlockDriverState *bs) { - assert(bs->quiesce_counter > 0); - if (--bs->quiesce_counter > 0) { - return; + BdrvChild *child; + + assert(bs->io_plug_disabled); + QLIST_FOREACH(child, &bs->children, next) { + bdrv_io_unplugged_end(child->bs); + } + + if (--bs->io_plug_disabled == 0 && bs->io_plugged > 0) { + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_plug) { + drv->bdrv_io_plug(bs); + } } - aio_enable_external(bdrv_get_aio_context(bs)); }
diff --git a/block/iscsi.c b/block/iscsi.c index 302baf84c..95ce9e139 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -2,7 +2,7 @@ * QEMU Block driver for iSCSI images * * Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com> - * Copyright (c) 2012-2015 Peter Lieven <pl@kamp.de> + * Copyright (c) 2012-2016 Peter Lieven <pl@kamp.de> * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -46,7 +46,6 @@ #ifdef __linux__ #include <scsi/sg.h> -#include <block/scsi.h> #endif typedef struct IscsiLun { @@ -62,7 +61,23 @@ typedef struct IscsiLun { struct scsi_inquiry_logical_block_provisioning lbp; struct scsi_inquiry_block_limits bl; unsigned char *zeroblock; - unsigned long *allocationmap; + /* The allocmap tracks which clusters (pages) on the iSCSI target are + * allocated and which are not. In case a target returns zeros for + * unallocated pages (iscsilun->lbprz) we can directly return zeros instead + * of reading zeros over the wire if a read request falls within an + * unallocated block. As there are 3 possible states we need 2 bitmaps to + * track. allocmap_valid keeps track of whether QEMU's information about a + * page is valid. allocmap tracks if a page is allocated or not. In case QEMU has no + * valid information about a page, the corresponding allocmap entry should be + * switched to unallocated as well, to force a new lookup of the allocation + * status, since lookups are generally skipped if a page is suspected to be + * allocated. If an iSCSI target is opened with cache.direct = on, the + * allocmap_valid bitmap does not exist, turning all cached information invalid so + * that a fresh lookup is made for any page even if the allocmap entry says + * it is unallocated. */ + unsigned long *allocmap; + unsigned long *allocmap_valid; + long allocmap_size; int cluster_sectors; bool use_16_for_rw; bool write_protected; @@ -153,7 +168,7 @@ static void iscsi_co_generic_bh_cb(void *opaque) struct IscsiTask *iTask = opaque; iTask->complete = 1; qemu_bh_delete(iTask->bh); - qemu_coroutine_enter(iTask->co, NULL); + qemu_coroutine_enter(iTask->co); } static void iscsi_retry_timer_expired(void *opaque) @@ -161,7 +176,7 @@ static void iscsi_retry_timer_expired(void *opaque) struct IscsiTask *iTask = opaque; iTask->complete = 1; if (iTask->co) { - qemu_coroutine_enter(iTask->co, NULL); + qemu_coroutine_enter(iTask->co); } } @@ -401,53 +416,159 @@ static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun) return sector * BDRV_SECTOR_SIZE / iscsilun->block_size; } -static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors, - IscsiLun *iscsilun) +static bool is_byte_request_lun_aligned(int64_t offset, int count, + IscsiLun *iscsilun) { - if ((sector_num * BDRV_SECTOR_SIZE) % iscsilun->block_size || - (nb_sectors * BDRV_SECTOR_SIZE) % iscsilun->block_size) { - error_report("iSCSI misaligned request: " - "iscsilun->block_size %u, sector_num %" PRIi64 - ", nb_sectors %d", - iscsilun->block_size, sector_num, nb_sectors); - return 0; - } - return 1; + if (offset % iscsilun->block_size || count % iscsilun->block_size) { + error_report("iSCSI misaligned request: " + "iscsilun->block_size %u, offset %" PRIi64 + ", count %d", + iscsilun->block_size, offset, count); + return false; + } + return true; } -static unsigned long *iscsi_allocationmap_init(IscsiLun *iscsilun) +static bool is_sector_request_lun_aligned(int64_t sector_num, int nb_sectors, + IscsiLun *iscsilun) { - return bitmap_try_new(DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks, - iscsilun), - iscsilun->cluster_sectors)); + assert(nb_sectors <= BDRV_REQUEST_MAX_SECTORS); + return is_byte_request_lun_aligned(sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, + iscsilun); } -static void iscsi_allocationmap_set(IscsiLun *iscsilun, int64_t sector_num, - int nb_sectors) +static void iscsi_allocmap_free(IscsiLun *iscsilun) { - if (iscsilun->allocationmap == NULL) { - return; + g_free(iscsilun->allocmap); + g_free(iscsilun->allocmap_valid); + iscsilun->allocmap = NULL; + iscsilun->allocmap_valid = NULL; +} + + +static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags) +{ + iscsi_allocmap_free(iscsilun); + + iscsilun->allocmap_size = + DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks, iscsilun), + iscsilun->cluster_sectors); + + iscsilun->allocmap = bitmap_try_new(iscsilun->allocmap_size); + if (!iscsilun->allocmap) { + return -ENOMEM; + } + + if (open_flags & BDRV_O_NOCACHE) { + /* in case that cache.direct = on all allocmap entries are + * treated as invalid to force a relookup of the block + * status on every read request */ + return 0; } - bitmap_set(iscsilun->allocationmap, - sector_num / iscsilun->cluster_sectors, - DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors)); + +
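As the new IscsiLun comment explains, allocmap and allocmap_valid together encode three states per cluster. A self-contained illustration of the lookup order, using a single machine word per bitmap and hard-coded values purely for demonstration:

```c
#include <stdio.h>

enum cluster_state { ST_UNKNOWN, ST_ALLOCATED, ST_UNALLOCATED };

static unsigned long allocmap, allocmap_valid;  /* one word is enough here */

static enum cluster_state cluster_state(int i)
{
    if (!(allocmap_valid & (1UL << i))) {
        return ST_UNKNOWN;             /* no (or stale) cached information */
    }
    return (allocmap & (1UL << i)) ? ST_ALLOCATED : ST_UNALLOCATED;
}

int main(void)
{
    allocmap_valid = 0x6;              /* clusters 1 and 2 have known status */
    allocmap       = 0x2;              /* cluster 1 is allocated */

    for (int i = 0; i < 3; i++) {
        printf("cluster %d -> %d\n", i, cluster_state(i));
    }
    /* prints 0 (unknown), 1 (allocated), 2 (unallocated) */
    return 0;
}
```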
iscsilun->allocmap_valid = bitmap_try_new(iscsilun->allocmap_size); + if (!iscsilun->allocmap_valid) { + /* if we are under memory pressure free the allocmap as well */ + iscsi_allocmap_free(iscsilun); + return -ENOMEM; + } + + return 0; } -static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num, - int nb_sectors) +static void +iscsi_allocmap_update(IscsiLun *iscsilun, int64_t sector_num, + int nb_sectors, bool allocated, bool valid) { - int64_t cluster_num, nb_clusters; - if (iscsilun->allocationmap == NULL) { + int64_t cl_num_expanded, nb_cls_expanded, cl_num_shrunk, nb_cls_shrunk; + + if (iscsilun->allocmap == NULL) { return; } - cluster_num = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors); - nb_clusters = (sector_num + nb_sectors) / iscsilun->cluster_sectors - - cluster_num; - if (nb_clusters > 0) { - bitmap_clear(iscsilun->allocationmap, cluster_num, nb_clusters); + /* expand to entirely contain all affected clusters */ + cl_num_expanded = sector_num / iscsilun->cluster_sectors; + nb_cls_expanded = DIV_ROUND_UP(sector_num + nb_sectors, + iscsilun->cluster_sectors) - cl_num_expanded; + /* shrink to touch only completely contained clusters */ + cl_num_shrunk = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors); + nb_cls_shrunk = (sector_num + nb_sectors) / iscsilun->cluster_sectors + - cl_num_shrunk; + if (allocated) { + bitmap_set(iscsilun->allocmap, cl_num_expanded, nb_cls_expanded); + } else { + bitmap_clear(iscsilun->allocmap, cl_num_shrunk, nb_cls_shrunk); + } + + if (iscsilun->allocmap_valid == NULL) { + return; + } + if (valid) { + bitmap_set(iscsilun->allocmap_valid, cl_num_shrunk, nb_cls_shrunk); + } else { + bitmap_clear(iscsilun->allocmap_valid, cl_num_expanded, + nb_cls_expanded); + } +} + +static void +iscsi_allocmap_set_allocated(IscsiLun *iscsilun, int64_t sector_num, + int nb_sectors) +{ + iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, true, true); +} + +static void +iscsi_allocmap_set_unallocated(IscsiLun *iscsilun, int64_t sector_num, + int nb_sectors) +{ + /* Note: if cache.direct=on the fifth argument to iscsi_allocmap_update + * is ignored, so this will in effect be an iscsi_allocmap_set_invalid. 
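iscsi_allocmap_update above rounds in the conservative direction: state that can only cause extra work (allocated, invalid) is applied to every cluster the range touches, while state that lets QEMU skip work (unallocated, valid) is applied only to clusters the range covers completely. The two roundings, with illustrative numbers:

```c
#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    /* illustrative: 8 sectors per cluster, request covers sectors [10, 26) */
    int64_t cluster_sectors = 8, sector_num = 10, nb_sectors = 16;

    /* expanded: every cluster the request touches, even partially */
    int64_t exp_first = sector_num / cluster_sectors;
    int64_t exp_count = DIV_ROUND_UP(sector_num + nb_sectors,
                                     cluster_sectors) - exp_first;

    /* shrunk: only clusters completely contained in the request */
    int64_t shr_first = DIV_ROUND_UP(sector_num, cluster_sectors);
    int64_t shr_count = (sector_num + nb_sectors) / cluster_sectors
                        - shr_first;

    printf("expanded: [%lld, +%lld)  shrunk: [%lld, +%lld)\n",
           (long long)exp_first, (long long)exp_count,
           (long long)shr_first, (long long)shr_count);
    /* expanded: clusters [1, +3)  shrunk: cluster [2, +1) */
    return 0;
}
```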
+ */ + iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, false, true); +} + +static void iscsi_allocmap_set_invalid(IscsiLun *iscsilun, int64_t sector_num, + int nb_sectors) +{ + iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, false, false); +} + +static void iscsi_allocmap_invalidate(IscsiLun *iscsilun) +{ + if (iscsilun->allocmap) { + bitmap_zero(iscsilun->allocmap, iscsilun->allocmap_size); + } + if (iscsilun->allocmap_valid) { + bitmap_zero(iscsilun->allocmap_valid, iscsilun->allocmap_size); } } +static inline bool +iscsi_allocmap_is_allocated(IscsiLun *iscsilun, int64_t sector_num, + int nb_sectors) +{ + unsigned long size; + if (iscsilun->allocmap == NULL) { + return true; + } + size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors); + return !(find_next_bit(iscsilun->allocmap, size, + sector_num / iscsilun->cluster_sectors) == size); +} + +static inline bool iscsi_allocmap_is_valid(IscsiLun *iscsilun, + int64_t sector_num, int nb_sectors) +{ + unsigned long size; + if (iscsilun->allocmap_valid == NULL) { + return false; + } + size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors); + return (find_next_zero_bit(iscsilun->allocmap_valid, size, + sector_num / iscsilun->cluster_sectors) == size); +} + static int coroutine_fn iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov, int flags) @@ -456,23 +577,23 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, struct IscsiTask iTask; uint64_t lba; uint32_t num_sectors; - bool fua; + bool fua = flags & BDRV_REQ_FUA; - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + if (fua) { + assert(iscsilun->dpofua); + } + if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { return -EINVAL; } - if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) { - error_report("iSCSI Error: Write of %d sectors exceeds max_xfer_len " - "of %d sectors", nb_sectors, bs->bl.max_transfer_length); - return -EINVAL; + if (bs->bl.max_transfer) { + assert(nb_sectors << BDRV_SECTOR_BITS <= bs->bl.max_transfer); } lba = sector_qemu2lun(sector_num, iscsilun); num_sectors = sector_qemu2lun(nb_sectors, iscsilun); iscsi_co_init_iscsitask(iscsilun, &iTask); retry: - fua = iscsilun->dpofua && (flags & BDRV_REQ_FUA); if (iscsilun->use_16_for_rw) { iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba, NULL, num_sectors * iscsilun->block_size, @@ -505,34 +626,17 @@ retry: } if (iTask.status != SCSI_STATUS_GOOD) { + iscsi_allocmap_set_invalid(iscsilun, sector_num, nb_sectors); return iTask.err_code; } - iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); + iscsi_allocmap_set_allocated(iscsilun, sector_num, nb_sectors); return 0; } -static int coroutine_fn -iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return iscsi_co_writev_flags(bs, sector_num, nb_sectors, iov, 0); -} -static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun, - int64_t sector_num, int nb_sectors) -{ - unsigned long size; - if (iscsilun->allocationmap == NULL) { - return true; - } - size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors); - return !(find_next_bit(iscsilun->allocationmap, size, - sector_num / iscsilun->cluster_sectors) == size); -} - static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum, @@ -546,7 +650,7 @@ static int64_t coroutine_fn 
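iscsi_allocmap_is_allocated above asks "is any cluster in this range allocated?" by checking whether find_next_bit finds a set bit before the end of the range; iscsi_allocmap_is_valid does the mirror-image test with find_next_zero_bit. A toy single-word version of the emptiness test (the real code uses QEMU's multi-word bitmap helpers):

```c
#include <stdbool.h>
#include <stdio.h>

/* toy find_next_bit over one word; returns size when no bit is found,
 * mimicking the convention of the real helper */
static unsigned find_next_bit(unsigned long map, unsigned size, unsigned start)
{
    for (unsigned i = start; i < size; i++) {
        if (map & (1UL << i)) {
            return i;
        }
    }
    return size;
}

/* the range counts as "allocated" if any cluster inside it has its bit set */
static bool any_allocated(unsigned long map, unsigned first, unsigned last)
{
    return find_next_bit(map, last + 1, first) != last + 1;
}

int main(void)
{
    unsigned long map = 0x10;                  /* only cluster 4 allocated */

    printf("%d %d\n",
           any_allocated(map, 0, 3),           /* 0: clusters 0-3 clear */
           any_allocated(map, 2, 6));          /* 1: cluster 4 is set */
    return 0;
}
```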
iscsi_co_get_block_status(BlockDriverState *bs, iscsi_co_init_iscsitask(iscsilun, &iTask); - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { ret = -EINVAL; goto out; } @@ -616,9 +720,9 @@ retry: } if (ret & BDRV_BLOCK_ZERO) { - iscsi_allocationmap_clear(iscsilun, sector_num, *pnum); + iscsi_allocmap_set_unallocated(iscsilun, sector_num, *pnum); } else { - iscsi_allocationmap_set(iscsilun, sector_num, *pnum); + iscsi_allocmap_set_allocated(iscsilun, sector_num, *pnum); } if (*pnum > nb_sectors) { @@ -643,26 +747,40 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, uint64_t lba; uint32_t num_sectors; - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { return -EINVAL; } - if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) { - error_report("iSCSI Error: Read of %d sectors exceeds max_xfer_len " - "of %d sectors", nb_sectors, bs->bl.max_transfer_length); - return -EINVAL; + if (bs->bl.max_transfer) { + assert(nb_sectors << BDRV_SECTOR_BITS <= bs->bl.max_transfer); } - if (iscsilun->lbprz && nb_sectors >= ISCSI_CHECKALLOC_THRES && - !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) { - int64_t ret; + /* if cache.direct is off and we have a valid entry in our allocation map + * we can skip checking the block status and directly return zeroes if + * the request falls within an unallocated area */ + if (iscsi_allocmap_is_valid(iscsilun, sector_num, nb_sectors) && + !iscsi_allocmap_is_allocated(iscsilun, sector_num, nb_sectors)) { + qemu_iovec_memset(iov, 0, 0x00, iov->size); + return 0; + } + + if (nb_sectors >= ISCSI_CHECKALLOC_THRES && + !iscsi_allocmap_is_valid(iscsilun, sector_num, nb_sectors) && + !iscsi_allocmap_is_allocated(iscsilun, sector_num, nb_sectors)) { int pnum; BlockDriverState *file; - ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum, &file); + /* check the block status from the beginning of the cluster + * containing the start sector */ + int64_t ret = iscsi_co_get_block_status(bs, + sector_num - sector_num % iscsilun->cluster_sectors, + BDRV_REQUEST_MAX_SECTORS, &pnum, &file); if (ret < 0) { return ret; } - if (ret & BDRV_BLOCK_ZERO && pnum >= nb_sectors) { + /* if the whole request falls into an unallocated area we can avoid + * to read and directly return zeroes instead */ + if (ret & BDRV_BLOCK_ZERO && + pnum >= nb_sectors + sector_num % iscsilun->cluster_sectors) { qemu_iovec_memset(iov, 0, 0x00, iov->size); return 0; } @@ -766,6 +884,7 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status, acb->ioh->driver_status = 0; acb->ioh->host_status = 0; acb->ioh->resid = 0; + acb->ioh->status = status; #define SG_ERR_DRIVER_SENSE 0x08 @@ -837,6 +956,13 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, return &acb->common; } + if (acb->ioh->cmd_len > SCSI_CDB_MAX_SIZE) { + error_report("iSCSI: ioctl error CDB exceeds max size (%d > %d)", + acb->ioh->cmd_len, SCSI_CDB_MAX_SIZE); + qemu_aio_unref(acb); + return NULL; + } + acb->task = malloc(sizeof(struct scsi_task)); if (acb->task == NULL) { error_report("iSCSI: Failed to allocate task for scsi command. 
%s", @@ -916,29 +1042,26 @@ iscsi_getlength(BlockDriverState *bs) } static int -coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) +coroutine_fn iscsi_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) { IscsiLun *iscsilun = bs->opaque; struct IscsiTask iTask; struct unmap_list list; - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - return -EINVAL; - } + assert(is_byte_request_lun_aligned(offset, count, iscsilun)); if (!iscsilun->lbp.lbpu) { /* UNMAP is not supported by the target */ return 0; } - list.lba = sector_qemu2lun(sector_num, iscsilun); - list.num = sector_qemu2lun(nb_sectors, iscsilun); + list.lba = offset / iscsilun->block_size; + list.num = count / iscsilun->block_size; iscsi_co_init_iscsitask(iscsilun, &iTask); retry: if (iscsi_unmap_task(iscsilun->iscsi, iscsilun->lun, 0, 0, &list, 1, - iscsi_co_generic_cb, &iTask) == NULL) { + iscsi_co_generic_cb, &iTask) == NULL) { return -ENOMEM; } @@ -968,14 +1091,15 @@ retry: return iTask.err_code; } - iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors); + iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS, + count >> BDRV_SECTOR_BITS); return 0; } static int -coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) +coroutine_fn iscsi_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, + int count, BdrvRequestFlags flags) { IscsiLun *iscsilun = bs->opaque; struct IscsiTask iTask; @@ -983,8 +1107,8 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, uint32_t nb_blocks; bool use_16_for_ws = iscsilun->use_16_for_rw; - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - return -EINVAL; + if (!is_byte_request_lun_aligned(offset, count, iscsilun)) { + return -ENOTSUP; } if (flags & BDRV_REQ_MAY_UNMAP) { @@ -1005,8 +1129,8 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, return -ENOTSUP; } - lba = sector_qemu2lun(sector_num, iscsilun); - nb_blocks = sector_qemu2lun(nb_sectors, iscsilun); + lba = offset / iscsilun->block_size; + nb_blocks = count / iscsilun->block_size; if (iscsilun->zeroblock == NULL) { iscsilun->zeroblock = g_try_malloc0(iscsilun->block_size); @@ -1058,13 +1182,17 @@ retry: } if (iTask.status != SCSI_STATUS_GOOD) { + iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS, + count >> BDRV_SECTOR_BITS); return iTask.err_code; } if (flags & BDRV_REQ_MAY_UNMAP) { - iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors); + iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS, + count >> BDRV_SECTOR_BITS); } else { - iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); + iscsi_allocmap_set_allocated(iscsilun, offset >> BDRV_SECTOR_BITS, + count >> BDRV_SECTOR_BITS); } return 0; @@ -1555,6 +1683,10 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, task = NULL; iscsi_modesense_sync(iscsilun); + if (iscsilun->dpofua) { + bs->supported_write_flags = BDRV_REQ_FUA; + } + bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP; /* Check the write protect flag of the LUN if we want to write */ if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) && @@ -1571,14 +1703,13 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, goto out; } bs->total_sectors = sector_lun2qemu(iscsilun->num_blocks, iscsilun); - bs->request_alignment = iscsilun->block_size; /* We don't have any emulation for devices other than disks and CD-ROMs, so * this must be sg ioctl 
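Because iscsi_co_pdiscard above now receives byte offsets, building the UNMAP descriptor reduces to two divisions by the LUN block size, and alignment is asserted rather than re-checked (the block layer already splits unaligned requests). A trivial sketch; the struct only mirrors the shape of libiscsi's unmap_list, and the sample values are arbitrary:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct unmap_entry { uint64_t lba; uint32_t num; };

static struct unmap_entry byte_range_to_unmap(int64_t offset, int count,
                                              int block_size)
{
    struct unmap_entry e;

    /* caller guarantees alignment, as the driver's assert does */
    assert(offset % block_size == 0 && count % block_size == 0);
    e.lba = offset / block_size;
    e.num = count / block_size;
    return e;
}

int main(void)
{
    struct unmap_entry e = byte_range_to_unmap(1048576, 524288, 512);

    printf("lba=%llu num=%u\n", (unsigned long long)e.lba, e.num);
    /* lba=2048 num=1024 */
    return 0;
}
```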
compatible. We force it to be sg, otherwise qemu * will try to read from the device to guess the image format. */ if (iscsilun->type != TYPE_DISK && iscsilun->type != TYPE_ROM) { - bs->sg = 1; + bs->sg = true; } task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, @@ -1634,10 +1765,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran * iscsilun->block_size) >> BDRV_SECTOR_BITS; if (iscsilun->lbprz) { - iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun); - if (iscsilun->allocationmap == NULL) { - ret = -ENOMEM; - } + ret = iscsi_allocmap_init(iscsilun, bs->open_flags); } } @@ -1674,48 +1802,54 @@ static void iscsi_close(BlockDriverState *bs) } iscsi_destroy_context(iscsi); g_free(iscsilun->zeroblock); - g_free(iscsilun->allocationmap); + iscsi_allocmap_free(iscsilun); memset(iscsilun, 0, sizeof(IscsiLun)); } -static int sector_limits_lun2qemu(int64_t sector, IscsiLun *iscsilun) -{ - return MIN(sector_lun2qemu(sector, iscsilun), INT_MAX / 2 + 1); -} - static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp) { /* We don't actually refresh here, but just return data queried in * iscsi_open(): iscsi targets don't change their limits. */ IscsiLun *iscsilun = bs->opaque; - uint32_t max_xfer_len = iscsilun->use_16_for_rw ? 0xffffffff : 0xffff; + uint64_t max_xfer_len = iscsilun->use_16_for_rw ? 0xffffffff : 0xffff; + + bs->bl.request_alignment = iscsilun->block_size; if (iscsilun->bl.max_xfer_len) { max_xfer_len = MIN(max_xfer_len, iscsilun->bl.max_xfer_len); } - bs->bl.max_transfer_length = sector_limits_lun2qemu(max_xfer_len, iscsilun); + if (max_xfer_len * iscsilun->block_size < INT_MAX) { + bs->bl.max_transfer = max_xfer_len * iscsilun->block_size; + } if (iscsilun->lbp.lbpu) { - if (iscsilun->bl.max_unmap < 0xffffffff) { - bs->bl.max_discard = - sector_limits_lun2qemu(iscsilun->bl.max_unmap, iscsilun); + if (iscsilun->bl.max_unmap < 0xffffffff / iscsilun->block_size) { + bs->bl.max_pdiscard = + iscsilun->bl.max_unmap * iscsilun->block_size; } - bs->bl.discard_alignment = - sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun); + bs->bl.pdiscard_alignment = + iscsilun->bl.opt_unmap_gran * iscsilun->block_size; + } else { + bs->bl.pdiscard_alignment = iscsilun->block_size; } - if (iscsilun->bl.max_ws_len < 0xffffffff) { - bs->bl.max_write_zeroes = - sector_limits_lun2qemu(iscsilun->bl.max_ws_len, iscsilun); + if (iscsilun->bl.max_ws_len < 0xffffffff / iscsilun->block_size) { + bs->bl.max_pwrite_zeroes = + iscsilun->bl.max_ws_len * iscsilun->block_size; } if (iscsilun->lbp.lbpws) { - bs->bl.write_zeroes_alignment = - sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun); + bs->bl.pwrite_zeroes_alignment = + iscsilun->bl.opt_unmap_gran * iscsilun->block_size; + } else { + bs->bl.pwrite_zeroes_alignment = iscsilun->block_size; + } + if (iscsilun->bl.opt_xfer_len && + iscsilun->bl.opt_xfer_len < INT_MAX / iscsilun->block_size) { + bs->bl.opt_transfer = pow2floor(iscsilun->bl.opt_xfer_len * + iscsilun->block_size); } - bs->bl.opt_transfer_length = - sector_limits_lun2qemu(iscsilun->bl.opt_xfer_len, iscsilun); } /* Note that this will not re-establish a connection with an iSCSI target - it @@ -1732,6 +1866,16 @@ static int iscsi_reopen_prepare(BDRVReopenState *state, return 0; } +static void iscsi_reopen_commit(BDRVReopenState *reopen_state) +{ + IscsiLun *iscsilun = reopen_state->bs->opaque; + + /* the cache.direct status might have changed */ + if (iscsilun->allocmap != NULL) 
{ + iscsi_allocmap_init(iscsilun, reopen_state->flags); + } +} + static int iscsi_truncate(BlockDriverState *bs, int64_t offset) { IscsiLun *iscsilun = bs->opaque; @@ -1751,9 +1895,8 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset) return -EINVAL; } - if (iscsilun->allocationmap != NULL) { - g_free(iscsilun->allocationmap); - iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun); + if (iscsilun->allocmap != NULL) { + iscsi_allocmap_init(iscsilun, bs->open_flags); } return 0; @@ -1813,6 +1956,13 @@ static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) return 0; } +static void iscsi_invalidate_cache(BlockDriverState *bs, + Error **errp) +{ + IscsiLun *iscsilun = bs->opaque; + iscsi_allocmap_invalidate(iscsilun); +} + static QemuOptsList iscsi_create_opts = { .name = "iscsi-create-opts", .head = QTAILQ_HEAD_INITIALIZER(iscsi_create_opts.head), @@ -1836,7 +1986,9 @@ static BlockDriver bdrv_iscsi = { .bdrv_close = iscsi_close, .bdrv_create = iscsi_create, .create_opts = &iscsi_create_opts, - .bdrv_reopen_prepare = iscsi_reopen_prepare, + .bdrv_reopen_prepare = iscsi_reopen_prepare, + .bdrv_reopen_commit = iscsi_reopen_commit, + .bdrv_invalidate_cache = iscsi_invalidate_cache, .bdrv_getlength = iscsi_getlength, .bdrv_get_info = iscsi_get_info, @@ -1844,12 +1996,10 @@ static BlockDriver bdrv_iscsi = { .bdrv_refresh_limits = iscsi_refresh_limits, .bdrv_co_get_block_status = iscsi_co_get_block_status, - .bdrv_co_discard = iscsi_co_discard, - .bdrv_co_write_zeroes = iscsi_co_write_zeroes, + .bdrv_co_pdiscard = iscsi_co_pdiscard, + .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes, .bdrv_co_readv = iscsi_co_readv, - .bdrv_co_writev = iscsi_co_writev, .bdrv_co_writev_flags = iscsi_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, .bdrv_co_flush_to_disk = iscsi_co_flush, #ifdef __linux__ diff --git a/block/linux-aio.c b/block/linux-aio.c index 805757e02..e906abebb 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -11,8 +11,10 @@ #include "qemu-common.h" #include "block/aio.h" #include "qemu/queue.h" +#include "block/block.h" #include "block/raw-aio.h" #include "qemu/event_notifier.h" +#include "qemu/coroutine.h" #include <libaio.h> @@ -26,11 +28,10 @@ */ #define MAX_EVENTS 128 -#define MAX_QUEUED_IO 128 - struct qemu_laiocb { BlockAIOCB common; - struct qemu_laio_state *ctx; + Coroutine *co; + LinuxAioState *ctx; struct iocb iocb; ssize_t ret; size_t nbytes; @@ -41,12 +42,15 @@ struct qemu_laiocb { typedef struct { int plugged; - unsigned int n; + unsigned int in_queue; + unsigned int in_flight; bool blocked; QSIMPLEQ_HEAD(, qemu_laiocb) pending; } LaioQueue; -struct qemu_laio_state { +struct LinuxAioState { + AioContext *aio_context; + io_context_t ctx; EventNotifier e; @@ -60,7 +64,7 @@ struct qemu_laio_state { int event_max; }; -static void ioq_submit(struct qemu_laio_state *s); +static void ioq_submit(LinuxAioState *s); static inline ssize_t io_event_ret(struct io_event *ev) { @@ -70,8 +74,7 @@ static inline ssize_t io_event_ret(struct io_event *ev) /* * Completes an AIO request (calls the callback and frees the ACB). 
*/ -static void qemu_laio_process_completion(struct qemu_laio_state *s, - struct qemu_laiocb *laiocb) +static void qemu_laio_process_completion(struct qemu_laiocb *laiocb) { int ret; @@ -85,13 +88,18 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s, qemu_iovec_memset(laiocb->qiov, ret, 0, laiocb->qiov->size - ret); } else { - ret = -EINVAL; + ret = -ENOSPC; } } } - laiocb->common.cb(laiocb->common.opaque, ret); - qemu_aio_unref(laiocb); + laiocb->ret = ret; + if (laiocb->co) { + qemu_coroutine_enter(laiocb->co); + } else { + laiocb->common.cb(laiocb->common.opaque, ret); + qemu_aio_unref(laiocb); + } } /* The completion BH fetches completed I/O requests and invokes their @@ -99,7 +107,7 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s, * * The function is somewhat tricky because it supports nested event loops, for * example when a request callback invokes aio_poll(). In order to do this, - * the completion events array and index are kept in qemu_laio_state. The BH + * the completion events array and index are kept in LinuxAioState. The BH * reschedules itself as long as there are completions pending so it will * either be called again in a nested event loop or will be called after all * events have been completed. When there are no events left to complete, the @@ -107,7 +115,7 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s, */ static void qemu_laio_completion_bh(void *opaque) { - struct qemu_laio_state *s = opaque; + LinuxAioState *s = opaque; /* Fetch more completion events when empty */ if (s->event_idx == s->event_max) { @@ -122,6 +130,7 @@ static void qemu_laio_completion_bh(void *opaque) s->event_max = 0; return; /* no more events */ } + s->io_q.in_flight -= s->event_max; } /* Reschedule so nested event loops see currently pending completions */ @@ -136,20 +145,22 @@ static void qemu_laio_completion_bh(void *opaque) laiocb->ret = io_event_ret(&s->events[s->event_idx]); s->event_idx++; - qemu_laio_process_completion(s, laiocb); + qemu_laio_process_completion(laiocb); } if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { ioq_submit(s); } + + qemu_bh_cancel(s->completion_bh); } static void qemu_laio_completion_cb(EventNotifier *e) { - struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e); + LinuxAioState *s = container_of(e, LinuxAioState, e); if (event_notifier_test_and_clear(&s->e)) { - qemu_bh_schedule(s->completion_bh); + qemu_laio_completion_bh(s); } } @@ -181,22 +192,26 @@ static void ioq_init(LaioQueue *io_q) { QSIMPLEQ_INIT(&io_q->pending); io_q->plugged = 0; - io_q->n = 0; + io_q->in_queue = 0; + io_q->in_flight = 0; io_q->blocked = false; } -static void ioq_submit(struct qemu_laio_state *s) +static void ioq_submit(LinuxAioState *s) { int ret, len; struct qemu_laiocb *aiocb; - struct iocb *iocbs[MAX_QUEUED_IO]; + struct iocb *iocbs[MAX_EVENTS]; QSIMPLEQ_HEAD(, qemu_laiocb) completed; do { + if (s->io_q.in_flight >= MAX_EVENTS) { + break; + } len = 0; QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) { iocbs[len++] = &aiocb->iocb; - if (len == MAX_QUEUED_IO) { + if (s->io_q.in_flight + len >= MAX_EVENTS) { break; } } @@ -206,55 +221,43 @@ static void ioq_submit(struct qemu_laio_state *s) break; } if (ret < 0) { - abort(); + /* Fail the first request, retry the rest */ + aiocb = QSIMPLEQ_FIRST(&s->io_q.pending); + QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next); + s->io_q.in_queue--; + aiocb->ret = ret; + qemu_laio_process_completion(aiocb); + continue; } - s->io_q.n -= ret; + s->io_q.in_flight 
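The reworked ioq_submit above never lets in-flight requests exceed MAX_EVENTS, the size of the kernel AIO context set up in this file, and it now tracks queued versus submitted requests separately as in_queue and in_flight. A sketch of just that batching arithmetic:

```c
#include <stdio.h>

#define MAX_EVENTS 128   /* ring size used for the AIO context in the patch */

/* how many queued requests may be submitted right now? */
static int batch_len(int in_flight, int in_queue)
{
    int room = MAX_EVENTS - in_flight;

    if (room <= 0) {
        return 0;                         /* ring full: stay blocked */
    }
    return in_queue < room ? in_queue : room;
}

int main(void)
{
    printf("%d\n", batch_len(100, 50));   /* 28: capped by the ring */
    printf("%d\n", batch_len(0, 5));      /* 5: everything fits */
    return 0;
}
```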
+= ret; + s->io_q.in_queue -= ret; aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb); QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed); } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending)); - s->io_q.blocked = (s->io_q.n > 0); + s->io_q.blocked = (s->io_q.in_queue > 0); } -void laio_io_plug(BlockDriverState *bs, void *aio_ctx) +void laio_io_plug(BlockDriverState *bs, LinuxAioState *s) { - struct qemu_laio_state *s = aio_ctx; - s->io_q.plugged++; } -void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug) +void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s) { - struct qemu_laio_state *s = aio_ctx; - - assert(s->io_q.plugged > 0 || !unplug); - - if (unplug && --s->io_q.plugged > 0) { - return; - } - - if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { + assert(s->io_q.plugged); + if (--s->io_q.plugged == 0 && + !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { ioq_submit(s); } } -BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque, int type) +static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, + int type) { - struct qemu_laio_state *s = aio_ctx; - struct qemu_laiocb *laiocb; - struct iocb *iocbs; - off_t offset = sector_num * 512; - - laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque); - laiocb->nbytes = nb_sectors * 512; - laiocb->ctx = s; - laiocb->ret = -EINPROGRESS; - laiocb->is_read = (type == QEMU_AIO_READ); - laiocb->qiov = qiov; - - iocbs = &laiocb->iocb; + LinuxAioState *s = laiocb->ctx; + struct iocb *iocbs = &laiocb->iocb; + QEMUIOVector *qiov = laiocb->qiov; switch (type) { case QEMU_AIO_WRITE: @@ -267,43 +270,83 @@ BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, default: fprintf(stderr, "%s: invalid AIO request type 0x%x.\n", __func__, type); - goto out_free_aiocb; + return -EIO; } io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e)); QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next); - s->io_q.n++; + s->io_q.in_queue++; if (!s->io_q.blocked && - (!s->io_q.plugged || s->io_q.n >= MAX_QUEUED_IO)) { + (!s->io_q.plugged || + s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) { ioq_submit(s); } - return &laiocb->common; -out_free_aiocb: - qemu_aio_unref(laiocb); - return NULL; + return 0; } -void laio_detach_aio_context(void *s_, AioContext *old_context) +int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd, + uint64_t offset, QEMUIOVector *qiov, int type) { - struct qemu_laio_state *s = s_; + int ret; + struct qemu_laiocb laiocb = { + .co = qemu_coroutine_self(), + .nbytes = qiov->size, + .ctx = s, + .is_read = (type == QEMU_AIO_READ), + .qiov = qiov, + }; + + ret = laio_do_submit(fd, &laiocb, offset, type); + if (ret < 0) { + return ret; + } + + qemu_coroutine_yield(); + return laiocb.ret; +} + +BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque, int type) +{ + struct qemu_laiocb *laiocb; + off_t offset = sector_num * BDRV_SECTOR_SIZE; + int ret; + + laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque); + laiocb->nbytes = nb_sectors * BDRV_SECTOR_SIZE; + laiocb->ctx = s; + laiocb->ret = -EINPROGRESS; + laiocb->is_read = (type == QEMU_AIO_READ); + laiocb->qiov = qiov; + + ret = laio_do_submit(fd, laiocb, offset, type); + if (ret < 0) { + qemu_aio_unref(laiocb); + return NULL; + } + return 
&laiocb->common; +} + +void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context) +{ aio_set_event_notifier(old_context, &s->e, false, NULL); qemu_bh_delete(s->completion_bh); } -void laio_attach_aio_context(void *s_, AioContext *new_context) +void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context) { - struct qemu_laio_state *s = s_; - + s->aio_context = new_context; s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s); aio_set_event_notifier(new_context, &s->e, false, qemu_laio_completion_cb); } -void *laio_init(void) +LinuxAioState *laio_init(void) { - struct qemu_laio_state *s; + LinuxAioState *s; s = g_malloc0(sizeof(*s)); if (event_notifier_init(&s->e, false) < 0) { @@ -325,10 +368,8 @@ out_free_state: return NULL; } -void laio_cleanup(void *s_) +void laio_cleanup(LinuxAioState *s) { - struct qemu_laio_state *s = s_; - event_notifier_cleanup(&s->e); if (io_destroy(s->ctx) != 0) { diff --git a/block/mirror.c b/block/mirror.c index 039f48125..e0b3f4180 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -20,11 +20,12 @@ #include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" #include "qemu/bitmap.h" -#include "qemu/error-report.h" #define SLICE_TIME 100000000ULL /* ns */ #define MAX_IN_FLIGHT 16 -#define DEFAULT_MIRROR_BUF_SIZE (10 << 20) +#define MAX_IO_SECTORS ((1 << 20) >> BDRV_SECTOR_BITS) /* 1 Mb */ +#define DEFAULT_MIRROR_BUF_SIZE \ + (MAX_IN_FLIGHT * MAX_IO_SECTORS * BDRV_SECTOR_SIZE) /* The mirroring buffer is a list of granularity-sized chunks. * Free chunks are organized in a list. @@ -36,7 +37,7 @@ typedef struct MirrorBuffer { typedef struct MirrorBlockJob { BlockJob common; RateLimit limit; - BlockDriverState *target; + BlockBackend *target; BlockDriverState *base; /* The name of the graph node to replace */ char *replaces; @@ -45,6 +46,7 @@ typedef struct MirrorBlockJob { /* Used to block operations on the drive-mirror-replace target */ Error *replace_blocker; bool is_none_mode; + BlockMirrorBackingMode backing_mode; BlockdevOnError on_source_error, on_target_error; bool synced; bool should_complete; @@ -58,9 +60,10 @@ typedef struct MirrorBlockJob { QSIMPLEQ_HEAD(, MirrorBuffer) buf_free; int buf_free_count; + uint64_t last_pause_ns; unsigned long *in_flight_bitmap; int in_flight; - int sectors_in_flight; + int64_t sectors_in_flight; int ret; bool unmap; bool waiting_for_io; @@ -80,11 +83,11 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read, { s->synced = false; if (read) { - return block_job_error_action(&s->common, s->common.bs, - s->on_source_error, true, error); + return block_job_error_action(&s->common, s->on_source_error, + true, error); } else { - return block_job_error_action(&s->common, s->target, - s->on_target_error, false, error); + return block_job_error_action(&s->common, s->on_target_error, + false, error); } } @@ -121,7 +124,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret) g_free(op); if (s->waiting_for_io) { - qemu_coroutine_enter(s->common.co, NULL); + qemu_coroutine_enter(s->common.co); } } @@ -157,8 +160,8 @@ static void mirror_read_complete(void *opaque, int ret) mirror_iteration_done(op, ret); return; } - bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors, - mirror_write_complete, op); + blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov, + 0, mirror_write_complete, op); } static inline void mirror_clip_sectors(MirrorBlockJob *s, @@ -186,8 +189,9 @@ static int mirror_cow_align(MirrorBlockJob *s, need_cow |= 
!test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors, s->cow_bitmap); if (need_cow) { - bdrv_round_to_clusters(s->target, *sector_num, *nb_sectors, - &align_sector_num, &align_nb_sectors); + bdrv_round_sectors_to_clusters(blk_bs(s->target), *sector_num, + *nb_sectors, &align_sector_num, + &align_nb_sectors); } if (align_nb_sectors > max_sectors) { @@ -217,23 +221,29 @@ static inline void mirror_wait_for_io(MirrorBlockJob *s) } /* Submit async read while handling COW. - * Returns: nb_sectors if no alignment is necessary, or + * Returns: The number of sectors copied after and including sector_num, + * excluding any sectors copied prior to sector_num due to alignment. + * This will be nb_sectors if no alignment is necessary, or * (new_end - sector_num) if tail is rounded up or down due to * alignment or buffer limit. */ static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num, int nb_sectors) { - BlockDriverState *source = s->common.bs; + BlockBackend *source = s->common.blk; int sectors_per_chunk, nb_chunks; - int ret = nb_sectors; + int ret; MirrorOp *op; + int max_sectors; sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; + max_sectors = sectors_per_chunk * s->max_iov; /* We can only handle as much as buf_size at a time. */ nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors); + nb_sectors = MIN(max_sectors, nb_sectors); assert(nb_sectors); + ret = nb_sectors; if (s->cow_bitmap) { ret += mirror_cow_align(s, &sector_num, &nb_sectors); @@ -274,7 +284,7 @@ static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num, s->sectors_in_flight += nb_sectors; trace_mirror_one_iteration(s, sector_num, nb_sectors); - bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, + blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov, 0, mirror_read_complete, op); return ret; } @@ -296,10 +306,12 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s, s->in_flight++; s->sectors_in_flight += nb_sectors; if (is_discard) { - bdrv_aio_discard(s->target, sector_num, op->nb_sectors, + blk_aio_pdiscard(s->target, sector_num << BDRV_SECTOR_BITS, + op->nb_sectors << BDRV_SECTOR_BITS, mirror_write_complete, op); } else { - bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors, + blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE, + op->nb_sectors * BDRV_SECTOR_SIZE, s->unmap ? BDRV_REQ_MAY_UNMAP : 0, mirror_write_complete, op); } @@ -307,13 +319,16 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s, static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) { - BlockDriverState *source = s->common.bs; + BlockDriverState *source = blk_bs(s->common.blk); int64_t sector_num, first_chunk; uint64_t delay_ns = 0; /* At least the first dirty chunk is mirrored in one iteration.
*/ int nb_chunks = 1; int64_t end = s->bdev_length / BDRV_SECTOR_SIZE; int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; + bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target)); + int max_io_sectors = MAX((s->buf_size >> BDRV_SECTOR_BITS) / MAX_IN_FLIGHT, + MAX_IO_SECTORS); sector_num = hbitmap_iter_next(&s->hbi); if (sector_num < 0) { @@ -325,10 +340,12 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) first_chunk = sector_num / sectors_per_chunk; while (test_bit(first_chunk, s->in_flight_bitmap)) { - trace_mirror_yield_in_flight(s, first_chunk, s->in_flight); + trace_mirror_yield_in_flight(s, sector_num, s->in_flight); mirror_wait_for_io(s); } + block_job_pause_point(&s->common); + /* Find the number of consecutive dirty chunks following the first dirty * one, and wait for in flight requests in them. */ while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) { @@ -362,7 +379,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks); while (nb_chunks > 0 && sector_num < end) { int ret; - int io_sectors; + int io_sectors, io_sectors_acct; BlockDriverState *file; enum MirrorMethod { MIRROR_METHOD_COPY, @@ -375,7 +392,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) nb_chunks * sectors_per_chunk, &io_sectors, &file); if (ret < 0) { - io_sectors = nb_chunks * sectors_per_chunk; + io_sectors = MIN(nb_chunks * sectors_per_chunk, max_io_sectors); + } else if (ret & BDRV_BLOCK_DATA) { + io_sectors = MIN(io_sectors, max_io_sectors); } io_sectors -= io_sectors % sectors_per_chunk; @@ -384,8 +403,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) { int64_t target_sector_num; int target_nb_sectors; - bdrv_round_to_clusters(s->target, sector_num, io_sectors, - &target_sector_num, &target_nb_sectors); + bdrv_round_sectors_to_clusters(blk_bs(s->target), sector_num, + io_sectors, &target_sector_num, + &target_nb_sectors); if (target_sector_num == sector_num && target_nb_sectors == io_sectors) { mirror_method = ret & BDRV_BLOCK_ZERO ?
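mirror_iteration decides per range whether to copy, write zeroes, or discard, based on the block-status bits returned for the source; only ranges that round exactly to target clusters may skip the plain copy. A compact model of that decision, where the flag values and the aligned parameter are illustrative simplifications:

```c
#include <stdio.h>

#define BLOCK_DATA 0x01   /* illustrative stand-ins for BDRV_BLOCK_* */
#define BLOCK_ZERO 0x02

enum { METHOD_COPY, METHOD_ZERO, METHOD_DISCARD };

/* negative (error) status is treated as "copy" by the caller and is
 * not modelled here */
static int pick_method(int status, int cluster_aligned)
{
    if ((status & BLOCK_DATA) || !cluster_aligned) {
        return METHOD_COPY;
    }
    return (status & BLOCK_ZERO) ? METHOD_ZERO : METHOD_DISCARD;
}

int main(void)
{
    printf("%d %d %d\n",
           pick_method(BLOCK_DATA, 1),    /* 0: data present -> copy */
           pick_method(BLOCK_ZERO, 1),    /* 1: reads as zero -> write zeroes */
           pick_method(0, 1));            /* 2: unallocated -> discard */
    return 0;
}
```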
@@ -394,16 +414,30 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) } } + while (s->in_flight >= MAX_IN_FLIGHT) { + trace_mirror_yield_in_flight(s, sector_num, s->in_flight); + mirror_wait_for_io(s); + } + + if (s->ret < 0) { + return 0; + } + mirror_clip_sectors(s, sector_num, &io_sectors); switch (mirror_method) { case MIRROR_METHOD_COPY: io_sectors = mirror_do_read(s, sector_num, io_sectors); + io_sectors_acct = io_sectors; break; case MIRROR_METHOD_ZERO: - mirror_do_zero_or_discard(s, sector_num, io_sectors, false); - break; case MIRROR_METHOD_DISCARD: - mirror_do_zero_or_discard(s, sector_num, io_sectors, true); + mirror_do_zero_or_discard(s, sector_num, io_sectors, + mirror_method == MIRROR_METHOD_DISCARD); + if (write_zeroes_ok) { + io_sectors_acct = 0; + } else { + io_sectors_acct = io_sectors; + } break; default: abort(); @@ -411,7 +445,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) assert(io_sectors); sector_num += io_sectors; nb_chunks -= DIV_ROUND_UP(io_sectors, sectors_per_chunk); - delay_ns += ratelimit_calculate_delay(&s->limit, io_sectors); + if (s->common.speed) { + delay_ns = ratelimit_calculate_delay(&s->limit, io_sectors_acct); + } } return delay_ns; } @@ -449,7 +485,8 @@ static void mirror_exit(BlockJob *job, void *opaque) MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); MirrorExitData *data = opaque; AioContext *replace_aio_context = NULL; - BlockDriverState *src = s->common.bs; + BlockDriverState *src = blk_bs(s->common.blk); + BlockDriverState *target_bs = blk_bs(s->target); /* Make sure that the source BDS doesn't go away before we called * block_job_completed(). */ @@ -461,26 +498,25 @@ static void mirror_exit(BlockJob *job, void *opaque) } if (s->should_complete && data->ret == 0) { - BlockDriverState *to_replace = s->common.bs; + BlockDriverState *to_replace = src; if (s->to_replace) { to_replace = s->to_replace; } - /* This was checked in mirror_start_job(), but meanwhile one of the - * nodes could have been newly attached to a BlockBackend. */ - if (to_replace->blk && s->target->blk) { - error_report("block job: Can't create node with two BlockBackends"); - data->ret = -EINVAL; - goto out; + if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) { + bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL); } - if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) { - bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL); - } - bdrv_replace_in_backing_chain(to_replace, s->target); - } + /* The mirror job has no requests in flight any more, but we need to + * drain potential other users of the BDS before changing the graph. 
*/ + bdrv_drained_begin(target_bs); + bdrv_replace_in_backing_chain(to_replace, target_bs); + bdrv_drained_end(target_bs); -out: + /* We just changed the BDS the job BB refers to */ + blk_remove_bs(job->blk); + blk_insert_bs(job->blk, src); + } if (s->to_replace) { bdrv_op_unblock_all(s->to_replace, s->replace_blocker); error_free(s->replace_blocker); @@ -490,29 +526,102 @@ out: aio_context_release(replace_aio_context); } g_free(s->replaces); - bdrv_op_unblock_all(s->target, s->common.blocker); - bdrv_unref(s->target); + bdrv_op_unblock_all(target_bs, s->common.blocker); + blk_unref(s->target); block_job_completed(&s->common, data->ret); g_free(data); bdrv_drained_end(src); - if (qemu_get_aio_context() == bdrv_get_aio_context(src)) { - aio_enable_external(iohandler_get_aio_context()); - } bdrv_unref(src); } +static void mirror_throttle(MirrorBlockJob *s) +{ + int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + + if (now - s->last_pause_ns > SLICE_TIME) { + s->last_pause_ns = now; + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0); + } else { + block_job_pause_point(&s->common); + } +} + +static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) +{ + int64_t sector_num, end; + BlockDriverState *base = s->base; + BlockDriverState *bs = blk_bs(s->common.blk); + BlockDriverState *target_bs = blk_bs(s->target); + int ret, n; + + end = s->bdev_length / BDRV_SECTOR_SIZE; + + if (base == NULL && !bdrv_has_zero_init(target_bs)) { + if (!bdrv_can_write_zeroes_with_unmap(target_bs)) { + bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, end); + return 0; + } + + for (sector_num = 0; sector_num < end; ) { + int nb_sectors = MIN(end - sector_num, + QEMU_ALIGN_DOWN(INT_MAX, s->granularity) >> BDRV_SECTOR_BITS); + + mirror_throttle(s); + + if (block_job_is_cancelled(&s->common)) { + return 0; + } + + if (s->in_flight >= MAX_IN_FLIGHT) { + trace_mirror_yield(s, s->in_flight, s->buf_free_count, -1); + mirror_wait_for_io(s); + continue; + } + + mirror_do_zero_or_discard(s, sector_num, nb_sectors, false); + sector_num += nb_sectors; + } + + mirror_drain(s); + } + + /* First part, loop on the sectors and initialize the dirty bitmap. */ + for (sector_num = 0; sector_num < end; ) { + /* Just to make sure we are not exceeding int limit. */ + int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS, + end - sector_num); + + mirror_throttle(s); + + if (block_job_is_cancelled(&s->common)) { + return 0; + } + + ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n); + if (ret < 0) { + return ret; + } + + assert(n > 0); + if (ret == 1) { + bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n); + } + sector_num += n; + } + return 0; +} + static void coroutine_fn mirror_run(void *opaque) { MirrorBlockJob *s = opaque; MirrorExitData *data; - BlockDriverState *bs = s->common.bs; - int64_t sector_num, end, length; - uint64_t last_pause_ns; + BlockDriverState *bs = blk_bs(s->common.blk); + BlockDriverState *target_bs = blk_bs(s->target); + int64_t length; BlockDriverInfo bdi; char backing_filename[2]; /* we only need 2 characters because we are only checking for a NULL string */ int ret = 0; - int n; int target_cluster_size = BDRV_SECTOR_SIZE; if (block_job_is_cancelled(&s->common)) { @@ -541,20 +650,19 @@ static void coroutine_fn mirror_run(void *opaque) * the destination do COW. Instead, we copy sectors around the * dirty data if needed. We need a bitmap to do that. 
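The new mirror_throttle helper centralizes the pacing that mirror_run previously did inline: yield with a sleep at most once per SLICE_TIME, and otherwise merely offer a pause point to the job machinery. A runnable sketch of the timing logic, with POSIX clock_gettime standing in for qemu_clock_get_ns and strings standing in for the two block-job calls:

```c
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define SLICE_TIME 100000000LL            /* 100 ms in ns, as in the patch */

static int64_t now_ns(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static const char *throttle(int64_t *last_pause_ns)
{
    int64_t now = now_ns();

    if (now - *last_pause_ns > SLICE_TIME) {
        *last_pause_ns = now;
        return "sleep";                   /* block_job_sleep_ns() here */
    }
    return "pause point";                 /* block_job_pause_point() here */
}

int main(void)
{
    int64_t last = now_ns() - 2 * SLICE_TIME;   /* pretend we are overdue */

    puts(throttle(&last));                /* "sleep": the slice expired */
    puts(throttle(&last));                /* "pause point": slice is fresh */
    return 0;
}
```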
*/ - bdrv_get_backing_filename(s->target, backing_filename, + bdrv_get_backing_filename(target_bs, backing_filename, sizeof(backing_filename)); - if (!bdrv_get_info(s->target, &bdi) && bdi.cluster_size) { + if (!bdrv_get_info(target_bs, &bdi) && bdi.cluster_size) { target_cluster_size = bdi.cluster_size; } - if (backing_filename[0] && !s->target->backing + if (backing_filename[0] && !target_bs->backing && s->granularity < target_cluster_size) { s->buf_size = MAX(s->buf_size, target_cluster_size); s->cow_bitmap = bitmap_new(length); } s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS; - s->max_iov = MIN(s->common.bs->bl.max_iov, s->target->bl.max_iov); + s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov); - end = s->bdev_length / BDRV_SECTOR_SIZE; s->buf = qemu_try_blockalign(bs, s->buf_size); if (s->buf == NULL) { ret = -ENOMEM; @@ -563,45 +671,18 @@ static void coroutine_fn mirror_run(void *opaque) mirror_free_init(s); - last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); if (!s->is_none_mode) { - /* First part, loop on the sectors and initialize the dirty bitmap. */ - BlockDriverState *base = s->base; - bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(s->target); - - for (sector_num = 0; sector_num < end; ) { - /* Just to make sure we are not exceeding int limit. */ - int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS, - end - sector_num); - int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); - - if (now - last_pause_ns > SLICE_TIME) { - last_pause_ns = now; - block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0); - } - - if (block_job_is_cancelled(&s->common)) { - goto immediate_exit; - } - - ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n); - - if (ret < 0) { - goto immediate_exit; - } - - assert(n > 0); - if (ret == 1 || mark_all_dirty) { - bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n); - } - sector_num += n; + ret = mirror_dirty_init(s); + if (ret < 0 || block_job_is_cancelled(&s->common)) { + goto immediate_exit; } } bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); for (;;) { uint64_t delay_ns = 0; - int64_t cnt; + int64_t cnt, delta; bool should_complete; if (s->ret < 0) { @@ -609,6 +690,8 @@ static void coroutine_fn mirror_run(void *opaque) goto immediate_exit; } + block_job_pause_point(&s->common); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); /* s->common.offset contains the number of bytes already processed so * far, cnt is the number of dirty sectors remaining and @@ -622,9 +705,10 @@ static void coroutine_fn mirror_run(void *opaque) * We do so every SLICE_TIME nanoseconds, or when there is an error, * or when the source is clean, whichever comes first. 
*/ - if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns < SLICE_TIME && + delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns; + if (delta < SLICE_TIME && s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) { - if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 || + if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 || (cnt == 0 && s->in_flight > 0)) { trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt); mirror_wait_for_io(s); @@ -637,7 +721,7 @@ static void coroutine_fn mirror_run(void *opaque) should_complete = false; if (s->in_flight == 0 && cnt == 0) { trace_mirror_before_flush(s); - ret = bdrv_flush(s->target); + ret = blk_flush(s->target); if (ret < 0) { if (mirror_error_action(s, false, -ret) == BLOCK_ERROR_ACTION_REPORT) { @@ -692,7 +776,7 @@ static void coroutine_fn mirror_run(void *opaque) s->common.cancelled = false; break; } - last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); } immediate_exit: @@ -710,21 +794,12 @@ immediate_exit: g_free(s->cow_bitmap); g_free(s->in_flight_bitmap); bdrv_release_dirty_bitmap(bs, s->dirty_bitmap); - if (s->target->blk) { - blk_iostatus_disable(s->target->blk); - } data = g_malloc(sizeof(*data)); data->ret = ret; /* Before we switch to target in mirror_exit, make sure data doesn't * change. */ - bdrv_drained_begin(s->common.bs); - if (qemu_get_aio_context() == bdrv_get_aio_context(bs)) { - /* FIXME: virtio host notifiers run on iohandler_ctx, therefore the - * above bdrv_drained_end isn't enough to quiesce it. This is ugly, we - * need a block layer API change to achieve this. */ - aio_disable_external(iohandler_get_aio_context()); - } + bdrv_drained_begin(bs); block_job_defer_to_main_loop(&s->common, mirror_exit, data); } @@ -739,32 +814,31 @@ static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp) ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); } -static void mirror_iostatus_reset(BlockJob *job) -{ - MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); - - if (s->target->blk) { - blk_iostatus_reset(s->target->blk); - } -} - static void mirror_complete(BlockJob *job, Error **errp) { MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); - Error *local_err = NULL; - int ret; + BlockDriverState *src, *target; + + src = blk_bs(job->blk); + target = blk_bs(s->target); - ret = bdrv_open_backing_file(s->target, NULL, "backing", &local_err); - if (ret < 0) { - error_propagate(errp, local_err); - return; - } if (!s->synced) { - error_setg(errp, QERR_BLOCK_JOB_NOT_READY, job->id); + error_setg(errp, "The active block job '%s' cannot be completed", + job->id); return; } - /* check the target bs is not blocked and block all operations on it */ + if (s->backing_mode == MIRROR_OPEN_BACKING_CHAIN) { + int ret; + + assert(!target->backing); + ret = bdrv_open_backing_file(target, NULL, "backing", errp); + if (ret < 0) { + return; + } + } + + /* block all operations on to_replace bs */ if (s->replaces) { AioContext *replace_aio_context; @@ -785,31 +859,57 @@ static void mirror_complete(BlockJob *job, Error **errp) aio_context_release(replace_aio_context); } + if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) { + BlockDriverState *backing = s->is_none_mode ? 
src : s->base; + if (backing_bs(target) != backing) { + bdrv_set_backing_hd(target, backing); + } + } + s->should_complete = true; block_job_enter(&s->common); } +/* There is no matching mirror_resume() because mirror_run() will begin + * iterating again when the job is resumed. + */ +static void coroutine_fn mirror_pause(BlockJob *job) +{ + MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + + mirror_drain(s); +} + +static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context) +{ + MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + + blk_set_aio_context(s->target, new_context); +} + static const BlockJobDriver mirror_job_driver = { - .instance_size = sizeof(MirrorBlockJob), - .job_type = BLOCK_JOB_TYPE_MIRROR, - .set_speed = mirror_set_speed, - .iostatus_reset= mirror_iostatus_reset, - .complete = mirror_complete, + .instance_size = sizeof(MirrorBlockJob), + .job_type = BLOCK_JOB_TYPE_MIRROR, + .set_speed = mirror_set_speed, + .complete = mirror_complete, + .pause = mirror_pause, + .attached_aio_context = mirror_attached_aio_context, }; static const BlockJobDriver commit_active_job_driver = { - .instance_size = sizeof(MirrorBlockJob), - .job_type = BLOCK_JOB_TYPE_COMMIT, - .set_speed = mirror_set_speed, - .iostatus_reset - = mirror_iostatus_reset, - .complete = mirror_complete, + .instance_size = sizeof(MirrorBlockJob), + .job_type = BLOCK_JOB_TYPE_COMMIT, + .set_speed = mirror_set_speed, + .complete = mirror_complete, + .pause = mirror_pause, + .attached_aio_context = mirror_attached_aio_context, }; -static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, - const char *replaces, +static void mirror_start_job(const char *job_id, BlockDriverState *bs, + BlockDriverState *target, const char *replaces, int64_t speed, uint32_t granularity, int64_t buf_size, + BlockMirrorBackingMode backing_mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, bool unmap, @@ -819,7 +919,6 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, bool is_none_mode, BlockDriverState *base) { MirrorBlockJob *s; - BlockDriverState *replaced_bs; if (granularity == 0) { granularity = bdrv_get_default_bitmap_granularity(target); @@ -827,13 +926,6 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, assert ((granularity & (granularity - 1)) == 0); - if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || - on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && - (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { - error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error"); - return; - } - if (buf_size < 0) { error_setg(errp, "Invalid parameter 'buf-size'"); return; @@ -843,31 +935,19 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, buf_size = DEFAULT_MIRROR_BUF_SIZE; } - /* We can't support this case as long as the block layer can't handle - * multiple BlockBackends per BlockDriverState. 
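The three BlockMirrorBackingMode values introduced in this hunk decide what happens to the target's backing chain when the job completes. A hedged sketch of the dispatch, assembled from the mirror_complete() pieces visible above (error handling trimmed; only the MIRROR_LEAVE_BACKING_CHAIN caller, active commit, is visible on this page):

/* Sketch: backing-chain handling on mirror completion (fragment;
 * s, src, target and errp as in mirror_complete() above). */
BlockDriverState *backing;
switch (s->backing_mode) {
case MIRROR_OPEN_BACKING_CHAIN:
    /* Open the target's own backing file from its image header. */
    ret = bdrv_open_backing_file(target, NULL, "backing", errp);
    break;
case MIRROR_SOURCE_BACKING_CHAIN:
    /* Reuse the source's chain: the whole source for sync=none,
     * otherwise everything below s->base. */
    backing = s->is_none_mode ? src : s->base;
    if (backing_bs(target) != backing) {
        bdrv_set_backing_hd(target, backing);
    }
    break;
case MIRROR_LEAVE_BACKING_CHAIN:
    /* Nothing to do; active commit passes this mode. */
    break;
}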
*/ - if (replaces) { - replaced_bs = bdrv_lookup_bs(replaces, replaces, errp); - if (replaced_bs == NULL) { - return; - } - } else { - replaced_bs = bs; - } - if (replaced_bs->blk && target->blk) { - error_setg(errp, "Can't create node with two BlockBackends"); - return; - } - - s = block_job_create(driver, bs, speed, cb, opaque, errp); + s = block_job_create(job_id, driver, bs, speed, cb, opaque, errp); if (!s) { return; } + s->target = blk_new(); + blk_insert_bs(s->target, target); + s->replaces = g_strdup(replaces); s->on_source_error = on_source_error; s->on_target_error = on_target_error; - s->target = target; s->is_none_mode = is_none_mode; + s->backing_mode = backing_mode; s->base = base; s->granularity = granularity; s->buf_size = ROUND_UP(buf_size, granularity); @@ -876,25 +956,23 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); if (!s->dirty_bitmap) { g_free(s->replaces); + blk_unref(s->target); block_job_unref(&s->common); return; } - bdrv_op_block_all(s->target, s->common.blocker); + bdrv_op_block_all(target, s->common.blocker); - if (s->target->blk) { - blk_set_on_error(s->target->blk, on_target_error, on_target_error); - blk_iostatus_enable(s->target->blk); - } - s->common.co = qemu_coroutine_create(mirror_run); + s->common.co = qemu_coroutine_create(mirror_run, s); trace_mirror_start(bs, s, s->common.co, opaque); - qemu_coroutine_enter(s->common.co, s); + qemu_coroutine_enter(s->common.co); } -void mirror_start(BlockDriverState *bs, BlockDriverState *target, - const char *replaces, +void mirror_start(const char *job_id, BlockDriverState *bs, + BlockDriverState *target, const char *replaces, int64_t speed, uint32_t granularity, int64_t buf_size, - MirrorSyncMode mode, BlockdevOnError on_source_error, + MirrorSyncMode mode, BlockMirrorBackingMode backing_mode, + BlockdevOnError on_source_error, BlockdevOnError on_target_error, bool unmap, BlockCompletionFunc *cb, @@ -909,14 +987,14 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, } is_none_mode = mode == MIRROR_SYNC_MODE_NONE; base = mode == MIRROR_SYNC_MODE_TOP ? 
backing_bs(bs) : NULL; - mirror_start_job(bs, target, replaces, - speed, granularity, buf_size, + mirror_start_job(job_id, bs, target, replaces, + speed, granularity, buf_size, backing_mode, on_source_error, on_target_error, unmap, cb, opaque, errp, &mirror_job_driver, is_none_mode, base); } -void commit_active_start(BlockDriverState *bs, BlockDriverState *base, - int64_t speed, +void commit_active_start(const char *job_id, BlockDriverState *bs, + BlockDriverState *base, int64_t speed, BlockdevOnError on_error, BlockCompletionFunc *cb, void *opaque, Error **errp) @@ -957,8 +1035,8 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base, } } - bdrv_ref(base); - mirror_start_job(bs, base, NULL, speed, 0, 0, + mirror_start_job(job_id, bs, base, NULL, speed, 0, 0, + MIRROR_LEAVE_BACKING_CHAIN, on_error, on_error, false, cb, opaque, &local_err, &commit_active_job_driver, false, base); if (local_err) { diff --git a/block/nbd-client.c b/block/nbd-client.c index 878e879ac..2cf3237ef 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -38,7 +38,7 @@ static void nbd_recv_coroutines_enter_all(NbdClientSession *s) for (i = 0; i < MAX_NBD_REQUESTS; i++) { if (s->recv_coroutine[i]) { - qemu_coroutine_enter(s->recv_coroutine[i], NULL); + qemu_coroutine_enter(s->recv_coroutine[i]); } } } @@ -99,7 +99,7 @@ static void nbd_reply_ready(void *opaque) } if (s->recv_coroutine[i]) { - qemu_coroutine_enter(s->recv_coroutine[i], NULL); + qemu_coroutine_enter(s->recv_coroutine[i]); return; } @@ -111,12 +111,12 @@ static void nbd_restart_write(void *opaque) { BlockDriverState *bs = opaque; - qemu_coroutine_enter(nbd_get_client_session(bs)->send_coroutine, NULL); + qemu_coroutine_enter(nbd_get_client_session(bs)->send_coroutine); } static int nbd_co_send_request(BlockDriverState *bs, struct nbd_request *request, - QEMUIOVector *qiov, int offset) + QEMUIOVector *qiov) { NbdClientSession *s = nbd_get_client_session(bs); AioContext *aio_context; @@ -149,8 +149,8 @@ static int nbd_co_send_request(BlockDriverState *bs, qio_channel_set_cork(s->ioc, true); rc = nbd_send_request(s->ioc, request); if (rc >= 0) { - ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, - offset, request->len, 0); + ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len, + false); if (ret != request->len) { rc = -EIO; } @@ -167,8 +167,9 @@ static int nbd_co_send_request(BlockDriverState *bs, } static void nbd_co_receive_reply(NbdClientSession *s, - struct nbd_request *request, struct nbd_reply *reply, - QEMUIOVector *qiov, int offset) + struct nbd_request *request, + struct nbd_reply *reply, + QEMUIOVector *qiov) { int ret; @@ -181,8 +182,8 @@ static void nbd_co_receive_reply(NbdClientSession *s, reply->error = EIO; } else { if (qiov && reply->error == 0) { - ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, - offset, request->len, 1); + ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len, + true); if (ret != request->len) { reply->error = EIO; } @@ -217,97 +218,62 @@ static void nbd_coroutine_end(NbdClientSession *s, } } -static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, - int offset) +int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, int flags) { NbdClientSession *client = nbd_get_client_session(bs); - struct nbd_request request = { .type = NBD_CMD_READ }; + struct nbd_request request = { + .type = NBD_CMD_READ, + .from = offset, + .len = bytes, + }; struct nbd_reply reply; ssize_t ret; - 
request.from = sector_num * 512; - request.len = nb_sectors * 512; + assert(bytes <= NBD_MAX_BUFFER_SIZE); + assert(!flags); nbd_coroutine_start(client, &request); - ret = nbd_co_send_request(bs, &request, NULL, 0); + ret = nbd_co_send_request(bs, &request, NULL); if (ret < 0) { reply.error = -ret; } else { - nbd_co_receive_reply(client, &request, &reply, qiov, offset); + nbd_co_receive_reply(client, &request, &reply, qiov); } nbd_coroutine_end(client, &request); return -reply.error; - } -static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, - int offset, int *flags) +int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, int flags) { NbdClientSession *client = nbd_get_client_session(bs); - struct nbd_request request = { .type = NBD_CMD_WRITE }; + struct nbd_request request = { + .type = NBD_CMD_WRITE, + .from = offset, + .len = bytes, + }; struct nbd_reply reply; ssize_t ret; - if ((*flags & BDRV_REQ_FUA) && (client->nbdflags & NBD_FLAG_SEND_FUA)) { - *flags &= ~BDRV_REQ_FUA; + if (flags & BDRV_REQ_FUA) { + assert(client->nbdflags & NBD_FLAG_SEND_FUA); request.type |= NBD_CMD_FLAG_FUA; } - request.from = sector_num * 512; - request.len = nb_sectors * 512; + assert(bytes <= NBD_MAX_BUFFER_SIZE); nbd_coroutine_start(client, &request); - ret = nbd_co_send_request(bs, &request, qiov, offset); + ret = nbd_co_send_request(bs, &request, qiov); if (ret < 0) { reply.error = -ret; } else { - nbd_co_receive_reply(client, &request, &reply, NULL, 0); + nbd_co_receive_reply(client, &request, &reply, NULL); } nbd_coroutine_end(client, &request); return -reply.error; } -/* qemu-nbd has a limit of slightly less than 1M per request. Try to - * remain aligned to 4K. */ -#define NBD_MAX_SECTORS 2040 - -int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - int offset = 0; - int ret; - while (nb_sectors > NBD_MAX_SECTORS) { - ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); - if (ret < 0) { - return ret; - } - offset += NBD_MAX_SECTORS * 512; - sector_num += NBD_MAX_SECTORS; - nb_sectors -= NBD_MAX_SECTORS; - } - return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset); -} - -int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, int *flags) -{ - int offset = 0; - int ret; - while (nb_sectors > NBD_MAX_SECTORS) { - ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset, - flags); - if (ret < 0) { - return ret; - } - offset += NBD_MAX_SECTORS * 512; - sector_num += NBD_MAX_SECTORS; - nb_sectors -= NBD_MAX_SECTORS; - } - return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset, flags); -} - int nbd_client_co_flush(BlockDriverState *bs) { NbdClientSession *client = nbd_get_client_session(bs); @@ -323,36 +289,37 @@ int nbd_client_co_flush(BlockDriverState *bs) request.len = 0; nbd_coroutine_start(client, &request); - ret = nbd_co_send_request(bs, &request, NULL, 0); + ret = nbd_co_send_request(bs, &request, NULL); if (ret < 0) { reply.error = -ret; } else { - nbd_co_receive_reply(client, &request, &reply, NULL, 0); + nbd_co_receive_reply(client, &request, &reply, NULL); } nbd_coroutine_end(client, &request); return -reply.error; } -int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) +int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) { NbdClientSession *client = nbd_get_client_session(bs); - struct 
nbd_request request = { .type = NBD_CMD_TRIM }; + struct nbd_request request = { + .type = NBD_CMD_TRIM, + .from = offset, + .len = count, + }; struct nbd_reply reply; ssize_t ret; if (!(client->nbdflags & NBD_FLAG_SEND_TRIM)) { return 0; } - request.from = sector_num * 512; - request.len = nb_sectors * 512; nbd_coroutine_start(client, &request); - ret = nbd_co_send_request(bs, &request, NULL, 0); + ret = nbd_co_send_request(bs, &request, NULL); if (ret < 0) { reply.error = -ret; } else { - nbd_co_receive_reply(client, &request, &reply, NULL, 0); + nbd_co_receive_reply(client, &request, &reply, NULL); } nbd_coroutine_end(client, &request); return -reply.error; @@ -414,6 +381,9 @@ int nbd_client_init(BlockDriverState *bs, logout("Failed to negotiate with the NBD server\n"); return ret; } + if (client->nbdflags & NBD_FLAG_SEND_FUA) { + bs->supported_write_flags = BDRV_REQ_FUA; + } qemu_co_mutex_init(&client->send_mutex); qemu_co_mutex_init(&client->free_sema); diff --git a/block/nbd-client.h b/block/nbd-client.h index bc7aec079..044aca453 100644 --- a/block/nbd-client.h +++ b/block/nbd-client.h @@ -20,7 +20,7 @@ typedef struct NbdClientSession { QIOChannelSocket *sioc; /* The master data channel */ QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */ - uint32_t nbdflags; + uint16_t nbdflags; off_t size; CoMutex send_mutex; @@ -44,13 +44,12 @@ int nbd_client_init(BlockDriverState *bs, Error **errp); void nbd_client_close(BlockDriverState *bs); -int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors); +int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count); int nbd_client_co_flush(BlockDriverState *bs); -int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, int *flags); -int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov); +int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, int flags); +int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, int flags); void nbd_client_detach_aio_context(BlockDriverState *bs); void nbd_client_attach_aio_context(BlockDriverState *bs, diff --git a/block/nbd.c b/block/nbd.c index f7ea3b360..6bc06d619 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -42,6 +42,9 @@ typedef struct BDRVNBDState { NbdClientSession client; + + /* For nbd_refresh_filename() */ + char *path, *host, *port, *export, *tlscredsid; } BDRVNBDState; static int nbd_parse_uri(const char *filename, QDict *options) @@ -188,13 +191,15 @@ out: g_free(file); } -static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options, char **export, - Error **errp) +static SocketAddress *nbd_config(BDRVNBDState *s, QemuOpts *opts, Error **errp) { SocketAddress *saddr; - if (qdict_haskey(options, "path") == qdict_haskey(options, "host")) { - if (qdict_haskey(options, "path")) { + s->path = g_strdup(qemu_opt_get(opts, "path")); + s->host = g_strdup(qemu_opt_get(opts, "host")); + + if (!s->path == !s->host) { + if (s->path) { error_setg(errp, "path and host may not be used at the same time."); } else { error_setg(errp, "one of path and host must be specified."); @@ -204,32 +209,28 @@ static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options, char **export, saddr = g_new0(SocketAddress, 1); - if (qdict_haskey(options, "path")) { + if (s->path) { UnixSocketAddress *q_unix; saddr->type = SOCKET_ADDRESS_KIND_UNIX; q_unix = saddr->u.q_unix.data = 
g_new0(UnixSocketAddress, 1); - q_unix->path = g_strdup(qdict_get_str(options, "path")); - qdict_del(options, "path"); + q_unix->path = g_strdup(s->path); } else { InetSocketAddress *inet; + + s->port = g_strdup(qemu_opt_get(opts, "port")); + saddr->type = SOCKET_ADDRESS_KIND_INET; inet = saddr->u.inet.data = g_new0(InetSocketAddress, 1); - inet->host = g_strdup(qdict_get_str(options, "host")); - if (!qdict_get_try_str(options, "port")) { + inet->host = g_strdup(s->host); + inet->port = g_strdup(s->port); + if (!inet->port) { inet->port = g_strdup_printf("%d", NBD_DEFAULT_PORT); - } else { - inet->port = g_strdup(qdict_get_str(options, "port")); } - qdict_del(options, "host"); - qdict_del(options, "port"); } s->client.is_unix = saddr->type == SOCKET_ADDRESS_KIND_UNIX; - *export = g_strdup(qdict_get_try_str(options, "export")); - if (*export) { - qdict_del(options, "export"); - } + s->export = g_strdup(qemu_opt_get(opts, "export")); return saddr; } @@ -292,28 +293,66 @@ static QCryptoTLSCreds *nbd_get_tls_creds(const char *id, Error **errp) } +static QemuOptsList nbd_runtime_opts = { + .name = "nbd", + .head = QTAILQ_HEAD_INITIALIZER(nbd_runtime_opts.head), + .desc = { + { + .name = "host", + .type = QEMU_OPT_STRING, + .help = "TCP host to connect to", + }, + { + .name = "port", + .type = QEMU_OPT_STRING, + .help = "TCP port to connect to", + }, + { + .name = "path", + .type = QEMU_OPT_STRING, + .help = "Unix socket path to connect to", + }, + { + .name = "export", + .type = QEMU_OPT_STRING, + .help = "Name of the NBD export to open", + }, + { + .name = "tls-creds", + .type = QEMU_OPT_STRING, + .help = "ID of the TLS credentials to use", + }, + }, +}; + static int nbd_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { BDRVNBDState *s = bs->opaque; - char *export = NULL; + QemuOpts *opts = NULL; + Error *local_err = NULL; QIOChannelSocket *sioc = NULL; - SocketAddress *saddr; - const char *tlscredsid; + SocketAddress *saddr = NULL; QCryptoTLSCreds *tlscreds = NULL; const char *hostname = NULL; int ret = -EINVAL; + opts = qemu_opts_create(&nbd_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + goto error; + } + /* Pop the config into our state object. Exit if invalid. 
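The conversion of nbd_config() from raw QDict digging to QemuOpts follows the usual block-driver pattern: declare a QemuOptsList, absorb the recognized keys out of the options QDict, then read them back through typed accessors. A minimal sketch of the idiom as used here (nbd_runtime_opts is the option list defined in this hunk; s, options, local_err and errp are the surrounding function's variables):

/* Sketch: the QemuOpts parsing idiom used by nbd_open(). */
QemuOpts *opts = qemu_opts_create(&nbd_runtime_opts, NULL, 0, &error_abort);
qemu_opts_absorb_qdict(opts, options, &local_err);  /* consumes known keys */
if (local_err) {
    error_propagate(errp, local_err);
    goto error;
}
s->host = g_strdup(qemu_opt_get(opts, "host"));     /* typed accessor */
s->export = g_strdup(qemu_opt_get(opts, "export"));
/* ... connect using the parsed values ... */
qemu_opts_del(opts);                                /* frees the parsed copy */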
*/ - saddr = nbd_config(s, options, &export, errp); + saddr = nbd_config(s, opts, errp); if (!saddr) { goto error; } - tlscredsid = g_strdup(qdict_get_try_str(options, "tls-creds")); - if (tlscredsid) { - qdict_del(options, "tls-creds"); - tlscreds = nbd_get_tls_creds(tlscredsid, errp); + s->tlscredsid = g_strdup(qemu_opt_get(opts, "tls-creds")); + if (s->tlscredsid) { + tlscreds = nbd_get_tls_creds(s->tlscredsid, errp); if (!tlscreds) { goto error; } @@ -335,7 +374,7 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, } /* NBD handshake */ - ret = nbd_client_init(bs, sioc, export, + ret = nbd_client_init(bs, sioc, s->export, tlscreds, hostname, errp); error: if (sioc) { @@ -344,42 +383,18 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, if (tlscreds) { object_unref(OBJECT(tlscreds)); } - qapi_free_SocketAddress(saddr); - g_free(export); - return ret; -} - -static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - return nbd_client_co_readv(bs, sector_num, nb_sectors, qiov); -} - -static int nbd_co_writev_flags(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, int flags) -{ - int ret; - - ret = nbd_client_co_writev(bs, sector_num, nb_sectors, qiov, &flags); if (ret < 0) { - return ret; - } - - /* The flag wasn't sent to the server, so we need to emulate it with an - * explicit flush */ - if (flags & BDRV_REQ_FUA) { - ret = nbd_client_co_flush(bs); + g_free(s->path); + g_free(s->host); + g_free(s->port); + g_free(s->export); + g_free(s->tlscredsid); } - + qapi_free_SocketAddress(saddr); + qemu_opts_del(opts); return ret; } -static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - return nbd_co_writev_flags(bs, sector_num, nb_sectors, qiov, 0); -} - static int nbd_co_flush(BlockDriverState *bs) { return nbd_client_co_flush(bs); @@ -387,19 +402,21 @@ static int nbd_co_flush(BlockDriverState *bs) static void nbd_refresh_limits(BlockDriverState *bs, Error **errp) { - bs->bl.max_discard = UINT32_MAX >> BDRV_SECTOR_BITS; - bs->bl.max_transfer_length = UINT32_MAX >> BDRV_SECTOR_BITS; -} - -static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) -{ - return nbd_client_co_discard(bs, sector_num, nb_sectors); + bs->bl.max_pdiscard = NBD_MAX_BUFFER_SIZE; + bs->bl.max_transfer = NBD_MAX_BUFFER_SIZE; } static void nbd_close(BlockDriverState *bs) { + BDRVNBDState *s = bs->opaque; + nbd_client_close(bs); + + g_free(s->path); + g_free(s->host); + g_free(s->port); + g_free(s->export); + g_free(s->tlscredsid); } static int64_t nbd_getlength(BlockDriverState *bs) @@ -422,48 +439,45 @@ static void nbd_attach_aio_context(BlockDriverState *bs, static void nbd_refresh_filename(BlockDriverState *bs, QDict *options) { + BDRVNBDState *s = bs->opaque; QDict *opts = qdict_new(); - const char *path = qdict_get_try_str(options, "path"); - const char *host = qdict_get_try_str(options, "host"); - const char *port = qdict_get_try_str(options, "port"); - const char *export = qdict_get_try_str(options, "export"); - const char *tlscreds = qdict_get_try_str(options, "tls-creds"); qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("nbd"))); - if (path && export) { + if (s->path && s->export) { snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd+unix:///%s?socket=%s", export, path); - } else if (path && !export) { + "nbd+unix:///%s?socket=%s", s->export, s->path); + } else if (s->path && !s->export) { 
snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd+unix://?socket=%s", path); - } else if (!path && export && port) { + "nbd+unix://?socket=%s", s->path); + } else if (!s->path && s->export && s->port) { snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd://%s:%s/%s", host, port, export); - } else if (!path && export && !port) { + "nbd://%s:%s/%s", s->host, s->port, s->export); + } else if (!s->path && s->export && !s->port) { snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd://%s/%s", host, export); - } else if (!path && !export && port) { + "nbd://%s/%s", s->host, s->export); + } else if (!s->path && !s->export && s->port) { snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd://%s:%s", host, port); - } else if (!path && !export && !port) { + "nbd://%s:%s", s->host, s->port); + } else if (!s->path && !s->export && !s->port) { snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd://%s", host); + "nbd://%s", s->host); } - if (path) { - qdict_put_obj(opts, "path", QOBJECT(qstring_from_str(path))); - } else if (port) { - qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(host))); - qdict_put_obj(opts, "port", QOBJECT(qstring_from_str(port))); + if (s->path) { + qdict_put_obj(opts, "path", QOBJECT(qstring_from_str(s->path))); + } else if (s->port) { + qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(s->host))); + qdict_put_obj(opts, "port", QOBJECT(qstring_from_str(s->port))); } else { - qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(host))); + qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(s->host))); } - if (export) { - qdict_put_obj(opts, "export", QOBJECT(qstring_from_str(export))); + if (s->export) { + qdict_put_obj(opts, "export", QOBJECT(qstring_from_str(s->export))); } - if (tlscreds) { - qdict_put_obj(opts, "tls-creds", QOBJECT(qstring_from_str(tlscreds))); + if (s->tlscredsid) { + qdict_put_obj(opts, "tls-creds", + QOBJECT(qstring_from_str(s->tlscredsid))); } bs->full_open_options = opts; @@ -475,13 +489,11 @@ static BlockDriver bdrv_nbd = { .instance_size = sizeof(BDRVNBDState), .bdrv_parse_filename = nbd_parse_filename, .bdrv_file_open = nbd_open, - .bdrv_co_readv = nbd_co_readv, - .bdrv_co_writev = nbd_co_writev, - .bdrv_co_writev_flags = nbd_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, + .bdrv_co_preadv = nbd_client_co_preadv, + .bdrv_co_pwritev = nbd_client_co_pwritev, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, - .bdrv_co_discard = nbd_co_discard, + .bdrv_co_pdiscard = nbd_client_co_pdiscard, .bdrv_refresh_limits = nbd_refresh_limits, .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_detach_aio_context, @@ -495,13 +507,11 @@ static BlockDriver bdrv_nbd_tcp = { .instance_size = sizeof(BDRVNBDState), .bdrv_parse_filename = nbd_parse_filename, .bdrv_file_open = nbd_open, - .bdrv_co_readv = nbd_co_readv, - .bdrv_co_writev = nbd_co_writev, - .bdrv_co_writev_flags = nbd_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, + .bdrv_co_preadv = nbd_client_co_preadv, + .bdrv_co_pwritev = nbd_client_co_pwritev, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, - .bdrv_co_discard = nbd_co_discard, + .bdrv_co_pdiscard = nbd_client_co_pdiscard, .bdrv_refresh_limits = nbd_refresh_limits, .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_detach_aio_context, @@ -515,13 +525,11 @@ static BlockDriver bdrv_nbd_unix = { .instance_size = sizeof(BDRVNBDState), .bdrv_parse_filename = nbd_parse_filename, .bdrv_file_open = nbd_open, - .bdrv_co_readv = 
nbd_co_readv, - .bdrv_co_writev = nbd_co_writev, - .bdrv_co_writev_flags = nbd_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, + .bdrv_co_preadv = nbd_client_co_preadv, + .bdrv_co_pwritev = nbd_client_co_pwritev, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, - .bdrv_co_discard = nbd_co_discard, + .bdrv_co_pdiscard = nbd_client_co_pdiscard, .bdrv_refresh_limits = nbd_refresh_limits, .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_detach_aio_context, diff --git a/block/nfs.c b/block/nfs.c index 9f51cc3f1..8602a4421 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -1,7 +1,7 @@ /* * QEMU Block driver for native access to files on NFS shares * - * Copyright (c) 2014 Peter Lieven <pl@kamp.de> + * Copyright (c) 2014-2016 Peter Lieven <pl@kamp.de> * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -38,6 +38,7 @@ #include <nfsc/libnfs.h> #define QEMU_NFS_MAX_READAHEAD_SIZE 1048576 +#define QEMU_NFS_MAX_PAGECACHE_SIZE (8388608 / NFS_BLKSIZE) #define QEMU_NFS_MAX_DEBUG_LEVEL 2 typedef struct NFSClient { @@ -47,6 +48,7 @@ typedef struct NFSClient { bool has_zero_init; AioContext *aio_context; blkcnt_t st_blocks; + bool cache_used; } NFSClient; typedef struct NFSRPC { @@ -102,7 +104,7 @@ static void nfs_co_generic_bh_cb(void *opaque) NFSRPC *task = opaque; task->complete = 1; qemu_bh_delete(task->bh); - qemu_coroutine_enter(task->co, NULL); + qemu_coroutine_enter(task->co); } static void @@ -278,7 +280,7 @@ static void nfs_file_close(BlockDriverState *bs) } static int64_t nfs_client_open(NFSClient *client, const char *filename, - int flags, Error **errp) + int flags, Error **errp, int open_flags) { int ret = -EINVAL, i; struct stat st; @@ -330,12 +332,38 @@ static int64_t nfs_client_open(NFSClient *client, const char *filename, nfs_set_tcp_syncnt(client->context, val); #ifdef LIBNFS_FEATURE_READAHEAD } else if (!strcmp(qp->p[i].name, "readahead")) { + if (open_flags & BDRV_O_NOCACHE) { + error_setg(errp, "Cannot enable NFS readahead " + "if cache.direct = on"); + goto fail; + } if (val > QEMU_NFS_MAX_READAHEAD_SIZE) { error_report("NFS Warning: Truncating NFS readahead" " size to %d", QEMU_NFS_MAX_READAHEAD_SIZE); val = QEMU_NFS_MAX_READAHEAD_SIZE; } nfs_set_readahead(client->context, val); +#ifdef LIBNFS_FEATURE_PAGECACHE + nfs_set_pagecache_ttl(client->context, 0); +#endif + client->cache_used = true; +#endif +#ifdef LIBNFS_FEATURE_PAGECACHE + nfs_set_pagecache_ttl(client->context, 0); + } else if (!strcmp(qp->p[i].name, "pagecache")) { + if (open_flags & BDRV_O_NOCACHE) { + error_setg(errp, "Cannot enable NFS pagecache " + "if cache.direct = on"); + goto fail; + } + if (val > QEMU_NFS_MAX_PAGECACHE_SIZE) { + error_report("NFS Warning: Truncating NFS pagecache" + " size to %d pages", QEMU_NFS_MAX_PAGECACHE_SIZE); + val = QEMU_NFS_MAX_PAGECACHE_SIZE; + } + nfs_set_pagecache(client->context, val); + nfs_set_pagecache_ttl(client->context, 0); + client->cache_used = true; #endif #ifdef LIBNFS_FEATURE_DEBUG } else if (!strcmp(qp->p[i].name, "debug")) { @@ -418,7 +446,7 @@ static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags, } ret = nfs_client_open(client, qemu_opt_get(opts, "filename"), (flags & BDRV_O_RDWR) ? 
O_RDWR : O_RDONLY, - errp); + errp, bs->open_flags); if (ret < 0) { goto out; } @@ -454,7 +482,7 @@ static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp) total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), BDRV_SECTOR_SIZE); - ret = nfs_client_open(client, url, O_CREAT, errp); + ret = nfs_client_open(client, url, O_CREAT, errp, 0); if (ret < 0) { goto out; } @@ -516,6 +544,12 @@ static int nfs_reopen_prepare(BDRVReopenState *state, return -EACCES; } + if ((state->flags & BDRV_O_NOCACHE) && client->cache_used) { + error_setg(errp, "Cannot disable cache if libnfs readahead or" + " pagecache is enabled"); + return -EINVAL; + } + /* Update cache for read-only reopens */ if (!(state->flags & BDRV_O_RDWR)) { ret = nfs_fstat(client->context, client->fh, &st); @@ -530,6 +564,15 @@ static int nfs_reopen_prepare(BDRVReopenState *state, return 0; } +#ifdef LIBNFS_FEATURE_PAGECACHE +static void nfs_invalidate_cache(BlockDriverState *bs, + Error **errp) +{ + NFSClient *client = bs->opaque; + nfs_pagecache_invalidate(client->context, client->fh); +} +#endif + static BlockDriver bdrv_nfs = { .format_name = "nfs", .protocol_name = "nfs", @@ -553,6 +596,10 @@ static BlockDriver bdrv_nfs = { .bdrv_detach_aio_context = nfs_detach_aio_context, .bdrv_attach_aio_context = nfs_attach_aio_context, + +#ifdef LIBNFS_FEATURE_PAGECACHE + .bdrv_invalidate_cache = nfs_invalidate_cache, +#endif }; static void nfs_block_init(void) diff --git a/block/null.c b/block/null.c index 396500bab..b511010ba 100644 --- a/block/null.c +++ b/block/null.c @@ -12,6 +12,8 @@ #include "qemu/osdep.h" #include "qapi/error.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qstring.h" #include "block/block_int.h" #define NULL_OPT_LATENCY "latency-ns" @@ -223,6 +225,20 @@ static int64_t coroutine_fn null_co_get_block_status(BlockDriverState *bs, } } +static void null_refresh_filename(BlockDriverState *bs, QDict *opts) +{ + QINCREF(opts); + qdict_del(opts, "filename"); + + if (!qdict_size(opts)) { + snprintf(bs->exact_filename, sizeof(bs->exact_filename), "%s://", + bs->drv->format_name); + } + + qdict_put(opts, "driver", qstring_from_str(bs->drv->format_name)); + bs->full_open_options = opts; +} + static BlockDriver bdrv_null_co = { .format_name = "null-co", .protocol_name = "null-co", @@ -238,6 +254,8 @@ static BlockDriver bdrv_null_co = { .bdrv_reopen_prepare = null_reopen_prepare, .bdrv_co_get_block_status = null_co_get_block_status, + + .bdrv_refresh_filename = null_refresh_filename, }; static BlockDriver bdrv_null_aio = { @@ -255,6 +273,8 @@ static BlockDriver bdrv_null_aio = { .bdrv_reopen_prepare = null_reopen_prepare, .bdrv_co_get_block_status = null_co_get_block_status, + + .bdrv_refresh_filename = null_refresh_filename, }; static void bdrv_null_init(void) diff --git a/block/parallels.c b/block/parallels.c index 324ed43ac..2ccefa7d8 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -33,6 +33,7 @@ #include "block/block_int.h" #include "sysemu/block-backend.h" #include "qemu/module.h" +#include "qemu/bswap.h" #include "qemu/bitmap.h" #include "qapi/util.h" @@ -42,6 +43,7 @@ #define HEADER_MAGIC2 "WithouFreSpacExt" #define HEADER_VERSION 2 #define HEADER_INUSE_MAGIC (0x746F6E59) +#define MAX_PARALLELS_IMAGE_FACTOR (1ull << 32) #define DEFAULT_CLUSTER_SIZE 1048576 /* 1 MiB */ @@ -203,13 +205,15 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num, return -EINVAL; } - to_allocate = (sector_num + *pnum + s->tracks - 1) / s->tracks - idx; + to_allocate = 
DIV_ROUND_UP(sector_num + *pnum, s->tracks) - idx; space = to_allocate * s->tracks; if (s->data_end + space > bdrv_getlength(bs->file->bs) >> BDRV_SECTOR_BITS) { int ret; space += s->prealloc_size; if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) { - ret = bdrv_write_zeroes(bs->file->bs, s->data_end, space, 0); + ret = bdrv_pwrite_zeroes(bs->file, + s->data_end << BDRV_SECTOR_BITS, + space << BDRV_SECTOR_BITS, 0); } else { ret = bdrv_truncate(bs->file->bs, (s->data_end + space) << BDRV_SECTOR_BITS); @@ -247,7 +251,7 @@ static coroutine_fn int parallels_co_flush_to_os(BlockDriverState *bs) if (off + to_write > s->header_size) { to_write = s->header_size - off; } - ret = bdrv_pwrite(bs->file->bs, off, (uint8_t *)s->header + off, + ret = bdrv_pwrite(bs->file, off, (uint8_t *)s->header + off, to_write); if (ret < 0) { qemu_co_mutex_unlock(&s->lock); @@ -308,7 +312,7 @@ static coroutine_fn int parallels_co_writev(BlockDriverState *bs, qemu_iovec_reset(&hd_qiov); qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes); - ret = bdrv_co_writev(bs->file->bs, position, n, &hd_qiov); + ret = bdrv_co_writev(bs->file, position, n, &hd_qiov); if (ret < 0) { break; } @@ -348,7 +352,7 @@ static coroutine_fn int parallels_co_readv(BlockDriverState *bs, qemu_iovec_reset(&hd_qiov); qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes); - ret = bdrv_co_readv(bs->file->bs, position, n, &hd_qiov); + ret = bdrv_co_readv(bs->file, position, n, &hd_qiov); if (ret < 0) { break; } @@ -429,7 +433,7 @@ static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res, } if (flush_bat) { - ret = bdrv_pwrite_sync(bs->file->bs, 0, s->header, s->header_size); + ret = bdrv_pwrite_sync(bs->file, 0, s->header, s->header_size); if (ret < 0) { res->check_errors++; return ret; @@ -472,6 +476,10 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) BDRV_SECTOR_SIZE); cl_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, DEFAULT_CLUSTER_SIZE), BDRV_SECTOR_SIZE); + if (total_size >= MAX_PARALLELS_IMAGE_FACTOR * cl_size) { + error_propagate(errp, local_err); + return -E2BIG; + } ret = bdrv_create_file(filename, opts, &local_err); if (ret < 0) { @@ -512,11 +520,12 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) memset(tmp, 0, sizeof(tmp)); memcpy(tmp, &header, sizeof(header)); - ret = blk_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE); + ret = blk_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE, 0); if (ret < 0) { goto exit; } - ret = blk_write_zeroes(file, 1, bat_sectors - 1, 0); + ret = blk_pwrite_zeroes(file, BDRV_SECTOR_SIZE, + (bat_sectors - 1) << BDRV_SECTOR_BITS, 0); if (ret < 0) { goto exit; } @@ -559,7 +568,7 @@ static int parallels_update_header(BlockDriverState *bs) if (size > s->header_size) { size = s->header_size; } - return bdrv_pwrite_sync(bs->file->bs, 0, s->header, size); + return bdrv_pwrite_sync(bs->file, 0, s->header, size); } static int parallels_open(BlockDriverState *bs, QDict *options, int flags, @@ -572,7 +581,7 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, Error *local_err = NULL; char *buf; - ret = bdrv_pread(bs->file->bs, 0, &ph, sizeof(ph)); + ret = bdrv_pread(bs->file, 0, &ph, sizeof(ph)); if (ret < 0) { goto fail; } @@ -627,7 +636,7 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, s->header_size = size; } - ret = bdrv_pread(bs->file->bs, 0, s->header, s->header_size); + ret = bdrv_pread(bs->file, 0, s->header, s->header_size); if (ret < 0) { goto fail; } diff --git 
a/block/qapi.c b/block/qapi.c index c5f6ba643..6f947e3e6 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -67,10 +67,10 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, info->backing_file_depth = bdrv_get_backing_file_depth(bs); info->detect_zeroes = bs->detect_zeroes; - if (bs->throttle_state) { + if (blk && blk_get_public(blk)->throttle_state) { ThrottleConfig cfg; - throttle_group_get_config(bs, &cfg); + throttle_group_get_config(blk, &cfg); info->bps = cfg.buckets[THROTTLE_BPS_TOTAL].avg; info->bps_rd = cfg.buckets[THROTTLE_BPS_READ].avg; @@ -118,7 +118,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, info->iops_size = cfg.op_size; info->has_group = true; - info->group = g_strdup(throttle_group_get_name(bs)); + info->group = g_strdup(throttle_group_get_name(blk)); } info->write_threshold = bdrv_write_threshold_get(bs); @@ -690,16 +690,15 @@ static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation, void bdrv_image_info_specific_dump(fprintf_function func_fprintf, void *f, ImageInfoSpecific *info_spec) { - QmpOutputVisitor *ov = qmp_output_visitor_new(); QObject *obj, *data; + Visitor *v = qmp_output_visitor_new(&obj); - visit_type_ImageInfoSpecific(qmp_output_get_visitor(ov), NULL, &info_spec, - &error_abort); - obj = qmp_output_get_qobject(ov); + visit_type_ImageInfoSpecific(v, NULL, &info_spec, &error_abort); + visit_complete(v, &obj); assert(qobject_type(obj) == QTYPE_QDICT); data = qdict_get(qobject_to_qdict(obj), "data"); dump_qobject(func_fprintf, f, 1, data); - qmp_output_visitor_cleanup(ov); + visit_free(v); } void bdrv_image_info_dump(fprintf_function func_fprintf, void *f, diff --git a/block/qcow.c b/block/qcow.c index 60ddb12ec..6f9b2e2d2 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -28,6 +28,7 @@ #include "block/block_int.h" #include "sysemu/block-backend.h" #include "qemu/module.h" +#include "qemu/bswap.h" #include <zlib.h> #include "qapi/qmp/qerror.h" #include "crypto/cipher.h" @@ -104,7 +105,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, int ret; QCowHeader header; - ret = bdrv_pread(bs->file->bs, 0, &header, sizeof(header)); + ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); if (ret < 0) { goto fail; } @@ -161,13 +162,19 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, if (s->crypt_method_header) { if (bdrv_uses_whitelist() && s->crypt_method_header == QCOW_CRYPT_AES) { - error_report("qcow built-in AES encryption is deprecated"); - error_printf("Support for it will be removed in a future release.\n" - "You can use 'qemu-img convert' to switch to an\n" - "unencrypted qcow image, or a LUKS raw image.\n"); + error_setg(errp, + "Use of AES-CBC encrypted qcow images is no longer " + "supported in system emulators"); + error_append_hint(errp, + "You can use 'qemu-img convert' to convert your " + "image to an alternative supported format, such " + "as unencrypted qcow, or raw with the LUKS " + "format instead.\n"); + ret = -ENOSYS; + goto fail; } - bs->encrypted = 1; + bs->encrypted = true; } s->cluster_bits = header.cluster_bits; s->cluster_size = 1 << s->cluster_bits; @@ -201,7 +208,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - ret = bdrv_pread(bs->file->bs, s->l1_table_offset, s->l1_table, + ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)); if (ret < 0) { goto fail; @@ -232,7 +239,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, ret = -EINVAL; goto fail; 
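Among the hunks above, the block/qapi.c change tracks the QAPI visitor API rework: the output visitor is now created with a pointer to the result QObject and finished with visit_complete(). The resulting idiom, as a minimal sketch taken from bdrv_image_info_specific_dump():

/* Sketch: new QAPI output-visitor idiom. */
QObject *obj;
Visitor *v = qmp_output_visitor_new(&obj);
visit_type_ImageInfoSpecific(v, NULL, &info_spec, &error_abort);
visit_complete(v, &obj);   /* materializes the built QObject into obj */
/* ... consume obj ... */
visit_free(v);             /* replaces qmp_output_visitor_cleanup() */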
} - ret = bdrv_pread(bs->file->bs, header.backing_file_offset, + ret = bdrv_pread(bs->file, header.backing_file_offset, bs->backing_file, len); if (ret < 0) { goto fail; @@ -383,7 +390,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, /* update the L1 entry */ s->l1_table[l1_index] = l2_offset; tmp = cpu_to_be64(l2_offset); - if (bdrv_pwrite_sync(bs->file->bs, + if (bdrv_pwrite_sync(bs->file, s->l1_table_offset + l1_index * sizeof(tmp), &tmp, sizeof(tmp)) < 0) return 0; @@ -413,11 +420,11 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, l2_table = s->l2_cache + (min_index << s->l2_bits); if (new_l2_table) { memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); - if (bdrv_pwrite_sync(bs->file->bs, l2_offset, l2_table, + if (bdrv_pwrite_sync(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) < 0) return 0; } else { - if (bdrv_pread(bs->file->bs, l2_offset, l2_table, + if (bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) != s->l2_size * sizeof(uint64_t)) return 0; @@ -443,7 +450,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, cluster_offset = (cluster_offset + s->cluster_size - 1) & ~(s->cluster_size - 1); /* write the cluster content */ - if (bdrv_pwrite(bs->file->bs, cluster_offset, s->cluster_cache, + if (bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache, s->cluster_size) != s->cluster_size) return -1; @@ -473,7 +480,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, errno = EIO; return -1; } - if (bdrv_pwrite(bs->file->bs, + if (bdrv_pwrite(bs->file, cluster_offset + i * 512, s->cluster_data, 512) != 512) return -1; @@ -488,7 +495,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, /* update L2 table */ tmp = cpu_to_be64(cluster_offset); l2_table[l2_index] = tmp; - if (bdrv_pwrite_sync(bs->file->bs, l2_offset + l2_index * sizeof(tmp), + if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp), &tmp, sizeof(tmp)) < 0) return 0; } @@ -558,7 +565,7 @@ static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) if (s->cluster_cache_offset != coffset) { csize = cluster_offset >> (63 - s->cluster_bits); csize &= (s->cluster_size - 1); - ret = bdrv_pread(bs->file->bs, coffset, s->cluster_data, csize); + ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize); if (ret != csize) return -1; if (decompress_buffer(s->cluster_cache, s->cluster_size, @@ -612,8 +619,7 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, hd_iov.iov_len = n * 512; qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->backing->bs, sector_num, - n, &hd_qiov); + ret = bdrv_co_readv(bs->backing, sector_num, n, &hd_qiov); qemu_co_mutex_lock(&s->lock); if (ret < 0) { goto fail; @@ -637,7 +643,7 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, hd_iov.iov_len = n * 512; qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->file->bs, + ret = bdrv_co_readv(bs->file, (cluster_offset >> 9) + index_in_cluster, n, &hd_qiov); qemu_co_mutex_lock(&s->lock); @@ -739,7 +745,7 @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, hd_iov.iov_len = n * 512; qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_writev(bs->file->bs, + ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + index_in_cluster, n, &hd_qiov); qemu_co_mutex_lock(&s->lock); @@ -853,24 +859,24 @@ static int 
qcow_create(const char *filename, QemuOpts *opts, Error **errp) } /* write all the data */ - ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header)); + ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header), 0); if (ret != sizeof(header)) { goto exit; } if (backing_file) { ret = blk_pwrite(qcow_blk, sizeof(header), - backing_file, backing_filename_len); + backing_file, backing_filename_len, 0); if (ret != backing_filename_len) { goto exit; } } tmp = g_malloc0(BDRV_SECTOR_SIZE); - for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/ - BDRV_SECTOR_SIZE); i++) { - ret = blk_pwrite(qcow_blk, header_size + - BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE); + for (i = 0; i < DIV_ROUND_UP(sizeof(uint64_t) * l1_size, BDRV_SECTOR_SIZE); + i++) { + ret = blk_pwrite(qcow_blk, header_size + BDRV_SECTOR_SIZE * i, + tmp, BDRV_SECTOR_SIZE, 0); if (ret != BDRV_SECTOR_SIZE) { g_free(tmp); goto exit; @@ -893,7 +899,7 @@ static int qcow_make_empty(BlockDriverState *bs) int ret; memset(s->l1_table, 0, l1_length); - if (bdrv_pwrite_sync(bs->file->bs, s->l1_table_offset, s->l1_table, + if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0) return -1; ret = bdrv_truncate(bs->file->bs, s->l1_table_offset + l1_length); @@ -907,6 +913,49 @@ static int qcow_make_empty(BlockDriverState *bs) return 0; } +typedef struct QcowWriteCo { + BlockDriverState *bs; + int64_t sector_num; + const uint8_t *buf; + int nb_sectors; + int ret; +} QcowWriteCo; + +static void qcow_write_co_entry(void *opaque) +{ + QcowWriteCo *co = opaque; + QEMUIOVector qiov; + + struct iovec iov = (struct iovec) { + .iov_base = (uint8_t*) co->buf, + .iov_len = co->nb_sectors * BDRV_SECTOR_SIZE, + }; + qemu_iovec_init_external(&qiov, &iov, 1); + + co->ret = qcow_co_writev(co->bs, co->sector_num, co->nb_sectors, &qiov); +} + +/* Wrapper for non-coroutine contexts */ +static int qcow_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + Coroutine *co; + AioContext *aio_context = bdrv_get_aio_context(bs); + QcowWriteCo data = { + .bs = bs, + .sector_num = sector_num, + .buf = buf, + .nb_sectors = nb_sectors, + .ret = -EINPROGRESS, + }; + co = qemu_coroutine_create(qcow_write_co_entry, &data); + qemu_coroutine_enter(co); + while (data.ret == -EINPROGRESS) { + aio_poll(aio_context, true); + } + return data.ret; +} + /* XXX: put compressed sectors first, then all the cluster aligned tables to avoid losing bytes in alignment */ static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, @@ -934,7 +983,7 @@ static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, return ret; } - out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + out_buf = g_malloc(s->cluster_size); /* best compression, small window, no zlib header */ memset(&strm, 0, sizeof(strm)); @@ -963,7 +1012,7 @@ static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, if (ret != Z_STREAM_END || out_len >= s->cluster_size) { /* could not compress: write normal cluster */ - ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); + ret = qcow_write(bs, sector_num, buf, s->cluster_sectors); if (ret < 0) { goto fail; } @@ -976,7 +1025,7 @@ static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, } cluster_offset &= s->cluster_offset_mask; - ret = bdrv_pwrite(bs->file->bs, cluster_offset, out_buf, out_len); + ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); if (ret < 0) { goto fail; } diff --git a/block/qcow2-cache.c 
b/block/qcow2-cache.c index 0fe8edae4..6eaefeddc 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -24,11 +24,6 @@ /* Needed for CONFIG_MADVISE */ #include "qemu/osdep.h" - -#if defined(CONFIG_MADVISE) || defined(CONFIG_POSIX_MADVISE) -#include <sys/mman.h> -#endif - #include "block/block_int.h" #include "qemu-common.h" #include "qcow2.h" @@ -215,7 +210,7 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); } - ret = bdrv_pwrite(bs->file->bs, c->entries[i].offset, + ret = bdrv_pwrite(bs->file, c->entries[i].offset, qcow2_cache_get_table_addr(bs, c, i), s->cluster_size); if (ret < 0) { return ret; @@ -226,7 +221,7 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) return 0; } -int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) +int qcow2_cache_write(BlockDriverState *bs, Qcow2Cache *c) { BDRVQcow2State *s = bs->opaque; int result = 0; @@ -242,8 +237,15 @@ int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) } } + return result; +} + +int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) +{ + int result = qcow2_cache_write(bs, c); + if (result == 0) { - ret = bdrv_flush(bs->file->bs); + int ret = bdrv_flush(bs->file->bs); if (ret < 0) { result = ret; } @@ -355,7 +357,7 @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); } - ret = bdrv_pread(bs->file->bs, offset, + ret = bdrv_pread(bs->file, offset, qcow2_cache_get_table_addr(bs, c, i), s->cluster_size); if (ret < 0) { diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 31ecc1030..f94183529 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -29,6 +29,7 @@ #include "qemu-common.h" #include "block/block_int.h" #include "block/qcow2.h" +#include "qemu/bswap.h" #include "trace.h" int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, @@ -64,7 +65,8 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, } } - if (new_l1_size > INT_MAX / sizeof(uint64_t)) { + QEMU_BUILD_BUG_ON(QCOW_MAX_L1_SIZE > INT_MAX); + if (new_l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) { return -EFBIG; } @@ -107,7 +109,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); for(i = 0; i < s->l1_size; i++) new_l1_table[i] = cpu_to_be64(new_l1_table[i]); - ret = bdrv_pwrite_sync(bs->file->bs, new_l1_table_offset, + ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_table, new_l1_size2); if (ret < 0) goto fail; @@ -116,9 +118,9 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, /* set new table */ BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE); - cpu_to_be32w((uint32_t*)data, new_l1_size); + stl_be_p(data, new_l1_size); stq_be_p(data + 4, new_l1_table_offset); - ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, l1_size), + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data, sizeof(data)); if (ret < 0) { goto fail; @@ -153,11 +155,9 @@ static int l2_load(BlockDriverState *bs, uint64_t l2_offset, uint64_t **l2_table) { BDRVQcow2State *s = bs->opaque; - int ret; - - ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table); - return ret; + return qcow2_cache_get(bs, s->l2_table_cache, l2_offset, + (void **)l2_table); } /* @@ -186,7 +186,7 @@ int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index) } BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); - ret = bdrv_pwrite_sync(bs->file->bs, + ret = 
bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index, buf, sizeof(buf)); if (ret < 0) { @@ -389,22 +389,18 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, return 0; } -static int coroutine_fn copy_sectors(BlockDriverState *bs, - uint64_t start_sect, - uint64_t cluster_offset, - int n_start, int n_end) +static int coroutine_fn do_perform_cow(BlockDriverState *bs, + uint64_t src_cluster_offset, + uint64_t cluster_offset, + int offset_in_cluster, + int bytes) { BDRVQcow2State *s = bs->opaque; QEMUIOVector qiov; struct iovec iov; - int n, ret; - - n = n_end - n_start; - if (n <= 0) { - return 0; - } + int ret; - iov.iov_len = n * BDRV_SECTOR_SIZE; + iov.iov_len = bytes; iov.iov_base = qemu_try_blockalign(bs, iov.iov_len); if (iov.iov_base == NULL) { return -ENOMEM; @@ -423,17 +419,21 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, * interface. This avoids double I/O throttling and request tracking, * which can lead to deadlock when block layer copy-on-read is enabled. */ - ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov); + ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster, + bytes, &qiov, 0); if (ret < 0) { goto out; } if (bs->encrypted) { Error *err = NULL; + int64_t sector = (cluster_offset + offset_in_cluster) + >> BDRV_SECTOR_BITS; assert(s->cipher); - if (qcow2_encrypt_sectors(s, start_sect + n_start, - iov.iov_base, iov.iov_base, n, - true, &err) < 0) { + assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0); + assert((bytes & ~BDRV_SECTOR_MASK) == 0); + if (qcow2_encrypt_sectors(s, sector, iov.iov_base, iov.iov_base, + bytes >> BDRV_SECTOR_BITS, true, &err) < 0) { ret = -EIO; error_free(err); goto out; @@ -441,14 +441,14 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, } ret = qcow2_pre_write_overlap_check(bs, 0, - cluster_offset + n_start * BDRV_SECTOR_SIZE, n * BDRV_SECTOR_SIZE); + cluster_offset + offset_in_cluster, bytes); if (ret < 0) { goto out; } BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); - ret = bdrv_co_writev(bs->file->bs, (cluster_offset >> 9) + n_start, n, - &qiov); + ret = bdrv_co_pwritev(bs->file, cluster_offset + offset_in_cluster, + bytes, &qiov, 0); if (ret < 0) { goto out; } @@ -463,47 +463,43 @@ out: /* * get_cluster_offset * - * For a given offset of the disk image, find the cluster offset in - * qcow2 file. The offset is stored in *cluster_offset. + * For a given offset of the virtual disk, find the cluster type and offset in + * the qcow2 file. The offset is stored in *cluster_offset. * - * on entry, *num is the number of contiguous sectors we'd like to - * access following offset. + * On entry, *bytes is the maximum number of contiguous bytes starting at + * offset that we are interested in. * - * on exit, *num is the number of contiguous sectors we can read. + * On exit, *bytes is the number of bytes starting at offset that have the same + * cluster type and (if applicable) are stored contiguously in the image file. + * Compressed clusters are always returned one by one. * * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error * cases. 
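The sector-to-byte conversion in qcow2_get_cluster_offset() (next hunk) is easiest to check as plain arithmetic; a sketch of the entry-side accounting, with names as in the code:

/* Sketch: byte accounting at entry to qcow2_get_cluster_offset().
 * bytes_needed = caller's request plus the offset inside the first
 * cluster; bytes_available = distance to the end of the current L2
 * slice, measured from the start of that first cluster. */
offset_in_cluster = offset_into_cluster(s, offset);
bytes_needed = (uint64_t)*bytes + offset_in_cluster;
bytes_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1))
                  + offset_in_cluster;
if (bytes_needed > bytes_available) {
    bytes_needed = bytes_available;   /* clamp to this L2 table */
}
/* On the way out: *bytes = bytes_available - offset_in_cluster; */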
*/ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, - int *num, uint64_t *cluster_offset) + unsigned int *bytes, uint64_t *cluster_offset) { BDRVQcow2State *s = bs->opaque; unsigned int l2_index; uint64_t l1_index, l2_offset, *l2_table; int l1_bits, c; - unsigned int index_in_cluster, nb_clusters; - uint64_t nb_available, nb_needed; + unsigned int offset_in_cluster; + uint64_t bytes_available, bytes_needed, nb_clusters; int ret; - index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1); - nb_needed = *num + index_in_cluster; + offset_in_cluster = offset_into_cluster(s, offset); + bytes_needed = (uint64_t) *bytes + offset_in_cluster; l1_bits = s->l2_bits + s->cluster_bits; - /* compute how many bytes there are between the offset and - * the end of the l1 entry - */ - - nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1)); - - /* compute the number of available sectors */ + /* compute how many bytes there are between the start of the cluster + * containing offset and the end of the l1 entry */ + bytes_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1)) + + offset_in_cluster; - nb_available = (nb_available >> 9) + index_in_cluster; - - if (nb_needed > nb_available) { - nb_needed = nb_available; + if (bytes_needed > bytes_available) { + bytes_needed = bytes_available; } - assert(nb_needed <= INT_MAX); *cluster_offset = 0; @@ -540,8 +536,11 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); *cluster_offset = be64_to_cpu(l2_table[l2_index]); - /* nb_needed <= INT_MAX, thus nb_clusters <= INT_MAX, too */ - nb_clusters = size_to_clusters(s, nb_needed << 9); + nb_clusters = size_to_clusters(s, bytes_needed); + /* bytes_needed <= *bytes + offset_in_cluster, both of which are unsigned + * integers; the minimum cluster size is 512, so this assertion is always + * true */ + assert(nb_clusters <= INT_MAX); ret = qcow2_get_cluster_type(*cluster_offset); switch (ret) { @@ -588,13 +587,18 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - nb_available = (c * s->cluster_sectors); + bytes_available = (int64_t)c * s->cluster_size; out: - if (nb_available > nb_needed) - nb_available = nb_needed; + if (bytes_available > bytes_needed) { + bytes_available = bytes_needed; + } - *num = nb_available - index_in_cluster; + /* bytes_available <= bytes_needed <= *bytes + offset_in_cluster; + * subtracting offset_in_cluster will therefore definitely yield something + * not exceeding UINT_MAX */ + assert(bytes_available - offset_in_cluster <= UINT_MAX); + *bytes = bytes_available - offset_in_cluster; return ret; @@ -740,14 +744,12 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r) BDRVQcow2State *s = bs->opaque; int ret; - if (r->nb_sectors == 0) { + if (r->nb_bytes == 0) { return 0; } qemu_co_mutex_unlock(&s->lock); - ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset, - r->offset / BDRV_SECTOR_SIZE, - r->offset / BDRV_SECTOR_SIZE + r->nb_sectors); + ret = do_perform_cow(bs, m->offset, m->alloc_offset, r->offset, r->nb_bytes); qemu_co_mutex_lock(&s->lock); if (ret < 0) { @@ -809,13 +811,14 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) assert(l2_index + m->nb_clusters <= s->l2_size); for (i = 0; i < m->nb_clusters; i++) { /* if two concurrent writes happen to the same unallocated cluster - * each write allocates separate cluster and 
writes data concurrently. - * The first one to complete updates l2 table with pointer to its - * cluster the second one has to do RMW (which is done above by - * copy_sectors()), update l2 table with its cluster pointer and free - * old cluster. This is what this loop does */ - if(l2_table[l2_index + i] != 0) + * each write allocates separate cluster and writes data concurrently. + * The first one to complete updates l2 table with pointer to its + * cluster the second one has to do RMW (which is done above by + * perform_cow()), update l2 table with its cluster pointer and free + * old cluster. This is what this loop does */ + if (l2_table[l2_index + i] != 0) { old_cluster[j++] = l2_table[l2_index + i]; + } l2_table[l2_index + i] = cpu_to_be64((cluster_offset + (i << s->cluster_bits)) | QCOW_OFLAG_COPIED); @@ -1197,25 +1200,20 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, /* * Save info needed for meta data update. * - * requested_sectors: Number of sectors from the start of the first + * requested_bytes: Number of bytes from the start of the first * newly allocated cluster to the end of the (possibly shortened * before) write request. * - * avail_sectors: Number of sectors from the start of the first + * avail_bytes: Number of bytes from the start of the first * newly allocated to the end of the last newly allocated cluster. * - * nb_sectors: The number of sectors from the start of the first + * nb_bytes: The number of bytes from the start of the first * newly allocated cluster to the end of the area that the write * request actually writes to (excluding COW at the end) */ - int requested_sectors = - (*bytes + offset_into_cluster(s, guest_offset)) - >> BDRV_SECTOR_BITS; - int avail_sectors = nb_clusters - << (s->cluster_bits - BDRV_SECTOR_BITS); - int alloc_n_start = offset_into_cluster(s, guest_offset) - >> BDRV_SECTOR_BITS; - int nb_sectors = MIN(requested_sectors, avail_sectors); + uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset); + int avail_bytes = MIN(INT_MAX, nb_clusters << s->cluster_bits); + int nb_bytes = MIN(requested_bytes, avail_bytes); QCowL2Meta *old_m = *m; *m = g_malloc0(sizeof(**m)); @@ -1226,23 +1224,21 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, .alloc_offset = alloc_cluster_offset, .offset = start_of_cluster(s, guest_offset), .nb_clusters = nb_clusters, - .nb_available = nb_sectors, .cow_start = { .offset = 0, - .nb_sectors = alloc_n_start, + .nb_bytes = offset_into_cluster(s, guest_offset), }, .cow_end = { - .offset = nb_sectors * BDRV_SECTOR_SIZE, - .nb_sectors = avail_sectors - nb_sectors, + .offset = nb_bytes, + .nb_bytes = avail_bytes - nb_bytes, }, }; qemu_co_queue_init(&(*m)->dependent_requests); QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight); *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset); - *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE) - - offset_into_cluster(s, guest_offset)); + *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset)); assert(*bytes != 0); return 1; @@ -1274,7 +1270,8 @@ fail: * Return 0 on success and -errno in error cases */ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, - int *num, uint64_t *host_offset, QCowL2Meta **m) + unsigned int *bytes, uint64_t *host_offset, + QCowL2Meta **m) { BDRVQcow2State *s = bs->opaque; uint64_t start, remaining; @@ -1282,13 +1279,11 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, uint64_t cur_bytes; int ret; - 
trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *num); - - assert((offset & ~BDRV_SECTOR_MASK) == 0); + trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes); again: start = offset; - remaining = (uint64_t)*num << BDRV_SECTOR_BITS; + remaining = *bytes; cluster_offset = 0; *host_offset = 0; cur_bytes = 0; @@ -1374,8 +1369,8 @@ again: } } - *num -= remaining >> BDRV_SECTOR_BITS; - assert(*num > 0); + *bytes -= remaining; + assert(*bytes > 0); assert(*host_offset != 0); return 0; @@ -1420,7 +1415,7 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) sector_offset = coffset & 511; csize = nb_csectors * 512 - sector_offset; BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); - ret = bdrv_read(bs->file->bs, coffset >> 9, s->cluster_data, + ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, nb_csectors); if (ret < 0) { return ret; @@ -1689,7 +1684,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, (void **)&l2_table); } else { /* load inactive L2 tables from disk */ - ret = bdrv_read(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE, + ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE, (void *)l2_table, s->cluster_sectors); } if (ret < 0) { @@ -1764,8 +1759,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, goto fail; } - ret = bdrv_write_zeroes(bs->file->bs, offset / BDRV_SECTOR_SIZE, - s->cluster_sectors, 0); + ret = bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0); if (ret < 0) { if (!preallocated) { qcow2_free_clusters(bs, offset, s->cluster_size, @@ -1797,7 +1791,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, goto fail; } - ret = bdrv_write(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE, + ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE, (void *)l2_table, s->cluster_sectors); if (ret < 0) { goto fail; @@ -1867,12 +1861,12 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs, } for (i = 0; i < s->nb_snapshots; i++) { - int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) + - BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE; + int l1_sectors = DIV_ROUND_UP(s->snapshots[i].l1_size * + sizeof(uint64_t), BDRV_SECTOR_SIZE); l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE); - ret = bdrv_read(bs->file->bs, + ret = bdrv_read(bs->file, s->snapshots[i].l1_table_offset / BDRV_SECTOR_SIZE, (void *)l1_table, l1_sectors); if (ret < 0) { diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index ca6094ff5..cbfb3fe06 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -28,6 +28,7 @@ #include "block/block_int.h" #include "block/qcow2.h" #include "qemu/range.h" +#include "qemu/bswap.h" static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size); static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, @@ -103,7 +104,7 @@ int qcow2_refcount_init(BlockDriverState *bs) goto fail; } BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD); - ret = bdrv_pread(bs->file->bs, s->refcount_table_offset, + ret = bdrv_pread(bs->file, s->refcount_table_offset, s->refcount_table, refcount_table_size2); if (ret < 0) { goto fail; @@ -217,13 +218,10 @@ static int load_refcount_block(BlockDriverState *bs, void **refcount_block) { BDRVQcow2State *s = bs->opaque; - int ret; BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD); - ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, - refcount_block); - - return ret; + return qcow2_cache_get(bs, s->refcount_block_cache, 
refcount_block_offset, + refcount_block); } /* @@ -433,7 +431,7 @@ static int alloc_refcount_block(BlockDriverState *bs, if (refcount_table_index < s->refcount_table_size) { uint64_t data64 = cpu_to_be64(new_block); BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP); - ret = bdrv_pwrite_sync(bs->file->bs, + ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset + refcount_table_index * sizeof(uint64_t), &data64, sizeof(data64)); if (ret < 0) { @@ -489,14 +487,12 @@ static int alloc_refcount_block(BlockDriverState *bs, uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t)); blocks_clusters = 1 + - ((table_clusters + s->refcount_block_size - 1) - / s->refcount_block_size); + DIV_ROUND_UP(table_clusters, s->refcount_block_size); uint64_t meta_clusters = table_clusters + blocks_clusters; last_table_size = table_size; table_size = next_refcount_table_size(s, blocks_used + - ((meta_clusters + s->refcount_block_size - 1) - / s->refcount_block_size)); + DIV_ROUND_UP(meta_clusters, s->refcount_block_size)); } while (last_table_size != table_size); @@ -537,7 +533,7 @@ static int alloc_refcount_block(BlockDriverState *bs, /* Write refcount blocks to disk */ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS); - ret = bdrv_pwrite_sync(bs->file->bs, meta_offset, new_blocks, + ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks, blocks_clusters * s->cluster_size); g_free(new_blocks); new_blocks = NULL; @@ -551,7 +547,7 @@ static int alloc_refcount_block(BlockDriverState *bs, } BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE); - ret = bdrv_pwrite_sync(bs->file->bs, table_offset, new_table, + ret = bdrv_pwrite_sync(bs->file, table_offset, new_table, table_size * sizeof(uint64_t)); if (ret < 0) { goto fail_table; @@ -566,10 +562,10 @@ static int alloc_refcount_block(BlockDriverState *bs, uint64_t d64; uint32_t d32; } data; - cpu_to_be64w(&data.d64, table_offset); - cpu_to_be32w(&data.d32, table_clusters); + data.d64 = cpu_to_be64(table_offset); + data.d32 = cpu_to_be32(table_clusters); BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE); - ret = bdrv_pwrite_sync(bs->file->bs, + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset), &data, sizeof(data)); if (ret < 0) { @@ -619,9 +615,7 @@ void qcow2_process_discards(BlockDriverState *bs, int ret) /* Discard is optional, ignore the return value */ if (ret >= 0) { - bdrv_discard(bs->file->bs, - d->offset >> BDRV_SECTOR_BITS, - d->bytes >> BDRV_SECTOR_BITS); + bdrv_pdiscard(bs->file->bs, d->offset, d->bytes); } g_free(d); @@ -1074,7 +1068,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, } l1_allocated = true; - ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2); + ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); if (ret < 0) { goto fail; } @@ -1227,7 +1221,7 @@ fail: cpu_to_be64s(&l1_table[i]); } - ret = bdrv_pwrite_sync(bs->file->bs, l1_table_offset, + ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2); for (i = 0; i < l1_size; i++) { @@ -1386,7 +1380,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, l2_size = s->l2_size * sizeof(uint64_t); l2_table = g_malloc(l2_size); - ret = bdrv_pread(bs->file->bs, l2_offset, l2_table, l2_size); + ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size); if (ret < 0) { fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n"); res->check_errors++; @@ -1518,7 +1512,7 @@ static int check_refcounts_l1(BlockDriverState *bs, res->check_errors++; goto fail; 
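Several hunks in this file (and in qed-check.c and qed.c further down) replace open-coded round-up division with QEMU's DIV_ROUND_UP macro from qemu/osdep.h; the two spellings compute the same value:

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    /* e.g. from alloc_refcount_block() above: */
    blocks_clusters = 1 + DIV_ROUND_UP(table_clusters, s->refcount_block_size);
    /* previously: 1 + ((table_clusters + s->refcount_block_size - 1)
     *                  / s->refcount_block_size) */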
} - ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2); + ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); if (ret < 0) { fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); res->check_errors++; @@ -1616,7 +1610,7 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, } } - ret = bdrv_pread(bs->file->bs, l2_offset, l2_table, + ret = bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)); if (ret < 0) { fprintf(stderr, "ERROR: Could not read L2 table: %s\n", @@ -1668,7 +1662,7 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, goto fail; } - ret = bdrv_pwrite(bs->file->bs, l2_offset, l2_table, + ret = bdrv_pwrite(bs->file, l2_offset, l2_table, s->cluster_size); if (ret < 0) { fprintf(stderr, "ERROR: Could not write L2 table: %s\n", @@ -2102,7 +2096,7 @@ write_refblocks: on_disk_refblock = (void *)((char *) *refcount_table + refblock_index * s->cluster_size); - ret = bdrv_write(bs->file->bs, refblock_offset / BDRV_SECTOR_SIZE, + ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE, on_disk_refblock, s->cluster_sectors); if (ret < 0) { fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); @@ -2151,7 +2145,7 @@ write_refblocks: } assert(reftable_size < INT_MAX / sizeof(uint64_t)); - ret = bdrv_pwrite(bs->file->bs, reftable_offset, on_disk_reftable, + ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable, reftable_size * sizeof(uint64_t)); if (ret < 0) { fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); @@ -2159,12 +2153,11 @@ write_refblocks: } /* Enter new reftable into the image header */ - cpu_to_be64w(&reftable_offset_and_clusters.reftable_offset, - reftable_offset); - cpu_to_be32w(&reftable_offset_and_clusters.reftable_clusters, - size_to_clusters(s, reftable_size * sizeof(uint64_t))); - ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, - refcount_table_offset), + reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset); + reftable_offset_and_clusters.reftable_clusters = + cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t))); + ret = bdrv_pwrite_sync(bs->file, + offsetof(QCowHeader, refcount_table_offset), &reftable_offset_and_clusters, sizeof(reftable_offset_and_clusters)); if (ret < 0) { @@ -2411,7 +2404,7 @@ int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, return -ENOMEM; } - ret = bdrv_pread(bs->file->bs, l1_ofs, l1, l1_sz2); + ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2); if (ret < 0) { g_free(l1); return ret; @@ -2564,7 +2557,7 @@ static int flush_refblock(BlockDriverState *bs, uint64_t **reftable, return ret; } - ret = bdrv_pwrite(bs->file->bs, offset, refblock, s->cluster_size); + ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size); if (ret < 0) { error_setg_errno(errp, -ret, "Failed to write refblock"); return ret; @@ -2834,7 +2827,7 @@ int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order, cpu_to_be64s(&new_reftable[i]); } - ret = bdrv_pwrite(bs->file->bs, new_reftable_offset, new_reftable, + ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable, new_reftable_size * sizeof(uint64_t)); for (i = 0; i < new_reftable_size; i++) { diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 5f4a17e47..032424322 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -26,6 +26,7 @@ #include "qapi/error.h" #include "block/block_int.h" #include "block/qcow2.h" +#include "qemu/bswap.h" #include 
"qemu/error-report.h" #include "qemu/cutils.h" @@ -66,7 +67,7 @@ int qcow2_read_snapshots(BlockDriverState *bs) for(i = 0; i < s->nb_snapshots; i++) { /* Read statically sized part of the snapshot header */ offset = align_offset(offset, 8); - ret = bdrv_pread(bs->file->bs, offset, &h, sizeof(h)); + ret = bdrv_pread(bs->file, offset, &h, sizeof(h)); if (ret < 0) { goto fail; } @@ -85,7 +86,7 @@ int qcow2_read_snapshots(BlockDriverState *bs) name_size = be16_to_cpu(h.name_size); /* Read extra data */ - ret = bdrv_pread(bs->file->bs, offset, &extra, + ret = bdrv_pread(bs->file, offset, &extra, MIN(sizeof(extra), extra_data_size)); if (ret < 0) { goto fail; @@ -104,7 +105,7 @@ int qcow2_read_snapshots(BlockDriverState *bs) /* Read snapshot ID */ sn->id_str = g_malloc(id_str_size + 1); - ret = bdrv_pread(bs->file->bs, offset, sn->id_str, id_str_size); + ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size); if (ret < 0) { goto fail; } @@ -113,7 +114,7 @@ int qcow2_read_snapshots(BlockDriverState *bs) /* Read snapshot name */ sn->name = g_malloc(name_size + 1); - ret = bdrv_pread(bs->file->bs, offset, sn->name, name_size); + ret = bdrv_pread(bs->file, offset, sn->name, name_size); if (ret < 0) { goto fail; } @@ -216,25 +217,25 @@ static int qcow2_write_snapshots(BlockDriverState *bs) h.name_size = cpu_to_be16(name_size); offset = align_offset(offset, 8); - ret = bdrv_pwrite(bs->file->bs, offset, &h, sizeof(h)); + ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h)); if (ret < 0) { goto fail; } offset += sizeof(h); - ret = bdrv_pwrite(bs->file->bs, offset, &extra, sizeof(extra)); + ret = bdrv_pwrite(bs->file, offset, &extra, sizeof(extra)); if (ret < 0) { goto fail; } offset += sizeof(extra); - ret = bdrv_pwrite(bs->file->bs, offset, sn->id_str, id_str_size); + ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size); if (ret < 0) { goto fail; } offset += id_str_size; - ret = bdrv_pwrite(bs->file->bs, offset, sn->name, name_size); + ret = bdrv_pwrite(bs->file, offset, sn->name, name_size); if (ret < 0) { goto fail; } @@ -256,7 +257,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs) header_data.nb_snapshots = cpu_to_be32(s->nb_snapshots); header_data.snapshots_offset = cpu_to_be64(snapshots_offset); - ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, nb_snapshots), + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots), &header_data, sizeof(header_data)); if (ret < 0) { goto fail; @@ -398,7 +399,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) goto fail; } - ret = bdrv_pwrite(bs->file->bs, sn->l1_table_offset, l1_table, + ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table, s->l1_size * sizeof(uint64_t)); if (ret < 0) { goto fail; @@ -511,7 +512,7 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) goto fail; } - ret = bdrv_pread(bs->file->bs, sn->l1_table_offset, + ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes); if (ret < 0) { goto fail; @@ -529,7 +530,7 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) goto fail; } - ret = bdrv_pwrite_sync(bs->file->bs, s->l1_table_offset, sn_l1_table, + ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table, cur_l1_bytes); if (ret < 0) { goto fail; @@ -715,7 +716,7 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs, return -ENOMEM; } - ret = bdrv_pread(bs->file->bs, sn->l1_table_offset, + ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes); if (ret < 0) { error_setg(errp, "Failed 
to read l1 table for snapshot"); diff --git a/block/qcow2.c b/block/qcow2.c index 470734be9..91ef4dfef 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -36,6 +36,7 @@ #include "trace.h" #include "qemu/option_int.h" #include "qemu/cutils.h" +#include "qemu/bswap.h" /* Differences with QCOW: @@ -106,7 +107,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, printf("attempting to read extended header in offset %lu\n", offset); #endif - ret = bdrv_pread(bs->file->bs, offset, &ext, sizeof(ext)); + ret = bdrv_pread(bs->file, offset, &ext, sizeof(ext)); if (ret < 0) { error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: " "pread fail from offset %" PRIu64, offset); @@ -134,7 +135,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, sizeof(bs->backing_format)); return 2; } - ret = bdrv_pread(bs->file->bs, offset, bs->backing_format, ext.len); + ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len); if (ret < 0) { error_setg_errno(errp, -ret, "ERROR: ext_backing_format: " "Could not read format name"); @@ -150,7 +151,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, case QCOW2_EXT_MAGIC_FEATURE_TABLE: if (p_feature_table != NULL) { void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature)); - ret = bdrv_pread(bs->file->bs, offset , feature_table, ext.len); + ret = bdrv_pread(bs->file, offset , feature_table, ext.len); if (ret < 0) { error_setg_errno(errp, -ret, "ERROR: ext_feature_table: " "Could not read table"); @@ -171,7 +172,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, uext->len = ext.len; QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next); - ret = bdrv_pread(bs->file->bs, offset , uext->data, uext->len); + ret = bdrv_pread(bs->file, offset , uext->data, uext->len); if (ret < 0) { error_setg_errno(errp, -ret, "ERROR: unknown extension: " "Could not read data"); @@ -248,7 +249,7 @@ int qcow2_mark_dirty(BlockDriverState *bs) } val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY); - ret = bdrv_pwrite(bs->file->bs, offsetof(QCowHeader, incompatible_features), + ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features), &val, sizeof(val)); if (ret < 0) { return ret; @@ -816,7 +817,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, uint64_t ext_end; uint64_t l1_vm_state_index; - ret = bdrv_pread(bs->file->bs, 0, &header, sizeof(header)); + ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read qcow2 header"); goto fail; @@ -891,7 +892,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, if (header.header_length > sizeof(header)) { s->unknown_header_fields_size = header.header_length - sizeof(header); s->unknown_header_fields = g_malloc(s->unknown_header_fields_size); - ret = bdrv_pread(bs->file->bs, sizeof(header), s->unknown_header_fields, + ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields, s->unknown_header_fields_size); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read unknown qcow2 header " @@ -967,13 +968,19 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, if (s->crypt_method_header) { if (bdrv_uses_whitelist() && s->crypt_method_header == QCOW_CRYPT_AES) { - error_report("qcow2 built-in AES encryption is deprecated"); - error_printf("Support for it will be removed in a future release.\n" - "You can use 'qemu-img convert' to switch to an\n" - 
"unencrypted qcow2 image, or a LUKS raw image.\n"); + error_setg(errp, + "Use of AES-CBC encrypted qcow2 images is no longer " + "supported in system emulators"); + error_append_hint(errp, + "You can use 'qemu-img convert' to convert your " + "image to an alternative supported format, such " + "as unencrypted qcow2, or raw with the LUKS " + "format instead.\n"); + ret = -ENOSYS; + goto fail; } - bs->encrypted = 1; + bs->encrypted = true; } s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */ @@ -1059,7 +1066,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, ret = -ENOMEM; goto fail; } - ret = bdrv_pread(bs->file->bs, s->l1_table_offset, s->l1_table, + ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read L1 table"); @@ -1115,7 +1122,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, ret = -EINVAL; goto fail; } - ret = bdrv_pread(bs->file->bs, header.backing_file_offset, + ret = bdrv_pread(bs->file, header.backing_file_offset, bs->backing_file, len); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read backing file name"); @@ -1192,7 +1199,11 @@ static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp) { BDRVQcow2State *s = bs->opaque; - bs->bl.write_zeroes_alignment = s->cluster_sectors; + if (bs->encrypted) { + /* Encryption works on a sector granularity */ + bs->bl.request_alignment = BDRV_SECTOR_SIZE; + } + bs->bl.pwrite_zeroes_alignment = s->cluster_size; } static int qcow2_set_key(BlockDriverState *bs, const char *key) @@ -1330,16 +1341,20 @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, BDRVQcow2State *s = bs->opaque; uint64_t cluster_offset; int index_in_cluster, ret; + unsigned int bytes; int64_t status = 0; - *pnum = nb_sectors; + bytes = MIN(INT_MAX, nb_sectors * BDRV_SECTOR_SIZE); qemu_co_mutex_lock(&s->lock); - ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset); + ret = qcow2_get_cluster_offset(bs, sector_num << 9, &bytes, + &cluster_offset); qemu_co_mutex_unlock(&s->lock); if (ret < 0) { return ret; } + *pnum = bytes >> BDRV_SECTOR_BITS; + if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED && !s->cipher) { index_in_cluster = sector_num & (s->cluster_sectors - 1); @@ -1357,28 +1372,34 @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, /* handle reading after the end of the backing file */ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, - int64_t sector_num, int nb_sectors) + int64_t offset, int bytes) { + uint64_t bs_size = bs->total_sectors * BDRV_SECTOR_SIZE; int n1; - if ((sector_num + nb_sectors) <= bs->total_sectors) - return nb_sectors; - if (sector_num >= bs->total_sectors) + + if ((offset + bytes) <= bs_size) { + return bytes; + } + + if (offset >= bs_size) { n1 = 0; - else - n1 = bs->total_sectors - sector_num; + } else { + n1 = bs_size - offset; + } - qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1)); + qemu_iovec_memset(qiov, n1, 0, bytes - n1); return n1; } -static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, - int remaining_sectors, QEMUIOVector *qiov) +static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + int flags) { BDRVQcow2State *s = bs->opaque; - int index_in_cluster, n1; + int offset_in_cluster, n1; int ret; - int cur_nr_sectors; /* number of sectors in current iteration 
*/ + unsigned int cur_bytes; /* number of bytes in current iteration */ uint64_t cluster_offset = 0; uint64_t bytes_done = 0; QEMUIOVector hd_qiov; @@ -1388,26 +1409,24 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, qemu_co_mutex_lock(&s->lock); - while (remaining_sectors != 0) { + while (bytes != 0) { /* prepare next request */ - cur_nr_sectors = remaining_sectors; + cur_bytes = MIN(bytes, INT_MAX); if (s->cipher) { - cur_nr_sectors = MIN(cur_nr_sectors, - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); + cur_bytes = MIN(cur_bytes, + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); } - ret = qcow2_get_cluster_offset(bs, sector_num << 9, - &cur_nr_sectors, &cluster_offset); + ret = qcow2_get_cluster_offset(bs, offset, &cur_bytes, &cluster_offset); if (ret < 0) { goto fail; } - index_in_cluster = sector_num & (s->cluster_sectors - 1); + offset_in_cluster = offset_into_cluster(s, offset); qemu_iovec_reset(&hd_qiov); - qemu_iovec_concat(&hd_qiov, qiov, bytes_done, - cur_nr_sectors * 512); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes); switch (ret) { case QCOW2_CLUSTER_UNALLOCATED: @@ -1415,18 +1434,17 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, if (bs->backing) { /* read from the base image */ n1 = qcow2_backing_read1(bs->backing->bs, &hd_qiov, - sector_num, cur_nr_sectors); + offset, cur_bytes); if (n1 > 0) { QEMUIOVector local_qiov; qemu_iovec_init(&local_qiov, hd_qiov.niov); - qemu_iovec_concat(&local_qiov, &hd_qiov, 0, - n1 * BDRV_SECTOR_SIZE); + qemu_iovec_concat(&local_qiov, &hd_qiov, 0, n1); BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->backing->bs, sector_num, - n1, &local_qiov); + ret = bdrv_co_preadv(bs->backing, offset, n1, + &local_qiov, 0); qemu_co_mutex_lock(&s->lock); qemu_iovec_destroy(&local_qiov); @@ -1437,12 +1455,12 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, } } else { /* Note: in this case, no need to wait */ - qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); + qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes); } break; case QCOW2_CLUSTER_ZERO: - qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); + qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes); break; case QCOW2_CLUSTER_COMPRESSED: @@ -1453,8 +1471,8 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, } qemu_iovec_from_buf(&hd_qiov, 0, - s->cluster_cache + index_in_cluster * 512, - 512 * cur_nr_sectors); + s->cluster_cache + offset_in_cluster, + cur_bytes); break; case QCOW2_CLUSTER_NORMAL: @@ -1481,34 +1499,34 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, } } - assert(cur_nr_sectors <= - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); + assert(cur_bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); qemu_iovec_reset(&hd_qiov); - qemu_iovec_add(&hd_qiov, cluster_data, - 512 * cur_nr_sectors); + qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes); } BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->file->bs, - (cluster_offset >> 9) + index_in_cluster, - cur_nr_sectors, &hd_qiov); + ret = bdrv_co_preadv(bs->file, + cluster_offset + offset_in_cluster, + cur_bytes, &hd_qiov, 0); qemu_co_mutex_lock(&s->lock); if (ret < 0) { goto fail; } if (bs->encrypted) { assert(s->cipher); + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0); Error *err = NULL; - if 
(qcow2_encrypt_sectors(s, sector_num, cluster_data, - cluster_data, cur_nr_sectors, false, - &err) < 0) { + if (qcow2_encrypt_sectors(s, offset >> BDRV_SECTOR_BITS, + cluster_data, cluster_data, + cur_bytes >> BDRV_SECTOR_BITS, + false, &err) < 0) { error_free(err); ret = -EIO; goto fail; } - qemu_iovec_from_buf(qiov, bytes_done, - cluster_data, 512 * cur_nr_sectors); + qemu_iovec_from_buf(qiov, bytes_done, cluster_data, cur_bytes); } break; @@ -1518,9 +1536,9 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, goto fail; } - remaining_sectors -= cur_nr_sectors; - sector_num += cur_nr_sectors; - bytes_done += cur_nr_sectors * 512; + bytes -= cur_bytes; + offset += cur_bytes; + bytes_done += cur_bytes; } ret = 0; @@ -1533,23 +1551,21 @@ fail: return ret; } -static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, - int64_t sector_num, - int remaining_sectors, - QEMUIOVector *qiov) +static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + int flags) { BDRVQcow2State *s = bs->opaque; - int index_in_cluster; + int offset_in_cluster; int ret; - int cur_nr_sectors; /* number of sectors in current iteration */ + unsigned int cur_bytes; /* number of sectors in current iteration */ uint64_t cluster_offset; QEMUIOVector hd_qiov; uint64_t bytes_done = 0; uint8_t *cluster_data = NULL; QCowL2Meta *l2meta = NULL; - trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num, - remaining_sectors); + trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes); qemu_iovec_init(&hd_qiov, qiov->niov); @@ -1557,22 +1573,21 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, qemu_co_mutex_lock(&s->lock); - while (remaining_sectors != 0) { + while (bytes != 0) { l2meta = NULL; trace_qcow2_writev_start_part(qemu_coroutine_self()); - index_in_cluster = sector_num & (s->cluster_sectors - 1); - cur_nr_sectors = remaining_sectors; - if (bs->encrypted && - cur_nr_sectors > - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster) { - cur_nr_sectors = - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster; + offset_in_cluster = offset_into_cluster(s, offset); + cur_bytes = MIN(bytes, INT_MAX); + if (bs->encrypted) { + cur_bytes = MIN(cur_bytes, + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size + - offset_in_cluster); } - ret = qcow2_alloc_cluster_offset(bs, sector_num << 9, - &cur_nr_sectors, &cluster_offset, &l2meta); + ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes, + &cluster_offset, &l2meta); if (ret < 0) { goto fail; } @@ -1580,8 +1595,7 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, assert((cluster_offset & 511) == 0); qemu_iovec_reset(&hd_qiov); - qemu_iovec_concat(&hd_qiov, qiov, bytes_done, - cur_nr_sectors * 512); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes); if (bs->encrypted) { Error *err = NULL; @@ -1600,8 +1614,9 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size); - if (qcow2_encrypt_sectors(s, sector_num, cluster_data, - cluster_data, cur_nr_sectors, + if (qcow2_encrypt_sectors(s, offset >> BDRV_SECTOR_BITS, + cluster_data, cluster_data, + cur_bytes >>BDRV_SECTOR_BITS, true, &err) < 0) { error_free(err); ret = -EIO; @@ -1609,13 +1624,11 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, } qemu_iovec_reset(&hd_qiov); - qemu_iovec_add(&hd_qiov, cluster_data, - cur_nr_sectors * 512); + 
qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes); } ret = qcow2_pre_write_overlap_check(bs, 0, - cluster_offset + index_in_cluster * BDRV_SECTOR_SIZE, - cur_nr_sectors * BDRV_SECTOR_SIZE); + cluster_offset + offset_in_cluster, cur_bytes); if (ret < 0) { goto fail; } @@ -1623,10 +1636,10 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, qemu_co_mutex_unlock(&s->lock); BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); trace_qcow2_writev_data(qemu_coroutine_self(), - (cluster_offset >> 9) + index_in_cluster); - ret = bdrv_co_writev(bs->file->bs, - (cluster_offset >> 9) + index_in_cluster, - cur_nr_sectors, &hd_qiov); + cluster_offset + offset_in_cluster); + ret = bdrv_co_pwritev(bs->file, + cluster_offset + offset_in_cluster, + cur_bytes, &hd_qiov, 0); qemu_co_mutex_lock(&s->lock); if (ret < 0) { goto fail; @@ -1652,10 +1665,10 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, l2meta = next; } - remaining_sectors -= cur_nr_sectors; - sector_num += cur_nr_sectors; - bytes_done += cur_nr_sectors * 512; - trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors); + bytes -= cur_bytes; + offset += cur_bytes; + bytes_done += cur_bytes; + trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes); } ret = 0; @@ -1757,13 +1770,6 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) qcow2_close(bs); - bdrv_invalidate_cache(bs->file->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - bs->drv = NULL; - return; - } - memset(s, 0, sizeof(BDRVQcow2State)); options = qdict_clone_shallow(bs->options); @@ -1970,7 +1976,7 @@ int qcow2_update_header(BlockDriverState *bs) } /* Write the new header */ - ret = bdrv_pwrite(bs->file->bs, 0, header, s->cluster_size); + ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size); if (ret < 0) { goto fail; } @@ -2004,19 +2010,19 @@ static int qcow2_change_backing_file(BlockDriverState *bs, static int preallocate(BlockDriverState *bs) { - uint64_t nb_sectors; + uint64_t bytes; uint64_t offset; uint64_t host_offset = 0; - int num; + unsigned int cur_bytes; int ret; QCowL2Meta *meta; - nb_sectors = bdrv_nb_sectors(bs); + bytes = bdrv_getlength(bs); offset = 0; - while (nb_sectors) { - num = MIN(nb_sectors, INT_MAX >> BDRV_SECTOR_BITS); - ret = qcow2_alloc_cluster_offset(bs, offset, &num, + while (bytes) { + cur_bytes = MIN(bytes, INT_MAX); + ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes, &host_offset, &meta); if (ret < 0) { return ret; @@ -2042,8 +2048,8 @@ static int preallocate(BlockDriverState *bs) /* TODO Preallocate data if requested */ - nb_sectors -= num; - offset += num << BDRV_SECTOR_BITS; + bytes -= cur_bytes; + offset += cur_bytes; } /* @@ -2052,11 +2058,9 @@ static int preallocate(BlockDriverState *bs) * EOF). Extend the image to the last allocated sector. 
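The preallocate() code that follows implements the trick this comment describes: metadata preallocation allocates clusters without growing the image file, so writing a single zero byte at the last allocated offset forces the file out to its full size. As a standalone sketch (extend_file_to is a hypothetical helper name, assuming QEMU's byte-based bdrv_pwrite() on a BdrvChild):

    static int extend_file_to(BdrvChild *file, uint64_t end)
    {
        uint8_t data = 0;
        /* writing one byte at end - 1 grows the file to `end` bytes */
        return bdrv_pwrite(file, end - 1, &data, 1);
    }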
*/ if (host_offset != 0) { - uint8_t buf[BDRV_SECTOR_SIZE]; - memset(buf, 0, BDRV_SECTOR_SIZE); - ret = bdrv_write(bs->file->bs, - (host_offset >> BDRV_SECTOR_BITS) + num - 1, - buf, 1); + uint8_t data = 0; + ret = bdrv_pwrite(bs->file, (host_offset + cur_bytes) - 1, + &data, 1); if (ret < 0) { return ret; } @@ -2207,7 +2211,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); } - ret = blk_pwrite(blk, 0, header, cluster_size); + ret = blk_pwrite(blk, 0, header, cluster_size, 0); g_free(header); if (ret < 0) { error_setg_errno(errp, -ret, "Could not write qcow2 header"); @@ -2217,7 +2221,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, /* Write a refcount table with one refcount block */ refcount_table = g_malloc0(2 * cluster_size); refcount_table[0] = cpu_to_be64(2 * cluster_size); - ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size); + ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size, 0); g_free(refcount_table); if (ret < 0) { @@ -2400,9 +2404,7 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags, cluster_size, prealloc, opts, version, refcount_order, &local_err); - if (local_err) { - error_propagate(errp, local_err); - } + error_propagate(errp, local_err); finish: g_free(backing_file); @@ -2411,35 +2413,81 @@ finish: return ret; } -static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) + +static bool is_zero_sectors(BlockDriverState *bs, int64_t start, + uint32_t count) +{ + int nr; + BlockDriverState *file; + int64_t res; + + if (!count) { + return true; + } + res = bdrv_get_block_status_above(bs, NULL, start, count, + &nr, &file); + return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == count; +} + +static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, BdrvRequestFlags flags) { int ret; BDRVQcow2State *s = bs->opaque; - /* Emulate misaligned zero writes */ - if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) { - return -ENOTSUP; + uint32_t head = offset % s->cluster_size; + uint32_t tail = (offset + count) % s->cluster_size; + + trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, count); + + if (head || tail) { + int64_t cl_start = (offset - head) >> BDRV_SECTOR_BITS; + uint64_t off; + unsigned int nr; + + assert(head + count <= s->cluster_size); + + /* check whether remainder of cluster already reads as zero */ + if (!(is_zero_sectors(bs, cl_start, + DIV_ROUND_UP(head, BDRV_SECTOR_SIZE)) && + is_zero_sectors(bs, (offset + count) >> BDRV_SECTOR_BITS, + DIV_ROUND_UP(-tail & (s->cluster_size - 1), + BDRV_SECTOR_SIZE)))) { + return -ENOTSUP; + } + + qemu_co_mutex_lock(&s->lock); + /* We can have new write after previous check */ + offset = cl_start << BDRV_SECTOR_BITS; + count = s->cluster_size; + nr = s->cluster_size; + ret = qcow2_get_cluster_offset(bs, offset, &nr, &off); + if (ret != QCOW2_CLUSTER_UNALLOCATED && ret != QCOW2_CLUSTER_ZERO) { + qemu_co_mutex_unlock(&s->lock); + return -ENOTSUP; + } + } else { + qemu_co_mutex_lock(&s->lock); } + trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, count); + /* Whatever is left can use real zero clusters */ - qemu_co_mutex_lock(&s->lock); - ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors); + ret = qcow2_zero_clusters(bs, offset, count >> 
BDRV_SECTOR_BITS); qemu_co_mutex_unlock(&s->lock); return ret; } -static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) +static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs, + int64_t offset, int count) { int ret; BDRVQcow2State *s = bs->opaque; qemu_co_mutex_lock(&s->lock); - ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors, QCOW2_DISCARD_REQUEST, false); + ret = qcow2_discard_clusters(bs, offset, count >> BDRV_SECTOR_BITS, + QCOW2_DISCARD_REQUEST, false); qemu_co_mutex_unlock(&s->lock); return ret; } @@ -2475,7 +2523,7 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset) /* write updated header.size */ offset = cpu_to_be64(offset); - ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, size), + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size), &offset, sizeof(uint64_t)); if (ret < 0) { return ret; @@ -2485,6 +2533,51 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset) return 0; } +typedef struct Qcow2WriteCo { + BlockDriverState *bs; + int64_t sector_num; + const uint8_t *buf; + int nb_sectors; + int ret; +} Qcow2WriteCo; + +static void qcow2_write_co_entry(void *opaque) +{ + Qcow2WriteCo *co = opaque; + QEMUIOVector qiov; + uint64_t offset = co->sector_num * BDRV_SECTOR_SIZE; + uint64_t bytes = co->nb_sectors * BDRV_SECTOR_SIZE; + + struct iovec iov = (struct iovec) { + .iov_base = (uint8_t*) co->buf, + .iov_len = bytes, + }; + qemu_iovec_init_external(&qiov, &iov, 1); + + co->ret = qcow2_co_pwritev(co->bs, offset, bytes, &qiov, 0); +} + +/* Wrapper for non-coroutine contexts */ +static int qcow2_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + Coroutine *co; + AioContext *aio_context = bdrv_get_aio_context(bs); + Qcow2WriteCo data = { + .bs = bs, + .sector_num = sector_num, + .buf = buf, + .nb_sectors = nb_sectors, + .ret = -EINPROGRESS, + }; + co = qemu_coroutine_create(qcow2_write_co_entry, &data); + qemu_coroutine_enter(co); + while (data.ret == -EINPROGRESS) { + aio_poll(aio_context, true); + } + return data.ret; +} + /* XXX: put compressed sectors first, then all the cluster aligned tables to avoid losing bytes in alignment */ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, @@ -2519,7 +2612,7 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, return ret; } - out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + out_buf = g_malloc(s->cluster_size); /* best compression, small window, no zlib header */ memset(&strm, 0, sizeof(strm)); @@ -2548,7 +2641,7 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, if (ret != Z_STREAM_END || out_len >= s->cluster_size) { /* could not compress: write normal cluster */ - ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); + ret = qcow2_write(bs, sector_num, buf, s->cluster_sectors); if (ret < 0) { goto fail; } @@ -2567,7 +2660,7 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, } BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); - ret = bdrv_pwrite(bs->file->bs, cluster_offset, out_buf, out_len); + ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); if (ret < 0) { goto fail; } @@ -2616,8 +2709,8 @@ static int make_completely_empty(BlockDriverState *bs) /* After this call, neither the in-memory nor the on-disk refcount * information accurately describe the actual references */ - ret = bdrv_write_zeroes(bs->file->bs, 
s->l1_table_offset / BDRV_SECTOR_SIZE, - l1_clusters * s->cluster_sectors, 0); + ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset, + l1_clusters * s->cluster_size, 0); if (ret < 0) { goto fail_broken_refcounts; } @@ -2630,9 +2723,8 @@ static int make_completely_empty(BlockDriverState *bs) * overwrite parts of the existing refcount and L1 table, which is not * an issue because the dirty flag is set, complete data loss is in fact * desired and partial data loss is consequently fine as well */ - ret = bdrv_write_zeroes(bs->file->bs, s->cluster_size / BDRV_SECTOR_SIZE, - (2 + l1_clusters) * s->cluster_size / - BDRV_SECTOR_SIZE, 0); + ret = bdrv_pwrite_zeroes(bs->file, s->cluster_size, + (2 + l1_clusters) * s->cluster_size, 0); /* This call (even if it failed overall) may have overwritten on-disk * refcount structures; in that case, the in-memory refcount information * will probably differ from the on-disk information which makes the BDS @@ -2647,10 +2739,10 @@ static int make_completely_empty(BlockDriverState *bs) /* "Create" an empty reftable (one cluster) directly after the image * header and an empty L1 table three clusters after the image header; * the cluster between those two will be used as the first refblock */ - cpu_to_be64w(&l1_ofs_rt_ofs_cls.l1_offset, 3 * s->cluster_size); - cpu_to_be64w(&l1_ofs_rt_ofs_cls.reftable_offset, s->cluster_size); - cpu_to_be32w(&l1_ofs_rt_ofs_cls.reftable_clusters, 1); - ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, l1_table_offset), + l1_ofs_rt_ofs_cls.l1_offset = cpu_to_be64(3 * s->cluster_size); + l1_ofs_rt_ofs_cls.reftable_offset = cpu_to_be64(s->cluster_size); + l1_ofs_rt_ofs_cls.reftable_clusters = cpu_to_be32(1); + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset), &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls)); if (ret < 0) { goto fail_broken_refcounts; @@ -2681,7 +2773,7 @@ static int make_completely_empty(BlockDriverState *bs) /* Enter the first refblock into the reftable */ rt_entry = cpu_to_be64(2 * s->cluster_size); - ret = bdrv_pwrite_sync(bs->file->bs, s->cluster_size, + ret = bdrv_pwrite_sync(bs->file, s->cluster_size, &rt_entry, sizeof(rt_entry)); if (ret < 0) { goto fail_broken_refcounts; @@ -2774,14 +2866,14 @@ static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) int ret; qemu_co_mutex_lock(&s->lock); - ret = qcow2_cache_flush(bs, s->l2_table_cache); + ret = qcow2_cache_write(bs, s->l2_table_cache); if (ret < 0) { qemu_co_mutex_unlock(&s->lock); return ret; } if (qcow2_need_accurate_refcounts(s)) { - ret = qcow2_cache_flush(bs, s->refcount_block_cache); + ret = qcow2_cache_write(bs, s->refcount_block_cache); if (ret < 0) { qemu_co_mutex_unlock(&s->lock); return ret; @@ -2861,36 +2953,20 @@ static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) { BDRVQcow2State *s = bs->opaque; - int64_t total_sectors = bs->total_sectors; - bool zero_beyond_eof = bs->zero_beyond_eof; - int ret; BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); - bs->zero_beyond_eof = false; - ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov); - bs->zero_beyond_eof = zero_beyond_eof; - - /* bdrv_co_do_writev will have increased the total_sectors value to include - * the VM state - the VM state is however not an actual part of the block - * device, therefore, we need to restore the old value. 
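The qcow2_co_flush_to_os() hunk above switches from qcow2_cache_flush() to the new qcow2_cache_write() (declared in the qcow2.h hunk further down): flushing caches to the OS only needs the dirty table entries written to the image file, not an fsync. A plausible reading of the relationship between the two calls, assuming qcow2_cache_write() writes dirty entries back without syncing:

    int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c)
    {
        int ret = qcow2_cache_write(bs, c);   /* write back dirty tables */
        if (ret == 0) {
            ret = bdrv_flush(bs->file->bs);   /* then sync the image file */
        }
        return ret;
    }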
*/ - bs->total_sectors = total_sectors; - - return ret; + return bs->drv->bdrv_co_pwritev(bs, qcow2_vm_state_offset(s) + pos, + qiov->size, qiov, 0); } -static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf, - int64_t pos, int size) +static int qcow2_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t pos) { BDRVQcow2State *s = bs->opaque; - bool zero_beyond_eof = bs->zero_beyond_eof; - int ret; BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); - bs->zero_beyond_eof = false; - ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size); - bs->zero_beyond_eof = zero_beyond_eof; - - return ret; + return bs->drv->bdrv_co_preadv(bs, qcow2_vm_state_offset(s) + pos, + qiov->size, qiov, 0); } /* @@ -3329,12 +3405,12 @@ BlockDriver bdrv_qcow2 = { .bdrv_co_get_block_status = qcow2_co_get_block_status, .bdrv_set_key = qcow2_set_key, - .bdrv_co_readv = qcow2_co_readv, - .bdrv_co_writev = qcow2_co_writev, + .bdrv_co_preadv = qcow2_co_preadv, + .bdrv_co_pwritev = qcow2_co_pwritev, .bdrv_co_flush_to_os = qcow2_co_flush_to_os, - .bdrv_co_write_zeroes = qcow2_co_write_zeroes, - .bdrv_co_discard = qcow2_co_discard, + .bdrv_co_pwrite_zeroes = qcow2_co_pwrite_zeroes, + .bdrv_co_pdiscard = qcow2_co_pdiscard, .bdrv_truncate = qcow2_truncate, .bdrv_write_compressed = qcow2_write_compressed, .bdrv_make_empty = qcow2_make_empty, diff --git a/block/qcow2.h b/block/qcow2.h index a063a3c1a..b36a7bf8a 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -302,8 +302,8 @@ typedef struct Qcow2COWRegion { */ uint64_t offset; - /** Number of sectors to copy */ - int nb_sectors; + /** Number of bytes to copy */ + int nb_bytes; } Qcow2COWRegion; /** @@ -318,12 +318,6 @@ typedef struct QCowL2Meta /** Host offset of the first newly allocated cluster */ uint64_t alloc_offset; - /** - * Number of sectors from the start of the first allocated cluster to - * the end of the (possibly shortened) request - */ - int nb_available; - /** Number of newly allocated clusters */ int nb_clusters; @@ -471,8 +465,7 @@ static inline uint64_t l2meta_cow_start(QCowL2Meta *m) static inline uint64_t l2meta_cow_end(QCowL2Meta *m) { - return m->offset + m->cow_end.offset - + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS); + return m->offset + m->cow_end.offset + m->cow_end.nb_bytes; } static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2) @@ -544,9 +537,10 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, int nb_sectors, bool enc, Error **errp); int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, - int *num, uint64_t *cluster_offset); + unsigned int *bytes, uint64_t *cluster_offset); int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, - int *num, uint64_t *host_offset, QCowL2Meta **m); + unsigned int *bytes, uint64_t *host_offset, + QCowL2Meta **m); uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset, int compressed_size); @@ -583,6 +577,7 @@ int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c); void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c, void *table); int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c); +int qcow2_cache_write(BlockDriverState *bs, Qcow2Cache *c); int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, Qcow2Cache *dependency); void qcow2_cache_depends_on_flush(Qcow2Cache *c); diff --git a/block/qed-check.c b/block/qed-check.c index 622f30897..dcd4f036b 100644 --- a/block/qed-check.c +++ b/block/qed-check.c @@ -234,8 +234,7 @@ int qed_check(BDRVQEDState *s, BdrvCheckResult 
*result, bool fix) } check.result->bfi.total_clusters = - (s->header.image_size + s->header.cluster_size - 1) / - s->header.cluster_size; + DIV_ROUND_UP(s->header.image_size, s->header.cluster_size); ret = qed_check_l1_table(&check, s->l1_table); if (ret == 0) { /* Only check for leaks if entire image was scanned successfully */ diff --git a/block/qed-table.c b/block/qed-table.c index 802945f5e..1a731dff5 100644 --- a/block/qed-table.c +++ b/block/qed-table.c @@ -16,6 +16,7 @@ #include "trace.h" #include "qemu/sockets.h" /* for EINPROGRESS on Windows */ #include "qed.h" +#include "qemu/bswap.h" typedef struct { GenericCB gencb; @@ -64,7 +65,7 @@ static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size, qemu_iovec_init_external(qiov, &read_table_cb->iov, 1); - bdrv_aio_readv(s->bs->file->bs, offset / BDRV_SECTOR_SIZE, qiov, + bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov, qiov->size / BDRV_SECTOR_SIZE, qed_read_table_cb, read_table_cb); } @@ -153,7 +154,7 @@ static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, /* Adjust for offset into table */ offset += start * sizeof(uint64_t); - bdrv_aio_writev(s->bs->file->bs, offset / BDRV_SECTOR_SIZE, + bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, &write_table_cb->qiov, write_table_cb->qiov.size / BDRV_SECTOR_SIZE, qed_write_table_cb, write_table_cb); diff --git a/block/qed.c b/block/qed.c index 0af52741d..426f3cb44 100644 --- a/block/qed.c +++ b/block/qed.c @@ -15,6 +15,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "qemu/timer.h" +#include "qemu/bswap.h" #include "trace.h" #include "qed.h" #include "qapi/qmp/qerror.h" @@ -85,7 +86,7 @@ int qed_write_header_sync(BDRVQEDState *s) int ret; qed_header_cpu_to_le(&s->header, &le); - ret = bdrv_pwrite(s->bs->file->bs, 0, &le, sizeof(le)); + ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le)); if (ret != sizeof(le)) { return ret; } @@ -122,7 +123,7 @@ static void qed_write_header_read_cb(void *opaque, int ret) /* Update header */ qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf); - bdrv_aio_writev(s->bs->file->bs, 0, &write_header_cb->qiov, + bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov, write_header_cb->nsectors, qed_write_header_cb, write_header_cb); } @@ -142,8 +143,7 @@ static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb, * them, and write back. */ - int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) / - BDRV_SECTOR_SIZE; + int nsectors = DIV_ROUND_UP(sizeof(QEDHeader), BDRV_SECTOR_SIZE); size_t len = nsectors * BDRV_SECTOR_SIZE; QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb), cb, opaque); @@ -155,7 +155,7 @@ static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb, write_header_cb->iov.iov_len = len; qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1); - bdrv_aio_readv(s->bs->file->bs, 0, &write_header_cb->qiov, nsectors, + bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors, qed_write_header_read_cb, write_header_cb); } @@ -218,7 +218,7 @@ static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size, * * The string is NUL-terminated. 
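The qed_read_string() signature change just below (BlockDriverState * to BdrvChild *) follows the pattern running through this whole import: the byte- and sector-based helpers now take the BdrvChild, so callers pass bs->file straight through instead of dereferencing bs->file->bs at every call site. A minimal illustration (read_le_header is a hypothetical name):

    static int read_le_header(BlockDriverState *bs, QEDHeader *le)
    {
        /* before: bdrv_pread(bs->file->bs, 0, le, sizeof(*le)); */
        return bdrv_pread(bs->file, 0, le, sizeof(*le));
    }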
*/ -static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n, +static int qed_read_string(BdrvChild *file, uint64_t offset, size_t n, char *buf, size_t buflen) { int ret; @@ -389,7 +389,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, s->bs = bs; QSIMPLEQ_INIT(&s->allocating_write_reqs); - ret = bdrv_pread(bs->file->bs, 0, &le_header, sizeof(le_header)); + ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header)); if (ret < 0) { return ret; } @@ -446,7 +446,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, return -EINVAL; } - ret = qed_read_string(bs->file->bs, s->header.backing_filename_offset, + ret = qed_read_string(bs->file, s->header.backing_filename_offset, s->header.backing_filename_size, bs->backing_file, sizeof(bs->backing_file)); if (ret < 0) { @@ -517,7 +517,7 @@ static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp) { BDRVQEDState *s = bs->opaque; - bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS; + bs->bl.pwrite_zeroes_alignment = s->header.cluster_size; } /* We have nothing to do for QED reopen, stubs just return @@ -601,18 +601,18 @@ static int qed_create(const char *filename, uint32_t cluster_size, } qed_header_cpu_to_le(&header, &le_header); - ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header)); + ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header), 0); if (ret < 0) { goto out; } ret = blk_pwrite(blk, sizeof(le_header), backing_file, - header.backing_filename_size); + header.backing_filename_size, 0); if (ret < 0) { goto out; } l1_table = g_malloc0(l1_size); - ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size); + ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size, 0); if (ret < 0) { goto out; } @@ -708,7 +708,7 @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l } if (cb->co) { - qemu_coroutine_enter(cb->co, NULL); + qemu_coroutine_enter(cb->co); } } @@ -800,7 +800,7 @@ static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, qemu_iovec_concat(*backing_qiov, qiov, 0, size); BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO); - bdrv_aio_readv(s->bs->backing->bs, pos / BDRV_SECTOR_SIZE, + bdrv_aio_readv(s->bs->backing, pos / BDRV_SECTOR_SIZE, *backing_qiov, size / BDRV_SECTOR_SIZE, cb, opaque); } @@ -837,7 +837,7 @@ static void qed_copy_from_backing_file_write(void *opaque, int ret) } BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE); - bdrv_aio_writev(s->bs->file->bs, copy_cb->offset / BDRV_SECTOR_SIZE, + bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE, &copy_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE, qed_copy_from_backing_file_cb, copy_cb); } @@ -1087,7 +1087,7 @@ static void qed_aio_write_main(void *opaque, int ret) } BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO); - bdrv_aio_writev(s->bs->file->bs, offset / BDRV_SECTOR_SIZE, + bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, next_fn, acb); } @@ -1319,7 +1319,7 @@ static void qed_aio_read_data(void *opaque, int ret, } BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); - bdrv_aio_readv(bs->file->bs, offset / BDRV_SECTOR_SIZE, + bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE, &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, qed_aio_next_io, acb); return; @@ -1418,21 +1418,21 @@ typedef struct { bool done; } QEDWriteZeroesCB; -static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret) +static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, 
int ret) { QEDWriteZeroesCB *cb = opaque; cb->done = true; cb->ret = ret; if (cb->co) { - qemu_coroutine_enter(cb->co, NULL); + qemu_coroutine_enter(cb->co); } } -static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - BdrvRequestFlags flags) +static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, + int count, + BdrvRequestFlags flags) { BlockAIOCB *blockacb; BDRVQEDState *s = bs->opaque; @@ -1440,25 +1440,22 @@ static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, QEMUIOVector qiov; struct iovec iov; - /* Refuse if there are untouched backing file sectors */ - if (bs->backing) { - if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) { - return -ENOTSUP; - } - if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) { - return -ENOTSUP; - } + /* Fall back if the request is not aligned */ + if (qed_offset_into_cluster(s, offset) || + qed_offset_into_cluster(s, count)) { + return -ENOTSUP; } /* Zero writes start without an I/O buffer. If a buffer becomes necessary * then it will be allocated during request processing. */ - iov.iov_base = NULL, - iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE, + iov.iov_base = NULL; + iov.iov_len = count; qemu_iovec_init_external(&qiov, &iov, 1); - blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors, - qed_co_write_zeroes_cb, &cb, + blockacb = qed_aio_setup(bs, offset >> BDRV_SECTOR_BITS, &qiov, + count >> BDRV_SECTOR_BITS, + qed_co_pwrite_zeroes_cb, &cb, QED_AIOCB_WRITE | QED_AIOCB_ZERO); if (!blockacb) { return -EIO; @@ -1578,7 +1575,7 @@ static int bdrv_qed_change_backing_file(BlockDriverState *bs, } /* Write new header */ - ret = bdrv_pwrite_sync(bs->file->bs, 0, buffer, buffer_len); + ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len); g_free(buffer); if (ret == 0) { memcpy(&s->header, &new_header, sizeof(new_header)); @@ -1594,12 +1591,6 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp) bdrv_qed_close(bs); - bdrv_invalidate_cache(bs->file->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - memset(s, 0, sizeof(BDRVQEDState)); ret = bdrv_qed_open(bs, NULL, bs->open_flags, &local_err); if (local_err) { @@ -1669,7 +1660,7 @@ static BlockDriver bdrv_qed = { .bdrv_co_get_block_status = bdrv_qed_co_get_block_status, .bdrv_aio_readv = bdrv_qed_aio_readv, .bdrv_aio_writev = bdrv_qed_aio_writev, - .bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes, + .bdrv_co_pwrite_zeroes = bdrv_qed_co_pwrite_zeroes, .bdrv_truncate = bdrv_qed_truncate, .bdrv_getlength = bdrv_qed_getlength, .bdrv_get_info = bdrv_qed_get_info, diff --git a/block/quorum.c b/block/quorum.c index da15465a9..9cf876fb3 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -14,6 +14,7 @@ */ #include "qemu/osdep.h" +#include "qemu/cutils.h" #include "block/block_int.h" #include "qapi/qmp/qbool.h" #include "qapi/qmp/qdict.h" @@ -67,6 +68,9 @@ typedef struct QuorumVotes { typedef struct BDRVQuorumState { BdrvChild **children; /* children BlockDriverStates */ int num_children; /* children count */ + unsigned next_child_index; /* the index of the next child that should + * be added + */ int threshold; /* if less than threshold children reads gave the * same result a quorum error occurs. 
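The bdrv_qed_co_pwrite_zeroes() conversion above keeps QED's AIO-to-coroutine bridge: the coroutine issues the AIO request, yields, and the completion callback re-enters it with the now single-argument qemu_coroutine_enter(). The yield side is unchanged context not shown in the hunk, so the sketch below is an approximation built from the visible callback and setup call:

    QEDWriteZeroesCB cb = { .done = false };
    BlockAIOCB *blockacb = qed_aio_setup(bs, offset >> BDRV_SECTOR_BITS,
                                         &qiov, count >> BDRV_SECTOR_BITS,
                                         qed_co_pwrite_zeroes_cb, &cb,
                                         QED_AIOCB_WRITE | QED_AIOCB_ZERO);
    if (!blockacb) {
        return -EIO;
    }
    if (!cb.done) {
        cb.co = qemu_coroutine_self();   /* qed_co_pwrite_zeroes_cb() re-enters us */
        qemu_coroutine_yield();
    }
    return cb.ret;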
*/ @@ -379,7 +383,7 @@ static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb, continue; } QLIST_FOREACH(item, &version->items, next) { - bdrv_aio_writev(s->children[item->index]->bs, acb->sector_num, + bdrv_aio_writev(s->children[item->index], acb->sector_num, acb->qiov, acb->nb_sectors, quorum_rewrite_aio_cb, acb); } @@ -656,7 +660,7 @@ static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb) } for (i = 0; i < s->num_children; i++) { - acb->qcrs[i].aiocb = bdrv_aio_readv(s->children[i]->bs, acb->sector_num, + acb->qcrs[i].aiocb = bdrv_aio_readv(s->children[i], acb->sector_num, &acb->qcrs[i].qiov, acb->nb_sectors, quorum_aio_cb, &acb->qcrs[i]); } @@ -674,7 +678,7 @@ static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb) qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov, acb->qcrs[acb->child_iter].buf); acb->qcrs[acb->child_iter].aiocb = - bdrv_aio_readv(s->children[acb->child_iter]->bs, acb->sector_num, + bdrv_aio_readv(s->children[acb->child_iter], acb->sector_num, &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors, quorum_aio_cb, &acb->qcrs[acb->child_iter]); @@ -715,7 +719,7 @@ static BlockAIOCB *quorum_aio_writev(BlockDriverState *bs, int i; for (i = 0; i < s->num_children; i++) { - acb->qcrs[i].aiocb = bdrv_aio_writev(s->children[i]->bs, sector_num, + acb->qcrs[i].aiocb = bdrv_aio_writev(s->children[i], sector_num, qiov, nb_sectors, &quorum_aio_cb, &acb->qcrs[i]); } @@ -747,21 +751,6 @@ static int64_t quorum_getlength(BlockDriverState *bs) return result; } -static void quorum_invalidate_cache(BlockDriverState *bs, Error **errp) -{ - BDRVQuorumState *s = bs->opaque; - Error *local_err = NULL; - int i; - - for (i = 0; i < s->num_children; i++) { - bdrv_invalidate_cache(s->children[i]->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - } -} - static coroutine_fn int quorum_co_flush(BlockDriverState *bs) { BDRVQuorumState *s = bs->opaque; @@ -898,9 +887,9 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, ret = -EINVAL; goto exit; } - if (s->num_children < 2) { + if (s->num_children < 1) { error_setg(&local_err, - "Number of provided children must be greater than 1"); + "Number of provided children must be 1 or more"); ret = -EINVAL; goto exit; } @@ -964,6 +953,7 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, opened[i] = true; } + s->next_child_index = s->num_children; g_free(opened); goto exit; @@ -981,9 +971,7 @@ close_exit: exit: qemu_opts_del(opts); /* propagate error */ - if (local_err) { - error_propagate(errp, local_err); - } + error_propagate(errp, local_err); return ret; } @@ -999,25 +987,70 @@ static void quorum_close(BlockDriverState *bs) g_free(s->children); } -static void quorum_detach_aio_context(BlockDriverState *bs) +static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs, + Error **errp) { BDRVQuorumState *s = bs->opaque; - int i; + BdrvChild *child; + char indexstr[32]; + int ret; - for (i = 0; i < s->num_children; i++) { - bdrv_detach_aio_context(s->children[i]->bs); + assert(s->num_children <= INT_MAX / sizeof(BdrvChild *)); + if (s->num_children == INT_MAX / sizeof(BdrvChild *) || + s->next_child_index == UINT_MAX) { + error_setg(errp, "Too many children"); + return; } + + ret = snprintf(indexstr, 32, "children.%u", s->next_child_index); + if (ret < 0 || ret >= 32) { + error_setg(errp, "cannot generate child name"); + return; + } + s->next_child_index++; + + bdrv_drained_begin(bs); + + /* We can safely add the child now */ + 
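    /* Note the ordering in quorum_add_child(): quorum takes its own
     * reference on child_bs (the matching bdrv_unref_child() in
     * quorum_del_child() drops it again), attaches the child, and only
     * resizes s->children while the bdrv_drained_begin()/bdrv_drained_end()
     * pair keeps in-flight requests quiesced. */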
bdrv_ref(child_bs); + child = bdrv_attach_child(bs, child_bs, indexstr, &child_format); + s->children = g_renew(BdrvChild *, s->children, s->num_children + 1); + s->children[s->num_children++] = child; + + bdrv_drained_end(bs); } -static void quorum_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) +static void quorum_del_child(BlockDriverState *bs, BdrvChild *child, + Error **errp) { BDRVQuorumState *s = bs->opaque; int i; for (i = 0; i < s->num_children; i++) { - bdrv_attach_aio_context(s->children[i]->bs, new_context); + if (s->children[i] == child) { + break; + } } + + /* we have checked it in bdrv_del_child() */ + assert(i < s->num_children); + + if (s->num_children <= s->threshold) { + error_setg(errp, + "The number of children cannot be lower than the vote threshold %d", + s->threshold); + return; + } + + bdrv_drained_begin(bs); + + /* We can safely remove this child now */ + memmove(&s->children[i], &s->children[i + 1], + (s->num_children - i - 1) * sizeof(BdrvChild *)); + s->children = g_renew(BdrvChild *, s->children, --s->num_children); + bdrv_unref_child(bs, child); + + bdrv_drained_end(bs); } static void quorum_refresh_filename(BlockDriverState *bs, QDict *options) @@ -1070,10 +1103,9 @@ static BlockDriver bdrv_quorum = { .bdrv_aio_readv = quorum_aio_readv, .bdrv_aio_writev = quorum_aio_writev, - .bdrv_invalidate_cache = quorum_invalidate_cache, - .bdrv_detach_aio_context = quorum_detach_aio_context, - .bdrv_attach_aio_context = quorum_attach_aio_context, + .bdrv_add_child = quorum_add_child, + .bdrv_del_child = quorum_del_child, .is_filter = true, .bdrv_recurse_is_first_non_filter = quorum_recurse_is_first_non_filter, diff --git a/block/raw-aio.h b/block/raw-aio.h deleted file mode 100644 index 811e37501..000000000 --- a/block/raw-aio.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Declarations for AIO in the raw protocol - * - * Copyright IBM, Corp. 2008 - * - * Authors: - * Anthony Liguori <aliguori@us.ibm.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. 
- */ -#ifndef QEMU_RAW_AIO_H -#define QEMU_RAW_AIO_H - -#include "qemu/iov.h" - -/* AIO request types */ -#define QEMU_AIO_READ 0x0001 -#define QEMU_AIO_WRITE 0x0002 -#define QEMU_AIO_IOCTL 0x0004 -#define QEMU_AIO_FLUSH 0x0008 -#define QEMU_AIO_DISCARD 0x0010 -#define QEMU_AIO_WRITE_ZEROES 0x0020 -#define QEMU_AIO_TYPE_MASK \ - (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \ - QEMU_AIO_DISCARD|QEMU_AIO_WRITE_ZEROES) - -/* AIO flags */ -#define QEMU_AIO_MISALIGNED 0x1000 -#define QEMU_AIO_BLKDEV 0x2000 - - -/* linux-aio.c - Linux native implementation */ -#ifdef CONFIG_LINUX_AIO -void *laio_init(void); -void laio_cleanup(void *s); -BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque, int type); -void laio_detach_aio_context(void *s, AioContext *old_context); -void laio_attach_aio_context(void *s, AioContext *new_context); -void laio_io_plug(BlockDriverState *bs, void *aio_ctx); -void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug); -#endif - -#ifdef _WIN32 -typedef struct QEMUWin32AIOState QEMUWin32AIOState; -QEMUWin32AIOState *win32_aio_init(void); -void win32_aio_cleanup(QEMUWin32AIOState *aio); -int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile); -BlockAIOCB *win32_aio_submit(BlockDriverState *bs, - QEMUWin32AIOState *aio, HANDLE hfile, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque, int type); -void win32_aio_detach_aio_context(QEMUWin32AIOState *aio, - AioContext *old_context); -void win32_aio_attach_aio_context(QEMUWin32AIOState *aio, - AioContext *new_context); -#endif - -#endif /* QEMU_RAW_AIO_H */ diff --git a/block/raw-posix.c b/block/raw-posix.c index 906d5c941..6ed754739 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -32,7 +32,7 @@ #include "trace.h" #include "block/thread-pool.h" #include "qemu/iov.h" -#include "raw-aio.h" +#include "block/raw-aio.h" #include "qapi/util.h" #include "qapi/qmp/qstring.h" @@ -137,10 +137,6 @@ typedef struct BDRVRawState { int open_flags; size_t buf_align; -#ifdef CONFIG_LINUX_AIO - int use_aio; - void *aio_ctx; -#endif #ifdef CONFIG_XFS bool is_xfs:1; #endif @@ -154,9 +150,6 @@ typedef struct BDRVRawState { typedef struct BDRVRawReopenState { int fd; int open_flags; -#ifdef CONFIG_LINUX_AIO - int use_aio; -#endif } BDRVRawReopenState; static int fd_open(BlockDriverState *bs); @@ -302,22 +295,22 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) /* For SCSI generic devices the alignment is not really used. With buffered I/O, we don't have any restrictions. */ if (bdrv_is_sg(bs) || !s->needs_alignment) { - bs->request_alignment = 1; + bs->bl.request_alignment = 1; s->buf_align = 1; return; } - bs->request_alignment = 0; + bs->bl.request_alignment = 0; s->buf_align = 0; /* Let's try to use the logical blocksize for the alignment. 
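When neither the logical block size nor the XFS DIOINFO query yields an alignment, raw_probe_alignment() (continued below) falls back to trial reads at growing power-of-two alignments. A standalone sketch of that fallback, assuming Linux and an fd opened with O_DIRECT; probe_direct_alignment() is a hypothetical helper, the driver itself uses raw_is_io_aligned() and qemu_memalign():

    /* O_DIRECT reads typically fail with EINVAL unless buffer, offset
     * and length all meet the device's requirement, so the first size
     * that reads successfully is a working request alignment. */
    #include <stdlib.h>
    #include <unistd.h>

    static size_t probe_direct_alignment(int fd, size_t max_align)
    {
        void *buf;
        size_t align;

        if (posix_memalign(&buf, max_align, max_align) != 0) {
            return 0;
        }
        for (align = 512; align <= max_align; align <<= 1) {
            if (pread(fd, buf, align, 0) >= 0) {
                free(buf);
                return align;   /* smallest alignment the kernel accepted */
            }
        }
        free(buf);
        return 0;               /* nothing worked; caller reports an error */
    }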
*/ - if (probe_logical_blocksize(fd, &bs->request_alignment) < 0) { - bs->request_alignment = 0; + if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) { + bs->bl.request_alignment = 0; } #ifdef CONFIG_XFS if (s->is_xfs) { struct dioattr da; if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) { - bs->request_alignment = da.d_miniosz; + bs->bl.request_alignment = da.d_miniosz; /* The kernel returns wrong information for d_mem */ /* s->buf_align = da.d_mem; */ } @@ -337,21 +330,21 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) qemu_vfree(buf); } - if (!bs->request_alignment) { + if (!bs->bl.request_alignment) { size_t align; buf = qemu_memalign(s->buf_align, max_align); for (align = 512; align <= max_align; align <<= 1) { if (raw_is_io_aligned(fd, buf, align)) { - bs->request_alignment = align; + bs->bl.request_alignment = align; break; } } qemu_vfree(buf); } - if (!s->buf_align || !bs->request_alignment) { - error_setg(errp, "Could not find working O_DIRECT alignment. " - "Try cache.direct=off."); + if (!s->buf_align || !bs->bl.request_alignment) { + error_setg(errp, "Could not find working O_DIRECT alignment"); + error_append_hint(errp, "Try cache.direct=off\n"); } } @@ -374,58 +367,15 @@ static void raw_parse_flags(int bdrv_flags, int *open_flags) } } -static void raw_detach_aio_context(BlockDriverState *bs) -{ -#ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - - if (s->use_aio) { - laio_detach_aio_context(s->aio_ctx, bdrv_get_aio_context(bs)); - } -#endif -} - -static void raw_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ -#ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - - if (s->use_aio) { - laio_attach_aio_context(s->aio_ctx, new_context); - } -#endif -} - #ifdef CONFIG_LINUX_AIO -static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags) +static bool raw_use_aio(int bdrv_flags) { - int ret = -1; - assert(aio_ctx != NULL); - assert(use_aio != NULL); /* * Currently Linux do AIO only for files opened with O_DIRECT * specified so check NOCACHE flag too */ - if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) == - (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) { - - /* if non-NULL, laio_init() has already been run */ - if (*aio_ctx == NULL) { - *aio_ctx = laio_init(); - if (!*aio_ctx) { - goto error; - } - } - *use_aio = 1; - } else { - *use_aio = 0; - } - - ret = 0; - -error: - return ret; + return (bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) == + (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO); } #endif @@ -494,13 +444,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, s->fd = fd; #ifdef CONFIG_LINUX_AIO - if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) { - qemu_close(fd); - ret = -errno; - error_setg_errno(errp, -ret, "Could not set AIO state"); - goto fail; - } - if (!s->use_aio && (bdrv_flags & BDRV_O_NATIVE_AIO)) { + if (!raw_use_aio(bdrv_flags) && (bdrv_flags & BDRV_O_NATIVE_AIO)) { error_setg(errp, "aio=native was specified, but it requires " "cache.direct=on, which was not specified."); ret = -EINVAL; @@ -517,6 +461,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, s->has_discard = true; s->has_write_zeroes = true; + bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP; if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { s->needs_alignment = true; } @@ -566,8 +511,6 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } #endif - raw_attach_aio_context(bs, bdrv_get_aio_context(bs)); - ret = 0; fail: if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) { @@ -581,15 
+524,9 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { BDRVRawState *s = bs->opaque; - Error *local_err = NULL; - int ret; s->type = FTYPE_FILE; - ret = raw_open_common(bs, options, flags, 0, &local_err); - if (local_err) { - error_propagate(errp, local_err); - } - return ret; + return raw_open_common(bs, options, flags, 0, errp); } static int raw_reopen_prepare(BDRVReopenState *state, @@ -608,18 +545,6 @@ static int raw_reopen_prepare(BDRVReopenState *state, state->opaque = g_new0(BDRVRawReopenState, 1); raw_s = state->opaque; -#ifdef CONFIG_LINUX_AIO - raw_s->use_aio = s->use_aio; - - /* we can use s->aio_ctx instead of a copy, because the use_aio flag is - * valid in the 'false' condition even if aio_ctx is set, and raw_set_aio() - * won't override aio_ctx if aio_ctx is non-NULL */ - if (raw_set_aio(&s->aio_ctx, &raw_s->use_aio, state->flags)) { - error_setg(errp, "Could not set AIO state"); - return -1; - } -#endif - if (s->type == FTYPE_CD) { raw_s->open_flags |= O_NONBLOCK; } @@ -644,15 +569,7 @@ static int raw_reopen_prepare(BDRVReopenState *state, if ((raw_s->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) { /* dup the original fd */ - /* TODO: use qemu fcntl wrapper */ -#ifdef F_DUPFD_CLOEXEC - raw_s->fd = fcntl(s->fd, F_DUPFD_CLOEXEC, 0); -#else - raw_s->fd = dup(s->fd); - if (raw_s->fd != -1) { - qemu_set_cloexec(raw_s->fd); - } -#endif + raw_s->fd = qemu_dup(s->fd); if (raw_s->fd >= 0) { ret = fcntl_setfl(raw_s->fd, raw_s->open_flags); if (ret) { @@ -702,9 +619,6 @@ static void raw_reopen_commit(BDRVReopenState *state) qemu_close(s->fd); s->fd = raw_s->fd; -#ifdef CONFIG_LINUX_AIO - s->use_aio = raw_s->use_aio; -#endif g_free(state->opaque); state->opaque = NULL; @@ -728,9 +642,33 @@ static void raw_reopen_abort(BDRVReopenState *state) state->opaque = NULL; } +static int hdev_get_max_transfer_length(int fd) +{ +#ifdef BLKSECTGET + int max_sectors = 0; + if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) { + return max_sectors; + } else { + return -errno; + } +#else + return -ENOSYS; +#endif +} + static void raw_refresh_limits(BlockDriverState *bs, Error **errp) { BDRVRawState *s = bs->opaque; + struct stat st; + + if (!fstat(s->fd, &st)) { + if (S_ISBLK(st.st_mode)) { + int ret = hdev_get_max_transfer_length(s->fd); + if (ret > 0 && ret <= BDRV_REQUEST_MAX_SECTORS) { + bs->bl.max_transfer = pow2floor(ret << BDRV_SECTOR_BITS); + } + } + } raw_probe_alignment(bs, s->fd, errp); bs->bl.min_mem_alignment = s->buf_align; @@ -1251,8 +1189,8 @@ static int aio_worker(void *arg) } static int paio_submit_co(BlockDriverState *bs, int fd, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - int type) + int64_t offset, QEMUIOVector *qiov, + int count, int type) { RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); ThreadPool *pool; @@ -1261,22 +1199,22 @@ static int paio_submit_co(BlockDriverState *bs, int fd, acb->aio_type = type; acb->aio_fildes = fd; - acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE; - acb->aio_offset = sector_num * BDRV_SECTOR_SIZE; + acb->aio_nbytes = count; + acb->aio_offset = offset; if (qiov) { acb->aio_iov = qiov->iov; acb->aio_niov = qiov->niov; - assert(qiov->size == acb->aio_nbytes); + assert(qiov->size == count); } - trace_paio_submit_co(sector_num, nb_sectors, type); + trace_paio_submit_co(offset, count, type); pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); return thread_pool_submit_co(pool, aio_worker, acb); } static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, - int64_t sector_num, 
QEMUIOVector *qiov, int nb_sectors, + int64_t offset, QEMUIOVector *qiov, int count, BlockCompletionFunc *cb, void *opaque, int type) { RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); @@ -1286,8 +1224,8 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, acb->aio_type = type; acb->aio_fildes = fd; - acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE; - acb->aio_offset = sector_num * BDRV_SECTOR_SIZE; + acb->aio_nbytes = count; + acb->aio_offset = offset; if (qiov) { acb->aio_iov = qiov->iov; @@ -1295,19 +1233,18 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, assert(qiov->size == acb->aio_nbytes); } - trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); + trace_paio_submit(acb, opaque, offset, count, type); pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); } -static BlockAIOCB *raw_aio_submit(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque, int type) +static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, int type) { BDRVRawState *s = bs->opaque; if (fd_open(bs) < 0) - return NULL; + return -EIO; /* * Check if the underlying device requires requests to be aligned, @@ -1319,61 +1256,50 @@ static BlockAIOCB *raw_aio_submit(BlockDriverState *bs, if (!bdrv_qiov_is_aligned(bs, qiov)) { type |= QEMU_AIO_MISALIGNED; #ifdef CONFIG_LINUX_AIO - } else if (s->use_aio) { - return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov, - nb_sectors, cb, opaque, type); + } else if (bs->open_flags & BDRV_O_NATIVE_AIO) { + LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); + assert(qiov->size == bytes); + return laio_co_submit(bs, aio, s->fd, offset, qiov, type); #endif } } - return paio_submit(bs, s->fd, sector_num, qiov, nb_sectors, - cb, opaque, type); + return paio_submit_co(bs, s->fd, offset, qiov, bytes, type); } -static void raw_aio_plug(BlockDriverState *bs) +static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + int flags) { -#ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - if (s->use_aio) { - laio_io_plug(bs, s->aio_ctx); - } -#endif + return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ); } -static void raw_aio_unplug(BlockDriverState *bs) +static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + int flags) { -#ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - if (s->use_aio) { - laio_io_unplug(bs, s->aio_ctx, true); - } -#endif + assert(flags == 0); + return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE); } -static void raw_aio_flush_io_queue(BlockDriverState *bs) +static void raw_aio_plug(BlockDriverState *bs) { #ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - if (s->use_aio) { - laio_io_unplug(bs, s->aio_ctx, false); + if (bs->open_flags & BDRV_O_NATIVE_AIO) { + LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); + laio_io_plug(bs, aio); } #endif } -static BlockAIOCB *raw_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return raw_aio_submit(bs, sector_num, qiov, nb_sectors, - cb, opaque, QEMU_AIO_READ); -} - -static BlockAIOCB *raw_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +static void 
raw_aio_unplug(BlockDriverState *bs) { - return raw_aio_submit(bs, sector_num, qiov, nb_sectors, - cb, opaque, QEMU_AIO_WRITE); +#ifdef CONFIG_LINUX_AIO + if (bs->open_flags & BDRV_O_NATIVE_AIO) { + LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); + laio_io_unplug(bs, aio); + } +#endif } static BlockAIOCB *raw_aio_flush(BlockDriverState *bs, @@ -1391,13 +1317,6 @@ static void raw_close(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; - raw_detach_aio_context(bs); - -#ifdef CONFIG_LINUX_AIO - if (s->use_aio) { - laio_cleanup(s->aio_ctx); - } -#endif if (s->fd >= 0) { qemu_close(s->fd); s->fd = -1; @@ -1867,27 +1786,27 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, return ret | BDRV_BLOCK_OFFSET_VALID | start; } -static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, +static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs, + int64_t offset, int count, BlockCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; - return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit(bs, s->fd, offset, NULL, count, cb, opaque, QEMU_AIO_DISCARD); } -static int coroutine_fn raw_co_write_zeroes( - BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) +static int coroutine_fn raw_co_pwrite_zeroes( + BlockDriverState *bs, int64_t offset, + int count, BdrvRequestFlags flags) { BDRVRawState *s = bs->opaque; if (!(flags & BDRV_REQ_MAY_UNMAP)) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit_co(bs, s->fd, offset, NULL, count, QEMU_AIO_WRITE_ZEROES); } else if (s->discard_zeroes) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit_co(bs, s->fd, offset, NULL, count, QEMU_AIO_DISCARD); } return -ENOTSUP; @@ -1940,16 +1859,15 @@ BlockDriver bdrv_file = { .bdrv_create = raw_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_co_get_block_status = raw_co_get_block_status, - .bdrv_co_write_zeroes = raw_co_write_zeroes, + .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes, - .bdrv_aio_readv = raw_aio_readv, - .bdrv_aio_writev = raw_aio_writev, + .bdrv_co_preadv = raw_co_preadv, + .bdrv_co_pwritev = raw_co_pwritev, .bdrv_aio_flush = raw_aio_flush, - .bdrv_aio_discard = raw_aio_discard, + .bdrv_aio_pdiscard = raw_aio_pdiscard, .bdrv_refresh_limits = raw_refresh_limits, .bdrv_io_plug = raw_aio_plug, .bdrv_io_unplug = raw_aio_unplug, - .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -1957,9 +1875,6 @@ BlockDriver bdrv_file = { .bdrv_get_allocated_file_size = raw_get_allocated_file_size, - .bdrv_detach_aio_context = raw_detach_aio_context, - .bdrv_attach_aio_context = raw_attach_aio_context, - .create_opts = &raw_create_opts, }; @@ -2225,9 +2140,7 @@ hdev_open_Mac_error: ret = raw_open_common(bs, options, flags, 0, &local_err); if (ret < 0) { - if (local_err) { - error_propagate(errp, local_err); - } + error_propagate(errp, local_err); #if defined(__APPLE__) && defined(__MACH__) if (*bsd_path) { filename = bsd_path; @@ -2290,8 +2203,8 @@ static int fd_open(BlockDriverState *bs) return -EIO; } -static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, +static coroutine_fn BlockAIOCB *hdev_aio_pdiscard(BlockDriverState *bs, + int64_t offset, int count, BlockCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; @@ -2299,12 +2212,12 @@ static 
coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs, if (fd_open(bs) < 0) { return NULL; } - return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit(bs, s->fd, offset, NULL, count, cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); } -static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, BdrvRequestFlags flags) { BDRVRawState *s = bs->opaque; int rc; @@ -2314,10 +2227,10 @@ static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs, return rc; } if (!(flags & BDRV_REQ_MAY_UNMAP)) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit_co(bs, s->fd, offset, NULL, count, QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV); } else if (s->discard_zeroes) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit_co(bs, s->fd, offset, NULL, count, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); } return -ENOTSUP; @@ -2389,16 +2302,15 @@ static BlockDriver bdrv_host_device = { .bdrv_reopen_abort = raw_reopen_abort, .bdrv_create = hdev_create, .create_opts = &raw_create_opts, - .bdrv_co_write_zeroes = hdev_co_write_zeroes, + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, - .bdrv_aio_readv = raw_aio_readv, - .bdrv_aio_writev = raw_aio_writev, + .bdrv_co_preadv = raw_co_preadv, + .bdrv_co_pwritev = raw_co_pwritev, .bdrv_aio_flush = raw_aio_flush, - .bdrv_aio_discard = hdev_aio_discard, + .bdrv_aio_pdiscard = hdev_aio_pdiscard, .bdrv_refresh_limits = raw_refresh_limits, .bdrv_io_plug = raw_aio_plug, .bdrv_io_unplug = raw_aio_unplug, - .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -2408,9 +2320,6 @@ static BlockDriver bdrv_host_device = { .bdrv_probe_blocksizes = hdev_probe_blocksizes, .bdrv_probe_geometry = hdev_probe_geometry, - .bdrv_detach_aio_context = raw_detach_aio_context, - .bdrv_attach_aio_context = raw_attach_aio_context, - /* generic scsi device */ #ifdef __linux__ .bdrv_aio_ioctl = hdev_aio_ioctl, @@ -2433,17 +2342,11 @@ static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { BDRVRawState *s = bs->opaque; - Error *local_err = NULL; - int ret; s->type = FTYPE_CD; /* open will not fail even if no CD is inserted, so add O_NONBLOCK */ - ret = raw_open_common(bs, options, flags, O_NONBLOCK, &local_err); - if (local_err) { - error_propagate(errp, local_err); - } - return ret; + return raw_open_common(bs, options, flags, O_NONBLOCK, errp); } static int cdrom_probe_device(const char *filename) @@ -2522,13 +2425,13 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_create = hdev_create, .create_opts = &raw_create_opts, - .bdrv_aio_readv = raw_aio_readv, - .bdrv_aio_writev = raw_aio_writev, + + .bdrv_co_preadv = raw_co_preadv, + .bdrv_co_pwritev = raw_co_pwritev, .bdrv_aio_flush = raw_aio_flush, .bdrv_refresh_limits = raw_refresh_limits, .bdrv_io_plug = raw_aio_plug, .bdrv_io_unplug = raw_aio_unplug, - .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -2536,9 +2439,6 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_get_allocated_file_size = raw_get_allocated_file_size, - .bdrv_detach_aio_context = raw_detach_aio_context, - .bdrv_attach_aio_context = raw_attach_aio_context, - /* removable device support */ .bdrv_is_inserted = cdrom_is_inserted, .bdrv_eject = cdrom_eject, @@ -2561,9 +2461,7 @@ static 
int cdrom_open(BlockDriverState *bs, QDict *options, int flags, ret = raw_open_common(bs, options, flags, 0, &local_err); if (ret) { - if (local_err) { - error_propagate(errp, local_err); - } + error_propagate(errp, local_err); return ret; } @@ -2658,13 +2556,12 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_create = hdev_create, .create_opts = &raw_create_opts, - .bdrv_aio_readv = raw_aio_readv, - .bdrv_aio_writev = raw_aio_writev, + .bdrv_co_preadv = raw_co_preadv, + .bdrv_co_pwritev = raw_co_pwritev, .bdrv_aio_flush = raw_aio_flush, .bdrv_refresh_limits = raw_refresh_limits, .bdrv_io_plug = raw_aio_plug, .bdrv_io_unplug = raw_aio_unplug, - .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -2672,9 +2569,6 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_get_allocated_file_size = raw_get_allocated_file_size, - .bdrv_detach_aio_context = raw_detach_aio_context, - .bdrv_attach_aio_context = raw_attach_aio_context, - /* removable device support */ .bdrv_is_inserted = cdrom_is_inserted, .bdrv_eject = cdrom_eject, diff --git a/block/raw-win32.c b/block/raw-win32.c index fd2389153..56f45fea9 100644 --- a/block/raw-win32.c +++ b/block/raw-win32.c @@ -27,7 +27,7 @@ #include "qemu/timer.h" #include "block/block_int.h" #include "qemu/module.h" -#include "raw-aio.h" +#include "block/raw-aio.h" #include "trace.h" #include "block/thread-pool.h" #include "qemu/iov.h" @@ -142,7 +142,7 @@ static int aio_worker(void *arg) } static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + int64_t offset, QEMUIOVector *qiov, int count, BlockCompletionFunc *cb, void *opaque, int type) { RawWin32AIOData *acb = g_new(RawWin32AIOData, 1); @@ -155,11 +155,12 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile, if (qiov) { acb->aio_iov = qiov->iov; acb->aio_niov = qiov->niov; + assert(qiov->size == count); } - acb->aio_nbytes = nb_sectors * 512; - acb->aio_offset = sector_num * 512; + acb->aio_nbytes = count; + acb->aio_offset = offset; - trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); + trace_paio_submit(acb, opaque, offset, count, type); pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); } @@ -222,7 +223,7 @@ static void raw_attach_aio_context(BlockDriverState *bs, } } -static void raw_probe_alignment(BlockDriverState *bs) +static void raw_probe_alignment(BlockDriverState *bs, Error **errp) { BDRVRawState *s = bs->opaque; DWORD sectorsPerCluster, freeClusters, totalClusters, count; @@ -230,14 +231,14 @@ static void raw_probe_alignment(BlockDriverState *bs) BOOL status; if (s->type == FTYPE_CD) { - bs->request_alignment = 2048; + bs->bl.request_alignment = 2048; return; } if (s->type == FTYPE_HARDDISK) { status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX, NULL, 0, &dg, sizeof(dg), &count, NULL); if (status != 0) { - bs->request_alignment = dg.Geometry.BytesPerSector; + bs->bl.request_alignment = dg.Geometry.BytesPerSector; return; } /* try GetDiskFreeSpace too */ @@ -247,7 +248,7 @@ static void raw_probe_alignment(BlockDriverState *bs) GetDiskFreeSpace(s->drive_path, &sectorsPerCluster, &dg.Geometry.BytesPerSector, &freeClusters, &totalClusters); - bs->request_alignment = dg.Geometry.BytesPerSector; + bs->bl.request_alignment = dg.Geometry.BytesPerSector; } } @@ -365,7 +366,6 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
win32_aio_attach_aio_context(s->aio, bdrv_get_aio_context(bs)); } - raw_probe_alignment(bs); ret = 0; fail: qemu_opts_del(opts); @@ -379,9 +379,10 @@ static BlockAIOCB *raw_aio_readv(BlockDriverState *bs, BDRVRawState *s = bs->opaque; if (s->aio) { return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov, - nb_sectors, cb, opaque, QEMU_AIO_READ); + nb_sectors, cb, opaque, QEMU_AIO_READ); } else { - return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors, + return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov, + nb_sectors << BDRV_SECTOR_BITS, cb, opaque, QEMU_AIO_READ); } } @@ -393,9 +394,10 @@ static BlockAIOCB *raw_aio_writev(BlockDriverState *bs, BDRVRawState *s = bs->opaque; if (s->aio) { return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov, - nb_sectors, cb, opaque, QEMU_AIO_WRITE); + nb_sectors, cb, opaque, QEMU_AIO_WRITE); } else { - return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors, + return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov, + nb_sectors << BDRV_SECTOR_BITS, cb, opaque, QEMU_AIO_WRITE); } } @@ -550,6 +552,7 @@ BlockDriver bdrv_file = { .bdrv_needs_filename = true, .bdrv_parse_filename = raw_parse_filename, .bdrv_file_open = raw_open, + .bdrv_refresh_limits = raw_probe_alignment, .bdrv_close = raw_close, .bdrv_create = raw_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, diff --git a/block/raw_bsd.c b/block/raw_bsd.c index a6cc7e991..588d4080f 100644 --- a/block/raw_bsd.c +++ b/block/raw_bsd.c @@ -1,6 +1,6 @@ /* BlockDriver implementation for "raw" * - * Copyright (C) 2010, 2013, Red Hat, Inc. + * Copyright (C) 2010-2016 Red Hat, Inc. * Copyright (C) 2010, Blue Swirl <blauwirbel@gmail.com> * Copyright (C) 2009, Anthony Liguori <aliguori@us.ibm.com> * @@ -50,33 +50,30 @@ static int raw_reopen_prepare(BDRVReopenState *reopen_state, return 0; } -static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) +static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + int flags) { BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); - return bdrv_co_readv(bs->file->bs, sector_num, nb_sectors, qiov); + return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); } -static int coroutine_fn -raw_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - QEMUIOVector *qiov, int flags) +static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + int flags) { void *buf = NULL; BlockDriver *drv; QEMUIOVector local_qiov; int ret; - if (bs->probed && sector_num == 0) { - /* As long as these conditions are true, we can't get partial writes to - * the probe buffer and can just directly check the request. */ + if (bs->probed && offset < BLOCK_PROBE_BUF_SIZE && bytes) { + /* Handling partial writes would be a pain - so we just + * require that guests have 512-byte request alignment if + * probing occurred */ QEMU_BUILD_BUG_ON(BLOCK_PROBE_BUF_SIZE != 512); QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != 512); - - if (nb_sectors == 0) { - /* qemu_iovec_to_buf() would fail, but we want to return success - * instead of -EINVAL in this case. 
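The assert that follows relies on the 512-byte request alignment installed in raw_refresh_limits() further down: once all requests are sector-aligned, any write overlapping the probe window necessarily starts at offset 0 and covers at least the full probe buffer. A small sanity sketch of that invariant (hypothetical helper, not driver code):

    #include <assert.h>
    #include <stdint.h>

    /* With 512-byte aligned requests, overlapping [0, 512) implies the
     * write starts at 0 and spans at least the whole probe buffer. */
    static void check_probe_overlap(uint64_t offset, uint64_t bytes)
    {
        assert(offset % 512 == 0 && bytes % 512 == 0);
        if (offset < 512 && bytes > 0) {
            assert(offset == 0 && bytes >= 512);
        }
    }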
*/ - return 0; - } + assert(offset == 0 && bytes >= BLOCK_PROBE_BUF_SIZE); buf = qemu_try_blockalign(bs->file->bs, 512); if (!buf) { @@ -105,8 +102,7 @@ raw_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, } BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); - ret = bdrv_co_do_pwritev(bs->file->bs, sector_num * BDRV_SECTOR_SIZE, - nb_sectors * BDRV_SECTOR_SIZE, qiov, flags); + ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); fail: if (qiov == &local_qiov) { @@ -116,13 +112,6 @@ fail: return ret; } -static int coroutine_fn -raw_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - QEMUIOVector *qiov) -{ - return raw_co_writev_flags(bs, sector_num, nb_sectors, qiov, 0); -} - static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum, @@ -134,17 +123,17 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, (sector_num << BDRV_SECTOR_BITS); } -static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BdrvRequestFlags flags) +static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, + BdrvRequestFlags flags) { - return bdrv_co_write_zeroes(bs->file->bs, sector_num, nb_sectors, flags); + return bdrv_co_pwrite_zeroes(bs->file, offset, count, flags); } -static int coroutine_fn raw_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) +static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs, + int64_t offset, int count) { - return bdrv_co_discard(bs->file->bs, sector_num, nb_sectors); + return bdrv_co_pdiscard(bs->file->bs, offset, count); } static int64_t raw_getlength(BlockDriverState *bs) @@ -159,7 +148,12 @@ static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) static void raw_refresh_limits(BlockDriverState *bs, Error **errp) { - bs->bl = bs->file->bs->bl; + if (bs->probed) { + /* To make it easier to protect the first sector, any probed + * image is restricted to read-modify-write on sub-sector + * operations. 
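Restricting probed images to BDRV_SECTOR_SIZE alignment (set just below) means the generic block layer widens sub-sector guest requests to sector boundaries and serves them read-modify-write. The rounding involved, as a minimal hypothetical sketch:

    #include <stdint.h>

    #define RA 512u   /* request_alignment, hypothetical fixed value */

    /* Round a byte range out to aligned head/tail, the shape the block
     * layer uses for read-modify-write of unaligned requests. */
    static void widen_to_alignment(uint64_t offset, uint64_t bytes,
                                   uint64_t *head, uint64_t *tail)
    {
        *head = offset - (offset % RA);                 /* round down */
        *tail = ((offset + bytes + RA - 1) / RA) * RA;  /* round up   */
    }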
*/ + bs->bl.request_alignment = BDRV_SECTOR_SIZE; + } } static int raw_truncate(BlockDriverState *bs, int64_t offset) @@ -197,20 +191,17 @@ static int raw_has_zero_init(BlockDriverState *bs) static int raw_create(const char *filename, QemuOpts *opts, Error **errp) { - Error *local_err = NULL; - int ret; - - ret = bdrv_create_file(filename, opts, &local_err); - if (local_err) { - error_propagate(errp, local_err); - } - return ret; + return bdrv_create_file(filename, opts, errp); } static int raw_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { bs->sg = bs->file->bs->sg; + bs->supported_write_flags = BDRV_REQ_FUA & + bs->file->bs->supported_write_flags; + bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) & + bs->file->bs->supported_zero_flags; if (bs->probed && !bdrv_is_read_only(bs)) { fprintf(stderr, @@ -255,12 +246,10 @@ BlockDriver bdrv_raw = { .bdrv_open = &raw_open, .bdrv_close = &raw_close, .bdrv_create = &raw_create, - .bdrv_co_readv = &raw_co_readv, - .bdrv_co_writev = &raw_co_writev, - .bdrv_co_writev_flags = &raw_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, - .bdrv_co_write_zeroes = &raw_co_write_zeroes, - .bdrv_co_discard = &raw_co_discard, + .bdrv_co_preadv = &raw_co_preadv, + .bdrv_co_pwritev = &raw_co_pwritev, + .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes, + .bdrv_co_pdiscard = &raw_co_pdiscard, .bdrv_co_get_block_status = &raw_co_get_block_status, .bdrv_truncate = &raw_truncate, .bdrv_getlength = &raw_getlength, diff --git a/block/rbd.c b/block/rbd.c index 5bc5b3253..0106fea45 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -290,7 +290,8 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf, if (only_read_conf_file) { ret = rados_conf_read_file(cluster, value); if (ret < 0) { - error_setg(errp, "error reading conf file %s", value); + error_setg_errno(errp, -ret, "error reading conf file %s", + value); break; } } @@ -299,7 +300,7 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf, } else if (!only_read_conf_file) { ret = rados_conf_set(cluster, name, value); if (ret < 0) { - error_setg(errp, "invalid conf option %s", name); + error_setg_errno(errp, -ret, "invalid conf option %s", name); ret = -EINVAL; break; } @@ -354,9 +355,10 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) } clientname = qemu_rbd_parse_clientname(conf, clientname_buf); - if (rados_create(&cluster, clientname) < 0) { - error_setg(errp, "error initializing"); - return -EIO; + ret = rados_create(&cluster, clientname); + if (ret < 0) { + error_setg_errno(errp, -ret, "error initializing"); + return ret; } if (strstr(conf, "conf=") == NULL) { @@ -381,21 +383,27 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) return -EIO; } - if (rados_connect(cluster) < 0) { - error_setg(errp, "error connecting"); + ret = rados_connect(cluster); + if (ret < 0) { + error_setg_errno(errp, -ret, "error connecting"); rados_shutdown(cluster); - return -EIO; + return ret; } - if (rados_ioctx_create(cluster, pool, &io_ctx) < 0) { - error_setg(errp, "error opening pool %s", pool); + ret = rados_ioctx_create(cluster, pool, &io_ctx); + if (ret < 0) { + error_setg_errno(errp, -ret, "error opening pool %s", pool); rados_shutdown(cluster); - return -EIO; + return ret; } ret = rbd_create(io_ctx, name, bytes, &obj_order); rados_ioctx_destroy(io_ctx); rados_shutdown(cluster); + if (ret < 0) { + error_setg_errno(errp, -ret, "error rbd create"); + return ret; + } return ret; } @@ -500,7 +508,7 @@ static int 
qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, clientname = qemu_rbd_parse_clientname(conf, clientname_buf); r = rados_create(&s->cluster, clientname); if (r < 0) { - error_setg(errp, "error initializing"); + error_setg_errno(errp, -r, "error initializing"); goto failed_opts; } @@ -546,19 +554,19 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, r = rados_connect(s->cluster); if (r < 0) { - error_setg(errp, "error connecting"); + error_setg_errno(errp, -r, "error connecting"); goto failed_shutdown; } r = rados_ioctx_create(s->cluster, pool, &s->io_ctx); if (r < 0) { - error_setg(errp, "error opening pool %s", pool); + error_setg_errno(errp, -r, "error opening pool %s", pool); goto failed_shutdown; } r = rbd_open(s->io_ctx, s->name, &s->image, s->snap); if (r < 0) { - error_setg(errp, "error reading header from %s", s->name); + error_setg_errno(errp, -r, "error reading header from %s", s->name); goto failed_open; } @@ -641,9 +649,9 @@ static int rbd_aio_flush_wrapper(rbd_image_t image, } static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, - int64_t sector_num, + int64_t off, QEMUIOVector *qiov, - int nb_sectors, + int64_t size, BlockCompletionFunc *cb, void *opaque, RBDAIOCmd cmd) @@ -651,7 +659,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, RBDAIOCB *acb; RADOSCB *rcb = NULL; rbd_completion_t c; - int64_t off, size; char *buf; int r; @@ -660,6 +667,7 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque); acb->cmd = cmd; acb->qiov = qiov; + assert(!qiov || qiov->size == size); if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) { acb->bounce = NULL; } else { @@ -679,9 +687,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, buf = acb->bounce; - off = sector_num * BDRV_SECTOR_SIZE; - size = nb_sectors * BDRV_SECTOR_SIZE; - rcb = g_new(RADOSCB, 1); rcb->acb = acb; rcb->buf = buf; @@ -731,7 +736,8 @@ static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque) { - return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque, + return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov, + nb_sectors << BDRV_SECTOR_BITS, cb, opaque, RBD_AIO_READ); } @@ -742,7 +748,8 @@ static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque) { - return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque, + return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov, + nb_sectors << BDRV_SECTOR_BITS, cb, opaque, RBD_AIO_WRITE); } @@ -875,10 +882,8 @@ static int qemu_rbd_snap_rollback(BlockDriverState *bs, const char *snapshot_name) { BDRVRBDState *s = bs->opaque; - int r; - r = rbd_snap_rollback(s->image, snapshot_name); - return r; + return rbd_snap_rollback(s->image, snapshot_name); } static int qemu_rbd_snap_list(BlockDriverState *bs, @@ -925,13 +930,13 @@ static int qemu_rbd_snap_list(BlockDriverState *bs, } #ifdef LIBRBD_SUPPORTS_DISCARD -static BlockAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) +static BlockAIOCB *qemu_rbd_aio_pdiscard(BlockDriverState *bs, + int64_t offset, + int count, + BlockCompletionFunc *cb, + void *opaque) { - return rbd_start_aio(bs, sector_num, NULL, nb_sectors, cb, opaque, + return rbd_start_aio(bs, offset, NULL, count, cb, opaque, RBD_AIO_DISCARD); } #endif @@ -995,7 +1000,7 @@ static BlockDriver bdrv_rbd = { #endif #ifdef LIBRBD_SUPPORTS_DISCARD - .bdrv_aio_discard = 
qemu_rbd_aio_discard, + .bdrv_aio_pdiscard = qemu_rbd_aio_pdiscard, #endif .bdrv_snapshot_create = qemu_rbd_snap_create, diff --git a/block/sheepdog.c b/block/sheepdog.c index 33e0a3382..66e1cb2b2 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -294,13 +294,16 @@ static inline size_t count_data_objs(const struct SheepdogInode *inode) #undef DPRINTF #ifdef DEBUG_SDOG -#define DPRINTF(fmt, args...) \ - do { \ - fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \ - } while (0) +#define DEBUG_SDOG_PRINT 1 #else -#define DPRINTF(fmt, args...) +#define DEBUG_SDOG_PRINT 0 #endif +#define DPRINTF(fmt, args...) \ + do { \ + if (DEBUG_SDOG_PRINT) { \ + fprintf(stderr, "%s %d: " fmt, __func__, __LINE__, ##args); \ + } \ + } while (0) typedef struct SheepdogAIOCB SheepdogAIOCB; @@ -492,7 +495,7 @@ static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req) static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb) { - qemu_coroutine_enter(acb->coroutine, NULL); + qemu_coroutine_enter(acb->coroutine); qemu_aio_unref(acb); } @@ -633,7 +636,7 @@ static void restart_co_req(void *opaque) { Coroutine *co = opaque; - qemu_coroutine_enter(co, NULL); + qemu_coroutine_enter(co); } typedef struct SheepdogReqCo { @@ -723,8 +726,8 @@ static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr, if (qemu_in_coroutine()) { do_co_req(&srco); } else { - co = qemu_coroutine_create(do_co_req); - qemu_coroutine_enter(co, &srco); + co = qemu_coroutine_create(do_co_req, &srco); + qemu_coroutine_enter(co); while (!srco.finished) { aio_poll(aio_context, true); } @@ -922,17 +925,17 @@ static void co_read_response(void *opaque) BDRVSheepdogState *s = opaque; if (!s->co_recv) { - s->co_recv = qemu_coroutine_create(aio_read_response); + s->co_recv = qemu_coroutine_create(aio_read_response, opaque); } - qemu_coroutine_enter(s->co_recv, opaque); + qemu_coroutine_enter(s->co_recv); } static void co_write_request(void *opaque) { BDRVSheepdogState *s = opaque; - qemu_coroutine_enter(s->co_send, NULL); + qemu_coroutine_enter(s->co_send); } /* @@ -1678,7 +1681,7 @@ static int sd_prealloc(const char *filename, Error **errp) if (ret < 0) { goto out; } - ret = blk_pwrite(blk, idx * buf_size, buf, buf_size); + ret = blk_pwrite(blk, idx * buf_size, buf, buf_size, 0); if (ret < 0) { goto out; } @@ -2781,17 +2784,24 @@ static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, return ret; } -static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data, - int64_t pos, int size) +static int sd_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t pos) { BDRVSheepdogState *s = bs->opaque; + void *buf; + int ret; - return do_load_save_vmstate(s, data, pos, size, 1); + buf = qemu_blockalign(bs, qiov->size); + ret = do_load_save_vmstate(s, buf, pos, qiov->size, 1); + qemu_iovec_from_buf(qiov, 0, buf, qiov->size); + qemu_vfree(buf); + + return ret; } -static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) +static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset, + int count) { SheepdogAIOCB *acb; BDRVSheepdogState *s = bs->opaque; @@ -2801,7 +2811,7 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, uint32_t zero = 0; if (!s->discard_supported) { - return 0; + return 0; } memset(&discard_iov, 0, sizeof(discard_iov)); @@ -2810,7 +2820,10 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, iov.iov_len = sizeof(zero); discard_iov.iov = &iov; discard_iov.niov = 1; - 
acb = sd_aio_setup(bs, &discard_iov, sector_num, nb_sectors); + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((count & (BDRV_SECTOR_SIZE - 1)) == 0); + acb = sd_aio_setup(bs, &discard_iov, offset >> BDRV_SECTOR_BITS, + count >> BDRV_SECTOR_BITS); acb->aiocb_type = AIOCB_DISCARD_OBJ; acb->aio_done_func = sd_finish_aiocb; @@ -2944,7 +2957,7 @@ static BlockDriver bdrv_sheepdog = { .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, - .bdrv_co_discard = sd_co_discard, + .bdrv_co_pdiscard = sd_co_pdiscard, .bdrv_co_get_block_status = sd_co_get_block_status, .bdrv_snapshot_create = sd_snapshot_create, @@ -2980,7 +2993,7 @@ static BlockDriver bdrv_sheepdog_tcp = { .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, - .bdrv_co_discard = sd_co_discard, + .bdrv_co_pdiscard = sd_co_pdiscard, .bdrv_co_get_block_status = sd_co_get_block_status, .bdrv_snapshot_create = sd_snapshot_create, @@ -3016,7 +3029,7 @@ static BlockDriver bdrv_sheepdog_unix = { .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, - .bdrv_co_discard = sd_co_discard, + .bdrv_co_pdiscard = sd_co_pdiscard, .bdrv_co_get_block_status = sd_co_get_block_status, .bdrv_snapshot_create = sd_snapshot_create, diff --git a/block/snapshot.c b/block/snapshot.c index e9d721df6..bf5c2ca5e 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -358,9 +358,7 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, ret = bdrv_snapshot_load_tmp(bs, NULL, id_or_name, &local_err); } - if (local_err) { - error_propagate(errp, local_err); - } + error_propagate(errp, local_err); return ret; } @@ -373,9 +371,10 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs) { bool ok = true; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator it; - while (ok && (bs = bdrv_next(bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -383,8 +382,12 @@ bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs) ok = bdrv_can_snapshot(bs); } aio_context_release(ctx); + if (!ok) { + goto fail; + } } +fail: *first_bad_bs = bs; return ok; } @@ -393,10 +396,11 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, Error **err) { int ret = 0; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator it; QEMUSnapshotInfo sn1, *snapshot = &sn1; - while (ret == 0 && (bs = bdrv_next(bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -405,8 +409,12 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, ret = bdrv_snapshot_delete_by_id_or_name(bs, name, err); } aio_context_release(ctx); + if (ret < 0) { + goto fail; + } } +fail: *first_bad_bs = bs; return ret; } @@ -415,9 +423,10 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs) { int err = 0; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator it; - while (err == 0 && (bs = bdrv_next(bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -425,8 +434,12 @@ int bdrv_all_goto_snapshot(const char *name, 
BlockDriverState **first_bad_bs) err = bdrv_snapshot_goto(bs, name); } aio_context_release(ctx); + if (err < 0) { + goto fail; + } } +fail: *first_bad_bs = bs; return err; } @@ -435,9 +448,10 @@ int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs) { QEMUSnapshotInfo sn; int err = 0; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator it; - while (err == 0 && (bs = bdrv_next(bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -445,8 +459,12 @@ int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs) err = bdrv_snapshot_find(bs, &sn, name); } aio_context_release(ctx); + if (err < 0) { + goto fail; + } } +fail: *first_bad_bs = bs; return err; } @@ -457,9 +475,10 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn, BlockDriverState **first_bad_bs) { int err = 0; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator it; - while (err == 0 && (bs = bdrv_next(bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -471,23 +490,32 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn, err = bdrv_snapshot_create(bs, sn); } aio_context_release(ctx); + if (err < 0) { + goto fail; + } } +fail: *first_bad_bs = bs; return err; } BlockDriverState *bdrv_all_find_vmstate_bs(void) { - bool not_found = true; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator it; - while (not_found && (bs = bdrv_next(bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *ctx = bdrv_get_aio_context(bs); + bool found; aio_context_acquire(ctx); - not_found = !bdrv_can_snapshot(bs); + found = bdrv_can_snapshot(bs); aio_context_release(ctx); + + if (found) { + break; + } } return bs; } diff --git a/block/ssh.c b/block/ssh.c index 06928ed93..5ce12b633 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -508,36 +508,73 @@ static int authenticate(BDRVSSHState *s, const char *user, Error **errp) return ret; } +static QemuOptsList ssh_runtime_opts = { + .name = "ssh", + .head = QTAILQ_HEAD_INITIALIZER(ssh_runtime_opts.head), + .desc = { + { + .name = "host", + .type = QEMU_OPT_STRING, + .help = "Host to connect to", + }, + { + .name = "port", + .type = QEMU_OPT_NUMBER, + .help = "Port to connect to", + }, + { + .name = "path", + .type = QEMU_OPT_STRING, + .help = "Path of the image on the host", + }, + { + .name = "user", + .type = QEMU_OPT_STRING, + .help = "User as which to connect", + }, + { + .name = "host_key_check", + .type = QEMU_OPT_STRING, + .help = "Defines how and what to check the host key against", + }, + }, +}; + static int connect_to_ssh(BDRVSSHState *s, QDict *options, int ssh_flags, int creat_mode, Error **errp) { int r, ret; + QemuOpts *opts = NULL; + Error *local_err = NULL; const char *host, *user, *path, *host_key_check; int port; - if (!qdict_haskey(options, "host")) { + opts = qemu_opts_create(&ssh_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { ret = -EINVAL; - error_setg(errp, "No hostname was specified"); + error_propagate(errp, local_err); goto err; } - host = qdict_get_str(options, "host"); - if (qdict_haskey(options, "port")) { - port = qdict_get_int(options, "port"); - } else { - port = 22; + host = qemu_opt_get(opts, "host"); + if (!host) { + ret = -EINVAL; + error_setg(errp, "No hostname was specified"); + goto err; } - if (!qdict_haskey(options, "path")) { 
+ port = qemu_opt_get_number(opts, "port", 22); + + path = qemu_opt_get(opts, "path"); + if (!path) { ret = -EINVAL; error_setg(errp, "No path was specified"); goto err; } - path = qdict_get_str(options, "path"); - if (qdict_haskey(options, "user")) { - user = qdict_get_str(options, "user"); - } else { + user = qemu_opt_get(opts, "user"); + if (!user) { user = g_get_user_name(); if (!user) { error_setg_errno(errp, errno, "Can't get user name"); @@ -546,9 +583,8 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options, } } - if (qdict_haskey(options, "host_key_check")) { - host_key_check = qdict_get_str(options, "host_key_check"); - } else { + host_key_check = qemu_opt_get(opts, "host_key_check"); + if (!host_key_check) { host_key_check = "yes"; } @@ -612,21 +648,14 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options, goto err; } + qemu_opts_del(opts); + r = libssh2_sftp_fstat(s->sftp_handle, &s->attrs); if (r < 0) { sftp_error_setg(errp, s, "failed to read file attributes"); return -EINVAL; } - /* Delete the options we've used; any not deleted will cause the - * block layer to give an error about unused options. - */ - qdict_del(options, "host"); - qdict_del(options, "port"); - qdict_del(options, "user"); - qdict_del(options, "path"); - qdict_del(options, "host_key_check"); - return 0; err: @@ -646,6 +675,8 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options, } s->session = NULL; + qemu_opts_del(opts); + return ret; } @@ -777,7 +808,7 @@ static void restart_coroutine(void *opaque) DPRINTF("co=%p", co); - qemu_coroutine_enter(co, NULL); + qemu_coroutine_enter(co); } static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs) diff --git a/block/stream.c b/block/stream.c index 332b9a183..31874817c 100644 --- a/block/stream.c +++ b/block/stream.c @@ -39,7 +39,7 @@ typedef struct StreamBlockJob { char *backing_file_str; } StreamBlockJob; -static int coroutine_fn stream_populate(BlockDriverState *bs, +static int coroutine_fn stream_populate(BlockBackend *blk, int64_t sector_num, int nb_sectors, void *buf) { @@ -52,7 +52,8 @@ static int coroutine_fn stream_populate(BlockDriverState *bs, qemu_iovec_init_external(&qiov, &iov, 1); /* Copy-on-read the unallocated clusters */ - return bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, &qiov); + return blk_co_preadv(blk, sector_num * BDRV_SECTOR_SIZE, qiov.size, &qiov, + BDRV_REQ_COPY_ON_READ); } typedef struct { @@ -64,6 +65,7 @@ static void stream_complete(BlockJob *job, void *opaque) { StreamBlockJob *s = container_of(job, StreamBlockJob, common); StreamCompleteData *data = opaque; + BlockDriverState *bs = blk_bs(job->blk); BlockDriverState *base = s->base; if (!block_job_is_cancelled(&s->common) && data->reached_end && @@ -75,8 +77,8 @@ static void stream_complete(BlockJob *job, void *opaque) base_fmt = base->drv->format_name; } } - data->ret = bdrv_change_backing_file(job->bs, base_id, base_fmt); - bdrv_set_backing_hd(job->bs, base); + data->ret = bdrv_change_backing_file(bs, base_id, base_fmt); + bdrv_set_backing_hd(bs, base); } g_free(s->backing_file_str); @@ -88,10 +90,12 @@ static void coroutine_fn stream_run(void *opaque) { StreamBlockJob *s = opaque; StreamCompleteData *data; - BlockDriverState *bs = s->common.bs; + BlockBackend *blk = s->common.blk; + BlockDriverState *bs = blk_bs(blk); BlockDriverState *base = s->base; int64_t sector_num = 0; int64_t end = -1; + uint64_t delay_ns = 0; int error = 0; int ret = 0; int n = 0; @@ -120,10 +124,8 @@ static void coroutine_fn stream_run(void *opaque) } for 
(sector_num = 0; sector_num < end; sector_num += n) { - uint64_t delay_ns = 0; bool copy; -wait: /* Note that even when no rate limit is applied we need to yield * with no pending I/O here so that bdrv_drain_all() returns. */ @@ -153,18 +155,11 @@ wait: } trace_stream_one_iteration(s, sector_num, n, ret); if (copy) { - if (s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, n); - if (delay_ns > 0) { - goto wait; - } - } - ret = stream_populate(bs, sector_num, n, buf); + ret = stream_populate(blk, sector_num, n, buf); } if (ret < 0) { BlockErrorAction action = - block_job_error_action(&s->common, s->common.bs, s->on_error, - true, -ret); + block_job_error_action(&s->common, s->on_error, true, -ret); if (action == BLOCK_ERROR_ACTION_STOP) { n = 0; continue; @@ -180,6 +175,9 @@ wait: /* Publish progress */ s->common.offset += n * BDRV_SECTOR_SIZE; + if (copy && s->common.speed) { + delay_ns = ratelimit_calculate_delay(&s->limit, n); + } } if (!base) { @@ -216,22 +214,15 @@ static const BlockJobDriver stream_job_driver = { .set_speed = stream_set_speed, }; -void stream_start(BlockDriverState *bs, BlockDriverState *base, - const char *backing_file_str, int64_t speed, - BlockdevOnError on_error, - BlockCompletionFunc *cb, - void *opaque, Error **errp) +void stream_start(const char *job_id, BlockDriverState *bs, + BlockDriverState *base, const char *backing_file_str, + int64_t speed, BlockdevOnError on_error, + BlockCompletionFunc *cb, void *opaque, Error **errp) { StreamBlockJob *s; - if ((on_error == BLOCKDEV_ON_ERROR_STOP || - on_error == BLOCKDEV_ON_ERROR_ENOSPC) && - (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { - error_setg(errp, QERR_INVALID_PARAMETER, "on-error"); - return; - } - - s = block_job_create(&stream_job_driver, bs, speed, cb, opaque, errp); + s = block_job_create(job_id, &stream_job_driver, bs, speed, + cb, opaque, errp); if (!s) { return; } @@ -240,7 +231,7 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base, s->backing_file_str = g_strdup(backing_file_str); s->on_error = on_error; - s->common.co = qemu_coroutine_create(stream_run); + s->common.co = qemu_coroutine_create(stream_run, s); trace_stream_start(bs, base, s, s->common.co, opaque); - qemu_coroutine_enter(s->common.co, s); + qemu_coroutine_enter(s->common.co); } diff --git a/block/throttle-groups.c b/block/throttle-groups.c index 4920e0949..59545e287 100644 --- a/block/throttle-groups.c +++ b/block/throttle-groups.c @@ -23,13 +23,14 @@ */ #include "qemu/osdep.h" +#include "sysemu/block-backend.h" #include "block/throttle-groups.h" #include "qemu/queue.h" #include "qemu/thread.h" #include "sysemu/qtest.h" /* The ThrottleGroup structure (with its ThrottleState) is shared - * among different BlockDriverState and it's independent from + * among different BlockBackends and it's independent from * AioContext, so in order to use it from different threads it needs * its own locking. * @@ -39,26 +40,26 @@ * The whole ThrottleGroup structure is private and invisible to * outside users, that only use it through its ThrottleState. * - * In addition to the ThrottleGroup structure, BlockDriverState has + * In addition to the ThrottleGroup structure, BlockBackendPublic has * fields that need to be accessed by other members of the group and - * therefore also need to be protected by this lock. Once a BDS is - * registered in a group those fields can be accessed by other threads - * any time. + * therefore also need to be protected by this lock. 
Once a + * BlockBackend is registered in a group those fields can be accessed + * by other threads any time. * * Again, all this is handled internally and is mostly transparent to * the outside. The 'throttle_timers' field however has an additional * constraint because it may be temporarily invalid (see for example * bdrv_set_aio_context()). Therefore in this file a thread will - * access some other BDS's timers only after verifying that that BDS - * has throttled requests in the queue. + * access some other BlockBackend's timers only after verifying that + * that BlockBackend has throttled requests in the queue. */ typedef struct ThrottleGroup { char *name; /* This is constant during the lifetime of the group */ QemuMutex lock; /* This lock protects the following four fields */ ThrottleState ts; - QLIST_HEAD(, BlockDriverState) head; - BlockDriverState *tokens[2]; + QLIST_HEAD(, BlockBackendPublic) head; + BlockBackend *tokens[2]; bool any_timer_armed[2]; /* These two are protected by the global throttle_groups_lock */ @@ -132,93 +133,98 @@ void throttle_group_unref(ThrottleState *ts) qemu_mutex_unlock(&throttle_groups_lock); } -/* Get the name from a BlockDriverState's ThrottleGroup. The name (and - * the pointer) is guaranteed to remain constant during the lifetime - * of the group. +/* Get the name from a BlockBackend's ThrottleGroup. The name (and the pointer) + * is guaranteed to remain constant during the lifetime of the group. * - * @bs: a BlockDriverState that is member of a throttling group + * @blk: a BlockBackend that is member of a throttling group * @ret: the name of the group. */ -const char *throttle_group_get_name(BlockDriverState *bs) +const char *throttle_group_get_name(BlockBackend *blk) { - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts); return tg->name; } -/* Return the next BlockDriverState in the round-robin sequence, - * simulating a circular list. +/* Return the next BlockBackend in the round-robin sequence, simulating a + * circular list. * * This assumes that tg->lock is held. * - * @bs: the current BlockDriverState - * @ret: the next BlockDriverState in the sequence + * @blk: the current BlockBackend + * @ret: the next BlockBackend in the sequence */ -static BlockDriverState *throttle_group_next_bs(BlockDriverState *bs) +static BlockBackend *throttle_group_next_blk(BlockBackend *blk) { - ThrottleState *ts = bs->throttle_state; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleState *ts = blkp->throttle_state; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); - BlockDriverState *next = QLIST_NEXT(bs, round_robin); + BlockBackendPublic *next = QLIST_NEXT(blkp, round_robin); if (!next) { - return QLIST_FIRST(&tg->head); + next = QLIST_FIRST(&tg->head); } - return next; + return blk_by_public(next); } -/* Return the next BlockDriverState in the round-robin sequence with - * pending I/O requests. +/* Return the next BlockBackend in the round-robin sequence with pending I/O + * requests. * * This assumes that tg->lock is held. * - * @bs: the current BlockDriverState + * @blk: the current BlockBackend * @is_write: the type of operation (read/write) - * @ret: the next BlockDriverState with pending requests, or bs - * if there is none. + * @ret: the next BlockBackend with pending requests, or blk if there is + * none. 
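next_throttle_token(), documented above, walks the group's circular member list starting just past the current token until it finds a member with pending requests, and falls back to the caller when the scan wraps with nothing queued. The same idea on a bare circular list, with hypothetical types:

    typedef struct Member {
        struct Member *next;   /* circular, singly linked */
        int pending;           /* queued requests of the given type */
    } Member;

    static Member *next_token(Member *current, Member *start)
    {
        Member *m = start->next;

        while (m != start && !m->pending) {
            m = m->next;
        }
        /* Wrapped around with nothing pending: favor the caller, whose
         * request is about to be queued anyway. */
        return (m == start && !m->pending) ? current : m;
    }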
*/ -static BlockDriverState *next_throttle_token(BlockDriverState *bs, - bool is_write) +static BlockBackend *next_throttle_token(BlockBackend *blk, bool is_write) { - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); - BlockDriverState *token, *start; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts); + BlockBackend *token, *start; start = token = tg->tokens[is_write]; /* get next bs round in round robin style */ - token = throttle_group_next_bs(token); - while (token != start && !token->pending_reqs[is_write]) { - token = throttle_group_next_bs(token); + token = throttle_group_next_blk(token); + while (token != start && !blkp->pending_reqs[is_write]) { + token = throttle_group_next_blk(token); } /* If no IO are queued for scheduling on the next round robin token * then decide the token is the current bs because chances are * the current bs get the current request queued. */ - if (token == start && !token->pending_reqs[is_write]) { - token = bs; + if (token == start && !blkp->pending_reqs[is_write]) { + token = blk; } return token; } -/* Check if the next I/O request for a BlockDriverState needs to be - * throttled or not. If there's no timer set in this group, set one - * and update the token accordingly. +/* Check if the next I/O request for a BlockBackend needs to be throttled or + * not. If there's no timer set in this group, set one and update the token + * accordingly. * * This assumes that tg->lock is held. * - * @bs: the current BlockDriverState + * @blk: the current BlockBackend * @is_write: the type of operation (read/write) * @ret: whether the I/O request needs to be throttled or not */ -static bool throttle_group_schedule_timer(BlockDriverState *bs, - bool is_write) +static bool throttle_group_schedule_timer(BlockBackend *blk, bool is_write) { - ThrottleState *ts = bs->throttle_state; - ThrottleTimers *tt = &bs->throttle_timers; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleState *ts = blkp->throttle_state; + ThrottleTimers *tt = &blkp->throttle_timers; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); bool must_wait; + if (blkp->io_limits_disabled) { + return false; + } + /* Check if any of the timers in this group is already armed */ if (tg->any_timer_armed[is_write]) { return true; @@ -226,9 +232,9 @@ static bool throttle_group_schedule_timer(BlockDriverState *bs, must_wait = throttle_schedule_timer(ts, tt, is_write); - /* If a timer just got armed, set bs as the current token */ + /* If a timer just got armed, set blk as the current token */ if (must_wait) { - tg->tokens[is_write] = bs; + tg->tokens[is_write] = blk; tg->any_timer_armed[is_write] = true; } @@ -239,18 +245,19 @@ static bool throttle_group_schedule_timer(BlockDriverState *bs, * * This assumes that tg->lock is held. 
* - * @bs: the current BlockDriverState + * @blk: the current BlockBackend * @is_write: the type of operation (read/write) */ -static void schedule_next_request(BlockDriverState *bs, bool is_write) +static void schedule_next_request(BlockBackend *blk, bool is_write) { - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts); bool must_wait; - BlockDriverState *token; + BlockBackend *token; /* Check if there's any pending request to schedule next */ - token = next_throttle_token(bs, is_write); - if (!token->pending_reqs[is_write]) { + token = next_throttle_token(blk, is_write); + if (!blkp->pending_reqs[is_write]) { return; } @@ -259,12 +266,12 @@ static void schedule_next_request(BlockDriverState *bs, bool is_write) /* If it doesn't have to wait, queue it for immediate execution */ if (!must_wait) { - /* Give preference to requests from the current bs */ + /* Give preference to requests from the current blk */ if (qemu_in_coroutine() && - qemu_co_queue_next(&bs->throttled_reqs[is_write])) { - token = bs; + qemu_co_queue_next(&blkp->throttled_reqs[is_write])) { + token = blk; } else { - ThrottleTimers *tt = &token->throttle_timers; + ThrottleTimers *tt = &blkp->throttle_timers; int64_t now = qemu_clock_get_ns(tt->clock_type); timer_mod(tt->timers[is_write], now + 1); tg->any_timer_armed[is_write] = true; @@ -277,53 +284,67 @@ static void schedule_next_request(BlockDriverState *bs, bool is_write) * if necessary, and schedule the next request using a round robin * algorithm. * - * @bs: the current BlockDriverState + * @blk: the current BlockBackend * @bytes: the number of bytes for this I/O * @is_write: the type of operation (read/write) */ -void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs, +void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk, unsigned int bytes, bool is_write) { bool must_wait; - BlockDriverState *token; + BlockBackend *token; - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts); qemu_mutex_lock(&tg->lock); /* First we check if this I/O has to be throttled. 
*/ - token = next_throttle_token(bs, is_write); + token = next_throttle_token(blk, is_write); must_wait = throttle_group_schedule_timer(token, is_write); /* Wait if there's a timer set or queued requests of this type */ - if (must_wait || bs->pending_reqs[is_write]) { - bs->pending_reqs[is_write]++; + if (must_wait || blkp->pending_reqs[is_write]) { + blkp->pending_reqs[is_write]++; qemu_mutex_unlock(&tg->lock); - qemu_co_queue_wait(&bs->throttled_reqs[is_write]); + qemu_co_queue_wait(&blkp->throttled_reqs[is_write]); qemu_mutex_lock(&tg->lock); - bs->pending_reqs[is_write]--; + blkp->pending_reqs[is_write]--; } /* The I/O will be executed, so do the accounting */ - throttle_account(bs->throttle_state, is_write, bytes); + throttle_account(blkp->throttle_state, is_write, bytes); /* Schedule the next request */ - schedule_next_request(bs, is_write); + schedule_next_request(blk, is_write); qemu_mutex_unlock(&tg->lock); } +void throttle_group_restart_blk(BlockBackend *blk) +{ + BlockBackendPublic *blkp = blk_get_public(blk); + int i; + + for (i = 0; i < 2; i++) { + while (qemu_co_enter_next(&blkp->throttled_reqs[i])) { + ; + } + } +} + /* Update the throttle configuration for a particular group. Similar * to throttle_config(), but guarantees atomicity within the * throttling group. * - * @bs: a BlockDriverState that is member of the group + * @blk: a BlockBackend that is a member of the group * @cfg: the configuration to set */ -void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg) +void throttle_group_config(BlockBackend *blk, ThrottleConfig *cfg) { - ThrottleTimers *tt = &bs->throttle_timers; - ThrottleState *ts = bs->throttle_state; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleTimers *tt = &blkp->throttle_timers; + ThrottleState *ts = blkp->throttle_state; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); qemu_mutex_lock(&tg->lock); /* throttle_config() cancels the timers */ @@ -335,18 +356,22 @@ void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg) } throttle_config(ts, tt, cfg); qemu_mutex_unlock(&tg->lock); + + qemu_co_enter_next(&blkp->throttled_reqs[0]); + qemu_co_enter_next(&blkp->throttled_reqs[1]); } /* Get the throttle configuration from a particular group. Similar to * throttle_get_config(), but guarantees atomicity within the * throttling group. * - * @bs: a BlockDriverState that is member of the group + * @blk: a BlockBackend that is a member of the group * @cfg: the configuration will be written here */ -void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg) +void throttle_group_get_config(BlockBackend *blk, ThrottleConfig *cfg) { - ThrottleState *ts = bs->throttle_state; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleState *ts = blkp->throttle_state; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); qemu_mutex_lock(&tg->lock); throttle_get_config(ts, cfg); @@ -356,12 +381,13 @@ void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg) /* ThrottleTimers callback. This wakes up a request that was waiting * because it had been throttled. 
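The new throttle_group_restart_blk() above drains both throttled-request queues by re-entering queued coroutines until none remain, and throttle_group_config() now wakes one waiter per queue so that pending requests re-evaluate the updated limits. A toy model of the drain loop, with a plain counter standing in for QEMU's CoQueue:

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for a coroutine queue: enter_next() resumes one waiter
 * and reports whether there was one. Not QEMU's CoQueue. */
typedef struct {
    int waiters;
} CoQueueModel;

static bool enter_next(CoQueueModel *q)
{
    if (q->waiters == 0) {
        return false;
    }
    q->waiters--;   /* one queued request wakes up and re-checks limits */
    return true;
}

int main(void)
{
    CoQueueModel throttled_reqs[2] = { { 3 }, { 1 } }; /* [read, write] */

    for (int i = 0; i < 2; i++) {
        while (enter_next(&throttled_reqs[i])) {
            ;
        }
    }
    printf("remaining: %d reads, %d writes\n",
           throttled_reqs[0].waiters, throttled_reqs[1].waiters);
    return 0;
}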
* - * @bs: the BlockDriverState whose request had been throttled + * @blk: the BlockBackend whose request had been throttled * @is_write: the type of operation (read/write) */ -static void timer_cb(BlockDriverState *bs, bool is_write) +static void timer_cb(BlockBackend *blk, bool is_write) { - ThrottleState *ts = bs->throttle_state; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleState *ts = blkp->throttle_state; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); bool empty_queue; @@ -371,13 +397,13 @@ static void timer_cb(BlockDriverState *bs, bool is_write) qemu_mutex_unlock(&tg->lock); /* Run the request that was waiting for this timer */ - empty_queue = !qemu_co_enter_next(&bs->throttled_reqs[is_write]); + empty_queue = !qemu_co_enter_next(&blkp->throttled_reqs[is_write]); /* If the request queue was empty then we have to take care of * scheduling the next one */ if (empty_queue) { qemu_mutex_lock(&tg->lock); - schedule_next_request(bs, is_write); + schedule_next_request(blk, is_write); qemu_mutex_unlock(&tg->lock); } } @@ -392,17 +418,17 @@ static void write_timer_cb(void *opaque) timer_cb(opaque, true); } -/* Register a BlockDriverState in the throttling group, also - * initializing its timers and updating its throttle_state pointer to - * point to it. If a throttling group with that name does not exist - * yet, it will be created. +/* Register a BlockBackend in the throttling group, also initializing its + * timers and updating its throttle_state pointer to point to it. If a + * throttling group with that name does not exist yet, it will be created. * - * @bs: the BlockDriverState to insert + * @blk: the BlockBackend to insert * @groupname: the name of the group */ -void throttle_group_register_bs(BlockDriverState *bs, const char *groupname) +void throttle_group_register_blk(BlockBackend *blk, const char *groupname) { int i; + BlockBackendPublic *blkp = blk_get_public(blk); ThrottleState *ts = throttle_group_incref(groupname); ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); int clock_type = QEMU_CLOCK_REALTIME; @@ -412,67 +438,67 @@ void throttle_group_register_bs(BlockDriverState *bs, const char *groupname) clock_type = QEMU_CLOCK_VIRTUAL; } - bs->throttle_state = ts; + blkp->throttle_state = ts; qemu_mutex_lock(&tg->lock); - /* If the ThrottleGroup is new set this BlockDriverState as the token */ + /* If the ThrottleGroup is new set this BlockBackend as the token */ for (i = 0; i < 2; i++) { if (!tg->tokens[i]) { - tg->tokens[i] = bs; + tg->tokens[i] = blk; } } - QLIST_INSERT_HEAD(&tg->head, bs, round_robin); + QLIST_INSERT_HEAD(&tg->head, blkp, round_robin); - throttle_timers_init(&bs->throttle_timers, - bdrv_get_aio_context(bs), + throttle_timers_init(&blkp->throttle_timers, + blk_get_aio_context(blk), clock_type, read_timer_cb, write_timer_cb, - bs); + blk); qemu_mutex_unlock(&tg->lock); } -/* Unregister a BlockDriverState from its group, removing it from the - * list, destroying the timers and setting the throttle_state pointer - * to NULL. +/* Unregister a BlockBackend from its group, removing it from the list, + * destroying the timers and setting the throttle_state pointer to NULL. * - * The BlockDriverState must not have pending throttled requests, so - * the caller has to drain them first. + * The BlockBackend must not have pending throttled requests, so the caller has + * to drain them first. * * The group will be destroyed if it's empty after this operation. 
* - * @bs: the BlockDriverState to remove + * @blk: the BlockBackend to remove */ -void throttle_group_unregister_bs(BlockDriverState *bs) +void throttle_group_unregister_blk(BlockBackend *blk) { - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts); int i; - assert(bs->pending_reqs[0] == 0 && bs->pending_reqs[1] == 0); - assert(qemu_co_queue_empty(&bs->throttled_reqs[0])); - assert(qemu_co_queue_empty(&bs->throttled_reqs[1])); + assert(blkp->pending_reqs[0] == 0 && blkp->pending_reqs[1] == 0); + assert(qemu_co_queue_empty(&blkp->throttled_reqs[0])); + assert(qemu_co_queue_empty(&blkp->throttled_reqs[1])); qemu_mutex_lock(&tg->lock); for (i = 0; i < 2; i++) { - if (tg->tokens[i] == bs) { - BlockDriverState *token = throttle_group_next_bs(bs); - /* Take care of the case where this is the last bs in the group */ - if (token == bs) { + if (tg->tokens[i] == blk) { + BlockBackend *token = throttle_group_next_blk(blk); + /* Take care of the case where this is the last blk in the group */ + if (token == blk) { token = NULL; } tg->tokens[i] = token; } } - /* remove the current bs from the list */ - QLIST_REMOVE(bs, round_robin); - throttle_timers_destroy(&bs->throttle_timers); + /* remove the current blk from the list */ + QLIST_REMOVE(blkp, round_robin); + throttle_timers_destroy(&blkp->throttle_timers); qemu_mutex_unlock(&tg->lock); throttle_group_unref(&tg->ts); - bs->throttle_state = NULL; + blkp->throttle_state = NULL; } static void throttle_groups_init(void) diff --git a/block/trace-events b/block/trace-events new file mode 100644 index 000000000..05fa13c89 --- /dev/null +++ b/block/trace-events @@ -0,0 +1,116 @@ +# See docs/tracing.txt for syntax documentation. 
+
+# block.c
+bdrv_open_common(void *bs, const char *filename, int flags, const char *format_name) "bs %p filename \"%s\" flags %#x format_name \"%s\""
+bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d"
+
+# block/block-backend.c
+blk_co_preadv(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags) "blk %p bs %p offset %"PRId64" bytes %u flags %x"
+blk_co_pwritev(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags) "blk %p bs %p offset %"PRId64" bytes %u flags %x"
+
+# block/io.c
+bdrv_aio_pdiscard(void *bs, int64_t offset, int count, void *opaque) "bs %p offset %"PRId64" count %d opaque %p"
+bdrv_aio_flush(void *bs, void *opaque) "bs %p opaque %p"
+bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
+bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
+bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
+bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
+bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p offset %"PRId64" count %d flags %#x"
+bdrv_co_do_copy_on_readv(void *bs, int64_t offset, unsigned int bytes, int64_t cluster_offset, unsigned int cluster_bytes) "bs %p offset %"PRId64" bytes %u cluster_offset %"PRId64" cluster_bytes %u"
+
+# block/stream.c
+stream_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
+stream_start(void *bs, void *base, void *s, void *co, void *opaque) "bs %p base %p s %p co %p opaque %p"
+
+# block/commit.c
+commit_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
+commit_start(void *bs, void *base, void *top, void *s, void *co, void *opaque) "bs %p base %p top %p s %p co %p opaque %p"
+
+# block/mirror.c
+mirror_start(void *bs, void *s, void *co, void *opaque) "bs %p s %p co %p opaque %p"
+mirror_restart_iter(void *s, int64_t cnt) "s %p dirty count %"PRId64
+mirror_before_flush(void *s) "s %p"
+mirror_before_drain(void *s, int64_t cnt) "s %p dirty count %"PRId64
+mirror_before_sleep(void *s, int64_t cnt, int synced, uint64_t delay_ns) "s %p dirty count %"PRId64" synced %d delay %"PRIu64"ns"
+mirror_one_iteration(void *s, int64_t sector_num, int nb_sectors) "s %p sector_num %"PRId64" nb_sectors %d"
+mirror_iteration_done(void *s, int64_t sector_num, int nb_sectors, int ret) "s %p sector_num %"PRId64" nb_sectors %d ret %d"
+mirror_yield(void *s, int64_t cnt, int buf_free_count, int in_flight) "s %p dirty count %"PRId64" free buffers %d in_flight %d"
+mirror_yield_in_flight(void *s, int64_t sector_num, int in_flight) "s %p sector_num %"PRId64" in_flight %d"
+mirror_yield_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d"
+mirror_break_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d"
+
+# block/backup.c
+backup_do_cow_enter(void *job, int64_t start, int64_t sector_num, int nb_sectors) "job %p start %"PRId64" sector_num %"PRId64" nb_sectors %d"
+backup_do_cow_return(void *job, int64_t sector_num, int nb_sectors, int ret) "job %p sector_num %"PRId64" nb_sectors %d ret %d"
+backup_do_cow_skip(void *job, int64_t start) "job %p start %"PRId64
+backup_do_cow_process(void *job, int64_t start) "job %p start %"PRId64
+backup_do_cow_read_fail(void *job, int64_t start, int ret) "job %p start %"PRId64" ret %d"
+backup_do_cow_write_fail(void *job, int64_t start, int ret) "job %p start %"PRId64" ret %d"
+
+# blockdev.c
+qmp_block_job_cancel(void *job) "job %p"
+qmp_block_job_pause(void *job) "job %p"
+qmp_block_job_resume(void *job) "job %p"
+qmp_block_job_complete(void *job) "job %p"
+block_job_cb(void *bs, void *job, int ret) "bs %p job %p ret %d"
+qmp_block_stream(void *bs, void *job) "bs %p job %p"
+
+# block/raw-win32.c
+# block/raw-posix.c
+paio_submit_co(int64_t offset, int count, int type) "offset %"PRId64" count %d type %d"
+paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d"
+
+# block/qcow2.c
+qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset %" PRIx64 " bytes %d"
+qcow2_writev_done_req(void *co, int ret) "co %p ret %d"
+qcow2_writev_start_part(void *co) "co %p"
+qcow2_writev_done_part(void *co, int cur_bytes) "co %p cur_bytes %d"
+qcow2_writev_data(void *co, uint64_t offset) "co %p offset %" PRIx64
+qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int count) "co %p offset %" PRIx64 " count %d"
+qcow2_pwrite_zeroes(void *co, int64_t offset, int count) "co %p offset %" PRIx64 " count %d"
+
+# block/qcow2-cluster.c
+qcow2_alloc_clusters_offset(void *co, uint64_t offset, int bytes) "co %p offset %" PRIx64 " bytes %d"
+qcow2_handle_copied(void *co, uint64_t guest_offset, uint64_t host_offset, uint64_t bytes) "co %p guest_offset %" PRIx64 " host_offset %" PRIx64 " bytes %" PRIx64
+qcow2_handle_alloc(void *co, uint64_t guest_offset, uint64_t host_offset, uint64_t bytes) "co %p guest_offset %" PRIx64 " host_offset %" PRIx64 " bytes %" PRIx64
+qcow2_do_alloc_clusters_offset(void *co, uint64_t guest_offset, uint64_t host_offset, int nb_clusters) "co %p guest_offset %" PRIx64 " host_offset %" PRIx64 " nb_clusters %d"
+qcow2_cluster_alloc_phys(void *co) "co %p"
+qcow2_cluster_link_l2(void *co, int nb_clusters) "co %p nb_clusters %d"
+
+qcow2_l2_allocate(void *bs, int l1_index) "bs %p l1_index %d"
+qcow2_l2_allocate_get_empty(void *bs, int l1_index) "bs %p l1_index %d"
+qcow2_l2_allocate_write_l2(void *bs, int l1_index) "bs %p l1_index %d"
+qcow2_l2_allocate_write_l1(void *bs, int l1_index) "bs %p l1_index %d"
+qcow2_l2_allocate_done(void *bs, int l1_index, int ret) "bs %p l1_index %d ret %d"
+
+# block/qcow2-cache.c
+qcow2_cache_get(void *co, int c, uint64_t offset, bool read_from_disk) "co %p is_l2_cache %d offset %" PRIx64 " read_from_disk %d"
+qcow2_cache_get_replace_entry(void *co, int c, int i) "co %p is_l2_cache %d index %d"
+qcow2_cache_get_read(void *co, int c, int i) "co %p is_l2_cache %d index %d"
+qcow2_cache_get_done(void *co, int c, int i) "co %p is_l2_cache %d index %d"
+qcow2_cache_flush(void *co, int c) "co %p is_l2_cache %d"
+qcow2_cache_entry_flush(void *co, int c, int i) "co %p is_l2_cache %d index %d"
+
+# block/qed-l2-cache.c
+qed_alloc_l2_cache_entry(void *l2_cache, void *entry) "l2_cache %p entry %p"
+qed_unref_l2_cache_entry(void *entry, int ref) "entry %p ref %d"
+qed_find_l2_cache_entry(void *l2_cache, void *entry, uint64_t offset, int ref) "l2_cache %p entry %p offset %"PRIu64" ref %d"
+
+# block/qed-table.c
+qed_read_table(void *s, uint64_t offset, void *table) "s %p offset %"PRIu64" table %p"
+qed_read_table_cb(void *s, void *table, int ret) "s %p table %p ret %d"
+qed_write_table(void *s, uint64_t offset, void *table, unsigned int index, unsigned int n) "s %p offset 
%"PRIu64" table %p index %u n %u" +qed_write_table_cb(void *s, void *table, int flush, int ret) "s %p table %p flush %d ret %d" + +# block/qed.c +qed_need_check_timer_cb(void *s) "s %p" +qed_start_need_check_timer(void *s) "s %p" +qed_cancel_need_check_timer(void *s) "s %p" +qed_aio_complete(void *s, void *acb, int ret) "s %p acb %p ret %d" +qed_aio_setup(void *s, void *acb, int64_t sector_num, int nb_sectors, void *opaque, int flags) "s %p acb %p sector_num %"PRId64" nb_sectors %d opaque %p flags %#x" +qed_aio_next_io(void *s, void *acb, int ret, uint64_t cur_pos) "s %p acb %p ret %d cur_pos %"PRIu64 +qed_aio_read_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu" +qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu" +qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 +qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 +qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu" diff --git a/block/vdi.c b/block/vdi.c index 75d4819ed..8a1cf9792 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -54,6 +54,7 @@ #include "block/block_int.h" #include "sysemu/block-backend.h" #include "qemu/module.h" +#include "qemu/bswap.h" #include "migration/migration.h" #include "qemu/coroutine.h" #include "qemu/cutils.h" @@ -402,7 +403,7 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, logout("\n"); - ret = bdrv_read(bs->file->bs, 0, (uint8_t *)&header, 1); + ret = bdrv_read(bs->file, 0, (uint8_t *)&header, 1); if (ret < 0) { goto fail; } @@ -499,7 +500,7 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - ret = bdrv_read(bs->file->bs, s->bmap_sector, (uint8_t *)s->bmap, + ret = bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size); if (ret < 0) { goto fail_free_bmap; @@ -557,98 +558,109 @@ static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs, return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; } -static int vdi_co_read(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors) +static int coroutine_fn +vdi_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVVdiState *s = bs->opaque; + QEMUIOVector local_qiov; uint32_t bmap_entry; uint32_t block_index; - uint32_t sector_in_block; - uint32_t n_sectors; + uint32_t offset_in_block; + uint32_t n_bytes; + uint64_t bytes_done = 0; int ret = 0; logout("\n"); - while (ret >= 0 && nb_sectors > 0) { - block_index = sector_num / s->block_sectors; - sector_in_block = sector_num % s->block_sectors; - n_sectors = s->block_sectors - sector_in_block; - if (n_sectors > nb_sectors) { - n_sectors = nb_sectors; - } + qemu_iovec_init(&local_qiov, qiov->niov); + + while (ret >= 0 && bytes > 0) { + block_index = offset / s->block_size; + offset_in_block = offset % s->block_size; + n_bytes = MIN(bytes, s->block_size - offset_in_block); - logout("will read %u sectors starting at sector %" PRIu64 "\n", - n_sectors, sector_num); + logout("will read %u bytes starting at offset %" PRIu64 "\n", + n_bytes, offset); /* prepare next AIO request */ bmap_entry = le32_to_cpu(s->bmap[block_index]); if (!VDI_IS_ALLOCATED(bmap_entry)) { /* Block not allocated, return zeros, 
no need to wait. */ - memset(buf, 0, n_sectors * SECTOR_SIZE); + qemu_iovec_memset(qiov, bytes_done, 0, n_bytes); ret = 0; } else { - uint64_t offset = s->header.offset_data / SECTOR_SIZE + - (uint64_t)bmap_entry * s->block_sectors + - sector_in_block; - ret = bdrv_read(bs->file->bs, offset, buf, n_sectors); + uint64_t data_offset = s->header.offset_data + + (uint64_t)bmap_entry * s->block_size + + offset_in_block; + + qemu_iovec_reset(&local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); + + ret = bdrv_co_preadv(bs->file, data_offset, n_bytes, + &local_qiov, 0); } - logout("%u sectors read\n", n_sectors); + logout("%u bytes read\n", n_bytes); - nb_sectors -= n_sectors; - sector_num += n_sectors; - buf += n_sectors * SECTOR_SIZE; + bytes -= n_bytes; + offset += n_bytes; + bytes_done += n_bytes; } + qemu_iovec_destroy(&local_qiov); + return ret; } -static int vdi_co_write(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors) +static int coroutine_fn +vdi_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVVdiState *s = bs->opaque; + QEMUIOVector local_qiov; uint32_t bmap_entry; uint32_t block_index; - uint32_t sector_in_block; - uint32_t n_sectors; + uint32_t offset_in_block; + uint32_t n_bytes; uint32_t bmap_first = VDI_UNALLOCATED; uint32_t bmap_last = VDI_UNALLOCATED; uint8_t *block = NULL; + uint64_t bytes_done = 0; int ret = 0; logout("\n"); - while (ret >= 0 && nb_sectors > 0) { - block_index = sector_num / s->block_sectors; - sector_in_block = sector_num % s->block_sectors; - n_sectors = s->block_sectors - sector_in_block; - if (n_sectors > nb_sectors) { - n_sectors = nb_sectors; - } + qemu_iovec_init(&local_qiov, qiov->niov); + + while (ret >= 0 && bytes > 0) { + block_index = offset / s->block_size; + offset_in_block = offset % s->block_size; + n_bytes = MIN(bytes, s->block_size - offset_in_block); - logout("will write %u sectors starting at sector %" PRIu64 "\n", - n_sectors, sector_num); + logout("will write %u bytes starting at offset %" PRIu64 "\n", + n_bytes, offset); /* prepare next AIO request */ bmap_entry = le32_to_cpu(s->bmap[block_index]); if (!VDI_IS_ALLOCATED(bmap_entry)) { /* Allocate new block and write to it. */ - uint64_t offset; + uint64_t data_offset; bmap_entry = s->header.blocks_allocated; s->bmap[block_index] = cpu_to_le32(bmap_entry); s->header.blocks_allocated++; - offset = s->header.offset_data / SECTOR_SIZE + - (uint64_t)bmap_entry * s->block_sectors; + data_offset = s->header.offset_data + + (uint64_t)bmap_entry * s->block_size; if (block == NULL) { block = g_malloc(s->block_size); bmap_first = block_index; } bmap_last = block_index; /* Copy data to be written to new block and zero unused parts. 
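The byte-based rewrite below pads a freshly allocated block the same way the sector-based code did: zero the bytes before the written range, copy the payload (via qemu_iovec_to_buf() rather than a memcpy() from a flat buffer), and zero the tail. A standalone model of the padding arithmetic, with a toy block size and memcpy() standing in for the iovec copy:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    enum { BLOCK_SIZE = 16 };                 /* illustrative, not 1 MiB */
    uint8_t block[BLOCK_SIZE];
    const uint8_t payload[3] = { 0xaa, 0xbb, 0xcc };
    uint32_t offset_in_block = 5;
    uint32_t n_bytes = sizeof(payload);

    /* zero the head, copy the guest data, zero the tail */
    memset(block, 0, offset_in_block);
    memcpy(block + offset_in_block, payload, n_bytes);
    memset(block + offset_in_block + n_bytes, 0,
           BLOCK_SIZE - n_bytes - offset_in_block);

    for (int i = 0; i < BLOCK_SIZE; i++) {
        printf("%02x ", block[i]);
    }
    printf("\n");
    return 0;
}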
*/ - memset(block, 0, sector_in_block * SECTOR_SIZE); - memcpy(block + sector_in_block * SECTOR_SIZE, - buf, n_sectors * SECTOR_SIZE); - memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0, - (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE); + memset(block, 0, offset_in_block); + qemu_iovec_to_buf(qiov, bytes_done, block + offset_in_block, + n_bytes); + memset(block + offset_in_block + n_bytes, 0, + s->block_size - n_bytes - offset_in_block); /* Note that this coroutine does not yield anywhere from reading the * bmap entry until here, so in regards to all the coroutines trying @@ -658,12 +670,12 @@ static int vdi_co_write(BlockDriverState *bs, * acquire the lock and thus the padded cluster is written before * the other coroutines can write to the affected area. */ qemu_co_mutex_lock(&s->write_lock); - ret = bdrv_write(bs->file->bs, offset, block, s->block_sectors); + ret = bdrv_pwrite(bs->file, data_offset, block, s->block_size); qemu_co_mutex_unlock(&s->write_lock); } else { - uint64_t offset = s->header.offset_data / SECTOR_SIZE + - (uint64_t)bmap_entry * s->block_sectors + - sector_in_block; + uint64_t data_offset = s->header.offset_data + + (uint64_t)bmap_entry * s->block_size + + offset_in_block; qemu_co_mutex_lock(&s->write_lock); /* This lock is only used to make sure the following write operation * is executed after the write issued by the coroutine allocating @@ -674,16 +686,23 @@ static int vdi_co_write(BlockDriverState *bs, * that that write operation has returned (there may be other writes * in flight, but they do not concern this very operation). */ qemu_co_mutex_unlock(&s->write_lock); - ret = bdrv_write(bs->file->bs, offset, buf, n_sectors); + + qemu_iovec_reset(&local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); + + ret = bdrv_co_pwritev(bs->file, data_offset, n_bytes, + &local_qiov, 0); } - nb_sectors -= n_sectors; - sector_num += n_sectors; - buf += n_sectors * SECTOR_SIZE; + bytes -= n_bytes; + offset += n_bytes; + bytes_done += n_bytes; - logout("%u sectors written\n", n_sectors); + logout("%u bytes written\n", n_bytes); } + qemu_iovec_destroy(&local_qiov); + logout("finished data write\n"); if (ret < 0) { return ret; @@ -694,12 +713,13 @@ static int vdi_co_write(BlockDriverState *bs, VdiHeader *header = (VdiHeader *) block; uint8_t *base; uint64_t offset; + uint32_t n_sectors; logout("now writing modified header\n"); assert(VDI_IS_ALLOCATED(bmap_first)); *header = s->header; vdi_header_to_le(header); - ret = bdrv_write(bs->file->bs, 0, block, 1); + ret = bdrv_write(bs->file, 0, block, 1); g_free(block); block = NULL; @@ -717,7 +737,7 @@ static int vdi_co_write(BlockDriverState *bs, base = ((uint8_t *)&s->bmap[0]) + bmap_first * SECTOR_SIZE; logout("will write %u block map sectors starting from entry %u\n", n_sectors, bmap_first); - ret = bdrv_write(bs->file->bs, offset, base, n_sectors); + ret = bdrv_write(bs->file, offset, base, n_sectors); } return ret; @@ -808,7 +828,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) vdi_header_print(&header); #endif vdi_header_to_le(&header); - ret = blk_pwrite(blk, offset, &header, sizeof(header)); + ret = blk_pwrite(blk, offset, &header, sizeof(header), 0); if (ret < 0) { error_setg(errp, "Error writing header to %s", filename); goto exit; @@ -829,7 +849,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) bmap[i] = VDI_UNALLOCATED; } } - ret = blk_pwrite(blk, offset, bmap, bmap_size); + ret = blk_pwrite(blk, offset, bmap, 
bmap_size, 0); if (ret < 0) { error_setg(errp, "Error writing bmap to %s", filename); goto exit; @@ -903,9 +923,9 @@ static BlockDriver bdrv_vdi = { .bdrv_co_get_block_status = vdi_co_get_block_status, .bdrv_make_empty = vdi_make_empty, - .bdrv_read = vdi_co_read, + .bdrv_co_preadv = vdi_co_preadv, #if defined(CONFIG_VDI_WRITE) - .bdrv_write = vdi_co_write, + .bdrv_co_pwritev = vdi_co_pwritev, #endif .bdrv_get_info = vdi_get_info, diff --git a/block/vhdx-endian.c b/block/vhdx-endian.c index da33cd38e..c306b90d5 100644 --- a/block/vhdx-endian.c +++ b/block/vhdx-endian.c @@ -18,6 +18,7 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "block/block_int.h" +#include "qemu/bswap.h" #include "block/vhdx.h" #include <uuid/uuid.h> diff --git a/block/vhdx-log.c b/block/vhdx-log.c index 7ea7187fc..02eb10431 100644 --- a/block/vhdx-log.c +++ b/block/vhdx-log.c @@ -23,6 +23,7 @@ #include "block/block_int.h" #include "qemu/error-report.h" #include "qemu/module.h" +#include "qemu/bswap.h" #include "block/vhdx.h" @@ -83,7 +84,7 @@ static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log, offset = log->offset + read; - ret = bdrv_pread(bs->file->bs, offset, hdr, sizeof(VHDXLogEntryHeader)); + ret = bdrv_pread(bs->file, offset, hdr, sizeof(VHDXLogEntryHeader)); if (ret < 0) { goto exit; } @@ -143,7 +144,7 @@ static int vhdx_log_read_sectors(BlockDriverState *bs, VHDXLogEntries *log, } offset = log->offset + read; - ret = bdrv_pread(bs->file->bs, offset, buffer, VHDX_LOG_SECTOR_SIZE); + ret = bdrv_pread(bs->file, offset, buffer, VHDX_LOG_SECTOR_SIZE); if (ret < 0) { goto exit; } @@ -193,7 +194,7 @@ static int vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log, /* full */ break; } - ret = bdrv_pwrite(bs->file->bs, offset, buffer_tmp, + ret = bdrv_pwrite(bs->file, offset, buffer_tmp, VHDX_LOG_SECTOR_SIZE); if (ret < 0) { goto exit; @@ -465,7 +466,7 @@ static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc, /* count is only > 1 if we are writing zeroes */ for (i = 0; i < count; i++) { - ret = bdrv_pwrite_sync(bs->file->bs, file_offset, buffer, + ret = bdrv_pwrite_sync(bs->file, file_offset, buffer, VHDX_LOG_SECTOR_SIZE); if (ret < 0) { goto exit; @@ -944,7 +945,7 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, if (i == 0 && leading_length) { /* partial sector at the front of the buffer */ - ret = bdrv_pread(bs->file->bs, file_offset, merged_sector, + ret = bdrv_pread(bs->file, file_offset, merged_sector, VHDX_LOG_SECTOR_SIZE); if (ret < 0) { goto exit; @@ -954,7 +955,7 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, sector_write = merged_sector; } else if (i == sectors - 1 && trailing_length) { /* partial sector at the end of the buffer */ - ret = bdrv_pread(bs->file->bs, + ret = bdrv_pread(bs->file, file_offset, merged_sector + trailing_length, VHDX_LOG_SECTOR_SIZE - trailing_length); diff --git a/block/vhdx.c b/block/vhdx.c index 2b7b33240..75ef2b1c2 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -22,11 +22,11 @@ #include "sysemu/block-backend.h" #include "qemu/module.h" #include "qemu/crc32c.h" +#include "qemu/bswap.h" #include "block/vhdx.h" #include "migration/migration.h" #include <uuid/uuid.h> -#include <glib.h> /* Options for VHDX creation */ @@ -298,9 +298,10 @@ static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename) * and then update the header checksum. 
Header is converted to proper * endianness before being written to the specified file offset */ -static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr, +static int vhdx_write_header(BdrvChild *file, VHDXHeader *hdr, uint64_t offset, bool read) { + BlockDriverState *bs_file = file->bs; uint8_t *buffer = NULL; int ret; VHDXHeader *header_le; @@ -315,7 +316,7 @@ static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr, buffer = qemu_blockalign(bs_file, VHDX_HEADER_SIZE); if (read) { /* if true, we can't assume the extra reserved bytes are 0 */ - ret = bdrv_pread(bs_file, offset, buffer, VHDX_HEADER_SIZE); + ret = bdrv_pread(file, offset, buffer, VHDX_HEADER_SIZE); if (ret < 0) { goto exit; } @@ -329,7 +330,7 @@ static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr, vhdx_header_le_export(hdr, header_le); vhdx_update_checksum(buffer, VHDX_HEADER_SIZE, offsetof(VHDXHeader, checksum)); - ret = bdrv_pwrite_sync(bs_file, offset, header_le, sizeof(VHDXHeader)); + ret = bdrv_pwrite_sync(file, offset, header_le, sizeof(VHDXHeader)); exit: qemu_vfree(buffer); @@ -378,7 +379,7 @@ static int vhdx_update_header(BlockDriverState *bs, BDRVVHDXState *s, inactive_header->log_guid = *log_guid; } - ret = vhdx_write_header(bs->file->bs, inactive_header, header_offset, true); + ret = vhdx_write_header(bs->file, inactive_header, header_offset, true); if (ret < 0) { goto exit; } @@ -430,7 +431,7 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s, /* We have to read the whole VHDX_HEADER_SIZE instead of * sizeof(VHDXHeader), because the checksum is over the whole * region */ - ret = bdrv_pread(bs->file->bs, VHDX_HEADER1_OFFSET, buffer, + ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE); if (ret < 0) { goto fail; @@ -447,7 +448,7 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s, } } - ret = bdrv_pread(bs->file->bs, VHDX_HEADER2_OFFSET, buffer, + ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE); if (ret < 0) { goto fail; @@ -521,7 +522,7 @@ static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s) * whole block */ buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE); - ret = bdrv_pread(bs->file->bs, VHDX_REGION_TABLE_OFFSET, buffer, + ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer, VHDX_HEADER_BLOCK_SIZE); if (ret < 0) { goto fail; @@ -634,7 +635,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE); - ret = bdrv_pread(bs->file->bs, s->metadata_rt.file_offset, buffer, + ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer, VHDX_METADATA_TABLE_MAX_SIZE); if (ret < 0) { goto exit; @@ -737,7 +738,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) goto exit; } - ret = bdrv_pread(bs->file->bs, + ret = bdrv_pread(bs->file, s->metadata_entries.file_parameters_entry.offset + s->metadata_rt.file_offset, &s->params, @@ -772,7 +773,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) /* determine virtual disk size, logical sector size, * and phys sector size */ - ret = bdrv_pread(bs->file->bs, + ret = bdrv_pread(bs->file, s->metadata_entries.virtual_disk_size_entry.offset + s->metadata_rt.file_offset, &s->virtual_disk_size, @@ -780,7 +781,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) if (ret < 0) { goto exit; } - ret = bdrv_pread(bs->file->bs, + ret = bdrv_pread(bs->file, 
s->metadata_entries.logical_sector_size_entry.offset + s->metadata_rt.file_offset, &s->logical_sector_size, @@ -788,7 +789,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) if (ret < 0) { goto exit; } - ret = bdrv_pread(bs->file->bs, + ret = bdrv_pread(bs->file, s->metadata_entries.phys_sector_size_entry.offset + s->metadata_rt.file_offset, &s->physical_sector_size, @@ -905,7 +906,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, QLIST_INIT(&s->regions); /* validate the file signature */ - ret = bdrv_pread(bs->file->bs, 0, &signature, sizeof(uint64_t)); + ret = bdrv_pread(bs->file, 0, &signature, sizeof(uint64_t)); if (ret < 0) { goto fail; } @@ -964,7 +965,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - ret = bdrv_pread(bs->file->bs, s->bat_offset, s->bat, s->bat_rt.length); + ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length); if (ret < 0) { goto fail; } @@ -1117,7 +1118,7 @@ static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num, break; case PAYLOAD_BLOCK_FULLY_PRESENT: qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->file->bs, + ret = bdrv_co_readv(bs->file, sinfo.file_offset >> BDRV_SECTOR_BITS, sinfo.sectors_avail, &hd_qiov); qemu_co_mutex_lock(&s->lock); @@ -1326,7 +1327,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, } /* block exists, so we can just overwrite it */ qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_writev(bs->file->bs, + ret = bdrv_co_writev(bs->file, sinfo.file_offset >> BDRV_SECTOR_BITS, sectors_to_write, &hd_qiov); qemu_co_mutex_lock(&s->lock); @@ -1387,9 +1388,11 @@ exit: * There are 2 headers, and the highest sequence number will represent * the active header */ -static int vhdx_create_new_headers(BlockDriverState *bs, uint64_t image_size, +static int vhdx_create_new_headers(BlockBackend *blk, uint64_t image_size, uint32_t log_size) { + BlockDriverState *bs = blk_bs(blk); + BdrvChild *child; int ret = 0; VHDXHeader *hdr = NULL; @@ -1404,12 +1407,18 @@ static int vhdx_create_new_headers(BlockDriverState *bs, uint64_t image_size, vhdx_guid_generate(&hdr->file_write_guid); vhdx_guid_generate(&hdr->data_write_guid); - ret = vhdx_write_header(bs, hdr, VHDX_HEADER1_OFFSET, false); + /* XXX Ugly way to get blk->root, but that's a feature, not a bug. This + * hack makes it obvious that vhdx_write_header() bypasses the BlockBackend + * here, which it really shouldn't be doing. */ + child = QLIST_FIRST(&bs->parents); + assert(!QLIST_NEXT(child, next_parent)); + + ret = vhdx_write_header(child, hdr, VHDX_HEADER1_OFFSET, false); if (ret < 0) { goto exit; } hdr->sequence_number++; - ret = vhdx_write_header(bs, hdr, VHDX_HEADER2_OFFSET, false); + ret = vhdx_write_header(child, hdr, VHDX_HEADER2_OFFSET, false); if (ret < 0) { goto exit; } @@ -1442,7 +1451,7 @@ exit: * The first 64KB of the Metadata section is reserved for the metadata * header and entries; beyond that, the metadata items themselves reside. 
*/ -static int vhdx_create_new_metadata(BlockDriverState *bs, +static int vhdx_create_new_metadata(BlockBackend *blk, uint64_t image_size, uint32_t block_size, uint32_t sector_size, @@ -1538,13 +1547,13 @@ static int vhdx_create_new_metadata(BlockDriverState *bs, VHDX_META_FLAGS_IS_VIRTUAL_DISK; vhdx_metadata_entry_le_export(&md_table_entry[4]); - ret = bdrv_pwrite(bs, metadata_offset, buffer, VHDX_HEADER_BLOCK_SIZE); + ret = blk_pwrite(blk, metadata_offset, buffer, VHDX_HEADER_BLOCK_SIZE, 0); if (ret < 0) { goto exit; } - ret = bdrv_pwrite(bs, metadata_offset + (64 * KiB), entry_buffer, - VHDX_METADATA_ENTRY_BUFFER_SIZE); + ret = blk_pwrite(blk, metadata_offset + (64 * KiB), entry_buffer, + VHDX_METADATA_ENTRY_BUFFER_SIZE, 0); if (ret < 0) { goto exit; } @@ -1564,7 +1573,7 @@ exit: * Fixed images: default state of the BAT is fully populated, with * file offsets and state PAYLOAD_BLOCK_FULLY_PRESENT. */ -static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, +static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s, uint64_t image_size, VHDXImageType type, bool use_zero_blocks, uint64_t file_offset, uint32_t length) @@ -1588,12 +1597,12 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, if (type == VHDX_TYPE_DYNAMIC) { /* All zeroes, so we can just extend the file - the end of the BAT * is the furthest thing we have written yet */ - ret = bdrv_truncate(bs, data_file_offset); + ret = blk_truncate(blk, data_file_offset); if (ret < 0) { goto exit; } } else if (type == VHDX_TYPE_FIXED) { - ret = bdrv_truncate(bs, data_file_offset + image_size); + ret = blk_truncate(blk, data_file_offset + image_size); if (ret < 0) { goto exit; } @@ -1604,7 +1613,7 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, if (type == VHDX_TYPE_FIXED || use_zero_blocks || - bdrv_has_zero_init(bs) == 0) { + bdrv_has_zero_init(blk_bs(blk)) == 0) { /* for a fixed file, the default BAT entry is not zero */ s->bat = g_try_malloc0(length); if (length && s->bat == NULL) { @@ -1620,12 +1629,12 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, sinfo.file_offset = data_file_offset + (sector_num << s->logical_sector_size_bits); sinfo.file_offset = ROUND_UP(sinfo.file_offset, MiB); - vhdx_update_bat_table_entry(bs, s, &sinfo, &unused, &unused, + vhdx_update_bat_table_entry(blk_bs(blk), s, &sinfo, &unused, &unused, block_state); cpu_to_le64s(&s->bat[sinfo.bat_idx]); sector_num += s->sectors_per_block; } - ret = bdrv_pwrite(bs, file_offset, s->bat, length); + ret = blk_pwrite(blk, file_offset, s->bat, length, 0); if (ret < 0) { goto exit; } @@ -1645,7 +1654,7 @@ exit: * to create the BAT itself, we will also cause the BAT to be * created. 
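vhdx_create_bat() below now truncates through the BlockBackend (blk_truncate()) rather than the BDS, but the sizing rule is unchanged. Assuming the meaning of data_file_offset from the hunk, a sketch of how far the image file must be extended for each image type:

#include <stdint.h>
#include <stdio.h>

typedef enum { VHDX_TYPE_DYNAMIC, VHDX_TYPE_FIXED } VHDXImageType;

/* A dynamic image only needs the metadata area up to the start of data;
 * a fixed image is preallocated to hold every payload block. */
static uint64_t required_file_size(VHDXImageType type,
                                   uint64_t data_file_offset,
                                   uint64_t image_size)
{
    if (type == VHDX_TYPE_FIXED) {
        return data_file_offset + image_size;
    }
    return data_file_offset;
}

int main(void)
{
    uint64_t data_off = 4ULL << 20, image = 1ULL << 30;

    printf("dynamic: %llu bytes\n",
           (unsigned long long)required_file_size(VHDX_TYPE_DYNAMIC,
                                                  data_off, image));
    printf("fixed:   %llu bytes\n",
           (unsigned long long)required_file_size(VHDX_TYPE_FIXED,
                                                  data_off, image));
    return 0;
}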
*/ -static int vhdx_create_new_region_table(BlockDriverState *bs, +static int vhdx_create_new_region_table(BlockBackend *blk, uint64_t image_size, uint32_t block_size, uint32_t sector_size, @@ -1720,21 +1729,21 @@ static int vhdx_create_new_region_table(BlockDriverState *bs, /* The region table gives us the data we need to create the BAT, * so do that now */ - ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks, + ret = vhdx_create_bat(blk, s, image_size, type, use_zero_blocks, bat_file_offset, bat_length); if (ret < 0) { goto exit; } /* Now write out the region headers to disk */ - ret = bdrv_pwrite(bs, VHDX_REGION_TABLE_OFFSET, buffer, - VHDX_HEADER_BLOCK_SIZE); + ret = blk_pwrite(blk, VHDX_REGION_TABLE_OFFSET, buffer, + VHDX_HEADER_BLOCK_SIZE, 0); if (ret < 0) { goto exit; } - ret = bdrv_pwrite(bs, VHDX_REGION_TABLE2_OFFSET, buffer, - VHDX_HEADER_BLOCK_SIZE); + ret = blk_pwrite(blk, VHDX_REGION_TABLE2_OFFSET, buffer, + VHDX_HEADER_BLOCK_SIZE, 0); if (ret < 0) { goto exit; } @@ -1856,13 +1865,14 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL, &creator_items, NULL); signature = cpu_to_le64(VHDX_FILE_SIGNATURE); - ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature)); + ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature), + 0); if (ret < 0) { goto delete_and_exit; } if (creator) { ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET + sizeof(signature), - creator, creator_items * sizeof(gunichar2)); + creator, creator_items * sizeof(gunichar2), 0); if (ret < 0) { goto delete_and_exit; } @@ -1870,13 +1880,13 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) /* Creates (B),(C) */ - ret = vhdx_create_new_headers(blk_bs(blk), image_size, log_size); + ret = vhdx_create_new_headers(blk, image_size, log_size); if (ret < 0) { goto delete_and_exit; } /* Creates (D),(E),(G) explicitly. 
(F) created as by-product */ - ret = vhdx_create_new_region_table(blk_bs(blk), image_size, block_size, 512, + ret = vhdx_create_new_region_table(blk, image_size, block_size, 512, log_size, use_zero_blocks, image_type, &metadata_offset); if (ret < 0) { @@ -1884,7 +1894,7 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) } /* Creates (H) */ - ret = vhdx_create_new_metadata(blk_bs(blk), image_size, block_size, 512, + ret = vhdx_create_new_metadata(blk, image_size, block_size, 512, metadata_offset, image_type); if (ret < 0) { goto delete_and_exit; diff --git a/block/vmdk.c b/block/vmdk.c index 45f9d3c5b..46d474e44 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -30,10 +30,10 @@ #include "qapi/qmp/qerror.h" #include "qemu/error-report.h" #include "qemu/module.h" +#include "qemu/bswap.h" #include "migration/migration.h" #include "qemu/cutils.h" #include <zlib.h> -#include <glib.h> #define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D') #define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V') @@ -252,7 +252,7 @@ static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) int ret; desc = g_malloc0(DESC_SIZE); - ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); + ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE); if (ret < 0) { g_free(desc); return 0; @@ -286,7 +286,7 @@ static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) desc = g_malloc0(DESC_SIZE); tmp_desc = g_malloc0(DESC_SIZE); - ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); + ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE); if (ret < 0) { goto out; } @@ -306,7 +306,7 @@ static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) pstrcat(desc, DESC_SIZE, tmp_desc); } - ret = bdrv_pwrite_sync(bs->file->bs, s->desc_offset, desc, DESC_SIZE); + ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE); out: g_free(desc); @@ -350,7 +350,7 @@ static int vmdk_parent_open(BlockDriverState *bs) int ret; desc = g_malloc0(DESC_SIZE + 1); - ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); + ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE); if (ret < 0) { goto out; } @@ -454,7 +454,7 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, return -ENOMEM; } - ret = bdrv_pread(extent->file->bs, + ret = bdrv_pread(extent->file, extent->l1_table_offset, extent->l1_table, l1_size); @@ -474,7 +474,7 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, ret = -ENOMEM; goto fail_l1; } - ret = bdrv_pread(extent->file->bs, + ret = bdrv_pread(extent->file, extent->l1_backup_table_offset, extent->l1_backup_table, l1_size); @@ -508,7 +508,7 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs, VMDK3Header header; VmdkExtent *extent; - ret = bdrv_pread(file->bs, sizeof(magic), &header, sizeof(header)); + ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header)); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read header from file '%s'", @@ -538,14 +538,13 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs, static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf, QDict *options, Error **errp); -static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset, - Error **errp) +static char *vmdk_read_desc(BdrvChild *file, uint64_t desc_offset, Error **errp) { int64_t size; char *buf; int ret; - size = bdrv_getlength(file); + size = bdrv_getlength(file->bs); if (size < 0) { error_setg_errno(errp, -size, "Could not access file"); return 
NULL; @@ -586,7 +585,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, int64_t l1_backup_offset = 0; bool compressed; - ret = bdrv_pread(file->bs, sizeof(magic), &header, sizeof(header)); + ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header)); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read header from file '%s'", @@ -596,7 +595,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, if (header.capacity == 0) { uint64_t desc_offset = le64_to_cpu(header.desc_offset); if (desc_offset) { - char *buf = vmdk_read_desc(file->bs, desc_offset << 9, errp); + char *buf = vmdk_read_desc(file, desc_offset << 9, errp); if (!buf) { return -EINVAL; } @@ -636,7 +635,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, } QEMU_PACKED eos_marker; } QEMU_PACKED footer; - ret = bdrv_pread(file->bs, + ret = bdrv_pread(file, bs->file->bs->total_sectors * 512 - 1536, &footer, sizeof(footer)); if (ret < 0) { @@ -874,7 +873,7 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, extent->flat_start_offset = flat_offset << 9; } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) { /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/ - char *buf = vmdk_read_desc(extent_file->bs, 0, errp); + char *buf = vmdk_read_desc(extent_file, 0, errp); if (!buf) { ret = -EINVAL; } else { @@ -943,7 +942,7 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags, BDRVVmdkState *s = bs->opaque; uint32_t magic; - buf = vmdk_read_desc(bs->file->bs, 0, errp); + buf = vmdk_read_desc(bs->file, 0, errp); if (!buf) { return -EINVAL; } @@ -997,9 +996,9 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) for (i = 0; i < s->num_extents; i++) { if (!s->extents[i].flat) { - bs->bl.write_zeroes_alignment = - MAX(bs->bl.write_zeroes_alignment, - s->extents[i].cluster_sectors); + bs->bl.pwrite_zeroes_alignment = + MAX(bs->bl.pwrite_zeroes_alignment, + s->extents[i].cluster_sectors << BDRV_SECTOR_BITS); } } } @@ -1016,27 +1015,26 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) */ static int get_whole_cluster(BlockDriverState *bs, VmdkExtent *extent, - uint64_t cluster_sector_num, - uint64_t sector_num, - uint64_t skip_start_sector, - uint64_t skip_end_sector) + uint64_t cluster_offset, + uint64_t offset, + uint64_t skip_start_bytes, + uint64_t skip_end_bytes) { int ret = VMDK_OK; int64_t cluster_bytes; uint8_t *whole_grain; /* For COW, align request sector_num to cluster start */ - sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors); cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS; + offset = QEMU_ALIGN_DOWN(offset, cluster_bytes); whole_grain = qemu_blockalign(bs, cluster_bytes); if (!bs->backing) { - memset(whole_grain, 0, skip_start_sector << BDRV_SECTOR_BITS); - memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0, - cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS)); + memset(whole_grain, 0, skip_start_bytes); + memset(whole_grain + skip_end_bytes, 0, cluster_bytes - skip_end_bytes); } - assert(skip_end_sector <= extent->cluster_sectors); + assert(skip_end_bytes <= cluster_bytes); /* we will be here if it's first write on non-exist grain(cluster). 
* try to read from parent image, if exist */ if (bs->backing && !vmdk_is_cid_valid(bs)) { @@ -1045,42 +1043,43 @@ static int get_whole_cluster(BlockDriverState *bs, } /* Read backing data before skip range */ - if (skip_start_sector > 0) { + if (skip_start_bytes > 0) { if (bs->backing) { - ret = bdrv_read(bs->backing->bs, sector_num, - whole_grain, skip_start_sector); + ret = bdrv_pread(bs->backing, offset, whole_grain, + skip_start_bytes); if (ret < 0) { ret = VMDK_ERROR; goto exit; } } - ret = bdrv_write(extent->file->bs, cluster_sector_num, whole_grain, - skip_start_sector); + ret = bdrv_pwrite(extent->file, cluster_offset, whole_grain, + skip_start_bytes); if (ret < 0) { ret = VMDK_ERROR; goto exit; } } /* Read backing data after skip range */ - if (skip_end_sector < extent->cluster_sectors) { + if (skip_end_bytes < cluster_bytes) { if (bs->backing) { - ret = bdrv_read(bs->backing->bs, sector_num + skip_end_sector, - whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), - extent->cluster_sectors - skip_end_sector); + ret = bdrv_pread(bs->backing, offset + skip_end_bytes, + whole_grain + skip_end_bytes, + cluster_bytes - skip_end_bytes); if (ret < 0) { ret = VMDK_ERROR; goto exit; } } - ret = bdrv_write(extent->file->bs, cluster_sector_num + skip_end_sector, - whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), - extent->cluster_sectors - skip_end_sector); + ret = bdrv_pwrite(extent->file, cluster_offset + skip_end_bytes, + whole_grain + skip_end_bytes, + cluster_bytes - skip_end_bytes); if (ret < 0) { ret = VMDK_ERROR; goto exit; } } + ret = VMDK_OK; exit: qemu_vfree(whole_grain); return ret; @@ -1091,8 +1090,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, { offset = cpu_to_le32(offset); /* update L2 table */ - if (bdrv_pwrite_sync( - extent->file->bs, + if (bdrv_pwrite_sync(extent->file, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(offset)), &offset, sizeof(offset)) < 0) { @@ -1101,8 +1099,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, /* update backup L2 table */ if (extent->l1_backup_table_offset != 0) { m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; - if (bdrv_pwrite_sync( - extent->file->bs, + if (bdrv_pwrite_sync(extent->file, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(offset)), &offset, sizeof(offset)) < 0) { @@ -1142,8 +1139,8 @@ static int get_cluster_offset(BlockDriverState *bs, uint64_t offset, bool allocate, uint64_t *cluster_offset, - uint64_t skip_start_sector, - uint64_t skip_end_sector) + uint64_t skip_start_bytes, + uint64_t skip_end_bytes) { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; @@ -1191,8 +1188,7 @@ static int get_cluster_offset(BlockDriverState *bs, } } l2_table = extent->l2_cache + (min_index * extent->l2_size); - if (bdrv_pread( - extent->file->bs, + if (bdrv_pread(extent->file, (int64_t)l2_offset * 512, l2_table, extent->l2_size * sizeof(uint32_t) @@ -1206,13 +1202,6 @@ static int get_cluster_offset(BlockDriverState *bs, l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; cluster_sector = le32_to_cpu(l2_table[l2_index]); - if (m_data) { - m_data->valid = 1; - m_data->l1_index = l1_index; - m_data->l2_index = l2_index; - m_data->l2_offset = l2_offset; - m_data->l2_cache_entry = &l2_table[l2_index]; - } if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { zeroed = true; } @@ -1230,13 +1219,18 @@ static int get_cluster_offset(BlockDriverState *bs, * This problem may occur because of insufficient space 
on host disk * or inappropriate VM shutdown. */ - ret = get_whole_cluster(bs, extent, - cluster_sector, - offset >> BDRV_SECTOR_BITS, - skip_start_sector, skip_end_sector); + ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, + offset, skip_start_bytes, skip_end_bytes); if (ret) { return ret; } + if (m_data) { + m_data->valid = 1; + m_data->l1_index = l1_index; + m_data->l2_index = l2_index; + m_data->l2_offset = l2_offset; + m_data->l2_cache_entry = &l2_table[l2_index]; + } } *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; return VMDK_OK; @@ -1259,15 +1253,24 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } +static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, + int64_t offset) +{ + uint64_t extent_begin_offset, extent_relative_offset; + uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; + + extent_begin_offset = + (extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; + extent_relative_offset = offset - extent_begin_offset; + return extent_relative_offset % cluster_size; +} + static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, int64_t sector_num) { - uint64_t index_in_cluster, extent_begin_sector, extent_relative_sector_num; - - extent_begin_sector = extent->end_sector - extent->sectors; - extent_relative_sector_num = sector_num - extent_begin_sector; - index_in_cluster = extent_relative_sector_num % extent->cluster_sectors; - return index_in_cluster; + uint64_t offset; + offset = vmdk_find_offset_in_cluster(extent, sector_num * BDRV_SECTOR_SIZE); + return offset / BDRV_SECTOR_SIZE; } static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, @@ -1319,38 +1322,57 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, } static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset, - int64_t offset_in_cluster, const uint8_t *buf, - int nb_sectors, int64_t sector_num) + int64_t offset_in_cluster, QEMUIOVector *qiov, + uint64_t qiov_offset, uint64_t n_bytes, + uint64_t offset) { int ret; VmdkGrainMarker *data = NULL; uLongf buf_len; - const uint8_t *write_buf = buf; - int write_len = nb_sectors * 512; + QEMUIOVector local_qiov; + struct iovec iov; int64_t write_offset; int64_t write_end_sector; if (extent->compressed) { + void *compressed_data; + if (!extent->has_marker) { ret = -EINVAL; goto out; } buf_len = (extent->cluster_sectors << 9) * 2; data = g_malloc(buf_len + sizeof(VmdkGrainMarker)); - if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK || - buf_len == 0) { + + compressed_data = g_malloc(n_bytes); + qemu_iovec_to_buf(qiov, qiov_offset, compressed_data, n_bytes); + ret = compress(data->data, &buf_len, compressed_data, n_bytes); + g_free(compressed_data); + + if (ret != Z_OK || buf_len == 0) { ret = -EINVAL; goto out; } - data->lba = sector_num; + + data->lba = offset >> BDRV_SECTOR_BITS; data->size = buf_len; - write_buf = (uint8_t *)data; - write_len = buf_len + sizeof(VmdkGrainMarker); + + n_bytes = buf_len + sizeof(VmdkGrainMarker); + iov = (struct iovec) { + .iov_base = data, + .iov_len = n_bytes, + }; + qemu_iovec_init_external(&local_qiov, &iov, 1); + } else { + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_iovec_concat(&local_qiov, qiov, qiov_offset, n_bytes); } + write_offset = cluster_offset + offset_in_cluster, - ret = bdrv_pwrite(extent->file->bs, write_offset, write_buf, write_len); + ret = bdrv_co_pwritev(extent->file, write_offset, n_bytes, + &local_qiov, 0); - write_end_sector = 
DIV_ROUND_UP(write_offset + write_len, BDRV_SECTOR_SIZE); + write_end_sector = DIV_ROUND_UP(write_offset + n_bytes, BDRV_SECTOR_SIZE); if (extent->compressed) { extent->next_cluster_sector = write_end_sector; @@ -1359,19 +1381,21 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset, write_end_sector); } - if (ret != write_len) { - ret = ret < 0 ? ret : -EIO; + if (ret < 0) { goto out; } ret = 0; out: g_free(data); + if (!extent->compressed) { + qemu_iovec_destroy(&local_qiov); + } return ret; } static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, - int64_t offset_in_cluster, uint8_t *buf, - int nb_sectors) + int64_t offset_in_cluster, QEMUIOVector *qiov, + int bytes) { int ret; int cluster_bytes, buf_bytes; @@ -1383,21 +1407,20 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, if (!extent->compressed) { - ret = bdrv_pread(extent->file->bs, - cluster_offset + offset_in_cluster, - buf, nb_sectors * 512); - if (ret == nb_sectors * 512) { - return 0; - } else { - return -EIO; + ret = bdrv_co_preadv(extent->file, + cluster_offset + offset_in_cluster, bytes, + qiov, 0); + if (ret < 0) { + return ret; } + return 0; } cluster_bytes = extent->cluster_sectors * 512; /* Read two clusters in case GrainMarker + compressed data > one cluster */ buf_bytes = cluster_bytes * 2; cluster_buf = g_malloc(buf_bytes); uncomp_buf = g_malloc(cluster_bytes); - ret = bdrv_pread(extent->file->bs, + ret = bdrv_pread(extent->file, cluster_offset, cluster_buf, buf_bytes); if (ret < 0) { @@ -1422,11 +1445,11 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, } if (offset_in_cluster < 0 || - offset_in_cluster + nb_sectors * 512 > buf_len) { + offset_in_cluster + bytes > buf_len) { ret = -EINVAL; goto out; } - memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512); + qemu_iovec_from_buf(qiov, 0, uncomp_buf + offset_in_cluster, bytes); ret = 0; out: @@ -1435,64 +1458,73 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, return ret; } -static int vmdk_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int coroutine_fn +vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVVmdkState *s = bs->opaque; int ret; - uint64_t n, index_in_cluster; + uint64_t n_bytes, offset_in_cluster; VmdkExtent *extent = NULL; + QEMUIOVector local_qiov; uint64_t cluster_offset; + uint64_t bytes_done = 0; - while (nb_sectors > 0) { - extent = find_extent(s, sector_num, extent); + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_co_mutex_lock(&s->lock); + + while (bytes > 0) { + extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent); if (!extent) { - return -EIO; + ret = -EIO; + goto fail; } ret = get_cluster_offset(bs, extent, NULL, - sector_num << 9, false, &cluster_offset, - 0, 0); - index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); - n = extent->cluster_sectors - index_in_cluster; - if (n > nb_sectors) { - n = nb_sectors; - } + offset, false, &cluster_offset, 0, 0); + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); + + n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); + if (ret != VMDK_OK) { /* if not allocated, try to read from parent image, if exist */ if (bs->backing && ret != VMDK_ZEROED) { if (!vmdk_is_cid_valid(bs)) { - return -EINVAL; + ret = -EINVAL; + goto fail; } - ret = bdrv_read(bs->backing->bs, sector_num, buf, n); + + qemu_iovec_reset(&local_qiov); 
+ qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); + + ret = bdrv_co_preadv(bs->backing, offset, n_bytes, + &local_qiov, 0); if (ret < 0) { - return ret; + goto fail; } } else { - memset(buf, 0, 512 * n); + qemu_iovec_memset(qiov, bytes_done, 0, n_bytes); } } else { - ret = vmdk_read_extent(extent, - cluster_offset, index_in_cluster * 512, - buf, n); + qemu_iovec_reset(&local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); + + ret = vmdk_read_extent(extent, cluster_offset, offset_in_cluster, + &local_qiov, n_bytes); if (ret) { - return ret; + goto fail; } } - nb_sectors -= n; - sector_num += n; - buf += n * 512; + bytes -= n_bytes; + offset += n_bytes; + bytes_done += n_bytes; } - return 0; -} -static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVVmdkState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = vmdk_read(bs, sector_num, buf, nb_sectors); + ret = 0; +fail: qemu_co_mutex_unlock(&s->lock); + qemu_iovec_destroy(&local_qiov); + return ret; } @@ -1506,38 +1538,38 @@ static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num, * * Returns: error code with 0 for success. */ -static int vmdk_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors, - bool zeroed, bool zero_dry_run) +static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + bool zeroed, bool zero_dry_run) { BDRVVmdkState *s = bs->opaque; VmdkExtent *extent = NULL; int ret; - int64_t index_in_cluster, n; + int64_t offset_in_cluster, n_bytes; uint64_t cluster_offset; + uint64_t bytes_done = 0; VmdkMetaData m_data; - if (sector_num > bs->total_sectors) { - error_report("Wrong offset: sector_num=0x%" PRIx64 + if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) { + error_report("Wrong offset: offset=0x%" PRIx64 " total_sectors=0x%" PRIx64, - sector_num, bs->total_sectors); + offset, bs->total_sectors); return -EIO; } - while (nb_sectors > 0) { - extent = find_extent(s, sector_num, extent); + while (bytes > 0) { + extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent); if (!extent) { return -EIO; } - index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); - n = extent->cluster_sectors - index_in_cluster; - if (n > nb_sectors) { - n = nb_sectors; - } - ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9, + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); + n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); + + ret = get_cluster_offset(bs, extent, &m_data, offset, !(extent->compressed || zeroed), - &cluster_offset, - index_in_cluster, index_in_cluster + n); + &cluster_offset, offset_in_cluster, + offset_in_cluster + n_bytes); if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ @@ -1546,7 +1578,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, return -EIO; } else { /* allocate */ - ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9, + ret = get_cluster_offset(bs, extent, &m_data, offset, true, &cluster_offset, 0, 0); } } @@ -1556,9 +1588,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, if (zeroed) { /* Do zeroed write, buf is ignored */ if (extent->has_zero_grain && - index_in_cluster == 0 && - n >= extent->cluster_sectors) { - n = extent->cluster_sectors; + offset_in_cluster == 0 && + n_bytes >= extent->cluster_sectors * 
BDRV_SECTOR_SIZE) { + n_bytes = extent->cluster_sectors * BDRV_SECTOR_SIZE; if (!zero_dry_run) { /* update L2 tables */ if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED) @@ -1570,9 +1602,8 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, return -ENOTSUP; } } else { - ret = vmdk_write_extent(extent, - cluster_offset, index_in_cluster * 512, - buf, n, sector_num); + ret = vmdk_write_extent(extent, cluster_offset, offset_in_cluster, + qiov, bytes_done, n_bytes, offset); if (ret) { return ret; } @@ -1585,9 +1616,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, } } } - nb_sectors -= n; - sector_num += n; - buf += n * 512; + bytes -= n_bytes; + offset += n_bytes; + bytes_done += n_bytes; /* update CID on the first write every time the virtual disk is * opened */ @@ -1602,43 +1633,84 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, return 0; } -static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) +static int coroutine_fn +vmdk_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { int ret; BDRVVmdkState *s = bs->opaque; qemu_co_mutex_lock(&s->lock); - ret = vmdk_write(bs, sector_num, buf, nb_sectors, false, false); + ret = vmdk_pwritev(bs, offset, bytes, qiov, false, false); qemu_co_mutex_unlock(&s->lock); return ret; } +typedef struct VmdkWriteCompressedCo { + BlockDriverState *bs; + int64_t sector_num; + const uint8_t *buf; + int nb_sectors; + int ret; +} VmdkWriteCompressedCo; + +static void vmdk_co_write_compressed(void *opaque) +{ + VmdkWriteCompressedCo *co = opaque; + QEMUIOVector local_qiov; + uint64_t offset = co->sector_num * BDRV_SECTOR_SIZE; + uint64_t bytes = co->nb_sectors * BDRV_SECTOR_SIZE; + + struct iovec iov = (struct iovec) { + .iov_base = (uint8_t*) co->buf, + .iov_len = bytes, + }; + qemu_iovec_init_external(&local_qiov, &iov, 1); + + co->ret = vmdk_pwritev(co->bs, offset, bytes, &local_qiov, false, false); +} + static int vmdk_write_compressed(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors) { BDRVVmdkState *s = bs->opaque; + if (s->num_extents == 1 && s->extents[0].compressed) { - return vmdk_write(bs, sector_num, buf, nb_sectors, false, false); + Coroutine *co; + AioContext *aio_context = bdrv_get_aio_context(bs); + VmdkWriteCompressedCo data = { + .bs = bs, + .sector_num = sector_num, + .buf = buf, + .nb_sectors = nb_sectors, + .ret = -EINPROGRESS, + }; + co = qemu_coroutine_create(vmdk_co_write_compressed, &data); + qemu_coroutine_enter(co); + while (data.ret == -EINPROGRESS) { + aio_poll(aio_context, true); + } + return data.ret; } else { return -ENOTSUP; } } -static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - BdrvRequestFlags flags) +static int coroutine_fn vmdk_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, + int bytes, + BdrvRequestFlags flags) { int ret; BDRVVmdkState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); /* write zeroes could fail if sectors not aligned to cluster, test it with * dry_run == true before really updating image */ - ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true); + ret = vmdk_pwritev(bs, offset, bytes, NULL, true, true); if (!ret) { - ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false); + ret = vmdk_pwritev(bs, offset, bytes, NULL, true, false); } qemu_co_mutex_unlock(&s->lock); return ret; @@ -1728,12 +1800,12 @@ static int vmdk_create_extent(const 
char *filename, int64_t filesize, header.check_bytes[3] = 0xa; /* write all the data */ - ret = blk_pwrite(blk, 0, &magic, sizeof(magic)); + ret = blk_pwrite(blk, 0, &magic, sizeof(magic), 0); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; } - ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header)); + ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header), 0); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; @@ -1753,7 +1825,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, gd_buf[i] = cpu_to_le32(tmp); } ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, - gd_buf, gd_buf_size); + gd_buf, gd_buf_size, 0); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; @@ -1765,7 +1837,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, gd_buf[i] = cpu_to_le32(tmp); } ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, - gd_buf, gd_buf_size); + gd_buf, gd_buf_size, 0); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; @@ -1829,8 +1901,8 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) int64_t total_size = 0, filesize; char *adapter_type = NULL; char *backing_file = NULL; + char *hw_version = NULL; char *fmt = NULL; - int flags = 0; int ret = 0; bool flat, split, compress; GString *ext_desc_lines; @@ -1861,7 +1933,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) "# The Disk Data Base\n" "#DDB\n" "\n" - "ddb.virtualHWVersion = \"%d\"\n" + "ddb.virtualHWVersion = \"%s\"\n" "ddb.geometry.cylinders = \"%" PRId64 "\"\n" "ddb.geometry.heads = \"%" PRIu32 "\"\n" "ddb.geometry.sectors = \"63\"\n" @@ -1878,8 +1950,20 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) BDRV_SECTOR_SIZE); adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE); backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); + hw_version = qemu_opt_get_del(opts, BLOCK_OPT_HWVERSION); if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) { - flags |= BLOCK_FLAG_COMPAT6; + if (strcmp(hw_version, "undefined")) { + error_setg(errp, + "compat6 cannot be enabled with hwversion set"); + ret = -EINVAL; + goto exit; + } + g_free(hw_version); + hw_version = g_strdup("6"); + } + if (strcmp(hw_version, "undefined") == 0) { + g_free(hw_version); + hw_version = g_strdup("4"); } fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT); if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false)) { @@ -2001,7 +2085,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) fmt, parent_desc_line, ext_desc_lines->str, - (flags & BLOCK_FLAG_COMPAT6 ? 
6 : 4), + hw_version, total_size / (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE), number_heads, @@ -2028,7 +2112,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) blk_set_allow_write_beyond_eof(new_blk, true); - ret = blk_pwrite(new_blk, desc_offset, desc, desc_len); + ret = blk_pwrite(new_blk, desc_offset, desc, desc_len, 0); if (ret < 0) { error_setg_errno(errp, -ret, "Could not write description"); goto exit; @@ -2047,6 +2131,7 @@ exit: } g_free(adapter_type); g_free(backing_file); + g_free(hw_version); g_free(fmt); g_free(desc); g_free(path); @@ -2250,27 +2335,6 @@ static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) return 0; } -static void vmdk_detach_aio_context(BlockDriverState *bs) -{ - BDRVVmdkState *s = bs->opaque; - int i; - - for (i = 0; i < s->num_extents; i++) { - bdrv_detach_aio_context(s->extents[i].file->bs); - } -} - -static void vmdk_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ - BDRVVmdkState *s = bs->opaque; - int i; - - for (i = 0; i < s->num_extents; i++) { - bdrv_attach_aio_context(s->extents[i].file->bs, new_context); - } -} - static QemuOptsList vmdk_create_opts = { .name = "vmdk-create-opts", .head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head), @@ -2298,6 +2362,12 @@ static QemuOptsList vmdk_create_opts = { .def_value_str = "off" }, { + .name = BLOCK_OPT_HWVERSION, + .type = QEMU_OPT_STRING, + .help = "VMDK hardware version", + .def_value_str = "undefined" + }, + { .name = BLOCK_OPT_SUBFMT, .type = QEMU_OPT_STRING, .help = @@ -2321,10 +2391,10 @@ static BlockDriver bdrv_vmdk = { .bdrv_open = vmdk_open, .bdrv_check = vmdk_check, .bdrv_reopen_prepare = vmdk_reopen_prepare, - .bdrv_read = vmdk_co_read, - .bdrv_write = vmdk_co_write, + .bdrv_co_preadv = vmdk_co_preadv, + .bdrv_co_pwritev = vmdk_co_pwritev, .bdrv_write_compressed = vmdk_write_compressed, - .bdrv_co_write_zeroes = vmdk_co_write_zeroes, + .bdrv_co_pwrite_zeroes = vmdk_co_pwrite_zeroes, .bdrv_close = vmdk_close, .bdrv_create = vmdk_create, .bdrv_co_flush_to_disk = vmdk_co_flush, @@ -2334,8 +2404,6 @@ static BlockDriver bdrv_vmdk = { .bdrv_get_specific_info = vmdk_get_specific_info, .bdrv_refresh_limits = vmdk_refresh_limits, .bdrv_get_info = vmdk_get_info, - .bdrv_detach_aio_context = vmdk_detach_aio_context, - .bdrv_attach_aio_context = vmdk_attach_aio_context, .supports_backing = true, .create_opts = &vmdk_create_opts, diff --git a/block/vpc.c b/block/vpc.c index 3e2ea698d..43707ed22 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -29,6 +29,7 @@ #include "sysemu/block-backend.h" #include "qemu/module.h" #include "migration/migration.h" +#include "qemu/bswap.h" #if defined(CONFIG_UUID) #include <uuid/uuid.h> #endif @@ -236,7 +237,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - ret = bdrv_pread(bs->file->bs, 0, s->footer_buf, HEADER_SIZE); + ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE); if (ret < 0) { error_setg(errp, "Unable to read VHD header"); goto fail; @@ -256,7 +257,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, } /* If a fixed disk, the footer is found only at the end of the file */ - ret = bdrv_pread(bs->file->bs, offset-HEADER_SIZE, s->footer_buf, + ret = bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf, HEADER_SIZE); if (ret < 0) { goto fail; @@ -327,7 +328,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, } if (disk_type == VHD_DYNAMIC) { - ret = bdrv_pread(bs->file->bs, be64_to_cpu(footer->data_offset), buf, 
+ ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf, HEADER_SIZE); if (ret < 0) { error_setg(errp, "Error reading dynamic VHD header"); @@ -384,7 +385,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, s->bat_offset = be64_to_cpu(dyndisk_header->table_offset); - ret = bdrv_pread(bs->file->bs, s->bat_offset, s->pagetable, + ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable, pagetable_size); if (ret < 0) { error_setg(errp, "Error reading pagetable"); @@ -454,22 +455,21 @@ static int vpc_reopen_prepare(BDRVReopenState *state, * The parameter write must be 1 if the offset will be used for a write * operation (the block bitmaps is updated then), 0 otherwise. */ -static inline int64_t get_sector_offset(BlockDriverState *bs, - int64_t sector_num, int write) +static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset, + bool write) { BDRVVPCState *s = bs->opaque; - uint64_t offset = sector_num * 512; uint64_t bitmap_offset, block_offset; - uint32_t pagetable_index, pageentry_index; + uint32_t pagetable_index, offset_in_block; pagetable_index = offset / s->block_size; - pageentry_index = (offset % s->block_size) / 512; + offset_in_block = offset % s->block_size; if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff) return -1; /* not allocated */ bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index]; - block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index); + block_offset = bitmap_offset + s->bitmap_size + offset_in_block; /* We must ensure that we don't write to any sectors which are marked as unused in the bitmap. We get away with setting all bits in the block @@ -481,12 +481,18 @@ static inline int64_t get_sector_offset(BlockDriverState *bs, s->last_bitmap_offset = bitmap_offset; memset(bitmap, 0xff, s->bitmap_size); - bdrv_pwrite_sync(bs->file->bs, bitmap_offset, bitmap, s->bitmap_size); + bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size); } return block_offset; } +static inline int64_t get_sector_offset(BlockDriverState *bs, + int64_t sector_num, bool write) +{ + return get_image_offset(bs, sector_num * BDRV_SECTOR_SIZE, write); +} + /* * Writes the footer to the end of the image file. 
This is needed when the * file grows as it overwrites the old footer @@ -499,7 +505,7 @@ static int rewrite_footer(BlockDriverState* bs) BDRVVPCState *s = bs->opaque; int64_t offset = s->free_data_block_offset; - ret = bdrv_pwrite_sync(bs->file->bs, offset, s->footer_buf, HEADER_SIZE); + ret = bdrv_pwrite_sync(bs->file, offset, s->footer_buf, HEADER_SIZE); if (ret < 0) return ret; @@ -513,7 +519,7 @@ static int rewrite_footer(BlockDriverState* bs) * * Returns the sectors' offset in the image file on success and < 0 on error */ -static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num) +static int64_t alloc_block(BlockDriverState* bs, int64_t offset) { BDRVVPCState *s = bs->opaque; int64_t bat_offset; @@ -522,19 +528,18 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num) uint8_t bitmap[s->bitmap_size]; /* Check if sector_num is valid */ - if ((sector_num < 0) || (sector_num > bs->total_sectors)) - return -1; + if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) { + return -EINVAL; + } /* Write entry into in-memory BAT */ - index = (sector_num * 512) / s->block_size; - if (s->pagetable[index] != 0xFFFFFFFF) - return -1; - + index = offset / s->block_size; + assert(s->pagetable[index] == 0xFFFFFFFF); s->pagetable[index] = s->free_data_block_offset / 512; /* Initialize the block's bitmap */ memset(bitmap, 0xff, s->bitmap_size); - ret = bdrv_pwrite_sync(bs->file->bs, s->free_data_block_offset, bitmap, + ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap, s->bitmap_size); if (ret < 0) { return ret; @@ -549,15 +554,15 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num) /* Write BAT entry to disk */ bat_offset = s->bat_offset + (4 * index); bat_value = cpu_to_be32(s->pagetable[index]); - ret = bdrv_pwrite_sync(bs->file->bs, bat_offset, &bat_value, 4); + ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4); if (ret < 0) goto fail; - return get_sector_offset(bs, sector_num, 0); + return get_image_offset(bs, offset, false); fail: s->free_data_block_offset -= (s->block_size + s->bitmap_size); - return -1; + return ret; } static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) @@ -573,104 +578,105 @@ static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) return 0; } -static int vpc_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int coroutine_fn +vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVVPCState *s = bs->opaque; int ret; - int64_t offset; - int64_t sectors, sectors_per_block; + int64_t image_offset; + int64_t n_bytes; + int64_t bytes_done = 0; VHDFooter *footer = (VHDFooter *) s->footer_buf; + QEMUIOVector local_qiov; if (be32_to_cpu(footer->type) == VHD_FIXED) { - return bdrv_read(bs->file->bs, sector_num, buf, nb_sectors); + return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0); } - while (nb_sectors > 0) { - offset = get_sector_offset(bs, sector_num, 0); - sectors_per_block = s->block_size >> BDRV_SECTOR_BITS; - sectors = sectors_per_block - (sector_num % sectors_per_block); - if (sectors > nb_sectors) { - sectors = nb_sectors; - } + qemu_co_mutex_lock(&s->lock); + qemu_iovec_init(&local_qiov, qiov->niov); + + while (bytes > 0) { + image_offset = get_image_offset(bs, offset, false); + n_bytes = MIN(bytes, s->block_size - (offset % s->block_size)); - if (offset == -1) { - memset(buf, 0, sectors * BDRV_SECTOR_SIZE); + if (image_offset == -1) { + qemu_iovec_memset(qiov, bytes_done, 
0, n_bytes); } else { - ret = bdrv_pread(bs->file->bs, offset, buf, - sectors * BDRV_SECTOR_SIZE); - if (ret != sectors * BDRV_SECTOR_SIZE) { - return -1; + qemu_iovec_reset(&local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); + + ret = bdrv_co_preadv(bs->file, image_offset, n_bytes, + &local_qiov, 0); + if (ret < 0) { + goto fail; } } - nb_sectors -= sectors; - sector_num += sectors; - buf += sectors * BDRV_SECTOR_SIZE; + bytes -= n_bytes; + offset += n_bytes; + bytes_done += n_bytes; } - return 0; -} -static coroutine_fn int vpc_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVVPCState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = vpc_read(bs, sector_num, buf, nb_sectors); + ret = 0; +fail: + qemu_iovec_destroy(&local_qiov); qemu_co_mutex_unlock(&s->lock); + return ret; } -static int vpc_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) +static int coroutine_fn +vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVVPCState *s = bs->opaque; - int64_t offset; - int64_t sectors, sectors_per_block; + int64_t image_offset; + int64_t n_bytes; + int64_t bytes_done = 0; int ret; VHDFooter *footer = (VHDFooter *) s->footer_buf; + QEMUIOVector local_qiov; if (be32_to_cpu(footer->type) == VHD_FIXED) { - return bdrv_write(bs->file->bs, sector_num, buf, nb_sectors); + return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0); } - while (nb_sectors > 0) { - offset = get_sector_offset(bs, sector_num, 1); - sectors_per_block = s->block_size >> BDRV_SECTOR_BITS; - sectors = sectors_per_block - (sector_num % sectors_per_block); - if (sectors > nb_sectors) { - sectors = nb_sectors; - } + qemu_co_mutex_lock(&s->lock); + qemu_iovec_init(&local_qiov, qiov->niov); + + while (bytes > 0) { + image_offset = get_image_offset(bs, offset, true); + n_bytes = MIN(bytes, s->block_size - (offset % s->block_size)); - if (offset == -1) { - offset = alloc_block(bs, sector_num); - if (offset < 0) - return -1; + if (image_offset == -1) { + image_offset = alloc_block(bs, offset); + if (image_offset < 0) { + ret = image_offset; + goto fail; + } } - ret = bdrv_pwrite(bs->file->bs, offset, buf, - sectors * BDRV_SECTOR_SIZE); - if (ret != sectors * BDRV_SECTOR_SIZE) { - return -1; + qemu_iovec_reset(&local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); + + ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes, + &local_qiov, 0); + if (ret < 0) { + goto fail; } - nb_sectors -= sectors; - sector_num += sectors; - buf += sectors * BDRV_SECTOR_SIZE; + bytes -= n_bytes; + offset += n_bytes; + bytes_done += n_bytes; } - return 0; -} - -static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVVPCState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = vpc_write(bs, sector_num, buf, nb_sectors); + ret = 0; +fail: + qemu_iovec_destroy(&local_qiov); qemu_co_mutex_unlock(&s->lock); + return ret; } @@ -783,13 +789,13 @@ static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf, block_size = 0x200000; num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512); - ret = blk_pwrite(blk, offset, buf, HEADER_SIZE); + ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0); if (ret < 0) { goto fail; } offset = 1536 + ((num_bat_entries * 4 + 511) & ~511); - ret = blk_pwrite(blk, offset, buf, HEADER_SIZE); + ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 
0); if (ret < 0) { goto fail; } @@ -799,7 +805,7 @@ static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf, memset(buf, 0xFF, 512); for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) { - ret = blk_pwrite(blk, offset, buf, 512); + ret = blk_pwrite(blk, offset, buf, 512, 0); if (ret < 0) { goto fail; } @@ -826,7 +832,7 @@ static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf, /* Write the header */ offset = 512; - ret = blk_pwrite(blk, offset, buf, 1024); + ret = blk_pwrite(blk, offset, buf, 1024, 0); if (ret < 0) { goto fail; } @@ -848,7 +854,7 @@ static int create_fixed_disk(BlockBackend *blk, uint8_t *buf, return ret; } - ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE); + ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE, 0); if (ret < 0) { return ret; } @@ -1056,8 +1062,8 @@ static BlockDriver bdrv_vpc = { .bdrv_reopen_prepare = vpc_reopen_prepare, .bdrv_create = vpc_create, - .bdrv_read = vpc_co_read, - .bdrv_write = vpc_co_write, + .bdrv_co_preadv = vpc_co_preadv, + .bdrv_co_pwritev = vpc_co_pwritev, .bdrv_co_get_block_status = vpc_co_get_block_status, .bdrv_get_info = vpc_get_info, diff --git a/block/vvfat.c b/block/vvfat.c index 183fc4f04..ba2620f3c 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -27,6 +27,7 @@ #include "qapi/error.h" #include "block/block_int.h" #include "qemu/module.h" +#include "qemu/bswap.h" #include "migration/migration.h" #include "qapi/qmp/qint.h" #include "qapi/qmp/qbool.h" @@ -113,15 +114,12 @@ static inline int array_ensure_allocated(array_t* array, int index) static inline void* array_get_next(array_t* array) { unsigned int next = array->next; - void* result; if (array_ensure_allocated(array, next) < 0) return NULL; array->next = next + 1; - result = array_get(array, next); - - return result; + return array_get(array, next); } static inline void* array_insert(array_t* array,unsigned int index,unsigned int count) { @@ -343,9 +341,8 @@ typedef struct BDRVVVFATState { unsigned int current_cluster; /* write support */ - BlockDriverState* write_target; char* qcow_filename; - BlockDriverState* qcow; + BdrvChild* qcow; void* fat2; char* used_clusters; array_t commits; @@ -983,7 +980,7 @@ static int init_directories(BDRVVVFATState* s, static BDRVVVFATState *vvv = NULL; #endif -static int enable_write_target(BDRVVVFATState *s, Error **errp); +static int enable_write_target(BlockDriverState *bs, Error **errp); static int is_consistent(BDRVVVFATState *s); static QemuOptsList runtime_opts = { @@ -1160,8 +1157,8 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, s->current_cluster=0xffffffff; /* read only is the default for safety */ - bs->read_only = 1; - s->qcow = s->write_target = NULL; + bs->read_only = true; + s->qcow = NULL; s->qcow_filename = NULL; s->fat2 = NULL; s->downcase_short_names = 1; @@ -1172,11 +1169,11 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, s->sector_count = cyls * heads * secs - (s->first_sectors_number - 1); if (qemu_opt_get_bool(opts, "rw", false)) { - ret = enable_write_target(s, errp); + ret = enable_write_target(bs, errp); if (ret < 0) { goto fail; } - bs->read_only = 0; + bs->read_only = false; } bs->total_sectors = cyls * heads * secs; @@ -1210,6 +1207,11 @@ fail: return ret; } +static void vvfat_refresh_limits(BlockDriverState *bs, Error **errp) +{ + bs->bl.request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O */ +} + static inline void vvfat_close_current_file(BDRVVVFATState *s) { if(s->current_mapping) { @@ -1388,9 
+1390,10 @@ static int vvfat_read(BlockDriverState *bs, int64_t sector_num, return -1; if (s->qcow) { int n; - if (bdrv_is_allocated(s->qcow, sector_num, nb_sectors-i, &n)) { -DLOG(fprintf(stderr, "sectors %d+%d allocated\n", (int)sector_num, n)); - if (bdrv_read(s->qcow, sector_num, buf + i*0x200, n)) { + if (bdrv_is_allocated(s->qcow->bs, sector_num, nb_sectors-i, &n)) { + DLOG(fprintf(stderr, "sectors %d+%d allocated\n", + (int)sector_num, n)); + if (bdrv_read(s->qcow, sector_num, buf + i * 0x200, n)) { return -1; } i += n - 1; @@ -1421,14 +1424,31 @@ DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num)); return 0; } -static coroutine_fn int vvfat_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int coroutine_fn +vvfat_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { int ret; BDRVVVFATState *s = bs->opaque; + uint64_t sector_num = offset >> BDRV_SECTOR_BITS; + int nb_sectors = bytes >> BDRV_SECTOR_BITS; + void *buf; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + + buf = g_try_malloc(bytes); + if (bytes && buf == NULL) { + return -ENOMEM; + } + qemu_co_mutex_lock(&s->lock); ret = vvfat_read(bs, sector_num, buf, nb_sectors); qemu_co_mutex_unlock(&s->lock); + + qemu_iovec_from_buf(qiov, 0, buf, bytes); + g_free(buf); + return ret; } @@ -1649,12 +1669,15 @@ static inline int cluster_was_modified(BDRVVVFATState* s, uint32_t cluster_num) int was_modified = 0; int i, dummy; - if (s->qcow == NULL) - return 0; + if (s->qcow == NULL) { + return 0; + } - for (i = 0; !was_modified && i < s->sectors_per_cluster; i++) - was_modified = bdrv_is_allocated(s->qcow, - cluster2sector(s, cluster_num) + i, 1, &dummy); + for (i = 0; !was_modified && i < s->sectors_per_cluster; i++) { + was_modified = bdrv_is_allocated(s->qcow->bs, + cluster2sector(s, cluster_num) + i, + 1, &dummy); + } return was_modified; } @@ -1803,11 +1826,16 @@ static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s, vvfat_close_current_file(s); for (i = 0; i < s->sectors_per_cluster; i++) { - if (!bdrv_is_allocated(s->qcow, offset + i, 1, &dummy)) { - if (vvfat_read(s->bs, offset, s->cluster_buffer, 1)) { + int res; + + res = bdrv_is_allocated(s->qcow->bs, offset + i, 1, &dummy); + if (!res) { + res = vvfat_read(s->bs, offset, s->cluster_buffer, 1); + if (res) { return -1; } - if (bdrv_write(s->qcow, offset, s->cluster_buffer, 1)) { + res = bdrv_write(s->qcow, offset, s->cluster_buffer, 1); + if (res) { return -2; } } @@ -1941,8 +1969,7 @@ DLOG(fprintf(stderr, "check direntry %d:\n", i); print_direntry(direntries + i)) /* check file size with FAT */ cluster_count = get_cluster_count_for_direntry(s, direntries + i, path2); if (cluster_count != - (le32_to_cpu(direntries[i].size) + s->cluster_size - - 1) / s->cluster_size) { + DIV_ROUND_UP(le32_to_cpu(direntries[i].size), s->cluster_size)) { DLOG(fprintf(stderr, "Cluster count mismatch\n")); goto fail; } @@ -2764,8 +2791,8 @@ static int do_commit(BDRVVVFATState* s) return ret; } - if (s->qcow->drv->bdrv_make_empty) { - s->qcow->drv->bdrv_make_empty(s->qcow); + if (s->qcow->bs->drv->bdrv_make_empty) { + s->qcow->bs->drv->bdrv_make_empty(s->qcow->bs); } memset(s->used_clusters, 0, sector2cluster(s, s->sector_count)); @@ -2880,14 +2907,31 @@ DLOG(checkpoint()); return 0; } -static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) +static int coroutine_fn 
+vvfat_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { int ret; BDRVVVFATState *s = bs->opaque; + uint64_t sector_num = offset >> BDRV_SECTOR_BITS; + int nb_sectors = bytes >> BDRV_SECTOR_BITS; + void *buf; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + + buf = g_try_malloc(bytes); + if (bytes && buf == NULL) { + return -ENOMEM; + } + qemu_iovec_to_buf(qiov, 0, buf, bytes); + qemu_co_mutex_lock(&s->lock); ret = vvfat_write(bs, sector_num, buf, nb_sectors); qemu_co_mutex_unlock(&s->lock); + + g_free(buf); + return ret; } @@ -2904,26 +2948,39 @@ static int64_t coroutine_fn vvfat_co_get_block_status(BlockDriverState *bs, return BDRV_BLOCK_DATA; } -static int write_target_commit(BlockDriverState *bs, int64_t sector_num, - const uint8_t* buffer, int nb_sectors) { +static int coroutine_fn +write_target_commit(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); return try_commit(s); } static void write_target_close(BlockDriverState *bs) { BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); - bdrv_unref(s->qcow); + bdrv_unref_child(s->bs, s->qcow); g_free(s->qcow_filename); } static BlockDriver vvfat_write_target = { .format_name = "vvfat_write_target", - .bdrv_write = write_target_commit, + .bdrv_co_pwritev = write_target_commit, .bdrv_close = write_target_close, }; -static int enable_write_target(BDRVVVFATState *s, Error **errp) +static void vvfat_qcow_options(int *child_flags, QDict *child_options, + int parent_flags, QDict *parent_options) { + *child_flags = BDRV_O_RDWR | BDRV_O_NO_FLUSH; +} + +static const BdrvChildRole child_vvfat_qcow = { + .inherit_options = vvfat_qcow_options, +}; + +static int enable_write_target(BlockDriverState *bs, Error **errp) +{ + BDRVVVFATState *s = bs->opaque; BlockDriver *bdrv_qcow = NULL; BlockDriverState *backing; QemuOpts *opts = NULL; @@ -2960,12 +3017,13 @@ static int enable_write_target(BDRVVVFATState *s, Error **errp) goto err; } - s->qcow = NULL; options = qdict_new(); - qdict_put(options, "driver", qstring_from_str("qcow")); - ret = bdrv_open(&s->qcow, s->qcow_filename, NULL, options, - BDRV_O_RDWR | BDRV_O_NO_FLUSH, errp); - if (ret < 0) { + qdict_put(options, "write-target.driver", qstring_from_str("qcow")); + s->qcow = bdrv_open_child(s->qcow_filename, options, "write-target", bs, + &child_vvfat_qcow, false, errp); + QDECREF(options); + if (!s->qcow) { + ret = -EINVAL; goto err; } @@ -3012,10 +3070,11 @@ static BlockDriver bdrv_vvfat = { .bdrv_parse_filename = vvfat_parse_filename, .bdrv_file_open = vvfat_open, + .bdrv_refresh_limits = vvfat_refresh_limits, .bdrv_close = vvfat_close, - .bdrv_read = vvfat_co_read, - .bdrv_write = vvfat_co_write, + .bdrv_co_preadv = vvfat_co_preadv, + .bdrv_co_pwritev = vvfat_co_pwritev, .bdrv_co_get_block_status = vvfat_co_get_block_status, }; diff --git a/block/win32-aio.c b/block/win32-aio.c index 2d509a9a7..95e3ab154 100644 --- a/block/win32-aio.c +++ b/block/win32-aio.c @@ -27,7 +27,7 @@ #include "block/block_int.h" #include "qemu/module.h" #include "block/aio.h" -#include "raw-aio.h" +#include "block/raw-aio.h" #include "qemu/event_notifier.h" #include <windows.h>
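
The vmdk and vpc conversions above share one pattern: the sector-based .bdrv_read/.bdrv_write callbacks become byte-granularity .bdrv_co_preadv/.bdrv_co_pwritev coroutines that walk a request one cluster at a time and point a local QEMUIOVector at the matching slice of the caller's buffer instead of copying it. Below is a minimal sketch of that loop against QEMU's 2.7-era internal API; sketch_co_preadv, the fixed cluster_bytes value, and the direct bs->file pass-through are illustrative placeholders, not code from this patch.

    #include "qemu/osdep.h"
    #include "block/block_int.h"

    static int coroutine_fn
    sketch_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
                     QEMUIOVector *qiov, int flags)
    {
        QEMUIOVector local_qiov;
        uint64_t bytes_done = 0;
        const uint64_t cluster_bytes = 64 * 1024;   /* placeholder size */
        int ret = 0;

        qemu_iovec_init(&local_qiov, qiov->niov);
        while (bytes > 0) {
            /* Clamp this iteration to the end of the current cluster. */
            uint64_t n_bytes = MIN(bytes,
                                   cluster_bytes - offset % cluster_bytes);

            /* Reference the next n_bytes of the guest buffer; no copy. */
            qemu_iovec_reset(&local_qiov);
            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);

            ret = bdrv_co_preadv(bs->file, offset, n_bytes, &local_qiov, 0);
            if (ret < 0) {
                break;
            }
            bytes -= n_bytes;
            offset += n_bytes;
            bytes_done += n_bytes;
        }
        qemu_iovec_destroy(&local_qiov);
        return ret;
    }

Concatenating into a reusable local_qiov keeps the hot path allocation-free after the initial qemu_iovec_init, which is why the converted drivers initialize it once per request rather than per cluster.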
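vmdk_write_compressed above keeps its synchronous prototype even though the write path is now coroutine-only, so it packs the arguments into a struct, starts a coroutine, and pumps the AioContext until a sentinel return value is overwritten. A stripped-down sketch of that bridge follows, assuming the same QEMU-internal headers as the previous sketch; SyncCoData, sketch_entry, and sketch_sync_call are hypothetical names, and a real entry point would perform coroutine I/O rather than return immediately.

    /* -EINPROGRESS doubles as the "coroutine not finished yet" sentinel. */
    typedef struct SyncCoData {
        BlockDriverState *bs;
        int ret;
    } SyncCoData;

    static void coroutine_fn sketch_entry(void *opaque)
    {
        SyncCoData *data = opaque;
        /* A real implementation would issue coroutine I/O here. */
        data->ret = 0;
    }

    static int sketch_sync_call(BlockDriverState *bs)
    {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        SyncCoData data = { .bs = bs, .ret = -EINPROGRESS };
        Coroutine *co = qemu_coroutine_create(sketch_entry, &data);

        qemu_coroutine_enter(co);
        while (data.ret == -EINPROGRESS) {
            aio_poll(aio_context, true);  /* drive pending I/O to completion */
        }
        return data.ret;
    }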
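vvfat goes the other way: its internals stay sector-based, so the vectored callbacks above flatten the request into a bounce buffer, and vvfat_refresh_limits pins bs->bl.request_alignment to BDRV_SECTOR_SIZE so the driver is only ever offered whole sectors. The shape is sketched below under that alignment assumption; legacy_sector_write is a hypothetical stand-in for the driver's existing sector-based helper.

    /* Hypothetical existing sector-based write path. */
    static int legacy_sector_write(BlockDriverState *bs, int64_t sector_num,
                                   const uint8_t *buf, int nb_sectors);

    static int coroutine_fn
    sketch_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
                      QEMUIOVector *qiov, int flags)
    {
        void *buf;
        int ret;

        /* Guaranteed by the block layer once request_alignment is set. */
        assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
        assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

        buf = g_try_malloc(bytes);
        if (bytes && buf == NULL) {
            return -ENOMEM;
        }
        /* Flatten the scatter/gather list, call the legacy path, free. */
        qemu_iovec_to_buf(qiov, 0, buf, bytes);
        ret = legacy_sector_write(bs, offset >> BDRV_SECTOR_BITS, buf,
                                  bytes >> BDRV_SECTOR_BITS);
        g_free(buf);
        return ret;
    }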