From 3158f4a51894e46ecb593bffbfd12824e1d6534a Mon Sep 17 00:00:00 2001 From: Yonghee Han Date: Wed, 27 Jul 2016 16:40:17 +0900 Subject: Imported Upstream version 2.4.1 Change-Id: I0b584f569cb0e0f4eac13cdb79e110c2dbc34bfc --- block/Makefile.objs | 8 +- block/backup.c | 160 ++- block/blkdebug.c | 20 +- block/blkverify.c | 4 +- block/block-backend.c | 16 + block/commit.c | 3 +- block/curl.c | 16 +- block/dmg.c | 1 + block/io.c | 2610 +++++++++++++++++++++++++++++++++++++++++++++++ block/iscsi.c | 169 ++- block/mirror.c | 117 ++- block/nfs.c | 2 +- block/null.c | 66 +- block/parallels.c | 679 ++++++++++-- block/qapi.c | 65 +- block/qcow.c | 119 ++- block/qcow2-cache.c | 174 ++-- block/qcow2-cluster.c | 137 +-- block/qcow2-refcount.c | 73 +- block/qcow2-snapshot.c | 7 +- block/qcow2.c | 129 ++- block/qcow2.h | 26 +- block/qed.c | 8 +- block/quorum.c | 102 +- block/raw-posix.c | 115 ++- block/raw-win32.c | 1 + block/rbd.c | 66 +- block/sheepdog.c | 155 ++- block/snapshot.c | 19 +- block/ssh.c | 4 +- block/stream.c | 5 +- block/throttle-groups.c | 501 +++++++++ block/vdi.c | 6 +- block/vhdx-log.c | 1 + block/vhdx.c | 10 +- block/vmdk.c | 83 +- block/vpc.c | 6 +- block/vvfat.c | 37 +- 38 files changed, 4854 insertions(+), 866 deletions(-) create mode 100644 block/io.c create mode 100644 block/throttle-groups.c (limited to 'block') diff --git a/block/Makefile.objs b/block/Makefile.objs index db2933e46..58ef2ef3f 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -1,15 +1,16 @@ -block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o +block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-y += qed-check.o block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o -block-obj-$(CONFIG_QUORUM) += quorum.o +block-obj-y += quorum.o block-obj-y += parallels.o blkdebug.o blkverify.o block-obj-y += block-backend.o snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += raw-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o -block-obj-y += null.o mirror.o +block-obj-y += null.o mirror.o io.o +block-obj-y += throttle-groups.o block-obj-y += nbd.o nbd-client.o sheepdog.o block-obj-$(CONFIG_LIBISCSI) += iscsi.o @@ -37,6 +38,7 @@ gluster.o-libs := $(GLUSTERFS_LIBS) ssh.o-cflags := $(LIBSSH2_CFLAGS) ssh.o-libs := $(LIBSSH2_LIBS) archipelago.o-libs := $(ARCHIPELAGO_LIBS) +block-obj-m += dmg.o dmg.o-libs := $(BZIP2_LIBS) qcow.o-libs := -lz linux-aio.o-libs := -laio diff --git a/block/backup.c b/block/backup.c index 1c535b1ab..965654d52 100644 --- a/block/backup.c +++ b/block/backup.c @@ -19,6 +19,7 @@ #include "block/block.h" #include "block/block_int.h" #include "block/blockjob.h" +#include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" #define BACKUP_CLUSTER_BITS 16 @@ -37,6 +38,8 @@ typedef struct CowRequest { typedef struct BackupBlockJob { BlockJob common; BlockDriverState *target; + /* bitmap for sync=incremental */ + BdrvDirtyBitmap *sync_bitmap; MirrorSyncMode sync_mode; RateLimit limit; BlockdevOnError on_source_error; @@ -195,7 +198,7 @@ static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp) BackupBlockJob *s = container_of(job, BackupBlockJob, common); if (speed < 0) { - error_set(errp, QERR_INVALID_PARAMETER, "speed"); + error_setg(errp, QERR_INVALID_PARAMETER, "speed"); return; } ratelimit_set_speed(&s->limit, 
speed / BDRV_SECTOR_SIZE, SLICE_TIME); @@ -242,6 +245,91 @@ static void backup_complete(BlockJob *job, void *opaque) g_free(data); } +static bool coroutine_fn yield_and_check(BackupBlockJob *job) +{ + if (block_job_is_cancelled(&job->common)) { + return true; + } + + /* we need to yield so that bdrv_drain_all() returns. + * (without, VM does not reboot) + */ + if (job->common.speed) { + uint64_t delay_ns = ratelimit_calculate_delay(&job->limit, + job->sectors_read); + job->sectors_read = 0; + block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns); + } else { + block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0); + } + + if (block_job_is_cancelled(&job->common)) { + return true; + } + + return false; +} + +static int coroutine_fn backup_run_incremental(BackupBlockJob *job) +{ + bool error_is_read; + int ret = 0; + int clusters_per_iter; + uint32_t granularity; + int64_t sector; + int64_t cluster; + int64_t end; + int64_t last_cluster = -1; + BlockDriverState *bs = job->common.bs; + HBitmapIter hbi; + + granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap); + clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1); + bdrv_dirty_iter_init(job->sync_bitmap, &hbi); + + /* Find the next dirty sector(s) */ + while ((sector = hbitmap_iter_next(&hbi)) != -1) { + cluster = sector / BACKUP_SECTORS_PER_CLUSTER; + + /* Fake progress updates for any clusters we skipped */ + if (cluster != last_cluster + 1) { + job->common.offset += ((cluster - last_cluster - 1) * + BACKUP_CLUSTER_SIZE); + } + + for (end = cluster + clusters_per_iter; cluster < end; cluster++) { + do { + if (yield_and_check(job)) { + return ret; + } + ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER, + BACKUP_SECTORS_PER_CLUSTER, &error_is_read); + if ((ret < 0) && + backup_error_action(job, error_is_read, -ret) == + BLOCK_ERROR_ACTION_REPORT) { + return ret; + } + } while (ret < 0); + } + + /* If the bitmap granularity is smaller than the backup granularity, + * we need to advance the iterator pointer to the next cluster. */ + if (granularity < BACKUP_CLUSTER_SIZE) { + bdrv_set_dirty_iter(&hbi, cluster * BACKUP_SECTORS_PER_CLUSTER); + } + + last_cluster = cluster - 1; + } + + /* Play some final catchup with the progress meter */ + end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE); + if (last_cluster + 1 < end) { + job->common.offset += ((end - last_cluster - 1) * BACKUP_CLUSTER_SIZE); + } + + return ret; +} + static void coroutine_fn backup_run(void *opaque) { BackupBlockJob *job = opaque; @@ -259,8 +347,7 @@ static void coroutine_fn backup_run(void *opaque) qemu_co_rwlock_init(&job->flush_rwlock); start = 0; - end = DIV_ROUND_UP(job->common.len / BDRV_SECTOR_SIZE, - BACKUP_SECTORS_PER_CLUSTER); + end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE); job->bitmap = hbitmap_alloc(end, 0); @@ -278,28 +365,13 @@ static void coroutine_fn backup_run(void *opaque) qemu_coroutine_yield(); job->common.busy = true; } + } else if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) { + ret = backup_run_incremental(job); } else { /* Both FULL and TOP SYNC_MODE's require copying.. */ for (; start < end; start++) { bool error_is_read; - - if (block_job_is_cancelled(&job->common)) { - break; - } - - /* we need to yield so that qemu_aio_flush() returns. 
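The new backup_run_incremental() converts each dirty bit reported by the sync bitmap into whole backup clusters: BACKUP_CLUSTER_BITS is 16, so a cluster is 64 KiB (128 sectors of 512 bytes), and clusters_per_iter says how many clusters one dirty bit covers when the bitmap granularity is coarser than the backup cluster size. A minimal standalone sketch of that arithmetic (the granularity value is just an example, not taken from the patch):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SECTOR_SIZE                 512
#define BACKUP_CLUSTER_BITS         16                          /* as in block/backup.c */
#define BACKUP_CLUSTER_SIZE         (1 << BACKUP_CLUSTER_BITS)  /* 64 KiB */
#define BACKUP_SECTORS_PER_CLUSTER  (BACKUP_CLUSTER_SIZE / SECTOR_SIZE)  /* 128 */
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
    /* One dirty bit of a 1 MiB-granularity bitmap spans 16 backup clusters. */
    uint32_t granularity = 1 << 20;
    int clusters_per_iter = MAX(granularity / BACKUP_CLUSTER_SIZE, 1);
    assert(clusters_per_iter == 16);

    /* A dirty sector index is mapped to the cluster that contains it. */
    int64_t sector = 300;
    int64_t cluster = sector / BACKUP_SECTORS_PER_CLUSTER;
    assert(cluster == 2);   /* sectors 256..383 live in cluster 2 */

    printf("granularity %u -> %d clusters per dirty bit; sector %lld -> cluster %lld\n",
           (unsigned)granularity, clusters_per_iter,
           (long long)sector, (long long)cluster);
    return 0;
}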
- * (without, VM does not reboot) - */ - if (job->common.speed) { - uint64_t delay_ns = ratelimit_calculate_delay( - &job->limit, job->sectors_read); - job->sectors_read = 0; - block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns); - } else { - block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0); - } - - if (block_job_is_cancelled(&job->common)) { + if (yield_and_check(job)) { break; } @@ -357,6 +429,18 @@ static void coroutine_fn backup_run(void *opaque) qemu_co_rwlock_wrlock(&job->flush_rwlock); qemu_co_rwlock_unlock(&job->flush_rwlock); + if (job->sync_bitmap) { + BdrvDirtyBitmap *bm; + if (ret < 0 || block_job_is_cancelled(&job->common)) { + /* Merge the successor back into the parent, delete nothing. */ + bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL); + assert(bm); + } else { + /* Everything is fine, delete this bitmap and install the backup. */ + bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL); + assert(bm); + } + } hbitmap_free(job->bitmap); bdrv_iostatus_disable(target); @@ -369,6 +453,7 @@ static void coroutine_fn backup_run(void *opaque) void backup_start(BlockDriverState *bs, BlockDriverState *target, int64_t speed, MirrorSyncMode sync_mode, + BdrvDirtyBitmap *sync_bitmap, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockCompletionFunc *cb, void *opaque, @@ -388,7 +473,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && !bdrv_iostatus_is_enabled(bs)) { - error_set(errp, QERR_INVALID_PARAMETER, "on-source-error"); + error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error"); return; } @@ -412,17 +497,36 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, return; } + if (sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) { + if (!sync_bitmap) { + error_setg(errp, "must provide a valid bitmap name for " + "\"incremental\" sync mode"); + return; + } + + /* Create a new bitmap, and freeze/disable this one. */ + if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) { + return; + } + } else if (sync_bitmap) { + error_setg(errp, + "a sync_bitmap was provided to backup_run, " + "but received an incompatible sync_mode (%s)", + MirrorSyncMode_lookup[sync_mode]); + return; + } + len = bdrv_getlength(bs); if (len < 0) { error_setg_errno(errp, -len, "unable to get length for '%s'", bdrv_get_device_name(bs)); - return; + goto error; } BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed, cb, opaque, errp); if (!job) { - return; + goto error; } bdrv_op_block_all(target, job->common.blocker); @@ -431,7 +535,15 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, job->on_target_error = on_target_error; job->target = target; job->sync_mode = sync_mode; + job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ? 
+ sync_bitmap : NULL; job->common.len = len; job->common.co = qemu_coroutine_create(backup_run); qemu_coroutine_enter(job->common.co, job); + return; + + error: + if (sync_bitmap) { + bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL); + } } diff --git a/block/blkdebug.c b/block/blkdebug.c index 63611e0a3..bc247f46f 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -216,10 +216,9 @@ static int get_event_by_name(const char *name, BlkDebugEvent *event) struct add_rule_data { BDRVBlkdebugState *s; int action; - Error **errp; }; -static int add_rule(QemuOpts *opts, void *opaque) +static int add_rule(void *opaque, QemuOpts *opts, Error **errp) { struct add_rule_data *d = opaque; BDRVBlkdebugState *s = d->s; @@ -230,10 +229,10 @@ static int add_rule(QemuOpts *opts, void *opaque) /* Find the right event for the rule */ event_name = qemu_opt_get(opts, "event"); if (!event_name) { - error_setg(d->errp, "Missing event name for rule"); + error_setg(errp, "Missing event name for rule"); return -1; } else if (get_event_by_name(event_name, &event) < 0) { - error_setg(d->errp, "Invalid event name \"%s\"", event_name); + error_setg(errp, "Invalid event name \"%s\"", event_name); return -1; } @@ -319,8 +318,7 @@ static int read_config(BDRVBlkdebugState *s, const char *filename, d.s = s; d.action = ACTION_INJECT_ERROR; - d.errp = &local_err; - qemu_opts_foreach(&inject_error_opts, add_rule, &d, 1); + qemu_opts_foreach(&inject_error_opts, add_rule, &d, &local_err); if (local_err) { error_propagate(errp, local_err); ret = -EINVAL; @@ -328,7 +326,7 @@ static int read_config(BDRVBlkdebugState *s, const char *filename, } d.action = ACTION_SET_STATE; - qemu_opts_foreach(&set_state_opts, add_rule, &d, 1); + qemu_opts_foreach(&set_state_opts, add_rule, &d, &local_err); if (local_err) { error_propagate(errp, local_err); ret = -EINVAL; @@ -431,7 +429,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, /* Open the backing file */ assert(bs->file == NULL); ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-image"), options, "image", - flags | BDRV_O_PROTOCOL, false, &local_err); + bs, &child_file, false, &local_err); if (ret < 0) { error_propagate(errp, local_err); goto out; @@ -721,6 +719,11 @@ static int64_t blkdebug_getlength(BlockDriverState *bs) return bdrv_getlength(bs->file); } +static int blkdebug_truncate(BlockDriverState *bs, int64_t offset) +{ + return bdrv_truncate(bs->file, offset); +} + static void blkdebug_refresh_filename(BlockDriverState *bs) { QDict *opts; @@ -779,6 +782,7 @@ static BlockDriver bdrv_blkdebug = { .bdrv_file_open = blkdebug_open, .bdrv_close = blkdebug_close, .bdrv_getlength = blkdebug_getlength, + .bdrv_truncate = blkdebug_truncate, .bdrv_refresh_filename = blkdebug_refresh_filename, .bdrv_aio_readv = blkdebug_aio_readv, diff --git a/block/blkverify.c b/block/blkverify.c index 438dff8bc..d277e6322 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -125,7 +125,7 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags, /* Open the raw file */ assert(bs->file == NULL); ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-raw"), options, - "raw", flags | BDRV_O_PROTOCOL, false, &local_err); + "raw", bs, &child_file, false, &local_err); if (ret < 0) { error_propagate(errp, local_err); goto fail; @@ -134,7 +134,7 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags, /* Open the test file */ assert(s->test_file == NULL); ret = bdrv_open_image(&s->test_file, qemu_opt_get(opts, "x-image"), options, - 
"test", flags, false, &local_err); + "test", bs, &child_format, false, &local_err); if (ret < 0) { error_propagate(errp, local_err); s->test_file = NULL; diff --git a/block/block-backend.c b/block/block-backend.c index 48b6e4c05..aee8a1202 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -515,6 +515,17 @@ int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf, return bdrv_write(blk->bs, sector_num, buf, nb_sectors); } +int blk_write_zeroes(BlockBackend *blk, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + int ret = blk_check_request(blk, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + return bdrv_write_zeroes(blk->bs, sector_num, nb_sectors, flags); +} + static void error_callback_bh(void *opaque) { struct BlockBackendAIOCB *acb = opaque; @@ -689,6 +700,11 @@ int blk_flush_all(void) return bdrv_flush_all(); } +void blk_drain(BlockBackend *blk) +{ + bdrv_drain(blk->bs); +} + void blk_drain_all(void) { bdrv_drain_all(); diff --git a/block/commit.c b/block/commit.c index cfa2bbebc..7312a5bdc 100644 --- a/block/commit.c +++ b/block/commit.c @@ -15,6 +15,7 @@ #include "trace.h" #include "block/block_int.h" #include "block/blockjob.h" +#include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" enum { @@ -186,7 +187,7 @@ static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp) CommitBlockJob *s = container_of(job, CommitBlockJob, common); if (speed < 0) { - error_set(errp, QERR_INVALID_PARAMETER, "speed"); + error_setg(errp, QERR_INVALID_PARAMETER, "speed"); return; } ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); diff --git a/block/curl.c b/block/curl.c index bbee3ca17..032cc8ae2 100644 --- a/block/curl.c +++ b/block/curl.c @@ -22,8 +22,10 @@ * THE SOFTWARE. */ #include "qemu-common.h" +#include "qemu/error-report.h" #include "block/block_int.h" #include "qapi/qmp/qbool.h" +#include "qapi/qmp/qstring.h" #include // #define DEBUG_CURL @@ -297,6 +299,18 @@ static void curl_multi_check_completion(BDRVCURLState *s) /* ACBs for successful messages get completed in curl_read_cb */ if (msg->data.result != CURLE_OK) { int i; + static int errcount = 100; + + /* Don't lose the original error message from curl, since + * it contains extra data. 
+ */ + if (errcount > 0) { + error_report("curl: %s", state->errmsg); + if (--errcount == 0) { + error_report("curl: further errors suppressed"); + } + } + for (i = 0; i < CURL_NUM_ACB; i++) { CURLAIOCB *acb = state->acb[i]; @@ -304,7 +318,7 @@ static void curl_multi_check_completion(BDRVCURLState *s) continue; } - acb->common.cb(acb->common.opaque, -EIO); + acb->common.cb(acb->common.opaque, -EPROTO); qemu_aio_unref(acb); state->acb[i] = NULL; } diff --git a/block/dmg.c b/block/dmg.c index 825c49d59..9f2528169 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -24,6 +24,7 @@ #include "qemu-common.h" #include "block/block_int.h" #include "qemu/bswap.h" +#include "qemu/error-report.h" #include "qemu/module.h" #include #ifdef CONFIG_BZIP2 diff --git a/block/io.c b/block/io.c new file mode 100644 index 000000000..d4bc83b33 --- /dev/null +++ b/block/io.c @@ -0,0 +1,2610 @@ +/* + * Block layer I/O functions + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "trace.h" +#include "block/blockjob.h" +#include "block/block_int.h" +#include "block/throttle-groups.h" +#include "qemu/error-report.h" + +#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ + +static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque); +static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque); +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov); +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov); +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write); +static void coroutine_fn bdrv_co_do_rw(void *opaque); +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); + +/* throttling disk I/O limits */ +void bdrv_set_io_limits(BlockDriverState *bs, + ThrottleConfig *cfg) +{ + int i; + + throttle_group_config(bs, cfg); + + for (i = 0; i < 2; i++) { + qemu_co_enter_next(&bs->throttled_reqs[i]); + } +} + +/* this function drain all the throttled IOs */ +static bool bdrv_start_throttled_reqs(BlockDriverState *bs) +{ + bool drained = false; + bool enabled = bs->io_limits_enabled; + int i; + + bs->io_limits_enabled = false; + + for (i = 0; i < 2; i++) { + while (qemu_co_enter_next(&bs->throttled_reqs[i])) { + drained = true; + } + } + + bs->io_limits_enabled = enabled; + + return drained; +} + +void bdrv_io_limits_disable(BlockDriverState *bs) +{ + bs->io_limits_enabled = false; + bdrv_start_throttled_reqs(bs); + throttle_group_unregister_bs(bs); +} + +/* should be called before bdrv_set_io_limits if a limit is set */ +void bdrv_io_limits_enable(BlockDriverState *bs, const char *group) +{ + assert(!bs->io_limits_enabled); + throttle_group_register_bs(bs, group); + bs->io_limits_enabled = true; +} + +void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group) +{ + /* this bs is not part of any group */ + if (!bs->throttle_state) { + return; + } + + /* this bs is a part of the same group than the one we want */ + if (!g_strcmp0(throttle_group_get_name(bs), group)) { + return; + } + + /* need to change the group this bs belong to */ + bdrv_io_limits_disable(bs); + bdrv_io_limits_enable(bs, group); +} + +void bdrv_setup_io_funcs(BlockDriver *bdrv) +{ + /* Block drivers without coroutine functions need emulation */ + if (!bdrv->bdrv_co_readv) { + bdrv->bdrv_co_readv = bdrv_co_readv_em; + bdrv->bdrv_co_writev = bdrv_co_writev_em; + + /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if + * the block driver lacks aio we need to emulate that too. 
+ */ + if (!bdrv->bdrv_aio_readv) { + /* add AIO emulation layer */ + bdrv->bdrv_aio_readv = bdrv_aio_readv_em; + bdrv->bdrv_aio_writev = bdrv_aio_writev_em; + } + } +} + +void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) +{ + BlockDriver *drv = bs->drv; + Error *local_err = NULL; + + memset(&bs->bl, 0, sizeof(bs->bl)); + + if (!drv) { + return; + } + + /* Take some limits from the children as a default */ + if (bs->file) { + bdrv_refresh_limits(bs->file, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; + bs->bl.max_transfer_length = bs->file->bl.max_transfer_length; + bs->bl.min_mem_alignment = bs->file->bl.min_mem_alignment; + bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; + } else { + bs->bl.min_mem_alignment = 512; + bs->bl.opt_mem_alignment = getpagesize(); + } + + if (bs->backing_hd) { + bdrv_refresh_limits(bs->backing_hd, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + bs->bl.opt_transfer_length = + MAX(bs->bl.opt_transfer_length, + bs->backing_hd->bl.opt_transfer_length); + bs->bl.max_transfer_length = + MIN_NON_ZERO(bs->bl.max_transfer_length, + bs->backing_hd->bl.max_transfer_length); + bs->bl.opt_mem_alignment = + MAX(bs->bl.opt_mem_alignment, + bs->backing_hd->bl.opt_mem_alignment); + bs->bl.min_mem_alignment = + MAX(bs->bl.min_mem_alignment, + bs->backing_hd->bl.min_mem_alignment); + } + + /* Then let the driver override it */ + if (drv->bdrv_refresh_limits) { + drv->bdrv_refresh_limits(bs, errp); + } +} + +/** + * The copy-on-read flag is actually a reference count so multiple users may + * use the feature without worrying about clobbering its previous state. + * Copy-on-read stays enabled until all users have called to disable it. + */ +void bdrv_enable_copy_on_read(BlockDriverState *bs) +{ + bs->copy_on_read++; +} + +void bdrv_disable_copy_on_read(BlockDriverState *bs) +{ + assert(bs->copy_on_read > 0); + bs->copy_on_read--; +} + +/* Check if any requests are in-flight (including throttled requests) */ +static bool bdrv_requests_pending(BlockDriverState *bs) +{ + if (!QLIST_EMPTY(&bs->tracked_requests)) { + return true; + } + if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { + return true; + } + if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { + return true; + } + if (bs->file && bdrv_requests_pending(bs->file)) { + return true; + } + if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { + return true; + } + return false; +} + +/* + * Wait for pending requests to complete on a single BlockDriverState subtree + * + * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState + * AioContext. + * + * Only this BlockDriverState's AioContext is run, so in-flight requests must + * not depend on events in other AioContexts. In that case, use + * bdrv_drain_all() instead. + */ +void bdrv_drain(BlockDriverState *bs) +{ + bool busy = true; + + while (busy) { + /* Keep iterating */ + bdrv_flush_io_queue(bs); + busy = bdrv_requests_pending(bs); + busy |= aio_poll(bdrv_get_aio_context(bs), busy); + } +} + +/* + * Wait for pending requests to complete across all BlockDriverStates + * + * This function does not flush data to disk, use bdrv_flush_all() for that + * after calling this function. 
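bdrv_refresh_limits() merges limits from bs->file and bs->backing_hd: optimal transfer length and memory alignment take the larger of the two values, while max_transfer_length is combined with MIN_NON_ZERO so that 0 keeps meaning "unlimited". A small self-contained sketch of that merge rule (the MIN_NON_ZERO definition below is written out for illustration, not copied from the QEMU headers):

#include <assert.h>

#define MAX(a, b)          ((a) > (b) ? (a) : (b))
#define MIN(a, b)          ((a) < (b) ? (a) : (b))
/* 0 means "unlimited", so it must lose against any non-zero limit. */
#define MIN_NON_ZERO(a, b) ((a) == 0 ? (b) : ((b) == 0 ? (a) : MIN(a, b)))

int main(void)
{
    /* parent starts with no limits, child (e.g. bs->file) reports some */
    int parent_opt = 0, child_opt = 65536;
    int parent_max = 0, child_max = 1 << 21;

    int merged_opt = MAX(parent_opt, child_opt);           /* prefer larger hint */
    int merged_max = MIN_NON_ZERO(parent_max, child_max);  /* prefer stricter cap */

    assert(merged_opt == 65536);
    assert(merged_max == (1 << 21));

    /* two real caps: the stricter one wins */
    assert(MIN_NON_ZERO(1 << 20, 1 << 21) == (1 << 20));
    return 0;
}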
+ */ +void bdrv_drain_all(void) +{ + /* Always run first iteration so any pending completion BHs run */ + bool busy = true; + BlockDriverState *bs = NULL; + GSList *aio_ctxs = NULL, *ctx; + + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + if (bs->job) { + block_job_pause(bs->job); + } + aio_context_release(aio_context); + + if (!g_slist_find(aio_ctxs, aio_context)) { + aio_ctxs = g_slist_prepend(aio_ctxs, aio_context); + } + } + + /* Note that completion of an asynchronous I/O operation can trigger any + * number of other I/O operations on other devices---for example a + * coroutine can submit an I/O request to another device in response to + * request completion. Therefore we must keep looping until there was no + * more activity rather than simply draining each device independently. + */ + while (busy) { + busy = false; + + for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { + AioContext *aio_context = ctx->data; + bs = NULL; + + aio_context_acquire(aio_context); + while ((bs = bdrv_next(bs))) { + if (aio_context == bdrv_get_aio_context(bs)) { + bdrv_flush_io_queue(bs); + if (bdrv_requests_pending(bs)) { + busy = true; + aio_poll(aio_context, busy); + } + } + } + busy |= aio_poll(aio_context, false); + aio_context_release(aio_context); + } + } + + bs = NULL; + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + if (bs->job) { + block_job_resume(bs->job); + } + aio_context_release(aio_context); + } + g_slist_free(aio_ctxs); +} + +/** + * Remove an active request from the tracked requests list + * + * This function should be called when a tracked request is completing. + */ +static void tracked_request_end(BdrvTrackedRequest *req) +{ + if (req->serialising) { + req->bs->serialising_in_flight--; + } + + QLIST_REMOVE(req, list); + qemu_co_queue_restart_all(&req->wait_queue); +} + +/** + * Add an active request to the tracked requests list + */ +static void tracked_request_begin(BdrvTrackedRequest *req, + BlockDriverState *bs, + int64_t offset, + unsigned int bytes, bool is_write) +{ + *req = (BdrvTrackedRequest){ + .bs = bs, + .offset = offset, + .bytes = bytes, + .is_write = is_write, + .co = qemu_coroutine_self(), + .serialising = false, + .overlap_offset = offset, + .overlap_bytes = bytes, + }; + + qemu_co_queue_init(&req->wait_queue); + + QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); +} + +static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) +{ + int64_t overlap_offset = req->offset & ~(align - 1); + unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) + - overlap_offset; + + if (!req->serialising) { + req->bs->serialising_in_flight++; + req->serialising = true; + } + + req->overlap_offset = MIN(req->overlap_offset, overlap_offset); + req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); +} + +/** + * Round a region to cluster boundaries + */ +void bdrv_round_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors) +{ + BlockDriverInfo bdi; + + if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { + *cluster_sector_num = sector_num; + *cluster_nb_sectors = nb_sectors; + } else { + int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; + *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); + *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + + nb_sectors, c); + } +} + +static int 
bdrv_get_cluster_size(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + int ret; + + ret = bdrv_get_info(bs, &bdi); + if (ret < 0 || bdi.cluster_size == 0) { + return bs->request_alignment; + } else { + return bdi.cluster_size; + } +} + +static bool tracked_request_overlaps(BdrvTrackedRequest *req, + int64_t offset, unsigned int bytes) +{ + /* aaaa bbbb */ + if (offset >= req->overlap_offset + req->overlap_bytes) { + return false; + } + /* bbbb aaaa */ + if (req->overlap_offset >= offset + bytes) { + return false; + } + return true; +} + +static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) +{ + BlockDriverState *bs = self->bs; + BdrvTrackedRequest *req; + bool retry; + bool waited = false; + + if (!bs->serialising_in_flight) { + return false; + } + + do { + retry = false; + QLIST_FOREACH(req, &bs->tracked_requests, list) { + if (req == self || (!req->serialising && !self->serialising)) { + continue; + } + if (tracked_request_overlaps(req, self->overlap_offset, + self->overlap_bytes)) + { + /* Hitting this means there was a reentrant request, for + * example, a block driver issuing nested requests. This must + * never happen since it means deadlock. + */ + assert(qemu_coroutine_self() != req->co); + + /* If the request is already (indirectly) waiting for us, or + * will wait for us as soon as it wakes up, then just go on + * (instead of producing a deadlock in the former case). */ + if (!req->waiting_for) { + self->waiting_for = req; + qemu_co_queue_wait(&req->wait_queue); + self->waiting_for = NULL; + retry = true; + waited = true; + break; + } + } + } + } while (retry); + + return waited; +} + +static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, + size_t size) +{ + if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { + return -EIO; + } + + if (!bdrv_is_inserted(bs)) { + return -ENOMEDIUM; + } + + if (offset < 0) { + return -EIO; + } + + return 0; +} + +static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EIO; + } + + return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE); +} + +typedef struct RwCo { + BlockDriverState *bs; + int64_t offset; + QEMUIOVector *qiov; + bool is_write; + int ret; + BdrvRequestFlags flags; +} RwCo; + +static void coroutine_fn bdrv_rw_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + if (!rwco->is_write) { + rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); + } else { + rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); + } +} + +/* + * Process a vectored synchronous request using coroutines + */ +static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, + QEMUIOVector *qiov, bool is_write, + BdrvRequestFlags flags) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .offset = offset, + .qiov = qiov, + .is_write = is_write, + .ret = NOT_DONE, + .flags = flags, + }; + + /** + * In sync call context, when the vcpu is blocked, this throttling timer + * will not fire; so the I/O throttling function has to be disabled here + * if it has been enabled. 
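tracked_request_overlaps() treats every request as the half-open byte range [overlap_offset, overlap_offset + overlap_bytes) and reports an overlap unless one range ends at or before the start of the other; wait_serialising_requests() then makes an overlapping request wait whenever serialisation is involved. The interval test, restated as a standalone check with a few sample values:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Same test as tracked_request_overlaps(): half-open ranges overlap unless
 * one of them ends at or before the start of the other. */
static bool ranges_overlap(int64_t a_off, uint64_t a_bytes,
                           int64_t b_off, uint64_t b_bytes)
{
    if (b_off >= a_off + (int64_t)a_bytes) {   /* aaaa bbbb */
        return false;
    }
    if (a_off >= b_off + (int64_t)b_bytes) {   /* bbbb aaaa */
        return false;
    }
    return true;
}

int main(void)
{
    assert(!ranges_overlap(0, 4096, 4096, 4096));   /* touching, no overlap */
    assert(ranges_overlap(0, 4097, 4096, 4096));    /* one byte in common */
    assert(ranges_overlap(512, 512, 0, 4096));      /* fully contained */
    return 0;
}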
+ */ + if (bs->io_limits_enabled) { + fprintf(stderr, "Disabling I/O throttling on '%s' due " + "to synchronous I/O.\n", bdrv_get_device_name(bs)); + bdrv_io_limits_disable(bs); + } + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_rw_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_rw_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + return rwco.ret; +} + +/* + * Process a synchronous request using coroutines + */ +static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, + int nb_sectors, bool is_write, BdrvRequestFlags flags) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = nb_sectors * BDRV_SECTOR_SIZE, + }; + + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, + &qiov, is_write, flags); +} + +/* return < 0 if error. See bdrv_write() for the return codes */ +int bdrv_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); +} + +/* Just like bdrv_read(), but with I/O throttling temporarily disabled */ +int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + bool enabled; + int ret; + + enabled = bs->io_limits_enabled; + bs->io_limits_enabled = false; + ret = bdrv_read(bs, sector_num, buf, nb_sectors); + bs->io_limits_enabled = enabled; + return ret; +} + +/* Return < 0 if error. Important errors are: + -EIO generic I/O error (may happen for all errors) + -ENOMEDIUM No media inserted. + -EINVAL Invalid sector number or nb_sectors + -EACCES Trying to write a read-only device +*/ +int bdrv_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); +} + +int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, + BDRV_REQ_ZERO_WRITE | flags); +} + +/* + * Completely zero out a block device with the help of bdrv_write_zeroes. + * The operation is sped up by checking the block status and only writing + * zeroes to the device if they currently do not return zeroes. Optional + * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). + * + * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 
+ */ +int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) +{ + int64_t target_sectors, ret, nb_sectors, sector_num = 0; + int n; + + target_sectors = bdrv_nb_sectors(bs); + if (target_sectors < 0) { + return target_sectors; + } + + for (;;) { + nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); + if (nb_sectors <= 0) { + return 0; + } + ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); + if (ret < 0) { + error_report("error getting block status at sector %" PRId64 ": %s", + sector_num, strerror(-ret)); + return ret; + } + if (ret & BDRV_BLOCK_ZERO) { + sector_num += n; + continue; + } + ret = bdrv_write_zeroes(bs, sector_num, n, flags); + if (ret < 0) { + error_report("error writing zeroes at sector %" PRId64 ": %s", + sector_num, strerror(-ret)); + return ret; + } + sector_num += n; + } +} + +int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = bytes, + }; + int ret; + + if (bytes < 0) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); + if (ret < 0) { + return ret; + } + + return bytes; +} + +int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) +{ + int ret; + + ret = bdrv_prwv_co(bs, offset, qiov, true, 0); + if (ret < 0) { + return ret; + } + + return qiov->size; +} + +int bdrv_pwrite(BlockDriverState *bs, int64_t offset, + const void *buf, int bytes) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = bytes, + }; + + if (bytes < 0) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_pwritev(bs, offset, &qiov); +} + +/* + * Writes to the file and ensures that no writes are reordered across this + * request (acts as a barrier) + * + * Returns 0 on success, -errno in error cases. + */ +int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, + const void *buf, int count) +{ + int ret; + + ret = bdrv_pwrite(bs, offset, buf, count); + if (ret < 0) { + return ret; + } + + /* No flush needed for cache modes that already do it */ + if (bs->enable_write_cache) { + bdrv_flush(bs); + } + + return 0; +} + +static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + /* Perform I/O through a temporary buffer so that users who scribble over + * their read buffer while the operation is in progress do not end up + * modifying the image file. This is critical for zero-copy guest I/O + * where anything might happen inside guest memory. + */ + void *bounce_buffer; + + BlockDriver *drv = bs->drv; + struct iovec iov; + QEMUIOVector bounce_qiov; + int64_t cluster_sector_num; + int cluster_nb_sectors; + size_t skip_bytes; + int ret; + + /* Cover entire cluster so no additional backing file I/O is required when + * allocating cluster in the image file. 
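bdrv_co_do_copy_on_readv() widens the guest request to whole clusters, reads that range into a private bounce buffer, writes it back (or write-zeroes it if the buffer turns out to be all zero), and finally copies only the originally requested slice into the caller's qiov starting at skip_bytes. A standalone sketch of the widening and slice arithmetic (the cluster size is an arbitrary example):

#include <assert.h>
#include <stdint.h>

#define SECTOR_SIZE 512

/* Round [sector_num, +nb_sectors) out to whole clusters, as
 * bdrv_round_to_clusters() does, then compute where the caller's data
 * starts inside the widened bounce buffer. */
int main(void)
{
    int64_t cluster_sectors = 128;          /* 64 KiB clusters, for example */
    int64_t sector_num = 300, nb_sectors = 10;

    int64_t cluster_sector_num = (sector_num / cluster_sectors) * cluster_sectors;
    int64_t cluster_nb_sectors =
        ((sector_num - cluster_sector_num + nb_sectors + cluster_sectors - 1)
         / cluster_sectors) * cluster_sectors;

    assert(cluster_sector_num == 256);      /* widened start */
    assert(cluster_nb_sectors == 128);      /* sectors 256..383 cover 300..309 */

    /* Offset of the requested data inside the bounce buffer. */
    int64_t skip_bytes = (sector_num - cluster_sector_num) * SECTOR_SIZE;
    assert(skip_bytes == 44 * SECTOR_SIZE);
    return 0;
}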
+ */ + bdrv_round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); + + trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, + cluster_sector_num, cluster_nb_sectors); + + iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; + iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); + if (bounce_buffer == NULL) { + ret = -ENOMEM; + goto err; + } + + qemu_iovec_init_external(&bounce_qiov, &iov, 1); + + ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + if (ret < 0) { + goto err; + } + + if (drv->bdrv_co_write_zeroes && + buffer_is_zero(bounce_buffer, iov.iov_len)) { + ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, + cluster_nb_sectors, 0); + } else { + /* This does not change the data on the disk, it is not necessary + * to flush even in cache=writethrough mode. + */ + ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + } + + if (ret < 0) { + /* It might be okay to ignore write errors for guest requests. If this + * is a deliberate copy-on-read then we don't want to ignore the error. + * Simply report it in all cases. + */ + goto err; + } + + skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; + qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, + nb_sectors * BDRV_SECTOR_SIZE); + +err: + qemu_vfree(bounce_buffer); + return ret; +} + +/* + * Forwards an already correctly aligned request to the BlockDriver. This + * handles copy on read and zeroing after EOF; any other features must be + * implemented by the caller. + */ +static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + int64_t align, QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + int ret; + + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(!qiov || bytes == qiov->size); + + /* Handle Copy on Read and associated serialisation */ + if (flags & BDRV_REQ_COPY_ON_READ) { + /* If we touch the same cluster it counts as an overlap. This + * guarantees that allocating writes will be serialized and not race + * with each other for the same cluster. For example, in copy-on-read + * it ensures that the CoR read and write operations are atomic and + * guest writes cannot interleave between them. 
*/ + mark_request_serialising(req, bdrv_get_cluster_size(bs)); + } + + wait_serialising_requests(req); + + if (flags & BDRV_REQ_COPY_ON_READ) { + int pnum; + + ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); + if (ret < 0) { + goto out; + } + + if (!ret || pnum != nb_sectors) { + ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); + goto out; + } + } + + /* Forward the request to the BlockDriver */ + if (!bs->zero_beyond_eof) { + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else { + /* Read zeros after EOF */ + int64_t total_sectors, max_nb_sectors; + + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + ret = total_sectors; + goto out; + } + + max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), + align >> BDRV_SECTOR_BITS); + if (nb_sectors < max_nb_sectors) { + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else if (max_nb_sectors > 0) { + QEMUIOVector local_qiov; + + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_iovec_concat(&local_qiov, qiov, 0, + max_nb_sectors * BDRV_SECTOR_SIZE); + + ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, + &local_qiov); + + qemu_iovec_destroy(&local_qiov); + } else { + ret = 0; + } + + /* Reading beyond end of file is supposed to produce zeroes */ + if (ret == 0 && total_sectors < sector_num + nb_sectors) { + uint64_t offset = MAX(0, total_sectors - sector_num); + uint64_t bytes = (sector_num + nb_sectors - offset) * + BDRV_SECTOR_SIZE; + qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); + } + } + +out: + return ret; +} + +/* + * Handle a read request in coroutine context + */ +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + + ret = bdrv_check_byte_request(bs, offset, bytes); + if (ret < 0) { + return ret; + } + + if (bs->copy_on_read) { + flags |= BDRV_REQ_COPY_ON_READ; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + throttle_group_co_io_limits_intercept(bs, bytes, false); + } + + /* Align read if necessary by padding qiov */ + if (offset & (align - 1)) { + head_buf = qemu_blockalign(bs, align); + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + tail_buf = qemu_blockalign(bs, align); + qemu_iovec_add(&local_qiov, tail_buf, + align - ((offset + bytes) & (align - 1))); + + bytes = ROUND_UP(bytes, align); + } + + tracked_request_begin(&req, bs, offset, bytes, false); + ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, + use_local_qiov ? 
&local_qiov : qiov, + flags); + tracked_request_end(&req); + + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + qemu_vfree(head_buf); + qemu_vfree(tail_buf); + } + + return ret; +} + +static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + +int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, + BDRV_REQ_COPY_ON_READ); +} + +#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 + +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + QEMUIOVector qiov; + struct iovec iov = {0}; + int ret = 0; + + int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, + BDRV_REQUEST_MAX_SECTORS); + + while (nb_sectors > 0 && !ret) { + int num = nb_sectors; + + /* Align request. Block drivers can expect the "bulk" of the request + * to be aligned. + */ + if (bs->bl.write_zeroes_alignment + && num > bs->bl.write_zeroes_alignment) { + if (sector_num % bs->bl.write_zeroes_alignment != 0) { + /* Make a small request up to the first aligned sector. */ + num = bs->bl.write_zeroes_alignment; + num -= sector_num % bs->bl.write_zeroes_alignment; + } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { + /* Shorten the request to the last aligned sector. num cannot + * underflow because num > bs->bl.write_zeroes_alignment. + */ + num -= (sector_num + num) % bs->bl.write_zeroes_alignment; + } + } + + /* limit request size */ + if (num > max_write_zeroes) { + num = max_write_zeroes; + } + + ret = -ENOTSUP; + /* First try the efficient write zeroes operation */ + if (drv->bdrv_co_write_zeroes) { + ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); + } + + if (ret == -ENOTSUP) { + /* Fall back to bounce buffer if write zeroes is unsupported */ + int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, + MAX_WRITE_ZEROES_BOUNCE_BUFFER); + num = MIN(num, max_xfer_len); + iov.iov_len = num * BDRV_SECTOR_SIZE; + if (iov.iov_base == NULL) { + iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); + if (iov.iov_base == NULL) { + ret = -ENOMEM; + goto fail; + } + memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); + } + qemu_iovec_init_external(&qiov, &iov, 1); + + ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); + + /* Keep bounce buffer around if it is big enough for all + * all future requests. + */ + if (num < max_xfer_len) { + qemu_vfree(iov.iov_base); + iov.iov_base = NULL; + } + } + + sector_num += num; + nb_sectors -= num; + } + +fail: + qemu_vfree(iov.iov_base); + return ret; +} + +/* + * Forwards an already correctly aligned write request to the BlockDriver. 
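bdrv_co_do_write_zeroes() splits each request so that its bulk is aligned to bl.write_zeroes_alignment: a short head brings the position up to the first aligned sector, the middle is issued in aligned, size-capped chunks, and any unaligned tail goes last; when the driver has no bdrv_co_write_zeroes callback it falls back to writing a zeroed bounce buffer. The chunking rule, restated standalone with example numbers:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Reproduce the chunking of bdrv_co_do_write_zeroes() for one request:
 * alignment 8 sectors, request covering sectors [5, 25). */
int main(void)
{
    int64_t sector_num = 5;
    int nb_sectors = 20;
    int alignment = 8;            /* bs->bl.write_zeroes_alignment */
    int chunks[8], n = 0;

    while (nb_sectors > 0) {
        int num = nb_sectors;

        if (alignment && num > alignment) {
            if (sector_num % alignment != 0) {
                /* short head up to the first aligned sector */
                num = alignment - sector_num % alignment;
            } else if ((sector_num + num) % alignment != 0) {
                /* shorten so the chunk ends on an aligned sector */
                num -= (sector_num + num) % alignment;
            }
        }
        chunks[n++] = num;
        sector_num += num;
        nb_sectors -= num;
    }

    /* head of 3 sectors, aligned middle of 16, tail of 1 */
    assert(n == 3 && chunks[0] == 3 && chunks[1] == 16 && chunks[2] == 1);
    printf("chunks: %d %d %d\n", chunks[0], chunks[1], chunks[2]);
    return 0;
}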
+ */ +static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + bool waited; + int ret; + + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(!qiov || bytes == qiov->size); + + waited = wait_serialising_requests(req); + assert(!waited || !req->serialising); + assert(req->overlap_offset <= offset); + assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); + + ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); + + if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && + !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && + qemu_iovec_is_zero(qiov)) { + flags |= BDRV_REQ_ZERO_WRITE; + if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { + flags |= BDRV_REQ_MAY_UNMAP; + } + } + + if (ret < 0) { + /* Do nothing, write notifier decided to fail this request */ + } else if (flags & BDRV_REQ_ZERO_WRITE) { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); + ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); + } else { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV); + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); + + if (ret == 0 && !bs->enable_write_cache) { + ret = bdrv_co_flush(bs); + } + + bdrv_set_dirty(bs, sector_num, nb_sectors); + + block_acct_highest_sector(&bs->stats, sector_num, nb_sectors); + + if (ret >= 0) { + bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); + } + + return ret; +} + +static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, + int64_t offset, + unsigned int bytes, + BdrvRequestFlags flags, + BdrvTrackedRequest *req) +{ + uint8_t *buf = NULL; + QEMUIOVector local_qiov; + struct iovec iov; + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + unsigned int head_padding_bytes, tail_padding_bytes; + int ret = 0; + + head_padding_bytes = offset & (align - 1); + tail_padding_bytes = align - ((offset + bytes) & (align - 1)); + + + assert(flags & BDRV_REQ_ZERO_WRITE); + if (head_padding_bytes || tail_padding_bytes) { + buf = qemu_blockalign(bs, align); + iov = (struct iovec) { + .iov_base = buf, + .iov_len = align, + }; + qemu_iovec_init_external(&local_qiov, &iov, 1); + } + if (head_padding_bytes) { + uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); + + /* RMW the unaligned part before head. */ + mark_request_serialising(req, align); + wait_serialising_requests(req); + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); + ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align, + align, &local_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + + memset(buf + head_padding_bytes, 0, zero_bytes); + ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, + &local_qiov, + flags & ~BDRV_REQ_ZERO_WRITE); + if (ret < 0) { + goto fail; + } + offset += zero_bytes; + bytes -= zero_bytes; + } + + assert(!bytes || (offset & (align - 1)) == 0); + if (bytes >= align) { + /* Write the aligned part in the middle. 
*/ + uint64_t aligned_bytes = bytes & ~(align - 1); + ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, + NULL, flags); + if (ret < 0) { + goto fail; + } + bytes -= aligned_bytes; + offset += aligned_bytes; + } + + assert(!bytes || (offset & (align - 1)) == 0); + if (bytes) { + assert(align == tail_padding_bytes + bytes); + /* RMW the unaligned part after tail. */ + mark_request_serialising(req, align); + wait_serialising_requests(req); + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); + ret = bdrv_aligned_preadv(bs, req, offset, align, + align, &local_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + + memset(buf, 0, bytes); + ret = bdrv_aligned_pwritev(bs, req, offset, align, + &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); + } +fail: + qemu_vfree(buf); + return ret; + +} + +/* + * Handle a write request in coroutine context + */ +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BdrvTrackedRequest req; + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + if (bs->read_only) { + return -EPERM; + } + + ret = bdrv_check_byte_request(bs, offset, bytes); + if (ret < 0) { + return ret; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + throttle_group_co_io_limits_intercept(bs, bytes, true); + } + + /* + * Align write if necessary by performing a read-modify-write cycle. + * Pad qiov with the read parts and be sure to have a tracked request not + * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 
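When a write is not aligned to MAX(BDRV_SECTOR_SIZE, bs->request_alignment), bdrv_co_do_pwritev() turns it into a read-modify-write: it reads the aligned blocks around the head and tail, marks the request serialising so no overlapping request runs concurrently, and submits one padded, aligned write. The padding arithmetic with concrete numbers (the alignment value is only an example):

#include <assert.h>
#include <stdint.h>

#define ROUND_UP(n, d) ((((n) + (d) - 1) / (d)) * (d))

int main(void)
{
    uint64_t align = 512;                 /* e.g. MAX(BDRV_SECTOR_SIZE, request_alignment) */
    int64_t offset = 700;
    unsigned int bytes = 1000;            /* guest wants [700, 1700) */

    /* head padding: extend the request down to the previous aligned offset */
    if (offset & (align - 1)) {
        bytes += offset & (align - 1);    /* 1000 + 188 = 1188 */
        offset = offset & ~(align - 1);   /* 512 */
    }

    /* tail padding: extend the request up to the next aligned offset */
    if ((offset + bytes) & (align - 1)) {
        bytes = ROUND_UP(bytes, align);   /* 1536 */
    }

    assert(offset == 512 && bytes == 1536);
    /* the driver sees one aligned write covering [512, 2048) */
    assert(offset + bytes == 2048);
    return 0;
}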
+ */ + tracked_request_begin(&req, bs, offset, bytes, true); + + if (!qiov) { + ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); + goto out; + } + + if (offset & (align - 1)) { + QEMUIOVector head_qiov; + struct iovec head_iov; + + mark_request_serialising(&req, align); + wait_serialising_requests(&req); + + head_buf = qemu_blockalign(bs, align); + head_iov = (struct iovec) { + .iov_base = head_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&head_qiov, &head_iov, 1); + + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); + ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, + align, &head_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + QEMUIOVector tail_qiov; + struct iovec tail_iov; + size_t tail_bytes; + bool waited; + + mark_request_serialising(&req, align); + waited = wait_serialising_requests(&req); + assert(!waited || !use_local_qiov); + + tail_buf = qemu_blockalign(bs, align); + tail_iov = (struct iovec) { + .iov_base = tail_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); + + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); + ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, + align, &tail_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + + tail_bytes = (offset + bytes) & (align - 1); + qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); + + bytes = ROUND_UP(bytes, align); + } + + ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, + use_local_qiov ? 
&local_qiov : qiov, + flags); + +fail: + + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + } + qemu_vfree(head_buf); + qemu_vfree(tail_buf); +out: + tracked_request_end(&req); + return ret; +} + +static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + +int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_writev(bs, sector_num, nb_sectors); + + return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BdrvRequestFlags flags) +{ + trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); + + if (!(bs->open_flags & BDRV_O_UNMAP)) { + flags &= ~BDRV_REQ_MAY_UNMAP; + } + + return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, + BDRV_REQ_ZERO_WRITE | flags); +} + +int bdrv_flush_all(void) +{ + BlockDriverState *bs = NULL; + int result = 0; + + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + int ret; + + aio_context_acquire(aio_context); + ret = bdrv_flush(bs); + if (ret < 0 && !result) { + result = ret; + } + aio_context_release(aio_context); + } + + return result; +} + +typedef struct BdrvCoGetBlockStatusData { + BlockDriverState *bs; + BlockDriverState *base; + int64_t sector_num; + int nb_sectors; + int *pnum; + int64_t ret; + bool done; +} BdrvCoGetBlockStatusData; + +/* + * Returns the allocation status of the specified sectors. + * Drivers not implementing the functionality are assumed to not support + * backing files, hence all their sectors are reported as allocated. + * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. 
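bdrv_co_get_block_status() packs several facts into one int64_t: flag bits such as BDRV_BLOCK_DATA, BDRV_BLOCK_ZERO, BDRV_BLOCK_ALLOCATED and BDRV_BLOCK_OFFSET_VALID in the low bits and, when the offset is valid, a sector-aligned offset ORed into the higher bits (the default path above ORs in sector_num * BDRV_SECTOR_SIZE). A decoding sketch using made-up stand-in constants; the real values live in block.h and may differ:

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for the BDRV_BLOCK_* constants (values assumed,
 * not copied from block.h); the layout idea is flags in the low bits and
 * a 512-byte-aligned offset in the remaining bits. */
#define BLK_DATA          0x01
#define BLK_ZERO          0x02
#define BLK_OFFSET_VALID  0x04
#define BLK_ALLOCATED     0x10
#define BLK_OFFSET_MASK   (~INT64_C(511))

int main(void)
{
    int64_t sector_num = 1024;
    /* what a protocol driver without get_block_status would report */
    int64_t ret = BLK_DATA | BLK_ALLOCATED | BLK_OFFSET_VALID
                  | (sector_num * 512);

    assert(ret & BLK_DATA);
    assert(!(ret & BLK_ZERO));
    assert((ret & BLK_OFFSET_MASK) == sector_num * 512);

    printf("allocated=%d zero=%d offset=%" PRId64 "\n",
           !!(ret & BLK_ALLOCATED), !!(ret & BLK_ZERO),
           ret & BLK_OFFSET_MASK);
    return 0;
}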
+ */ +static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t total_sectors; + int64_t n; + int64_t ret, ret2; + + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + return total_sectors; + } + + if (sector_num >= total_sectors) { + *pnum = 0; + return 0; + } + + n = total_sectors - sector_num; + if (n < nb_sectors) { + nb_sectors = n; + } + + if (!bs->drv->bdrv_co_get_block_status) { + *pnum = nb_sectors; + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; + if (bs->drv->protocol_name) { + ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); + } + return ret; + } + + ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); + if (ret < 0) { + *pnum = 0; + return ret; + } + + if (ret & BDRV_BLOCK_RAW) { + assert(ret & BDRV_BLOCK_OFFSET_VALID); + return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, + *pnum, pnum); + } + + if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { + ret |= BDRV_BLOCK_ALLOCATED; + } else { + if (bdrv_unallocated_blocks_are_zero(bs)) { + ret |= BDRV_BLOCK_ZERO; + } else if (bs->backing_hd) { + BlockDriverState *bs2 = bs->backing_hd; + int64_t nb_sectors2 = bdrv_nb_sectors(bs2); + if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { + ret |= BDRV_BLOCK_ZERO; + } + } + } + + if (bs->file && + (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && + (ret & BDRV_BLOCK_OFFSET_VALID)) { + int file_pnum; + + ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, + *pnum, &file_pnum); + if (ret2 >= 0) { + /* Ignore errors. This is just providing extra information, it + * is useful but not necessary. + */ + if (!file_pnum) { + /* !file_pnum indicates an offset at or beyond the EOF; it is + * perfectly valid for the format block driver to point to such + * offsets, so catch it and mark everything as zero */ + ret |= BDRV_BLOCK_ZERO; + } else { + /* Limit request to the range reported by the protocol driver */ + *pnum = file_pnum; + ret |= (ret2 & BDRV_BLOCK_ZERO); + } + } + } + + return ret; +} + +static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, + int *pnum) +{ + BlockDriverState *p; + int64_t ret = 0; + + assert(bs != base); + for (p = bs; p != base; p = p->backing_hd) { + ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum); + if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) { + break; + } + /* [sector_num, pnum] unallocated on this layer, which could be only + * the first part of [sector_num, nb_sectors]. */ + nb_sectors = MIN(nb_sectors, *pnum); + } + return ret; +} + +/* Coroutine wrapper for bdrv_get_block_status_above() */ +static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) +{ + BdrvCoGetBlockStatusData *data = opaque; + + data->ret = bdrv_co_get_block_status_above(data->bs, data->base, + data->sector_num, + data->nb_sectors, + data->pnum); + data->done = true; +} + +/* + * Synchronous wrapper around bdrv_co_get_block_status_above(). + * + * See bdrv_co_get_block_status_above() for details. 
+ */ +int64_t bdrv_get_block_status_above(BlockDriverState *bs, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + Coroutine *co; + BdrvCoGetBlockStatusData data = { + .bs = bs, + .base = base, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .pnum = pnum, + .done = false, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_get_block_status_above_co_entry(&data); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry); + qemu_coroutine_enter(co, &data); + while (!data.done) { + aio_poll(aio_context, true); + } + } + return data.ret; +} + +int64_t bdrv_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + return bdrv_get_block_status_above(bs, bs->backing_hd, + sector_num, nb_sectors, pnum); +} + +int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); + if (ret < 0) { + return ret; + } + return !!(ret & BDRV_BLOCK_ALLOCATED); +} + +/* + * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] + * + * Return true if the given sector is allocated in any image between + * BASE and TOP (inclusive). BASE can be NULL to check if the given + * sector is allocated in any image of the chain. Return false otherwise. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + */ +int bdrv_is_allocated_above(BlockDriverState *top, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + BlockDriverState *intermediate; + int ret, n = nb_sectors; + + intermediate = top; + while (intermediate && intermediate != base) { + int pnum_inter; + ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, + &pnum_inter); + if (ret < 0) { + return ret; + } else if (ret) { + *pnum = pnum_inter; + return 1; + } + + /* + * [sector_num, nb_sectors] is unallocated on top but intermediate + * might have + * + * [sector_num+x, nr_sectors] allocated. 
+ */ + if (n > pnum_inter && + (intermediate == top || + sector_num + pnum_inter < intermediate->total_sectors)) { + n = pnum_inter; + } + + intermediate = intermediate->backing_hd; + } + + *pnum = n; + return 0; +} + +int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BlockDriver *drv = bs->drv; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + if (!drv->bdrv_write_compressed) { + return -ENOTSUP; + } + ret = bdrv_check_request(bs, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + assert(QLIST_EMPTY(&bs->dirty_bitmaps)); + + return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); +} + +int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, + int64_t pos, int size) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = size, + }; + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_writev_vmstate(bs, &qiov, pos); +} + +int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) +{ + BlockDriver *drv = bs->drv; + + if (!drv) { + return -ENOMEDIUM; + } else if (drv->bdrv_save_vmstate) { + return drv->bdrv_save_vmstate(bs, qiov, pos); + } else if (bs->file) { + return bdrv_writev_vmstate(bs->file, qiov, pos); + } + + return -ENOTSUP; +} + +int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size) +{ + BlockDriver *drv = bs->drv; + if (!drv) + return -ENOMEDIUM; + if (drv->bdrv_load_vmstate) + return drv->bdrv_load_vmstate(bs, buf, pos, size); + if (bs->file) + return bdrv_load_vmstate(bs->file, buf, pos, size); + return -ENOTSUP; +} + +/**************************************************************/ +/* async I/Os */ + +BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, + cb, opaque, false); +} + +BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, + cb, opaque, true); +} + +BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, + BDRV_REQ_ZERO_WRITE | flags, + cb, opaque, true); +} + + +typedef struct MultiwriteCB { + int error; + int num_requests; + int num_callbacks; + struct { + BlockCompletionFunc *cb; + void *opaque; + QEMUIOVector *free_qiov; + } callbacks[]; +} MultiwriteCB; + +static void multiwrite_user_cb(MultiwriteCB *mcb) +{ + int i; + + for (i = 0; i < mcb->num_callbacks; i++) { + mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); + if (mcb->callbacks[i].free_qiov) { + qemu_iovec_destroy(mcb->callbacks[i].free_qiov); + } + g_free(mcb->callbacks[i].free_qiov); + } +} + +static void multiwrite_cb(void *opaque, int ret) +{ + MultiwriteCB *mcb = opaque; + + trace_multiwrite_cb(mcb, ret); + + if (ret < 0 && !mcb->error) { + mcb->error = ret; + } + + mcb->num_requests--; + if (mcb->num_requests == 0) { + multiwrite_user_cb(mcb); + g_free(mcb); + } +} + +static int multiwrite_req_compare(const void *a, 
const void *b) +{ + const BlockRequest *req1 = a, *req2 = b; + + /* + * Note that we can't simply subtract req2->sector from req1->sector + * here as that could overflow the return value. + */ + if (req1->sector > req2->sector) { + return 1; + } else if (req1->sector < req2->sector) { + return -1; + } else { + return 0; + } +} + +/* + * Takes a bunch of requests and tries to merge them. Returns the number of + * requests that remain after merging. + */ +static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, + int num_reqs, MultiwriteCB *mcb) +{ + int i, outidx; + + // Sort requests by start sector + qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); + + // Check if adjacent requests touch the same clusters. If so, combine them, + // filling up gaps with zero sectors. + outidx = 0; + for (i = 1; i < num_reqs; i++) { + int merge = 0; + int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; + + // Handle exactly sequential writes and overlapping writes. + if (reqs[i].sector <= oldreq_last) { + merge = 1; + } + + if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { + merge = 0; + } + + if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + + reqs[i].nb_sectors > bs->bl.max_transfer_length) { + merge = 0; + } + + if (merge) { + size_t size; + QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); + qemu_iovec_init(qiov, + reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); + + // Add the first request to the merged one. If the requests are + // overlapping, drop the last sectors of the first request. + size = (reqs[i].sector - reqs[outidx].sector) << 9; + qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); + + // We should need to add any zeros between the two requests + assert (reqs[i].sector <= oldreq_last); + + // Add the second request + qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); + + // Add tail of first request, if necessary + if (qiov->size < reqs[outidx].qiov->size) { + qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, + reqs[outidx].qiov->size - qiov->size); + } + + reqs[outidx].nb_sectors = qiov->size >> 9; + reqs[outidx].qiov = qiov; + + mcb->callbacks[i].free_qiov = reqs[outidx].qiov; + } else { + outidx++; + reqs[outidx].sector = reqs[i].sector; + reqs[outidx].nb_sectors = reqs[i].nb_sectors; + reqs[outidx].qiov = reqs[i].qiov; + } + } + + block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1); + + return outidx + 1; +} + +/* + * Submit multiple AIO write requests at once. + * + * On success, the function returns 0 and all requests in the reqs array have + * been submitted. In error case this function returns -1, and any of the + * requests may or may not be submitted yet. In particular, this means that the + * callback will be called for some of the requests, for others it won't. The + * caller must check the error field of the BlockRequest to wait for the right + * callbacks (if error != 0, no callback will be called). + * + * The implementation may modify the contents of the reqs array, e.g. to merge + * requests. However, the fields opaque and error are left unmodified as they + * are used to signal failure for a single request to the caller. 
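+ *
+ * Even when several requests are merged into a single I/O operation, the
+ * completion callback of every original request is still invoked
+ * individually once all submitted requests have completed.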
+ */ +int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) +{ + MultiwriteCB *mcb; + int i; + + /* don't submit writes if we don't have a medium */ + if (bs->drv == NULL) { + for (i = 0; i < num_reqs; i++) { + reqs[i].error = -ENOMEDIUM; + } + return -1; + } + + if (num_reqs == 0) { + return 0; + } + + // Create MultiwriteCB structure + mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); + mcb->num_requests = 0; + mcb->num_callbacks = num_reqs; + + for (i = 0; i < num_reqs; i++) { + mcb->callbacks[i].cb = reqs[i].cb; + mcb->callbacks[i].opaque = reqs[i].opaque; + } + + // Check for mergable requests + num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); + + trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); + + /* Run the aio requests. */ + mcb->num_requests = num_reqs; + for (i = 0; i < num_reqs; i++) { + bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, + reqs[i].nb_sectors, reqs[i].flags, + multiwrite_cb, mcb, + true); + } + + return 0; +} + +void bdrv_aio_cancel(BlockAIOCB *acb) +{ + qemu_aio_ref(acb); + bdrv_aio_cancel_async(acb); + while (acb->refcnt > 1) { + if (acb->aiocb_info->get_aio_context) { + aio_poll(acb->aiocb_info->get_aio_context(acb), true); + } else if (acb->bs) { + aio_poll(bdrv_get_aio_context(acb->bs), true); + } else { + abort(); + } + } + qemu_aio_unref(acb); +} + +/* Async version of aio cancel. The caller is not blocked if the acb implements + * cancel_async, otherwise we do nothing and let the request normally complete. + * In either case the completion callback must be called. */ +void bdrv_aio_cancel_async(BlockAIOCB *acb) +{ + if (acb->aiocb_info->cancel_async) { + acb->aiocb_info->cancel_async(acb); + } +} + +/**************************************************************/ +/* async block device emulation */ + +typedef struct BlockAIOCBSync { + BlockAIOCB common; + QEMUBH *bh; + int ret; + /* vector translation state */ + QEMUIOVector *qiov; + uint8_t *bounce; + int is_write; +} BlockAIOCBSync; + +static const AIOCBInfo bdrv_em_aiocb_info = { + .aiocb_size = sizeof(BlockAIOCBSync), +}; + +static void bdrv_aio_bh_cb(void *opaque) +{ + BlockAIOCBSync *acb = opaque; + + if (!acb->is_write && acb->ret >= 0) { + qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + } + qemu_vfree(acb->bounce); + acb->common.cb(acb->common.opaque, acb->ret); + qemu_bh_delete(acb->bh); + acb->bh = NULL; + qemu_aio_unref(acb); +} + +static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque, + int is_write) + +{ + BlockAIOCBSync *acb; + + acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); + acb->is_write = is_write; + acb->qiov = qiov; + acb->bounce = qemu_try_blockalign(bs, qiov->size); + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); + + if (acb->bounce == NULL) { + acb->ret = -ENOMEM; + } else if (is_write) { + qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); + acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); + } else { + acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); + } + + qemu_bh_schedule(acb->bh); + + return &acb->common; +} + +static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); +} + +static BlockAIOCB 
*bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); +} + + +typedef struct BlockAIOCBCoroutine { + BlockAIOCB common; + BlockRequest req; + bool is_write; + bool need_bh; + bool *done; + QEMUBH* bh; +} BlockAIOCBCoroutine; + +static const AIOCBInfo bdrv_em_co_aiocb_info = { + .aiocb_size = sizeof(BlockAIOCBCoroutine), +}; + +static void bdrv_co_complete(BlockAIOCBCoroutine *acb) +{ + if (!acb->need_bh) { + acb->common.cb(acb->common.opaque, acb->req.error); + qemu_aio_unref(acb); + } +} + +static void bdrv_co_em_bh(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + + assert(!acb->need_bh); + qemu_bh_delete(acb->bh); + bdrv_co_complete(acb); +} + +static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) +{ + acb->need_bh = false; + if (acb->req.error != -EINPROGRESS) { + BlockDriverState *bs = acb->common.bs; + + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); + qemu_bh_schedule(acb->bh); + } +} + +/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ +static void coroutine_fn bdrv_co_do_rw(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + if (!acb->is_write) { + acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + } else { + acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + } + + bdrv_co_complete(acb); +} + +static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write) +{ + Coroutine *co; + BlockAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + acb->req.qiov = qiov; + acb->req.flags = flags; + acb->is_write = is_write; + + co = qemu_coroutine_create(bdrv_co_do_rw); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + acb->req.error = bdrv_co_flush(bs); + bdrv_co_complete(acb); +} + +BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_flush(bs, opaque); + + Coroutine *co; + BlockAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + + co = qemu_coroutine_create(bdrv_aio_flush_co_entry); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); + bdrv_co_complete(acb); +} + +BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + Coroutine *co; + BlockAIOCBCoroutine *acb; + + trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + 
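/* Fill in the discard request before starting the coroutine. */
+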
acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + co = qemu_coroutine_create(bdrv_aio_discard_co_entry); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + BlockAIOCB *acb; + + acb = g_slice_alloc(aiocb_info->aiocb_size); + acb->aiocb_info = aiocb_info; + acb->bs = bs; + acb->cb = cb; + acb->opaque = opaque; + acb->refcnt = 1; + return acb; +} + +void qemu_aio_ref(void *p) +{ + BlockAIOCB *acb = p; + acb->refcnt++; +} + +void qemu_aio_unref(void *p) +{ + BlockAIOCB *acb = p; + assert(acb->refcnt > 0); + if (--acb->refcnt == 0) { + g_slice_free1(acb->aiocb_info->aiocb_size, acb); + } +} + +/**************************************************************/ +/* Coroutine block device emulation */ + +typedef struct CoroutineIOCompletion { + Coroutine *coroutine; + int ret; +} CoroutineIOCompletion; + +static void bdrv_co_io_em_complete(void *opaque, int ret) +{ + CoroutineIOCompletion *co = opaque; + + co->ret = ret; + qemu_coroutine_enter(co->coroutine, NULL); +} + +static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *iov, + bool is_write) +{ + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + BlockAIOCB *acb; + + if (is_write) { + acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + } else { + acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + } + + trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); + if (!acb) { + return -EIO; + } + qemu_coroutine_yield(); + + return co.ret; +} + +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); +} + +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); +} + +static void coroutine_fn bdrv_flush_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + rwco->ret = bdrv_co_flush(rwco->bs); +} + +int coroutine_fn bdrv_co_flush(BlockDriverState *bs) +{ + int ret; + + if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || + bdrv_is_sg(bs)) { + return 0; + } + + /* Write back cached data to the OS even with cache=unsafe */ + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); + if (bs->drv->bdrv_co_flush_to_os) { + ret = bs->drv->bdrv_co_flush_to_os(bs); + if (ret < 0) { + return ret; + } + } + + /* But don't actually force it to the disk with cache=unsafe */ + if (bs->open_flags & BDRV_O_NO_FLUSH) { + goto flush_parent; + } + + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); + if (bs->drv->bdrv_co_flush_to_disk) { + ret = bs->drv->bdrv_co_flush_to_disk(bs); + } else if (bs->drv->bdrv_aio_flush) { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); + if (acb == NULL) { + ret = -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } else { + /* + * Some block drivers always operate in either writethrough or unsafe + * mode and don't support bdrv_flush therefore. 
Usually qemu doesn't + * know how the server works (because the behaviour is hardcoded or + * depends on server-side configuration), so we can't ensure that + * everything is safe on disk. Returning an error doesn't work because + * that would break guests even if the server operates in writethrough + * mode. + * + * Let's hope the user knows what he's doing. + */ + ret = 0; + } + if (ret < 0) { + return ret; + } + + /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH + * in the case of cache=unsafe, so there are no useless flushes. + */ +flush_parent: + return bdrv_co_flush(bs->file); +} + +int bdrv_flush(BlockDriverState *bs) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .ret = NOT_DONE, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_flush_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_flush_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + + return rwco.ret; +} + +typedef struct DiscardCo { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + int ret; +} DiscardCo; +static void coroutine_fn bdrv_discard_co_entry(void *opaque) +{ + DiscardCo *rwco = opaque; + + rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); +} + +int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + int max_discard, ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + + ret = bdrv_check_request(bs, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } else if (bs->read_only) { + return -EPERM; + } + + /* Do nothing if disabled. */ + if (!(bs->open_flags & BDRV_O_UNMAP)) { + return 0; + } + + if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { + return 0; + } + + bdrv_set_dirty(bs, sector_num, nb_sectors); + + max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); + while (nb_sectors > 0) { + int ret; + int num = nb_sectors; + + /* align request */ + if (bs->bl.discard_alignment && + num >= bs->bl.discard_alignment && + sector_num % bs->bl.discard_alignment) { + if (num > bs->bl.discard_alignment) { + num = bs->bl.discard_alignment; + } + num -= sector_num % bs->bl.discard_alignment; + } + + /* limit request size */ + if (num > max_discard) { + num = max_discard; + } + + if (bs->drv->bdrv_co_discard) { + ret = bs->drv->bdrv_co_discard(bs, sector_num, num); + } else { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, + bdrv_co_io_em_complete, &co); + if (acb == NULL) { + return -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } + if (ret && ret != -ENOTSUP) { + return ret; + } + + sector_num += num; + nb_sectors -= num; + } + return 0; +} + +int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +{ + Coroutine *co; + DiscardCo rwco = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .ret = NOT_DONE, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_discard_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_discard_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + + return rwco.ret; +} + +/* needed for generic scsi interface */ + +int 
bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ + BlockDriver *drv = bs->drv; + + if (drv && drv->bdrv_ioctl) + return drv->bdrv_ioctl(bs, req, buf); + return -ENOTSUP; +} + +BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockCompletionFunc *cb, void *opaque) +{ + BlockDriver *drv = bs->drv; + + if (drv && drv->bdrv_aio_ioctl) + return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); + return NULL; +} + +void *qemu_blockalign(BlockDriverState *bs, size_t size) +{ + return qemu_memalign(bdrv_opt_mem_align(bs), size); +} + +void *qemu_blockalign0(BlockDriverState *bs, size_t size) +{ + return memset(qemu_blockalign(bs, size), 0, size); +} + +void *qemu_try_blockalign(BlockDriverState *bs, size_t size) +{ + size_t align = bdrv_opt_mem_align(bs); + + /* Ensure that NULL is never returned on success */ + assert(align > 0); + if (size == 0) { + size = align; + } + + return qemu_try_memalign(align, size); +} + +void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) +{ + void *mem = qemu_try_blockalign(bs, size); + + if (mem) { + memset(mem, 0, size); + } + + return mem; +} + +/* + * Check if all memory in this vector is sector aligned. + */ +bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) +{ + int i; + size_t alignment = bdrv_min_mem_align(bs); + + for (i = 0; i < qiov->niov; i++) { + if ((uintptr_t) qiov->iov[i].iov_base % alignment) { + return false; + } + if (qiov->iov[i].iov_len % alignment) { + return false; + } + } + + return true; +} + +void bdrv_add_before_write_notifier(BlockDriverState *bs, + NotifierWithReturn *notifier) +{ + notifier_with_return_list_add(&bs->before_write_notifiers, notifier); +} + +void bdrv_io_plug(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_plug) { + drv->bdrv_io_plug(bs); + } else if (bs->file) { + bdrv_io_plug(bs->file); + } +} + +void bdrv_io_unplug(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_unplug) { + drv->bdrv_io_unplug(bs); + } else if (bs->file) { + bdrv_io_unplug(bs->file); + } +} + +void bdrv_flush_io_queue(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_flush_io_queue) { + drv->bdrv_flush_io_queue(bs); + } else if (bs->file) { + bdrv_flush_io_queue(bs->file); + } + bdrv_start_throttled_reqs(bs); +} diff --git a/block/iscsi.c b/block/iscsi.c index be8af46ad..93f1ee4c6 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -2,7 +2,7 @@ * QEMU Block driver for iSCSI images * * Copyright (c) 2010-2011 Ronnie Sahlberg - * Copyright (c) 2012-2014 Peter Lieven + * Copyright (c) 2012-2015 Peter Lieven * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -38,6 +38,7 @@ #include "qemu/iov.h" #include "sysemu/sysemu.h" #include "qmp-commands.h" +#include "qapi/qmp/qstring.h" #include #include @@ -57,9 +58,6 @@ typedef struct IscsiLun { int events; QEMUTimer *nop_timer; QEMUTimer *event_timer; - uint8_t lbpme; - uint8_t lbprz; - uint8_t has_write_same; struct scsi_inquiry_logical_block_provisioning lbp; struct scsi_inquiry_block_limits bl; unsigned char *zeroblock; @@ -67,6 +65,12 @@ typedef struct IscsiLun { int cluster_sectors; bool use_16_for_rw; bool write_protected; + bool lbpme; + bool lbprz; + bool dpofua; + bool has_write_same; + bool force_next_flush; + bool request_timed_out; } IscsiLun; typedef struct IscsiTask { @@ -79,6 +83,7 @@ typedef struct IscsiTask { 
QEMUBH *bh; IscsiLun *iscsilun; QEMUTimer retry_timer; + bool force_next_flush; } IscsiTask; typedef struct IscsiAIOCB { @@ -96,11 +101,12 @@ typedef struct IscsiAIOCB { #endif } IscsiAIOCB; -#define EVENT_INTERVAL 250 +/* libiscsi uses time_t so its enough to process events every second */ +#define EVENT_INTERVAL 1000 #define NOP_INTERVAL 5000 #define MAX_NOP_FAILURES 3 #define ISCSI_CMD_RETRIES ARRAY_SIZE(iscsi_retry_times) -static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048}; +static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048, 8192, 32768}; /* this threshold is a trade-off knob to choose between * the potential additional overhead of an extra GET_LBA_STATUS request @@ -163,6 +169,19 @@ static inline unsigned exp_random(double mean) return -mean * log((double)rand() / RAND_MAX); } +/* SCSI_STATUS_TASK_SET_FULL and SCSI_STATUS_TIMEOUT were introduced + * in libiscsi 1.10.0 as part of an enum. The LIBISCSI_API_VERSION + * macro was introduced in 1.11.0. So use the API_VERSION macro as + * a hint that the macros are defined and define them ourselves + * otherwise to keep the required libiscsi version at 1.9.0 */ +#if !defined(LIBISCSI_API_VERSION) +#define QEMU_SCSI_STATUS_TASK_SET_FULL 0x28 +#define QEMU_SCSI_STATUS_TIMEOUT 0x0f000002 +#else +#define QEMU_SCSI_STATUS_TASK_SET_FULL SCSI_STATUS_TASK_SET_FULL +#define QEMU_SCSI_STATUS_TIMEOUT SCSI_STATUS_TIMEOUT +#endif + static void iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, void *command_data, void *opaque) @@ -183,10 +202,19 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, iTask->do_retry = 1; goto out; } - if (status == SCSI_STATUS_BUSY) { + if (status == SCSI_STATUS_BUSY || + status == QEMU_SCSI_STATUS_TIMEOUT || + status == QEMU_SCSI_STATUS_TASK_SET_FULL) { unsigned retry_time = exp_random(iscsi_retry_times[iTask->retries - 1]); - error_report("iSCSI Busy (retry #%u in %u ms): %s", + if (status == QEMU_SCSI_STATUS_TIMEOUT) { + /* make sure the request is rescheduled AFTER the + * reconnect is initiated */ + retry_time = EVENT_INTERVAL * 2; + iTask->iscsilun->request_timed_out = true; + } + error_report("iSCSI Busy/TaskSetFull/TimeOut" + " (retry #%u in %u ms): %s", iTask->retries, retry_time, iscsi_get_error(iscsi)); aio_timer_init(iTask->iscsilun->aio_context, @@ -199,6 +227,8 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, } } error_report("iSCSI Failure: %s", iscsi_get_error(iscsi)); + } else { + iTask->iscsilun->force_next_flush |= iTask->force_next_flush; } out: @@ -268,20 +298,26 @@ iscsi_set_events(IscsiLun *iscsilun) iscsilun); iscsilun->events = ev; } - - /* newer versions of libiscsi may return zero events. In this - * case start a timer to ensure we are able to return to service - * once this situation changes. */ - if (!ev) { - timer_mod(iscsilun->event_timer, - qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + EVENT_INTERVAL); - } } -static void iscsi_timed_set_events(void *opaque) +static void iscsi_timed_check_events(void *opaque) { IscsiLun *iscsilun = opaque; + + /* check for timed out requests */ + iscsi_service(iscsilun->iscsi, 0); + + if (iscsilun->request_timed_out) { + iscsilun->request_timed_out = false; + iscsi_reconnect(iscsilun->iscsi); + } + + /* newer versions of libiscsi may return zero events. Ensure we are able + * to return to service once this situation changes. 
*/ iscsi_set_events(iscsilun); + + timer_mod(iscsilun->event_timer, + qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + EVENT_INTERVAL); } static void @@ -369,6 +405,7 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, struct IscsiTask iTask; uint64_t lba; uint32_t num_sectors; + int fua; if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { return -EINVAL; @@ -384,15 +421,17 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, num_sectors = sector_qemu2lun(nb_sectors, iscsilun); iscsi_co_init_iscsitask(iscsilun, &iTask); retry: + fua = iscsilun->dpofua && !bs->enable_write_cache; + iTask.force_next_flush = !fua; if (iscsilun->use_16_for_rw) { iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba, NULL, num_sectors * iscsilun->block_size, - iscsilun->block_size, 0, 0, 0, 0, 0, + iscsilun->block_size, 0, 0, fua, 0, 0, iscsi_co_generic_cb, &iTask); } else { iTask.task = iscsi_write10_task(iscsilun->iscsi, iscsilun->lun, lba, NULL, num_sectors * iscsilun->block_size, - iscsilun->block_size, 0, 0, 0, 0, 0, + iscsilun->block_size, 0, 0, fua, 0, 0, iscsi_co_generic_cb, &iTask); } if (iTask.task == NULL) { @@ -460,7 +499,7 @@ static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, *pnum = nb_sectors; /* LUN does not support logical block provisioning */ - if (iscsilun->lbpme == 0) { + if (!iscsilun->lbpme) { goto out; } @@ -616,12 +655,12 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState *bs) IscsiLun *iscsilun = bs->opaque; struct IscsiTask iTask; - if (bs->sg) { + if (!iscsilun->force_next_flush) { return 0; } + iscsilun->force_next_flush = false; iscsi_co_init_iscsitask(iscsilun, &iTask); - retry: if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0, 0, iscsi_co_generic_cb, &iTask) == NULL) { @@ -917,6 +956,7 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, } iscsi_co_init_iscsitask(iscsilun, &iTask); + iTask.force_next_flush = true; retry: if (use_16_for_ws) { iTask.task = iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba, @@ -1080,16 +1120,37 @@ static char *parse_initiator_name(const char *target) return iscsi_name; } +static int parse_timeout(const char *target) +{ + QemuOptsList *list; + QemuOpts *opts; + const char *timeout; + + list = qemu_find_opts("iscsi"); + if (list) { + opts = qemu_opts_find(list, target); + if (!opts) { + opts = QTAILQ_FIRST(&list->head); + } + if (opts) { + timeout = qemu_opt_get(opts, "timeout"); + if (timeout) { + return atoi(timeout); + } + } + } + + return 0; +} + static void iscsi_nop_timed_event(void *opaque) { IscsiLun *iscsilun = opaque; - if (iscsi_get_nops_in_flight(iscsilun->iscsi) > MAX_NOP_FAILURES) { + if (iscsi_get_nops_in_flight(iscsilun->iscsi) >= MAX_NOP_FAILURES) { error_report("iSCSI: NOP timeout. Reconnecting..."); - iscsi_reconnect(iscsilun->iscsi); - } - - if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) { + iscsilun->request_timed_out = true; + } else if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) { error_report("iSCSI: failed to sent NOP-Out. 
Disabling NOP messages."); return; } @@ -1121,8 +1182,8 @@ static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp) } else { iscsilun->block_size = rc16->block_length; iscsilun->num_blocks = rc16->returned_lba + 1; - iscsilun->lbpme = rc16->lbpme; - iscsilun->lbprz = rc16->lbprz; + iscsilun->lbpme = !!rc16->lbpme; + iscsilun->lbprz = !!rc16->lbprz; iscsilun->use_16_for_rw = (rc16->returned_lba > 0xffffffff); } } @@ -1153,6 +1214,10 @@ static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp) if (task == NULL || task->status != SCSI_STATUS_GOOD) { error_setg(errp, "iSCSI: failed to send readcapacity10 command."); + } else if (!iscsilun->block_size || + iscsilun->block_size % BDRV_SECTOR_SIZE) { + error_setg(errp, "iSCSI: the target returned an invalid " + "block size of %d.", iscsilun->block_size); } if (task) { scsi_free_scsi_task(task); @@ -1247,17 +1312,21 @@ static void iscsi_attach_aio_context(BlockDriverState *bs, timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); - /* Prepare a timer for a delayed call to iscsi_set_events */ + /* Set up a timer for periodic calls to iscsi_set_events and to + * scan for command timeout */ iscsilun->event_timer = aio_timer_new(iscsilun->aio_context, QEMU_CLOCK_REALTIME, SCALE_MS, - iscsi_timed_set_events, iscsilun); + iscsi_timed_check_events, iscsilun); + timer_mod(iscsilun->event_timer, + qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + EVENT_INTERVAL); } -static bool iscsi_is_write_protected(IscsiLun *iscsilun) +static void iscsi_modesense_sync(IscsiLun *iscsilun) { struct scsi_task *task; struct scsi_mode_sense *ms = NULL; - bool wrprotected = false; + iscsilun->write_protected = false; + iscsilun->dpofua = false; task = iscsi_modesense6_sync(iscsilun->iscsi, iscsilun->lun, 1, SCSI_MODESENSE_PC_CURRENT, @@ -1278,13 +1347,13 @@ static bool iscsi_is_write_protected(IscsiLun *iscsilun) iscsi_get_error(iscsilun->iscsi)); goto out; } - wrprotected = ms->device_specific_parameter & 0x80; + iscsilun->write_protected = ms->device_specific_parameter & 0x80; + iscsilun->dpofua = ms->device_specific_parameter & 0x10; out: if (task) { scsi_free_scsi_task(task); } - return wrprotected; } /* @@ -1304,14 +1373,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, QemuOpts *opts; Error *local_err = NULL; const char *filename; - int i, ret = 0; - - if ((BDRV_SECTOR_SIZE % 512) != 0) { - error_setg(errp, "iSCSI: Invalid BDRV_SECTOR_SIZE. 
" - "BDRV_SECTOR_SIZE(%lld) is not a multiple " - "of 512", BDRV_SECTOR_SIZE); - return -EINVAL; - } + int i, ret = 0, timeout = 0; opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); @@ -1381,6 +1443,16 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, goto out; } + /* timeout handling is broken in libiscsi before 1.15.0 */ + timeout = parse_timeout(iscsi_url->target); +#if defined(LIBISCSI_API_VERSION) && LIBISCSI_API_VERSION >= 20150621 + iscsi_set_timeout(iscsi, timeout); +#else + if (timeout) { + error_report("iSCSI: ignoring timeout value for libiscsi <1.15.0"); + } +#endif + if (iscsi_full_connect_sync(iscsi, iscsi_url->portal, iscsi_url->lun) != 0) { error_setg(errp, "iSCSI: Failed to connect to LUN : %s", iscsi_get_error(iscsi)); @@ -1403,7 +1475,8 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, scsi_free_scsi_task(task); task = NULL; - iscsilun->write_protected = iscsi_is_write_protected(iscsilun); + iscsi_modesense_sync(iscsilun); + /* Check the write protect flag of the LUN if we want to write */ if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) && iscsilun->write_protected) { @@ -1481,7 +1554,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran * iscsilun->block_size) >> BDRV_SECTOR_BITS; - if (iscsilun->lbprz && !(bs->open_flags & BDRV_O_NOCACHE)) { + if (iscsilun->lbprz) { iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun); if (iscsilun->allocationmap == NULL) { ret = -ENOMEM; @@ -1655,7 +1728,7 @@ out: static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { IscsiLun *iscsilun = bs->opaque; - bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz; + bdi->unallocated_blocks_are_zero = iscsilun->lbprz; bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws; bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE; return 0; @@ -1728,6 +1801,10 @@ static QemuOptsList qemu_iscsi_opts = { .name = "initiator-name", .type = QEMU_OPT_STRING, .help = "Initiator iqn name to use when connecting", + },{ + .name = "timeout", + .type = QEMU_OPT_NUMBER, + .help = "Request timeout in seconds (default 0 = no timeout)", }, { /* end of list */ } }, diff --git a/block/mirror.c b/block/mirror.c index bd079a439..b2fb4b9b1 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -14,11 +14,13 @@ #include "trace.h" #include "block/blockjob.h" #include "block/block_int.h" +#include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" #include "qemu/bitmap.h" #define SLICE_TIME 100000000ULL /* ns */ #define MAX_IN_FLIGHT 16 +#define DEFAULT_MIRROR_BUF_SIZE (10 << 20) /* The mirroring buffer is a list of granularity-sized chunks. * Free chunks are organized in a list. @@ -58,6 +60,7 @@ typedef struct MirrorBlockJob { int sectors_in_flight; int ret; bool unmap; + bool waiting_for_io; } MirrorBlockJob; typedef struct MirrorOp { @@ -112,11 +115,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret) qemu_iovec_destroy(&op->qiov); g_slice_free(MirrorOp, op); - /* Enter coroutine when it is not sleeping. The coroutine sleeps to - * rate-limit itself. The coroutine will eventually resume since there is - * a sleep timeout so don't wake it early. 
- */ - if (s->common.busy) { + if (s->waiting_for_io) { qemu_coroutine_enter(s->common.co, NULL); } } @@ -126,11 +125,9 @@ static void mirror_write_complete(void *opaque, int ret) MirrorOp *op = opaque; MirrorBlockJob *s = op->s; if (ret < 0) { - BlockDriverState *source = s->common.bs; BlockErrorAction action; - bdrv_set_dirty_bitmap(source, s->dirty_bitmap, op->sector_num, - op->nb_sectors); + bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors); action = mirror_error_action(s, false, -ret); if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) { s->ret = ret; @@ -144,11 +141,9 @@ static void mirror_read_complete(void *opaque, int ret) MirrorOp *op = opaque; MirrorBlockJob *s = op->s; if (ret < 0) { - BlockDriverState *source = s->common.bs; BlockErrorAction action; - bdrv_set_dirty_bitmap(source, s->dirty_bitmap, op->sector_num, - op->nb_sectors); + bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors); action = mirror_error_action(s, true, -ret); if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) { s->ret = ret; @@ -173,10 +168,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) s->sector_num = hbitmap_iter_next(&s->hbi); if (s->sector_num < 0) { - bdrv_dirty_iter_init(source, s->dirty_bitmap, &s->hbi); + bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); s->sector_num = hbitmap_iter_next(&s->hbi); - trace_mirror_restart_iter(s, - bdrv_get_dirty_count(source, s->dirty_bitmap)); + trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); assert(s->sector_num >= 0); } @@ -206,7 +200,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) /* Wait for I/O to this cluster (from a previous iteration) to be done. */ while (test_bit(next_chunk, s->in_flight_bitmap)) { trace_mirror_yield_in_flight(s, sector_num, s->in_flight); + s->waiting_for_io = true; qemu_coroutine_yield(); + s->waiting_for_io = false; } do { @@ -242,7 +238,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) */ while (nb_chunks == 0 && s->buf_free_count < added_chunks) { trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight); + s->waiting_for_io = true; qemu_coroutine_yield(); + s->waiting_for_io = false; } if (s->buf_free_count < nb_chunks + added_chunks) { trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight); @@ -291,8 +289,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) next_sector += sectors_per_chunk; } - bdrv_reset_dirty_bitmap(source, s->dirty_bitmap, sector_num, - nb_sectors); + bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors); /* Copy the dirty cluster. */ s->in_flight++; @@ -337,7 +334,9 @@ static void mirror_free_init(MirrorBlockJob *s) static void mirror_drain(MirrorBlockJob *s) { while (s->in_flight > 0) { + s->waiting_for_io = true; qemu_coroutine_yield(); + s->waiting_for_io = false; } } @@ -392,7 +391,7 @@ static void coroutine_fn mirror_run(void *opaque) MirrorBlockJob *s = opaque; MirrorExitData *data; BlockDriverState *bs = s->common.bs; - int64_t sector_num, end, sectors_per_chunk, length; + int64_t sector_num, end, length; uint64_t last_pause_ns; BlockDriverInfo bdi; char backing_filename[2]; /* we only need 2 characters because we are only @@ -446,16 +445,28 @@ static void coroutine_fn mirror_run(void *opaque) goto immediate_exit; } - sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; mirror_free_init(s); + last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); if (!s->is_none_mode) { /* First part, loop on the sectors and initialize the dirty bitmap. 
*/ BlockDriverState *base = s->base; for (sector_num = 0; sector_num < end; ) { - int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1; - ret = bdrv_is_allocated_above(bs, base, - sector_num, next - sector_num, &n); + /* Just to make sure we are not exceeding int limit. */ + int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS, + end - sector_num); + int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + + if (now - last_pause_ns > SLICE_TIME) { + last_pause_ns = now; + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0); + } + + if (block_job_is_cancelled(&s->common)) { + goto immediate_exit; + } + + ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n); if (ret < 0) { goto immediate_exit; @@ -463,16 +474,13 @@ static void coroutine_fn mirror_run(void *opaque) assert(n > 0); if (ret == 1) { - bdrv_set_dirty_bitmap(bs, s->dirty_bitmap, sector_num, n); - sector_num = next; - } else { - sector_num += n; + bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n); } + sector_num += n; } } - bdrv_dirty_iter_init(bs, s->dirty_bitmap, &s->hbi); - last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); for (;;) { uint64_t delay_ns = 0; int64_t cnt; @@ -483,7 +491,7 @@ static void coroutine_fn mirror_run(void *opaque) goto immediate_exit; } - cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); /* s->common.offset contains the number of bytes already processed so * far, cnt is the number of dirty sectors remaining and * s->sectors_in_flight is the number of sectors currently being @@ -492,7 +500,7 @@ static void coroutine_fn mirror_run(void *opaque) (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE; /* Note that even when no rate limit is applied we need to yield - * periodically with no pending I/O so that qemu_aio_flush() returns. + * periodically with no pending I/O so that bdrv_drain_all() returns. * We do so every SLICE_TIME nanoseconds, or when there is an error, * or when the source is clean, whichever comes first. 
*/ @@ -501,13 +509,12 @@ static void coroutine_fn mirror_run(void *opaque) if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 || (cnt == 0 && s->in_flight > 0)) { trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt); + s->waiting_for_io = true; qemu_coroutine_yield(); + s->waiting_for_io = false; continue; } else if (cnt != 0) { delay_ns = mirror_iteration(s); - if (delay_ns == 0) { - continue; - } } } @@ -533,7 +540,7 @@ static void coroutine_fn mirror_run(void *opaque) should_complete = s->should_complete || block_job_is_cancelled(&s->common); - cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); } } @@ -548,7 +555,7 @@ static void coroutine_fn mirror_run(void *opaque) */ trace_mirror_before_drain(s, cnt); bdrv_drain(bs); - cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); } ret = 0; @@ -599,7 +606,7 @@ static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp) MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); if (speed < 0) { - error_set(errp, QERR_INVALID_PARAMETER, "speed"); + error_setg(errp, QERR_INVALID_PARAMETER, "speed"); return; } ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); @@ -624,8 +631,8 @@ static void mirror_complete(BlockJob *job, Error **errp) return; } if (!s->synced) { - error_set(errp, QERR_BLOCK_JOB_NOT_READY, - bdrv_get_device_name(job->bs)); + error_setg(errp, QERR_BLOCK_JOB_NOT_READY, + bdrv_get_device_name(job->bs)); return; } @@ -651,7 +658,7 @@ static void mirror_complete(BlockJob *job, Error **errp) } s->should_complete = true; - block_job_resume(job); + block_job_enter(&s->common); } static const BlockJobDriver mirror_job_driver = { @@ -673,7 +680,7 @@ static const BlockJobDriver commit_active_job_driver = { static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, const char *replaces, - int64_t speed, int64_t granularity, + int64_t speed, uint32_t granularity, int64_t buf_size, BlockdevOnError on_source_error, BlockdevOnError on_target_error, @@ -686,15 +693,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, MirrorBlockJob *s; if (granularity == 0) { - /* Choose the default granularity based on the target file's cluster - * size, clamped between 4k and 64k. 
*/ - BlockDriverInfo bdi; - if (bdrv_get_info(target, &bdi) >= 0 && bdi.cluster_size != 0) { - granularity = MAX(4096, bdi.cluster_size); - granularity = MIN(65536, granularity); - } else { - granularity = 65536; - } + granularity = bdrv_get_default_bitmap_granularity(target); } assert ((granularity & (granularity - 1)) == 0); @@ -702,10 +701,18 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && !bdrv_iostatus_is_enabled(bs)) { - error_set(errp, QERR_INVALID_PARAMETER, "on-source-error"); + error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error"); return; } + if (buf_size < 0) { + error_setg(errp, "Invalid parameter 'buf-size'"); + return; + } + + if (buf_size == 0) { + buf_size = DEFAULT_MIRROR_BUF_SIZE; + } s = block_job_create(driver, bs, speed, cb, opaque, errp); if (!s) { @@ -719,11 +726,13 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, s->is_none_mode = is_none_mode; s->base = base; s->granularity = granularity; - s->buf_size = MAX(buf_size, granularity); + s->buf_size = ROUND_UP(buf_size, granularity); s->unmap = unmap; - s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, errp); + s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); if (!s->dirty_bitmap) { + g_free(s->replaces); + block_job_release(bs); return; } bdrv_set_enable_write_cache(s->target, true); @@ -736,7 +745,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, void mirror_start(BlockDriverState *bs, BlockDriverState *target, const char *replaces, - int64_t speed, int64_t granularity, int64_t buf_size, + int64_t speed, uint32_t granularity, int64_t buf_size, MirrorSyncMode mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, bool unmap, @@ -746,6 +755,10 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, bool is_none_mode; BlockDriverState *base; + if (mode == MIRROR_SYNC_MODE_INCREMENTAL) { + error_setg(errp, "Sync mode 'incremental' not supported"); + return; + } is_none_mode = mode == MIRROR_SYNC_MODE_NONE; base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL; mirror_start_job(bs, target, replaces, diff --git a/block/nfs.c b/block/nfs.c index c026ff688..02eb4e464 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -475,7 +475,7 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs) aio_poll(client->aio_context, true); } - return (task.ret < 0 ? task.ret : st.st_blocks * st.st_blksize); + return (task.ret < 0 ? 
task.ret : st.st_blocks * 512); } static int nfs_file_truncate(BlockDriverState *bs, int64_t offset) diff --git a/block/null.c b/block/null.c index ec2bd27a4..7d083233f 100644 --- a/block/null.c +++ b/block/null.c @@ -12,8 +12,11 @@ #include "block/block_int.h" +#define NULL_OPT_LATENCY "latency-ns" + typedef struct { int64_t length; + int64_t latency_ns; } BDRVNullState; static QemuOptsList runtime_opts = { @@ -30,6 +33,12 @@ static QemuOptsList runtime_opts = { .type = QEMU_OPT_SIZE, .help = "size of the null block", }, + { + .name = NULL_OPT_LATENCY, + .type = QEMU_OPT_NUMBER, + .help = "nanoseconds (approximated) to wait " + "before completing request", + }, { /* end of list */ } }, }; @@ -39,13 +48,20 @@ static int null_file_open(BlockDriverState *bs, QDict *options, int flags, { QemuOpts *opts; BDRVNullState *s = bs->opaque; + int ret = 0; opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &error_abort); s->length = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 1 << 30); + s->latency_ns = + qemu_opt_get_number(opts, NULL_OPT_LATENCY, 0); + if (s->latency_ns < 0) { + error_setg(errp, "latency-ns is invalid"); + ret = -EINVAL; + } qemu_opts_del(opts); - return 0; + return ret; } static void null_close(BlockDriverState *bs) @@ -58,28 +74,40 @@ static int64_t null_getlength(BlockDriverState *bs) return s->length; } +static coroutine_fn int null_co_common(BlockDriverState *bs) +{ + BDRVNullState *s = bs->opaque; + + if (s->latency_ns) { + co_aio_sleep_ns(bdrv_get_aio_context(bs), QEMU_CLOCK_REALTIME, + s->latency_ns); + } + return 0; +} + static coroutine_fn int null_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return 0; + return null_co_common(bs); } static coroutine_fn int null_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return 0; + return null_co_common(bs); } static coroutine_fn int null_co_flush(BlockDriverState *bs) { - return 0; + return null_co_common(bs); } typedef struct { BlockAIOCB common; QEMUBH *bh; + QEMUTimer timer; } NullAIOCB; static const AIOCBInfo null_aiocb_info = { @@ -94,15 +122,33 @@ static void null_bh_cb(void *opaque) qemu_aio_unref(acb); } +static void null_timer_cb(void *opaque) +{ + NullAIOCB *acb = opaque; + acb->common.cb(acb->common.opaque, 0); + timer_deinit(&acb->timer); + qemu_aio_unref(acb); +} + static inline BlockAIOCB *null_aio_common(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque) { NullAIOCB *acb; + BDRVNullState *s = bs->opaque; acb = qemu_aio_get(&null_aiocb_info, bs, cb, opaque); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb); - qemu_bh_schedule(acb->bh); + /* Only emulate latency after vcpu is running. 
*/ + if (s->latency_ns) { + aio_timer_init(bdrv_get_aio_context(bs), &acb->timer, + QEMU_CLOCK_REALTIME, SCALE_NS, + null_timer_cb, acb); + timer_mod_ns(&acb->timer, + qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + s->latency_ns); + } else { + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb); + qemu_bh_schedule(acb->bh); + } return &acb->common; } @@ -131,6 +177,12 @@ static BlockAIOCB *null_aio_flush(BlockDriverState *bs, return null_aio_common(bs, cb, opaque); } +static int null_reopen_prepare(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + static BlockDriver bdrv_null_co = { .format_name = "null-co", .protocol_name = "null-co", @@ -143,6 +195,7 @@ static BlockDriver bdrv_null_co = { .bdrv_co_readv = null_co_readv, .bdrv_co_writev = null_co_writev, .bdrv_co_flush_to_disk = null_co_flush, + .bdrv_reopen_prepare = null_reopen_prepare, }; static BlockDriver bdrv_null_aio = { @@ -157,6 +210,7 @@ static BlockDriver bdrv_null_aio = { .bdrv_aio_readv = null_aio_readv, .bdrv_aio_writev = null_aio_writev, .bdrv_aio_flush = null_aio_flush, + .bdrv_reopen_prepare = null_reopen_prepare, }; static void bdrv_null_init(void) diff --git a/block/parallels.c b/block/parallels.c index 4f9cd8dd2..046b56844 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -2,8 +2,12 @@ * Block driver for Parallels disk image format * * Copyright (c) 2007 Alex Beregszaszi + * Copyright (c) 2015 Denis V. Lunev * - * This code is based on comparing different disk images created by Parallels. + * This code was originally based on comparing different disk images created + * by Parallels. Currently it is based on opened OpenVZ sources + * available at + * http://git.openvz.org/?p=ploop;a=summary * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -26,63 +30,539 @@ #include "qemu-common.h" #include "block/block_int.h" #include "qemu/module.h" +#include "qemu/bitmap.h" +#include "qapi/util.h" /**************************************************************/ #define HEADER_MAGIC "WithoutFreeSpace" #define HEADER_MAGIC2 "WithouFreSpacExt" #define HEADER_VERSION 2 -#define HEADER_SIZE 64 +#define HEADER_INUSE_MAGIC (0x746F6E59) + +#define DEFAULT_CLUSTER_SIZE 1048576 /* 1 MiB */ + // always little-endian -struct parallels_header { +typedef struct ParallelsHeader { char magic[16]; // "WithoutFreeSpace" uint32_t version; uint32_t heads; uint32_t cylinders; uint32_t tracks; - uint32_t catalog_entries; + uint32_t bat_entries; uint64_t nb_sectors; uint32_t inuse; uint32_t data_off; char padding[12]; -} QEMU_PACKED; +} QEMU_PACKED ParallelsHeader; + + +typedef enum ParallelsPreallocMode { + PRL_PREALLOC_MODE_FALLOCATE = 0, + PRL_PREALLOC_MODE_TRUNCATE = 1, + PRL_PREALLOC_MODE_MAX = 2, +} ParallelsPreallocMode; + +static const char *prealloc_mode_lookup[] = { + "falloc", + "truncate", + NULL, +}; + typedef struct BDRVParallelsState { + /** Locking is conservative, the lock protects + * - image file extending (truncate, fallocate) + * - any access to block allocation table + */ CoMutex lock; - uint32_t *catalog_bitmap; - unsigned int catalog_size; + ParallelsHeader *header; + uint32_t header_size; + bool header_unclean; + + unsigned long *bat_dirty_bmap; + unsigned int bat_dirty_block; + + uint32_t *bat_bitmap; + unsigned int bat_size; + + int64_t data_end; + uint64_t prealloc_size; + ParallelsPreallocMode prealloc_mode; unsigned int tracks; unsigned int off_multiplier; } 
BDRVParallelsState; -static int parallels_probe(const uint8_t *buf, int buf_size, const char *filename) + +#define PARALLELS_OPT_PREALLOC_MODE "prealloc-mode" +#define PARALLELS_OPT_PREALLOC_SIZE "prealloc-size" + +static QemuOptsList parallels_runtime_opts = { + .name = "parallels", + .head = QTAILQ_HEAD_INITIALIZER(parallels_runtime_opts.head), + .desc = { + { + .name = PARALLELS_OPT_PREALLOC_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Preallocation size on image expansion", + .def_value_str = "128MiB", + }, + { + .name = PARALLELS_OPT_PREALLOC_MODE, + .type = QEMU_OPT_STRING, + .help = "Preallocation mode on image expansion " + "(allowed values: falloc, truncate)", + .def_value_str = "falloc", + }, + { /* end of list */ }, + }, +}; + + +static int64_t bat2sect(BDRVParallelsState *s, uint32_t idx) +{ + return (uint64_t)le32_to_cpu(s->bat_bitmap[idx]) * s->off_multiplier; +} + +static uint32_t bat_entry_off(uint32_t idx) +{ + return sizeof(ParallelsHeader) + sizeof(uint32_t) * idx; +} + +static int64_t seek_to_sector(BDRVParallelsState *s, int64_t sector_num) +{ + uint32_t index, offset; + + index = sector_num / s->tracks; + offset = sector_num % s->tracks; + + /* not allocated */ + if ((index >= s->bat_size) || (s->bat_bitmap[index] == 0)) { + return -1; + } + return bat2sect(s, index) + offset; +} + +static int cluster_remainder(BDRVParallelsState *s, int64_t sector_num, + int nb_sectors) +{ + int ret = s->tracks - sector_num % s->tracks; + return MIN(nb_sectors, ret); +} + +static int64_t block_status(BDRVParallelsState *s, int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t start_off = -2, prev_end_off = -2; + + *pnum = 0; + while (nb_sectors > 0 || start_off == -2) { + int64_t offset = seek_to_sector(s, sector_num); + int to_end; + + if (start_off == -2) { + start_off = offset; + prev_end_off = offset; + } else if (offset != prev_end_off) { + break; + } + + to_end = cluster_remainder(s, sector_num, nb_sectors); + nb_sectors -= to_end; + sector_num += to_end; + *pnum += to_end; + + if (offset > 0) { + prev_end_off += to_end; + } + } + return start_off; +} + +static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) { - const struct parallels_header *ph = (const void *)buf; + BDRVParallelsState *s = bs->opaque; + uint32_t idx, to_allocate, i; + int64_t pos, space; + + pos = block_status(s, sector_num, nb_sectors, pnum); + if (pos > 0) { + return pos; + } + + idx = sector_num / s->tracks; + if (idx >= s->bat_size) { + return -EINVAL; + } + + to_allocate = (sector_num + *pnum + s->tracks - 1) / s->tracks - idx; + space = to_allocate * s->tracks; + if (s->data_end + space > bdrv_getlength(bs->file) >> BDRV_SECTOR_BITS) { + int ret; + space += s->prealloc_size; + if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) { + ret = bdrv_write_zeroes(bs->file, s->data_end, space, 0); + } else { + ret = bdrv_truncate(bs->file, + (s->data_end + space) << BDRV_SECTOR_BITS); + } + if (ret < 0) { + return ret; + } + } + + for (i = 0; i < to_allocate; i++) { + s->bat_bitmap[idx + i] = cpu_to_le32(s->data_end / s->off_multiplier); + s->data_end += s->tracks; + bitmap_set(s->bat_dirty_bmap, + bat_entry_off(idx) / s->bat_dirty_block, 1); + } + + return bat2sect(s, idx) + sector_num % s->tracks; +} + - if (buf_size < HEADER_SIZE) +static coroutine_fn int parallels_co_flush_to_os(BlockDriverState *bs) +{ + BDRVParallelsState *s = bs->opaque; + unsigned long size = DIV_ROUND_UP(s->header_size, s->bat_dirty_block); + unsigned long bit; + + 
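/* Write back only those blocks of the in-memory header/BAT copy that were
+     * dirtied since the previous flush, then clear the dirty bitmap. */
+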
qemu_co_mutex_lock(&s->lock); + + bit = find_first_bit(s->bat_dirty_bmap, size); + while (bit < size) { + uint32_t off = bit * s->bat_dirty_block; + uint32_t to_write = s->bat_dirty_block; + int ret; + + if (off + to_write > s->header_size) { + to_write = s->header_size - off; + } + ret = bdrv_pwrite(bs->file, off, (uint8_t *)s->header + off, to_write); + if (ret < 0) { + qemu_co_mutex_unlock(&s->lock); + return ret; + } + bit = find_next_bit(s->bat_dirty_bmap, size, bit + 1); + } + bitmap_zero(s->bat_dirty_bmap, size); + + qemu_co_mutex_unlock(&s->lock); + return 0; +} + + +static int64_t coroutine_fn parallels_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) +{ + BDRVParallelsState *s = bs->opaque; + int64_t offset; + + qemu_co_mutex_lock(&s->lock); + offset = block_status(s, sector_num, nb_sectors, pnum); + qemu_co_mutex_unlock(&s->lock); + + if (offset < 0) { return 0; + } + + return (offset << BDRV_SECTOR_BITS) | + BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; +} + +static coroutine_fn int parallels_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + BDRVParallelsState *s = bs->opaque; + uint64_t bytes_done = 0; + QEMUIOVector hd_qiov; + int ret = 0; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + while (nb_sectors > 0) { + int64_t position; + int n, nbytes; + + qemu_co_mutex_lock(&s->lock); + position = allocate_clusters(bs, sector_num, nb_sectors, &n); + qemu_co_mutex_unlock(&s->lock); + if (position < 0) { + ret = (int)position; + break; + } + + nbytes = n << BDRV_SECTOR_BITS; + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes); + + ret = bdrv_co_writev(bs->file, position, n, &hd_qiov); + if (ret < 0) { + break; + } + + nb_sectors -= n; + sector_num += n; + bytes_done += nbytes; + } + + qemu_iovec_destroy(&hd_qiov); + return ret; +} + +static coroutine_fn int parallels_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + BDRVParallelsState *s = bs->opaque; + uint64_t bytes_done = 0; + QEMUIOVector hd_qiov; + int ret = 0; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + while (nb_sectors > 0) { + int64_t position; + int n, nbytes; + + qemu_co_mutex_lock(&s->lock); + position = block_status(s, sector_num, nb_sectors, &n); + qemu_co_mutex_unlock(&s->lock); + + nbytes = n << BDRV_SECTOR_BITS; + + if (position < 0) { + qemu_iovec_memset(qiov, bytes_done, 0, nbytes); + } else { + qemu_iovec_reset(&hd_qiov); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes); + + ret = bdrv_co_readv(bs->file, position, n, &hd_qiov); + if (ret < 0) { + break; + } + } + + nb_sectors -= n; + sector_num += n; + bytes_done += nbytes; + } + + qemu_iovec_destroy(&hd_qiov); + return ret; +} + + +static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix) +{ + BDRVParallelsState *s = bs->opaque; + int64_t size, prev_off, high_off; + int ret; + uint32_t i; + bool flush_bat = false; + int cluster_size = s->tracks << BDRV_SECTOR_BITS; + + size = bdrv_getlength(bs->file); + if (size < 0) { + res->check_errors++; + return size; + } + + if (s->header_unclean) { + fprintf(stderr, "%s image was not closed correctly\n", + fix & BDRV_FIX_ERRORS ? 
"Repairing" : "ERROR"); + res->corruptions++; + if (fix & BDRV_FIX_ERRORS) { + /* parallels_close will do the job right */ + res->corruptions_fixed++; + s->header_unclean = false; + } + } + + res->bfi.total_clusters = s->bat_size; + res->bfi.compressed_clusters = 0; /* compression is not supported */ + + high_off = 0; + prev_off = 0; + for (i = 0; i < s->bat_size; i++) { + int64_t off = bat2sect(s, i) << BDRV_SECTOR_BITS; + if (off == 0) { + prev_off = 0; + continue; + } + + /* cluster outside the image */ + if (off > size) { + fprintf(stderr, "%s cluster %u is outside image\n", + fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i); + res->corruptions++; + if (fix & BDRV_FIX_ERRORS) { + prev_off = 0; + s->bat_bitmap[i] = 0; + res->corruptions_fixed++; + flush_bat = true; + continue; + } + } + + res->bfi.allocated_clusters++; + if (off > high_off) { + high_off = off; + } + + if (prev_off != 0 && (prev_off + cluster_size) != off) { + res->bfi.fragmented_clusters++; + } + prev_off = off; + } + + if (flush_bat) { + ret = bdrv_pwrite_sync(bs->file, 0, s->header, s->header_size); + if (ret < 0) { + res->check_errors++; + return ret; + } + } + + res->image_end_offset = high_off + cluster_size; + if (size > res->image_end_offset) { + int64_t count; + count = DIV_ROUND_UP(size - res->image_end_offset, cluster_size); + fprintf(stderr, "%s space leaked at the end of the image %" PRId64 "\n", + fix & BDRV_FIX_LEAKS ? "Repairing" : "ERROR", + size - res->image_end_offset); + res->leaks += count; + if (fix & BDRV_FIX_LEAKS) { + ret = bdrv_truncate(bs->file, res->image_end_offset); + if (ret < 0) { + res->check_errors++; + return ret; + } + res->leaks_fixed += count; + } + } + + return 0; +} + + +static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) +{ + int64_t total_size, cl_size; + uint8_t tmp[BDRV_SECTOR_SIZE]; + Error *local_err = NULL; + BlockDriverState *file; + uint32_t bat_entries, bat_sectors; + ParallelsHeader header; + int ret; + + total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + cl_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, + DEFAULT_CLUSTER_SIZE), BDRV_SECTOR_SIZE); + + ret = bdrv_create_file(filename, opts, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + return ret; + } + + file = NULL; + ret = bdrv_open(&file, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + return ret; + } + ret = bdrv_truncate(file, 0); + if (ret < 0) { + goto exit; + } + + bat_entries = DIV_ROUND_UP(total_size, cl_size); + bat_sectors = DIV_ROUND_UP(bat_entry_off(bat_entries), cl_size); + bat_sectors = (bat_sectors * cl_size) >> BDRV_SECTOR_BITS; + + memset(&header, 0, sizeof(header)); + memcpy(header.magic, HEADER_MAGIC2, sizeof(header.magic)); + header.version = cpu_to_le32(HEADER_VERSION); + /* don't care much about geometry, it is not used on image level */ + header.heads = cpu_to_le32(16); + header.cylinders = cpu_to_le32(total_size / BDRV_SECTOR_SIZE / 16 / 32); + header.tracks = cpu_to_le32(cl_size >> BDRV_SECTOR_BITS); + header.bat_entries = cpu_to_le32(bat_entries); + header.nb_sectors = cpu_to_le64(DIV_ROUND_UP(total_size, BDRV_SECTOR_SIZE)); + header.data_off = cpu_to_le32(bat_sectors); + + /* write all the data */ + memset(tmp, 0, sizeof(tmp)); + memcpy(tmp, &header, sizeof(header)); + + ret = bdrv_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + ret = bdrv_write_zeroes(file, 1, bat_sectors 
- 1, 0); + if (ret < 0) { + goto exit; + } + ret = 0; + +done: + bdrv_unref(file); + return ret; + +exit: + error_setg_errno(errp, -ret, "Failed to create Parallels image"); + goto done; +} + + +static int parallels_probe(const uint8_t *buf, int buf_size, + const char *filename) +{ + const ParallelsHeader *ph = (const void *)buf; + + if (buf_size < sizeof(ParallelsHeader)) { + return 0; + } if ((!memcmp(ph->magic, HEADER_MAGIC, 16) || - !memcmp(ph->magic, HEADER_MAGIC2, 16)) && - (le32_to_cpu(ph->version) == HEADER_VERSION)) + !memcmp(ph->magic, HEADER_MAGIC2, 16)) && + (le32_to_cpu(ph->version) == HEADER_VERSION)) { return 100; + } return 0; } +static int parallels_update_header(BlockDriverState *bs) +{ + BDRVParallelsState *s = bs->opaque; + unsigned size = MAX(bdrv_opt_mem_align(bs->file), sizeof(ParallelsHeader)); + + if (size > s->header_size) { + size = s->header_size; + } + return bdrv_pwrite_sync(bs->file, 0, s->header, size); +} + static int parallels_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { BDRVParallelsState *s = bs->opaque; - int i; - struct parallels_header ph; - int ret; - - bs->read_only = 1; // no write support yet + ParallelsHeader ph; + int ret, size, i; + QemuOpts *opts = NULL; + Error *local_err = NULL; + char *buf; ret = bdrv_pread(bs->file, 0, &ph, sizeof(ph)); if (ret < 0) { @@ -115,25 +595,90 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - s->catalog_size = le32_to_cpu(ph.catalog_entries); - if (s->catalog_size > INT_MAX / 4) { + s->bat_size = le32_to_cpu(ph.bat_entries); + if (s->bat_size > INT_MAX / sizeof(uint32_t)) { error_setg(errp, "Catalog too large"); ret = -EFBIG; goto fail; } - s->catalog_bitmap = g_try_new(uint32_t, s->catalog_size); - if (s->catalog_size && s->catalog_bitmap == NULL) { + + size = bat_entry_off(s->bat_size); + s->header_size = ROUND_UP(size, bdrv_opt_mem_align(bs->file)); + s->header = qemu_try_blockalign(bs->file, s->header_size); + if (s->header == NULL) { ret = -ENOMEM; goto fail; } + s->data_end = le32_to_cpu(ph.data_off); + if (s->data_end == 0) { + s->data_end = ROUND_UP(bat_entry_off(s->bat_size), BDRV_SECTOR_SIZE); + } + if (s->data_end < s->header_size) { + /* there is not enough unused space to fit to block align between BAT + and actual data. We can't avoid read-modify-write... */ + s->header_size = size; + } - ret = bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4); + ret = bdrv_pread(bs->file, 0, s->header, s->header_size); if (ret < 0) { goto fail; } + s->bat_bitmap = (uint32_t *)(s->header + 1); + + for (i = 0; i < s->bat_size; i++) { + int64_t off = bat2sect(s, i); + if (off >= s->data_end) { + s->data_end = off + s->tracks; + } + } - for (i = 0; i < s->catalog_size; i++) - le32_to_cpus(&s->catalog_bitmap[i]); + if (le32_to_cpu(ph.inuse) == HEADER_INUSE_MAGIC) { + /* Image was not closed correctly. 
The check is mandatory */ + s->header_unclean = true; + if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) { + error_setg(errp, "parallels: Image was not closed correctly; " + "cannot be opened read/write"); + ret = -EACCES; + goto fail; + } + } + + opts = qemu_opts_create(&parallels_runtime_opts, NULL, 0, &local_err); + if (local_err != NULL) { + goto fail_options; + } + + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err != NULL) { + goto fail_options; + } + + s->prealloc_size = + qemu_opt_get_size_del(opts, PARALLELS_OPT_PREALLOC_SIZE, 0); + s->prealloc_size = MAX(s->tracks, s->prealloc_size >> BDRV_SECTOR_BITS); + buf = qemu_opt_get_del(opts, PARALLELS_OPT_PREALLOC_MODE); + s->prealloc_mode = qapi_enum_parse(prealloc_mode_lookup, buf, + PRL_PREALLOC_MODE_MAX, PRL_PREALLOC_MODE_FALLOCATE, &local_err); + g_free(buf); + if (local_err != NULL) { + goto fail_options; + } + if (!bdrv_has_zero_init(bs->file) || + bdrv_truncate(bs->file, bdrv_getlength(bs->file)) != 0) { + s->prealloc_mode = PRL_PREALLOC_MODE_FALLOCATE; + } + + if (flags & BDRV_O_RDWR) { + s->header->inuse = cpu_to_le32(HEADER_INUSE_MAGIC); + ret = parallels_update_header(bs); + if (ret < 0) { + goto fail; + } + } + + s->bat_dirty_block = 4 * getpagesize(); + s->bat_dirty_bmap = + bitmap_new(DIV_ROUND_UP(s->header_size, s->bat_dirty_block)); qemu_co_mutex_init(&s->lock); return 0; @@ -142,67 +687,67 @@ fail_format: error_setg(errp, "Image not in Parallels format"); ret = -EINVAL; fail: - g_free(s->catalog_bitmap); + qemu_vfree(s->header); return ret; + +fail_options: + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; } -static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) + +static void parallels_close(BlockDriverState *bs) { BDRVParallelsState *s = bs->opaque; - uint32_t index, offset; - - index = sector_num / s->tracks; - offset = sector_num % s->tracks; - /* not allocated */ - if ((index >= s->catalog_size) || (s->catalog_bitmap[index] == 0)) - return -1; - return - ((uint64_t)s->catalog_bitmap[index] * s->off_multiplier + offset) * 512; -} + if (bs->open_flags & BDRV_O_RDWR) { + s->header->inuse = 0; + parallels_update_header(bs); + } -static int parallels_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - while (nb_sectors > 0) { - int64_t position = seek_to_sector(bs, sector_num); - if (position >= 0) { - if (bdrv_pread(bs->file, position, buf, 512) != 512) - return -1; - } else { - memset(buf, 0, 512); - } - nb_sectors--; - sector_num++; - buf += 512; + if (bs->open_flags & BDRV_O_RDWR) { + bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS); } - return 0; -} -static coroutine_fn int parallels_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVParallelsState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = parallels_read(bs, sector_num, buf, nb_sectors); - qemu_co_mutex_unlock(&s->lock); - return ret; + g_free(s->bat_dirty_bmap); + qemu_vfree(s->header); } -static void parallels_close(BlockDriverState *bs) -{ - BDRVParallelsState *s = bs->opaque; - g_free(s->catalog_bitmap); -} +static QemuOptsList parallels_create_opts = { + .name = "parallels-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(parallels_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size", + }, + { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Parallels image cluster size", + .def_value_str = stringify(DEFAULT_CLUSTER_SIZE),
+ }, + { /* end of list */ } + } +}; static BlockDriver bdrv_parallels = { .format_name = "parallels", .instance_size = sizeof(BDRVParallelsState), .bdrv_probe = parallels_probe, .bdrv_open = parallels_open, - .bdrv_read = parallels_co_read, .bdrv_close = parallels_close, + .bdrv_co_get_block_status = parallels_co_get_block_status, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .bdrv_co_flush_to_os = parallels_co_flush_to_os, + .bdrv_co_readv = parallels_co_readv, + .bdrv_co_writev = parallels_co_writev, + + .bdrv_create = parallels_create, + .bdrv_check = parallels_check, + .create_opts = &parallels_create_opts, }; static void bdrv_parallels_init(void) diff --git a/block/qapi.c b/block/qapi.c index 8a19aed44..2ce509711 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -24,6 +24,7 @@ #include "block/qapi.h" #include "block/block_int.h" +#include "block/throttle-groups.h" #include "block/write-threshold.h" #include "qmp-commands.h" #include "qapi-visit.h" @@ -31,8 +32,10 @@ #include "qapi/qmp/types.h" #include "sysemu/block-backend.h" -BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) +BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp) { + ImageInfo **p_image_info; + BlockDriverState *bs0; BlockDeviceInfo *info = g_malloc0(sizeof(*info)); info->file = g_strdup(bs->filename); @@ -63,7 +66,9 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) if (bs->io_limits_enabled) { ThrottleConfig cfg; - throttle_get_config(&bs->throttle_state, &cfg); + + throttle_group_get_config(bs, &cfg); + info->bps = cfg.buckets[THROTTLE_BPS_TOTAL].avg; info->bps_rd = cfg.buckets[THROTTLE_BPS_READ].avg; info->bps_wr = cfg.buckets[THROTTLE_BPS_WRITE].avg; @@ -88,10 +93,32 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) info->has_iops_size = cfg.op_size; info->iops_size = cfg.op_size; + + info->has_group = true; + info->group = g_strdup(throttle_group_get_name(bs)); } info->write_threshold = bdrv_write_threshold_get(bs); + bs0 = bs; + p_image_info = &info->image; + while (1) { + Error *local_err = NULL; + bdrv_query_image_info(bs0, p_image_info, &local_err); + if (local_err) { + error_propagate(errp, local_err); + qapi_free_BlockDeviceInfo(info); + return NULL; + } + if (bs0->drv && bs0->backing_hd) { + bs0 = bs0->backing_hd; + (*p_image_info)->has_backing_image = true; + p_image_info = &((*p_image_info)->backing_image); + } else { + break; + } + } + return info; } @@ -264,9 +291,6 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, { BlockInfo *info = g_malloc0(sizeof(*info)); BlockDriverState *bs = blk_bs(blk); - BlockDriverState *bs0; - ImageInfo **p_image_info; - Error *local_err = NULL; info->device = g_strdup(blk_name(blk)); info->type = g_strdup("unknown"); info->locked = blk_dev_is_medium_locked(blk); @@ -289,23 +313,9 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, if (bs->drv) { info->has_inserted = true; - info->inserted = bdrv_block_device_info(bs); - - bs0 = bs; - p_image_info = &info->inserted->image; - while (1) { - bdrv_query_image_info(bs0, p_image_info, &local_err); - if (local_err) { - error_propagate(errp, local_err); - goto err; - } - if (bs0->drv && bs0->backing_hd) { - bs0 = bs0->backing_hd; - (*p_image_info)->has_backing_image = true; - p_image_info = &((*p_image_info)->backing_image); - } else { - break; - } + info->inserted = bdrv_block_device_info(bs, errp); + if (info->inserted == NULL) { + goto err; + } } @@ -510,18 +520,9 @@ static void dump_qobject(fprintf_function func_fprintf, void *f, }
case QTYPE_QBOOL: { QBool *value = qobject_to_qbool(obj); - func_fprintf(f, "%s", qbool_get_int(value) ? "true" : "false"); - break; - } - case QTYPE_QERROR: { - QString *value = qerror_human((QError *)obj); - func_fprintf(f, "%s", qstring_get_str(value)); - QDECREF(value); + func_fprintf(f, "%s", qbool_get_bool(value) ? "true" : "false"); break; } - case QTYPE_NONE: - break; - case QTYPE_MAX: default: abort(); } diff --git a/block/qcow.c b/block/qcow.c index 055896910..01fba54ce 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -25,7 +25,8 @@ #include "block/block_int.h" #include "qemu/module.h" #include -#include "qemu/aes.h" +#include "qapi/qmp/qerror.h" +#include "crypto/cipher.h" #include "migration/migration.h" /**************************************************************/ @@ -71,10 +72,8 @@ typedef struct BDRVQcowState { uint8_t *cluster_cache; uint8_t *cluster_data; uint64_t cluster_cache_offset; - uint32_t crypt_method; /* current crypt method, 0 if no key yet */ + QCryptoCipher *cipher; /* NULL if no key yet */ uint32_t crypt_method_header; - AES_KEY aes_encrypt_key; - AES_KEY aes_decrypt_key; CoMutex lock; Error *migration_blocker; } BDRVQcowState; @@ -123,8 +122,8 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, char version[64]; snprintf(version, sizeof(version), "QCOW version %" PRIu32, header.version); - error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_name(bs), "qcow", version); + error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bdrv_get_device_or_node_name(bs), "qcow", version); ret = -ENOTSUP; goto fail; } @@ -153,6 +152,11 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, ret = -EINVAL; goto fail; } + if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128)) { + error_setg(errp, "AES cipher not available"); + ret = -EINVAL; + goto fail; + } s->crypt_method_header = header.crypt_method; if (s->crypt_method_header) { bs->encrypted = 1; @@ -229,9 +233,9 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, } /* Disable migration when qcow images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "qcow", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The qcow format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); qemu_co_mutex_init(&s->lock); @@ -259,6 +263,7 @@ static int qcow_set_key(BlockDriverState *bs, const char *key) BDRVQcowState *s = bs->opaque; uint8_t keybuf[16]; int len, i; + Error *err; memset(keybuf, 0, 16); len = strlen(key); @@ -269,38 +274,68 @@ static int qcow_set_key(BlockDriverState *bs, const char *key) for(i = 0;i < len;i++) { keybuf[i] = key[i]; } - s->crypt_method = s->crypt_method_header; - - if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) - return -1; - if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) + assert(bs->encrypted); + + qcrypto_cipher_free(s->cipher); + s->cipher = qcrypto_cipher_new( + QCRYPTO_CIPHER_ALG_AES_128, + QCRYPTO_CIPHER_MODE_CBC, + keybuf, G_N_ELEMENTS(keybuf), + &err); + + if (!s->cipher) { + /* XXX would be nice if errors in this method could + * be properly propagate to the caller. Would need + * the bdrv_set_key() API signature to be fixed. */ + error_free(err); return -1; + } return 0; } /* The crypt function is compatible with the linux cryptoloop algorithm for < 4 GB images. 
NOTE: out_buf == in_buf is supported */ -static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num, - uint8_t *out_buf, const uint8_t *in_buf, - int nb_sectors, int enc, - const AES_KEY *key) +static int encrypt_sectors(BDRVQcowState *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, bool enc, Error **errp) { union { uint64_t ll[2]; uint8_t b[16]; } ivec; int i; + int ret; for(i = 0; i < nb_sectors; i++) { ivec.ll[0] = cpu_to_le64(sector_num); ivec.ll[1] = 0; - AES_cbc_encrypt(in_buf, out_buf, 512, key, - ivec.b, enc); + if (qcrypto_cipher_setiv(s->cipher, + ivec.b, G_N_ELEMENTS(ivec.b), + errp) < 0) { + return -1; + } + if (enc) { + ret = qcrypto_cipher_encrypt(s->cipher, + in_buf, + out_buf, + 512, + errp); + } else { + ret = qcrypto_cipher_decrypt(s->cipher, + in_buf, + out_buf, + 512, + errp); + } + if (ret < 0) { + return -1; + } sector_num++; in_buf += 512; out_buf += 512; } + return 0; } /* 'allocate' is: @@ -411,17 +446,23 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, bdrv_truncate(bs->file, cluster_offset + s->cluster_size); /* if encrypted, we must initialize the cluster content which won't be written */ - if (s->crypt_method && + if (bs->encrypted && (n_end - n_start) < s->cluster_sectors) { uint64_t start_sect; + assert(s->cipher); start_sect = (offset & ~(s->cluster_size - 1)) >> 9; memset(s->cluster_data + 512, 0x00, 512); for(i = 0; i < s->cluster_sectors; i++) { if (i < n_start || i >= n_end) { - encrypt_sectors(s, start_sect + i, - s->cluster_data, - s->cluster_data + 512, 1, 1, - &s->aes_encrypt_key); + Error *err = NULL; + if (encrypt_sectors(s, start_sect + i, + s->cluster_data, + s->cluster_data + 512, 1, + true, &err) < 0) { + error_free(err); + errno = EIO; + return -1; + } if (bdrv_pwrite(bs->file, cluster_offset + i * 512, s->cluster_data, 512) != 512) return -1; @@ -461,7 +502,7 @@ static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs, if (!cluster_offset) { return 0; } - if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->crypt_method) { + if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->cipher) { return BDRV_BLOCK_DATA; } cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); @@ -528,6 +569,7 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, QEMUIOVector hd_qiov; uint8_t *buf; void *orig_buf; + Error *err = NULL; if (qiov->niov > 1) { buf = orig_buf = qemu_try_blockalign(bs, qiov->size); @@ -590,10 +632,12 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, if (ret < 0) { break; } - if (s->crypt_method) { - encrypt_sectors(s, sector_num, buf, buf, - n, 0, - &s->aes_decrypt_key); + if (bs->encrypted) { + assert(s->cipher); + if (encrypt_sectors(s, sector_num, buf, buf, + n, false, &err) < 0) { + goto fail; + } } } ret = 0; @@ -614,6 +658,7 @@ done: return ret; fail: + error_free(err); ret = -EIO; goto done; } @@ -661,12 +706,18 @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, ret = -EIO; break; } - if (s->crypt_method) { + if (bs->encrypted) { + Error *err = NULL; + assert(s->cipher); if (!cluster_data) { cluster_data = g_malloc0(s->cluster_size); } - encrypt_sectors(s, sector_num, cluster_data, buf, - n, 1, &s->aes_encrypt_key); + if (encrypt_sectors(s, sector_num, cluster_data, buf, + n, true, &err) < 0) { + error_free(err); + ret = -EIO; + break; + } src_buf = cluster_data; } else { src_buf = buf; @@ -703,6 +754,8 @@ static void qcow_close(BlockDriverState *bs) { BDRVQcowState *s = 
bs->opaque; + qcrypto_cipher_free(s->cipher); + s->cipher = NULL; g_free(s->l1_table); qemu_vfree(s->l2_cache); g_free(s->cluster_cache); diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c index b1155492a..53b8afc3d 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -28,62 +28,68 @@ #include "trace.h" typedef struct Qcow2CachedTable { - void* table; - int64_t offset; - bool dirty; - int cache_hits; - int ref; + int64_t offset; + bool dirty; + uint64_t lru_counter; + int ref; } Qcow2CachedTable; struct Qcow2Cache { - Qcow2CachedTable* entries; - struct Qcow2Cache* depends; + Qcow2CachedTable *entries; + struct Qcow2Cache *depends; int size; bool depends_on_flush; + void *table_array; + uint64_t lru_counter; }; +static inline void *qcow2_cache_get_table_addr(BlockDriverState *bs, + Qcow2Cache *c, int table) +{ + BDRVQcowState *s = bs->opaque; + return (uint8_t *) c->table_array + (size_t) table * s->cluster_size; +} + +static inline int qcow2_cache_get_table_idx(BlockDriverState *bs, + Qcow2Cache *c, void *table) +{ + BDRVQcowState *s = bs->opaque; + ptrdiff_t table_offset = (uint8_t *) table - (uint8_t *) c->table_array; + int idx = table_offset / s->cluster_size; + assert(idx >= 0 && idx < c->size && table_offset % s->cluster_size == 0); + return idx; +} + Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables) { BDRVQcowState *s = bs->opaque; Qcow2Cache *c; - int i; c = g_new0(Qcow2Cache, 1); c->size = num_tables; c->entries = g_try_new0(Qcow2CachedTable, num_tables); - if (!c->entries) { - goto fail; - } - - for (i = 0; i < c->size; i++) { - c->entries[i].table = qemu_try_blockalign(bs->file, s->cluster_size); - if (c->entries[i].table == NULL) { - goto fail; - } + c->table_array = qemu_try_blockalign(bs->file, + (size_t) num_tables * s->cluster_size); + + if (!c->entries || !c->table_array) { + qemu_vfree(c->table_array); + g_free(c->entries); + g_free(c); + c = NULL; } return c; - -fail: - if (c->entries) { - for (i = 0; i < c->size; i++) { - qemu_vfree(c->entries[i].table); - } - } - g_free(c->entries); - g_free(c); - return NULL; } -int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c) +int qcow2_cache_destroy(BlockDriverState *bs, Qcow2Cache *c) { int i; for (i = 0; i < c->size; i++) { assert(c->entries[i].ref == 0); - qemu_vfree(c->entries[i].table); } + qemu_vfree(c->table_array); g_free(c->entries); g_free(c); @@ -151,8 +157,8 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); } - ret = bdrv_pwrite(bs->file, c->entries[i].offset, c->entries[i].table, - s->cluster_size); + ret = bdrv_pwrite(bs->file, c->entries[i].offset, + qcow2_cache_get_table_addr(bs, c, i), s->cluster_size); if (ret < 0) { return ret; } @@ -228,42 +234,12 @@ int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c) for (i = 0; i < c->size; i++) { assert(c->entries[i].ref == 0); c->entries[i].offset = 0; - c->entries[i].cache_hits = 0; + c->entries[i].lru_counter = 0; } - return 0; -} - -static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c) -{ - int i; - int min_count = INT_MAX; - int min_index = -1; - - - for (i = 0; i < c->size; i++) { - if (c->entries[i].ref) { - continue; - } - - if (c->entries[i].cache_hits < min_count) { - min_index = i; - min_count = c->entries[i].cache_hits; - } - - /* Give newer hits priority */ - /* TODO Check how to optimize the replacement strategy */ - if (c->entries[i].cache_hits > 1) { - c->entries[i].cache_hits /= 2; - } - } + c->lru_counter = 0; - if (min_index == 
-1) { - /* This can't happen in current synchronous code, but leave the check - * here as a reminder for whoever starts using AIO with the cache */ - abort(); - } - return min_index; + return 0; } static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, @@ -272,24 +248,39 @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, BDRVQcowState *s = bs->opaque; int i; int ret; + int lookup_index; + uint64_t min_lru_counter = UINT64_MAX; + int min_lru_index = -1; trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache, offset, read_from_disk); /* Check if the table is already cached */ - for (i = 0; i < c->size; i++) { - if (c->entries[i].offset == offset) { + i = lookup_index = (offset / s->cluster_size * 4) % c->size; + do { + const Qcow2CachedTable *t = &c->entries[i]; + if (t->offset == offset) { goto found; } + if (t->ref == 0 && t->lru_counter < min_lru_counter) { + min_lru_counter = t->lru_counter; + min_lru_index = i; + } + if (++i == c->size) { + i = 0; + } + } while (i != lookup_index); + + if (min_lru_index == -1) { + /* This can't happen in current synchronous code, but leave the check + * here as a reminder for whoever starts using AIO with the cache */ + abort(); } - /* If not, write a table back and replace it */ - i = qcow2_cache_find_entry_to_replace(c); + /* Cache miss: write a table back and replace it */ + i = min_lru_index; trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(), c == s->l2_table_cache, i); - if (i < 0) { - return i; - } ret = qcow2_cache_entry_flush(bs, c, i); if (ret < 0) { @@ -304,22 +295,19 @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); } - ret = bdrv_pread(bs->file, offset, c->entries[i].table, s->cluster_size); + ret = bdrv_pread(bs->file, offset, qcow2_cache_get_table_addr(bs, c, i), + s->cluster_size); if (ret < 0) { return ret; } } - /* Give the table some hits for the start so that it won't be replaced - * immediately. The number 32 is completely arbitrary. 
*/ - c->entries[i].cache_hits = 32; c->entries[i].offset = offset; /* And return the right table */ found: - c->entries[i].cache_hits++; c->entries[i].ref++; - *table = c->entries[i].table; + *table = qcow2_cache_get_table_addr(bs, c, i); trace_qcow2_cache_get_done(qemu_coroutine_self(), c == s->l2_table_cache, i); @@ -339,36 +327,24 @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, return qcow2_cache_do_get(bs, c, offset, table, false); } -int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table) +void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table) { - int i; + int i = qcow2_cache_get_table_idx(bs, c, *table); - for (i = 0; i < c->size; i++) { - if (c->entries[i].table == *table) { - goto found; - } - } - return -ENOENT; - -found: c->entries[i].ref--; *table = NULL; + if (c->entries[i].ref == 0) { + c->entries[i].lru_counter = ++c->lru_counter; + } + assert(c->entries[i].ref >= 0); - return 0; } -void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table) +void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c, + void *table) { - int i; - - for (i = 0; i < c->size; i++) { - if (c->entries[i].table == table) { - goto found; - } - } - abort(); - -found: + int i = qcow2_cache_get_table_idx(bs, c, table); + assert(c->entries[i].offset != 0); c->entries[i].dirty = true; } diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index ed2b44d29..7e94fe70e 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -253,17 +253,14 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) memcpy(l2_table, old_table, s->cluster_size); - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &old_table); - if (ret < 0) { - goto fail; - } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &old_table); } /* write the l2 table to the file */ BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE); trace_qcow2_l2_allocate_write_l2(bs, l1_index); - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); ret = qcow2_cache_flush(bs, s->l2_table_cache); if (ret < 0) { goto fail; @@ -301,7 +298,7 @@ fail: * as contiguous. (This allows it, for example, to stop at the first compressed * cluster which may require a different handling) */ -static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size, +static int count_contiguous_clusters(int nb_clusters, int cluster_size, uint64_t *l2_table, uint64_t stop_flags) { int i; @@ -324,7 +321,7 @@ static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size, return i; } -static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table) +static int count_contiguous_free_clusters(int nb_clusters, uint64_t *l2_table) { int i; @@ -342,26 +339,47 @@ static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_tab /* The crypt function is compatible with the linux cryptoloop algorithm for < 4 GB images. 
NOTE: out_buf == in_buf is supported */ -void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, - uint8_t *out_buf, const uint8_t *in_buf, - int nb_sectors, int enc, - const AES_KEY *key) +int qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, bool enc, + Error **errp) { union { uint64_t ll[2]; uint8_t b[16]; } ivec; int i; + int ret; for(i = 0; i < nb_sectors; i++) { ivec.ll[0] = cpu_to_le64(sector_num); ivec.ll[1] = 0; - AES_cbc_encrypt(in_buf, out_buf, 512, key, - ivec.b, enc); + if (qcrypto_cipher_setiv(s->cipher, + ivec.b, G_N_ELEMENTS(ivec.b), + errp) < 0) { + return -1; + } + if (enc) { + ret = qcrypto_cipher_encrypt(s->cipher, + in_buf, + out_buf, + 512, + errp); + } else { + ret = qcrypto_cipher_decrypt(s->cipher, + in_buf, + out_buf, + 512, + errp); + } + if (ret < 0) { + return -1; + } sector_num++; in_buf += 512; out_buf += 512; } + return 0; } static int coroutine_fn copy_sectors(BlockDriverState *bs, @@ -403,10 +421,16 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, goto out; } - if (s->crypt_method) { - qcow2_encrypt_sectors(s, start_sect + n_start, - iov.iov_base, iov.iov_base, n, 1, - &s->aes_encrypt_key); + if (bs->encrypted) { + Error *err = NULL; + assert(s->cipher); + if (qcow2_encrypt_sectors(s, start_sect + n_start, + iov.iov_base, iov.iov_base, n, + true, &err) < 0) { + ret = -EIO; + error_free(err); + goto out; + } } ret = qcow2_pre_write_overlap_check(bs, 0, @@ -471,6 +495,7 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, if (nb_needed > nb_available) { nb_needed = nb_available; } + assert(nb_needed <= INT_MAX); *cluster_offset = 0; @@ -506,6 +531,8 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); *cluster_offset = be64_to_cpu(l2_table[l2_index]); + + /* nb_needed <= INT_MAX, thus nb_clusters <= INT_MAX, too */ nb_clusters = size_to_clusters(s, nb_needed << 9); ret = qcow2_get_cluster_type(*cluster_offset); @@ -692,12 +719,9 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, /* compressed clusters never have the copied flag */ BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED); - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); l2_table[l2_index] = cpu_to_be64(cluster_offset); - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - return 0; - } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); return cluster_offset; } @@ -771,7 +795,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) if (ret < 0) { goto err; } - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); assert(l2_index + m->nb_clusters <= s->l2_size); for (i = 0; i < m->nb_clusters; i++) { @@ -789,10 +813,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) } - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - goto err; - } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); /* * If this was a COW, we need to decrease the refcount of the old cluster. 
@@ -942,9 +963,9 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, int l2_index; uint64_t cluster_offset; uint64_t *l2_table; - unsigned int nb_clusters; + uint64_t nb_clusters; unsigned int keep_clusters; - int ret, pret; + int ret; trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset, *bytes); @@ -961,6 +982,7 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, l2_index = offset_to_l2_index(s, guest_offset); nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + assert(nb_clusters <= INT_MAX); /* Find L2 entry for the first involved cluster */ ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); @@ -1011,10 +1033,7 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, /* Cleanup */ out: - pret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (pret < 0) { - return pret; - } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); /* Only return a host offset if we actually made progress. Otherwise we * would make requirements for handle_alloc() that it can't fulfill */ @@ -1046,7 +1065,7 @@ out: * restarted, but the whole request should not be failed. */ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, - uint64_t *host_offset, unsigned int *nb_clusters) + uint64_t *host_offset, uint64_t *nb_clusters) { BDRVQcowState *s = bs->opaque; @@ -1064,7 +1083,7 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, *host_offset = cluster_offset; return 0; } else { - int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters); + int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters); if (ret < 0) { return ret; } @@ -1100,7 +1119,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, int l2_index; uint64_t *l2_table; uint64_t entry; - unsigned int nb_clusters; + uint64_t nb_clusters; int ret; uint64_t alloc_cluster_offset; @@ -1118,6 +1137,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, l2_index = offset_to_l2_index(s, guest_offset); nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + assert(nb_clusters <= INT_MAX); /* Find L2 entry for the first involved cluster */ ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); @@ -1139,10 +1159,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, * wrong with our code. */ assert(nb_clusters > 0); - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - return ret; - } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); /* Allocate, if necessary at a given offset in the image file */ alloc_cluster_offset = start_of_cluster(s, *host_offset); @@ -1414,7 +1431,8 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) * clusters. 
*/ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, - unsigned int nb_clusters, enum qcow2_discard_type type, bool full_discard) + uint64_t nb_clusters, enum qcow2_discard_type type, + bool full_discard) { BDRVQcowState *s = bs->opaque; uint64_t *l2_table; @@ -1429,6 +1447,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, /* Limit nb_clusters to one L2 table */ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + assert(nb_clusters <= INT_MAX); for (i = 0; i < nb_clusters; i++) { uint64_t old_l2_entry; @@ -1470,7 +1489,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, } /* First remove L2 entries */ - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); if (!full_discard && s->qcow_version >= 3) { l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); } else { @@ -1481,10 +1500,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, qcow2_free_any_clusters(bs, old_l2_entry, 1, type); } - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - return ret; - } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); return nb_clusters; } @@ -1494,7 +1510,7 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, { BDRVQcowState *s = bs->opaque; uint64_t end_offset; - unsigned int nb_clusters; + uint64_t nb_clusters; int ret; end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS); @@ -1536,7 +1552,7 @@ fail: * clusters. */ static int zero_single_l2(BlockDriverState *bs, uint64_t offset, - unsigned int nb_clusters) + uint64_t nb_clusters) { BDRVQcowState *s = bs->opaque; uint64_t *l2_table; @@ -1551,6 +1567,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset, /* Limit nb_clusters to one L2 table */ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + assert(nb_clusters <= INT_MAX); for (i = 0; i < nb_clusters; i++) { uint64_t old_offset; @@ -1558,7 +1575,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset, old_offset = be64_to_cpu(l2_table[l2_index + i]); /* Update L2 entries */ - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); if (old_offset & QCOW_OFLAG_COMPRESSED) { l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); @@ -1567,10 +1584,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset, } } - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - return ret; - } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); return nb_clusters; } @@ -1578,7 +1592,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset, int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors) { BDRVQcowState *s = bs->opaque; - unsigned int nb_clusters; + uint64_t nb_clusters; int ret; /* The zero flag is only supported by version 3 and newer */ @@ -1760,14 +1774,10 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, if (is_active_l1) { if (l2_dirty) { - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); qcow2_cache_depends_on_flush(s->l2_table_cache); } - ret = qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table); - if (ret < 0) { - l2_table = NULL; - goto fail; - } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); } else { if 
(l2_dirty) { ret = qcow2_pre_write_overlap_check(bs, @@ -1798,12 +1808,7 @@ fail: if (!is_active_l1) { qemu_vfree(l2_table); } else { - if (ret < 0) { - qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table); - } else { - ret = qcow2_cache_put(bs, s->l2_table_cache, - (void **)&l2_table); - } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); } } return ret; diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 63c00858d..0b6c302ee 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -265,10 +265,7 @@ int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index, block_index = cluster_index & (s->refcount_block_size - 1); *refcount = s->get_refcount(refcount_block, block_index); - ret = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); - if (ret < 0) { - return ret; - } + qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); return 0; } @@ -424,7 +421,7 @@ static int alloc_refcount_block(BlockDriverState *bs, /* Now the new refcount block needs to be written to disk */ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE); - qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block); + qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block); ret = qcow2_cache_flush(bs, s->refcount_block_cache); if (ret < 0) { goto fail_block; @@ -448,10 +445,7 @@ static int alloc_refcount_block(BlockDriverState *bs, return -EAGAIN; } - ret = qcow2_cache_put(bs, s->refcount_block_cache, refcount_block); - if (ret < 0) { - goto fail_block; - } + qcow2_cache_put(bs, s->refcount_block_cache, refcount_block); /* * If we come here, we need to grow the refcount table. Again, a new @@ -723,13 +717,8 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, /* Load the refcount block and allocate it if needed */ if (table_index != old_table_index) { if (refcount_block) { - ret = qcow2_cache_put(bs, s->refcount_block_cache, - &refcount_block); - if (ret < 0) { - goto fail; - } + qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); } - ret = alloc_refcount_block(bs, cluster_index, &refcount_block); if (ret < 0) { goto fail; @@ -737,7 +726,8 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, } old_table_index = table_index; - qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block); + qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, + refcount_block); /* we can update the count and save it */ block_index = cluster_index & (s->refcount_block_size - 1); @@ -773,11 +763,7 @@ fail: /* Write last changed block to disk */ if (refcount_block) { - int wret; - wret = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); - if (wret < 0) { - return ret < 0 ? 
ret : wret; - } + qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); } /* @@ -889,8 +875,8 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size) return offset; } -int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, - int nb_clusters) +int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, + int64_t nb_clusters) { BDRVQcowState *s = bs->opaque; uint64_t cluster_index, refcount; @@ -954,19 +940,21 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) } free_in_cluster = s->cluster_size - offset_into_cluster(s, offset); - if (!offset || free_in_cluster < size) { - int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size); - if (new_cluster < 0) { - return new_cluster; - } + do { + if (!offset || free_in_cluster < size) { + int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size); + if (new_cluster < 0) { + return new_cluster; + } - if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) { - offset = new_cluster; + if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) { + offset = new_cluster; + } } - } - assert(offset); - ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER); + assert(offset); + ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER); + } while (ret == -EAGAIN); if (ret < 0) { return ret; } @@ -1182,15 +1170,12 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, s->refcount_block_cache); } l2_table[j] = cpu_to_be64(offset); - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, + l2_table); } } - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - goto fail; - } - + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); if (addend != 0) { ret = qcow2_update_cluster_refcount(bs, l2_offset >> @@ -1274,7 +1259,7 @@ static size_t refcount_array_byte_size(BDRVQcowState *s, uint64_t entries) static int realloc_refcount_array(BDRVQcowState *s, void **array, int64_t *size, int64_t new_size) { - size_t old_byte_size, new_byte_size; + int64_t old_byte_size, new_byte_size; void *new_ptr; /* Round to clusters so the array can be directly written to disk */ @@ -1290,13 +1275,17 @@ static int realloc_refcount_array(BDRVQcowState *s, void **array, assert(new_byte_size > 0); + if (new_byte_size > SIZE_MAX) { + return -ENOMEM; + } + new_ptr = g_try_realloc(*array, new_byte_size); if (!new_ptr) { return -ENOMEM; } if (new_byte_size > old_byte_size) { - memset((void *)((uintptr_t)new_ptr + old_byte_size), 0, + memset((char *)new_ptr + old_byte_size, 0, new_byte_size - old_byte_size); } @@ -2455,7 +2444,7 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, if (ret < 0) { return ret; } else if (ret > 0) { - int metadata_ol_bitnr = ffs(ret) - 1; + int metadata_ol_bitnr = ctz32(ret); assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR); qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid " diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 2aa9dcb1d..b6f58c13e 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -25,6 +25,7 @@ #include "qemu-common.h" #include "block/block_int.h" #include "block/qcow2.h" +#include "qemu/error-report.h" void qcow2_free_snapshots(BlockDriverState *bs) { @@ -351,10 +352,8 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) memset(sn, 0, sizeof(*sn)); - /* Generate an ID if it wasn't passed */ - if (sn_info->id_str[0] == '\0') { - 
find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); - } + /* Generate an ID */ + find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); /* Check that the ID is unique */ if (find_snapshot_by_id_and_name(bs, sn_info->id_str, NULL) >= 0) { diff --git a/block/qcow2.c b/block/qcow2.c index 316a8db22..76c331b38 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -25,7 +25,6 @@ #include "block/block_int.h" #include "qemu/module.h" #include -#include "qemu/aes.h" #include "block/qcow2.h" #include "qemu/error-report.h" #include "qapi/qmp/qerror.h" @@ -207,8 +206,8 @@ static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs, vsnprintf(msg, sizeof(msg), fmt, ap); va_end(ap); - error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_name(bs), "qcow2", msg); + error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bdrv_get_device_or_node_name(bs), "qcow2", msg); } static void report_unsupported_feature(BlockDriverState *bs, @@ -483,9 +482,11 @@ static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = { [QCOW2_OL_INACTIVE_L2_BITNR] = QCOW2_OPT_OVERLAP_INACTIVE_L2, }; -static void read_cache_sizes(QemuOpts *opts, uint64_t *l2_cache_size, +static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts, + uint64_t *l2_cache_size, uint64_t *refcount_cache_size, Error **errp) { + BDRVQcowState *s = bs->opaque; uint64_t combined_cache_size; bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set; @@ -525,7 +526,9 @@ static void read_cache_sizes(QemuOpts *opts, uint64_t *l2_cache_size, } } else { if (!l2_cache_size_set && !refcount_cache_size_set) { - *l2_cache_size = DEFAULT_L2_CACHE_BYTE_SIZE; + *l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE, + (uint64_t)DEFAULT_L2_CACHE_CLUSTERS + * s->cluster_size); *refcount_cache_size = *l2_cache_size / DEFAULT_L2_REFCOUNT_SIZE_RATIO; } else if (!l2_cache_size_set) { @@ -695,6 +698,11 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, ret = -EINVAL; goto fail; } + if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128)) { + error_setg(errp, "AES cipher not available"); + ret = -EINVAL; + goto fail; + } s->crypt_method_header = header.crypt_method; if (s->crypt_method_header) { bs->encrypted = 1; @@ -803,7 +811,8 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - read_cache_sizes(opts, &l2_cache_size, &refcount_cache_size, &local_err); + read_cache_sizes(bs, opts, &l2_cache_size, &refcount_cache_size, + &local_err); if (local_err) { error_propagate(errp, local_err); ret = -EINVAL; @@ -1027,6 +1036,7 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key) BDRVQcowState *s = bs->opaque; uint8_t keybuf[16]; int len, i; + Error *err = NULL; memset(keybuf, 0, 16); len = strlen(key); @@ -1037,30 +1047,22 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key) for(i = 0;i < len;i++) { keybuf[i] = key[i]; } - s->crypt_method = s->crypt_method_header; + assert(bs->encrypted); - if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) - return -1; - if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) + qcrypto_cipher_free(s->cipher); + s->cipher = qcrypto_cipher_new( + QCRYPTO_CIPHER_ALG_AES_128, + QCRYPTO_CIPHER_MODE_CBC, + keybuf, G_N_ELEMENTS(keybuf), + &err); + + if (!s->cipher) { + /* XXX would be nice if errors in this method could + * be properly propagate to the caller. Would need + * the bdrv_set_key() API signature to be fixed. 
*/ + error_free(err); return -1; -#if 0 - /* test */ - { - uint8_t in[16]; - uint8_t out[16]; - uint8_t tmp[16]; - for(i=0;i<16;i++) - in[i] = i; - AES_encrypt(in, tmp, &s->aes_encrypt_key); - AES_decrypt(tmp, out, &s->aes_decrypt_key); - for(i = 0; i < 16; i++) - printf(" %02x", tmp[i]); - printf("\n"); - for(i = 0; i < 16; i++) - printf(" %02x", out[i]); - printf("\n"); } -#endif return 0; } @@ -1103,7 +1105,7 @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, } if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED && - !s->crypt_method) { + !s->cipher) { index_in_cluster = sector_num & (s->cluster_sectors - 1); cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset; @@ -1153,7 +1155,7 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, /* prepare next request */ cur_nr_sectors = remaining_sectors; - if (s->crypt_method) { + if (s->cipher) { cur_nr_sectors = MIN(cur_nr_sectors, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); } @@ -1224,7 +1226,9 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, goto fail; } - if (s->crypt_method) { + if (bs->encrypted) { + assert(s->cipher); + /* * For encrypted images, read everything into a temporary * contiguous buffer on which the AES functions can work. @@ -1255,9 +1259,16 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, if (ret < 0) { goto fail; } - if (s->crypt_method) { - qcow2_encrypt_sectors(s, sector_num, cluster_data, - cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key); + if (bs->encrypted) { + assert(s->cipher); + Error *err = NULL; + if (qcow2_encrypt_sectors(s, sector_num, cluster_data, + cluster_data, cur_nr_sectors, false, + &err) < 0) { + error_free(err); + ret = -EIO; + goto fail; + } qemu_iovec_from_buf(qiov, bytes_done, cluster_data, 512 * cur_nr_sectors); } @@ -1315,7 +1326,7 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, trace_qcow2_writev_start_part(qemu_coroutine_self()); index_in_cluster = sector_num & (s->cluster_sectors - 1); cur_nr_sectors = remaining_sectors; - if (s->crypt_method && + if (bs->encrypted && cur_nr_sectors > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster) { cur_nr_sectors = @@ -1334,7 +1345,9 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_nr_sectors * 512); - if (s->crypt_method) { + if (bs->encrypted) { + Error *err = NULL; + assert(s->cipher); if (!cluster_data) { cluster_data = qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS @@ -1349,8 +1362,13 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size); - qcow2_encrypt_sectors(s, sector_num, cluster_data, - cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key); + if (qcow2_encrypt_sectors(s, sector_num, cluster_data, + cluster_data, cur_nr_sectors, + true, &err) < 0) { + error_free(err); + ret = -EIO; + goto fail; + } qemu_iovec_reset(&hd_qiov); qemu_iovec_add(&hd_qiov, cluster_data, @@ -1456,6 +1474,9 @@ static void qcow2_close(BlockDriverState *bs) qcow2_cache_destroy(bs, s->l2_table_cache); qcow2_cache_destroy(bs, s->refcount_block_cache); + qcrypto_cipher_free(s->cipher); + s->cipher = NULL; + g_free(s->unknown_header_fields); cleanup_unknown_header_ext(bs); @@ -1472,9 +1493,7 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) { 
BDRVQcowState *s = bs->opaque; int flags = s->flags; - AES_KEY aes_encrypt_key; - AES_KEY aes_decrypt_key; - uint32_t crypt_method = 0; + QCryptoCipher *cipher = NULL; QDict *options; Error *local_err = NULL; int ret; @@ -1484,11 +1503,8 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) * that means we don't have to worry about reopening them here. */ - if (s->crypt_method) { - crypt_method = s->crypt_method; - memcpy(&aes_encrypt_key, &s->aes_encrypt_key, sizeof(aes_encrypt_key)); - memcpy(&aes_decrypt_key, &s->aes_decrypt_key, sizeof(aes_decrypt_key)); - } + cipher = s->cipher; + s->cipher = NULL; qcow2_close(bs); @@ -1513,11 +1529,7 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) return; } - if (crypt_method) { - s->crypt_method = crypt_method; - memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key)); - memcpy(&s->aes_decrypt_key, &aes_decrypt_key, sizeof(aes_decrypt_key)); - } + s->cipher = cipher; } static size_t header_ext_add(char *buf, uint32_t magic, const void *s, @@ -1802,7 +1814,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, { /* Calculate cluster_bits */ int cluster_bits; - cluster_bits = ffs(cluster_size) - 1; + cluster_bits = ctz32(cluster_size); if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || (1 << cluster_bits) != cluster_size) { @@ -2110,7 +2122,7 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) goto finish; } - refcount_order = ffs(refcount_bits) - 1; + refcount_order = ctz32(refcount_bits); ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags, cluster_size, prealloc, opts, version, refcount_order, @@ -2718,8 +2730,9 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT); } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT)) { encrypt = qemu_opt_get_bool(opts, BLOCK_OPT_ENCRYPT, - s->crypt_method); - if (encrypt != !!s->crypt_method) { + !!s->cipher); + + if (encrypt != !!s->cipher) { fprintf(stderr, "Changing the encryption flag is not " "supported.\n"); return -ENOTSUP; @@ -2824,6 +2837,7 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, int64_t size, const char *message_format, ...) 
{ BDRVQcowState *s = bs->opaque; + const char *node_name; char *message; va_list ap; @@ -2847,8 +2861,11 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, "corruption events will be suppressed\n", message); } - qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), message, - offset >= 0, offset, size >= 0, size, + node_name = bdrv_get_node_name(bs); + qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), + *node_name != '\0', node_name, + message, offset >= 0, offset, + size >= 0, size, fatal, &error_abort); g_free(message); diff --git a/block/qcow2.h b/block/qcow2.h index 2f2094959..4b5a6afc8 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -25,7 +25,7 @@ #ifndef BLOCK_QCOW2_H #define BLOCK_QCOW2_H -#include "qemu/aes.h" +#include "crypto/cipher.h" #include "block/coroutine.h" //#define DEBUG_ALLOC @@ -68,6 +68,8 @@ /* Must be at least 4 to cover all cases of refcount table growth */ #define MIN_REFCOUNT_CACHE_SIZE 4 /* clusters */ +/* Whichever is more */ +#define DEFAULT_L2_CACHE_CLUSTERS 8 /* clusters */ #define DEFAULT_L2_CACHE_BYTE_SIZE 1048576 /* bytes */ /* The refblock cache needs only a fourth of the L2 cache size to cover as many @@ -251,10 +253,8 @@ typedef struct BDRVQcowState { CoMutex lock; - uint32_t crypt_method; /* current crypt method, 0 if no key yet */ + QCryptoCipher *cipher; /* current cipher, NULL if no key yet */ uint32_t crypt_method_header; - AES_KEY aes_encrypt_key; - AES_KEY aes_decrypt_key; uint64_t snapshots_offset; int snapshots_size; unsigned int nb_snapshots; @@ -412,7 +412,7 @@ static inline int64_t offset_into_cluster(BDRVQcowState *s, int64_t offset) return offset & (s->cluster_size - 1); } -static inline int size_to_clusters(BDRVQcowState *s, int64_t size) +static inline uint64_t size_to_clusters(BDRVQcowState *s, uint64_t size) { return (size + (s->cluster_size - 1)) >> s->cluster_bits; } @@ -506,8 +506,8 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index, enum qcow2_discard_type type); int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size); -int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, - int nb_clusters); +int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, + int64_t nb_clusters); int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size); void qcow2_free_clusters(BlockDriverState *bs, int64_t offset, int64_t size, @@ -534,10 +534,9 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index); void qcow2_l2_cache_reset(BlockDriverState *bs); int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); -void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, - uint8_t *out_buf, const uint8_t *in_buf, - int nb_sectors, int enc, - const AES_KEY *key); +int qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, bool enc, Error **errp); int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, int *num, uint64_t *cluster_offset); @@ -575,7 +574,8 @@ int qcow2_read_snapshots(BlockDriverState *bs); Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables); int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c); -void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table); +void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c, + void *table); int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c); int 
qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, Qcow2Cache *dependency); @@ -587,6 +587,6 @@ int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, void **table); int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, void **table); -int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table); +void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table); #endif diff --git a/block/qed.c b/block/qed.c index 892b13c80..954ed007c 100644 --- a/block/qed.c +++ b/block/qed.c @@ -407,8 +407,8 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, char buf[64]; snprintf(buf, sizeof(buf), "%" PRIx64, s->header.features & ~QED_FEATURE_MASK); - error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_name(bs), "QED", buf); + error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bdrv_get_device_or_node_name(bs), "QED", buf); return -ENOTSUP; } if (!qed_is_cluster_size_valid(s->header.cluster_size)) { @@ -436,9 +436,9 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, s->table_nelems = (s->header.cluster_size * s->header.table_size) / sizeof(uint64_t); - s->l2_shift = ffs(s->header.cluster_size) - 1; + s->l2_shift = ctz32(s->header.cluster_size); s->l2_mask = s->table_nelems - 1; - s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1; + s->l1_shift = s->l2_shift + ctz32(s->table_nelems); /* Header size calculation must not overflow uint32_t */ if (s->header.header_size > UINT32_MAX / s->header.cluster_size) { diff --git a/block/quorum.c b/block/quorum.c index 437b12251..2f6c45f76 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -13,16 +13,16 @@ * See the COPYING file in the top-level directory. */ -#include -#include #include "block/block_int.h" #include "qapi/qmp/qbool.h" #include "qapi/qmp/qdict.h" +#include "qapi/qmp/qerror.h" #include "qapi/qmp/qint.h" #include "qapi/qmp/qjson.h" #include "qapi/qmp/qlist.h" #include "qapi/qmp/qstring.h" #include "qapi-event.h" +#include "crypto/hash.h" #define HASH_LENGTH 32 @@ -33,7 +33,7 @@ /* This union holds a vote hash value */ typedef union QuorumVoteValue { - char h[HASH_LENGTH]; /* SHA-256 hash */ + uint8_t h[HASH_LENGTH]; /* SHA-256 hash */ int64_t l; /* simpler 64 bits hash */ } QuorumVoteValue; @@ -226,10 +226,7 @@ static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret) static void quorum_report_failure(QuorumAIOCB *acb) { - const char *reference = bdrv_get_device_name(acb->common.bs)[0] ? 
- bdrv_get_device_name(acb->common.bs) : - acb->common.bs->node_name; - + const char *reference = bdrv_get_device_or_node_name(acb->common.bs); qapi_event_send_quorum_failure(reference, acb->sector_num, acb->nb_sectors, &error_abort); } @@ -430,25 +427,21 @@ static void quorum_free_vote_list(QuorumVotes *votes) static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue *hash) { - int j, ret; - gnutls_hash_hd_t dig; QEMUIOVector *qiov = &acb->qcrs[i].qiov; - - ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256); - - if (ret < 0) { - return ret; + size_t len = sizeof(hash->h); + uint8_t *data = hash->h; + + /* XXX - would be nice if we could pass in the Error ** + * and propagate that back, but this quorum code is + * restricted to just errno values currently */ + if (qcrypto_hash_bytesv(QCRYPTO_HASH_ALG_SHA256, + qiov->iov, qiov->niov, + &data, &len, + NULL) < 0) { + return -EINVAL; } - for (j = 0; j < qiov->niov; j++) { - ret = gnutls_hash(dig, qiov->iov[j].iov_base, qiov->iov[j].iov_len); - if (ret < 0) { - break; - } - } - - gnutls_hash_deinit(dig, (void *) hash); - return ret; + return 0; } static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes) @@ -803,8 +796,8 @@ static int quorum_valid_threshold(int threshold, int num_children, Error **errp) { if (threshold < 1) { - error_set(errp, QERR_INVALID_PARAMETER_VALUE, - "vote-threshold", "value >= 1"); + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, + "vote-threshold", "value >= 1"); return -ERANGE; } @@ -869,25 +862,18 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, Error *local_err = NULL; QemuOpts *opts = NULL; bool *opened; - QDict *sub = NULL; - QList *list = NULL; - const QListEntry *lentry; int i; int ret = 0; qdict_flatten(options); - qdict_extract_subqdict(options, &sub, "children."); - qdict_array_split(sub, &list); - if (qdict_size(sub)) { - error_setg(&local_err, "Invalid option children.%s", - qdict_first(sub)->key); + /* count how many different children are present */ + s->num_children = qdict_array_entries(options, "children."); + if (s->num_children < 0) { + error_setg(&local_err, "Option children is not a valid array"); ret = -EINVAL; goto exit; } - - /* count how many different children are present */ - s->num_children = qlist_size(list); if (s->num_children < 2) { error_setg(&local_err, "Number of provided children must be greater than 1"); @@ -940,37 +926,17 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, s->bs = g_new0(BlockDriverState *, s->num_children); opened = g_new0(bool, s->num_children); - for (i = 0, lentry = qlist_first(list); lentry; - lentry = qlist_next(lentry), i++) { - QDict *d; - QString *string; - - switch (qobject_type(lentry->value)) - { - /* List of options */ - case QTYPE_QDICT: - d = qobject_to_qdict(lentry->value); - QINCREF(d); - ret = bdrv_open(&s->bs[i], NULL, NULL, d, flags, NULL, - &local_err); - break; - - /* QMP reference */ - case QTYPE_QSTRING: - string = qobject_to_qstring(lentry->value); - ret = bdrv_open(&s->bs[i], NULL, qstring_get_str(string), NULL, - flags, NULL, &local_err); - break; - - default: - error_setg(&local_err, "Specification of child block device %i " - "is invalid", i); - ret = -EINVAL; - } + for (i = 0; i < s->num_children; i++) { + char indexstr[32]; + ret = snprintf(indexstr, 32, "children.%d", i); + assert(ret < 32); + ret = bdrv_open_image(&s->bs[i], NULL, options, indexstr, bs, + &child_format, false, &local_err); if (ret < 0) { goto close_exit; } + opened[i] = true; } @@ -993,8 +959,6 
@@ exit: if (local_err) { error_propagate(errp, local_err); } - QDECREF(list); - QDECREF(sub); return ret; } @@ -1056,9 +1020,9 @@ static void quorum_refresh_filename(BlockDriverState *bs) qdict_put_obj(opts, QUORUM_OPT_VOTE_THRESHOLD, QOBJECT(qint_from_int(s->threshold))); qdict_put_obj(opts, QUORUM_OPT_BLKVERIFY, - QOBJECT(qbool_from_int(s->is_blkverify))); + QOBJECT(qbool_from_bool(s->is_blkverify))); qdict_put_obj(opts, QUORUM_OPT_REWRITE, - QOBJECT(qbool_from_int(s->rewrite_corrupted))); + QOBJECT(qbool_from_bool(s->rewrite_corrupted))); qdict_put_obj(opts, "children", QOBJECT(children)); bs->full_open_options = opts; @@ -1091,6 +1055,10 @@ static BlockDriver bdrv_quorum = { static void bdrv_quorum_init(void) { + if (!qcrypto_hash_supports(QCRYPTO_HASH_ALG_SHA256)) { + /* SHA256 hash support is required for quorum device */ + return; + } bdrv_register(&bdrv_quorum); } diff --git a/block/raw-posix.c b/block/raw-posix.c index 24d85826c..855febed5 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -22,6 +22,7 @@ * THE SOFTWARE. */ #include "qemu-common.h" +#include "qemu/error-report.h" #include "qemu/timer.h" #include "qemu/log.h" #include "block/block_int.h" @@ -31,6 +32,7 @@ #include "qemu/iov.h" #include "raw-aio.h" #include "qapi/util.h" +#include "qapi/qmp/qstring.h" #if defined(__APPLE__) && (__MACH__) #include @@ -57,6 +59,7 @@ #include #include #include +#include #ifdef __s390__ #include #endif @@ -94,15 +97,19 @@ #include #endif -//#define DEBUG_FLOPPY - //#define DEBUG_BLOCK -#if defined(DEBUG_BLOCK) -#define DEBUG_BLOCK_PRINT(formatCstr, ...) do { if (qemu_log_enabled()) \ - { qemu_log(formatCstr, ## __VA_ARGS__); qemu_log_flush(); } } while (0) + +#ifdef DEBUG_BLOCK +# define DEBUG_BLOCK_PRINT 1 #else -#define DEBUG_BLOCK_PRINT(formatCstr, ...) +# define DEBUG_BLOCK_PRINT 0 #endif +#define DPRINTF(fmt, ...) \ +do { \ + if (DEBUG_BLOCK_PRINT) { \ + printf(fmt, ## __VA_ARGS__); \ + } \ +} while (0) /* OS X does not have O_DSYNC */ #ifndef O_DSYNC @@ -301,10 +308,11 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) { BDRVRawState *s = bs->opaque; char *buf; + size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize()); - /* For /dev/sg devices the alignment is not really used. + /* For SCSI generic devices the alignment is not really used. With buffered I/O, we don't have any restrictions. 
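The raw-posix.c hunk above drops the old #ifdef DEBUG_FLOPPY / DEBUG_BLOCK_PRINT conditionals in favour of a DPRINTF() macro whose printf() call is always compiled and merely skipped at run time when DEBUG_BLOCK_PRINT is 0, so format strings keep being type-checked. A minimal standalone sketch of that pattern (the macro and symbol names here are illustrative, not the driver's own):

    #include <stdio.h>

    /* Flip to 1 to enable debug output.  The printf() call below is
     * always compiled, so its format string and arguments are checked
     * by the compiler even while debugging is disabled. */
    #define DEBUG_EXAMPLE 0

    #define DPRINTF(fmt, ...)                              \
        do {                                               \
            if (DEBUG_EXAMPLE) {                           \
                printf("example: " fmt, ## __VA_ARGS__);   \
            }                                              \
        } while (0)

    int main(void)
    {
        int fd = -1;

        DPRINTF("opened fd=%d\n", fd);   /* optimised away when disabled */
        return 0;
    }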
*/ - if (bs->sg || !s->needs_alignment) { + if (bdrv_is_sg(bs) || !s->needs_alignment) { bs->request_alignment = 1; s->buf_align = 1; return; @@ -330,9 +338,9 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) /* If we could not get the sizes so far, we can only guess them */ if (!s->buf_align) { size_t align; - buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE); - for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { - if (raw_is_io_aligned(fd, buf + align, MAX_BLOCKSIZE)) { + buf = qemu_memalign(max_align, 2 * max_align); + for (align = 512; align <= max_align; align <<= 1) { + if (raw_is_io_aligned(fd, buf + align, max_align)) { s->buf_align = align; break; } @@ -342,8 +350,8 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) if (!bs->request_alignment) { size_t align; - buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE); - for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { + buf = qemu_memalign(s->buf_align, max_align); + for (align = 512; align <= max_align; align <<= 1) { if (raw_is_io_aligned(fd, buf, align)) { bs->request_alignment = align; break; @@ -725,7 +733,8 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) BDRVRawState *s = bs->opaque; raw_probe_alignment(bs, s->fd, errp); - bs->bl.opt_mem_alignment = s->buf_align; + bs->bl.min_mem_alignment = s->buf_align; + bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize()); } static int check_for_dasd(int fd) @@ -1016,6 +1025,7 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes) { struct xfs_flock64 fl; + int err; memset(&fl, 0, sizeof(fl)); fl.l_whence = SEEK_SET; @@ -1023,8 +1033,9 @@ static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes) fl.l_len = bytes; if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) { - DEBUG_BLOCK_PRINT("cannot write zero range (%s)\n", strerror(errno)); - return -errno; + err = errno; + DPRINTF("cannot write zero range (%s)\n", strerror(errno)); + return -err; } return 0; @@ -1033,6 +1044,7 @@ static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes) static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) { struct xfs_flock64 fl; + int err; memset(&fl, 0, sizeof(fl)); fl.l_whence = SEEK_SET; @@ -1040,8 +1052,9 @@ static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) fl.l_len = bytes; if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) { - DEBUG_BLOCK_PRINT("cannot punch hole (%s)\n", strerror(errno)); - return -errno; + err = errno; + DPRINTF("cannot punch hole (%s)\n", strerror(errno)); + return -err; } return 0; @@ -1846,8 +1859,9 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, *pnum = nb_sectors; ret = BDRV_BLOCK_DATA; } else if (data == start) { - /* On a data extent, compute sectors to the end of the extent. */ - *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE); + /* On a data extent, compute sectors to the end of the extent, + * possibly including a partial sector at EOF. */ + *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE)); ret = BDRV_BLOCK_DATA; } else { /* On a hole, compute sectors to the beginning of the next extent. 
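raw_probe_alignment() above now probes up to MAX(MAX_BLOCKSIZE, getpagesize()) instead of a fixed MAX_BLOCKSIZE: it allocates one maximally aligned buffer and retries O_DIRECT reads at growing power-of-two offsets into it until the kernel accepts one, which reveals the required buffer alignment. A rough standalone sketch of that probing idea, assuming Linux and a file that can be opened with O_DIRECT (error handling trimmed; this is not the driver's code):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    /* Find the smallest buffer alignment accepted for O_DIRECT reads by
     * trying 512, 1024, ... until a pread() stops failing with EINVAL. */
    static size_t probe_buf_align(int fd, size_t max_align)
    {
        char *buf;
        size_t align;

        if (posix_memalign((void **)&buf, max_align, 2 * max_align)) {
            return 0;
        }
        for (align = 512; align <= max_align; align <<= 1) {
            if (pread(fd, buf + align, max_align, 0) >= 0) {
                break;              /* first alignment the kernel accepts */
            }
        }
        free(buf);
        return align <= max_align ? align : 0;
    }

    int main(int argc, char **argv)
    {
        int fd = open(argc > 1 ? argv[1] : "/tmp/test.img",
                      O_RDONLY | O_DIRECT);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        printf("buffer alignment: %zu bytes\n", probe_buf_align(fd, 65536));
        close(fd);
        return 0;
    }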
*/ @@ -2073,15 +2087,38 @@ static void hdev_parse_filename(const char *filename, QDict *options, qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); } +static bool hdev_is_sg(BlockDriverState *bs) +{ + +#if defined(__linux__) + + struct stat st; + struct sg_scsi_id scsiid; + int sg_version; + + if (stat(bs->filename, &st) >= 0 && S_ISCHR(st.st_mode) && + !bdrv_ioctl(bs, SG_GET_VERSION_NUM, &sg_version) && + !bdrv_ioctl(bs, SG_GET_SCSI_ID, &scsiid)) { + DPRINTF("SG device found: type=%d, version=%d\n", + scsiid.scsi_type, sg_version); + return true; + } + +#endif + + return false; +} + static int hdev_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { BDRVRawState *s = bs->opaque; Error *local_err = NULL; int ret; - const char *filename = qdict_get_str(options, "filename"); #if defined(__APPLE__) && defined(__MACH__) + const char *filename = qdict_get_str(options, "filename"); + if (strstart(filename, "/dev/cdrom", NULL)) { kern_return_t kernResult; io_iterator_t mediaIterator; @@ -2110,16 +2147,6 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags, #endif s->type = FTYPE_FILE; -#if defined(__linux__) - { - char resolved_path[ MAXPATHLEN ], *temp; - - temp = realpath(filename, resolved_path); - if (temp && strstart(temp, "/dev/sg", NULL)) { - bs->sg = 1; - } - } -#endif ret = raw_open_common(bs, options, flags, 0, &local_err); if (ret < 0) { @@ -2129,6 +2156,9 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags, return ret; } + /* Since this does ioctl the device must be already opened */ + bs->sg = hdev_is_sg(bs); + if (flags & BDRV_O_RDWR) { ret = check_hdev_writable(s); if (ret < 0) { @@ -2157,16 +2187,12 @@ static int fd_open(BlockDriverState *bs) (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->fd_open_time) >= FD_OPEN_TIMEOUT) { qemu_close(s->fd); s->fd = -1; -#ifdef DEBUG_FLOPPY - printf("Floppy closed\n"); -#endif + DPRINTF("Floppy closed\n"); } if (s->fd < 0) { if (s->fd_got_error && (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->fd_error_time) < FD_OPEN_TIMEOUT) { -#ifdef DEBUG_FLOPPY - printf("No floppy (open delayed)\n"); -#endif + DPRINTF("No floppy (open delayed)\n"); return -EIO; } s->fd = qemu_open(bs->filename, s->open_flags & ~O_NONBLOCK); @@ -2175,14 +2201,10 @@ static int fd_open(BlockDriverState *bs) s->fd_got_error = 1; if (last_media_present) s->fd_media_changed = 1; -#ifdef DEBUG_FLOPPY - printf("No floppy\n"); -#endif + DPRINTF("No floppy\n"); return -EIO; } -#ifdef DEBUG_FLOPPY - printf("Floppy opened\n"); -#endif + DPRINTF("Floppy opened\n"); } if (!last_media_present) s->fd_media_changed = 1; @@ -2408,7 +2430,8 @@ static int floppy_probe_device(const char *filename) struct stat st; if (strstart(filename, "/dev/fd", NULL) && - !strstart(filename, "/dev/fdset/", NULL)) { + !strstart(filename, "/dev/fdset/", NULL) && + !strstart(filename, "/dev/fd/", NULL)) { prio = 50; } @@ -2450,9 +2473,7 @@ static int floppy_media_changed(BlockDriverState *bs) fd_open(bs); ret = s->fd_media_changed; s->fd_media_changed = 0; -#ifdef DEBUG_FLOPPY - printf("Floppy changed=%d\n", ret); -#endif + DPRINTF("Floppy changed=%d\n", ret); return ret; } diff --git a/block/raw-win32.c b/block/raw-win32.c index dae5d2fee..68f2338ac 100644 --- a/block/raw-win32.c +++ b/block/raw-win32.c @@ -29,6 +29,7 @@ #include "trace.h" #include "block/thread-pool.h" #include "qemu/iov.h" +#include "qapi/qmp/qstring.h" #include #include diff --git a/block/rbd.c b/block/rbd.c index f3ab2ddd5..a60a19d58 100644 --- a/block/rbd.c +++ 
b/block/rbd.c @@ -74,25 +74,18 @@ typedef struct RBDAIOCB { QEMUIOVector *qiov; char *bounce; RBDAIOCmd cmd; - int64_t sector_num; int error; struct BDRVRBDState *s; - int status; } RBDAIOCB; typedef struct RADOSCB { - int rcbid; RBDAIOCB *acb; struct BDRVRBDState *s; - int done; int64_t size; char *buf; int64_t ret; } RADOSCB; -#define RBD_FD_READ 0 -#define RBD_FD_WRITE 1 - typedef struct BDRVRBDState { rados_t cluster; rados_ioctx_t io_ctx; @@ -235,7 +228,9 @@ static char *qemu_rbd_parse_clientname(const char *conf, char *clientname) return NULL; } -static int qemu_rbd_set_conf(rados_t cluster, const char *conf, Error **errp) +static int qemu_rbd_set_conf(rados_t cluster, const char *conf, + bool only_read_conf_file, + Error **errp) { char *p, *buf; char name[RBD_MAX_CONF_NAME_SIZE]; @@ -267,14 +262,18 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf, Error **errp) qemu_rbd_unescape(value); if (strcmp(name, "conf") == 0) { - ret = rados_conf_read_file(cluster, value); - if (ret < 0) { - error_setg(errp, "error reading conf file %s", value); - break; + /* read the conf file alone, so it doesn't override more + specific settings for a particular device */ + if (only_read_conf_file) { + ret = rados_conf_read_file(cluster, value); + if (ret < 0) { + error_setg(errp, "error reading conf file %s", value); + break; + } } } else if (strcmp(name, "id") == 0) { /* ignore, this is parsed by qemu_rbd_parse_clientname() */ - } else { + } else if (!only_read_conf_file) { ret = rados_conf_set(cluster, name, value); if (ret < 0) { error_setg(errp, "invalid conf option %s", name); @@ -325,7 +324,7 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) error_setg(errp, "obj size too small"); return -EINVAL; } - obj_order = ffs(objsize) - 1; + obj_order = ctz32(objsize); } clientname = qemu_rbd_parse_clientname(conf, clientname_buf); @@ -337,10 +336,15 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) if (strstr(conf, "conf=") == NULL) { /* try default location, but ignore failure */ rados_conf_read_file(cluster, NULL); + } else if (conf[0] != '\0' && + qemu_rbd_set_conf(cluster, conf, true, &local_err) < 0) { + rados_shutdown(cluster); + error_propagate(errp, local_err); + return -EIO; } if (conf[0] != '\0' && - qemu_rbd_set_conf(cluster, conf, &local_err) < 0) { + qemu_rbd_set_conf(cluster, conf, false, &local_err) < 0) { rados_shutdown(cluster); error_propagate(errp, local_err); return -EIO; @@ -405,7 +409,6 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb) } qemu_vfree(acb->bounce); acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); - acb->status = 0; qemu_aio_unref(acb); } @@ -468,6 +471,23 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, s->snap = g_strdup(snap_buf); } + if (strstr(conf, "conf=") == NULL) { + /* try default location, but ignore failure */ + rados_conf_read_file(s->cluster, NULL); + } else if (conf[0] != '\0') { + r = qemu_rbd_set_conf(s->cluster, conf, true, errp); + if (r < 0) { + goto failed_shutdown; + } + } + + if (conf[0] != '\0') { + r = qemu_rbd_set_conf(s->cluster, conf, false, errp); + if (r < 0) { + goto failed_shutdown; + } + } + /* * Fallback to more conservative semantics if setting cache * options fails. 
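The rbd.c changes above give qemu_rbd_set_conf() an only_read_conf_file flag and call it twice: a first pass that only loads the conf= file and a second pass that applies the remaining key=value options, so per-device settings are no longer clobbered by the file that used to be read after them. A simplified sketch of that two-pass idea over a flattened "key=value:key=value" string; apply_conf_file() and apply_setting() are hypothetical stand-ins for rados_conf_read_file()/rados_conf_set():

    #include <stdio.h>
    #include <string.h>

    static int apply_conf_file(const char *path)
    {
        printf("file: %s\n", path);
        return 0;
    }

    static int apply_setting(const char *key, const char *val)
    {
        printf("set:  %s=%s\n", key, val);
        return 0;
    }

    /* Walk the option string; when only_conf_file is set, act on the
     * "conf" key alone, otherwise act on everything except "conf". */
    static int set_conf(const char *conf, int only_conf_file)
    {
        char buf[256], *p, *save;

        snprintf(buf, sizeof(buf), "%s", conf);
        for (p = strtok_r(buf, ":", &save); p; p = strtok_r(NULL, ":", &save)) {
            char *eq = strchr(p, '=');

            if (!eq) {
                return -1;
            }
            *eq = '\0';
            if (strcmp(p, "conf") == 0) {
                if (only_conf_file && apply_conf_file(eq + 1) < 0) {
                    return -1;
                }
            } else if (!only_conf_file && apply_setting(p, eq + 1) < 0) {
                return -1;
            }
        }
        return 0;
    }

    int main(void)
    {
        const char *conf = "conf=/etc/ceph/ceph.conf:rbd_cache=true";

        set_conf(conf, 1);   /* pass 1: load the conf file first */
        set_conf(conf, 0);   /* pass 2: explicit settings win over the file */
        return 0;
    }

(The real code also skips the "id" key, which is handled separately by qemu_rbd_parse_clientname().)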
Ignore errors from setting rbd_cache because the @@ -481,18 +501,6 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, rados_conf_set(s->cluster, "rbd_cache", "true"); } - if (strstr(conf, "conf=") == NULL) { - /* try default location, but ignore failure */ - rados_conf_read_file(s->cluster, NULL); - } - - if (conf[0] != '\0') { - r = qemu_rbd_set_conf(s->cluster, conf, errp); - if (r < 0) { - goto failed_shutdown; - } - } - r = rados_connect(s->cluster); if (r < 0) { error_setg(errp, "error connecting"); @@ -621,7 +629,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, acb->error = 0; acb->s = s; acb->bh = NULL; - acb->status = -EINPROGRESS; if (cmd == RBD_AIO_WRITE) { qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); @@ -633,7 +640,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, size = nb_sectors * BDRV_SECTOR_SIZE; rcb = g_new(RADOSCB, 1); - rcb->done = 0; rcb->acb = acb; rcb->buf = buf; rcb->s = acb->s; diff --git a/block/sheepdog.c b/block/sheepdog.c index c14172cfa..9585beb73 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -318,6 +318,10 @@ enum AIOCBState { AIOCB_DISCARD_OBJ, }; +#define AIOCBOverwrapping(x, y) \ + (!(x->max_affect_data_idx < y->min_affect_data_idx \ + || y->max_affect_data_idx < x->min_affect_data_idx)) + struct SheepdogAIOCB { BlockAIOCB common; @@ -334,6 +338,11 @@ struct SheepdogAIOCB { bool cancelable; int nr_pending; + + uint32_t min_affect_data_idx; + uint32_t max_affect_data_idx; + + QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings; }; typedef struct BDRVSheepdogState { @@ -362,8 +371,10 @@ typedef struct BDRVSheepdogState { /* Every aio request must be linked to either of these queues. */ QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head; - QLIST_HEAD(pending_aio_head, AIOReq) pending_aio_head; QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head; + + CoQueue overwrapping_queue; + QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head; } BDRVSheepdogState; static const char * sd_strerror(int err) @@ -498,13 +509,7 @@ static void sd_aio_cancel(BlockAIOCB *blockacb) AIOReq *aioreq, *next; if (sd_acb_cancelable(acb)) { - /* Remove outstanding requests from pending and failed queues. */ - QLIST_FOREACH_SAFE(aioreq, &s->pending_aio_head, aio_siblings, - next) { - if (aioreq->aiocb == acb) { - free_aio_req(s, aioreq); - } - } + /* Remove outstanding requests from failed queue. 
*/ QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings, next) { if (aioreq->aiocb == acb) { @@ -529,6 +534,10 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, int64_t sector_num, int nb_sectors) { SheepdogAIOCB *acb; + uint32_t object_size; + BDRVSheepdogState *s = bs->opaque; + + object_size = (UINT32_C(1) << s->inode.block_size_shift); acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL); @@ -542,6 +551,11 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, acb->coroutine = qemu_coroutine_self(); acb->ret = 0; acb->nr_pending = 0; + + acb->min_affect_data_idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size; + acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE + + acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size; + return acb; } @@ -703,38 +717,6 @@ static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag); static int get_sheep_fd(BDRVSheepdogState *s, Error **errp); static void co_write_request(void *opaque); -static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid) -{ - AIOReq *aio_req; - - QLIST_FOREACH(aio_req, &s->pending_aio_head, aio_siblings) { - if (aio_req->oid == oid) { - return aio_req; - } - } - - return NULL; -} - -/* - * This function searchs pending requests to the object `oid', and - * sends them. - */ -static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid) -{ - AIOReq *aio_req; - SheepdogAIOCB *acb; - - while ((aio_req = find_pending_req(s, oid)) != NULL) { - acb = aio_req->aiocb; - /* move aio_req from pending list to inflight one */ - QLIST_REMOVE(aio_req, aio_siblings); - QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, - acb->aiocb_type); - } -} - static coroutine_fn void reconnect_to_sdog(void *opaque) { BDRVSheepdogState *s = opaque; @@ -840,12 +822,6 @@ static void coroutine_fn aio_read_response(void *opaque) s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx); s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx); } - /* - * Some requests may be blocked because simultaneous - * create requests are not allowed, so we search the - * pending requests here. - */ - send_pending_req(s, aio_req->oid); } break; case AIOCB_READ_UDATA: @@ -1341,30 +1317,6 @@ out: return ret; } -/* Return true if the specified request is linked to the pending list. */ -static bool check_simultaneous_create(BDRVSheepdogState *s, AIOReq *aio_req) -{ - AIOReq *areq; - QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) { - if (areq != aio_req && areq->oid == aio_req->oid) { - /* - * Sheepdog cannot handle simultaneous create requests to the same - * object, so we cannot send the request until the previous request - * finishes. 
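In the sheepdog changes above, each SheepdogAIOCB now records the first and last object index it touches (min_affect_data_idx/max_affect_data_idx, computed from the sector range and the object size), and the AIOCBOverwrapping() macro treats two requests as conflicting unless one range ends before the other starts. A small self-contained sketch of that interval test (struct and field names are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    typedef struct Req {
        uint32_t min_idx;   /* first object index touched by the request */
        uint32_t max_idx;   /* last object index touched by the request */
    } Req;

    /* Object index range covered by [sector, sector + nb_sectors). */
    static Req make_req(int64_t sector, int nb_sectors, uint32_t object_size)
    {
        Req r;

        r.min_idx = sector * 512 / object_size;
        r.max_idx = (sector + nb_sectors) * 512 / object_size;
        return r;
    }

    /* Conflict unless one range ends strictly before the other begins. */
    static int overlaps(const Req *a, const Req *b)
    {
        return !(a->max_idx < b->min_idx || b->max_idx < a->min_idx);
    }

    int main(void)
    {
        uint32_t obj = 1u << 22;              /* 4 MiB objects */
        Req a = make_req(0, 16384, obj);      /* sectors 0..16383  -> idx 0..2 */
        Req b = make_req(20480, 8, obj);      /* sector 20480      -> idx 2..2 */

        printf("overlap: %d\n", overlaps(&a, &b));   /* prints 1 */
        return 0;
    }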
- */ - DPRINTF("simultaneous create to %" PRIx64 "\n", aio_req->oid); - aio_req->flags = 0; - aio_req->base_oid = 0; - aio_req->create = false; - QLIST_REMOVE(aio_req, aio_siblings); - QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings); - return true; - } - } - - return false; -} - static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req) { SheepdogAIOCB *acb = aio_req->aiocb; @@ -1379,10 +1331,6 @@ static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req) goto out; } - if (check_simultaneous_create(s, aio_req)) { - return; - } - if (s->inode.data_vdi_id[idx]) { aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx); aio_req->flags |= SD_FLAG_CMD_COW; @@ -1458,8 +1406,8 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags, filename = qemu_opt_get(opts, "filename"); QLIST_INIT(&s->inflight_aio_head); - QLIST_INIT(&s->pending_aio_head); QLIST_INIT(&s->failed_aio_head); + QLIST_INIT(&s->inflight_aiocb_head); s->fd = -1; memset(vdi, 0, sizeof(vdi)); @@ -1524,6 +1472,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags, bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE; pstrcpy(s->name, sizeof(s->name), vdi); qemu_co_mutex_init(&s->lock); + qemu_co_queue_init(&s->overwrapping_queue); qemu_opts_del(opts); g_free(buf); return 0; @@ -1716,7 +1665,7 @@ static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt) if ((object_size - 1) & object_size) { /* not a power of 2? */ return -EINVAL; } - obj_order = ffs(object_size) - 1; + obj_order = ctz32(object_size); if (obj_order < 20 || obj_order > 31) { return -EINVAL; } @@ -2195,12 +2144,6 @@ static int coroutine_fn sd_co_rw_vector(void *p) old_oid, done); QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - if (create) { - if (check_simultaneous_create(s, aio_req)) { - goto done; - } - } - add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, acb->aiocb_type); done: @@ -2215,6 +2158,20 @@ out: return 1; } +static bool check_overwrapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb) +{ + SheepdogAIOCB *cb; + + QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) { + if (AIOCBOverwrapping(aiocb, cb)) { + return true; + } + } + + QLIST_INSERT_HEAD(&s->inflight_aiocb_head, aiocb, aiocb_siblings); + return false; +} + static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { @@ -2234,14 +2191,25 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, acb->aio_done_func = sd_write_done; acb->aiocb_type = AIOCB_WRITE_UDATA; +retry: + if (check_overwrapping_aiocb(s, acb)) { + qemu_co_queue_wait(&s->overwrapping_queue); + goto retry; + } + ret = sd_co_rw_vector(acb); if (ret <= 0) { + QLIST_REMOVE(acb, aiocb_siblings); + qemu_co_queue_restart_all(&s->overwrapping_queue); qemu_aio_unref(acb); return ret; } qemu_coroutine_yield(); + QLIST_REMOVE(acb, aiocb_siblings); + qemu_co_queue_restart_all(&s->overwrapping_queue); + return acb->ret; } @@ -2250,19 +2218,30 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num, { SheepdogAIOCB *acb; int ret; + BDRVSheepdogState *s = bs->opaque; acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors); acb->aiocb_type = AIOCB_READ_UDATA; acb->aio_done_func = sd_finish_aiocb; +retry: + if (check_overwrapping_aiocb(s, acb)) { + qemu_co_queue_wait(&s->overwrapping_queue); + goto retry; + } + ret = sd_co_rw_vector(acb); if (ret <= 0) { + QLIST_REMOVE(acb, aiocb_siblings); + 
qemu_co_queue_restart_all(&s->overwrapping_queue); qemu_aio_unref(acb); return ret; } qemu_coroutine_yield(); + QLIST_REMOVE(acb, aiocb_siblings); + qemu_co_queue_restart_all(&s->overwrapping_queue); return acb->ret; } @@ -2341,6 +2320,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) if (ret < 0) { error_report("failed to create inode for snapshot: %s", error_get_pretty(local_err)); + error_free(local_err); goto cleanup; } @@ -2609,14 +2589,25 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, acb->aiocb_type = AIOCB_DISCARD_OBJ; acb->aio_done_func = sd_finish_aiocb; +retry: + if (check_overwrapping_aiocb(s, acb)) { + qemu_co_queue_wait(&s->overwrapping_queue); + goto retry; + } + ret = sd_co_rw_vector(acb); if (ret <= 0) { + QLIST_REMOVE(acb, aiocb_siblings); + qemu_co_queue_restart_all(&s->overwrapping_queue); qemu_aio_unref(acb); return ret; } qemu_coroutine_yield(); + QLIST_REMOVE(acb, aiocb_siblings); + qemu_co_queue_restart_all(&s->overwrapping_queue); + return acb->ret; } diff --git a/block/snapshot.c b/block/snapshot.c index 698e1a1d5..49e143e99 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -24,6 +24,7 @@ #include "block/snapshot.h" #include "block/block_int.h" +#include "qapi/qmp/qerror.h" QemuOptsList internal_snapshot_opts = { .name = "snapshot", @@ -229,7 +230,7 @@ int bdrv_snapshot_delete(BlockDriverState *bs, { BlockDriver *drv = bs->drv; if (!drv) { - error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); + error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); return -ENOMEDIUM; } if (!snapshot_id && !name) { @@ -238,7 +239,7 @@ int bdrv_snapshot_delete(BlockDriverState *bs, } /* drain all pending i/o before deleting snapshot */ - bdrv_drain_all(); + bdrv_drain(bs); if (drv->bdrv_snapshot_delete) { return drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp); @@ -246,9 +247,9 @@ int bdrv_snapshot_delete(BlockDriverState *bs, if (bs->file) { return bdrv_snapshot_delete(bs->file, snapshot_id, name, errp); } - error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - drv->format_name, bdrv_get_device_name(bs), - "internal snapshot deletion"); + error_setg(errp, "Block format '%s' used by device '%s' " + "does not support internal snapshot deletion", + drv->format_name, bdrv_get_device_name(bs)); return -ENOTSUP; } @@ -315,7 +316,7 @@ int bdrv_snapshot_load_tmp(BlockDriverState *bs, BlockDriver *drv = bs->drv; if (!drv) { - error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); + error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); return -ENOMEDIUM; } if (!snapshot_id && !name) { @@ -329,9 +330,9 @@ int bdrv_snapshot_load_tmp(BlockDriverState *bs, if (drv->bdrv_snapshot_load_tmp) { return drv->bdrv_snapshot_load_tmp(bs, snapshot_id, name, errp); } - error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - drv->format_name, bdrv_get_device_name(bs), - "temporarily load internal snapshot"); + error_setg(errp, "Block format '%s' used by device '%s' " + "does not support temporarily loading internal snapshots", + drv->format_name, bdrv_get_device_name(bs)); return -ENOTSUP; } diff --git a/block/ssh.c b/block/ssh.c index f466cbf39..8d0673903 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -30,9 +30,11 @@ #include #include "block/block_int.h" +#include "qemu/error-report.h" #include "qemu/sockets.h" #include "qemu/uri.h" #include "qapi/qmp/qint.h" +#include "qapi/qmp/qstring.h" /* DEBUG_SSH=1 enables the DPRINTF (debugging printf) statements 
in * this block driver code. @@ -561,7 +563,7 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options, /* Open the socket and connect. */ s->sock = inet_connect(s->hostport, errp); if (s->sock < 0) { - ret = -errno; + ret = -EIO; goto err; } diff --git a/block/stream.c index a628901f6..ab0bd057f 100644 --- a/block/stream.c +++ b/block/stream.c @@ -14,6 +14,7 @@ #include "trace.h" #include "block/block_int.h" #include "block/blockjob.h" +#include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" enum { @@ -227,7 +228,7 @@ static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp) StreamBlockJob *s = container_of(job, StreamBlockJob, common); if (speed < 0) { - error_set(errp, QERR_INVALID_PARAMETER, "speed"); + error_setg(errp, QERR_INVALID_PARAMETER, "speed"); return; } ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); @@ -250,7 +251,7 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base, if ((on_error == BLOCKDEV_ON_ERROR_STOP || on_error == BLOCKDEV_ON_ERROR_ENOSPC) && !bdrv_iostatus_is_enabled(bs)) { - error_set(errp, QERR_INVALID_PARAMETER, "on-error"); + error_setg(errp, QERR_INVALID_PARAMETER, "on-error"); return; } diff --git a/block/throttle-groups.c b/block/throttle-groups.c new file mode 100644 index 000000000..1abc6fcae --- /dev/null +++ b/block/throttle-groups.c @@ -0,0 +1,501 @@ +/* + * QEMU block throttling group infrastructure + * + * Copyright (C) Nodalink, EURL. 2014 + * Copyright (C) Igalia, S.L. 2015 + * + * Authors: + * Benoît Canet + * Alberto Garcia + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 or + * (at your option) version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "block/throttle-groups.h" +#include "qemu/queue.h" +#include "qemu/thread.h" +#include "sysemu/qtest.h" + +/* The ThrottleGroup structure (with its ThrottleState) is shared + * among different BlockDriverState and it's independent from + * AioContext, so in order to use it from different threads it needs + * its own locking. + * + * This locking is however handled internally in this file, so it's + * mostly transparent to outside users (but see the documentation in + * throttle_groups_lock()). + * + * The whole ThrottleGroup structure is private and invisible to + * outside users, that only use it through its ThrottleState. + * + * In addition to the ThrottleGroup structure, BlockDriverState has + * fields that need to be accessed by other members of the group and + * therefore also need to be protected by this lock. Once a BDS is + * registered in a group those fields can be accessed by other threads + * any time. + * + * Again, all this is handled internally and is mostly transparent to + * the outside. The 'throttle_timers' field however has an additional + * constraint because it may be temporarily invalid (see for example + * bdrv_set_aio_context()). Therefore in this file a thread will + * access some other BDS's timers only after verifying that that BDS + * has throttled requests in the queue.
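Alongside the throttling work, this import keeps converting error_set() with QERR_ macros into error_setg() with plain format strings (see the stream.c and snapshot.c hunks) and adds the missing error_free() after error_report() in sd_snapshot_create(). A short sketch of the local-error idiom those call sites rely on, assuming QEMU's qapi/error.h and qemu/error-report.h APIs; the helper names are made up:

    #include "qapi/error.h"
    #include "qemu/error-report.h"

    /* Hypothetical helper that fails through an Error object. */
    static int do_step(int ok, Error **errp)
    {
        if (!ok) {
            error_setg(errp, "step failed: %s", "invalid argument");
            return -1;
        }
        return 0;
    }

    /* Mid-level caller: hand the local error up to its own caller. */
    static int do_sequence(int ok, Error **errp)
    {
        Error *local_err = NULL;

        if (do_step(ok, &local_err) < 0) {
            error_propagate(errp, local_err);
            return -1;
        }
        return 0;
    }

    /* Top-level caller: consume the error, as sd_snapshot_create() does. */
    static void run(void)
    {
        Error *local_err = NULL;

        if (do_sequence(0, &local_err) < 0) {
            error_report("operation failed: %s", error_get_pretty(local_err));
            error_free(local_err);
        }
    }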
+ */ +typedef struct ThrottleGroup { + char *name; /* This is constant during the lifetime of the group */ + + QemuMutex lock; /* This lock protects the following four fields */ + ThrottleState ts; + QLIST_HEAD(, BlockDriverState) head; + BlockDriverState *tokens[2]; + bool any_timer_armed[2]; + + /* These two are protected by the global throttle_groups_lock */ + unsigned refcount; + QTAILQ_ENTRY(ThrottleGroup) list; +} ThrottleGroup; + +static QemuMutex throttle_groups_lock; +static QTAILQ_HEAD(, ThrottleGroup) throttle_groups = + QTAILQ_HEAD_INITIALIZER(throttle_groups); + +/* Increments the reference count of a ThrottleGroup given its name. + * + * If no ThrottleGroup is found with the given name a new one is + * created. + * + * @name: the name of the ThrottleGroup + * @ret: the ThrottleGroup + */ +static ThrottleGroup *throttle_group_incref(const char *name) +{ + ThrottleGroup *tg = NULL; + ThrottleGroup *iter; + + qemu_mutex_lock(&throttle_groups_lock); + + /* Look for an existing group with that name */ + QTAILQ_FOREACH(iter, &throttle_groups, list) { + if (!strcmp(name, iter->name)) { + tg = iter; + break; + } + } + + /* Create a new one if not found */ + if (!tg) { + tg = g_new0(ThrottleGroup, 1); + tg->name = g_strdup(name); + qemu_mutex_init(&tg->lock); + throttle_init(&tg->ts); + QLIST_INIT(&tg->head); + + QTAILQ_INSERT_TAIL(&throttle_groups, tg, list); + } + + tg->refcount++; + + qemu_mutex_unlock(&throttle_groups_lock); + + return tg; +} + +/* Decrease the reference count of a ThrottleGroup. + * + * When the reference count reaches zero the ThrottleGroup is + * destroyed. + * + * @tg: The ThrottleGroup to unref + */ +static void throttle_group_unref(ThrottleGroup *tg) +{ + qemu_mutex_lock(&throttle_groups_lock); + if (--tg->refcount == 0) { + QTAILQ_REMOVE(&throttle_groups, tg, list); + qemu_mutex_destroy(&tg->lock); + g_free(tg->name); + g_free(tg); + } + qemu_mutex_unlock(&throttle_groups_lock); +} + +/* Get the name from a BlockDriverState's ThrottleGroup. The name (and + * the pointer) is guaranteed to remain constant during the lifetime + * of the group. + * + * @bs: a BlockDriverState that is member of a throttling group + * @ret: the name of the group. + */ +const char *throttle_group_get_name(BlockDriverState *bs) +{ + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + return tg->name; +} + +/* Return the next BlockDriverState in the round-robin sequence, + * simulating a circular list. + * + * This assumes that tg->lock is held. + * + * @bs: the current BlockDriverState + * @ret: the next BlockDriverState in the sequence + */ +static BlockDriverState *throttle_group_next_bs(BlockDriverState *bs) +{ + ThrottleState *ts = bs->throttle_state; + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); + BlockDriverState *next = QLIST_NEXT(bs, round_robin); + + if (!next) { + return QLIST_FIRST(&tg->head); + } + + return next; +} + +/* Return the next BlockDriverState in the round-robin sequence with + * pending I/O requests. + * + * This assumes that tg->lock is held. + * + * @bs: the current BlockDriverState + * @is_write: the type of operation (read/write) + * @ret: the next BlockDriverState with pending requests, or bs + * if there is none. 
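throttle_group_incref() and throttle_group_unref() above form a small get-or-create registry of named groups: a global mutex guards the list of ThrottleGroups, a lookup by name creates the group on first use, and the group is torn down when its last member drops the reference. The same shape reduced to a standalone sketch, using pthreads and a singly linked list in place of QemuMutex and QTAILQ (all names here are illustrative):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct Group {
        char *name;
        unsigned refcount;
        struct Group *next;
    } Group;

    static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;
    static Group *groups;

    /* Look up a group by name, creating it on first use; takes a reference. */
    static Group *group_incref(const char *name)
    {
        Group *g;

        pthread_mutex_lock(&registry_lock);
        for (g = groups; g; g = g->next) {
            if (!strcmp(g->name, name)) {
                break;
            }
        }
        if (!g) {
            g = calloc(1, sizeof(*g));
            g->name = strdup(name);
            g->next = groups;
            groups = g;
        }
        g->refcount++;
        pthread_mutex_unlock(&registry_lock);
        return g;
    }

    /* Drop a reference; the group is destroyed when the last user leaves. */
    static void group_unref(Group *g)
    {
        pthread_mutex_lock(&registry_lock);
        if (--g->refcount == 0) {
            Group **p;

            for (p = &groups; *p != g; p = &(*p)->next) {
                /* walk to the link that points at g */
            }
            *p = g->next;
            free(g->name);
            free(g);
        }
        pthread_mutex_unlock(&registry_lock);
    }

    int main(void)
    {
        Group *a = group_incref("disk-group-0");
        Group *b = group_incref("disk-group-0");

        printf("same group object: %d\n", a == b);   /* prints 1 */
        group_unref(b);
        group_unref(a);
        return 0;
    }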
+ */ +static BlockDriverState *next_throttle_token(BlockDriverState *bs, + bool is_write) +{ + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + BlockDriverState *token, *start; + + start = token = tg->tokens[is_write]; + + /* get next bs round in round robin style */ + token = throttle_group_next_bs(token); + while (token != start && !token->pending_reqs[is_write]) { + token = throttle_group_next_bs(token); + } + + /* If no IO are queued for scheduling on the next round robin token + * then decide the token is the current bs because chances are + * the current bs get the current request queued. + */ + if (token == start && !token->pending_reqs[is_write]) { + token = bs; + } + + return token; +} + +/* Check if the next I/O request for a BlockDriverState needs to be + * throttled or not. If there's no timer set in this group, set one + * and update the token accordingly. + * + * This assumes that tg->lock is held. + * + * @bs: the current BlockDriverState + * @is_write: the type of operation (read/write) + * @ret: whether the I/O request needs to be throttled or not + */ +static bool throttle_group_schedule_timer(BlockDriverState *bs, + bool is_write) +{ + ThrottleState *ts = bs->throttle_state; + ThrottleTimers *tt = &bs->throttle_timers; + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); + bool must_wait; + + /* Check if any of the timers in this group is already armed */ + if (tg->any_timer_armed[is_write]) { + return true; + } + + must_wait = throttle_schedule_timer(ts, tt, is_write); + + /* If a timer just got armed, set bs as the current token */ + if (must_wait) { + tg->tokens[is_write] = bs; + tg->any_timer_armed[is_write] = true; + } + + return must_wait; +} + +/* Look for the next pending I/O request and schedule it. + * + * This assumes that tg->lock is held. + * + * @bs: the current BlockDriverState + * @is_write: the type of operation (read/write) + */ +static void schedule_next_request(BlockDriverState *bs, bool is_write) +{ + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + bool must_wait; + BlockDriverState *token; + + /* Check if there's any pending request to schedule next */ + token = next_throttle_token(bs, is_write); + if (!token->pending_reqs[is_write]) { + return; + } + + /* Set a timer for the request if it needs to be throttled */ + must_wait = throttle_group_schedule_timer(token, is_write); + + /* If it doesn't have to wait, queue it for immediate execution */ + if (!must_wait) { + /* Give preference to requests from the current bs */ + if (qemu_in_coroutine() && + qemu_co_queue_next(&bs->throttled_reqs[is_write])) { + token = bs; + } else { + ThrottleTimers *tt = &token->throttle_timers; + int64_t now = qemu_clock_get_ns(tt->clock_type); + timer_mod(tt->timers[is_write], now + 1); + tg->any_timer_armed[is_write] = true; + } + tg->tokens[is_write] = token; + } +} + +/* Check if an I/O request needs to be throttled, wait and set a timer + * if necessary, and schedule the next request using a round robin + * algorithm. + * + * @bs: the current BlockDriverState + * @bytes: the number of bytes for this I/O + * @is_write: the type of operation (read/write) + */ +void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs, + unsigned int bytes, + bool is_write) +{ + bool must_wait; + BlockDriverState *token; + + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + qemu_mutex_lock(&tg->lock); + + /* First we check if this I/O has to be throttled. 
*/ + token = next_throttle_token(bs, is_write); + must_wait = throttle_group_schedule_timer(token, is_write); + + /* Wait if there's a timer set or queued requests of this type */ + if (must_wait || bs->pending_reqs[is_write]) { + bs->pending_reqs[is_write]++; + qemu_mutex_unlock(&tg->lock); + qemu_co_queue_wait(&bs->throttled_reqs[is_write]); + qemu_mutex_lock(&tg->lock); + bs->pending_reqs[is_write]--; + } + + /* The I/O will be executed, so do the accounting */ + throttle_account(bs->throttle_state, is_write, bytes); + + /* Schedule the next request */ + schedule_next_request(bs, is_write); + + qemu_mutex_unlock(&tg->lock); +} + +/* Update the throttle configuration for a particular group. Similar + * to throttle_config(), but guarantees atomicity within the + * throttling group. + * + * @bs: a BlockDriverState that is member of the group + * @cfg: the configuration to set + */ +void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg) +{ + ThrottleTimers *tt = &bs->throttle_timers; + ThrottleState *ts = bs->throttle_state; + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); + qemu_mutex_lock(&tg->lock); + /* throttle_config() cancels the timers */ + if (timer_pending(tt->timers[0])) { + tg->any_timer_armed[0] = false; + } + if (timer_pending(tt->timers[1])) { + tg->any_timer_armed[1] = false; + } + throttle_config(ts, tt, cfg); + qemu_mutex_unlock(&tg->lock); +} + +/* Get the throttle configuration from a particular group. Similar to + * throttle_get_config(), but guarantees atomicity within the + * throttling group. + * + * @bs: a BlockDriverState that is member of the group + * @cfg: the configuration will be written here + */ +void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg) +{ + ThrottleState *ts = bs->throttle_state; + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); + qemu_mutex_lock(&tg->lock); + throttle_get_config(ts, cfg); + qemu_mutex_unlock(&tg->lock); +} + +/* ThrottleTimers callback. This wakes up a request that was waiting + * because it had been throttled. + * + * @bs: the BlockDriverState whose request had been throttled + * @is_write: the type of operation (read/write) + */ +static void timer_cb(BlockDriverState *bs, bool is_write) +{ + ThrottleState *ts = bs->throttle_state; + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); + bool empty_queue; + + /* The timer has just been fired, so we can update the flag */ + qemu_mutex_lock(&tg->lock); + tg->any_timer_armed[is_write] = false; + qemu_mutex_unlock(&tg->lock); + + /* Run the request that was waiting for this timer */ + empty_queue = !qemu_co_enter_next(&bs->throttled_reqs[is_write]); + + /* If the request queue was empty then we have to take care of + * scheduling the next one */ + if (empty_queue) { + qemu_mutex_lock(&tg->lock); + schedule_next_request(bs, is_write); + qemu_mutex_unlock(&tg->lock); + } +} + +static void read_timer_cb(void *opaque) +{ + timer_cb(opaque, false); +} + +static void write_timer_cb(void *opaque) +{ + timer_cb(opaque, true); +} + +/* Register a BlockDriverState in the throttling group, also + * initializing its timers and updating its throttle_state pointer to + * point to it. If a throttling group with that name does not exist + * yet, it will be created. 
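next_throttle_token() and schedule_next_request() above walk the group's members in a circular order, skip members with no queued requests, and fall back to the current member when nobody else is waiting, which is what spreads the shared budget fairly across the group. A standalone sketch of that round-robin walk over a small fixed array (the real code walks a QLIST of BlockDriverStates):

    #include <stdio.h>

    #define MEMBERS 4

    static int pending[MEMBERS];    /* queued request count per member */

    static int next_member(int i)
    {
        return (i + 1) % MEMBERS;   /* circular successor */
    }

    /* Starting after the current token, find the next member with pending
     * requests; if nobody is waiting, hand the token to the caller (cur). */
    static int next_token(int token, int cur)
    {
        int start = token;

        token = next_member(token);
        while (token != start && !pending[token]) {
            token = next_member(token);
        }
        if (token == start && !pending[token]) {
            token = cur;
        }
        return token;
    }

    int main(void)
    {
        pending[2] = 1;                        /* only member 2 is waiting */
        printf("%d\n", next_token(0, 3));      /* prints 2 */

        pending[2] = 0;
        printf("%d\n", next_token(0, 3));      /* nobody waiting: prints 3 */
        return 0;
    }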
+ * + * @bs: the BlockDriverState to insert + * @groupname: the name of the group + */ +void throttle_group_register_bs(BlockDriverState *bs, const char *groupname) +{ + int i; + ThrottleGroup *tg = throttle_group_incref(groupname); + int clock_type = QEMU_CLOCK_REALTIME; + + if (qtest_enabled()) { + /* For testing block IO throttling only */ + clock_type = QEMU_CLOCK_VIRTUAL; + } + + bs->throttle_state = &tg->ts; + + qemu_mutex_lock(&tg->lock); + /* If the ThrottleGroup is new set this BlockDriverState as the token */ + for (i = 0; i < 2; i++) { + if (!tg->tokens[i]) { + tg->tokens[i] = bs; + } + } + + QLIST_INSERT_HEAD(&tg->head, bs, round_robin); + + throttle_timers_init(&bs->throttle_timers, + bdrv_get_aio_context(bs), + clock_type, + read_timer_cb, + write_timer_cb, + bs); + + qemu_mutex_unlock(&tg->lock); +} + +/* Unregister a BlockDriverState from its group, removing it from the + * list, destroying the timers and setting the throttle_state pointer + * to NULL. + * + * The group will be destroyed if it's empty after this operation. + * + * @bs: the BlockDriverState to remove + */ +void throttle_group_unregister_bs(BlockDriverState *bs) +{ + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + int i; + + qemu_mutex_lock(&tg->lock); + for (i = 0; i < 2; i++) { + if (tg->tokens[i] == bs) { + BlockDriverState *token = throttle_group_next_bs(bs); + /* Take care of the case where this is the last bs in the group */ + if (token == bs) { + token = NULL; + } + tg->tokens[i] = token; + } + } + + /* remove the current bs from the list */ + QLIST_REMOVE(bs, round_robin); + throttle_timers_destroy(&bs->throttle_timers); + qemu_mutex_unlock(&tg->lock); + + throttle_group_unref(tg); + bs->throttle_state = NULL; +} + +/* Acquire the lock of this throttling group. + * + * You won't normally need to use this. None of the functions from the + * ThrottleGroup API require you to acquire the lock since all of them + * deal with it internally. + * + * This should only be used in exceptional cases when you want to + * access the protected fields of a BlockDriverState directly + * (e.g. bdrv_swap()). + * + * @bs: a BlockDriverState that is member of the group + */ +void throttle_group_lock(BlockDriverState *bs) +{ + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + qemu_mutex_lock(&tg->lock); +} + +/* Release the lock of this throttling group. + * + * See the comments in throttle_group_lock(). 
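Taken together, the functions above give the block layer a small public surface: join a named group, set or query its shared limits, leave the group. A hedged sketch of a call site, assuming the ThrottleConfig/LeakyBucket layout from QEMU's existing qemu/throttle.h; the group name and numbers are arbitrary:

    #include <string.h>
    #include "block/block_int.h"
    #include "block/throttle-groups.h"
    #include "qemu/throttle.h"

    /* Put a BlockDriverState under a shared 10 MB/s total-bandwidth budget.
     * Every BDS registered under the same group name shares this budget. */
    static void enable_shared_limit(BlockDriverState *bs)
    {
        ThrottleConfig cfg;

        memset(&cfg, 0, sizeof(cfg));
        cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024;

        throttle_group_register_bs(bs, "disk-group-0");
        throttle_group_config(bs, &cfg);
    }

    /* When throttling is switched off or the device goes away: */
    static void disable_shared_limit(BlockDriverState *bs)
    {
        throttle_group_unregister_bs(bs);
    }

In the full patch these entry points are reached through the block layer's own I/O-limit setup (bdrv_io_limits_enable() and friends in block.c, not shown here), which also initializes the BDS's throttled request queues, rather than being called directly by drivers.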
+ */ +void throttle_group_unlock(BlockDriverState *bs) +{ + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + qemu_mutex_unlock(&tg->lock); +} + +static void throttle_groups_init(void) +{ + qemu_mutex_init(&throttle_groups_lock); +} + +block_init(throttle_groups_init); diff --git a/block/vdi.c b/block/vdi.c index 53bd02fe2..7642ef359 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -502,9 +502,9 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, } /* Disable migration when vdi images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vdi", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The vdi format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); qemu_co_mutex_init(&s->write_lock); diff --git a/block/vhdx-log.c b/block/vhdx-log.c index 6547bec40..47fec63c6 100644 --- a/block/vhdx-log.c +++ b/block/vhdx-log.c @@ -19,6 +19,7 @@ */ #include "qemu-common.h" #include "block/block_int.h" +#include "qemu/error-report.h" #include "qemu/module.h" #include "block/vhdx.h" diff --git a/block/vhdx.c b/block/vhdx.c index bb3ed45d5..0776de717 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -1002,9 +1002,9 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, /* TODO: differencing files */ /* Disable migration when VHDX images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vhdx", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The vhdx format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); return 0; @@ -1269,7 +1269,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, iov1.iov_base = qemu_blockalign(bs, iov1.iov_len); memset(iov1.iov_base, 0, iov1.iov_len); qemu_iovec_concat_iov(&hd_qiov, &iov1, 1, 0, - sinfo.block_offset); + iov1.iov_len); sectors_to_write += iov1.iov_len >> BDRV_SECTOR_BITS; } @@ -1285,7 +1285,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, iov2.iov_base = qemu_blockalign(bs, iov2.iov_len); memset(iov2.iov_base, 0, iov2.iov_len); qemu_iovec_concat_iov(&hd_qiov, &iov2, 1, 0, - sinfo.block_offset); + iov2.iov_len); sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS; } } diff --git a/block/vmdk.c b/block/vmdk.c index 4c71cde3b..fbaab67c8 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -25,6 +25,8 @@ #include "qemu-common.h" #include "block/block_int.h" +#include "qapi/qmp/qerror.h" +#include "qemu/error-report.h" #include "qemu/module.h" #include "migration/migration.h" #include @@ -321,37 +323,13 @@ static int vmdk_is_cid_valid(BlockDriverState *bs) return 1; } -/* Queue extents, if any, for reopen() */ +/* We have nothing to do for VMDK reopen, stubs just return success */ static int vmdk_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue, Error **errp) { - BDRVVmdkState *s; - int ret = -1; - int i; - VmdkExtent *e; - assert(state != NULL); assert(state->bs != NULL); - - if (queue == NULL) { - error_setg(errp, "No reopen queue for VMDK extents"); - goto exit; - } - - s = state->bs->opaque; - - assert(s != NULL); - - for (i = 0; i < s->num_extents; i++) { - e = &s->extents[i]; - if (e->file != state->bs->file) { - bdrv_reopen_queue(queue, e->file, state->flags); - } - } - ret = 0; - -exit: - return ret; + return 0; 
 }
 
 static int vmdk_parent_open(BlockDriverState *bs)
@@ -524,7 +502,7 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
     }
     ret = vmdk_add_extent(bs, file, false,
                           le32_to_cpu(header.disk_sectors),
-                          le32_to_cpu(header.l1dir_offset) << 9,
+                          (int64_t)le32_to_cpu(header.l1dir_offset) << 9,
                           0,
                           le32_to_cpu(header.l1dir_size),
                           4096,
@@ -543,7 +521,7 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
 }
 
 static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
-                               Error **errp);
+                               QDict *options, Error **errp);
 
 static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
                             Error **errp)
@@ -582,7 +560,7 @@ static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
 
 static int vmdk_open_vmdk4(BlockDriverState *bs,
                            BlockDriverState *file,
-                           int flags, Error **errp)
+                           int flags, QDict *options, Error **errp)
 {
     int ret;
     uint32_t magic;
@@ -606,7 +584,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
             if (!buf) {
                 return -EINVAL;
             }
-            ret = vmdk_open_desc_file(bs, flags, buf, errp);
+            ret = vmdk_open_desc_file(bs, flags, buf, options, errp);
             g_free(buf);
             return ret;
         }
@@ -669,8 +647,8 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
         char buf[64];
         snprintf(buf, sizeof(buf), "VMDK version %" PRId32,
                  le32_to_cpu(header.version));
-        error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-                  bdrv_get_device_name(bs), "vmdk", buf);
+        error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+                   bdrv_get_device_or_node_name(bs), "vmdk", buf);
         return -ENOTSUP;
     } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
         /* VMware KB 2064959 explains that version 3 added support for
@@ -763,7 +741,7 @@ static int vmdk_parse_description(const char *desc, const char *opt_name,
 /* Open an extent file and append to bs array */
 static int vmdk_open_sparse(BlockDriverState *bs,
                             BlockDriverState *file, int flags,
-                            char *buf, Error **errp)
+                            char *buf, QDict *options, Error **errp)
 {
     uint32_t magic;
 
@@ -773,7 +751,7 @@ static int vmdk_open_sparse(BlockDriverState *bs,
             return vmdk_open_vmfs_sparse(bs, file, flags, errp);
             break;
         case VMDK4_MAGIC:
-            return vmdk_open_vmdk4(bs, file, flags, errp);
+            return vmdk_open_vmdk4(bs, file, flags, options, errp);
             break;
         default:
             error_setg(errp, "Image not in VMDK format");
@@ -783,7 +761,8 @@ static int vmdk_open_sparse(BlockDriverState *bs,
 }
 
 static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
-                              const char *desc_file_path, Error **errp)
+                              const char *desc_file_path, QDict *options,
+                              Error **errp)
 {
     int ret;
     int matches;
@@ -797,6 +776,7 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
     BlockDriverState *extent_file;
     BDRVVmdkState *s = bs->opaque;
     VmdkExtent *extent;
+    char extent_opt_prefix[32];
 
     while (*p) {
         /* parse extent line in one of below formats:
@@ -846,8 +826,12 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
         extent_path = g_malloc0(PATH_MAX);
         path_combine(extent_path, PATH_MAX, desc_file_path, fname);
         extent_file = NULL;
-        ret = bdrv_open(&extent_file, extent_path, NULL, NULL,
-                        bs->open_flags | BDRV_O_PROTOCOL, NULL, errp);
+
+        ret = snprintf(extent_opt_prefix, 32, "extents.%d", s->num_extents);
+        assert(ret < 32);
+
+        ret = bdrv_open_image(&extent_file, extent_path, options,
+                              extent_opt_prefix, bs, &child_file, false, errp);
         g_free(extent_path);
         if (ret) {
             return ret;
@@ -870,7 +854,8 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
             if (!buf) {
                 ret = -EINVAL;
             } else {
-                ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf, errp);
+                ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf,
+                                       options, errp);
             }
             g_free(buf);
             if (ret) {
@@ -898,7 +883,7 @@ next_line:
 }
 
 static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
-                               Error **errp)
+                               QDict *options, Error **errp)
 {
     int ret;
     char ct[128];
@@ -920,7 +905,7 @@ static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
     }
     s->create_type = g_strdup(ct);
     s->desc_offset = 0;
-    ret = vmdk_parse_extents(buf, bs, bs->file->exact_filename, errp);
+    ret = vmdk_parse_extents(buf, bs, bs->file->exact_filename, options, errp);
 exit:
     return ret;
 }
@@ -942,11 +927,11 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
     switch (magic) {
         case VMDK3_MAGIC:
         case VMDK4_MAGIC:
-            ret = vmdk_open_sparse(bs, bs->file, flags, buf, errp);
+            ret = vmdk_open_sparse(bs, bs->file, flags, buf, options, errp);
             s->desc_offset = 0x200;
             break;
         default:
-            ret = vmdk_open_desc_file(bs, flags, buf, errp);
+            ret = vmdk_open_desc_file(bs, flags, buf, options, errp);
             break;
     }
     if (ret) {
@@ -963,9 +948,9 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
     qemu_co_mutex_init(&s->lock);
 
     /* Disable migration when VMDK images are used */
-    error_set(&s->migration_blocker,
-              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-              "vmdk", bdrv_get_device_name(bs), "live migration");
+    error_setg(&s->migration_blocker, "The vmdk format used by node '%s' "
+               "does not support live migration",
+               bdrv_get_device_or_node_name(bs));
     migrate_add_blocker(s->migration_blocker);
     g_free(buf);
     return 0;
@@ -1705,12 +1690,12 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     /* write all the data */
     ret = bdrv_pwrite(bs, 0, &magic, sizeof(magic));
     if (ret < 0) {
-        error_set(errp, QERR_IO_ERROR);
+        error_setg(errp, QERR_IO_ERROR);
         goto exit;
     }
     ret = bdrv_pwrite(bs, sizeof(magic), &header, sizeof(header));
     if (ret < 0) {
-        error_set(errp, QERR_IO_ERROR);
+        error_setg(errp, QERR_IO_ERROR);
         goto exit;
     }
 
@@ -1730,7 +1715,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
                       gd_buf, gd_buf_size);
     if (ret < 0) {
-        error_set(errp, QERR_IO_ERROR);
+        error_setg(errp, QERR_IO_ERROR);
         goto exit;
     }
 
@@ -1742,7 +1727,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
                       gd_buf, gd_buf_size);
     if (ret < 0) {
-        error_set(errp, QERR_IO_ERROR);
+        error_setg(errp, QERR_IO_ERROR);
         goto exit;
     }
 
diff --git a/block/vpc.c b/block/vpc.c
index 8ab30d600..3e385d9fb 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -328,9 +328,9 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
     qemu_co_mutex_init(&s->lock);
 
     /* Disable migration when VHD images are used */
-    error_set(&s->migration_blocker,
-              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-              "vpc", bdrv_get_device_name(bs), "live migration");
+    error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
+               "does not support live migration",
+               bdrv_get_device_or_node_name(bs));
     migrate_add_blocker(s->migration_blocker);
 
     return 0;
diff --git a/block/vvfat.c b/block/vvfat.c
index 9be632f40..206869712 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -30,6 +30,7 @@
 #include "migration/migration.h"
 #include "qapi/qmp/qint.h"
 #include "qapi/qmp/qbool.h"
+#include "qapi/qmp/qstring.h"
 
 #ifndef S_IWGRP
 #define S_IWGRP 0
@@ -322,6 +323,7 @@ typedef struct BDRVVVFATState {
 
     int fat_type; /* 16 or 32 */
     array_t fat,directory,mapping;
+    char volume_label[11];
 
     unsigned int cluster_size;
     unsigned int sectors_per_cluster;
@@ -859,7 +861,7 @@ static int init_directories(BDRVVVFATState* s,
     {
         direntry_t* entry=array_get_next(&(s->directory));
         entry->attributes=0x28; /* archive | volume label */
-        memcpy(entry->name, "QEMU VVFAT ", sizeof(entry->name));
+        memcpy(entry->name, s->volume_label, sizeof(entry->name));
     }
 
     /* Now build FAT, and write back information into directory */
@@ -968,7 +970,8 @@ static int init_directories(BDRVVVFATState* s,
 
     bootsector->u.fat16.signature=0x29;
     bootsector->u.fat16.id=cpu_to_le32(0xfabe1afd);
-    memcpy(bootsector->u.fat16.volume_label,"QEMU VVFAT ",11);
+    memcpy(bootsector->u.fat16.volume_label, s->volume_label,
+           sizeof(bootsector->u.fat16.volume_label));
     memcpy(bootsector->fat_type,(s->fat_type==12?"FAT12   ":s->fat_type==16?"FAT16   ":"FAT32   "),8);
     bootsector->magic[0]=0x55; bootsector->magic[1]=0xaa;
 
@@ -1007,6 +1010,11 @@ static QemuOptsList runtime_opts = {
             .type = QEMU_OPT_BOOL,
             .help = "Create a floppy rather than a hard disk image",
         },
+        {
+            .name = "label",
+            .type = QEMU_OPT_STRING,
+            .help = "Use a volume label other than QEMU VVFAT",
+        },
         {
             .name = "rw",
             .type = QEMU_OPT_BOOL,
@@ -1059,8 +1067,8 @@ static void vvfat_parse_filename(const char *filename, QDict *options,
     /* Fill in the options QDict */
     qdict_put(options, "dir", qstring_from_str(filename));
     qdict_put(options, "fat-type", qint_from_int(fat_type));
-    qdict_put(options, "floppy", qbool_from_int(floppy));
-    qdict_put(options, "rw", qbool_from_int(rw));
+    qdict_put(options, "floppy", qbool_from_bool(floppy));
+    qdict_put(options, "rw", qbool_from_bool(rw));
 }
 
 static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
@@ -1069,7 +1077,7 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
     BDRVVVFATState *s = bs->opaque;
     int cyls, heads, secs;
     bool floppy;
-    const char *dirname;
+    const char *dirname, *label;
     QemuOpts *opts;
     Error *local_err = NULL;
     int ret;
@@ -1096,6 +1104,18 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
     s->fat_type = qemu_opt_get_number(opts, "fat-type", 0);
     floppy = qemu_opt_get_bool(opts, "floppy", false);
 
+    memset(s->volume_label, ' ', sizeof(s->volume_label));
+    label = qemu_opt_get(opts, "label");
+    if (label) {
+        size_t label_length = strlen(label);
+        if (label_length > 11) {
+            error_setg(errp, "vvfat label cannot be longer than 11 bytes");
+            ret = -EINVAL;
+            goto fail;
+        }
+        memcpy(s->volume_label, label, label_length);
+    }
+
     if (floppy) {
         /* 1.44MB or 2.88MB floppy. 2.88MB can be FAT12 (default) or FAT16. */
         if (!s->fat_type) {
@@ -1180,9 +1200,10 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
 
     /* Disable migration when vvfat is used rw */
     if (s->qcow) {
-        error_set(&s->migration_blocker,
-                  QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-                  "vvfat (rw)", bdrv_get_device_name(bs), "live migration");
+        error_setg(&s->migration_blocker,
+                   "The vvfat (rw) format used by node '%s' "
+                   "does not support live migration",
+                   bdrv_get_device_or_node_name(bs));
         migrate_add_blocker(s->migration_blocker);
     }
 
-- 
cgit v1.2.3