Diffstat (limited to 'block')
48 files changed, 10437 insertions(+), 3134 deletions(-)
diff --git a/block/Makefile.objs b/block/Makefile.objs index 4cf9aa499..fd88c03ec 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -1,8 +1,9 @@ -block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o +block-obj-y += raw_bsd.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-y += qed-check.o -block-obj-y += vhdx.o +block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o +block-obj-$(CONFIG_QUORUM) += quorum.o block-obj-y += parallels.o blkdebug.o blkverify.o block-obj-y += snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o @@ -10,8 +11,9 @@ block-obj-$(CONFIG_POSIX) += raw-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o ifeq ($(CONFIG_POSIX),y) -block-obj-y += nbd.o sheepdog.o +block-obj-y += nbd.o nbd-client.o sheepdog.o block-obj-$(CONFIG_LIBISCSI) += iscsi.o +block-obj-$(CONFIG_LIBNFS) += nfs.o block-obj-$(CONFIG_CURL) += curl.o block-obj-$(CONFIG_RBD) += rbd.o block-obj-$(CONFIG_GLUSTERFS) += gluster.o @@ -23,4 +25,15 @@ common-obj-y += commit.o common-obj-y += mirror.o common-obj-y += backup.o -$(obj)/curl.o: QEMU_CFLAGS+=$(CURL_CFLAGS) +iscsi.o-cflags := $(LIBISCSI_CFLAGS) +iscsi.o-libs := $(LIBISCSI_LIBS) +curl.o-cflags := $(CURL_CFLAGS) +curl.o-libs := $(CURL_LIBS) +rbd.o-cflags := $(RBD_CFLAGS) +rbd.o-libs := $(RBD_LIBS) +gluster.o-cflags := $(GLUSTERFS_CFLAGS) +gluster.o-libs := $(GLUSTERFS_LIBS) +ssh.o-cflags := $(LIBSSH2_CFLAGS) +ssh.o-libs := $(LIBSSH2_LIBS) +qcow.o-libs := -lz +linux-aio.o-libs := -laio diff --git a/block/backup.c b/block/backup.c index 6ae8a05a3..15a2e55e8 100644 --- a/block/backup.c +++ b/block/backup.c @@ -138,7 +138,8 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, if (buffer_is_zero(iov.iov_base, iov.iov_len)) { ret = bdrv_co_write_zeroes(job->target, - start * BACKUP_SECTORS_PER_CLUSTER, n); + start * BACKUP_SECTORS_PER_CLUSTER, + n, BDRV_REQ_MAY_UNMAP); } else { ret = bdrv_co_writev(job->target, start * BACKUP_SECTORS_PER_CLUSTER, n, @@ -180,8 +181,13 @@ static int coroutine_fn backup_before_write_notify( void *opaque) { BdrvTrackedRequest *req = opaque; + int64_t sector_num = req->offset >> BDRV_SECTOR_BITS; + int nb_sectors = req->bytes >> BDRV_SECTOR_BITS; - return backup_do_cow(req->bs, req->sector_num, req->nb_sectors, NULL); + assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + + return backup_do_cow(req->bs, sector_num, nb_sectors, NULL); } static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp) @@ -202,9 +208,9 @@ static void backup_iostatus_reset(BlockJob *job) bdrv_iostatus_reset(s->target); } -static const BlockJobType backup_job_type = { +static const BlockJobDriver backup_job_driver = { .instance_size = sizeof(BackupBlockJob), - .job_type = "backup", + .job_type = BLOCK_JOB_TYPE_BACKUP, .set_speed = backup_set_speed, .iostatus_reset = backup_iostatus_reset, }; @@ -272,9 +278,9 @@ static void coroutine_fn backup_run(void *opaque) uint64_t delay_ns = ratelimit_calculate_delay( &job->limit, job->sectors_read); job->sectors_read = 0; - block_job_sleep_ns(&job->common, rt_clock, delay_ns); + block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns); } else { - block_job_sleep_ns(&job->common, rt_clock, 0); + block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0); } if 
(block_job_is_cancelled(&job->common)) { @@ -289,14 +295,14 @@ static void coroutine_fn backup_run(void *opaque) * backing file. */ for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) { - /* bdrv_co_is_allocated() only returns true/false based - * on the first set of sectors it comes accross that + /* bdrv_is_allocated() only returns true/false based + * on the first set of sectors it comes across that * are are all in the same state. * For that reason we must verify each sector in the * backup cluster length. We end up copying more than * needed but at some point that is always the case. */ alloced = - bdrv_co_is_allocated(bs, + bdrv_is_allocated(bs, start * BACKUP_SECTORS_PER_CLUSTER + i, BACKUP_SECTORS_PER_CLUSTER - i, &n); i += n; @@ -338,7 +344,7 @@ static void coroutine_fn backup_run(void *opaque) hbitmap_free(job->bitmap); bdrv_iostatus_disable(target); - bdrv_delete(target); + bdrv_unref(target); block_job_completed(&job->common, ret); } @@ -370,7 +376,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, return; } - BackupBlockJob *job = block_job_create(&backup_job_type, bs, speed, + BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed, cb, opaque, errp); if (!job) { return; diff --git a/block/blkdebug.c b/block/blkdebug.c index ccb627ad9..380c73610 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -168,6 +168,7 @@ static const char *event_names[BLKDBG_EVENT_MAX] = { [BLKDBG_REFTABLE_LOAD] = "reftable_load", [BLKDBG_REFTABLE_GROW] = "reftable_grow", + [BLKDBG_REFTABLE_UPDATE] = "reftable_update", [BLKDBG_REFBLOCK_LOAD] = "refblock_load", [BLKDBG_REFBLOCK_UPDATE] = "refblock_update", @@ -185,6 +186,14 @@ static const char *event_names[BLKDBG_EVENT_MAX] = { [BLKDBG_FLUSH_TO_OS] = "flush_to_os", [BLKDBG_FLUSH_TO_DISK] = "flush_to_disk", + + [BLKDBG_PWRITEV_RMW_HEAD] = "pwritev_rmw.head", + [BLKDBG_PWRITEV_RMW_AFTER_HEAD] = "pwritev_rmw.after_head", + [BLKDBG_PWRITEV_RMW_TAIL] = "pwritev_rmw.tail", + [BLKDBG_PWRITEV_RMW_AFTER_TAIL] = "pwritev_rmw.after_tail", + [BLKDBG_PWRITEV] = "pwritev", + [BLKDBG_PWRITEV_ZERO] = "pwritev_zero", + [BLKDBG_PWRITEV_DONE] = "pwritev_done", }; static int get_event_by_name(const char *name, BlkDebugEvent *event) @@ -270,19 +279,33 @@ static void remove_rule(BlkdebugRule *rule) g_free(rule); } -static int read_config(BDRVBlkdebugState *s, const char *filename) +static int read_config(BDRVBlkdebugState *s, const char *filename, + QDict *options, Error **errp) { - FILE *f; + FILE *f = NULL; int ret; struct add_rule_data d; + Error *local_err = NULL; - f = fopen(filename, "r"); - if (f == NULL) { - return -errno; + if (filename) { + f = fopen(filename, "r"); + if (f == NULL) { + error_setg_errno(errp, errno, "Could not read blkdebug config file"); + return -errno; + } + + ret = qemu_config_parse(f, config_groups, filename); + if (ret < 0) { + error_setg(errp, "Could not parse blkdebug config file"); + ret = -EINVAL; + goto fail; + } } - ret = qemu_config_parse(f, config_groups, filename); - if (ret < 0) { + qemu_config_parse_qdict(options, config_groups, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; goto fail; } @@ -297,7 +320,9 @@ static int read_config(BDRVBlkdebugState *s, const char *filename) fail: qemu_opts_reset(&inject_error_opts); qemu_opts_reset(&set_state_opts); - fclose(f); + if (f) { + fclose(f); + } return ret; } @@ -309,7 +334,9 @@ static void blkdebug_parse_filename(const char *filename, QDict *options, /* Parse the blkdebug: prefix */ if (!strstart(filename, 
"blkdebug:", &filename)) { - error_setg(errp, "File name string must start with 'blkdebug:'"); + /* There was no prefix; therefore, all options have to be already + present in the QDict (except for the filename) */ + qdict_put(options, "x-image", qstring_from_str(filename)); return; } @@ -345,53 +372,68 @@ static QemuOptsList runtime_opts = { .type = QEMU_OPT_STRING, .help = "[internal use only, will be removed]", }, + { + .name = "align", + .type = QEMU_OPT_SIZE, + .help = "Required alignment in bytes", + }, { /* end of list */ } }, }; -static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags) +static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVBlkdebugState *s = bs->opaque; QemuOpts *opts; Error *local_err = NULL; - const char *filename, *config; + const char *config; + uint64_t align; int ret; - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; - goto fail; + goto out; } - /* Read rules from config file */ + /* Read rules from config file or command line options */ config = qemu_opt_get(opts, "config"); - if (config) { - ret = read_config(s, config); - if (ret < 0) { - goto fail; - } + ret = read_config(s, config, options, errp); + if (ret) { + goto out; } /* Set initial state */ s->state = 1; /* Open the backing file */ - filename = qemu_opt_get(opts, "x-image"); - if (filename == NULL) { - ret = -EINVAL; - goto fail; + assert(bs->file == NULL); + ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-image"), options, "image", + flags | BDRV_O_PROTOCOL, false, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto out; } - ret = bdrv_file_open(&bs->file, filename, NULL, flags); - if (ret < 0) { - goto fail; + /* Set request alignment */ + align = qemu_opt_get_size(opts, "align", bs->request_alignment); + if (align > 0 && align < INT_MAX && !(align & (align - 1))) { + bs->request_alignment = align; + } else { + error_setg(errp, "Invalid alignment"); + ret = -EINVAL; + goto fail_unref; } ret = 0; -fail: + goto out; + +fail_unref: + bdrv_unref(bs->file); +out: qemu_opts_del(opts); return ret; } @@ -590,9 +632,9 @@ static int blkdebug_debug_breakpoint(BlockDriverState *bs, const char *event, static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag) { BDRVBlkdebugState *s = bs->opaque; - BlkdebugSuspendedReq *r; + BlkdebugSuspendedReq *r, *next; - QLIST_FOREACH(r, &s->suspended_reqs, next) { + QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, next) { if (!strcmp(r->tag, tag)) { qemu_coroutine_enter(r->co, NULL); return 0; @@ -601,6 +643,31 @@ static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag) return -ENOENT; } +static int blkdebug_debug_remove_breakpoint(BlockDriverState *bs, + const char *tag) +{ + BDRVBlkdebugState *s = bs->opaque; + BlkdebugSuspendedReq *r, *r_next; + BlkdebugRule *rule, *next; + int i, ret = -ENOENT; + + for (i = 0; i < BLKDBG_EVENT_MAX; i++) { + QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) { + if (rule->action == ACTION_SUSPEND && + !strcmp(rule->options.suspend.tag, tag)) { + remove_rule(rule); + ret = 0; + } + } + } + QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, r_next) { + if (!strcmp(r->tag, tag)) { + qemu_coroutine_enter(r->co, NULL); + ret = 0; + } + } + return ret; +} 
static bool blkdebug_debug_is_suspended(BlockDriverState *bs, const char *tag) { @@ -635,6 +702,8 @@ static BlockDriver bdrv_blkdebug = { .bdrv_debug_event = blkdebug_debug_event, .bdrv_debug_breakpoint = blkdebug_debug_breakpoint, + .bdrv_debug_remove_breakpoint + = blkdebug_debug_remove_breakpoint, .bdrv_debug_resume = blkdebug_debug_resume, .bdrv_debug_is_suspended = blkdebug_debug_is_suspended, }; diff --git a/block/blkverify.c b/block/blkverify.c index 1d58cc393..e1c31171c 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -78,7 +78,9 @@ static void blkverify_parse_filename(const char *filename, QDict *options, /* Parse the blkverify: prefix */ if (!strstart(filename, "blkverify:", &filename)) { - error_setg(errp, "File name string must start with 'blkverify:'"); + /* There was no prefix; therefore, all options have to be already + present in the QDict (except for the filename) */ + qdict_put(options, "x-image", qstring_from_str(filename)); return; } @@ -116,46 +118,37 @@ static QemuOptsList runtime_opts = { }, }; -static int blkverify_open(BlockDriverState *bs, QDict *options, int flags) +static int blkverify_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVBlkverifyState *s = bs->opaque; QemuOpts *opts; Error *local_err = NULL; - const char *filename, *raw; int ret; - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } - /* Parse the raw image filename */ - raw = qemu_opt_get(opts, "x-raw"); - if (raw == NULL) { - ret = -EINVAL; - goto fail; - } - - ret = bdrv_file_open(&bs->file, raw, NULL, flags); + /* Open the raw file */ + assert(bs->file == NULL); + ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-raw"), options, + "raw", flags | BDRV_O_PROTOCOL, false, &local_err); if (ret < 0) { + error_propagate(errp, local_err); goto fail; } /* Open the test file */ - filename = qemu_opt_get(opts, "x-image"); - if (filename == NULL) { - ret = -EINVAL; - goto fail; - } - - s->test_file = bdrv_new(""); - ret = bdrv_open(s->test_file, filename, NULL, flags, NULL); + assert(s->test_file == NULL); + ret = bdrv_open_image(&s->test_file, qemu_opt_get(opts, "x-image"), options, + "test", flags, false, &local_err); if (ret < 0) { - bdrv_delete(s->test_file); + error_propagate(errp, local_err); s->test_file = NULL; goto fail; } @@ -169,7 +162,7 @@ static void blkverify_close(BlockDriverState *bs) { BDRVBlkverifyState *s = bs->opaque; - bdrv_delete(s->test_file); + bdrv_unref(s->test_file); s->test_file = NULL; } @@ -180,110 +173,6 @@ static int64_t blkverify_getlength(BlockDriverState *bs) return bdrv_getlength(s->test_file); } -/** - * Check that I/O vector contents are identical - * - * @a: I/O vector - * @b: I/O vector - * @ret: Offset to first mismatching byte or -1 if match - */ -static ssize_t blkverify_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) -{ - int i; - ssize_t offset = 0; - - assert(a->niov == b->niov); - for (i = 0; i < a->niov; i++) { - size_t len = 0; - uint8_t *p = (uint8_t *)a->iov[i].iov_base; - uint8_t *q = (uint8_t *)b->iov[i].iov_base; - - assert(a->iov[i].iov_len == b->iov[i].iov_len); - while (len < a->iov[i].iov_len && *p++ == *q++) { - len++; - } - - offset += len; - - if (len != a->iov[i].iov_len) { - return offset; - } - } - return -1; -} - -typedef 
struct { - int src_index; - struct iovec *src_iov; - void *dest_base; -} IOVectorSortElem; - -static int sortelem_cmp_src_base(const void *a, const void *b) -{ - const IOVectorSortElem *elem_a = a; - const IOVectorSortElem *elem_b = b; - - /* Don't overflow */ - if (elem_a->src_iov->iov_base < elem_b->src_iov->iov_base) { - return -1; - } else if (elem_a->src_iov->iov_base > elem_b->src_iov->iov_base) { - return 1; - } else { - return 0; - } -} - -static int sortelem_cmp_src_index(const void *a, const void *b) -{ - const IOVectorSortElem *elem_a = a; - const IOVectorSortElem *elem_b = b; - - return elem_a->src_index - elem_b->src_index; -} - -/** - * Copy contents of I/O vector - * - * The relative relationships of overlapping iovecs are preserved. This is - * necessary to ensure identical semantics in the cloned I/O vector. - */ -static void blkverify_iovec_clone(QEMUIOVector *dest, const QEMUIOVector *src, - void *buf) -{ - IOVectorSortElem sortelems[src->niov]; - void *last_end; - int i; - - /* Sort by source iovecs by base address */ - for (i = 0; i < src->niov; i++) { - sortelems[i].src_index = i; - sortelems[i].src_iov = &src->iov[i]; - } - qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_base); - - /* Allocate buffer space taking into account overlapping iovecs */ - last_end = NULL; - for (i = 0; i < src->niov; i++) { - struct iovec *cur = sortelems[i].src_iov; - ptrdiff_t rewind = 0; - - /* Detect overlap */ - if (last_end && last_end > cur->iov_base) { - rewind = last_end - cur->iov_base; - } - - sortelems[i].dest_base = buf - rewind; - buf += cur->iov_len - MIN(rewind, cur->iov_len); - last_end = MAX(cur->iov_base + cur->iov_len, last_end); - } - - /* Sort by source iovec index and build destination iovec */ - qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_index); - for (i = 0; i < src->niov; i++) { - qemu_iovec_add(dest, sortelems[i].dest_base, src->iov[i].iov_len); - } -} - static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, @@ -347,7 +236,7 @@ static void blkverify_aio_cb(void *opaque, int ret) static void blkverify_verify_readv(BlkverifyAIOCB *acb) { - ssize_t offset = blkverify_iovec_compare(acb->qiov, &acb->raw_qiov); + ssize_t offset = qemu_iovec_compare(acb->qiov, &acb->raw_qiov); if (offset != -1) { blkverify_err(acb, "contents mismatch in sector %" PRId64, acb->sector_num + (int64_t)(offset / BDRV_SECTOR_SIZE)); @@ -365,7 +254,7 @@ static BlockDriverAIOCB *blkverify_aio_readv(BlockDriverState *bs, acb->verify = blkverify_verify_readv; acb->buf = qemu_blockalign(bs->file, qiov->size); qemu_iovec_init(&acb->raw_qiov, acb->qiov->niov); - blkverify_iovec_clone(&acb->raw_qiov, qiov, acb->buf); + qemu_iovec_clone(&acb->raw_qiov, qiov, acb->buf); bdrv_aio_readv(s->test_file, sector_num, qiov, nb_sectors, blkverify_aio_cb, acb); @@ -399,6 +288,20 @@ static BlockDriverAIOCB *blkverify_aio_flush(BlockDriverState *bs, return bdrv_aio_flush(s->test_file, cb, opaque); } +static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate) +{ + BDRVBlkverifyState *s = bs->opaque; + + bool perm = bdrv_recurse_is_first_non_filter(bs->file, candidate); + + if (perm) { + return true; + } + + return bdrv_recurse_is_first_non_filter(s->test_file, candidate); +} + static BlockDriver bdrv_blkverify = { .format_name = "blkverify", .protocol_name = "blkverify", @@ -412,6 +315,9 @@ static BlockDriver bdrv_blkverify = { .bdrv_aio_readv = 
blkverify_aio_readv, .bdrv_aio_writev = blkverify_aio_writev, .bdrv_aio_flush = blkverify_aio_flush, + + .is_filter = true, + .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter, }; static void bdrv_blkverify_init(void) diff --git a/block/bochs.c b/block/bochs.c index d7078c077..eacf956e7 100644 --- a/block/bochs.c +++ b/block/bochs.c @@ -39,56 +39,41 @@ // not allocated: 0xffffffff // always little-endian -struct bochs_header_v1 { - char magic[32]; // "Bochs Virtual HD Image" - char type[16]; // "Redolog" - char subtype[16]; // "Undoable" / "Volatile" / "Growing" - uint32_t version; - uint32_t header; // size of header - - union { - struct { - uint32_t catalog; // num of entries - uint32_t bitmap; // bitmap size - uint32_t extent; // extent size - uint64_t disk; // disk size - char padding[HEADER_SIZE - 64 - 8 - 20]; - } redolog; - char padding[HEADER_SIZE - 64 - 8]; - } extra; -}; - -// always little-endian struct bochs_header { - char magic[32]; // "Bochs Virtual HD Image" - char type[16]; // "Redolog" - char subtype[16]; // "Undoable" / "Volatile" / "Growing" + char magic[32]; /* "Bochs Virtual HD Image" */ + char type[16]; /* "Redolog" */ + char subtype[16]; /* "Undoable" / "Volatile" / "Growing" */ uint32_t version; - uint32_t header; // size of header + uint32_t header; /* size of header */ + + uint32_t catalog; /* num of entries */ + uint32_t bitmap; /* bitmap size */ + uint32_t extent; /* extent size */ union { - struct { - uint32_t catalog; // num of entries - uint32_t bitmap; // bitmap size - uint32_t extent; // extent size - uint32_t reserved; // for ??? - uint64_t disk; // disk size - char padding[HEADER_SIZE - 64 - 8 - 24]; - } redolog; - char padding[HEADER_SIZE - 64 - 8]; + struct { + uint32_t reserved; /* for ??? 
*/ + uint64_t disk; /* disk size */ + char padding[HEADER_SIZE - 64 - 20 - 12]; + } QEMU_PACKED redolog; + struct { + uint64_t disk; /* disk size */ + char padding[HEADER_SIZE - 64 - 20 - 8]; + } QEMU_PACKED redolog_v1; + char padding[HEADER_SIZE - 64 - 20]; } extra; -}; +} QEMU_PACKED; typedef struct BDRVBochsState { CoMutex lock; uint32_t *catalog_bitmap; - int catalog_size; + uint32_t catalog_size; - int data_offset; + uint32_t data_offset; - int bitmap_blocks; - int extent_blocks; - int extent_size; + uint32_t bitmap_blocks; + uint32_t extent_blocks; + uint32_t extent_size; } BDRVBochsState; static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename) @@ -108,12 +93,12 @@ static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int bochs_open(BlockDriverState *bs, QDict *options, int flags) +static int bochs_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVBochsState *s = bs->opaque; - int i; + uint32_t i; struct bochs_header bochs; - struct bochs_header_v1 header_v1; int ret; bs->read_only = 1; // no write support yet @@ -128,17 +113,24 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags) strcmp(bochs.subtype, GROWING_TYPE) || ((le32_to_cpu(bochs.version) != HEADER_VERSION) && (le32_to_cpu(bochs.version) != HEADER_V1))) { - return -EMEDIUMTYPE; + error_setg(errp, "Image not in Bochs format"); + return -EINVAL; } if (le32_to_cpu(bochs.version) == HEADER_V1) { - memcpy(&header_v1, &bochs, sizeof(bochs)); - bs->total_sectors = le64_to_cpu(header_v1.extra.redolog.disk) / 512; + bs->total_sectors = le64_to_cpu(bochs.extra.redolog_v1.disk) / 512; } else { - bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512; + bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512; + } + + /* Limit to 1M entries to avoid unbounded allocation. This is what is + * needed for the largest image that bximage can create (~8 TB). 
*/ + s->catalog_size = le32_to_cpu(bochs.catalog); + if (s->catalog_size > 0x100000) { + error_setg(errp, "Catalog size is too large"); + return -EFBIG; } - s->catalog_size = le32_to_cpu(bochs.extra.redolog.catalog); s->catalog_bitmap = g_malloc(s->catalog_size * 4); ret = bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap, @@ -152,10 +144,34 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags) s->data_offset = le32_to_cpu(bochs.header) + (s->catalog_size * 4); - s->bitmap_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.bitmap) - 1) / 512; - s->extent_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.extent) - 1) / 512; + s->bitmap_blocks = 1 + (le32_to_cpu(bochs.bitmap) - 1) / 512; + s->extent_blocks = 1 + (le32_to_cpu(bochs.extent) - 1) / 512; - s->extent_size = le32_to_cpu(bochs.extra.redolog.extent); + s->extent_size = le32_to_cpu(bochs.extent); + if (s->extent_size < BDRV_SECTOR_SIZE) { + /* bximage actually never creates extents smaller than 4k */ + error_setg(errp, "Extent size must be at least 512"); + ret = -EINVAL; + goto fail; + } else if (!is_power_of_2(s->extent_size)) { + error_setg(errp, "Extent size %" PRIu32 " is not a power of two", + s->extent_size); + ret = -EINVAL; + goto fail; + } else if (s->extent_size > 0x800000) { + error_setg(errp, "Extent size %" PRIu32 " is too large", + s->extent_size); + ret = -EINVAL; + goto fail; + } + + if (s->catalog_size < DIV_ROUND_UP(bs->total_sectors, + s->extent_size / BDRV_SECTOR_SIZE)) + { + error_setg(errp, "Catalog size is too small for this disk size"); + ret = -EINVAL; + goto fail; + } qemu_co_mutex_init(&s->lock); return 0; @@ -168,8 +184,8 @@ fail: static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) { BDRVBochsState *s = bs->opaque; - int64_t offset = sector_num * 512; - int64_t extent_index, extent_offset, bitmap_offset; + uint64_t offset = sector_num * 512; + uint64_t extent_index, extent_offset, bitmap_offset; char bitmap_entry; // seek to sector @@ -180,8 +196,9 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) return -1; /* not allocated */ } - bitmap_offset = s->data_offset + (512 * s->catalog_bitmap[extent_index] * - (s->extent_blocks + s->bitmap_blocks)); + bitmap_offset = s->data_offset + + (512 * (uint64_t) s->catalog_bitmap[extent_index] * + (s->extent_blocks + s->bitmap_blocks)); /* read in bitmap for current extent */ if (bdrv_pread(bs->file, bitmap_offset + (extent_offset / 8), diff --git a/block/cloop.c b/block/cloop.c index 6ea7cf404..b6ad50fbb 100644 --- a/block/cloop.c +++ b/block/cloop.c @@ -26,6 +26,9 @@ #include "qemu/module.h" #include <zlib.h> +/* Maximum compressed block size */ +#define MAX_BLOCK_SIZE (64 * 1024 * 1024) + typedef struct BDRVCloopState { CoMutex lock; uint32_t block_size; @@ -53,7 +56,8 @@ static int cloop_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int cloop_open(BlockDriverState *bs, QDict *options, int flags) +static int cloop_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVCloopState *s = bs->opaque; uint32_t offsets_size, max_compressed_block_size = 1, i; @@ -67,6 +71,26 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags) return ret; } s->block_size = be32_to_cpu(s->block_size); + if (s->block_size % 512) { + error_setg(errp, "block_size %u must be a multiple of 512", + s->block_size); + return -EINVAL; + } + if (s->block_size == 0) { + error_setg(errp, "block_size cannot be zero"); + return -EINVAL; + } + + /* cloop's 
create_compressed_fs.c warns about block sizes beyond 256 KB but + * we can accept more. Prevent ridiculous values like 4 GB - 1 since we + * need a buffer this big. + */ + if (s->block_size > MAX_BLOCK_SIZE) { + error_setg(errp, "block_size %u must be %u MB or less", + s->block_size, + MAX_BLOCK_SIZE / (1024 * 1024)); + return -EINVAL; + } ret = bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4); if (ret < 0) { @@ -75,7 +99,23 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags) s->n_blocks = be32_to_cpu(s->n_blocks); /* read offsets */ - offsets_size = s->n_blocks * sizeof(uint64_t); + if (s->n_blocks > (UINT32_MAX - 1) / sizeof(uint64_t)) { + /* Prevent integer overflow */ + error_setg(errp, "n_blocks %u must be %zu or less", + s->n_blocks, + (UINT32_MAX - 1) / sizeof(uint64_t)); + return -EINVAL; + } + offsets_size = (s->n_blocks + 1) * sizeof(uint64_t); + if (offsets_size > 512 * 1024 * 1024) { + /* Prevent ridiculous offsets_size which causes memory allocation to + * fail or overflows bdrv_pread() size. In practice the 512 MB + * offsets[] limit supports 16 TB images at 256 KB block size. + */ + error_setg(errp, "image requires too many offsets, " + "try increasing block size"); + return -EINVAL; + } s->offsets = g_malloc(offsets_size); ret = bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size); @@ -83,13 +123,37 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } - for(i=0;i<s->n_blocks;i++) { + for (i = 0; i < s->n_blocks + 1; i++) { + uint64_t size; + s->offsets[i] = be64_to_cpu(s->offsets[i]); - if (i > 0) { - uint32_t size = s->offsets[i] - s->offsets[i - 1]; - if (size > max_compressed_block_size) { - max_compressed_block_size = size; - } + if (i == 0) { + continue; + } + + if (s->offsets[i] < s->offsets[i - 1]) { + error_setg(errp, "offsets not monotonically increasing at " + "index %u, image file is corrupt", i); + ret = -EINVAL; + goto fail; + } + + size = s->offsets[i] - s->offsets[i - 1]; + + /* Compressed blocks should be smaller than the uncompressed block size + * but maybe compression performed poorly so the compressed block is + * actually bigger. Clamp down on unrealistic values to prevent + * ridiculous s->compressed_block allocation. + */ + if (size > 2 * MAX_BLOCK_SIZE) { + error_setg(errp, "invalid compressed block size at index %u, " + "image file is corrupt", i); + ret = -EINVAL; + goto fail; + } + + if (size > max_compressed_block_size) { + max_compressed_block_size = size; } } @@ -179,9 +243,7 @@ static coroutine_fn int cloop_co_read(BlockDriverState *bs, int64_t sector_num, static void cloop_close(BlockDriverState *bs) { BDRVCloopState *s = bs->opaque; - if (s->n_blocks > 0) { - g_free(s->offsets); - } + g_free(s->offsets); g_free(s->compressed_block); g_free(s->uncompressed_block); inflateEnd(&s->zstream); diff --git a/block/commit.c b/block/commit.c index 2227fc2e6..acec4ac5a 100644 --- a/block/commit.c +++ b/block/commit.c @@ -103,14 +103,14 @@ wait: /* Note that even when no rate limit is applied we need to yield * with no pending I/O here so that bdrv_drain_all() returns. 
*/ - block_job_sleep_ns(&s->common, rt_clock, delay_ns); + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); if (block_job_is_cancelled(&s->common)) { break; } /* Copy if allocated above the base */ - ret = bdrv_co_is_allocated_above(top, base, sector_num, - COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE, - &n); + ret = bdrv_is_allocated_above(top, base, sector_num, + COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE, + &n); copy = (ret == 1); trace_commit_one_iteration(s, sector_num, n, ret); if (copy) { @@ -173,9 +173,9 @@ static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp) ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); } -static const BlockJobType commit_job_type = { +static const BlockJobDriver commit_job_driver = { .instance_size = sizeof(CommitBlockJob), - .job_type = "commit", + .job_type = BLOCK_JOB_TYPE_COMMIT, .set_speed = commit_set_speed, }; @@ -198,13 +198,7 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, return; } - /* Once we support top == active layer, remove this check */ - if (top == bs) { - error_setg(errp, - "Top image as the active layer is currently unsupported"); - return; - } - + assert(top != bs); if (top == base) { error_setg(errp, "Invalid files for merge: top and base are the same"); return; @@ -238,7 +232,7 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, } - s = block_job_create(&commit_job_type, bs, speed, cb, opaque, errp); + s = block_job_create(&commit_job_driver, bs, speed, cb, opaque, errp); if (!s) { return; } diff --git a/block/cow.c b/block/cow.c index 1cc2e89c7..30deb88de 100644 --- a/block/cow.c +++ b/block/cow.c @@ -58,7 +58,8 @@ static int cow_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int cow_open(BlockDriverState *bs, QDict *options, int flags) +static int cow_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVCowState *s = bs->opaque; struct cow_header_v2 cow_header; @@ -73,7 +74,8 @@ static int cow_open(BlockDriverState *bs, QDict *options, int flags) } if (be32_to_cpu(cow_header.magic) != COW_MAGIC) { - ret = -EMEDIUMTYPE; + error_setg(errp, "Image not in COW format"); + ret = -EINVAL; goto fail; } @@ -81,7 +83,7 @@ static int cow_open(BlockDriverState *bs, QDict *options, int flags) char version[64]; snprintf(version, sizeof(version), "COW version %d", cow_header.version); - qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, bs->device_name, "cow", version); ret = -ENOTSUP; goto fail; @@ -102,42 +104,45 @@ static int cow_open(BlockDriverState *bs, QDict *options, int flags) return ret; } -/* - * XXX(hch): right now these functions are extremely inefficient. - * We should just read the whole bitmap we'll need in one go instead. 
- */ -static inline int cow_set_bit(BlockDriverState *bs, int64_t bitnum) +static inline void cow_set_bits(uint8_t *bitmap, int start, int64_t nb_sectors) { - uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8; - uint8_t bitmap; - int ret; - - ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap)); - if (ret < 0) { - return ret; + int64_t bitnum = start, last = start + nb_sectors; + while (bitnum < last) { + if ((bitnum & 7) == 0 && bitnum + 8 <= last) { + bitmap[bitnum / 8] = 0xFF; + bitnum += 8; + continue; + } + bitmap[bitnum/8] |= (1 << (bitnum % 8)); + bitnum++; } +} - bitmap |= (1 << (bitnum % 8)); +#define BITS_PER_BITMAP_SECTOR (512 * 8) - ret = bdrv_pwrite_sync(bs->file, offset, &bitmap, sizeof(bitmap)); - if (ret < 0) { - return ret; - } - return 0; +/* Cannot use bitmap.c on big-endian machines. */ +static int cow_test_bit(int64_t bitnum, const uint8_t *bitmap) +{ + return (bitmap[bitnum / 8] & (1 << (bitnum & 7))) != 0; } -static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum) +static int cow_find_streak(const uint8_t *bitmap, int value, int start, int nb_sectors) { - uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8; - uint8_t bitmap; - int ret; - - ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap)); - if (ret < 0) { - return ret; + int streak_value = value ? 0xFF : 0; + int last = MIN(start + nb_sectors, BITS_PER_BITMAP_SECTOR); + int bitnum = start; + while (bitnum < last) { + if ((bitnum & 7) == 0 && bitmap[bitnum / 8] == streak_value) { + bitnum += 8; + continue; + } + if (cow_test_bit(bitnum, bitmap) == value) { + bitnum++; + continue; + } + break; } - - return !!(bitmap & (1 << (bitnum % 8))); + return MIN(bitnum, last) - start; } /* Return true if first block has been changed (ie. current version is @@ -146,40 +151,100 @@ static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum) static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *num_same) { - int changed; + int64_t bitnum = sector_num + sizeof(struct cow_header_v2) * 8; + uint64_t offset = (bitnum / 8) & -BDRV_SECTOR_SIZE; + bool first = true; + int changed = 0, same = 0; - if (nb_sectors == 0) { - *num_same = nb_sectors; - return 0; - } + do { + int ret; + uint8_t bitmap[BDRV_SECTOR_SIZE]; - changed = is_bit_set(bs, sector_num); - if (changed < 0) { - return 0; /* XXX: how to return I/O errors? */ - } + bitnum &= BITS_PER_BITMAP_SECTOR - 1; + int sector_bits = MIN(nb_sectors, BITS_PER_BITMAP_SECTOR - bitnum); - for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) { - if (is_bit_set(bs, sector_num + *num_same) != changed) - break; - } + ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap)); + if (ret < 0) { + return ret; + } + + if (first) { + changed = cow_test_bit(bitnum, bitmap); + first = false; + } + + same += cow_find_streak(bitmap, changed, bitnum, nb_sectors); + + bitnum += sector_bits; + nb_sectors -= sector_bits; + offset += BDRV_SECTOR_SIZE; + } while (nb_sectors); + *num_same = same; return changed; } +static int64_t coroutine_fn cow_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *num_same) +{ + BDRVCowState *s = bs->opaque; + int ret = cow_co_is_allocated(bs, sector_num, nb_sectors, num_same); + int64_t offset = s->cow_sectors_offset + (sector_num << BDRV_SECTOR_BITS); + if (ret < 0) { + return ret; + } + return (ret ? 
BDRV_BLOCK_DATA : 0) | offset | BDRV_BLOCK_OFFSET_VALID; +} + static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { - int error = 0; - int i; + int64_t bitnum = sector_num + sizeof(struct cow_header_v2) * 8; + uint64_t offset = (bitnum / 8) & -BDRV_SECTOR_SIZE; + bool first = true; + int sector_bits; + + for ( ; nb_sectors; + bitnum += sector_bits, + nb_sectors -= sector_bits, + offset += BDRV_SECTOR_SIZE) { + int ret, set; + uint8_t bitmap[BDRV_SECTOR_SIZE]; + + bitnum &= BITS_PER_BITMAP_SECTOR - 1; + sector_bits = MIN(nb_sectors, BITS_PER_BITMAP_SECTOR - bitnum); + + ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap)); + if (ret < 0) { + return ret; + } + + /* Skip over any already set bits */ + set = cow_find_streak(bitmap, 1, bitnum, sector_bits); + bitnum += set; + sector_bits -= set; + nb_sectors -= set; + if (!sector_bits) { + continue; + } - for (i = 0; i < nb_sectors; i++) { - error = cow_set_bit(bs, sector_num + i); - if (error) { - break; + if (first) { + ret = bdrv_flush(bs->file); + if (ret < 0) { + return ret; + } + first = false; + } + + cow_set_bits(bitmap, bitnum, sector_bits); + + ret = bdrv_pwrite(bs->file, offset, &bitmap, sizeof(bitmap)); + if (ret < 0) { + return ret; } } - return error; + return 0; } static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num, @@ -189,7 +254,11 @@ static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num, int ret, n; while (nb_sectors > 0) { - if (bdrv_co_is_allocated(bs, sector_num, nb_sectors, &n)) { + ret = cow_co_is_allocated(bs, sector_num, nb_sectors, &n); + if (ret < 0) { + return ret; + } + if (ret) { ret = bdrv_pread(bs->file, s->cow_sectors_offset + sector_num * 512, buf, n * 512); @@ -255,12 +324,14 @@ static void cow_close(BlockDriverState *bs) { } -static int cow_create(const char *filename, QEMUOptionParameter *options) +static int cow_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { struct cow_header_v2 cow_header; struct stat st; int64_t image_sectors = 0; const char *image_filename = NULL; + Error *local_err = NULL; int ret; BlockDriverState *cow_bs; @@ -274,13 +345,17 @@ static int cow_create(const char *filename, QEMUOptionParameter *options) options++; } - ret = bdrv_create_file(filename, options); + ret = bdrv_create_file(filename, options, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } - ret = bdrv_file_open(&cow_bs, filename, NULL, BDRV_O_RDWR); + cow_bs = NULL; + ret = bdrv_open(&cow_bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } @@ -314,7 +389,7 @@ static int cow_create(const char *filename, QEMUOptionParameter *options) } exit: - bdrv_delete(cow_bs); + bdrv_unref(cow_bs); return ret; } @@ -344,7 +419,7 @@ static BlockDriver bdrv_cow = { .bdrv_read = cow_co_read, .bdrv_write = cow_co_write, - .bdrv_co_is_allocated = cow_co_is_allocated, + .bdrv_co_get_block_status = cow_co_get_block_status, .create_options = cow_create_options, }; diff --git a/block/curl.c b/block/curl.c index 82d39ff53..1b9b1f634 100644 --- a/block/curl.c +++ b/block/curl.c @@ -34,6 +34,11 @@ #define DPRINTF(fmt, ...) 
do { } while (0) #endif +#if LIBCURL_VERSION_NUM >= 0x071000 +/* The multi interface timer callback was introduced in 7.16.0 */ +#define NEED_CURL_TIMER_CALLBACK +#endif + #define PROTOCOLS (CURLPROTO_HTTP | CURLPROTO_HTTPS | \ CURLPROTO_FTP | CURLPROTO_FTPS | \ CURLPROTO_TFTP) @@ -77,6 +82,7 @@ typedef struct CURLState typedef struct BDRVCURLState { CURLM *multi; + QEMUTimer timer; size_t len; CURLState states[CURL_NUM_STATES]; char *url; @@ -86,7 +92,23 @@ typedef struct BDRVCURLState { static void curl_clean_state(CURLState *s); static void curl_multi_do(void *arg); -static int curl_aio_flush(void *opaque); + +#ifdef NEED_CURL_TIMER_CALLBACK +static int curl_timer_cb(CURLM *multi, long timeout_ms, void *opaque) +{ + BDRVCURLState *s = opaque; + + DPRINTF("CURL: timer callback timeout_ms %ld\n", timeout_ms); + if (timeout_ms == -1) { + timer_del(&s->timer); + } else { + int64_t timeout_ns = (int64_t)timeout_ms * 1000 * 1000; + timer_mod(&s->timer, + qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ns); + } + return 0; +} +#endif static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, void *s, void *sp) @@ -94,17 +116,16 @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, fd); switch (action) { case CURL_POLL_IN: - qemu_aio_set_fd_handler(fd, curl_multi_do, NULL, curl_aio_flush, s); + qemu_aio_set_fd_handler(fd, curl_multi_do, NULL, s); break; case CURL_POLL_OUT: - qemu_aio_set_fd_handler(fd, NULL, curl_multi_do, curl_aio_flush, s); + qemu_aio_set_fd_handler(fd, NULL, curl_multi_do, s); break; case CURL_POLL_INOUT: - qemu_aio_set_fd_handler(fd, curl_multi_do, curl_multi_do, - curl_aio_flush, s); + qemu_aio_set_fd_handler(fd, curl_multi_do, curl_multi_do, s); break; case CURL_POLL_REMOVE: - qemu_aio_set_fd_handler(fd, NULL, NULL, NULL, NULL); + qemu_aio_set_fd_handler(fd, NULL, NULL, NULL); break; } @@ -136,6 +157,11 @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque) if (!s || !s->orig_buf) goto read_end; + if (s->buf_off >= s->buf_len) { + /* buffer full, read nothing */ + return 0; + } + realsize = MIN(realsize, s->buf_len - s->buf_off); memcpy(s->orig_buf + s->buf_off, ptr, realsize); s->buf_off += realsize; @@ -211,20 +237,10 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len, return FIND_RET_NONE; } -static void curl_multi_do(void *arg) +static void curl_multi_read(BDRVCURLState *s) { - BDRVCURLState *s = (BDRVCURLState *)arg; - int running; - int r; int msgs_in_queue; - if (!s->multi) - return; - - do { - r = curl_multi_socket_all(s->multi, &running); - } while(r == CURLM_CALL_MULTI_PERFORM); - /* Try to find done transfers, so we can free the easy * handle again. 
*/ do { @@ -268,6 +284,41 @@ static void curl_multi_do(void *arg) } while(msgs_in_queue); } +static void curl_multi_do(void *arg) +{ + BDRVCURLState *s = (BDRVCURLState *)arg; + int running; + int r; + + if (!s->multi) { + return; + } + + do { + r = curl_multi_socket_all(s->multi, &running); + } while(r == CURLM_CALL_MULTI_PERFORM); + + curl_multi_read(s); +} + +static void curl_multi_timeout_do(void *arg) +{ +#ifdef NEED_CURL_TIMER_CALLBACK + BDRVCURLState *s = (BDRVCURLState *)arg; + int running; + + if (!s->multi) { + return; + } + + curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running); + + curl_multi_read(s); +#else + abort(); +#endif +} + static CURLState *curl_init_state(BDRVCURLState *s) { CURLState *state = NULL; @@ -397,7 +448,8 @@ static QemuOptsList runtime_opts = { }, }; -static int curl_open(BlockDriverState *bs, QDict *options, int flags) +static int curl_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVCURLState *s = bs->opaque; CURLState *state = NULL; @@ -409,30 +461,27 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags) static int inited = 0; if (flags & BDRV_O_RDWR) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, - "curl block device does not support writes"); + error_setg(errp, "curl block device does not support writes"); return -EROFS; } - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); goto out_noclean; } s->readahead_size = qemu_opt_get_size(opts, "readahead", READ_AHEAD_SIZE); if ((s->readahead_size & 0x1ff) != 0) { - fprintf(stderr, "HTTP_READAHEAD_SIZE %zd is not a multiple of 512\n", - s->readahead_size); + error_setg(errp, "HTTP_READAHEAD_SIZE %zd is not a multiple of 512", + s->readahead_size); goto out_noclean; } file = qemu_opt_get(opts, "url"); if (file == NULL) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, "curl block driver requires " - "an 'url' option"); + error_setg(errp, "curl block driver requires an 'url' option"); goto out_noclean; } @@ -474,12 +523,20 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags) curl_easy_cleanup(state->curl); state->curl = NULL; + aio_timer_init(bdrv_get_aio_context(bs), &s->timer, + QEMU_CLOCK_REALTIME, SCALE_NS, + curl_multi_timeout_do, s); + // Now we know the file exists and its size, so let's // initialize the multi interface! s->multi = curl_multi_init(); curl_multi_setopt(s->multi, CURLMOPT_SOCKETDATA, s); curl_multi_setopt(s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb); +#ifdef NEED_CURL_TIMER_CALLBACK + curl_multi_setopt(s->multi, CURLMOPT_TIMERDATA, s); + curl_multi_setopt(s->multi, CURLMOPT_TIMERFUNCTION, curl_timer_cb); +#endif curl_multi_do(s); qemu_opts_del(opts); @@ -495,21 +552,6 @@ out_noclean: return -EINVAL; } -static int curl_aio_flush(void *opaque) -{ - BDRVCURLState *s = opaque; - int i, j; - - for (i=0; i < CURL_NUM_STATES; i++) { - for(j=0; j < CURL_NUM_ACB; j++) { - if (s->states[i].acb[j]) { - return 1; - } - } - } - return 0; -} - static void curl_aio_cancel(BlockDriverAIOCB *blockacb) { // Do we have to implement canceling? Seems to work without... 
@@ -589,12 +631,6 @@ static BlockDriverAIOCB *curl_aio_readv(BlockDriverState *bs, acb->nb_sectors = nb_sectors; acb->bh = qemu_bh_new(curl_readv_bh_cb, acb); - - if (!acb->bh) { - DPRINTF("CURL: qemu_bh_new failed\n"); - return NULL; - } - qemu_bh_schedule(acb->bh); return &acb->common; } @@ -619,6 +655,9 @@ static void curl_close(BlockDriverState *bs) } if (s->multi) curl_multi_cleanup(s->multi); + + timer_del(&s->timer); + g_free(s->url); } diff --git a/block/dmg.c b/block/dmg.c index 3141cb5b8..856402e1f 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -27,6 +27,14 @@ #include "qemu/module.h" #include <zlib.h> +enum { + /* Limit chunk sizes to prevent unreasonable amounts of memory being used + * or truncating when converting to 32-bit types + */ + DMG_LENGTHS_MAX = 64 * 1024 * 1024, /* 64 MB */ + DMG_SECTORCOUNTS_MAX = DMG_LENGTHS_MAX / 512, +}; + typedef struct BDRVDMGState { CoMutex lock; /* each chunk contains a certain number of sectors, @@ -92,12 +100,44 @@ static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result) return 0; } -static int dmg_open(BlockDriverState *bs, QDict *options, int flags) +/* Increase max chunk sizes, if necessary. This function is used to calculate + * the buffer sizes needed for compressed/uncompressed chunk I/O. + */ +static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, + uint32_t *max_compressed_size, + uint32_t *max_sectors_per_chunk) +{ + uint32_t compressed_size = 0; + uint32_t uncompressed_sectors = 0; + + switch (s->types[chunk]) { + case 0x80000005: /* zlib compressed */ + compressed_size = s->lengths[chunk]; + uncompressed_sectors = s->sectorcounts[chunk]; + break; + case 1: /* copy */ + uncompressed_sectors = (s->lengths[chunk] + 511) / 512; + break; + case 2: /* zero */ + uncompressed_sectors = s->sectorcounts[chunk]; + break; + } + + if (compressed_size > *max_compressed_size) { + *max_compressed_size = compressed_size; + } + if (uncompressed_sectors > *max_sectors_per_chunk) { + *max_sectors_per_chunk = uncompressed_sectors; + } +} + +static int dmg_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVDMGState *s = bs->opaque; - uint64_t info_begin,info_end,last_in_offset,last_out_offset; + uint64_t info_begin, info_end, last_in_offset, last_out_offset; uint32_t count, tmp; - uint32_t max_compressed_size=1,max_sectors_per_chunk=1,i; + uint32_t max_compressed_size = 1, max_sectors_per_chunk = 1, i; int64_t offset; int ret; @@ -159,37 +199,40 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } - if (type == 0x6d697368 && count >= 244) { - int new_size, chunk_count; + if (type == 0x6d697368 && count >= 244) { + size_t new_size; + uint32_t chunk_count; offset += 4; offset += 200; - chunk_count = (count-204)/40; - new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count); - s->types = g_realloc(s->types, new_size/2); - s->offsets = g_realloc(s->offsets, new_size); - s->lengths = g_realloc(s->lengths, new_size); - s->sectors = g_realloc(s->sectors, new_size); - s->sectorcounts = g_realloc(s->sectorcounts, new_size); + chunk_count = (count - 204) / 40; + new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count); + s->types = g_realloc(s->types, new_size / 2); + s->offsets = g_realloc(s->offsets, new_size); + s->lengths = g_realloc(s->lengths, new_size); + s->sectors = g_realloc(s->sectors, new_size); + s->sectorcounts = g_realloc(s->sectorcounts, new_size); for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) { ret = read_uint32(bs, offset, &s->types[i]); if 
(ret < 0) { goto fail; } - offset += 4; - if(s->types[i]!=0x80000005 && s->types[i]!=1 && s->types[i]!=2) { - if(s->types[i]==0xffffffff) { - last_in_offset = s->offsets[i-1]+s->lengths[i-1]; - last_out_offset = s->sectors[i-1]+s->sectorcounts[i-1]; - } - chunk_count--; - i--; - offset += 36; - continue; - } - offset += 4; + offset += 4; + if (s->types[i] != 0x80000005 && s->types[i] != 1 && + s->types[i] != 2) { + if (s->types[i] == 0xffffffff && i > 0) { + last_in_offset = s->offsets[i - 1] + s->lengths[i - 1]; + last_out_offset = s->sectors[i - 1] + + s->sectorcounts[i - 1]; + } + chunk_count--; + i--; + offset += 36; + continue; + } + offset += 4; ret = read_uint64(bs, offset, &s->sectors[i]); if (ret < 0) { @@ -204,6 +247,14 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags) } offset += 8; + if (s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { + error_report("sector count %" PRIu64 " for chunk %u is " + "larger than max (%u)", + s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); + ret = -EINVAL; + goto fail; + } + ret = read_uint64(bs, offset, &s->offsets[i]); if (ret < 0) { goto fail; @@ -217,19 +268,25 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags) } offset += 8; - if(s->lengths[i]>max_compressed_size) - max_compressed_size = s->lengths[i]; - if(s->sectorcounts[i]>max_sectors_per_chunk) - max_sectors_per_chunk = s->sectorcounts[i]; - } - s->n_chunks+=chunk_count; - } + if (s->lengths[i] > DMG_LENGTHS_MAX) { + error_report("length %" PRIu64 " for chunk %u is larger " + "than max (%u)", + s->lengths[i], i, DMG_LENGTHS_MAX); + ret = -EINVAL; + goto fail; + } + + update_max_chunk_size(s, i, &max_compressed_size, + &max_sectors_per_chunk); + } + s->n_chunks += chunk_count; + } } /* initialize zlib engine */ - s->compressed_chunk = g_malloc(max_compressed_size+1); - s->uncompressed_chunk = g_malloc(512*max_sectors_per_chunk); - if(inflateInit(&s->zstream) != Z_OK) { + s->compressed_chunk = g_malloc(max_compressed_size + 1); + s->uncompressed_chunk = g_malloc(512 * max_sectors_per_chunk); + if (inflateInit(&s->zstream) != Z_OK) { ret = -EINVAL; goto fail; } @@ -251,83 +308,82 @@ fail: } static inline int is_sector_in_chunk(BDRVDMGState* s, - uint32_t chunk_num,int sector_num) + uint32_t chunk_num, uint64_t sector_num) { - if(chunk_num>=s->n_chunks || s->sectors[chunk_num]>sector_num || - s->sectors[chunk_num]+s->sectorcounts[chunk_num]<=sector_num) - return 0; - else - return -1; + if (chunk_num >= s->n_chunks || s->sectors[chunk_num] > sector_num || + s->sectors[chunk_num] + s->sectorcounts[chunk_num] <= sector_num) { + return 0; + } else { + return -1; + } } -static inline uint32_t search_chunk(BDRVDMGState* s,int sector_num) +static inline uint32_t search_chunk(BDRVDMGState *s, uint64_t sector_num) { /* binary search */ - uint32_t chunk1=0,chunk2=s->n_chunks,chunk3; - while(chunk1!=chunk2) { - chunk3 = (chunk1+chunk2)/2; - if(s->sectors[chunk3]>sector_num) - chunk2 = chunk3; - else if(s->sectors[chunk3]+s->sectorcounts[chunk3]>sector_num) - return chunk3; - else - chunk1 = chunk3; + uint32_t chunk1 = 0, chunk2 = s->n_chunks, chunk3; + while (chunk1 != chunk2) { + chunk3 = (chunk1 + chunk2) / 2; + if (s->sectors[chunk3] > sector_num) { + chunk2 = chunk3; + } else if (s->sectors[chunk3] + s->sectorcounts[chunk3] > sector_num) { + return chunk3; + } else { + chunk1 = chunk3; + } } return s->n_chunks; /* error */ } -static inline int dmg_read_chunk(BlockDriverState *bs, int sector_num) +static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t 
sector_num) { BDRVDMGState *s = bs->opaque; - if(!is_sector_in_chunk(s,s->current_chunk,sector_num)) { - int ret; - uint32_t chunk = search_chunk(s,sector_num); - - if(chunk>=s->n_chunks) - return -1; - - s->current_chunk = s->n_chunks; - switch(s->types[chunk]) { - case 0x80000005: { /* zlib compressed */ - int i; - - /* we need to buffer, because only the chunk as whole can be - * inflated. */ - i=0; - do { - ret = bdrv_pread(bs->file, s->offsets[chunk] + i, - s->compressed_chunk+i, s->lengths[chunk]-i); - if(ret<0 && errno==EINTR) - ret=0; - i+=ret; - } while(ret>=0 && ret+i<s->lengths[chunk]); - - if (ret != s->lengths[chunk]) - return -1; - - s->zstream.next_in = s->compressed_chunk; - s->zstream.avail_in = s->lengths[chunk]; - s->zstream.next_out = s->uncompressed_chunk; - s->zstream.avail_out = 512*s->sectorcounts[chunk]; - ret = inflateReset(&s->zstream); - if(ret != Z_OK) - return -1; - ret = inflate(&s->zstream, Z_FINISH); - if(ret != Z_STREAM_END || s->zstream.total_out != 512*s->sectorcounts[chunk]) - return -1; - break; } - case 1: /* copy */ - ret = bdrv_pread(bs->file, s->offsets[chunk], + if (!is_sector_in_chunk(s, s->current_chunk, sector_num)) { + int ret; + uint32_t chunk = search_chunk(s, sector_num); + + if (chunk >= s->n_chunks) { + return -1; + } + + s->current_chunk = s->n_chunks; + switch (s->types[chunk]) { + case 0x80000005: { /* zlib compressed */ + /* we need to buffer, because only the chunk as whole can be + * inflated. */ + ret = bdrv_pread(bs->file, s->offsets[chunk], + s->compressed_chunk, s->lengths[chunk]); + if (ret != s->lengths[chunk]) { + return -1; + } + + s->zstream.next_in = s->compressed_chunk; + s->zstream.avail_in = s->lengths[chunk]; + s->zstream.next_out = s->uncompressed_chunk; + s->zstream.avail_out = 512 * s->sectorcounts[chunk]; + ret = inflateReset(&s->zstream); + if (ret != Z_OK) { + return -1; + } + ret = inflate(&s->zstream, Z_FINISH); + if (ret != Z_STREAM_END || + s->zstream.total_out != 512 * s->sectorcounts[chunk]) { + return -1; + } + break; } + case 1: /* copy */ + ret = bdrv_pread(bs->file, s->offsets[chunk], s->uncompressed_chunk, s->lengths[chunk]); - if (ret != s->lengths[chunk]) - return -1; - break; - case 2: /* zero */ - memset(s->uncompressed_chunk, 0, 512*s->sectorcounts[chunk]); - break; - } - s->current_chunk = chunk; + if (ret != s->lengths[chunk]) { + return -1; + } + break; + case 2: /* zero */ + memset(s->uncompressed_chunk, 0, 512 * s->sectorcounts[chunk]); + break; + } + s->current_chunk = chunk; } return 0; } @@ -338,12 +394,14 @@ static int dmg_read(BlockDriverState *bs, int64_t sector_num, BDRVDMGState *s = bs->opaque; int i; - for(i=0;i<nb_sectors;i++) { - uint32_t sector_offset_in_chunk; - if(dmg_read_chunk(bs, sector_num+i) != 0) - return -1; - sector_offset_in_chunk = sector_num+i-s->sectors[s->current_chunk]; - memcpy(buf+i*512,s->uncompressed_chunk+sector_offset_in_chunk*512,512); + for (i = 0; i < nb_sectors; i++) { + uint32_t sector_offset_in_chunk; + if (dmg_read_chunk(bs, sector_num + i) != 0) { + return -1; + } + sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk]; + memcpy(buf + i * 512, + s->uncompressed_chunk + sector_offset_in_chunk * 512, 512); } return 0; } @@ -375,12 +433,12 @@ static void dmg_close(BlockDriverState *bs) } static BlockDriver bdrv_dmg = { - .format_name = "dmg", - .instance_size = sizeof(BDRVDMGState), - .bdrv_probe = dmg_probe, - .bdrv_open = dmg_open, - .bdrv_read = dmg_co_read, - .bdrv_close = dmg_close, + .format_name = "dmg", + .instance_size = 
sizeof(BDRVDMGState), + .bdrv_probe = dmg_probe, + .bdrv_open = dmg_open, + .bdrv_read = dmg_co_read, + .bdrv_close = dmg_close, }; static void bdrv_dmg_init(void) diff --git a/block/gluster.c b/block/gluster.c index 645b7f12a..883608564 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -3,43 +3,26 @@ * * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com> * - * Pipe handling mechanism in AIO implementation is derived from - * block/rbd.c. Hence, + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. * - * Copyright (C) 2010-2011 Christian Brunner <chb@muc.de>, - * Josh Durgin <josh.durgin@dreamhost.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. */ #include <glusterfs/api/glfs.h> #include "block/block_int.h" -#include "qemu/sockets.h" #include "qemu/uri.h" typedef struct GlusterAIOCB { - BlockDriverAIOCB common; int64_t size; int ret; - bool *finished; QEMUBH *bh; + Coroutine *coroutine; } GlusterAIOCB; typedef struct BDRVGlusterState { struct glfs *glfs; - int fds[2]; struct glfs_fd *fd; - int qemu_aio_count; - int event_reader_pos; - GlusterAIOCB *event_acb; } BDRVGlusterState; -#define GLUSTER_FD_READ 0 -#define GLUSTER_FD_WRITE 1 - typedef struct GlusterConf { char *server; int port; @@ -50,11 +33,13 @@ typedef struct GlusterConf { static void qemu_gluster_gconf_free(GlusterConf *gconf) { - g_free(gconf->server); - g_free(gconf->volname); - g_free(gconf->image); - g_free(gconf->transport); - g_free(gconf); + if (gconf) { + g_free(gconf->server); + g_free(gconf->volname); + g_free(gconf->image); + g_free(gconf->transport); + g_free(gconf); + } } static int parse_volume_options(GlusterConf *gconf, char *path) @@ -95,7 +80,7 @@ static int parse_volume_options(GlusterConf *gconf, char *path) * 'server' specifies the server where the volume file specification for * the given volume resides. This can be either hostname, ipv4 address * or ipv6 address. ipv6 address needs to be within square brackets [ ]. - * If transport type is 'unix', then 'server' field should not be specifed. + * If transport type is 'unix', then 'server' field should not be specified. * The 'socket' field needs to be populated with the path to unix domain * socket. * @@ -132,7 +117,7 @@ static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename) } /* transport */ - if (!strcmp(uri->scheme, "gluster")) { + if (!uri->scheme || !strcmp(uri->scheme, "gluster")) { gconf->transport = g_strdup("tcp"); } else if (!strcmp(uri->scheme, "gluster+tcp")) { gconf->transport = g_strdup("tcp"); @@ -168,7 +153,7 @@ static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename) } gconf->server = g_strdup(qp->p[0].value); } else { - gconf->server = g_strdup(uri->server); + gconf->server = g_strdup(uri->server ? 
uri->server : "localhost"); gconf->port = uri->port; } @@ -180,7 +165,8 @@ out: return ret; } -static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename) +static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename, + Error **errp) { struct glfs *glfs = NULL; int ret; @@ -188,8 +174,8 @@ static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename) ret = qemu_gluster_parseuri(gconf, filename); if (ret < 0) { - error_report("Usage: file=gluster[+transport]://[server[:port]]/" - "volname/image[?socket=...]"); + error_setg(errp, "Usage: file=gluster[+transport]://[server[:port]]/" + "volname/image[?socket=...]"); errno = -ret; goto out; } @@ -216,9 +202,11 @@ static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename) ret = glfs_init(glfs); if (ret) { - error_report("Gluster connection failed for server=%s port=%d " - "volume=%s image=%s transport=%s", gconf->server, gconf->port, - gconf->volname, gconf->image, gconf->transport); + error_setg_errno(errp, errno, + "Gluster connection failed for server=%s port=%d " + "volume=%s image=%s transport=%s", gconf->server, + gconf->port, gconf->volname, gconf->image, + gconf->transport); goto out; } return glfs; @@ -232,54 +220,32 @@ out: return NULL; } -static void qemu_gluster_complete_aio(GlusterAIOCB *acb, BDRVGlusterState *s) +static void qemu_gluster_complete_aio(void *opaque) { - int ret; - bool *finished = acb->finished; - BlockDriverCompletionFunc *cb = acb->common.cb; - void *opaque = acb->common.opaque; - - if (!acb->ret || acb->ret == acb->size) { - ret = 0; /* Success */ - } else if (acb->ret < 0) { - ret = acb->ret; /* Read/Write failed */ - } else { - ret = -EIO; /* Partial read/write - fail it */ - } + GlusterAIOCB *acb = (GlusterAIOCB *)opaque; - s->qemu_aio_count--; - qemu_aio_release(acb); - cb(opaque, ret); - if (finished) { - *finished = true; - } + qemu_bh_delete(acb->bh); + acb->bh = NULL; + qemu_coroutine_enter(acb->coroutine, NULL); } -static void qemu_gluster_aio_event_reader(void *opaque) +/* + * AIO callback routine called from GlusterFS thread. 
+ */ +static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) { - BDRVGlusterState *s = opaque; - ssize_t ret; - - do { - char *p = (char *)&s->event_acb; - - ret = read(s->fds[GLUSTER_FD_READ], p + s->event_reader_pos, - sizeof(s->event_acb) - s->event_reader_pos); - if (ret > 0) { - s->event_reader_pos += ret; - if (s->event_reader_pos == sizeof(s->event_acb)) { - s->event_reader_pos = 0; - qemu_gluster_complete_aio(s->event_acb, s); - } - } - } while (ret < 0 && errno == EINTR); -} + GlusterAIOCB *acb = (GlusterAIOCB *)arg; -static int qemu_gluster_aio_flush_cb(void *opaque) -{ - BDRVGlusterState *s = opaque; + if (!ret || ret == acb->size) { + acb->ret = 0; /* Success */ + } else if (ret < 0) { + acb->ret = ret; /* Read/Write failed */ + } else { + acb->ret = -EIO; /* Partial read/write - fail it */ + } - return (s->qemu_aio_count > 0); + acb->bh = qemu_bh_new(qemu_gluster_complete_aio, acb); + qemu_bh_schedule(acb->bh); } /* TODO Convert to fine grained options */ @@ -296,60 +262,57 @@ static QemuOptsList runtime_opts = { }, }; +static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags) +{ + assert(open_flags != NULL); + + *open_flags |= O_BINARY; + + if (bdrv_flags & BDRV_O_RDWR) { + *open_flags |= O_RDWR; + } else { + *open_flags |= O_RDONLY; + } + + if ((bdrv_flags & BDRV_O_NOCACHE)) { + *open_flags |= O_DIRECT; + } +} + static int qemu_gluster_open(BlockDriverState *bs, QDict *options, - int bdrv_flags) + int bdrv_flags, Error **errp) { BDRVGlusterState *s = bs->opaque; - int open_flags = O_BINARY; + int open_flags = 0; int ret = 0; GlusterConf *gconf = g_malloc0(sizeof(GlusterConf)); QemuOpts *opts; Error *local_err = NULL; const char *filename; - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto out; } filename = qemu_opt_get(opts, "filename"); - - s->glfs = qemu_gluster_init(gconf, filename); + s->glfs = qemu_gluster_init(gconf, filename, errp); if (!s->glfs) { ret = -errno; goto out; } - if (bdrv_flags & BDRV_O_RDWR) { - open_flags |= O_RDWR; - } else { - open_flags |= O_RDONLY; - } - - if ((bdrv_flags & BDRV_O_NOCACHE)) { - open_flags |= O_DIRECT; - } + qemu_gluster_parse_flags(bdrv_flags, &open_flags); s->fd = glfs_open(s->glfs, gconf->image, open_flags); if (!s->fd) { ret = -errno; - goto out; } - ret = qemu_pipe(s->fds); - if (ret < 0) { - ret = -errno; - goto out; - } - fcntl(s->fds[GLUSTER_FD_READ], F_SETFL, O_NONBLOCK); - qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], - qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s); - out: qemu_opts_del(opts); qemu_gluster_gconf_free(gconf); @@ -365,24 +328,180 @@ out: return ret; } +typedef struct BDRVGlusterReopenState { + struct glfs *glfs; + struct glfs_fd *fd; +} BDRVGlusterReopenState; + + +static int qemu_gluster_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + int ret = 0; + BDRVGlusterReopenState *reop_s; + GlusterConf *gconf = NULL; + int open_flags = 0; + + assert(state != NULL); + assert(state->bs != NULL); + + state->opaque = g_malloc0(sizeof(BDRVGlusterReopenState)); + reop_s = state->opaque; + + qemu_gluster_parse_flags(state->flags, &open_flags); + + gconf = g_malloc0(sizeof(GlusterConf)); + + reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, 
errp); + if (reop_s->glfs == NULL) { + ret = -errno; + goto exit; + } + + reop_s->fd = glfs_open(reop_s->glfs, gconf->image, open_flags); + if (reop_s->fd == NULL) { + /* reop_s->glfs will be cleaned up in _abort */ + ret = -errno; + goto exit; + } + +exit: + /* state->opaque will be freed in either the _abort or _commit */ + qemu_gluster_gconf_free(gconf); + return ret; +} + +static void qemu_gluster_reopen_commit(BDRVReopenState *state) +{ + BDRVGlusterReopenState *reop_s = state->opaque; + BDRVGlusterState *s = state->bs->opaque; + + + /* close the old */ + if (s->fd) { + glfs_close(s->fd); + } + if (s->glfs) { + glfs_fini(s->glfs); + } + + /* use the newly opened image / connection */ + s->fd = reop_s->fd; + s->glfs = reop_s->glfs; + + g_free(state->opaque); + state->opaque = NULL; + + return; +} + + +static void qemu_gluster_reopen_abort(BDRVReopenState *state) +{ + BDRVGlusterReopenState *reop_s = state->opaque; + + if (reop_s == NULL) { + return; + } + + if (reop_s->fd) { + glfs_close(reop_s->fd); + } + + if (reop_s->glfs) { + glfs_fini(reop_s->glfs); + } + + g_free(state->opaque); + state->opaque = NULL; + + return; +} + +#ifdef CONFIG_GLUSTERFS_ZEROFILL +static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ + int ret; + GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); + BDRVGlusterState *s = bs->opaque; + off_t size = nb_sectors * BDRV_SECTOR_SIZE; + off_t offset = sector_num * BDRV_SECTOR_SIZE; + + acb->size = size; + acb->ret = 0; + acb->coroutine = qemu_coroutine_self(); + + ret = glfs_zerofill_async(s->fd, offset, size, &gluster_finish_aiocb, acb); + if (ret < 0) { + ret = -errno; + goto out; + } + + qemu_coroutine_yield(); + ret = acb->ret; + +out: + g_slice_free(GlusterAIOCB, acb); + return ret; +} + +static inline bool gluster_supports_zerofill(void) +{ + return 1; +} + +static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, + int64_t size) +{ + return glfs_zerofill(fd, offset, size); +} + +#else +static inline bool gluster_supports_zerofill(void) +{ + return 0; +} + +static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, + int64_t size) +{ + return 0; +} +#endif + static int qemu_gluster_create(const char *filename, - QEMUOptionParameter *options) + QEMUOptionParameter *options, Error **errp) { struct glfs *glfs; struct glfs_fd *fd; int ret = 0; + int prealloc = 0; int64_t total_size = 0; GlusterConf *gconf = g_malloc0(sizeof(GlusterConf)); - glfs = qemu_gluster_init(gconf, filename); + glfs = qemu_gluster_init(gconf, filename, errp); if (!glfs) { - ret = -errno; + ret = -EINVAL; goto out; } while (options && options->name) { if (!strcmp(options->name, BLOCK_OPT_SIZE)) { total_size = options->value.n / BDRV_SECTOR_SIZE; + } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) { + if (!options->value.s || !strcmp(options->value.s, "off")) { + prealloc = 0; + } else if (!strcmp(options->value.s, "full") && + gluster_supports_zerofill()) { + prealloc = 1; + } else { + error_setg(errp, "Invalid preallocation mode: '%s'" + " or GlusterFS doesn't support zerofill API", + options->value.s); + ret = -EINVAL; + goto out; + } } options++; } @@ -392,9 +511,15 @@ static int qemu_gluster_create(const char *filename, if (!fd) { ret = -errno; } else { - if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) { + if (!glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE)) { + if (prealloc && qemu_gluster_zerofill(fd, 0, + total_size * BDRV_SECTOR_SIZE)) { + ret =
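The qemu_gluster_reopen_prepare/commit/abort trio above follows the block layer's two-phase reopen contract: prepare() builds a second, fully opened connection without disturbing the live one; commit() swaps the pointers and tears down the old state; abort() tears down only the staged state. Reduced to its shape, with hypothetical names (Conn, conn_open, conn_close):

    /* Sketch of the two-phase reopen contract; the guest never observes
     * a half-open state on either path. */
    typedef struct Conn Conn;
    typedef struct ReopenState {
        Conn *old_conn;    /* live connection, untouched by prepare() */
        Conn *new_conn;    /* staged connection */
    } ReopenState;

    static int reopen_prepare(ReopenState *st, const char *filename, int flags)
    {
        st->new_conn = conn_open(filename, flags);  /* may fail safely */
        return st->new_conn ? 0 : -errno;
    }

    static void reopen_commit(ReopenState *st)
    {
        conn_close(st->old_conn);      /* point of no return */
        st->old_conn = st->new_conn;
        st->new_conn = NULL;
    }

    static void reopen_abort(ReopenState *st)
    {
        if (st->new_conn) {
            conn_close(st->new_conn);  /* old connection still intact */
            st->new_conn = NULL;
        }
    }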
-errno; + } + } else { ret = -errno; } + if (glfs_close(fd) != 0) { ret = -errno; } @@ -407,72 +532,18 @@ out: return ret; } -static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb) -{ - GlusterAIOCB *acb = (GlusterAIOCB *)blockacb; - bool finished = false; - - acb->finished = &finished; - while (!finished) { - qemu_aio_wait(); - } -} - -static const AIOCBInfo gluster_aiocb_info = { - .aiocb_size = sizeof(GlusterAIOCB), - .cancel = qemu_gluster_aio_cancel, -}; - -static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) -{ - GlusterAIOCB *acb = (GlusterAIOCB *)arg; - BlockDriverState *bs = acb->common.bs; - BDRVGlusterState *s = bs->opaque; - int retval; - - acb->ret = ret; - retval = qemu_write_full(s->fds[GLUSTER_FD_WRITE], &acb, sizeof(acb)); - if (retval != sizeof(acb)) { - /* - * Gluster AIO callback thread failed to notify the waiting - * QEMU thread about IO completion. - * - * Complete this IO request and make the disk inaccessible for - * subsequent reads and writes. - */ - error_report("Gluster failed to notify QEMU about IO completion"); - - qemu_mutex_lock_iothread(); /* We are in gluster thread context */ - acb->common.cb(acb->common.opaque, -EIO); - qemu_aio_release(acb); - s->qemu_aio_count--; - close(s->fds[GLUSTER_FD_READ]); - close(s->fds[GLUSTER_FD_WRITE]); - qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL, - NULL); - bs->drv = NULL; /* Make the disk inaccessible */ - qemu_mutex_unlock_iothread(); - } -} - -static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque, int write) +static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write) { int ret; - GlusterAIOCB *acb; + GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); BDRVGlusterState *s = bs->opaque; - size_t size; - off_t offset; - - offset = sector_num * BDRV_SECTOR_SIZE; - size = nb_sectors * BDRV_SECTOR_SIZE; - s->qemu_aio_count++; + size_t size = nb_sectors * BDRV_SECTOR_SIZE; + off_t offset = sector_num * BDRV_SECTOR_SIZE; - acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque); acb->size = size; acb->ret = 0; - acb->finished = NULL; + acb->coroutine = qemu_coroutine_self(); if (write) { ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0, @@ -483,14 +554,16 @@ static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs, } if (ret < 0) { + ret = -errno; goto out; } - return &acb->common; + + qemu_coroutine_yield(); + ret = acb->ret; out: - s->qemu_aio_count--; - qemu_aio_release(acb); - return NULL; + g_slice_free(GlusterAIOCB, acb); + return ret; } static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset) @@ -506,75 +579,68 @@ static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset) return 0; } -static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) +static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); + return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0); } -static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) +static coroutine_fn int 
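All of the converted gluster request functions share one shape: allocate a per-request ACB, record the current coroutine, fire the async call, yield, and return the result that the completion BH stored. A sketch reusing the hypothetical names from the earlier AIOCompletion example; submit_async() stands in for glfs_pwritev_async()/glfs_preadv_async()/glfs_fsync_async():

    static coroutine_fn int co_request(Conn *conn, off_t offset, size_t size)
    {
        AIOCompletion *c = g_slice_new(AIOCompletion);
        int ret;

        c->ret = 0;
        c->co = qemu_coroutine_self();

        ret = submit_async(conn, offset, size, library_cb, c);
        if (ret < 0) {
            ret = -errno;
            goto out;
        }

        qemu_coroutine_yield();        /* woken by completion_bh() */
        ret = c->ret;
    out:
        g_slice_free(AIOCompletion, c);
        return ret;
    }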
qemu_gluster_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); + return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1); } -static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs, - BlockDriverCompletionFunc *cb, void *opaque) +static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs) { int ret; - GlusterAIOCB *acb; + GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); BDRVGlusterState *s = bs->opaque; - acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque); acb->size = 0; acb->ret = 0; - acb->finished = NULL; - s->qemu_aio_count++; + acb->coroutine = qemu_coroutine_self(); ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb); if (ret < 0) { + ret = -errno; goto out; } - return &acb->common; + + qemu_coroutine_yield(); + ret = acb->ret; out: - s->qemu_aio_count--; - qemu_aio_release(acb); - return NULL; + g_slice_free(GlusterAIOCB, acb); + return ret; } #ifdef CONFIG_GLUSTERFS_DISCARD -static BlockDriverAIOCB *qemu_gluster_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BlockDriverCompletionFunc *cb, - void *opaque) +static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) { int ret; - GlusterAIOCB *acb; + GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); BDRVGlusterState *s = bs->opaque; - size_t size; - off_t offset; + size_t size = nb_sectors * BDRV_SECTOR_SIZE; + off_t offset = sector_num * BDRV_SECTOR_SIZE; - offset = sector_num * BDRV_SECTOR_SIZE; - size = nb_sectors * BDRV_SECTOR_SIZE; - - acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque); acb->size = 0; acb->ret = 0; - acb->finished = NULL; - s->qemu_aio_count++; + acb->coroutine = qemu_coroutine_self(); ret = glfs_discard_async(s->fd, offset, size, &gluster_finish_aiocb, acb); if (ret < 0) { + ret = -errno; goto out; } - return &acb->common; + + qemu_coroutine_yield(); + ret = acb->ret; out: - s->qemu_aio_count--; - qemu_aio_release(acb); - return NULL; + g_slice_free(GlusterAIOCB, acb); + return ret; } #endif @@ -609,10 +675,6 @@ static void qemu_gluster_close(BlockDriverState *bs) { BDRVGlusterState *s = bs->opaque; - close(s->fds[GLUSTER_FD_READ]); - close(s->fds[GLUSTER_FD_WRITE]); - qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL, NULL); - if (s->fd) { glfs_close(s->fd); s->fd = NULL; @@ -632,6 +694,11 @@ static QEMUOptionParameter qemu_gluster_create_options[] = { .type = OPT_SIZE, .help = "Virtual disk size" }, + { + .name = BLOCK_OPT_PREALLOC, + .type = OPT_STRING, + .help = "Preallocation mode (allowed values: off, full)" + }, { NULL } }; @@ -639,18 +706,25 @@ static BlockDriver bdrv_gluster = { .format_name = "gluster", .protocol_name = "gluster", .instance_size = sizeof(BDRVGlusterState), + .bdrv_needs_filename = true, .bdrv_file_open = qemu_gluster_open, + .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, + .bdrv_reopen_commit = qemu_gluster_reopen_commit, + .bdrv_reopen_abort = qemu_gluster_reopen_abort, .bdrv_close = qemu_gluster_close, .bdrv_create = qemu_gluster_create, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, - .bdrv_aio_readv = qemu_gluster_aio_readv, - .bdrv_aio_writev = qemu_gluster_aio_writev, - .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = 
qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_aio_discard = qemu_gluster_aio_discard, + .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif .create_options = qemu_gluster_create_options, }; @@ -659,18 +733,25 @@ static BlockDriver bdrv_gluster_tcp = { .format_name = "gluster", .protocol_name = "gluster+tcp", .instance_size = sizeof(BDRVGlusterState), + .bdrv_needs_filename = true, .bdrv_file_open = qemu_gluster_open, + .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, + .bdrv_reopen_commit = qemu_gluster_reopen_commit, + .bdrv_reopen_abort = qemu_gluster_reopen_abort, .bdrv_close = qemu_gluster_close, .bdrv_create = qemu_gluster_create, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, - .bdrv_aio_readv = qemu_gluster_aio_readv, - .bdrv_aio_writev = qemu_gluster_aio_writev, - .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_aio_discard = qemu_gluster_aio_discard, + .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif .create_options = qemu_gluster_create_options, }; @@ -679,18 +760,25 @@ static BlockDriver bdrv_gluster_unix = { .format_name = "gluster", .protocol_name = "gluster+unix", .instance_size = sizeof(BDRVGlusterState), + .bdrv_needs_filename = true, .bdrv_file_open = qemu_gluster_open, + .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, + .bdrv_reopen_commit = qemu_gluster_reopen_commit, + .bdrv_reopen_abort = qemu_gluster_reopen_abort, .bdrv_close = qemu_gluster_close, .bdrv_create = qemu_gluster_create, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, - .bdrv_aio_readv = qemu_gluster_aio_readv, - .bdrv_aio_writev = qemu_gluster_aio_writev, - .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_aio_discard = qemu_gluster_aio_discard, + .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif .create_options = qemu_gluster_create_options, }; @@ -699,18 +787,25 @@ static BlockDriver bdrv_gluster_rdma = { .format_name = "gluster", .protocol_name = "gluster+rdma", .instance_size = sizeof(BDRVGlusterState), + .bdrv_needs_filename = true, .bdrv_file_open = qemu_gluster_open, + .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, + .bdrv_reopen_commit = qemu_gluster_reopen_commit, + .bdrv_reopen_abort = qemu_gluster_reopen_abort, .bdrv_close = qemu_gluster_close, .bdrv_create = qemu_gluster_create, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, - .bdrv_aio_readv = qemu_gluster_aio_readv, - .bdrv_aio_writev = qemu_gluster_aio_writev, - .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_co_readv = 
qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_aio_discard = qemu_gluster_aio_discard, + .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif .create_options = qemu_gluster_create_options, }; diff --git a/block/iscsi.c b/block/iscsi.c index e7c1c2b53..f425573df 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -2,6 +2,7 @@ * QEMU Block driver for iSCSI images * * Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com> + * Copyright (c) 2012-2013 Peter Lieven <pl@kamp.de> * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -33,6 +34,8 @@ #include "trace.h" #include "block/scsi.h" #include "qemu/iov.h" +#include "sysemu/sysemu.h" +#include "qmp-commands.h" #include <iscsi/iscsi.h> #include <iscsi/scsi-lowlevel.h> @@ -50,8 +53,24 @@ typedef struct IscsiLun { uint64_t num_blocks; int events; QEMUTimer *nop_timer; + uint8_t lbpme; + uint8_t lbprz; + uint8_t has_write_same; + struct scsi_inquiry_logical_block_provisioning lbp; + struct scsi_inquiry_block_limits bl; + unsigned char *zeroblock; } IscsiLun; +typedef struct IscsiTask { + int status; + int complete; + int retries; + int do_retry; + struct scsi_task *task; + Coroutine *co; + QEMUBH *bh; +} IscsiTask; + typedef struct IscsiAIOCB { BlockDriverAIOCB common; QEMUIOVector *qiov; @@ -105,6 +124,50 @@ iscsi_schedule_bh(IscsiAIOCB *acb) qemu_bh_schedule(acb->bh); } +static void iscsi_co_generic_bh_cb(void *opaque) +{ + struct IscsiTask *iTask = opaque; + qemu_bh_delete(iTask->bh); + qemu_coroutine_enter(iTask->co, NULL); +} + +static void +iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *opaque) +{ + struct IscsiTask *iTask = opaque; + struct scsi_task *task = command_data; + + iTask->complete = 1; + iTask->status = status; + iTask->do_retry = 0; + iTask->task = task; + + if (iTask->retries-- > 0 && status == SCSI_STATUS_CHECK_CONDITION + && task->sense.key == SCSI_SENSE_UNIT_ATTENTION) { + error_report("iSCSI CheckCondition: %s", iscsi_get_error(iscsi)); + iTask->do_retry = 1; + goto out; + } + + if (status != SCSI_STATUS_GOOD) { + error_report("iSCSI Failure: %s", iscsi_get_error(iscsi)); + } + +out: + if (iTask->co) { + iTask->bh = qemu_bh_new(iscsi_co_generic_bh_cb, iTask); + qemu_bh_schedule(iTask->bh); + } +} + +static void iscsi_co_init_iscsitask(IscsiLun *iscsilun, struct IscsiTask *iTask) +{ + *iTask = (struct IscsiTask) { + .co = qemu_coroutine_self(), + .retries = ISCSI_CMD_RETRIES, + }; +} static void iscsi_abort_task_cb(struct iscsi_context *iscsi, int status, void *command_data, @@ -146,13 +209,6 @@ static const AIOCBInfo iscsi_aiocb_info = { static void iscsi_process_read(void *arg); static void iscsi_process_write(void *arg); -static int iscsi_process_flush(void *arg) -{ - IscsiLun *iscsilun = arg; - - return iscsi_queue_length(iscsilun->iscsi) > 0; -} - static void iscsi_set_events(IscsiLun *iscsilun) { @@ -166,7 +222,6 @@ iscsi_set_events(IscsiLun *iscsilun) qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), iscsi_process_read, (ev & POLLOUT) ? 
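iscsi_co_generic_cb() above centralizes the retry policy that the old per-command callbacks each duplicated: a CHECK CONDITION with the UNIT ATTENTION sense key (for instance after a target reset) is transient, so the request is replayed a bounded number of times. Every converted iscsi_co_* function then drives it with the same loop; an annotated fragment, where the hypothetical issue_task() stands in for the command-specific libiscsi call:

    iscsi_co_init_iscsitask(iscsilun, &iTask);  /* retries = ISCSI_CMD_RETRIES */
    retry:
        /* real callers pass command-specific arguments here */
        if (issue_task(iscsilun, iscsi_co_generic_cb, &iTask) == NULL) {
            return -ENOMEM;
        }
        while (!iTask.complete) {
            iscsi_set_events(iscsilun);  /* (re)arm read/write fd handlers */
            qemu_coroutine_yield();      /* resumed by the completion BH */
        }
        if (iTask.do_retry) {            /* transient UNIT ATTENTION */
            iTask.complete = 0;
            goto retry;
        }
        return iTask.status == SCSI_STATUS_GOOD ? 0 : -EIO;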
iscsi_process_write : NULL, - iscsi_process_flush, iscsilun); } @@ -194,44 +249,6 @@ iscsi_process_write(void *arg) iscsi_set_events(iscsilun); } -static int -iscsi_aio_writev_acb(IscsiAIOCB *acb); - -static void -iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status, - void *command_data, void *opaque) -{ - IscsiAIOCB *acb = opaque; - - trace_iscsi_aio_write16_cb(iscsi, status, acb, acb->canceled); - - g_free(acb->buf); - acb->buf = NULL; - - if (acb->canceled != 0) { - return; - } - - acb->status = 0; - if (status != 0) { - if (status == SCSI_STATUS_CHECK_CONDITION - && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION - && acb->retries-- > 0) { - scsi_free_scsi_task(acb->task); - acb->task = NULL; - if (iscsi_aio_writev_acb(acb) == 0) { - iscsi_set_events(acb->iscsilun); - return; - } - } - error_report("Failed to write16 data to iSCSI lun. %s", - iscsi_get_error(iscsi)); - acb->status = -EIO; - } - - iscsi_schedule_bh(acb); -} - static int64_t sector_lun2qemu(int64_t sector, IscsiLun *iscsilun) { return sector * iscsilun->block_size / BDRV_SECTOR_SIZE; @@ -256,406 +273,182 @@ static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors, return 1; } -static int -iscsi_aio_writev_acb(IscsiAIOCB *acb) +static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) { - struct iscsi_context *iscsi = acb->iscsilun->iscsi; - size_t size; - uint32_t num_sectors; + IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; uint64_t lba; -#if !defined(LIBISCSI_FEATURE_IOVECTOR) - struct iscsi_data data; -#endif - int ret; - - acb->canceled = 0; - acb->bh = NULL; - acb->status = -EINPROGRESS; - acb->buf = NULL; + uint32_t num_sectors; + uint8_t *data = NULL; + uint8_t *buf = NULL; - /* this will allow us to get rid of 'buf' completely */ - size = acb->nb_sectors * BDRV_SECTOR_SIZE; + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + return -EINVAL; + } + lba = sector_qemu2lun(sector_num, iscsilun); + num_sectors = sector_qemu2lun(nb_sectors, iscsilun); #if !defined(LIBISCSI_FEATURE_IOVECTOR) - data.size = MIN(size, acb->qiov->size); - /* if the iovec only contains one buffer we can pass it directly */ - if (acb->qiov->niov == 1) { - data.data = acb->qiov->iov[0].iov_base; + if (iov->niov == 1) { + data = iov->iov[0].iov_base; } else { - acb->buf = g_malloc(data.size); - qemu_iovec_to_buf(acb->qiov, 0, acb->buf, data.size); - data.data = acb->buf; + size_t size = MIN(nb_sectors * BDRV_SECTOR_SIZE, iov->size); + buf = g_malloc(size); + qemu_iovec_to_buf(iov, 0, buf, size); + data = buf; } #endif - - acb->task = malloc(sizeof(struct scsi_task)); - if (acb->task == NULL) { - error_report("iSCSI: Failed to allocate task for scsi WRITE16 " - "command. 
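Without LIBISCSI_FEATURE_IOVECTOR, the conversion above cannot hand the scatter list to libiscsi, so a multi-element iovec is flattened into one bounce buffer before WRITE(16); a single-element iovec is passed through untouched. The gather step in isolation (sketch of the hunk's logic):

    uint8_t *data, *bounce = NULL;

    if (iov->niov == 1) {
        data = iov->iov[0].iov_base;     /* zero-copy fast path */
    } else {
        size_t size = MIN(nb_sectors * BDRV_SECTOR_SIZE, iov->size);

        bounce = g_malloc(size);
        qemu_iovec_to_buf(iov, 0, bounce, size);  /* gather into one buffer */
        data = bounce;
    }
    /* ... issue the write and yield ... */
    g_free(bounce);                      /* g_free(NULL) is a no-op */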
%s", iscsi_get_error(iscsi)); - return -1; + iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: + iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba, + data, num_sectors * iscsilun->block_size, + iscsilun->block_size, 0, 0, 0, 0, 0, + iscsi_co_generic_cb, &iTask); + if (iTask.task == NULL) { + g_free(buf); + return -ENOMEM; } - memset(acb->task, 0, sizeof(struct scsi_task)); - - acb->task->xfer_dir = SCSI_XFER_WRITE; - acb->task->cdb_size = 16; - acb->task->cdb[0] = 0x8a; - lba = sector_qemu2lun(acb->sector_num, acb->iscsilun); - *(uint32_t *)&acb->task->cdb[2] = htonl(lba >> 32); - *(uint32_t *)&acb->task->cdb[6] = htonl(lba & 0xffffffff); - num_sectors = sector_qemu2lun(acb->nb_sectors, acb->iscsilun); - *(uint32_t *)&acb->task->cdb[10] = htonl(num_sectors); - acb->task->expxferlen = size; - #if defined(LIBISCSI_FEATURE_IOVECTOR) - ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task, - iscsi_aio_write16_cb, - NULL, - acb); -#else - ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task, - iscsi_aio_write16_cb, - &data, - acb); + scsi_task_set_iov_out(iTask.task, (struct scsi_iovec *) iov->iov, + iov->niov); #endif - if (ret != 0) { - scsi_free_scsi_task(acb->task); - g_free(acb->buf); - return -1; + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); } -#if defined(LIBISCSI_FEATURE_IOVECTOR) - scsi_task_set_iov_out(acb->task, (struct scsi_iovec*) acb->qiov->iov, acb->qiov->niov); -#endif - - return 0; -} - -static BlockDriverAIOCB * -iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque) -{ - IscsiLun *iscsilun = bs->opaque; - IscsiAIOCB *acb; - - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - return NULL; + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; } - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); - trace_iscsi_aio_writev(iscsilun->iscsi, sector_num, nb_sectors, opaque, acb); - - acb->iscsilun = iscsilun; - acb->qiov = qiov; - acb->nb_sectors = nb_sectors; - acb->sector_num = sector_num; - acb->retries = ISCSI_CMD_RETRIES; - - if (iscsi_aio_writev_acb(acb) != 0) { - qemu_aio_release(acb); - return NULL; + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; } - iscsi_set_events(iscsilun); - return &acb->common; -} - -static int -iscsi_aio_readv_acb(IscsiAIOCB *acb); - -static void -iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status, - void *command_data, void *opaque) -{ - IscsiAIOCB *acb = opaque; - - trace_iscsi_aio_read16_cb(iscsi, status, acb, acb->canceled); + g_free(buf); - if (acb->canceled != 0) { - return; - } - - acb->status = 0; - if (status != 0) { - if (status == SCSI_STATUS_CHECK_CONDITION - && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION - && acb->retries-- > 0) { - scsi_free_scsi_task(acb->task); - acb->task = NULL; - if (iscsi_aio_readv_acb(acb) == 0) { - iscsi_set_events(acb->iscsilun); - return; - } - } - error_report("Failed to read16 data from iSCSI lun. 
%s", - iscsi_get_error(iscsi)); - acb->status = -EIO; + if (iTask.status != SCSI_STATUS_GOOD) { + return -EIO; } - iscsi_schedule_bh(acb); + return 0; } -static int -iscsi_aio_readv_acb(IscsiAIOCB *acb) +static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) { - struct iscsi_context *iscsi = acb->iscsilun->iscsi; - size_t size; + IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; uint64_t lba; uint32_t num_sectors; - int ret; #if !defined(LIBISCSI_FEATURE_IOVECTOR) int i; #endif - acb->canceled = 0; - acb->bh = NULL; - acb->status = -EINPROGRESS; - acb->buf = NULL; - - size = acb->nb_sectors * BDRV_SECTOR_SIZE; - - acb->task = malloc(sizeof(struct scsi_task)); - if (acb->task == NULL) { - error_report("iSCSI: Failed to allocate task for scsi READ16 " - "command. %s", iscsi_get_error(iscsi)); - return -1; + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + return -EINVAL; } - memset(acb->task, 0, sizeof(struct scsi_task)); - acb->task->xfer_dir = SCSI_XFER_READ; - acb->task->expxferlen = size; - lba = sector_qemu2lun(acb->sector_num, acb->iscsilun); - num_sectors = sector_qemu2lun(acb->nb_sectors, acb->iscsilun); + lba = sector_qemu2lun(sector_num, iscsilun); + num_sectors = sector_qemu2lun(nb_sectors, iscsilun); - switch (acb->iscsilun->type) { + iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: + switch (iscsilun->type) { case TYPE_DISK: - acb->task->cdb_size = 16; - acb->task->cdb[0] = 0x88; - *(uint32_t *)&acb->task->cdb[2] = htonl(lba >> 32); - *(uint32_t *)&acb->task->cdb[6] = htonl(lba & 0xffffffff); - *(uint32_t *)&acb->task->cdb[10] = htonl(num_sectors); + iTask.task = iscsi_read16_task(iscsilun->iscsi, iscsilun->lun, lba, + num_sectors * iscsilun->block_size, + iscsilun->block_size, 0, 0, 0, 0, 0, + iscsi_co_generic_cb, &iTask); break; default: - acb->task->cdb_size = 10; - acb->task->cdb[0] = 0x28; - *(uint32_t *)&acb->task->cdb[2] = htonl(lba); - *(uint16_t *)&acb->task->cdb[7] = htons(num_sectors); + iTask.task = iscsi_read10_task(iscsilun->iscsi, iscsilun->lun, lba, + num_sectors * iscsilun->block_size, + iscsilun->block_size, +#if !defined(CONFIG_LIBISCSI_1_4) /* API change from 1.4.0 to 1.5.0 */ + 0, 0, 0, 0, 0, +#endif + iscsi_co_generic_cb, &iTask); break; } - - ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task, - iscsi_aio_read16_cb, - NULL, - acb); - if (ret != 0) { - scsi_free_scsi_task(acb->task); - return -1; + if (iTask.task == NULL) { + return -ENOMEM; } - #if defined(LIBISCSI_FEATURE_IOVECTOR) - scsi_task_set_iov_in(acb->task, (struct scsi_iovec*) acb->qiov->iov, acb->qiov->niov); + scsi_task_set_iov_in(iTask.task, (struct scsi_iovec *) iov->iov, iov->niov); #else - for (i = 0; i < acb->qiov->niov; i++) { - scsi_task_add_data_in_buffer(acb->task, - acb->qiov->iov[i].iov_len, - acb->qiov->iov[i].iov_base); + for (i = 0; i < iov->niov; i++) { + scsi_task_add_data_in_buffer(iTask.task, + iov->iov[i].iov_len, + iov->iov[i].iov_base); } #endif - return 0; -} -static BlockDriverAIOCB * -iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque) -{ - IscsiLun *iscsilun = bs->opaque; - IscsiAIOCB *acb; - - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - return NULL; + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); } - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); - trace_iscsi_aio_readv(iscsilun->iscsi, sector_num, 
nb_sectors, opaque, acb); - - acb->nb_sectors = nb_sectors; - acb->sector_num = sector_num; - acb->iscsilun = iscsilun; - acb->qiov = qiov; - acb->retries = ISCSI_CMD_RETRIES; - - if (iscsi_aio_readv_acb(acb) != 0) { - qemu_aio_release(acb); - return NULL; + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; } - iscsi_set_events(iscsilun); - return &acb->common; -} - -static int -iscsi_aio_flush_acb(IscsiAIOCB *acb); - -static void -iscsi_synccache10_cb(struct iscsi_context *iscsi, int status, - void *command_data, void *opaque) -{ - IscsiAIOCB *acb = opaque; - - if (acb->canceled != 0) { - return; + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; } - acb->status = 0; - if (status != 0) { - if (status == SCSI_STATUS_CHECK_CONDITION - && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION - && acb->retries-- > 0) { - scsi_free_scsi_task(acb->task); - acb->task = NULL; - if (iscsi_aio_flush_acb(acb) == 0) { - iscsi_set_events(acb->iscsilun); - return; - } - } - error_report("Failed to sync10 data on iSCSI lun. %s", - iscsi_get_error(iscsi)); - acb->status = -EIO; - } - - iscsi_schedule_bh(acb); -} - -static int -iscsi_aio_flush_acb(IscsiAIOCB *acb) -{ - struct iscsi_context *iscsi = acb->iscsilun->iscsi; - - acb->canceled = 0; - acb->bh = NULL; - acb->status = -EINPROGRESS; - acb->buf = NULL; - - acb->task = iscsi_synchronizecache10_task(iscsi, acb->iscsilun->lun, - 0, 0, 0, 0, - iscsi_synccache10_cb, - acb); - if (acb->task == NULL) { - error_report("iSCSI: Failed to send synchronizecache10 command. %s", - iscsi_get_error(iscsi)); - return -1; + if (iTask.status != SCSI_STATUS_GOOD) { + return -EIO; } return 0; } -static BlockDriverAIOCB * -iscsi_aio_flush(BlockDriverState *bs, - BlockDriverCompletionFunc *cb, void *opaque) +static int coroutine_fn iscsi_co_flush(BlockDriverState *bs) { IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; - IscsiAIOCB *acb; - - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); - - acb->iscsilun = iscsilun; - acb->retries = ISCSI_CMD_RETRIES; - - if (iscsi_aio_flush_acb(acb) != 0) { - qemu_aio_release(acb); - return NULL; + if (bs->sg) { + return 0; } - iscsi_set_events(iscsilun); - - return &acb->common; -} - -static int iscsi_aio_discard_acb(IscsiAIOCB *acb); + iscsi_co_init_iscsitask(iscsilun, &iTask); -static void -iscsi_unmap_cb(struct iscsi_context *iscsi, int status, - void *command_data, void *opaque) -{ - IscsiAIOCB *acb = opaque; - - if (acb->canceled != 0) { - return; +retry: + if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0, + 0, iscsi_co_generic_cb, &iTask) == NULL) { + return -ENOMEM; } - acb->status = 0; - if (status != 0) { - if (status == SCSI_STATUS_CHECK_CONDITION - && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION - && acb->retries-- > 0) { - scsi_free_scsi_task(acb->task); - acb->task = NULL; - if (iscsi_aio_discard_acb(acb) == 0) { - iscsi_set_events(acb->iscsilun); - return; - } - } - error_report("Failed to unmap data on iSCSI lun. 
%s", - iscsi_get_error(iscsi)); - acb->status = -EIO; + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); } - iscsi_schedule_bh(acb); -} - -static int iscsi_aio_discard_acb(IscsiAIOCB *acb) { - struct iscsi_context *iscsi = acb->iscsilun->iscsi; - struct unmap_list list[1]; - - acb->canceled = 0; - acb->bh = NULL; - acb->status = -EINPROGRESS; - acb->buf = NULL; - - list[0].lba = sector_qemu2lun(acb->sector_num, acb->iscsilun); - list[0].num = acb->nb_sectors * BDRV_SECTOR_SIZE / acb->iscsilun->block_size; - - acb->task = iscsi_unmap_task(iscsi, acb->iscsilun->lun, - 0, 0, &list[0], 1, - iscsi_unmap_cb, - acb); - if (acb->task == NULL) { - error_report("iSCSI: Failed to send unmap command. %s", - iscsi_get_error(iscsi)); - return -1; + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; } - return 0; -} - -static BlockDriverAIOCB * -iscsi_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) -{ - IscsiLun *iscsilun = bs->opaque; - IscsiAIOCB *acb; - - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); - - acb->iscsilun = iscsilun; - acb->nb_sectors = nb_sectors; - acb->sector_num = sector_num; - acb->retries = ISCSI_CMD_RETRIES; - - if (iscsi_aio_discard_acb(acb) != 0) { - qemu_aio_release(acb); - return NULL; + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; } - iscsi_set_events(iscsilun); + if (iTask.status != SCSI_STATUS_GOOD) { + return -EIO; + } - return &acb->common; + return 0; } #ifdef __linux__ @@ -850,7 +643,234 @@ iscsi_getlength(BlockDriverState *bs) return len; } -static int parse_chap(struct iscsi_context *iscsi, const char *target) +#if defined(LIBISCSI_FEATURE_IOVECTOR) + +static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + IscsiLun *iscsilun = bs->opaque; + struct scsi_get_lba_status *lbas = NULL; + struct scsi_lba_status_descriptor *lbasd = NULL; + struct IscsiTask iTask; + int64_t ret; + + iscsi_co_init_iscsitask(iscsilun, &iTask); + + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + ret = -EINVAL; + goto out; + } + + /* default to all sectors allocated */ + ret = BDRV_BLOCK_DATA; + ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID; + *pnum = nb_sectors; + + /* LUN does not support logical block provisioning */ + if (iscsilun->lbpme == 0) { + goto out; + } + +retry: + if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun, + sector_qemu2lun(sector_num, iscsilun), + 8 + 16, iscsi_co_generic_cb, + &iTask) == NULL) { + ret = -ENOMEM; + goto out; + } + + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); + } + + if (iTask.do_retry) { + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; + } + iTask.complete = 0; + goto retry; + } + + if (iTask.status != SCSI_STATUS_GOOD) { + /* in case the get_lba_status_callout fails (i.e. 
+ * because the device is busy or the cmd is not + * supported), we pretend all blocks are allocated + * for backwards compatibility */ + goto out; + } + + lbas = scsi_datain_unmarshall(iTask.task); + if (lbas == NULL) { + ret = -EIO; + goto out; + } + + lbasd = &lbas->descriptors[0]; + + if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) { + ret = -EIO; + goto out; + } + + *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun); + if (*pnum > nb_sectors) { + *pnum = nb_sectors; + } + + if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED || + lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) { + ret &= ~BDRV_BLOCK_DATA; + if (iscsilun->lbprz) { + ret |= BDRV_BLOCK_ZERO; + } + } + +out: + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + } + return ret; +} + +#endif /* LIBISCSI_FEATURE_IOVECTOR */ + +static int +coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; + struct unmap_list list; + + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + return -EINVAL; + } + + if (!iscsilun->lbp.lbpu) { + /* UNMAP is not supported by the target */ + return 0; + } + + list.lba = sector_qemu2lun(sector_num, iscsilun); + list.num = sector_qemu2lun(nb_sectors, iscsilun); + + iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: + if (iscsi_unmap_task(iscsilun->iscsi, iscsilun->lun, 0, 0, &list, 1, + iscsi_co_generic_cb, &iTask) == NULL) { + return -ENOMEM; + } + + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); + } + + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; + } + + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; + } + + if (iTask.status == SCSI_STATUS_CHECK_CONDITION) { + /* the target might fail with a check condition if it + is not happy with the alignment of the UNMAP request; + we silently fail in this case */ + return 0; + } + + if (iTask.status != SCSI_STATUS_GOOD) { + return -EIO; + } + + return 0; +} + +#if defined(SCSI_SENSE_ASCQ_CAPACITY_DATA_HAS_CHANGED) + +static int +coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; + uint64_t lba; + uint32_t nb_blocks; + + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + return -EINVAL; + } + + if (!(flags & BDRV_REQ_MAY_UNMAP) && !iscsilun->has_write_same) { + /* WRITE SAME without UNMAP is not supported by the target */ + return -ENOTSUP; + } + + if ((flags & BDRV_REQ_MAY_UNMAP) && !iscsilun->lbp.lbpws) { + /* WRITE SAME with UNMAP is not supported by the target */ + return -ENOTSUP; + } + + lba = sector_qemu2lun(sector_num, iscsilun); + nb_blocks = sector_qemu2lun(nb_sectors, iscsilun); + + if (iscsilun->zeroblock == NULL) { + iscsilun->zeroblock = g_malloc0(iscsilun->block_size); + } + + iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: + if (iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba, + iscsilun->zeroblock, iscsilun->block_size, + nb_blocks, 0, !!(flags & BDRV_REQ_MAY_UNMAP), + 0, 0, iscsi_co_generic_cb, &iTask) == NULL) { + return -ENOMEM; + } + + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); + } + + if (iTask.status == SCSI_STATUS_CHECK_CONDITION && + iTask.task->sense.key == SCSI_SENSE_ILLEGAL_REQUEST && + (iTask.task->sense.ascq == SCSI_SENSE_ASCQ_INVALID_OPERATION_CODE || + iTask.task->sense.ascq ==
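The WRITE SAME path above never allocates a request-sized buffer: the command transfers exactly one logical block and the target replicates it across the whole LBA range, so a single lazily allocated block of zeroes is enough for any request size. The relevant two steps, as in the hunk:

    if (iscsilun->zeroblock == NULL) {
        iscsilun->zeroblock = g_malloc0(iscsilun->block_size);  /* one block */
    }
    /* WRITE SAME(16) sends this single block; the target writes it to each
     * of nb_blocks LBAs, optionally unmapping when MAY_UNMAP is set. */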
SCSI_SENSE_ASCQ_INVALID_FIELD_IN_CDB)) { + /* WRITE SAME is not supported by the target */ + iscsilun->has_write_same = false; + scsi_free_scsi_task(iTask.task); + return -ENOTSUP; + } + + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; + } + + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; + } + + if (iTask.status != SCSI_STATUS_GOOD) { + return -EIO; + } + + return 0; +} + +#endif /* SCSI_SENSE_ASCQ_CAPACITY_DATA_HAS_CHANGED */ + +static void parse_chap(struct iscsi_context *iscsi, const char *target, + Error **errp) { QemuOptsList *list; QemuOpts *opts; @@ -859,37 +879,35 @@ static int parse_chap(struct iscsi_context *iscsi, const char *target) list = qemu_find_opts("iscsi"); if (!list) { - return 0; + return; } opts = qemu_opts_find(list, target); if (opts == NULL) { opts = QTAILQ_FIRST(&list->head); if (!opts) { - return 0; + return; } } user = qemu_opt_get(opts, "user"); if (!user) { - return 0; + return; } password = qemu_opt_get(opts, "password"); if (!password) { - error_report("CHAP username specified but no password was given"); - return -1; + error_setg(errp, "CHAP username specified but no password was given"); + return; } if (iscsi_set_initiator_username_pwd(iscsi, user, password)) { - error_report("Failed to set initiator username and password"); - return -1; + error_setg(errp, "Failed to set initiator username and password"); } - - return 0; } -static void parse_header_digest(struct iscsi_context *iscsi, const char *target) +static void parse_header_digest(struct iscsi_context *iscsi, const char *target, + Error **errp) { QemuOptsList *list; QemuOpts *opts; @@ -922,7 +940,7 @@ static void parse_header_digest(struct iscsi_context *iscsi, const char *target) } else if (!strcmp(digest, "NONE-CRC32C")) { iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C); } else { - error_report("Invalid header-digest setting : %s", digest); + error_setg(errp, "Invalid header-digest setting : %s", digest); } } @@ -930,8 +948,9 @@ static char *parse_initiator_name(const char *target) { QemuOptsList *list; QemuOpts *opts; - const char *name = NULL; - const char *iscsi_name = qemu_get_vm_name(); + const char *name; + char *iscsi_name; + UuidInfo *uuid_info; list = qemu_find_opts("iscsi"); if (list) { @@ -941,16 +960,22 @@ static char *parse_initiator_name(const char *target) } if (opts) { name = qemu_opt_get(opts, "initiator-name"); + if (name) { + return g_strdup(name); + } } } - if (name) { - return g_strdup(name); + uuid_info = qmp_query_uuid(NULL); + if (strcmp(uuid_info->UUID, UUID_NONE) == 0) { + name = qemu_get_vm_name(); } else { - return g_strdup_printf("iqn.2008-11.org.linux-kvm%s%s", - iscsi_name ? ":" : "", - iscsi_name ? iscsi_name : ""); + name = uuid_info->UUID; } + iscsi_name = g_strdup_printf("iqn.2008-11.org.linux-kvm%s%s", + name ? ":" : "", name ? 
name : ""); + qapi_free_UuidInfo(uuid_info); + return iscsi_name; } #if defined(LIBISCSI_FEATURE_NOP_COUNTER) @@ -968,17 +993,16 @@ static void iscsi_nop_timed_event(void *opaque) return; } - qemu_mod_timer(iscsilun->nop_timer, qemu_get_clock_ms(rt_clock) + NOP_INTERVAL); + timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); iscsi_set_events(iscsilun); } #endif -static int iscsi_readcapacity_sync(IscsiLun *iscsilun) +static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp) { struct scsi_task *task = NULL; struct scsi_readcapacity10 *rc10 = NULL; struct scsi_readcapacity16 *rc16 = NULL; - int ret = 0; int retries = ISCSI_CMD_RETRIES; do { @@ -993,11 +1017,12 @@ static int iscsi_readcapacity_sync(IscsiLun *iscsilun) if (task != NULL && task->status == SCSI_STATUS_GOOD) { rc16 = scsi_datain_unmarshall(task); if (rc16 == NULL) { - error_report("iSCSI: Failed to unmarshall readcapacity16 data."); - ret = -EINVAL; + error_setg(errp, "iSCSI: Failed to unmarshall readcapacity16 data."); } else { iscsilun->block_size = rc16->block_length; iscsilun->num_blocks = rc16->returned_lba + 1; + iscsilun->lbpme = rc16->lbpme; + iscsilun->lbprz = rc16->lbprz; } } break; @@ -1006,8 +1031,7 @@ static int iscsi_readcapacity_sync(IscsiLun *iscsilun) if (task != NULL && task->status == SCSI_STATUS_GOOD) { rc10 = scsi_datain_unmarshall(task); if (rc10 == NULL) { - error_report("iSCSI: Failed to unmarshall readcapacity10 data."); - ret = -EINVAL; + error_setg(errp, "iSCSI: Failed to unmarshall readcapacity10 data."); } else { iscsilun->block_size = rc10->block_size; if (rc10->lba == 0) { @@ -1020,20 +1044,18 @@ static int iscsi_readcapacity_sync(IscsiLun *iscsilun) } break; default: - return 0; + return; } } while (task != NULL && task->status == SCSI_STATUS_CHECK_CONDITION && task->sense.key == SCSI_SENSE_UNIT_ATTENTION && retries-- > 0); if (task == NULL || task->status != SCSI_STATUS_GOOD) { - error_report("iSCSI: failed to send readcapacity10 command."); - ret = -EINVAL; + error_setg(errp, "iSCSI: failed to send readcapacity10 command."); } if (task) { scsi_free_scsi_task(task); } - return ret; } /* TODO Convert to fine grained options */ @@ -1050,45 +1072,88 @@ static QemuOptsList runtime_opts = { }, }; +static struct scsi_task *iscsi_do_inquiry(struct iscsi_context *iscsi, int lun, + int evpd, int pc, void **inq, Error **errp) +{ + int full_size; + struct scsi_task *task = NULL; + task = iscsi_inquiry_sync(iscsi, lun, evpd, pc, 64); + if (task == NULL || task->status != SCSI_STATUS_GOOD) { + goto fail; + } + full_size = scsi_datain_getfullsize(task); + if (full_size > task->datain.size) { + scsi_free_scsi_task(task); + + /* we need more data for the full list */ + task = iscsi_inquiry_sync(iscsi, lun, evpd, pc, full_size); + if (task == NULL || task->status != SCSI_STATUS_GOOD) { + goto fail; + } + } + + *inq = scsi_datain_unmarshall(task); + if (*inq == NULL) { + error_setg(errp, "iSCSI: failed to unmarshall inquiry datain blob"); + goto fail; + } + + return task; + +fail: + if (!error_is_set(errp)) { + error_setg(errp, "iSCSI: Inquiry command failed : %s", + iscsi_get_error(iscsi)); + } + if (task != NULL) { + scsi_free_scsi_task(task); + } + return NULL; +} + /* * We support iscsi url's on the form * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun> + * + * Note: flags are currently not used by iscsi_open. If this function + * is changed such that flags are used, please examine iscsi_reopen_prepare() + * to see if needs to be changed as well. 
*/ -static int iscsi_open(BlockDriverState *bs, QDict *options, int flags) +static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { IscsiLun *iscsilun = bs->opaque; struct iscsi_context *iscsi = NULL; struct iscsi_url *iscsi_url = NULL; struct scsi_task *task = NULL; struct scsi_inquiry_standard *inq = NULL; + struct scsi_inquiry_supported_pages *inq_vpd; char *initiator_name = NULL; QemuOpts *opts; Error *local_err = NULL; const char *filename; - int ret; + int i, ret; if ((BDRV_SECTOR_SIZE % 512) != 0) { - error_report("iSCSI: Invalid BDRV_SECTOR_SIZE. " - "BDRV_SECTOR_SIZE(%lld) is not a multiple " - "of 512", BDRV_SECTOR_SIZE); + error_setg(errp, "iSCSI: Invalid BDRV_SECTOR_SIZE. " + "BDRV_SECTOR_SIZE(%lld) is not a multiple " + "of 512", BDRV_SECTOR_SIZE); return -EINVAL; } - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto out; } filename = qemu_opt_get(opts, "filename"); - iscsi_url = iscsi_parse_full_url(iscsi, filename); if (iscsi_url == NULL) { - error_report("Failed to parse URL : %s", filename); + error_setg(errp, "Failed to parse URL : %s", filename); ret = -EINVAL; goto out; } @@ -1099,13 +1164,13 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags) iscsi = iscsi_create_context(initiator_name); if (iscsi == NULL) { - error_report("iSCSI: Failed to create iSCSI context."); + error_setg(errp, "iSCSI: Failed to create iSCSI context."); ret = -ENOMEM; goto out; } if (iscsi_set_targetname(iscsi, iscsi_url->target)) { - error_report("iSCSI: Failed to set target name."); + error_setg(errp, "iSCSI: Failed to set target name."); ret = -EINVAL; goto out; } @@ -1114,21 +1179,22 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags) ret = iscsi_set_initiator_username_pwd(iscsi, iscsi_url->user, iscsi_url->passwd); if (ret != 0) { - error_report("Failed to set initiator username and password"); + error_setg(errp, "Failed to set initiator username and password"); ret = -EINVAL; goto out; } } /* check if we got CHAP username/password via the options */ - if (parse_chap(iscsi, iscsi_url->target) != 0) { - error_report("iSCSI: Failed to set CHAP user/password"); + parse_chap(iscsi, iscsi_url->target, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); ret = -EINVAL; goto out; } if (iscsi_set_session_type(iscsi, ISCSI_SESSION_NORMAL) != 0) { - error_report("iSCSI: Failed to set session type to normal."); + error_setg(errp, "iSCSI: Failed to set session type to normal."); ret = -EINVAL; goto out; } @@ -1136,10 +1202,15 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags) iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C); /* check if we got HEADER_DIGEST via the options */ - parse_header_digest(iscsi, iscsi_url->target); + parse_header_digest(iscsi, iscsi_url->target, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto out; + } if (iscsi_full_connect_sync(iscsi, iscsi_url->portal, iscsi_url->lun) != 0) { - error_report("iSCSI: Failed to connect to LUN : %s", + error_setg(errp, "iSCSI: Failed to connect to LUN : %s", iscsi_get_error(iscsi)); ret = -EINVAL; goto out; @@ -1147,42 +1218,82 @@ static int iscsi_open(BlockDriverState *bs, QDict 
*options, int flags) iscsilun->iscsi = iscsi; iscsilun->lun = iscsi_url->lun; + iscsilun->has_write_same = true; - task = iscsi_inquiry_sync(iscsi, iscsilun->lun, 0, 0, 36); - - if (task == NULL || task->status != SCSI_STATUS_GOOD) { - error_report("iSCSI: failed to send inquiry command."); - ret = -EINVAL; - goto out; - } - - inq = scsi_datain_unmarshall(task); - if (inq == NULL) { - error_report("iSCSI: Failed to unmarshall inquiry data."); + task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 0, 0, + (void **) &inq, errp); + if (task == NULL) { ret = -EINVAL; goto out; } - iscsilun->type = inq->periperal_device_type; + scsi_free_scsi_task(task); + task = NULL; - if ((ret = iscsi_readcapacity_sync(iscsilun)) != 0) { + iscsi_readcapacity_sync(iscsilun, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); + ret = -EINVAL; goto out; } bs->total_sectors = sector_lun2qemu(iscsilun->num_blocks, iscsilun); + bs->request_alignment = iscsilun->block_size; - /* Medium changer or tape. We dont have any emulation for this so this must - * be sg ioctl compatible. We force it to be sg, otherwise qemu will try - * to read from the device to guess the image format. + /* We don't have any emulation for devices other than disks and CD-ROMs, so + * this must be sg ioctl compatible. We force it to be sg, otherwise qemu + * will try to read from the device to guess the image format. */ - if (iscsilun->type == TYPE_MEDIUM_CHANGER || - iscsilun->type == TYPE_TAPE) { + if (iscsilun->type != TYPE_DISK && iscsilun->type != TYPE_ROM) { bs->sg = 1; } + task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, + SCSI_INQUIRY_PAGECODE_SUPPORTED_VPD_PAGES, + (void **) &inq_vpd, errp); + if (task == NULL) { + ret = -EINVAL; + goto out; + } + for (i = 0; i < inq_vpd->num_pages; i++) { + struct scsi_task *inq_task; + struct scsi_inquiry_logical_block_provisioning *inq_lbp; + struct scsi_inquiry_block_limits *inq_bl; + switch (inq_vpd->pages[i]) { + case SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING: + inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, + SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING, + (void **) &inq_lbp, errp); + if (inq_task == NULL) { + ret = -EINVAL; + goto out; + } + memcpy(&iscsilun->lbp, inq_lbp, + sizeof(struct scsi_inquiry_logical_block_provisioning)); + scsi_free_scsi_task(inq_task); + break; + case SCSI_INQUIRY_PAGECODE_BLOCK_LIMITS: + inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, + SCSI_INQUIRY_PAGECODE_BLOCK_LIMITS, + (void **) &inq_bl, errp); + if (inq_task == NULL) { + ret = -EINVAL; + goto out; + } + memcpy(&iscsilun->bl, inq_bl, + sizeof(struct scsi_inquiry_block_limits)); + scsi_free_scsi_task(inq_task); + break; + default: + break; + } + } + scsi_free_scsi_task(task); + task = NULL; + #if defined(LIBISCSI_FEATURE_NOP_COUNTER) /* Set up a timer for sending out iSCSI NOPs */ - iscsilun->nop_timer = qemu_new_timer_ms(rt_clock, iscsi_nop_timed_event, iscsilun); - qemu_mod_timer(iscsilun->nop_timer, qemu_get_clock_ms(rt_clock) + NOP_INTERVAL); + iscsilun->nop_timer = timer_new_ms(QEMU_CLOCK_REALTIME, iscsi_nop_timed_event, iscsilun); + timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); #endif out: @@ -1212,25 +1323,66 @@ static void iscsi_close(BlockDriverState *bs) struct iscsi_context *iscsi = iscsilun->iscsi; if (iscsilun->nop_timer) { - qemu_del_timer(iscsilun->nop_timer); - qemu_free_timer(iscsilun->nop_timer); + timer_del(iscsilun->nop_timer); + timer_free(iscsilun->nop_timer); } - 
qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL, NULL); + qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL); iscsi_destroy_context(iscsi); + g_free(iscsilun->zeroblock); memset(iscsilun, 0, sizeof(IscsiLun)); } +static int iscsi_refresh_limits(BlockDriverState *bs) +{ + IscsiLun *iscsilun = bs->opaque; + + /* We don't actually refresh here, but just return data queried in + * iscsi_open(): iscsi targets don't change their limits. */ + if (iscsilun->lbp.lbpu) { + if (iscsilun->bl.max_unmap < 0xffffffff) { + bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap, + iscsilun); + } + bs->bl.discard_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran, + iscsilun); + } + + if (iscsilun->bl.max_ws_len < 0xffffffff) { + bs->bl.max_write_zeroes = sector_lun2qemu(iscsilun->bl.max_ws_len, + iscsilun); + } + if (iscsilun->lbp.lbpws) { + bs->bl.write_zeroes_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran, + iscsilun); + } + bs->bl.opt_transfer_length = sector_lun2qemu(iscsilun->bl.opt_xfer_len, + iscsilun); + return 0; +} + +/* Since iscsi_open() ignores bdrv_flags, there is nothing to do here in + * prepare. Note that this will not re-establish a connection with an iSCSI + * target - it is effectively a NOP. */ +static int iscsi_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + /* NOP */ + return 0; +} + static int iscsi_truncate(BlockDriverState *bs, int64_t offset) { IscsiLun *iscsilun = bs->opaque; - int ret = 0; + Error *local_err = NULL; if (iscsilun->type != TYPE_DISK) { return -ENOTSUP; } - if ((ret = iscsi_readcapacity_sync(iscsilun)) != 0) { - return ret; + iscsi_readcapacity_sync(iscsilun, &local_err); + if (local_err != NULL) { + error_free(local_err); + return -EIO; } if (offset > iscsi_getlength(bs)) { @@ -1240,20 +1392,16 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset) return 0; } -static int iscsi_has_zero_init(BlockDriverState *bs) -{ - return 0; -} - -static int iscsi_create(const char *filename, QEMUOptionParameter *options) +static int iscsi_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int ret = 0; int64_t total_size = 0; - BlockDriverState bs; + BlockDriverState *bs; IscsiLun *iscsilun = NULL; QDict *bs_options; - memset(&bs, 0, sizeof(BlockDriverState)); + bs = bdrv_new(""); /* Read out options */ while (options && options->name) { @@ -1263,26 +1411,26 @@ static int iscsi_create(const char *filename, QEMUOptionParameter *options) options++; } - bs.opaque = g_malloc0(sizeof(struct IscsiLun)); - iscsilun = bs.opaque; + bs->opaque = g_malloc0(sizeof(struct IscsiLun)); + iscsilun = bs->opaque; bs_options = qdict_new(); qdict_put(bs_options, "filename", qstring_from_str(filename)); - ret = iscsi_open(&bs, bs_options, 0); + ret = iscsi_open(bs, bs_options, 0, NULL); QDECREF(bs_options); if (ret != 0) { goto out; } if (iscsilun->nop_timer) { - qemu_del_timer(iscsilun->nop_timer); - qemu_free_timer(iscsilun->nop_timer); + timer_del(iscsilun->nop_timer); + timer_free(iscsilun->nop_timer); } if (iscsilun->type != TYPE_DISK) { ret = -ENODEV; goto out; } - if (bs.total_sectors < total_size) { + if (bs->total_sectors < total_size) { ret = -ENOSPC; goto out; } @@ -1292,10 +1440,27 @@ out: if (iscsilun->iscsi != NULL) { iscsi_destroy_context(iscsilun->iscsi); } - g_free(bs.opaque); + g_free(bs->opaque); + bs->opaque = NULL; + bdrv_unref(bs); return ret; } +static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + IscsiLun *iscsilun = bs->opaque; + 
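iscsi_refresh_limits() above translates target-reported limits, which are expressed in target blocks, into the block layer's 512-byte-sector units via sector_lun2qemu(), and skips fields the target reports as unlimited (0xffffffff). A worked example, assuming a hypothetical 4 KiB target block size:

    /* bl.max_unmap = 4096 blocks, block_size = 4096:
     *   4096 blocks * (4096 / 512) = 32768 sectors = 16 MiB */
    bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap, iscsilun);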
bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz; + bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws; + /* Guess the internal cluster (page) size of the iscsi target by the means + * of opt_unmap_gran. Transfer the unmap granularity only if it has a + * reasonable size for bdi->cluster_size */ + if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 64 * 1024 && + iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { + bdi->cluster_size = iscsilun->bl.opt_unmap_gran * iscsilun->block_size; + } + return 0; +} + static QEMUOptionParameter iscsi_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -1310,20 +1475,28 @@ static BlockDriver bdrv_iscsi = { .protocol_name = "iscsi", .instance_size = sizeof(IscsiLun), + .bdrv_needs_filename = true, .bdrv_file_open = iscsi_open, .bdrv_close = iscsi_close, .bdrv_create = iscsi_create, .create_options = iscsi_create_options, + .bdrv_reopen_prepare = iscsi_reopen_prepare, .bdrv_getlength = iscsi_getlength, + .bdrv_get_info = iscsi_get_info, .bdrv_truncate = iscsi_truncate, + .bdrv_refresh_limits = iscsi_refresh_limits, - .bdrv_aio_readv = iscsi_aio_readv, - .bdrv_aio_writev = iscsi_aio_writev, - .bdrv_aio_flush = iscsi_aio_flush, - - .bdrv_aio_discard = iscsi_aio_discard, - .bdrv_has_zero_init = iscsi_has_zero_init, +#if defined(LIBISCSI_FEATURE_IOVECTOR) + .bdrv_co_get_block_status = iscsi_co_get_block_status, +#endif + .bdrv_co_discard = iscsi_co_discard, +#if defined(SCSI_SENSE_ASCQ_CAPACITY_DATA_HAS_CHANGED) + .bdrv_co_write_zeroes = iscsi_co_write_zeroes, +#endif + .bdrv_co_readv = iscsi_co_readv, + .bdrv_co_writev = iscsi_co_writev, + .bdrv_co_flush_to_disk = iscsi_co_flush, #ifdef __linux__ .bdrv_ioctl = iscsi_ioctl, diff --git a/block/linux-aio.c b/block/linux-aio.c index ee0f8d10c..53434e2df 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -39,7 +39,6 @@ struct qemu_laiocb { struct qemu_laio_state { io_context_t ctx; EventNotifier e; - int count; }; static inline ssize_t io_event_ret(struct io_event *ev) @@ -55,8 +54,6 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s, { int ret; - s->count--; - ret = laiocb->ret; if (ret != -ECANCELED) { if (ret == laiocb->nbytes) { @@ -101,13 +98,6 @@ static void qemu_laio_completion_cb(EventNotifier *e) } } -static int qemu_laio_flush_cb(EventNotifier *e) -{ - struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e); - - return (s->count > 0) ? 
1 : 0; -} - static void laio_cancel(BlockDriverAIOCB *blockacb) { struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb; @@ -177,14 +167,11 @@ BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, goto out_free_aiocb; } io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e)); - s->count++; if (io_submit(s->ctx, 1, &iocbs) < 0) - goto out_dec_count; + goto out_free_aiocb; return &laiocb->common; -out_dec_count: - s->count--; out_free_aiocb: qemu_aio_release(laiocb); return NULL; @@ -203,8 +190,7 @@ void *laio_init(void) goto out_close_efd; } - qemu_aio_set_event_notifier(&s->e, qemu_laio_completion_cb, - qemu_laio_flush_cb); + qemu_aio_set_event_notifier(&s->e, qemu_laio_completion_cb); return s; diff --git a/block/mirror.c b/block/mirror.c index bed4a7ead..0ef41f999 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -31,7 +31,8 @@ typedef struct MirrorBlockJob { BlockJob common; RateLimit limit; BlockDriverState *target; - MirrorSyncMode mode; + BlockDriverState *base; + bool is_none_mode; BlockdevOnError on_source_error, on_target_error; bool synced; bool should_complete; @@ -39,6 +40,7 @@ typedef struct MirrorBlockJob { int64_t granularity; size_t buf_size; unsigned long *cow_bitmap; + BdrvDirtyBitmap *dirty_bitmap; HBitmapIter hbi; uint8_t *buf; QSIMPLEQ_HEAD(, MirrorBuffer) buf_free; @@ -94,8 +96,16 @@ static void mirror_iteration_done(MirrorOp *op, int ret) bitmap_set(s->cow_bitmap, chunk_num, nb_chunks); } + qemu_iovec_destroy(&op->qiov); g_slice_free(MirrorOp, op); - qemu_coroutine_enter(s->common.co, NULL); + + /* Enter coroutine when it is not sleeping. The coroutine sleeps to + * rate-limit itself. The coroutine will eventually resume since there is + * a sleep timeout so don't wake it early. + */ + if (s->common.busy) { + qemu_coroutine_enter(s->common.co, NULL); + } } static void mirror_write_complete(void *opaque, int ret) @@ -136,18 +146,20 @@ static void mirror_read_complete(void *opaque, int ret) mirror_write_complete, op); } -static void coroutine_fn mirror_iteration(MirrorBlockJob *s) +static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) { BlockDriverState *source = s->common.bs; int nb_sectors, sectors_per_chunk, nb_chunks; int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector; + uint64_t delay_ns; MirrorOp *op; s->sector_num = hbitmap_iter_next(&s->hbi); if (s->sector_num < 0) { - bdrv_dirty_iter_init(source, &s->hbi); + bdrv_dirty_iter_init(source, s->dirty_bitmap, &s->hbi); s->sector_num = hbitmap_iter_next(&s->hbi); - trace_mirror_restart_iter(s, bdrv_get_dirty_count(source)); + trace_mirror_restart_iter(s, + bdrv_get_dirty_count(source, s->dirty_bitmap)); assert(s->sector_num >= 0); } @@ -183,7 +195,7 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s) do { int added_sectors, added_chunks; - if (!bdrv_get_dirty(source, next_sector) || + if (!bdrv_get_dirty(source, s->dirty_bitmap, next_sector) || test_bit(next_chunk, s->in_flight_bitmap)) { assert(nb_sectors > 0); break; @@ -227,7 +239,12 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s) nb_chunks += added_chunks; next_sector += added_sectors; next_chunk += added_chunks; - } while (next_sector < end); + if (!s->synced && s->common.speed) { + delay_ns = ratelimit_calculate_delay(&s->limit, added_sectors); + } else { + delay_ns = 0; + } + } while (delay_ns == 0 && next_sector < end); /* Allocate a MirrorOp that is used as an AIO callback. 
*/ op = g_slice_new(MirrorOp); @@ -249,7 +266,8 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s) /* Advance the HBitmapIter in parallel, so that we do not examine * the same sector twice. */ - if (next_sector > hbitmap_next_sector && bdrv_get_dirty(source, next_sector)) { + if (next_sector > hbitmap_next_sector + && bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) { hbitmap_next_sector = hbitmap_iter_next(&s->hbi); } @@ -263,6 +281,7 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s) trace_mirror_one_iteration(s, sector_num, nb_sectors); bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, mirror_read_complete, op); + return delay_ns; } static void mirror_free_init(MirrorBlockJob *s) @@ -332,14 +351,13 @@ static void coroutine_fn mirror_run(void *opaque) sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; mirror_free_init(s); - if (s->mode != MIRROR_SYNC_MODE_NONE) { + if (!s->is_none_mode) { /* First part, loop on the sectors and initialize the dirty bitmap. */ - BlockDriverState *base; - base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd; + BlockDriverState *base = s->base; for (sector_num = 0; sector_num < end; ) { int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1; - ret = bdrv_co_is_allocated_above(bs, base, - sector_num, next - sector_num, &n); + ret = bdrv_is_allocated_above(bs, base, + sector_num, next - sector_num, &n); if (ret < 0) { goto immediate_exit; @@ -355,10 +373,10 @@ static void coroutine_fn mirror_run(void *opaque) } } - bdrv_dirty_iter_init(bs, &s->hbi); - last_pause_ns = qemu_get_clock_ns(rt_clock); + bdrv_dirty_iter_init(bs, s->dirty_bitmap, &s->hbi); + last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); for (;;) { - uint64_t delay_ns; + uint64_t delay_ns = 0; int64_t cnt; bool should_complete; @@ -367,14 +385,14 @@ static void coroutine_fn mirror_run(void *opaque) goto immediate_exit; } - cnt = bdrv_get_dirty_count(bs); + cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); /* Note that even when no rate limit is applied we need to yield * periodically with no pending I/O so that qemu_aio_flush() returns. * We do so every SLICE_TIME nanoseconds, or when there is an error, * or when the source is clean, whichever comes first. 
*/ - if (qemu_get_clock_ns(rt_clock) - last_pause_ns < SLICE_TIME && + if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns < SLICE_TIME && s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) { if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 || (cnt == 0 && s->in_flight > 0)) { @@ -382,8 +400,10 @@ static void coroutine_fn mirror_run(void *opaque) qemu_coroutine_yield(); continue; } else if (cnt != 0) { - mirror_iteration(s); - continue; + delay_ns = mirror_iteration(s); + if (delay_ns == 0) { + continue; + } } } @@ -409,7 +429,7 @@ static void coroutine_fn mirror_run(void *opaque) should_complete = s->should_complete || block_job_is_cancelled(&s->common); - cnt = bdrv_get_dirty_count(bs); + cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); } } @@ -424,28 +444,21 @@ static void coroutine_fn mirror_run(void *opaque) */ trace_mirror_before_drain(s, cnt); bdrv_drain_all(); - cnt = bdrv_get_dirty_count(bs); + cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); } ret = 0; - trace_mirror_before_sleep(s, cnt, s->synced); + trace_mirror_before_sleep(s, cnt, s->synced, delay_ns); if (!s->synced) { /* Publish progress */ s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE; - - if (s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, sectors_per_chunk); - } else { - delay_ns = 0; - } - - block_job_sleep_ns(&s->common, rt_clock, delay_ns); + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); if (block_job_is_cancelled(&s->common)) { break; } } else if (!should_complete) { delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0); - block_job_sleep_ns(&s->common, rt_clock, delay_ns); + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); } else if (cnt == 0) { /* The two disks are in sync. Exit and report successful * completion. 
@@ -454,7 +467,7 @@ static void coroutine_fn mirror_run(void *opaque) s->common.cancelled = false; break; } - last_pause_ns = qemu_get_clock_ns(rt_clock); + last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); } immediate_exit: @@ -471,16 +484,22 @@ immediate_exit: qemu_vfree(s->buf); g_free(s->cow_bitmap); g_free(s->in_flight_bitmap); - bdrv_set_dirty_tracking(bs, 0); + bdrv_release_dirty_bitmap(bs, s->dirty_bitmap); bdrv_iostatus_disable(s->target); if (s->should_complete && ret == 0) { if (bdrv_get_flags(s->target) != bdrv_get_flags(s->common.bs)) { bdrv_reopen(s->target, bdrv_get_flags(s->common.bs), NULL); } bdrv_swap(s->target, s->common.bs); + if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) { + /* drop the bs loop chain formed by the swap: break the loop then + * trigger the unref from the top one */ + BlockDriverState *p = s->base->backing_hd; + s->base->backing_hd = NULL; + bdrv_unref(p); + } } - bdrv_close(s->target); - bdrv_delete(s->target); + bdrv_unref(s->target); block_job_completed(&s->common, ret); } @@ -505,14 +524,12 @@ static void mirror_iostatus_reset(BlockJob *job) static void mirror_complete(BlockJob *job, Error **errp) { MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + Error *local_err = NULL; int ret; - ret = bdrv_open_backing_file(s->target, NULL); + ret = bdrv_open_backing_file(s->target, NULL, &local_err); if (ret < 0) { - char backing_filename[PATH_MAX]; - bdrv_get_full_backing_filename(s->target, backing_filename, - sizeof(backing_filename)); - error_setg_file_open(errp, -ret, backing_filename); + error_propagate(errp, local_err); return; } if (!s->synced) { @@ -524,20 +541,32 @@ static void mirror_complete(BlockJob *job, Error **errp) block_job_resume(job); } -static const BlockJobType mirror_job_type = { +static const BlockJobDriver mirror_job_driver = { .instance_size = sizeof(MirrorBlockJob), - .job_type = "mirror", + .job_type = BLOCK_JOB_TYPE_MIRROR, .set_speed = mirror_set_speed, .iostatus_reset= mirror_iostatus_reset, .complete = mirror_complete, }; -void mirror_start(BlockDriverState *bs, BlockDriverState *target, - int64_t speed, int64_t granularity, int64_t buf_size, - MirrorSyncMode mode, BlockdevOnError on_source_error, - BlockdevOnError on_target_error, - BlockDriverCompletionFunc *cb, - void *opaque, Error **errp) +static const BlockJobDriver commit_active_job_driver = { + .instance_size = sizeof(MirrorBlockJob), + .job_type = BLOCK_JOB_TYPE_COMMIT, + .set_speed = mirror_set_speed, + .iostatus_reset + = mirror_iostatus_reset, + .complete = mirror_complete, +}; + +static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, + int64_t speed, int64_t granularity, + int64_t buf_size, + BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + BlockDriverCompletionFunc *cb, + void *opaque, Error **errp, + const BlockJobDriver *driver, + bool is_none_mode, BlockDriverState *base) { MirrorBlockJob *s; @@ -562,7 +591,8 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, return; } - s = block_job_create(&mirror_job_type, bs, speed, cb, opaque, errp); + + s = block_job_create(driver, bs, speed, cb, opaque, errp); if (!s) { return; } @@ -570,11 +600,12 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, s->on_source_error = on_source_error; s->on_target_error = on_target_error; s->target = target; - s->mode = mode; + s->is_none_mode = is_none_mode; + s->base = base; s->granularity = granularity; s->buf_size = MAX(buf_size, granularity); - bdrv_set_dirty_tracking(bs, 
granularity); + s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity); bdrv_set_enable_write_cache(s->target, true); bdrv_set_on_error(s->target, on_target_error, on_target_error); bdrv_iostatus_enable(s->target); @@ -582,3 +613,80 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, trace_mirror_start(bs, s, s->common.co, opaque); qemu_coroutine_enter(s->common.co, s); } + +void mirror_start(BlockDriverState *bs, BlockDriverState *target, + int64_t speed, int64_t granularity, int64_t buf_size, + MirrorSyncMode mode, BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + BlockDriverCompletionFunc *cb, + void *opaque, Error **errp) +{ + bool is_none_mode; + BlockDriverState *base; + + is_none_mode = mode == MIRROR_SYNC_MODE_NONE; + base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL; + mirror_start_job(bs, target, speed, granularity, buf_size, + on_source_error, on_target_error, cb, opaque, errp, + &mirror_job_driver, is_none_mode, base); +} + +void commit_active_start(BlockDriverState *bs, BlockDriverState *base, + int64_t speed, + BlockdevOnError on_error, + BlockDriverCompletionFunc *cb, + void *opaque, Error **errp) +{ + int64_t length, base_length; + int orig_base_flags; + int ret; + Error *local_err = NULL; + + orig_base_flags = bdrv_get_flags(base); + + if (bdrv_reopen(base, bs->open_flags, errp)) { + return; + } + + length = bdrv_getlength(bs); + if (length < 0) { + error_setg_errno(errp, -length, + "Unable to determine length of %s", bs->filename); + goto error_restore_flags; + } + + base_length = bdrv_getlength(base); + if (base_length < 0) { + error_setg_errno(errp, -base_length, + "Unable to determine length of %s", base->filename); + goto error_restore_flags; + } + + if (length > base_length) { + ret = bdrv_truncate(base, length); + if (ret < 0) { + error_setg_errno(errp, -ret, + "Top image %s is larger than base image %s, and " + "resize of base image failed", + bs->filename, base->filename); + goto error_restore_flags; + } + } + + bdrv_ref(base); + mirror_start_job(bs, base, speed, 0, 0, + on_error, on_error, cb, opaque, &local_err, + &commit_active_job_driver, false, base); + if (error_is_set(&local_err)) { + error_propagate(errp, local_err); + goto error_restore_flags; + } + + return; + +error_restore_flags: + /* ignore error and errp for bdrv_reopen, because we want to propagate + * the original error */ + bdrv_reopen(base, orig_base_flags, NULL); + return; +} diff --git a/block/nbd-client.c b/block/nbd-client.c new file mode 100644 index 000000000..7d698cb61 --- /dev/null +++ b/block/nbd-client.c @@ -0,0 +1,388 @@ +/* + * QEMU Block driver for NBD + * + * Copyright (C) 2008 Bull S.A.S. + * Author: Laurent Vivier <Laurent.Vivier@bull.net> + * + * Some parts: + * Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "nbd-client.h" +#include "qemu/sockets.h" + +#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs)) +#define INDEX_TO_HANDLE(bs, index) ((index) ^ ((uint64_t)(intptr_t)bs)) + +static void nbd_recv_coroutines_enter_all(NbdClientSession *s) +{ + int i; + + for (i = 0; i < MAX_NBD_REQUESTS; i++) { + if (s->recv_coroutine[i]) { + qemu_coroutine_enter(s->recv_coroutine[i], NULL); + } + } +} + +static void nbd_teardown_connection(NbdClientSession *client) +{ + /* finish any pending coroutines */ + shutdown(client->sock, 2); + nbd_recv_coroutines_enter_all(client); + + qemu_aio_set_fd_handler(client->sock, NULL, NULL, NULL); + closesocket(client->sock); + client->sock = -1; +} + +static void nbd_reply_ready(void *opaque) +{ + NbdClientSession *s = opaque; + uint64_t i; + int ret; + + if (s->reply.handle == 0) { + /* No reply already in flight. Fetch a header. It is possible + * that another thread has done the same thing in parallel, so + * the socket is not readable anymore. + */ + ret = nbd_receive_reply(s->sock, &s->reply); + if (ret == -EAGAIN) { + return; + } + if (ret < 0) { + s->reply.handle = 0; + goto fail; + } + } + + /* There's no need for a mutex on the receive side, because the + * handler acts as a synchronization point and ensures that only + * one coroutine is called until the reply finishes. */ + i = HANDLE_TO_INDEX(s, s->reply.handle); + if (i >= MAX_NBD_REQUESTS) { + goto fail; + } + + if (s->recv_coroutine[i]) { + qemu_coroutine_enter(s->recv_coroutine[i], NULL); + return; + } + +fail: + nbd_teardown_connection(s); +} + +static void nbd_restart_write(void *opaque) +{ + NbdClientSession *s = opaque; + + qemu_coroutine_enter(s->send_coroutine, NULL); +} + +static int nbd_co_send_request(NbdClientSession *s, + struct nbd_request *request, + QEMUIOVector *qiov, int offset) +{ + int rc, ret; + + qemu_co_mutex_lock(&s->send_mutex); + s->send_coroutine = qemu_coroutine_self(); + qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write, s); + if (qiov) { + if (!s->is_unix) { + socket_set_cork(s->sock, 1); + } + rc = nbd_send_request(s->sock, request); + if (rc >= 0) { + ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov, + offset, request->len); + if (ret != request->len) { + rc = -EIO; + } + } + if (!s->is_unix) { + socket_set_cork(s->sock, 0); + } + } else { + rc = nbd_send_request(s->sock, request); + } + qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL, s); + s->send_coroutine = NULL; + qemu_co_mutex_unlock(&s->send_mutex); + return rc; +} + +static void nbd_co_receive_reply(NbdClientSession *s, + struct nbd_request *request, struct nbd_reply *reply, + QEMUIOVector *qiov, int offset) +{ + int ret; + + /* Wait until we're woken up by the read handler. TODO: perhaps + * peek at the next reply and avoid yielding if it's ours? 
*/ + qemu_coroutine_yield(); + *reply = s->reply; + if (reply->handle != request->handle) { + reply->error = EIO; + } else { + if (qiov && reply->error == 0) { + ret = qemu_co_recvv(s->sock, qiov->iov, qiov->niov, + offset, request->len); + if (ret != request->len) { + reply->error = EIO; + } + } + + /* Tell the read handler to read another header. */ + s->reply.handle = 0; + } +} + +static void nbd_coroutine_start(NbdClientSession *s, + struct nbd_request *request) +{ + int i; + + /* Poor man semaphore. The free_sema is locked when no other request + * can be accepted, and unlocked after receiving one reply. */ + if (s->in_flight >= MAX_NBD_REQUESTS - 1) { + qemu_co_mutex_lock(&s->free_sema); + assert(s->in_flight < MAX_NBD_REQUESTS); + } + s->in_flight++; + + for (i = 0; i < MAX_NBD_REQUESTS; i++) { + if (s->recv_coroutine[i] == NULL) { + s->recv_coroutine[i] = qemu_coroutine_self(); + break; + } + } + + assert(i < MAX_NBD_REQUESTS); + request->handle = INDEX_TO_HANDLE(s, i); +} + +static void nbd_coroutine_end(NbdClientSession *s, + struct nbd_request *request) +{ + int i = HANDLE_TO_INDEX(s, request->handle); + s->recv_coroutine[i] = NULL; + if (s->in_flight-- == MAX_NBD_REQUESTS) { + qemu_co_mutex_unlock(&s->free_sema); + } +} + +static int nbd_co_readv_1(NbdClientSession *client, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov, + int offset) +{ + struct nbd_request request = { .type = NBD_CMD_READ }; + struct nbd_reply reply; + ssize_t ret; + + request.from = sector_num * 512; + request.len = nb_sectors * 512; + + nbd_coroutine_start(client, &request); + ret = nbd_co_send_request(client, &request, NULL, 0); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(client, &request, &reply, qiov, offset); + } + nbd_coroutine_end(client, &request); + return -reply.error; + +} + +static int nbd_co_writev_1(NbdClientSession *client, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov, + int offset) +{ + struct nbd_request request = { .type = NBD_CMD_WRITE }; + struct nbd_reply reply; + ssize_t ret; + + if (!bdrv_enable_write_cache(client->bs) && + (client->nbdflags & NBD_FLAG_SEND_FUA)) { + request.type |= NBD_CMD_FLAG_FUA; + } + + request.from = sector_num * 512; + request.len = nb_sectors * 512; + + nbd_coroutine_start(client, &request); + ret = nbd_co_send_request(client, &request, qiov, offset); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(client, &request, &reply, NULL, 0); + } + nbd_coroutine_end(client, &request); + return -reply.error; +} + +/* qemu-nbd has a limit of slightly less than 1M per request. Try to + * remain aligned to 4K. 
*/ +#define NBD_MAX_SECTORS 2040 + +int nbd_client_session_co_readv(NbdClientSession *client, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + int offset = 0; + int ret; + while (nb_sectors > NBD_MAX_SECTORS) { + ret = nbd_co_readv_1(client, sector_num, + NBD_MAX_SECTORS, qiov, offset); + if (ret < 0) { + return ret; + } + offset += NBD_MAX_SECTORS * 512; + sector_num += NBD_MAX_SECTORS; + nb_sectors -= NBD_MAX_SECTORS; + } + return nbd_co_readv_1(client, sector_num, nb_sectors, qiov, offset); +} + +int nbd_client_session_co_writev(NbdClientSession *client, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + int offset = 0; + int ret; + while (nb_sectors > NBD_MAX_SECTORS) { + ret = nbd_co_writev_1(client, sector_num, + NBD_MAX_SECTORS, qiov, offset); + if (ret < 0) { + return ret; + } + offset += NBD_MAX_SECTORS * 512; + sector_num += NBD_MAX_SECTORS; + nb_sectors -= NBD_MAX_SECTORS; + } + return nbd_co_writev_1(client, sector_num, nb_sectors, qiov, offset); +} + +int nbd_client_session_co_flush(NbdClientSession *client) +{ + struct nbd_request request = { .type = NBD_CMD_FLUSH }; + struct nbd_reply reply; + ssize_t ret; + + if (!(client->nbdflags & NBD_FLAG_SEND_FLUSH)) { + return 0; + } + + if (client->nbdflags & NBD_FLAG_SEND_FUA) { + request.type |= NBD_CMD_FLAG_FUA; + } + + request.from = 0; + request.len = 0; + + nbd_coroutine_start(client, &request); + ret = nbd_co_send_request(client, &request, NULL, 0); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(client, &request, &reply, NULL, 0); + } + nbd_coroutine_end(client, &request); + return -reply.error; +} + +int nbd_client_session_co_discard(NbdClientSession *client, int64_t sector_num, + int nb_sectors) +{ + struct nbd_request request = { .type = NBD_CMD_TRIM }; + struct nbd_reply reply; + ssize_t ret; + + if (!(client->nbdflags & NBD_FLAG_SEND_TRIM)) { + return 0; + } + request.from = sector_num * 512; + request.len = nb_sectors * 512; + + nbd_coroutine_start(client, &request); + ret = nbd_co_send_request(client, &request, NULL, 0); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(client, &request, &reply, NULL, 0); + } + nbd_coroutine_end(client, &request); + return -reply.error; + +} + +void nbd_client_session_close(NbdClientSession *client) +{ + struct nbd_request request = { + .type = NBD_CMD_DISC, + .from = 0, + .len = 0 + }; + + if (!client->bs) { + return; + } + if (client->sock == -1) { + return; + } + + nbd_send_request(client->sock, &request); + + nbd_teardown_connection(client); + client->bs = NULL; +} + +int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs, + int sock, const char *export) +{ + int ret; + + /* NBD handshake */ + logout("session init %s\n", export); + qemu_set_block(sock); + ret = nbd_receive_negotiate(sock, export, + &client->nbdflags, &client->size, + &client->blocksize); + if (ret < 0) { + logout("Failed to negotiate with the NBD server\n"); + closesocket(sock); + return ret; + } + + qemu_co_mutex_init(&client->send_mutex); + qemu_co_mutex_init(&client->free_sema); + client->bs = bs; + client->sock = sock; + + /* Now that we're connected, set the socket to be non-blocking and + * kick the reply mechanism. 
*/ + qemu_set_nonblock(sock); + qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL, client); + + logout("Established connection with NBD server\n"); + return 0; +} diff --git a/block/nbd-client.h b/block/nbd-client.h new file mode 100644 index 000000000..f2a63378b --- /dev/null +++ b/block/nbd-client.h @@ -0,0 +1,50 @@ +#ifndef NBD_CLIENT_H +#define NBD_CLIENT_H + +#include "qemu-common.h" +#include "block/nbd.h" +#include "block/block_int.h" + +/* #define DEBUG_NBD */ + +#if defined(DEBUG_NBD) +#define logout(fmt, ...) \ + fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__) +#else +#define logout(fmt, ...) ((void)0) +#endif + +#define MAX_NBD_REQUESTS 16 + +typedef struct NbdClientSession { + int sock; + uint32_t nbdflags; + off_t size; + size_t blocksize; + + CoMutex send_mutex; + CoMutex free_sema; + Coroutine *send_coroutine; + int in_flight; + + Coroutine *recv_coroutine[MAX_NBD_REQUESTS]; + struct nbd_reply reply; + + bool is_unix; + + BlockDriverState *bs; +} NbdClientSession; + +int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs, + int sock, const char *export_name); +void nbd_client_session_close(NbdClientSession *client); + +int nbd_client_session_co_discard(NbdClientSession *client, int64_t sector_num, + int nb_sectors); +int nbd_client_session_co_flush(NbdClientSession *client); +int nbd_client_session_co_writev(NbdClientSession *client, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov); +int nbd_client_session_co_readv(NbdClientSession *client, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov); + +#endif /* NBD_CLIENT_H */ diff --git a/block/nbd.c b/block/nbd.c index 9c480b8f2..55124239d 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -26,8 +26,7 @@ * THE SOFTWARE. */ -#include "qemu-common.h" -#include "block/nbd.h" +#include "block/nbd-client.h" #include "qemu/uri.h" #include "block/block_int.h" #include "qemu/module.h" @@ -40,37 +39,9 @@ #define EN_OPTSTR ":exportname=" -/* #define DEBUG_NBD */ - -#if defined(DEBUG_NBD) -#define logout(fmt, ...) \ - fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__) -#else -#define logout(fmt, ...) 
((void)0) -#endif - -#define MAX_NBD_REQUESTS 16 -#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs)) -#define INDEX_TO_HANDLE(bs, index) ((index) ^ ((uint64_t)(intptr_t)bs)) - typedef struct BDRVNBDState { - int sock; - uint32_t nbdflags; - off_t size; - size_t blocksize; - - CoMutex send_mutex; - CoMutex free_sema; - Coroutine *send_coroutine; - int in_flight; - - Coroutine *recv_coroutine[MAX_NBD_REQUESTS]; - struct nbd_reply reply; - - bool is_unix; + NbdClientSession client; QemuOpts *socket_opts; - - char *export_name; /* An NBD server may export several devices */ } BDRVNBDState; static int nbd_parse_uri(const char *filename, QDict *options) @@ -217,204 +188,49 @@ out: g_free(file); } -static int nbd_config(BDRVNBDState *s, QDict *options) +static void nbd_config(BDRVNBDState *s, QDict *options, char **export, + Error **errp) { Error *local_err = NULL; - if (qdict_haskey(options, "path")) { - if (qdict_haskey(options, "host")) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, "path and host may not " - "be used at the same time."); - return -EINVAL; + if (qdict_haskey(options, "path") == qdict_haskey(options, "host")) { + if (qdict_haskey(options, "path")) { + error_setg(errp, "path and host may not be used at the same time."); + } else { + error_setg(errp, "one of path and host must be specified."); } - s->is_unix = true; - } else if (qdict_haskey(options, "host")) { - s->is_unix = false; - } else { - return -EINVAL; + return; } - s->socket_opts = qemu_opts_create_nofail(&socket_optslist); + s->client.is_unix = qdict_haskey(options, "path"); + s->socket_opts = qemu_opts_create(&socket_optslist, NULL, 0, + &error_abort); qemu_opts_absorb_qdict(s->socket_opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); - return -EINVAL; + if (local_err) { + error_propagate(errp, local_err); + return; } if (!qemu_opt_get(s->socket_opts, "port")) { qemu_opt_set_number(s->socket_opts, "port", NBD_DEFAULT_PORT); } - s->export_name = g_strdup(qdict_get_try_str(options, "export")); - if (s->export_name) { + *export = g_strdup(qdict_get_try_str(options, "export")); + if (*export) { qdict_del(options, "export"); } - - return 0; -} - - -static void nbd_coroutine_start(BDRVNBDState *s, struct nbd_request *request) -{ - int i; - - /* Poor man semaphore. The free_sema is locked when no other request - * can be accepted, and unlocked after receiving one reply. */ - if (s->in_flight >= MAX_NBD_REQUESTS - 1) { - qemu_co_mutex_lock(&s->free_sema); - assert(s->in_flight < MAX_NBD_REQUESTS); - } - s->in_flight++; - - for (i = 0; i < MAX_NBD_REQUESTS; i++) { - if (s->recv_coroutine[i] == NULL) { - s->recv_coroutine[i] = qemu_coroutine_self(); - break; - } - } - - assert(i < MAX_NBD_REQUESTS); - request->handle = INDEX_TO_HANDLE(s, i); -} - -static int nbd_have_request(void *opaque) -{ - BDRVNBDState *s = opaque; - - return s->in_flight > 0; -} - -static void nbd_reply_ready(void *opaque) -{ - BDRVNBDState *s = opaque; - uint64_t i; - int ret; - - if (s->reply.handle == 0) { - /* No reply already in flight. Fetch a header. It is possible - * that another thread has done the same thing in parallel, so - * the socket is not readable anymore. 
- */ - ret = nbd_receive_reply(s->sock, &s->reply); - if (ret == -EAGAIN) { - return; - } - if (ret < 0) { - s->reply.handle = 0; - goto fail; - } - } - - /* There's no need for a mutex on the receive side, because the - * handler acts as a synchronization point and ensures that only - * one coroutine is called until the reply finishes. */ - i = HANDLE_TO_INDEX(s, s->reply.handle); - if (i >= MAX_NBD_REQUESTS) { - goto fail; - } - - if (s->recv_coroutine[i]) { - qemu_coroutine_enter(s->recv_coroutine[i], NULL); - return; - } - -fail: - for (i = 0; i < MAX_NBD_REQUESTS; i++) { - if (s->recv_coroutine[i]) { - qemu_coroutine_enter(s->recv_coroutine[i], NULL); - } - } -} - -static void nbd_restart_write(void *opaque) -{ - BDRVNBDState *s = opaque; - qemu_coroutine_enter(s->send_coroutine, NULL); -} - -static int nbd_co_send_request(BDRVNBDState *s, struct nbd_request *request, - QEMUIOVector *qiov, int offset) -{ - int rc, ret; - - qemu_co_mutex_lock(&s->send_mutex); - s->send_coroutine = qemu_coroutine_self(); - qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write, - nbd_have_request, s); - if (qiov) { - if (!s->is_unix) { - socket_set_cork(s->sock, 1); - } - rc = nbd_send_request(s->sock, request); - if (rc >= 0) { - ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov, - offset, request->len); - if (ret != request->len) { - rc = -EIO; - } - } - if (!s->is_unix) { - socket_set_cork(s->sock, 0); - } - } else { - rc = nbd_send_request(s->sock, request); - } - qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL, - nbd_have_request, s); - s->send_coroutine = NULL; - qemu_co_mutex_unlock(&s->send_mutex); - return rc; -} - -static void nbd_co_receive_reply(BDRVNBDState *s, struct nbd_request *request, - struct nbd_reply *reply, - QEMUIOVector *qiov, int offset) -{ - int ret; - - /* Wait until we're woken up by the read handler. TODO: perhaps - * peek at the next reply and avoid yielding if it's ours? */ - qemu_coroutine_yield(); - *reply = s->reply; - if (reply->handle != request->handle) { - reply->error = EIO; - } else { - if (qiov && reply->error == 0) { - ret = qemu_co_recvv(s->sock, qiov->iov, qiov->niov, - offset, request->len); - if (ret != request->len) { - reply->error = EIO; - } - } - - /* Tell the read handler to read another header. 
*/ - s->reply.handle = 0; - } -} - -static void nbd_coroutine_end(BDRVNBDState *s, struct nbd_request *request) -{ - int i = HANDLE_TO_INDEX(s, request->handle); - s->recv_coroutine[i] = NULL; - if (s->in_flight-- == MAX_NBD_REQUESTS) { - qemu_co_mutex_unlock(&s->free_sema); - } } -static int nbd_establish_connection(BlockDriverState *bs) +static int nbd_establish_connection(BlockDriverState *bs, Error **errp) { BDRVNBDState *s = bs->opaque; int sock; - int ret; - off_t size; - size_t blocksize; - if (s->is_unix) { - sock = unix_socket_outgoing(qemu_opt_get(s->socket_opts, "path")); + if (s->client.is_unix) { + sock = unix_connect_opts(s->socket_opts, errp, NULL, NULL); } else { - sock = tcp_socket_outgoing_opts(s->socket_opts); + sock = inet_connect_opts(s->socket_opts, errp, NULL, NULL); if (sock >= 0) { socket_set_nodelay(sock); } @@ -426,226 +242,85 @@ static int nbd_establish_connection(BlockDriverState *bs) return -errno; } - /* NBD handshake */ - ret = nbd_receive_negotiate(sock, s->export_name, &s->nbdflags, &size, - &blocksize); - if (ret < 0) { - logout("Failed to negotiate with the NBD server\n"); - closesocket(sock); - return ret; - } - - /* Now that we're connected, set the socket to be non-blocking and - * kick the reply mechanism. */ - qemu_set_nonblock(sock); - qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL, - nbd_have_request, s); - - s->sock = sock; - s->size = size; - s->blocksize = blocksize; - - logout("Established connection with NBD server\n"); - return 0; -} - -static void nbd_teardown_connection(BlockDriverState *bs) -{ - BDRVNBDState *s = bs->opaque; - struct nbd_request request; - - request.type = NBD_CMD_DISC; - request.from = 0; - request.len = 0; - nbd_send_request(s->sock, &request); - - qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL, NULL); - closesocket(s->sock); + return sock; } -static int nbd_open(BlockDriverState *bs, QDict *options, int flags) +static int nbd_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVNBDState *s = bs->opaque; - int result; - - qemu_co_mutex_init(&s->send_mutex); - qemu_co_mutex_init(&s->free_sema); + char *export = NULL; + int result, sock; + Error *local_err = NULL; /* Pop the config into our state object. Exit if invalid. */ - result = nbd_config(s, options); - if (result != 0) { - return result; + nbd_config(s, options, &export, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return -EINVAL; } /* establish TCP connection, return error if it fails * TODO: Configurable retry-until-timeout behaviour. 
*/ - result = nbd_establish_connection(bs); - - return result; -} - -static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, - int offset) -{ - BDRVNBDState *s = bs->opaque; - struct nbd_request request; - struct nbd_reply reply; - ssize_t ret; - - request.type = NBD_CMD_READ; - request.from = sector_num * 512; - request.len = nb_sectors * 512; - - nbd_coroutine_start(s, &request); - ret = nbd_co_send_request(s, &request, NULL, 0); - if (ret < 0) { - reply.error = -ret; - } else { - nbd_co_receive_reply(s, &request, &reply, qiov, offset); - } - nbd_coroutine_end(s, &request); - return -reply.error; - -} - -static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, - int offset) -{ - BDRVNBDState *s = bs->opaque; - struct nbd_request request; - struct nbd_reply reply; - ssize_t ret; - - request.type = NBD_CMD_WRITE; - if (!bdrv_enable_write_cache(bs) && (s->nbdflags & NBD_FLAG_SEND_FUA)) { - request.type |= NBD_CMD_FLAG_FUA; + sock = nbd_establish_connection(bs, errp); + if (sock < 0) { + return sock; } - request.from = sector_num * 512; - request.len = nb_sectors * 512; - - nbd_coroutine_start(s, &request); - ret = nbd_co_send_request(s, &request, qiov, offset); - if (ret < 0) { - reply.error = -ret; - } else { - nbd_co_receive_reply(s, &request, &reply, NULL, 0); - } - nbd_coroutine_end(s, &request); - return -reply.error; + /* NBD handshake */ + result = nbd_client_session_init(&s->client, bs, sock, export); + g_free(export); + return result; } -/* qemu-nbd has a limit of slightly less than 1M per request. Try to - * remain aligned to 4K. */ -#define NBD_MAX_SECTORS 2040 - static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - int offset = 0; - int ret; - while (nb_sectors > NBD_MAX_SECTORS) { - ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); - if (ret < 0) { - return ret; - } - offset += NBD_MAX_SECTORS * 512; - sector_num += NBD_MAX_SECTORS; - nb_sectors -= NBD_MAX_SECTORS; - } - return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset); + BDRVNBDState *s = bs->opaque; + + return nbd_client_session_co_readv(&s->client, sector_num, + nb_sectors, qiov); } static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - int offset = 0; - int ret; - while (nb_sectors > NBD_MAX_SECTORS) { - ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); - if (ret < 0) { - return ret; - } - offset += NBD_MAX_SECTORS * 512; - sector_num += NBD_MAX_SECTORS; - nb_sectors -= NBD_MAX_SECTORS; - } - return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset); + BDRVNBDState *s = bs->opaque; + + return nbd_client_session_co_writev(&s->client, sector_num, + nb_sectors, qiov); } static int nbd_co_flush(BlockDriverState *bs) { BDRVNBDState *s = bs->opaque; - struct nbd_request request; - struct nbd_reply reply; - ssize_t ret; - - if (!(s->nbdflags & NBD_FLAG_SEND_FLUSH)) { - return 0; - } - - request.type = NBD_CMD_FLUSH; - if (s->nbdflags & NBD_FLAG_SEND_FUA) { - request.type |= NBD_CMD_FLAG_FUA; - } - - request.from = 0; - request.len = 0; - nbd_coroutine_start(s, &request); - ret = nbd_co_send_request(s, &request, NULL, 0); - if (ret < 0) { - reply.error = -ret; - } else { - nbd_co_receive_reply(s, &request, &reply, NULL, 0); - } - nbd_coroutine_end(s, &request); - return -reply.error; + return nbd_client_session_co_flush(&s->client); } static int 
nbd_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { BDRVNBDState *s = bs->opaque; - struct nbd_request request; - struct nbd_reply reply; - ssize_t ret; - if (!(s->nbdflags & NBD_FLAG_SEND_TRIM)) { - return 0; - } - request.type = NBD_CMD_TRIM; - request.from = sector_num * 512; - request.len = nb_sectors * 512; - - nbd_coroutine_start(s, &request); - ret = nbd_co_send_request(s, &request, NULL, 0); - if (ret < 0) { - reply.error = -ret; - } else { - nbd_co_receive_reply(s, &request, &reply, NULL, 0); - } - nbd_coroutine_end(s, &request); - return -reply.error; + return nbd_client_session_co_discard(&s->client, sector_num, + nb_sectors); } static void nbd_close(BlockDriverState *bs) { BDRVNBDState *s = bs->opaque; - g_free(s->export_name); - qemu_opts_del(s->socket_opts); - nbd_teardown_connection(bs); + qemu_opts_del(s->socket_opts); + nbd_client_session_close(&s->client); } static int64_t nbd_getlength(BlockDriverState *bs) { BDRVNBDState *s = bs->opaque; - return s->size; + return s->client.size; } static BlockDriver bdrv_nbd = { diff --git a/block/nfs.c b/block/nfs.c new file mode 100644 index 000000000..98aa363e4 --- /dev/null +++ b/block/nfs.c @@ -0,0 +1,442 @@ +/* + * QEMU Block driver for native access to files on NFS shares + * + * Copyright (c) 2014 Peter Lieven <pl@kamp.de> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "config-host.h" + +#include <poll.h> +#include "qemu-common.h" +#include "qemu/config-file.h" +#include "qemu/error-report.h" +#include "block/block_int.h" +#include "trace.h" +#include "qemu/iov.h" +#include "qemu/uri.h" +#include "sysemu/sysemu.h" +#include <nfsc/libnfs.h> + +typedef struct NFSClient { + struct nfs_context *context; + struct nfsfh *fh; + int events; + bool has_zero_init; +} NFSClient; + +typedef struct NFSRPC { + int ret; + int complete; + QEMUIOVector *iov; + struct stat *st; + Coroutine *co; + QEMUBH *bh; +} NFSRPC; + +static void nfs_process_read(void *arg); +static void nfs_process_write(void *arg); + +static void nfs_set_events(NFSClient *client) +{ + int ev = nfs_which_events(client->context); + if (ev != client->events) { + qemu_aio_set_fd_handler(nfs_get_fd(client->context), + (ev & POLLIN) ? nfs_process_read : NULL, + (ev & POLLOUT) ? 
nfs_process_write : NULL, + client); + + } + client->events = ev; +} + +static void nfs_process_read(void *arg) +{ + NFSClient *client = arg; + nfs_service(client->context, POLLIN); + nfs_set_events(client); +} + +static void nfs_process_write(void *arg) +{ + NFSClient *client = arg; + nfs_service(client->context, POLLOUT); + nfs_set_events(client); +} + +static void nfs_co_init_task(NFSClient *client, NFSRPC *task) +{ + *task = (NFSRPC) { + .co = qemu_coroutine_self(), + }; +} + +static void nfs_co_generic_bh_cb(void *opaque) +{ + NFSRPC *task = opaque; + qemu_bh_delete(task->bh); + qemu_coroutine_enter(task->co, NULL); +} + +static void +nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data, + void *private_data) +{ + NFSRPC *task = private_data; + task->complete = 1; + task->ret = ret; + if (task->ret > 0 && task->iov) { + if (task->ret <= task->iov->size) { + qemu_iovec_from_buf(task->iov, 0, data, task->ret); + } else { + task->ret = -EIO; + } + } + if (task->ret == 0 && task->st) { + memcpy(task->st, data, sizeof(struct stat)); + } + if (task->ret < 0) { + error_report("NFS Error: %s", nfs_get_error(nfs)); + } + if (task->co) { + task->bh = qemu_bh_new(nfs_co_generic_bh_cb, task); + qemu_bh_schedule(task->bh); + } +} + +static int coroutine_fn nfs_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + NFSClient *client = bs->opaque; + NFSRPC task; + + nfs_co_init_task(client, &task); + task.iov = iov; + + if (nfs_pread_async(client->context, client->fh, + sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE, + nfs_co_generic_cb, &task) != 0) { + return -ENOMEM; + } + + while (!task.complete) { + nfs_set_events(client); + qemu_coroutine_yield(); + } + + if (task.ret < 0) { + return task.ret; + } + + /* zero pad short reads */ + if (task.ret < iov->size) { + qemu_iovec_memset(iov, task.ret, 0, iov->size - task.ret); + } + + return 0; +} + +static int coroutine_fn nfs_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + NFSClient *client = bs->opaque; + NFSRPC task; + char *buf = NULL; + + nfs_co_init_task(client, &task); + + buf = g_malloc(nb_sectors * BDRV_SECTOR_SIZE); + qemu_iovec_to_buf(iov, 0, buf, nb_sectors * BDRV_SECTOR_SIZE); + + if (nfs_pwrite_async(client->context, client->fh, + sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE, + buf, nfs_co_generic_cb, &task) != 0) { + g_free(buf); + return -ENOMEM; + } + + while (!task.complete) { + nfs_set_events(client); + qemu_coroutine_yield(); + } + + g_free(buf); + + if (task.ret != nb_sectors * BDRV_SECTOR_SIZE) { + return task.ret < 0 ? 
task.ret : -EIO; + } + + return 0; +} + +static int coroutine_fn nfs_co_flush(BlockDriverState *bs) +{ + NFSClient *client = bs->opaque; + NFSRPC task; + + nfs_co_init_task(client, &task); + + if (nfs_fsync_async(client->context, client->fh, nfs_co_generic_cb, + &task) != 0) { + return -ENOMEM; + } + + while (!task.complete) { + nfs_set_events(client); + qemu_coroutine_yield(); + } + + return task.ret; +} + +/* TODO Convert to fine grained options */ +static QemuOptsList runtime_opts = { + .name = "nfs", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "URL to the NFS file", + }, + { /* end of list */ } + }, +}; + +static void nfs_client_close(NFSClient *client) +{ + if (client->context) { + if (client->fh) { + nfs_close(client->context, client->fh); + } + qemu_aio_set_fd_handler(nfs_get_fd(client->context), NULL, NULL, NULL); + nfs_destroy_context(client->context); + } + memset(client, 0, sizeof(NFSClient)); +} + +static void nfs_file_close(BlockDriverState *bs) +{ + NFSClient *client = bs->opaque; + nfs_client_close(client); +} + +static int64_t nfs_client_open(NFSClient *client, const char *filename, + int flags, Error **errp) +{ + int ret = -EINVAL, i; + struct stat st; + URI *uri; + QueryParams *qp = NULL; + char *file = NULL, *strp = NULL; + + uri = uri_parse(filename); + if (!uri) { + error_setg(errp, "Invalid URL specified"); + goto fail; + } + strp = strrchr(uri->path, '/'); + if (strp == NULL) { + error_setg(errp, "Invalid URL specified"); + goto fail; + } + file = g_strdup(strp); + *strp = 0; + + client->context = nfs_init_context(); + if (client->context == NULL) { + error_setg(errp, "Failed to init NFS context"); + goto fail; + } + + qp = query_params_parse(uri->query); + for (i = 0; i < qp->n; i++) { + if (!qp->p[i].value) { + error_setg(errp, "Value for NFS parameter expected: %s", + qp->p[i].name); + goto fail; + } + if (!strncmp(qp->p[i].name, "uid", 3)) { + nfs_set_uid(client->context, atoi(qp->p[i].value)); + } else if (!strncmp(qp->p[i].name, "gid", 3)) { + nfs_set_gid(client->context, atoi(qp->p[i].value)); + } else if (!strncmp(qp->p[i].name, "tcp-syncnt", 10)) { + nfs_set_tcp_syncnt(client->context, atoi(qp->p[i].value)); + } else { + error_setg(errp, "Unknown NFS parameter name: %s", + qp->p[i].name); + goto fail; + } + } + + ret = nfs_mount(client->context, uri->server, uri->path); + if (ret < 0) { + error_setg(errp, "Failed to mount nfs share: %s", + nfs_get_error(client->context)); + goto fail; + } + + if (flags & O_CREAT) { + ret = nfs_creat(client->context, file, 0600, &client->fh); + if (ret < 0) { + error_setg(errp, "Failed to create file: %s", + nfs_get_error(client->context)); + goto fail; + } + } else { + ret = nfs_open(client->context, file, flags, &client->fh); + if (ret < 0) { + error_setg(errp, "Failed to open file : %s", + nfs_get_error(client->context)); + goto fail; + } + } + + ret = nfs_fstat(client->context, client->fh, &st); + if (ret < 0) { + error_setg(errp, "Failed to fstat file: %s", + nfs_get_error(client->context)); + goto fail; + } + + ret = DIV_ROUND_UP(st.st_size, BDRV_SECTOR_SIZE); + client->has_zero_init = S_ISREG(st.st_mode); + goto out; +fail: + nfs_client_close(client); +out: + if (qp) { + query_params_free(qp); + } + uri_free(uri); + g_free(file); + return ret; +} + +static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { + NFSClient *client = bs->opaque; + int64_t ret; + QemuOpts *opts; + Error *local_err = NULL; + + 
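/* [Editor's aside; illustrative, not part of the patch] nfs_client_open()
 * above takes libnfs-style URLs; the only query parameters it accepts are
 * uid, gid and tcp-syncnt. A hypothetical invocation:
 *
 *     qemu-img info nfs://192.168.0.1/export/vm.raw?uid=0&gid=0
 *
 * and the synchronous libnfs sequence it wraps (error handling elided,
 * host and paths hypothetical):
 *
 *     struct nfs_context *ctx = nfs_init_context();
 *     struct nfsfh *fh;
 *     nfs_mount(ctx, "192.168.0.1", "/export");
 *     nfs_open(ctx, "/vm.raw", O_RDWR, &fh);
 */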
opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + error_propagate(errp, local_err); + return -EINVAL; + } + ret = nfs_client_open(client, qemu_opt_get(opts, "filename"), + (flags & BDRV_O_RDWR) ? O_RDWR : O_RDONLY, + errp); + if (ret < 0) { + return ret; + } + bs->total_sectors = ret; + return 0; +} + +static int nfs_file_create(const char *url, QEMUOptionParameter *options, + Error **errp) +{ + int ret = 0; + int64_t total_size = 0; + NFSClient *client = g_malloc0(sizeof(NFSClient)); + + /* Read out options */ + while (options && options->name) { + if (!strcmp(options->name, "size")) { + total_size = options->value.n; + } + options++; + } + + ret = nfs_client_open(client, url, O_CREAT, errp); + if (ret < 0) { + goto out; + } + ret = nfs_ftruncate(client->context, client->fh, total_size); + nfs_client_close(client); +out: + g_free(client); + return ret; +} + +static int nfs_has_zero_init(BlockDriverState *bs) +{ + NFSClient *client = bs->opaque; + return client->has_zero_init; +} + +static int64_t nfs_get_allocated_file_size(BlockDriverState *bs) +{ + NFSClient *client = bs->opaque; + NFSRPC task = {0}; + struct stat st; + + task.st = &st; + if (nfs_fstat_async(client->context, client->fh, nfs_co_generic_cb, + &task) != 0) { + return -ENOMEM; + } + + while (!task.complete) { + nfs_set_events(client); + qemu_aio_wait(); + } + + return (task.ret < 0 ? task.ret : st.st_blocks * st.st_blksize); +} + +static int nfs_file_truncate(BlockDriverState *bs, int64_t offset) +{ + NFSClient *client = bs->opaque; + return nfs_ftruncate(client->context, client->fh, offset); +} + +static BlockDriver bdrv_nfs = { + .format_name = "nfs", + .protocol_name = "nfs", + + .instance_size = sizeof(NFSClient), + .bdrv_needs_filename = true, + .bdrv_has_zero_init = nfs_has_zero_init, + .bdrv_get_allocated_file_size = nfs_get_allocated_file_size, + .bdrv_truncate = nfs_file_truncate, + + .bdrv_file_open = nfs_file_open, + .bdrv_close = nfs_file_close, + .bdrv_create = nfs_file_create, + + .bdrv_co_readv = nfs_co_readv, + .bdrv_co_writev = nfs_co_writev, + .bdrv_co_flush_to_disk = nfs_co_flush, +}; + +static void nfs_block_init(void) +{ + bdrv_register(&bdrv_nfs); +} + +block_init(nfs_block_init); diff --git a/block/parallels.c b/block/parallels.c index 18b3ac0b2..1a5bd350b 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -49,9 +49,9 @@ typedef struct BDRVParallelsState { CoMutex lock; uint32_t *catalog_bitmap; - int catalog_size; + unsigned int catalog_size; - int tracks; + unsigned int tracks; } BDRVParallelsState; static int parallels_probe(const uint8_t *buf, int buf_size, const char *filename) @@ -68,7 +68,8 @@ static int parallels_probe(const uint8_t *buf, int buf_size, const char *filenam return 0; } -static int parallels_open(BlockDriverState *bs, QDict *options, int flags) +static int parallels_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVParallelsState *s = bs->opaque; int i; @@ -84,15 +85,26 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags) if (memcmp(ph.magic, HEADER_MAGIC, 16) || (le32_to_cpu(ph.version) != HEADER_VERSION)) { - ret = -EMEDIUMTYPE; + error_setg(errp, "Image not in Parallels format"); + ret = -EINVAL; goto fail; } bs->total_sectors = le32_to_cpu(ph.nb_sectors); s->tracks = le32_to_cpu(ph.tracks); + if (s->tracks == 0) { + error_setg(errp, "Invalid image: Zero sectors per track"); + ret = -EINVAL; + goto fail; + } 
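/* [Editor's aside; illustrative, not part of the patch] The INT_MAX / 4 bound
 * added below protects the subsequent g_malloc(s->catalog_size * 4) and
 * bdrv_pread(..., s->catalog_size * 4) from integer overflow when a corrupt
 * or crafted Parallels header advertises a huge catalog. The guard, spelled
 * out:
 *
 *     uint32_t entries = le32_to_cpu(ph.catalog_entries);
 *     if (entries > INT_MAX / 4) {   // 4 bytes per catalog bitmap entry
 *         return -EFBIG;             // reject instead of wrapping the size
 *     }
 */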
s->catalog_size = le32_to_cpu(ph.catalog_entries); + if (s->catalog_size > INT_MAX / 4) { + error_setg(errp, "Catalog too large"); + ret = -EFBIG; + goto fail; + } s->catalog_bitmap = g_malloc(s->catalog_size * 4); ret = bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4); diff --git a/block/qapi.c b/block/qapi.c index a4bc4113b..8f2b4dbe7 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -25,6 +25,63 @@ #include "block/qapi.h" #include "block/block_int.h" #include "qmp-commands.h" +#include "qapi-visit.h" +#include "qapi/qmp-output-visitor.h" +#include "qapi/qmp/types.h" + +BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) +{ + BlockDeviceInfo *info = g_malloc0(sizeof(*info)); + + info->file = g_strdup(bs->filename); + info->ro = bs->read_only; + info->drv = g_strdup(bs->drv->format_name); + info->encrypted = bs->encrypted; + info->encryption_key_missing = bdrv_key_required(bs); + + if (bs->node_name[0]) { + info->has_node_name = true; + info->node_name = g_strdup(bs->node_name); + } + + if (bs->backing_file[0]) { + info->has_backing_file = true; + info->backing_file = g_strdup(bs->backing_file); + } + + info->backing_file_depth = bdrv_get_backing_file_depth(bs); + + if (bs->io_limits_enabled) { + ThrottleConfig cfg; + throttle_get_config(&bs->throttle_state, &cfg); + info->bps = cfg.buckets[THROTTLE_BPS_TOTAL].avg; + info->bps_rd = cfg.buckets[THROTTLE_BPS_READ].avg; + info->bps_wr = cfg.buckets[THROTTLE_BPS_WRITE].avg; + + info->iops = cfg.buckets[THROTTLE_OPS_TOTAL].avg; + info->iops_rd = cfg.buckets[THROTTLE_OPS_READ].avg; + info->iops_wr = cfg.buckets[THROTTLE_OPS_WRITE].avg; + + info->has_bps_max = cfg.buckets[THROTTLE_BPS_TOTAL].max; + info->bps_max = cfg.buckets[THROTTLE_BPS_TOTAL].max; + info->has_bps_rd_max = cfg.buckets[THROTTLE_BPS_READ].max; + info->bps_rd_max = cfg.buckets[THROTTLE_BPS_READ].max; + info->has_bps_wr_max = cfg.buckets[THROTTLE_BPS_WRITE].max; + info->bps_wr_max = cfg.buckets[THROTTLE_BPS_WRITE].max; + + info->has_iops_max = cfg.buckets[THROTTLE_OPS_TOTAL].max; + info->iops_max = cfg.buckets[THROTTLE_OPS_TOTAL].max; + info->has_iops_rd_max = cfg.buckets[THROTTLE_OPS_READ].max; + info->iops_rd_max = cfg.buckets[THROTTLE_OPS_READ].max; + info->has_iops_wr_max = cfg.buckets[THROTTLE_OPS_WRITE].max; + info->iops_wr_max = cfg.buckets[THROTTLE_OPS_WRITE].max; + + info->has_iops_size = cfg.op_size; + info->iops_size = cfg.op_size; + } + + return info; +} /* * Returns 0 on success, with *p_list either set to describe snapshot @@ -134,6 +191,9 @@ void bdrv_query_image_info(BlockDriverState *bs, info->dirty_flag = bdi.is_dirty; info->has_dirty_flag = true; } + info->format_specific = bdrv_get_specific_info(bs); + info->has_format_specific = info->format_specific != NULL; + backing_filename = bs->backing_file; if (backing_filename[0] != '\0') { info->backing_filename = g_strdup(backing_filename); @@ -198,50 +258,20 @@ void bdrv_query_info(BlockDriverState *bs, info->io_status = bs->iostatus; } - if (bs->dirty_bitmap) { - info->has_dirty = true; - info->dirty = g_malloc0(sizeof(*info->dirty)); - info->dirty->count = bdrv_get_dirty_count(bs) * BDRV_SECTOR_SIZE; - info->dirty->granularity = - ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bs->dirty_bitmap)); + if (!QLIST_EMPTY(&bs->dirty_bitmaps)) { + info->has_dirty_bitmaps = true; + info->dirty_bitmaps = bdrv_query_dirty_bitmaps(bs); } if (bs->drv) { info->has_inserted = true; - info->inserted = g_malloc0(sizeof(*info->inserted)); - info->inserted->file = g_strdup(bs->filename); - 
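/* [Editor's aside; illustrative, not part of the patch] The new
 * bdrv_block_device_info() above follows the QAPI optional-member convention:
 * every optional field f travels as a (has_f, f) pair, so an unconfigured
 * throttle bucket (max == 0) is simply reported as absent:
 *
 *     info->has_bps_max = cfg.buckets[THROTTLE_BPS_TOTAL].max;  // 0 => absent
 *     info->bps_max     = cfg.buckets[THROTTLE_BPS_TOTAL].max;
 */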
info->inserted->ro = bs->read_only; - info->inserted->drv = g_strdup(bs->drv->format_name); - info->inserted->encrypted = bs->encrypted; - info->inserted->encryption_key_missing = bdrv_key_required(bs); - - if (bs->backing_file[0]) { - info->inserted->has_backing_file = true; - info->inserted->backing_file = g_strdup(bs->backing_file); - } - - info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs); - - if (bs->io_limits_enabled) { - info->inserted->bps = - bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; - info->inserted->bps_rd = - bs->io_limits.bps[BLOCK_IO_LIMIT_READ]; - info->inserted->bps_wr = - bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE]; - info->inserted->iops = - bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; - info->inserted->iops_rd = - bs->io_limits.iops[BLOCK_IO_LIMIT_READ]; - info->inserted->iops_wr = - bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE]; - } + info->inserted = bdrv_block_device_info(bs); bs0 = bs; p_image_info = &info->inserted->image; while (1) { bdrv_query_image_info(bs0, p_image_info, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { error_propagate(errp, local_err); goto err; } @@ -289,6 +319,11 @@ BlockStats *bdrv_query_stats(const BlockDriverState *bs) s->parent = bdrv_query_stats(bs->file); } + if (bs->backing_hd) { + s->has_backing = true; + s->backing = bdrv_query_stats(bs->backing_hd); + } + return s; } @@ -301,7 +336,7 @@ BlockInfoList *qmp_query_block(Error **errp) while ((bs = bdrv_next(bs))) { BlockInfoList *info = g_malloc0(sizeof(*info)); bdrv_query_info(bs, &info->value, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { error_propagate(errp, local_err); goto err; } @@ -397,6 +432,119 @@ void bdrv_snapshot_dump(fprintf_function func_fprintf, void *f, } } +static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation, + QDict *dict); +static void dump_qlist(fprintf_function func_fprintf, void *f, int indentation, + QList *list); + +static void dump_qobject(fprintf_function func_fprintf, void *f, + int comp_indent, QObject *obj) +{ + switch (qobject_type(obj)) { + case QTYPE_QINT: { + QInt *value = qobject_to_qint(obj); + func_fprintf(f, "%" PRId64, qint_get_int(value)); + break; + } + case QTYPE_QSTRING: { + QString *value = qobject_to_qstring(obj); + func_fprintf(f, "%s", qstring_get_str(value)); + break; + } + case QTYPE_QDICT: { + QDict *value = qobject_to_qdict(obj); + dump_qdict(func_fprintf, f, comp_indent, value); + break; + } + case QTYPE_QLIST: { + QList *value = qobject_to_qlist(obj); + dump_qlist(func_fprintf, f, comp_indent, value); + break; + } + case QTYPE_QFLOAT: { + QFloat *value = qobject_to_qfloat(obj); + func_fprintf(f, "%g", qfloat_get_double(value)); + break; + } + case QTYPE_QBOOL: { + QBool *value = qobject_to_qbool(obj); + func_fprintf(f, "%s", qbool_get_int(value) ? "true" : "false"); + break; + } + case QTYPE_QERROR: { + QString *value = qerror_human((QError *)obj); + func_fprintf(f, "%s", qstring_get_str(value)); + break; + } + case QTYPE_NONE: + break; + case QTYPE_MAX: + default: + abort(); + } +} + +static void dump_qlist(fprintf_function func_fprintf, void *f, int indentation, + QList *list) +{ + const QListEntry *entry; + int i = 0; + + for (entry = qlist_first(list); entry; entry = qlist_next(entry), i++) { + qtype_code type = qobject_type(entry->value); + bool composite = (type == QTYPE_QDICT || type == QTYPE_QLIST); + const char *format = composite ? 
"%*s[%i]:\n" : "%*s[%i]: "; + + func_fprintf(f, format, indentation * 4, "", i); + dump_qobject(func_fprintf, f, indentation + 1, entry->value); + if (!composite) { + func_fprintf(f, "\n"); + } + } +} + +static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation, + QDict *dict) +{ + const QDictEntry *entry; + + for (entry = qdict_first(dict); entry; entry = qdict_next(dict, entry)) { + qtype_code type = qobject_type(entry->value); + bool composite = (type == QTYPE_QDICT || type == QTYPE_QLIST); + const char *format = composite ? "%*s%s:\n" : "%*s%s: "; + char key[strlen(entry->key) + 1]; + int i; + + /* replace dashes with spaces in key (variable) names */ + for (i = 0; entry->key[i]; i++) { + key[i] = entry->key[i] == '-' ? ' ' : entry->key[i]; + } + key[i] = 0; + + func_fprintf(f, format, indentation * 4, "", key); + dump_qobject(func_fprintf, f, indentation + 1, entry->value); + if (!composite) { + func_fprintf(f, "\n"); + } + } +} + +void bdrv_image_info_specific_dump(fprintf_function func_fprintf, void *f, + ImageInfoSpecific *info_spec) +{ + Error *local_err = NULL; + QmpOutputVisitor *ov = qmp_output_visitor_new(); + QObject *obj, *data; + + visit_type_ImageInfoSpecific(qmp_output_get_visitor(ov), &info_spec, NULL, + &local_err); + obj = qmp_output_get_qobject(ov); + assert(qobject_type(obj) == QTYPE_QDICT); + data = qdict_get(qobject_to_qdict(obj), "data"); + dump_qobject(func_fprintf, f, 1, data); + qmp_output_visitor_cleanup(ov); +} + void bdrv_image_info_dump(fprintf_function func_fprintf, void *f, ImageInfo *info) { @@ -467,4 +615,9 @@ void bdrv_image_info_dump(fprintf_function func_fprintf, void *f, func_fprintf(f, "\n"); } } + + if (info->has_format_specific) { + func_fprintf(f, "Format specific information:\n"); + bdrv_image_info_specific_dump(func_fprintf, f, info->format_specific); + } } diff --git a/block/qcow.c b/block/qcow.c index 5239bd68f..d5a7d5fd1 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -92,7 +92,8 @@ static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int qcow_open(BlockDriverState *bs, QDict *options, int flags) +static int qcow_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVQcowState *s = bs->opaque; int len, i, shift, ret; @@ -112,23 +113,26 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags) be64_to_cpus(&header.l1_table_offset); if (header.magic != QCOW_MAGIC) { - ret = -EMEDIUMTYPE; + error_setg(errp, "Image not in qcow format"); + ret = -EINVAL; goto fail; } if (header.version != QCOW_VERSION) { char version[64]; snprintf(version, sizeof(version), "QCOW version %d", header.version); - qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bs->device_name, "qcow", version); + error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bs->device_name, "qcow", version); ret = -ENOTSUP; goto fail; } if (header.size <= 1 || header.cluster_bits < 9) { + error_setg(errp, "invalid value in qcow header"); ret = -EINVAL; goto fail; } if (header.crypt_method > QCOW_CRYPT_AES) { + error_setg(errp, "invalid encryption method in qcow header"); ret = -EINVAL; goto fail; } @@ -395,7 +399,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, return cluster_offset; } -static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { BDRVQcowState *s = bs->opaque; @@ -410,7 +414,14 @@ static int coroutine_fn 
qcow_co_is_allocated(BlockDriverState *bs, if (n > nb_sectors) n = nb_sectors; *pnum = n; - return (cluster_offset != 0); + if (!cluster_offset) { + return 0; + } + if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->crypt_method) { + return BDRV_BLOCK_DATA; + } + cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | cluster_offset; } static int decompress_buffer(uint8_t *out_buf, int out_buf_size, @@ -651,7 +662,8 @@ static void qcow_close(BlockDriverState *bs) error_free(s->migration_blocker); } -static int qcow_create(const char *filename, QEMUOptionParameter *options) +static int qcow_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int header_size, backing_filename_len, l1_size, shift, i; QCowHeader header; @@ -659,6 +671,7 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) int64_t total_size = 0; const char *backing_file = NULL; int flags = 0; + Error *local_err = NULL; int ret; BlockDriverState *qcow_bs; @@ -674,13 +687,17 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) options++; } - ret = bdrv_create_file(filename, options); + ret = bdrv_create_file(filename, options, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } - ret = bdrv_file_open(&qcow_bs, filename, NULL, BDRV_O_RDWR); + qcow_bs = NULL; + ret = bdrv_open(&qcow_bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } @@ -706,7 +723,7 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) backing_file = NULL; } header.cluster_bits = 9; /* 512 byte cluster to avoid copying - unmodifyed sectors */ + unmodified sectors */ header.l2_bits = 12; /* 32 KB L2 tables */ } else { header.cluster_bits = 12; /* 4 KB clusters */ @@ -751,7 +768,7 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) g_free(tmp); ret = 0; exit: - bdrv_delete(qcow_bs); + bdrv_unref(qcow_bs); return ret; } @@ -896,7 +913,7 @@ static BlockDriver bdrv_qcow = { .bdrv_co_readv = qcow_co_readv, .bdrv_co_writev = qcow_co_writev, - .bdrv_co_is_allocated = qcow_co_is_allocated, + .bdrv_co_get_block_status = qcow_co_get_block_status, .bdrv_set_key = qcow_set_key, .bdrv_make_empty = qcow_make_empty, diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c index 2f3114ecc..8ecbb5bc0 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -115,6 +115,21 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) } if (c == s->refcount_block_cache) { + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_BLOCK, + c->entries[i].offset, s->cluster_size); + } else if (c == s->l2_table_cache) { + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2, + c->entries[i].offset, s->cluster_size); + } else { + ret = qcow2_pre_write_overlap_check(bs, 0, + c->entries[i].offset, s->cluster_size); + } + + if (ret < 0) { + return ret; + } + + if (c == s->refcount_block_cache) { BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART); } else if (c == s->l2_table_cache) { BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); @@ -185,6 +200,24 @@ void qcow2_cache_depends_on_flush(Qcow2Cache *c) c->depends_on_flush = true; } +int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c) +{ + int ret, i; + + ret = qcow2_cache_flush(bs, c); + if (ret < 0) { + return ret; + } + + for (i = 0; i < c->size; i++) { + assert(c->entries[i].ref == 0); + c->entries[i].offset = 0; + 
c->entries[i].cache_hits = 0; + } + + return 0; +} + static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c) { int i; diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index cca76d4fc..331ab0802 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -35,6 +35,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, BDRVQcowState *s = bs->opaque; int new_l1_size2, ret, i; uint64_t *new_l1_table; + int64_t old_l1_table_offset, old_l1_size; int64_t new_l1_table_offset, new_l1_size; uint8_t data[12]; @@ -54,7 +55,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, } } - if (new_l1_size > INT_MAX) { + if (new_l1_size > INT_MAX / sizeof(uint64_t)) { return -EFBIG; } @@ -80,6 +81,14 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, goto fail; } + /* the L1 position has not yet been updated, so these clusters must + * indeed be completely free */ + ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset, + new_l1_size2); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); for(i = 0; i < s->l1_size; i++) new_l1_table[i] = cpu_to_be64(new_l1_table[i]); @@ -92,17 +101,19 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, /* set new table */ BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE); cpu_to_be32w((uint32_t*)data, new_l1_size); - cpu_to_be64wu((uint64_t*)(data + 4), new_l1_table_offset); + stq_be_p(data + 4, new_l1_table_offset); ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data,sizeof(data)); if (ret < 0) { goto fail; } g_free(s->l1_table); - qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t), - QCOW2_DISCARD_OTHER); + old_l1_table_offset = s->l1_table_offset; s->l1_table_offset = new_l1_table_offset; s->l1_table = new_l1_table; + old_l1_size = s->l1_size; s->l1_size = new_l1_size; + qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t), + QCOW2_DISCARD_OTHER); return 0; fail: g_free(new_l1_table); @@ -137,7 +148,7 @@ static int l2_load(BlockDriverState *bs, uint64_t l2_offset, * and we really don't want bdrv_pread to perform a read-modify-write) */ #define L1_ENTRIES_PER_SECTOR (512 / 8) -static int write_l1_entry(BlockDriverState *bs, int l1_index) +int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index) { BDRVQcowState *s = bs->opaque; uint64_t buf[L1_ENTRIES_PER_SECTOR]; @@ -149,6 +160,12 @@ static int write_l1_entry(BlockDriverState *bs, int l1_index) buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]); } + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1, + s->l1_table_offset + 8 * l1_start_index, sizeof(buf)); + if (ret < 0) { + return ret; + } + BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index, buf, sizeof(buf)); @@ -173,7 +190,7 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) { BDRVQcowState *s = bs->opaque; uint64_t old_l2_offset; - uint64_t *l2_table; + uint64_t *l2_table = NULL; int64_t l2_offset; int ret; @@ -185,7 +202,8 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t)); if (l2_offset < 0) { - return l2_offset; + ret = l2_offset; + goto fail; } ret = qcow2_cache_flush(bs, s->refcount_block_cache); @@ -198,7 +216,7 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) trace_qcow2_l2_allocate_get_empty(bs, l1_index); ret = qcow2_cache_get_empty(bs, 
s->l2_table_cache, l2_offset, (void**) table); if (ret < 0) { - return ret; + goto fail; } l2_table = *table; @@ -239,7 +257,7 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) /* update the L1 entry */ trace_qcow2_l2_allocate_write_l1(bs, l1_index); s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED; - ret = write_l1_entry(bs, l1_index); + ret = qcow2_write_l1_entry(bs, l1_index); if (ret < 0) { goto fail; } @@ -250,8 +268,14 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) fail: trace_qcow2_l2_allocate_done(bs, l1_index, ret); - qcow2_cache_put(bs, s->l2_table_cache, (void**) table); + if (l2_table != NULL) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) table); + } s->l1_table[l1_index] = old_l2_offset; + if (l2_offset > 0) { + qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t), + QCOW2_DISCARD_ALWAYS); + } return ret; } @@ -263,23 +287,26 @@ fail: * cluster which may require a different handling) */ static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size, - uint64_t *l2_table, uint64_t start, uint64_t stop_flags) + uint64_t *l2_table, uint64_t stop_flags) { int i; - uint64_t mask = stop_flags | L2E_OFFSET_MASK; - uint64_t offset = be64_to_cpu(l2_table[0]) & mask; + uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED; + uint64_t first_entry = be64_to_cpu(l2_table[0]); + uint64_t offset = first_entry & mask; if (!offset) return 0; - for (i = start; i < start + nb_clusters; i++) { + assert(qcow2_get_cluster_type(first_entry) != QCOW2_CLUSTER_COMPRESSED); + + for (i = 0; i < nb_clusters; i++) { uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask; if (offset + (uint64_t) i * cluster_size != l2_entry) { break; } } - return (i - start); + return i; } static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table) @@ -332,15 +359,6 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, struct iovec iov; int n, ret; - /* - * If this is the last cluster and it is only partially used, we must only - * copy until the end of the image, or bdrv_check_request will fail for the - * bdrv_read/write calls below. - */ - if (start_sect + n_end > bs->total_sectors) { - n_end = bs->total_sectors - start_sect; - } - n = n_end - n_start; if (n <= 0) { return 0; @@ -353,6 +371,10 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); + if (!bs->drv) { + return -ENOMEDIUM; + } + /* Call .bdrv_co_readv() directly instead of using the public block-layer * interface. This avoids double I/O throttling and request tracking, * which can lead to deadlock when block layer copy-on-read is enabled. 
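The count_contiguous_clusters() rework above boils down to one rule: a run of L2 entries is contiguous as long as the masked bits (host offset, compression flag, plus any caller-supplied stop flags) keep lining up with "first offset + i * cluster_size". A minimal standalone sketch of that rule, assuming the entries have already been byte-swapped to host order and with the mask constants inlined from qcow2.h (count_contiguous is an illustrative name, not the QEMU function itself):

    #include <stdint.h>

    #define QCOW_OFLAG_COMPRESSED (1ULL << 62)          /* as in qcow2.h */
    #define L2E_OFFSET_MASK       0x00fffffffffffe00ULL /* as in qcow2.h */

    /* Count how many consecutive L2 entries map physically contiguous
     * clusters.  Compressed clusters always terminate the run, because
     * QCOW_OFLAG_COMPRESSED is unconditionally part of the mask. */
    static int count_contiguous(uint64_t nb_clusters, int cluster_size,
                                const uint64_t *l2_table, uint64_t stop_flags)
    {
        uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED;
        uint64_t offset = l2_table[0] & mask;  /* assumed host byte order */
        uint64_t i;

        if (!offset) {
            return 0;
        }
        for (i = 0; i < nb_clusters; i++) {
            if (offset + i * (uint64_t)cluster_size != (l2_table[i] & mask)) {
                break;
            }
        }
        return (int)i;
    }

The real helper additionally asserts that the first entry is not compressed; compressed runs are handled separately by its callers.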
@@ -368,6 +390,12 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, &s->aes_encrypt_key); } + ret = qcow2_pre_write_overlap_check(bs, 0, + cluster_offset + n_start * BDRV_SECTOR_SIZE, n * BDRV_SECTOR_SIZE); + if (ret < 0) { + goto out; + } + BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov); if (ret < 0) { @@ -463,11 +491,11 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, break; case QCOW2_CLUSTER_ZERO: if (s->qcow_version < 3) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); return -EIO; } c = count_contiguous_clusters(nb_clusters, s->cluster_size, - &l2_table[l2_index], 0, - QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); + &l2_table[l2_index], QCOW_OFLAG_ZERO); *cluster_offset = 0; break; case QCOW2_CLUSTER_UNALLOCATED: @@ -478,8 +506,7 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, case QCOW2_CLUSTER_NORMAL: /* how many allocated clusters ? */ c = count_contiguous_clusters(nb_clusters, s->cluster_size, - &l2_table[l2_index], 0, - QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); + &l2_table[l2_index], QCOW_OFLAG_ZERO); *cluster_offset &= L2E_OFFSET_MASK; break; default: @@ -695,6 +722,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) } qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + assert(l2_index + m->nb_clusters <= s->l2_size); for (i = 0; i < m->nb_clusters; i++) { /* if two concurrent writes happen to the same unallocated cluster * each write allocates separate cluster and writes data concurrently. @@ -908,7 +936,7 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, /* We keep all QCOW_OFLAG_COPIED clusters */ keep_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size, - &l2_table[l2_index], 0, + &l2_table[l2_index], QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO); assert(keep_clusters <= nb_clusters); @@ -1150,7 +1178,7 @@ fail: * Return 0 on success and -errno in error cases */ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, - int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m) + int *num, uint64_t *host_offset, QCowL2Meta **m) { BDRVQcowState *s = bs->opaque; uint64_t start, remaining; @@ -1158,15 +1186,13 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, uint64_t cur_bytes; int ret; - trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, - n_start, n_end); + trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *num); - assert(n_start * BDRV_SECTOR_SIZE == offset_into_cluster(s, offset)); - offset = start_of_cluster(s, offset); + assert((offset & ~BDRV_SECTOR_MASK) == 0); again: - start = offset + (n_start << BDRV_SECTOR_BITS); - remaining = (n_end - n_start) << BDRV_SECTOR_BITS; + start = offset; + remaining = *num << BDRV_SECTOR_BITS; cluster_offset = 0; *host_offset = 0; cur_bytes = 0; @@ -1252,7 +1278,7 @@ again: } } - *num = (n_end - n_start) - (remaining >> BDRV_SECTOR_BITS); + *num -= remaining >> BDRV_SECTOR_BITS; assert(*num > 0); assert(*host_offset != 0); @@ -1317,7 +1343,7 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) * clusters. 
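 * (Partial clusters at either edge are never discarded: the caller,
 * qcow2_discard_clusters(), first rounds the start up and the end down
 * to cluster boundaries.  With the default 64 KiB clusters, a request
 * covering [10 KiB, 200 KiB) thus shrinks to [64 KiB, 192 KiB), i.e.
 * clusters 1 and 2 only.)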
*/ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, - unsigned int nb_clusters) + unsigned int nb_clusters, enum qcow2_discard_type type) { BDRVQcowState *s = bs->opaque; uint64_t *l2_table; @@ -1337,16 +1363,34 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, uint64_t old_offset; old_offset = be64_to_cpu(l2_table[l2_index + i]); - if ((old_offset & L2E_OFFSET_MASK) == 0) { + + /* + * Make sure that a discarded area reads back as zeroes for v3 images + * (we cannot do it for v2 without actually writing a zero-filled + * buffer). We can skip the operation if the cluster is already marked + * as zero, or if it's unallocated and we don't have a backing file. + * + * TODO We might want to use bdrv_get_block_status(bs) here, but we're + * holding s->lock, so that doesn't work today. + */ + if (old_offset & QCOW_OFLAG_ZERO) { + continue; + } + + if ((old_offset & L2E_OFFSET_MASK) == 0 && !bs->backing_hd) { continue; } /* First remove L2 entries */ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); - l2_table[l2_index + i] = cpu_to_be64(0); + if (s->qcow_version >= 3) { + l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); + } else { + l2_table[l2_index + i] = cpu_to_be64(0); + } /* Then decrease the refcount */ - qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); + qcow2_free_any_clusters(bs, old_offset, 1, type); } ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); @@ -1358,7 +1402,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, } int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, - int nb_sectors) + int nb_sectors, enum qcow2_discard_type type) { BDRVQcowState *s = bs->opaque; uint64_t end_offset; @@ -1369,7 +1413,7 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, /* Round start up and end down */ offset = align_offset(offset, s->cluster_size); - end_offset &= ~(s->cluster_size - 1); + end_offset = start_of_cluster(s, end_offset); if (offset > end_offset) { return 0; @@ -1381,7 +1425,7 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, /* Each L2 table is handled by its own loop iteration */ while (nb_clusters > 0) { - ret = discard_single_l2(bs, offset, nb_clusters); + ret = discard_single_l2(bs, offset, nb_clusters, type); if (ret < 0) { goto fail; } @@ -1476,3 +1520,255 @@ fail: return ret; } + +/* + * Expands all zero clusters in a specific L1 table (or deallocates them, for + * non-backed non-pre-allocated zero clusters). + * + * expanded_clusters is a bitmap where every bit corresponds to one cluster in + * the image file; a bit gets set if the corresponding cluster has been used for + * zero expansion (i.e., has been filled with zeroes and is referenced from an + * L2 table). nb_clusters contains the total cluster count of the image file, + * i.e., the number of bits in expanded_clusters. 
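+ *
+ * With this layout, testing and setting the bit for a given cluster_index
+ * is plain byte/bit arithmetic, exactly as done in the function body:
+ *
+ *     if ((*expanded_clusters)[cluster_index / 8] &
+ *             (1 << (cluster_index % 8))) {
+ *         ... this cluster has already been zero-expanded ...
+ *     }
+ *     (*expanded_clusters)[cluster_index / 8] |= 1 << (cluster_index % 8);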
+ */ +static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, + int l1_size, uint8_t **expanded_clusters, + uint64_t *nb_clusters) +{ + BDRVQcowState *s = bs->opaque; + bool is_active_l1 = (l1_table == s->l1_table); + uint64_t *l2_table = NULL; + int ret; + int i, j; + + if (!is_active_l1) { + /* inactive L2 tables require a buffer to be stored in when loading + * them from disk */ + l2_table = qemu_blockalign(bs, s->cluster_size); + } + + for (i = 0; i < l1_size; i++) { + uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK; + bool l2_dirty = false; + + if (!l2_offset) { + /* unallocated */ + continue; + } + + if (is_active_l1) { + /* get active L2 tables from cache */ + ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, + (void **)&l2_table); + } else { + /* load inactive L2 tables from disk */ + ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE, + (void *)l2_table, s->cluster_sectors); + } + if (ret < 0) { + goto fail; + } + + for (j = 0; j < s->l2_size; j++) { + uint64_t l2_entry = be64_to_cpu(l2_table[j]); + int64_t offset = l2_entry & L2E_OFFSET_MASK, cluster_index; + int cluster_type = qcow2_get_cluster_type(l2_entry); + bool preallocated = offset != 0; + + if (cluster_type == QCOW2_CLUSTER_NORMAL) { + cluster_index = offset >> s->cluster_bits; + assert((cluster_index >= 0) && (cluster_index < *nb_clusters)); + if ((*expanded_clusters)[cluster_index / 8] & + (1 << (cluster_index % 8))) { + /* Probably a shared L2 table; this cluster was a zero + * cluster which has been expanded, its refcount + * therefore most likely requires an update. */ + ret = qcow2_update_cluster_refcount(bs, cluster_index, 1, + QCOW2_DISCARD_NEVER); + if (ret < 0) { + goto fail; + } + /* Since we just increased the refcount, the COPIED flag may + * no longer be set. 
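+ * QCOW_OFLAG_COPIED is only valid while the refcount is exactly 1; after
+ * the increment above it is at least 2, so the flag has to be cleared.
+ * This is the same invariant that check_oflag_copied() in
+ * qcow2-refcount.c verifies and repairs.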
*/ + l2_table[j] = cpu_to_be64(l2_entry & ~QCOW_OFLAG_COPIED); + l2_dirty = true; + } + continue; + } + else if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_ZERO) { + continue; + } + + if (!preallocated) { + if (!bs->backing_hd) { + /* not backed; therefore we can simply deallocate the + * cluster */ + l2_table[j] = 0; + l2_dirty = true; + continue; + } + + offset = qcow2_alloc_clusters(bs, s->cluster_size); + if (offset < 0) { + ret = offset; + goto fail; + } + } + + ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size); + if (ret < 0) { + if (!preallocated) { + qcow2_free_clusters(bs, offset, s->cluster_size, + QCOW2_DISCARD_ALWAYS); + } + goto fail; + } + + ret = bdrv_write_zeroes(bs->file, offset / BDRV_SECTOR_SIZE, + s->cluster_sectors, 0); + if (ret < 0) { + if (!preallocated) { + qcow2_free_clusters(bs, offset, s->cluster_size, + QCOW2_DISCARD_ALWAYS); + } + goto fail; + } + + l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED); + l2_dirty = true; + + cluster_index = offset >> s->cluster_bits; + + if (cluster_index >= *nb_clusters) { + uint64_t old_bitmap_size = (*nb_clusters + 7) / 8; + uint64_t new_bitmap_size; + /* The offset may lie beyond the old end of the underlying image + * file for growable files only */ + assert(bs->file->growable); + *nb_clusters = size_to_clusters(s, bs->file->total_sectors * + BDRV_SECTOR_SIZE); + new_bitmap_size = (*nb_clusters + 7) / 8; + *expanded_clusters = g_realloc(*expanded_clusters, + new_bitmap_size); + /* clear the newly allocated space */ + memset(&(*expanded_clusters)[old_bitmap_size], 0, + new_bitmap_size - old_bitmap_size); + } + + assert((cluster_index >= 0) && (cluster_index < *nb_clusters)); + (*expanded_clusters)[cluster_index / 8] |= 1 << (cluster_index % 8); + } + + if (is_active_l1) { + if (l2_dirty) { + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + qcow2_cache_depends_on_flush(s->l2_table_cache); + } + ret = qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table); + if (ret < 0) { + l2_table = NULL; + goto fail; + } + } else { + if (l2_dirty) { + ret = qcow2_pre_write_overlap_check(bs, + QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset, + s->cluster_size); + if (ret < 0) { + goto fail; + } + + ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE, + (void *)l2_table, s->cluster_sectors); + if (ret < 0) { + goto fail; + } + } + } + } + + ret = 0; + +fail: + if (l2_table) { + if (!is_active_l1) { + qemu_vfree(l2_table); + } else { + if (ret < 0) { + qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table); + } else { + ret = qcow2_cache_put(bs, s->l2_table_cache, + (void **)&l2_table); + } + } + } + return ret; +} + +/* + * For backed images, expands all zero clusters on the image. For non-backed + * images, deallocates all non-pre-allocated zero clusters (and claims the + * allocation for pre-allocated ones). This is important for downgrading to a + * qcow2 version which doesn't yet support metadata zero clusters. 
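+ *
+ * A typical trigger for this function is the image downgrade path, e.g.
+ *
+ *     qemu-img amend -o compat=0.10 image.qcow2
+ *
+ * since the v2 (compat=0.10) format has no zero-cluster L2 entry type.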
+ */ +int qcow2_expand_zero_clusters(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l1_table = NULL; + uint64_t nb_clusters; + uint8_t *expanded_clusters; + int ret; + int i, j; + + nb_clusters = size_to_clusters(s, bs->file->total_sectors * + BDRV_SECTOR_SIZE); + expanded_clusters = g_malloc0((nb_clusters + 7) / 8); + + ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size, + &expanded_clusters, &nb_clusters); + if (ret < 0) { + goto fail; + } + + /* Inactive L1 tables may point to active L2 tables - therefore it is + * necessary to flush the L2 table cache before trying to access the L2 + * tables pointed to by inactive L1 entries (else we might try to expand + * zero clusters that have already been expanded); furthermore, it is also + * necessary to empty the L2 table cache, since it may contain tables which + * are now going to be modified directly on disk, bypassing the cache. + * qcow2_cache_empty() does both for us. */ + ret = qcow2_cache_empty(bs, s->l2_table_cache); + if (ret < 0) { + goto fail; + } + + for (i = 0; i < s->nb_snapshots; i++) { + int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) + + BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE; + + l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE); + + ret = bdrv_read(bs->file, s->snapshots[i].l1_table_offset / + BDRV_SECTOR_SIZE, (void *)l1_table, l1_sectors); + if (ret < 0) { + goto fail; + } + + for (j = 0; j < s->snapshots[i].l1_size; j++) { + be64_to_cpus(&l1_table[j]); + } + + ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size, + &expanded_clusters, &nb_clusters); + if (ret < 0) { + goto fail; + } + } + + ret = 0; + +fail: + g_free(expanded_clusters); + g_free(l1_table); + return ret; +} diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 1244693f3..a37ee4501 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -25,8 +25,10 @@ #include "qemu-common.h" #include "block/block_int.h" #include "block/qcow2.h" +#include "qemu/range.h" +#include "qapi/qmp/types.h" -static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size); +static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size); static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, int64_t offset, int64_t length, int addend, enum qcow2_discard_type type); @@ -38,8 +40,10 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, int qcow2_refcount_init(BlockDriverState *bs) { BDRVQcowState *s = bs->opaque; - int ret, refcount_table_size2, i; + unsigned int refcount_table_size2, i; + int ret; + assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t)); refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t); s->refcount_table = g_malloc(refcount_table_size2); if (s->refcount_table_size > 0) { @@ -85,7 +89,7 @@ static int load_refcount_block(BlockDriverState *bs, static int get_refcount(BlockDriverState *bs, int64_t cluster_index) { BDRVQcowState *s = bs->opaque; - int refcount_table_index, block_index; + uint64_t refcount_table_index, block_index; int64_t refcount_block_offset; int ret; uint16_t *refcount_block; @@ -94,7 +98,8 @@ static int get_refcount(BlockDriverState *bs, int64_t cluster_index) refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); if (refcount_table_index >= s->refcount_table_size) return 0; - refcount_block_offset = s->refcount_table[refcount_table_index]; + refcount_block_offset = + s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK; if 
(!refcount_block_offset) return 0; @@ -189,10 +194,11 @@ static int alloc_refcount_block(BlockDriverState *bs, * they can describe them themselves. * * - We need to consider that at this point we are inside update_refcounts - * and doing the initial refcount increase. This means that some clusters - * have already been allocated by the caller, but their refcount isn't - * accurate yet. free_cluster_index tells us where this allocation ends - * as long as we don't overwrite it by freeing clusters. + * and potentially doing an initial refcount increase. This means that + * some clusters have already been allocated by the caller, but their + * refcount isn't accurate yet. If we allocate clusters for metadata, we + * need to return -EAGAIN to signal the caller that it needs to restart + * the search for free clusters. * * - alloc_clusters_noref and qcow2_free_clusters may load a different * refcount block into the cache @@ -277,7 +283,10 @@ static int alloc_refcount_block(BlockDriverState *bs, } s->refcount_table[refcount_table_index] = new_block; - return 0; + + /* The new refcount block may be where the caller intended to put its + * data, so let it restart the search. */ + return -EAGAIN; } ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block); @@ -300,8 +309,11 @@ static int alloc_refcount_block(BlockDriverState *bs, /* Calculate the number of refcount blocks needed so far */ uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT); - uint64_t blocks_used = (s->free_cluster_index + - refcount_block_clusters - 1) / refcount_block_clusters; + uint64_t blocks_used = DIV_ROUND_UP(cluster_index, refcount_block_clusters); + + if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) { + return -EFBIG; + } /* And now we need at least one block more for the new metadata */ uint64_t table_size = next_refcount_table_size(s, blocks_used + 1); @@ -334,8 +346,6 @@ static int alloc_refcount_block(BlockDriverState *bs, uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size); uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t)); - assert(meta_offset >= (s->free_cluster_index * s->cluster_size)); - /* Fill the new refcount table */ memcpy(new_table, s->refcount_table, s->refcount_table_size * sizeof(uint64_t)); @@ -398,18 +408,19 @@ static int alloc_refcount_block(BlockDriverState *bs, s->refcount_table_size = table_size; s->refcount_table_offset = table_offset; - /* Free old table. Remember, we must not change free_cluster_index */ - uint64_t old_free_cluster_index = s->free_cluster_index; + /* Free old table. */ qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t), QCOW2_DISCARD_OTHER); - s->free_cluster_index = old_free_cluster_index; ret = load_refcount_block(bs, new_block, (void**) refcount_block); if (ret < 0) { return ret; } - return 0; + /* If we were trying to do the initial refcount update for some cluster + * allocation, we might have used the same clusters to store newly + * allocated metadata. Make the caller search some new space. 
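+ *
+ * Callers therefore retry the whole allocation; qcow2_alloc_clusters()
+ * below does exactly this:
+ *
+ *     do {
+ *         offset = alloc_clusters_noref(bs, size);
+ *         if (offset < 0) {
+ *             return offset;
+ *         }
+ *         ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
+ *     } while (ret == -EAGAIN);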
*/ + return -EAGAIN; fail_table: g_free(new_table); @@ -513,8 +524,8 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, s->l2_table_cache); } - start = offset & ~(s->cluster_size - 1); - last = (offset + length - 1) & ~(s->cluster_size - 1); + start = start_of_cluster(s, offset); + last = start_of_cluster(s, offset + length - 1); for(cluster_offset = start; cluster_offset <= last; cluster_offset += s->cluster_size) { @@ -599,10 +610,10 @@ fail: * If the return value is non-negative, it is the new refcount of the cluster. * If it is negative, it is -errno and indicates an error. */ -static int update_cluster_refcount(BlockDriverState *bs, - int64_t cluster_index, - int addend, - enum qcow2_discard_type type) +int qcow2_update_cluster_refcount(BlockDriverState *bs, + int64_t cluster_index, + int addend, + enum qcow2_discard_type type) { BDRVQcowState *s = bs->opaque; int ret; @@ -624,15 +635,16 @@ static int update_cluster_refcount(BlockDriverState *bs, /* return < 0 if error */ -static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size) +static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size) { BDRVQcowState *s = bs->opaque; - int i, nb_clusters, refcount; + uint64_t i, nb_clusters; + int refcount; nb_clusters = size_to_clusters(s, size); retry: for(i = 0; i < nb_clusters; i++) { - int64_t next_cluster_index = s->free_cluster_index++; + uint64_t next_cluster_index = s->free_cluster_index++; refcount = get_refcount(bs, next_cluster_index); if (refcount < 0) { @@ -649,18 +661,21 @@ retry: return (s->free_cluster_index - nb_clusters) << s->cluster_bits; } -int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size) +int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size) { int64_t offset; int ret; BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC); - offset = alloc_clusters_noref(bs, size); - if (offset < 0) { - return offset; - } + do { + offset = alloc_clusters_noref(bs, size); + if (offset < 0) { + return offset; + } + + ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER); + } while (ret == -EAGAIN); - ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER); if (ret < 0) { return ret; } @@ -673,33 +688,36 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, { BDRVQcowState *s = bs->opaque; uint64_t cluster_index; - uint64_t old_free_cluster_index; - int i, refcount, ret; + uint64_t i; + int refcount, ret; - /* Check how many clusters there are free */ - cluster_index = offset >> s->cluster_bits; - for(i = 0; i < nb_clusters; i++) { - refcount = get_refcount(bs, cluster_index++); + assert(nb_clusters >= 0); + if (nb_clusters == 0) { + return 0; + } - if (refcount < 0) { - return refcount; - } else if (refcount != 0) { - break; + do { + /* Check how many clusters there are free */ + cluster_index = offset >> s->cluster_bits; + for(i = 0; i < nb_clusters; i++) { + refcount = get_refcount(bs, cluster_index++); + + if (refcount < 0) { + return refcount; + } else if (refcount != 0) { + break; + } } - } - /* And then allocate them */ - old_free_cluster_index = s->free_cluster_index; - s->free_cluster_index = cluster_index + i; + /* And then allocate them */ + ret = update_refcount(bs, offset, i << s->cluster_bits, 1, + QCOW2_DISCARD_NEVER); + } while (ret == -EAGAIN); - ret = update_refcount(bs, offset, i << s->cluster_bits, 1, - QCOW2_DISCARD_NEVER); if (ret < 0) { return ret; } - s->free_cluster_index = old_free_cluster_index; - return i; } @@ -722,7 +740,7 @@ int64_t 
qcow2_alloc_bytes(BlockDriverState *bs, int size) } redo: free_in_cluster = s->cluster_size - - (s->free_byte_offset & (s->cluster_size - 1)); + offset_into_cluster(s, s->free_byte_offset); if (size <= free_in_cluster) { /* enough space in current cluster */ offset = s->free_byte_offset; @@ -730,20 +748,20 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) free_in_cluster -= size; if (free_in_cluster == 0) s->free_byte_offset = 0; - if ((offset & (s->cluster_size - 1)) != 0) - update_cluster_refcount(bs, offset >> s->cluster_bits, 1, - QCOW2_DISCARD_NEVER); + if (offset_into_cluster(s, offset) != 0) + qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1, + QCOW2_DISCARD_NEVER); } else { offset = qcow2_alloc_clusters(bs, s->cluster_size); if (offset < 0) { return offset; } - cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1); + cluster_offset = start_of_cluster(s, s->free_byte_offset); if ((cluster_offset + s->cluster_size) == offset) { /* we are lucky: contiguous data */ offset = s->free_byte_offset; - update_cluster_refcount(bs, offset >> s->cluster_bits, 1, - QCOW2_DISCARD_NEVER); + qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1, + QCOW2_DISCARD_NEVER); s->free_byte_offset += size; } else { s->free_byte_offset = offset; @@ -752,8 +770,8 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) } /* The cluster refcount was incremented, either by qcow2_alloc_clusters() - * or explicitly by update_cluster_refcount(). Refcount blocks must be - * flushed before the caller's L2 table updates. + * or explicitly by qcow2_update_cluster_refcount(). Refcount blocks must + * be flushed before the caller's L2 table updates. */ qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache); return offset; @@ -794,11 +812,13 @@ void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, } break; case QCOW2_CLUSTER_NORMAL: - qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, - nb_clusters << s->cluster_bits, type); + case QCOW2_CLUSTER_ZERO: + if (l2_entry & L2E_OFFSET_MASK) { + qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, + nb_clusters << s->cluster_bits, type); + } break; case QCOW2_CLUSTER_UNALLOCATED: - case QCOW2_CLUSTER_ZERO: break; default: abort(); @@ -861,15 +881,17 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, } for(j = 0; j < s->l2_size; j++) { + uint64_t cluster_index; + offset = be64_to_cpu(l2_table[j]); - if (offset != 0) { - old_offset = offset; - offset &= ~QCOW_OFLAG_COPIED; - if (offset & QCOW_OFLAG_COMPRESSED) { + old_offset = offset; + offset &= ~QCOW_OFLAG_COPIED; + + switch (qcow2_get_cluster_type(offset)) { + case QCOW2_CLUSTER_COMPRESSED: nb_csectors = ((offset >> s->csize_shift) & s->csize_mask) + 1; if (addend != 0) { - int ret; ret = update_refcount(bs, (offset & s->cluster_offset_mask) & ~511, nb_csectors * 512, addend, @@ -880,11 +902,20 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, } /* compressed clusters are never modified */ refcount = 2; - } else { - uint64_t cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits; + break; + + case QCOW2_CLUSTER_NORMAL: + case QCOW2_CLUSTER_ZERO: + cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits; + if (!cluster_index) { + /* unallocated */ + refcount = 0; + break; + } if (addend != 0) { - refcount = update_cluster_refcount(bs, cluster_index, addend, - QCOW2_DISCARD_SNAPSHOT); + refcount = qcow2_update_cluster_refcount(bs, + cluster_index, addend, + QCOW2_DISCARD_SNAPSHOT); } else { refcount = 
get_refcount(bs, cluster_index); } @@ -893,19 +924,26 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, ret = refcount; goto fail; } - } + break; - if (refcount == 1) { - offset |= QCOW_OFLAG_COPIED; - } - if (offset != old_offset) { - if (addend > 0) { - qcow2_cache_set_dependency(bs, s->l2_table_cache, - s->refcount_block_cache); - } - l2_table[j] = cpu_to_be64(offset); - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + case QCOW2_CLUSTER_UNALLOCATED: + refcount = 0; + break; + + default: + abort(); + } + + if (refcount == 1) { + offset |= QCOW_OFLAG_COPIED; + } + if (offset != old_offset) { + if (addend > 0) { + qcow2_cache_set_dependency(bs, s->l2_table_cache, + s->refcount_block_cache); } + l2_table[j] = cpu_to_be64(offset); + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); } } @@ -916,8 +954,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, if (addend != 0) { - refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend, - QCOW2_DISCARD_SNAPSHOT); + refcount = qcow2_update_cluster_refcount(bs, l2_offset >> + s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT); } else { refcount = get_refcount(bs, l2_offset >> s->cluster_bits); } @@ -982,22 +1020,17 @@ static void inc_refcounts(BlockDriverState *bs, int64_t offset, int64_t size) { BDRVQcowState *s = bs->opaque; - int64_t start, last, cluster_offset; - int k; + uint64_t start, last, cluster_offset, k; if (size <= 0) return; - start = offset & ~(s->cluster_size - 1); - last = (offset + size - 1) & ~(s->cluster_size - 1); + start = start_of_cluster(s, offset); + last = start_of_cluster(s, offset + size - 1); for(cluster_offset = start; cluster_offset <= last; cluster_offset += s->cluster_size) { k = cluster_offset >> s->cluster_bits; - if (k < 0) { - fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n", - cluster_offset); - res->corruptions++; - } else if (k >= refcount_table_size) { + if (k >= refcount_table_size) { fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after " "the end of the image file, can't properly check refcounts.\n", cluster_offset); @@ -1014,7 +1047,6 @@ static void inc_refcounts(BlockDriverState *bs, /* Flags for check_refcounts_l1() and check_refcounts_l2() */ enum { - CHECK_OFLAG_COPIED = 0x1, /* check QCOW_OFLAG_COPIED matches refcount */ CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */ }; @@ -1033,7 +1065,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, BDRVQcowState *s = bs->opaque; uint64_t *l2_table, l2_entry; uint64_t next_contiguous_offset = 0; - int i, l2_size, nb_csectors, refcount; + int i, l2_size, nb_csectors; /* Read L2 table from disk */ l2_size = s->l2_size * sizeof(uint64_t); @@ -1085,23 +1117,8 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, case QCOW2_CLUSTER_NORMAL: { - /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ uint64_t offset = l2_entry & L2E_OFFSET_MASK; - if (flags & CHECK_OFLAG_COPIED) { - refcount = get_refcount(bs, offset >> s->cluster_bits); - if (refcount < 0) { - fprintf(stderr, "Can't get refcount for offset %" - PRIx64 ": %s\n", l2_entry, strerror(-refcount)); - goto fail; - } - if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) { - fprintf(stderr, "ERROR OFLAG_COPIED: offset=%" - PRIx64 " refcount=%d\n", l2_entry, refcount); - res->corruptions++; - } - } - if (flags & CHECK_FRAG_INFO) { res->bfi.allocated_clusters++; if (next_contiguous_offset && @@ -1116,7 +1133,7 @@ static int check_refcounts_l2(BlockDriverState 
*bs, BdrvCheckResult *res, offset, s->cluster_size); /* Correct offsets are cluster aligned */ - if (offset & (s->cluster_size - 1)) { + if (offset_into_cluster(s, offset)) { fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not " "properly aligned; L2 entry corrupted.\n", offset); res->corruptions++; @@ -1158,7 +1175,7 @@ static int check_refcounts_l1(BlockDriverState *bs, { BDRVQcowState *s = bs->opaque; uint64_t *l1_table, l2_offset, l1_size2; - int i, refcount, ret; + int i, ret; l1_size2 = l1_size * sizeof(uint64_t); @@ -1182,29 +1199,13 @@ static int check_refcounts_l1(BlockDriverState *bs, for(i = 0; i < l1_size; i++) { l2_offset = l1_table[i]; if (l2_offset) { - /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ - if (flags & CHECK_OFLAG_COPIED) { - refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) - >> s->cluster_bits); - if (refcount < 0) { - fprintf(stderr, "Can't get refcount for l2_offset %" - PRIx64 ": %s\n", l2_offset, strerror(-refcount)); - goto fail; - } - if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) { - fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64 - " refcount=%d\n", l2_offset, refcount); - res->corruptions++; - } - } - /* Mark L2 table as used */ l2_offset &= L1E_OFFSET_MASK; inc_refcounts(bs, res, refcount_table, refcount_table_size, l2_offset, s->cluster_size); /* L2 tables are cluster aligned */ - if (l2_offset & (s->cluster_size - 1)) { + if (offset_into_cluster(s, l2_offset)) { fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " "cluster aligned; L1 entry corrupted\n", l2_offset); res->corruptions++; @@ -1229,6 +1230,240 @@ fail: } /* + * Checks the OFLAG_COPIED flag for all L1 and L2 entries. + * + * This function does not print an error message nor does it increment + * check_errors if get_refcount fails (this is because such an error will have + * been already detected and sufficiently signaled by the calling function + * (qcow2_check_refcounts) by the time this function is called). + */ +static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size); + int ret; + int refcount; + int i, j; + + for (i = 0; i < s->l1_size; i++) { + uint64_t l1_entry = s->l1_table[i]; + uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK; + bool l2_dirty = false; + + if (!l2_offset) { + continue; + } + + refcount = get_refcount(bs, l2_offset >> s->cluster_bits); + if (refcount < 0) { + /* don't print message nor increment check_errors */ + continue; + } + if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) { + fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d " + "l1_entry=%" PRIx64 " refcount=%d\n", + fix & BDRV_FIX_ERRORS ? "Repairing" : + "ERROR", + i, l1_entry, refcount); + if (fix & BDRV_FIX_ERRORS) { + s->l1_table[i] = refcount == 1 + ? 
l1_entry | QCOW_OFLAG_COPIED + : l1_entry & ~QCOW_OFLAG_COPIED; + ret = qcow2_write_l1_entry(bs, i); + if (ret < 0) { + res->check_errors++; + goto fail; + } + res->corruptions_fixed++; + } else { + res->corruptions++; + } + } + + ret = bdrv_pread(bs->file, l2_offset, l2_table, + s->l2_size * sizeof(uint64_t)); + if (ret < 0) { + fprintf(stderr, "ERROR: Could not read L2 table: %s\n", + strerror(-ret)); + res->check_errors++; + goto fail; + } + + for (j = 0; j < s->l2_size; j++) { + uint64_t l2_entry = be64_to_cpu(l2_table[j]); + uint64_t data_offset = l2_entry & L2E_OFFSET_MASK; + int cluster_type = qcow2_get_cluster_type(l2_entry); + + if ((cluster_type == QCOW2_CLUSTER_NORMAL) || + ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) { + refcount = get_refcount(bs, data_offset >> s->cluster_bits); + if (refcount < 0) { + /* don't print message nor increment check_errors */ + continue; + } + if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) { + fprintf(stderr, "%s OFLAG_COPIED data cluster: " + "l2_entry=%" PRIx64 " refcount=%d\n", + fix & BDRV_FIX_ERRORS ? "Repairing" : + "ERROR", + l2_entry, refcount); + if (fix & BDRV_FIX_ERRORS) { + l2_table[j] = cpu_to_be64(refcount == 1 + ? l2_entry | QCOW_OFLAG_COPIED + : l2_entry & ~QCOW_OFLAG_COPIED); + l2_dirty = true; + res->corruptions_fixed++; + } else { + res->corruptions++; + } + } + } + } + + if (l2_dirty) { + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2, + l2_offset, s->cluster_size); + if (ret < 0) { + fprintf(stderr, "ERROR: Could not write L2 table; metadata " + "overlap check failed: %s\n", strerror(-ret)); + res->check_errors++; + goto fail; + } + + ret = bdrv_pwrite(bs->file, l2_offset, l2_table, s->cluster_size); + if (ret < 0) { + fprintf(stderr, "ERROR: Could not write L2 table: %s\n", + strerror(-ret)); + res->check_errors++; + goto fail; + } + } + } + + ret = 0; + +fail: + qemu_vfree(l2_table); + return ret; +} + +/* + * Writes one sector of the refcount table to the disk + */ +#define RT_ENTRIES_PER_SECTOR (512 / sizeof(uint64_t)) +static int write_reftable_entry(BlockDriverState *bs, int rt_index) +{ + BDRVQcowState *s = bs->opaque; + uint64_t buf[RT_ENTRIES_PER_SECTOR]; + int rt_start_index; + int i, ret; + + rt_start_index = rt_index & ~(RT_ENTRIES_PER_SECTOR - 1); + for (i = 0; i < RT_ENTRIES_PER_SECTOR; i++) { + buf[i] = cpu_to_be64(s->refcount_table[rt_start_index + i]); + } + + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_TABLE, + s->refcount_table_offset + rt_start_index * sizeof(uint64_t), + sizeof(buf)); + if (ret < 0) { + return ret; + } + + BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE); + ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset + + rt_start_index * sizeof(uint64_t), buf, sizeof(buf)); + if (ret < 0) { + return ret; + } + + return 0; +} + +/* + * Allocates a new cluster for the given refcount block (represented by its + * offset in the image file) and copies the current content there. This function + * does _not_ decrement the reference count for the currently occupied cluster. + * + * This function prints an informative message to stderr on error (and returns + * -errno); on success, the offset of the newly allocated cluster is returned. 
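+ *
+ * This is only called from the repair path of qcow2_check_refcounts()
+ * (e.g. "qemu-img check -r all"): when a refcount block turns out to be
+ * referenced more than once, it is moved to a freshly allocated cluster,
+ * leaving the old cluster to its remaining users.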
+ */ +static int64_t realloc_refcount_block(BlockDriverState *bs, int reftable_index, + uint64_t offset) +{ + BDRVQcowState *s = bs->opaque; + int64_t new_offset = 0; + void *refcount_block = NULL; + int ret; + + /* allocate new refcount block */ + new_offset = qcow2_alloc_clusters(bs, s->cluster_size); + if (new_offset < 0) { + fprintf(stderr, "Could not allocate new cluster: %s\n", + strerror(-new_offset)); + ret = new_offset; + goto done; + } + + /* fetch current refcount block content */ + ret = qcow2_cache_get(bs, s->refcount_block_cache, offset, &refcount_block); + if (ret < 0) { + fprintf(stderr, "Could not fetch refcount block: %s\n", strerror(-ret)); + goto fail_free_cluster; + } + + /* new block has not yet been entered into refcount table, therefore it is + * no refcount block yet (regarding this check) */ + ret = qcow2_pre_write_overlap_check(bs, 0, new_offset, s->cluster_size); + if (ret < 0) { + fprintf(stderr, "Could not write refcount block; metadata overlap " + "check failed: %s\n", strerror(-ret)); + /* the image will be marked corrupt, so don't even attempt on freeing + * the cluster */ + goto done; + } + + /* write to new block */ + ret = bdrv_write(bs->file, new_offset / BDRV_SECTOR_SIZE, refcount_block, + s->cluster_sectors); + if (ret < 0) { + fprintf(stderr, "Could not write refcount block: %s\n", strerror(-ret)); + goto fail_free_cluster; + } + + /* update refcount table */ + assert(!offset_into_cluster(s, new_offset)); + s->refcount_table[reftable_index] = new_offset; + ret = write_reftable_entry(bs, reftable_index); + if (ret < 0) { + fprintf(stderr, "Could not update refcount table: %s\n", + strerror(-ret)); + goto fail_free_cluster; + } + + goto done; + +fail_free_cluster: + qcow2_free_clusters(bs, new_offset, s->cluster_size, QCOW2_DISCARD_OTHER); + +done: + if (refcount_block) { + /* This should never fail, as it would only do so if the given refcount + * block cannot be found in the cache. As this is impossible as long as + * there are no bugs, assert the success. */ + int tmp = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); + assert(tmp == 0); + } + + if (ret < 0) { + return ret; + } + + return new_offset; +} + +/* * Checks an image for refcount consistency. 
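+ * The check recomputes all refcounts from scratch: a uint16_t shadow
+ * table with one entry per host cluster is filled via inc_refcounts()
+ * for every reachable L1/L2 table, data cluster, snapshot table and
+ * refcount structure, then compared against the refcounts actually
+ * stored in the image; check_oflag_copied() additionally verifies that
+ * each QCOW_OFLAG_COPIED flag matches a refcount of exactly 1.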
* * Returns 0 if no errors are found, the number of errors in case the image is @@ -1238,14 +1473,19 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) { BDRVQcowState *s = bs->opaque; - int64_t size, i, highest_cluster; - int nb_clusters, refcount1, refcount2; + int64_t size, i, highest_cluster, nb_clusters; + int refcount1, refcount2; QCowSnapshot *sn; uint16_t *refcount_table; int ret; size = bdrv_getlength(bs->file); nb_clusters = size_to_clusters(s, size); + if (nb_clusters > INT_MAX) { + res->check_errors++; + return -EFBIG; + } + refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t)); res->bfi.total_clusters = @@ -1257,8 +1497,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, /* current L1 table */ ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, - s->l1_table_offset, s->l1_size, - CHECK_OFLAG_COPIED | CHECK_FRAG_INFO); + s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO); if (ret < 0) { goto fail; } @@ -1286,7 +1525,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, cluster = offset >> s->cluster_bits; /* Refcount blocks are cluster aligned */ - if (offset & (s->cluster_size - 1)) { + if (offset_into_cluster(s, offset)) { fprintf(stderr, "ERROR refcount block %" PRId64 " is not " "cluster aligned; refcount table entry corrupted\n", i); res->corruptions++; @@ -1304,10 +1543,39 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, inc_refcounts(bs, res, refcount_table, nb_clusters, offset, s->cluster_size); if (refcount_table[cluster] != 1) { - fprintf(stderr, "ERROR refcount block %" PRId64 + fprintf(stderr, "%s refcount block %" PRId64 " refcount=%d\n", + fix & BDRV_FIX_ERRORS ? "Repairing" : + "ERROR", i, refcount_table[cluster]); - res->corruptions++; + + if (fix & BDRV_FIX_ERRORS) { + int64_t new_offset; + + new_offset = realloc_refcount_block(bs, i, offset); + if (new_offset < 0) { + res->corruptions++; + continue; + } + + /* update refcounts */ + if ((new_offset >> s->cluster_bits) >= nb_clusters) { + /* increase refcount_table size if necessary */ + int old_nb_clusters = nb_clusters; + nb_clusters = (new_offset >> s->cluster_bits) + 1; + refcount_table = g_realloc(refcount_table, + nb_clusters * sizeof(uint16_t)); + memset(&refcount_table[old_nb_clusters], 0, (nb_clusters + - old_nb_clusters) * sizeof(uint16_t)); + } + refcount_table[cluster]--; + inc_refcounts(bs, res, refcount_table, nb_clusters, + new_offset, s->cluster_size); + + res->corruptions_fixed++; + } else { + res->corruptions++; + } } } } @@ -1363,6 +1631,12 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, } } + /* check OFLAG_COPIED */ + ret = check_oflag_copied(bs, res, fix); + if (ret < 0) { + goto fail; + } + res->image_end_offset = (highest_cluster + 1) * s->cluster_size; ret = 0; @@ -1372,3 +1646,173 @@ fail: return ret; } +#define overlaps_with(ofs, sz) \ + ranges_overlap(offset, size, ofs, sz) + +/* + * Checks if the given offset into the image file is actually free to use by + * looking for overlaps with important metadata sections (L1/L2 tables etc.), + * i.e. a sanity check without relying on the refcount tables. + * + * The ign parameter specifies what checks not to perform (being a bitmask of + * QCow2MetadataOverlap values), i.e., what sections to ignore. 
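+ *
+ * For example, rewriting an inactive L2 table in place must ignore
+ * exactly the sections such a write is allowed to touch, which is what
+ * expand_zero_clusters_in_l1() does through the pre-write wrapper below:
+ *
+ *     ret = qcow2_pre_write_overlap_check(bs,
+ *         QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset,
+ *         s->cluster_size);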
+ * + * Returns: + * - 0 if writing to this offset will not affect the mentioned metadata + * - a positive QCow2MetadataOverlap value indicating one overlapping section + * - a negative value (-errno) indicating an error while performing a check, + * e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2 + */ +int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, + int64_t size) +{ + BDRVQcowState *s = bs->opaque; + int chk = s->overlap_check & ~ign; + int i, j; + + if (!size) { + return 0; + } + + if (chk & QCOW2_OL_MAIN_HEADER) { + if (offset < s->cluster_size) { + return QCOW2_OL_MAIN_HEADER; + } + } + + /* align range to test to cluster boundaries */ + size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size); + offset = start_of_cluster(s, offset); + + if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) { + if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) { + return QCOW2_OL_ACTIVE_L1; + } + } + + if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) { + if (overlaps_with(s->refcount_table_offset, + s->refcount_table_size * sizeof(uint64_t))) { + return QCOW2_OL_REFCOUNT_TABLE; + } + } + + if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) { + if (overlaps_with(s->snapshots_offset, s->snapshots_size)) { + return QCOW2_OL_SNAPSHOT_TABLE; + } + } + + if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) { + for (i = 0; i < s->nb_snapshots; i++) { + if (s->snapshots[i].l1_size && + overlaps_with(s->snapshots[i].l1_table_offset, + s->snapshots[i].l1_size * sizeof(uint64_t))) { + return QCOW2_OL_INACTIVE_L1; + } + } + } + + if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) { + for (i = 0; i < s->l1_size; i++) { + if ((s->l1_table[i] & L1E_OFFSET_MASK) && + overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK, + s->cluster_size)) { + return QCOW2_OL_ACTIVE_L2; + } + } + } + + if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) { + for (i = 0; i < s->refcount_table_size; i++) { + if ((s->refcount_table[i] & REFT_OFFSET_MASK) && + overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK, + s->cluster_size)) { + return QCOW2_OL_REFCOUNT_BLOCK; + } + } + } + + if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) { + for (i = 0; i < s->nb_snapshots; i++) { + uint64_t l1_ofs = s->snapshots[i].l1_table_offset; + uint32_t l1_sz = s->snapshots[i].l1_size; + uint64_t l1_sz2 = l1_sz * sizeof(uint64_t); + uint64_t *l1 = g_malloc(l1_sz2); + int ret; + + ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2); + if (ret < 0) { + g_free(l1); + return ret; + } + + for (j = 0; j < l1_sz; j++) { + uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK; + if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) { + g_free(l1); + return QCOW2_OL_INACTIVE_L2; + } + } + + g_free(l1); + } + } + + return 0; +} + +static const char *metadata_ol_names[] = { + [QCOW2_OL_MAIN_HEADER_BITNR] = "qcow2_header", + [QCOW2_OL_ACTIVE_L1_BITNR] = "active L1 table", + [QCOW2_OL_ACTIVE_L2_BITNR] = "active L2 table", + [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table", + [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block", + [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table", + [QCOW2_OL_INACTIVE_L1_BITNR] = "inactive L1 table", + [QCOW2_OL_INACTIVE_L2_BITNR] = "inactive L2 table", +}; + +/* + * First performs a check for metadata overlaps (through + * qcow2_check_metadata_overlap); if that fails with a negative value (error + * while performing a check), that value is returned. 
If an impending overlap + * is detected, the BDS will be made unusable, the qcow2 file marked corrupt + * and -EIO returned. + * + * Returns 0 if there were neither overlaps nor errors while checking for + * overlaps; or a negative value (-errno) on error. + */ +int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, + int64_t size) +{ + int ret = qcow2_check_metadata_overlap(bs, ign, offset, size); + + if (ret < 0) { + return ret; + } else if (ret > 0) { + int metadata_ol_bitnr = ffs(ret) - 1; + char *message; + QObject *data; + + assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR); + + fprintf(stderr, "qcow2: Preventing invalid write on metadata (overlaps " + "with %s); image marked as corrupt.\n", + metadata_ol_names[metadata_ol_bitnr]); + message = g_strdup_printf("Prevented %s overwrite", + metadata_ol_names[metadata_ol_bitnr]); + data = qobject_from_jsonf("{ 'device': %s, 'msg': %s, 'offset': %" + PRId64 ", 'size': %" PRId64 " }", bs->device_name, message, + offset, size); + monitor_protocol_event(QEVENT_BLOCK_IMAGE_CORRUPTED, data); + g_free(message); + qobject_decref(data); + + qcow2_mark_corrupt(bs); + bs->drv = NULL; /* make BDS unusable */ + return -EIO; + } + + return 0; +} diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 0caac9055..0aa9defbe 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -26,31 +26,6 @@ #include "block/block_int.h" #include "block/qcow2.h" -typedef struct QEMU_PACKED QCowSnapshotHeader { - /* header is 8 byte aligned */ - uint64_t l1_table_offset; - - uint32_t l1_size; - uint16_t id_str_size; - uint16_t name_size; - - uint32_t date_sec; - uint32_t date_nsec; - - uint64_t vm_clock_nsec; - - uint32_t vm_state_size; - uint32_t extra_data_size; /* for extension */ - /* extra data follows */ - /* id_str follows */ - /* name follows */ -} QCowSnapshotHeader; - -typedef struct QEMU_PACKED QCowSnapshotExtraData { - uint64_t vm_state_size_large; - uint64_t disk_size; -} QCowSnapshotExtraData; - void qcow2_free_snapshots(BlockDriverState *bs) { BDRVQcowState *s = bs->opaque; @@ -141,8 +116,14 @@ int qcow2_read_snapshots(BlockDriverState *bs) } offset += name_size; sn->name[name_size] = '\0'; + + if (offset - s->snapshots_offset > QCOW_MAX_SNAPSHOTS_SIZE) { + ret = -EFBIG; + goto fail; + } } + assert(offset - s->snapshots_offset <= INT_MAX); s->snapshots_size = offset - s->snapshots_offset; return 0; @@ -163,7 +144,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs) uint32_t nb_snapshots; uint64_t snapshots_offset; } QEMU_PACKED header_data; - int64_t offset, snapshots_offset; + int64_t offset, snapshots_offset = 0; int ret; /* compute the size of the snapshots */ @@ -175,20 +156,36 @@ static int qcow2_write_snapshots(BlockDriverState *bs) offset += sizeof(extra); offset += strlen(sn->id_str); offset += strlen(sn->name); + + if (offset > QCOW_MAX_SNAPSHOTS_SIZE) { + ret = -EFBIG; + goto fail; + } } + + assert(offset <= INT_MAX); snapshots_size = offset; /* Allocate space for the new snapshot list */ snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size); offset = snapshots_offset; if (offset < 0) { - return offset; + ret = offset; + goto fail; } ret = bdrv_flush(bs); if (ret < 0) { - return ret; + goto fail; } + /* The snapshot list position has not yet been updated, so these clusters + * must indeed be completely free */ + ret = qcow2_pre_write_overlap_check(bs, 0, offset, snapshots_size); + if (ret < 0) { + goto fail; + } + + /* Write all snapshots to the new list */ for(i = 0; i < s->nb_snapshots; 
i++) { sn = s->snapshots + i; @@ -211,6 +208,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs) id_str_size = strlen(sn->id_str); name_size = strlen(sn->name); + assert(id_str_size <= UINT16_MAX && name_size <= UINT16_MAX); h.id_str_size = cpu_to_be16(id_str_size); h.name_size = cpu_to_be16(name_size); offset = align_offset(offset, 8); @@ -269,6 +267,10 @@ static int qcow2_write_snapshots(BlockDriverState *bs) return 0; fail: + if (snapshots_offset > 0) { + qcow2_free_clusters(bs, snapshots_offset, snapshots_size, + QCOW2_DISCARD_ALWAYS); + } return ret; } @@ -277,7 +279,8 @@ static void find_new_snapshot_id(BlockDriverState *bs, { BDRVQcowState *s = bs->opaque; QCowSnapshot *sn; - int i, id, id_max = 0; + int i; + unsigned long id, id_max = 0; for(i = 0; i < s->nb_snapshots; i++) { sn = s->snapshots + i; @@ -285,34 +288,50 @@ static void find_new_snapshot_id(BlockDriverState *bs, if (id > id_max) id_max = id; } - snprintf(id_str, id_str_size, "%d", id_max + 1); + snprintf(id_str, id_str_size, "%lu", id_max + 1); } -static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str) +static int find_snapshot_by_id_and_name(BlockDriverState *bs, + const char *id, + const char *name) { BDRVQcowState *s = bs->opaque; int i; - for(i = 0; i < s->nb_snapshots; i++) { - if (!strcmp(s->snapshots[i].id_str, id_str)) - return i; + if (id && name) { + for (i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].id_str, id) && + !strcmp(s->snapshots[i].name, name)) { + return i; + } + } + } else if (id) { + for (i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].id_str, id)) { + return i; + } + } + } else if (name) { + for (i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].name, name)) { + return i; + } + } } + return -1; } -static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name) +static int find_snapshot_by_id_or_name(BlockDriverState *bs, + const char *id_or_name) { - BDRVQcowState *s = bs->opaque; - int i, ret; + int ret; - ret = find_snapshot_by_id(bs, name); - if (ret >= 0) + ret = find_snapshot_by_id_and_name(bs, id_or_name, NULL); + if (ret >= 0) { return ret; - for(i = 0; i < s->nb_snapshots; i++) { - if (!strcmp(s->snapshots[i].name, name)) - return i; } - return -1; + return find_snapshot_by_id_and_name(bs, NULL, id_or_name); } /* if no id is provided, a new one is constructed */ @@ -326,6 +345,10 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) uint64_t *l1_table = NULL; int64_t l1_table_offset; + if (s->nb_snapshots >= QCOW_MAX_SNAPSHOTS) { + return -EFBIG; + } + memset(sn, 0, sizeof(*sn)); /* Generate an ID if it wasn't passed */ @@ -334,7 +357,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) } /* Check that the ID is unique */ - if (find_snapshot_by_id(bs, sn_info->id_str) >= 0) { + if (find_snapshot_by_id_and_name(bs, sn_info->id_str, NULL) >= 0) { return -EEXIST; } @@ -363,6 +386,12 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) l1_table[i] = cpu_to_be64(s->l1_table[i]); } + ret = qcow2_pre_write_overlap_check(bs, 0, sn->l1_table_offset, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table, s->l1_size * sizeof(uint64_t)); if (ret < 0) { @@ -396,11 +425,19 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) if (ret < 0) { g_free(s->snapshots); s->snapshots = old_snapshot_list; + s->nb_snapshots--; goto fail; } 
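Editor's note: a standalone sketch of the consolidated lookup introduced above, collapsing its three loops into one predicate. The Snap type and the sample table are invented for the example.

    #include <stdio.h>
    #include <string.h>

    typedef struct { const char *id_str; const char *name; } Snap;

    /* Match by id and name, id only, or name only -- the same three cases
     * as find_snapshot_by_id_and_name(); returns the index or -1. */
    static int find_snapshot(const Snap *tab, int n, const char *id,
                             const char *name)
    {
        for (int i = 0; i < n; i++) {
            if (id && strcmp(tab[i].id_str, id) != 0) continue;
            if (name && strcmp(tab[i].name, name) != 0) continue;
            if (id || name) return i;   /* both NULL: never matches */
        }
        return -1;
    }

    int main(void)
    {
        Snap tab[] = { { "1", "before-upgrade" }, { "2", "nightly" } };
        printf("%d\n", find_snapshot(tab, 2, "2", NULL));       /* 1 */
        printf("%d\n", find_snapshot(tab, 2, NULL, "nightly")); /* 1 */
        printf("%d\n", find_snapshot(tab, 2, "1", "nightly"));  /* -1 */
        return 0;
    }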
g_free(old_snapshot_list); + /* The VM state isn't needed any more in the active L1 table; in fact, it + * hurts by causing expensive COW for the next snapshot. */ + qcow2_discard_clusters(bs, qcow2_vm_state_offset(s), + align_offset(sn->vm_state_size, s->cluster_size) + >> BDRV_SECTOR_BITS, + QCOW2_DISCARD_NEVER); + #ifdef DEBUG_ALLOC { BdrvCheckResult result = {0}; @@ -475,6 +512,12 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) goto fail; } + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1, + s->l1_table_offset, cur_l1_bytes); + if (ret < 0) { + goto fail; + } + ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table, cur_l1_bytes); if (ret < 0) { @@ -531,15 +574,19 @@ fail: return ret; } -int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +int qcow2_snapshot_delete(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp) { BDRVQcowState *s = bs->opaque; QCowSnapshot sn; int snapshot_index, ret; /* Search the snapshot */ - snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); + snapshot_index = find_snapshot_by_id_and_name(bs, snapshot_id, name); if (snapshot_index < 0) { + error_setg(errp, "Can't find the snapshot"); return -ENOENT; } sn = s->snapshots[snapshot_index]; @@ -551,6 +598,8 @@ int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) s->nb_snapshots--; ret = qcow2_write_snapshots(bs); if (ret < 0) { + error_setg_errno(errp, -ret, + "Failed to remove snapshot from snapshot list"); return ret; } @@ -568,6 +617,7 @@ int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) ret = qcow2_update_snapshot_refcount(bs, sn.l1_table_offset, sn.l1_size, -1); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to free the cluster and L1 table"); return ret; } qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t), @@ -576,6 +626,8 @@ int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) /* must update the copied flag on the current cluster offsets */ ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); if (ret < 0) { + error_setg_errno(errp, -ret, + "Failed to update snapshot status in disk"); return ret; } @@ -617,7 +669,10 @@ int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) return s->nb_snapshots; } -int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name) +int qcow2_snapshot_load_tmp(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp) { int i, snapshot_index; BDRVQcowState *s = bs->opaque; @@ -629,18 +684,25 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name) assert(bs->read_only); /* Search the snapshot */ - snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_name); + snapshot_index = find_snapshot_by_id_and_name(bs, snapshot_id, name); if (snapshot_index < 0) { + error_setg(errp, + "Can't find snapshot"); return -ENOENT; } sn = &s->snapshots[snapshot_index]; /* Allocate and read in the snapshot's L1 table */ - new_l1_bytes = s->l1_size * sizeof(uint64_t); + if (sn->l1_size > QCOW_MAX_L1_SIZE) { + error_setg(errp, "Snapshot L1 table too large"); + return -EFBIG; + } + new_l1_bytes = sn->l1_size * sizeof(uint64_t); new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512)); ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes); if (ret < 0) { + error_setg(errp, "Failed to read l1 table for snapshot"); g_free(new_l1_table); return ret; } diff --git 
a/block/qcow2.c b/block/qcow2.c index 3376901bd..e903d971c 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -52,7 +52,7 @@ typedef struct { uint32_t magic; uint32_t len; -} QCowExtension; +} QEMU_PACKED QCowExtension; #define QCOW2_EXT_MAGIC_END 0 #define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA @@ -79,7 +79,8 @@ static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) * return 0 upon success, non-0 otherwise */ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, - uint64_t end_offset, void **p_feature_table) + uint64_t end_offset, void **p_feature_table, + Error **errp) { BDRVQcowState *s = bs->opaque; QCowExtension ext; @@ -100,10 +101,10 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, printf("attempting to read extended header in offset %lu\n", offset); #endif - if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) { - fprintf(stderr, "qcow2_read_extension: ERROR: " - "pread fail from offset %" PRIu64 "\n", - offset); + ret = bdrv_pread(bs->file, offset, &ext, sizeof(ext)); + if (ret < 0) { + error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: " + "pread fail from offset %" PRIu64, offset); return 1; } be32_to_cpus(&ext.magic); @@ -113,7 +114,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, printf("ext.magic = 0x%x\n", ext.magic); #endif if (ext.len > end_offset - offset) { - error_report("Header extension too large"); + error_setg(errp, "Header extension too large"); return -EINVAL; } @@ -123,14 +124,16 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, case QCOW2_EXT_MAGIC_BACKING_FORMAT: if (ext.len >= sizeof(bs->backing_format)) { - fprintf(stderr, "ERROR: ext_backing_format: len=%u too large" - " (>=%zu)\n", - ext.len, sizeof(bs->backing_format)); + error_setg(errp, "ERROR: ext_backing_format: len=%u too large" + " (>=%zu)", ext.len, sizeof(bs->backing_format)); return 2; } - if (bdrv_pread(bs->file, offset , bs->backing_format, - ext.len) != ext.len) + ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len); + if (ret < 0) { + error_setg_errno(errp, -ret, "ERROR: ext_backing_format: " + "Could not read format name"); return 3; + } bs->backing_format[ext.len] = '\0'; #ifdef DEBUG_EXT printf("Qcow2: Got format extension %s\n", bs->backing_format); @@ -142,6 +145,8 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature)); ret = bdrv_pread(bs->file, offset , feature_table, ext.len); if (ret < 0) { + error_setg_errno(errp, -ret, "ERROR: ext_feature_table: " + "Could not read table"); return ret; } @@ -161,6 +166,8 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, ret = bdrv_pread(bs->file, offset , uext->data, uext->len); if (ret < 0) { + error_setg_errno(errp, -ret, "ERROR: unknown extension: " + "Could not read data"); return ret; } } @@ -184,8 +191,8 @@ static void cleanup_unknown_header_ext(BlockDriverState *bs) } } -static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs, - const char *fmt, ...) +static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs, + Error **errp, const char *fmt, ...) 
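Editor's note on the extension loop reworked in the hunk above: qcow2 header extensions are a sequence of big-endian (magic, length) pairs with data padded so each entry starts on an 8-byte boundary, terminated by magic 0. A self-contained parser over an in-memory buffer, as a sketch — the sample buffer is made up; the backing-format magic 0xE2792ACA is the one defined above.

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t be32(const uint8_t *p)
    {
        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] << 8) | p[3];
    }

    /* Walk header extensions: (be32 magic, be32 length), `length` data
     * bytes, padding to the next multiple of 8; magic 0 ends the list. */
    static void walk_extensions(const uint8_t *buf, size_t len)
    {
        size_t off = 0;
        while (off + 8 <= len) {
            uint32_t magic = be32(buf + off);
            uint32_t elen = be32(buf + off + 4);
            if (magic == 0) break;            /* QCOW2_EXT_MAGIC_END */
            if (elen > len - off - 8) break;  /* truncated entry */
            printf("extension 0x%08x, %u bytes\n", magic, elen);
            off += 8 + ((elen + 7) & ~(size_t)7);
        }
    }

    int main(void)
    {
        /* Backing-format extension carrying "raw", padded to 8 bytes,
         * followed by the end marker. */
        static const uint8_t buf[] = {
            0xE2, 0x79, 0x2A, 0xCA,  0, 0, 0, 3,
            'r', 'a', 'w', 0, 0, 0, 0, 0,
            0, 0, 0, 0,  0, 0, 0, 0,
        };
        walk_extensions(buf, sizeof(buf));
        return 0;
    }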
{ char msg[64]; va_list ap; @@ -194,17 +201,17 @@ static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs, vsnprintf(msg, sizeof(msg), fmt, ap); va_end(ap); - qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bs->device_name, "qcow2", msg); + error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, bs->device_name, "qcow2", + msg); } static void report_unsupported_feature(BlockDriverState *bs, - Qcow2Feature *table, uint64_t mask) + Error **errp, Qcow2Feature *table, uint64_t mask) { while (table && table->name[0] != '\0') { if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) { if (mask & (1 << table->bit)) { - report_unsupported(bs, "%.46s",table->name); + report_unsupported(bs, errp, "%.46s", table->name); mask &= ~(1 << table->bit); } } @@ -212,7 +219,8 @@ static void report_unsupported_feature(BlockDriverState *bs, } if (mask) { - report_unsupported(bs, "Unknown incompatible feature: %" PRIx64, mask); + report_unsupported(bs, errp, "Unknown incompatible feature: %" PRIx64, + mask); } } @@ -261,12 +269,46 @@ static int qcow2_mark_clean(BlockDriverState *bs) BDRVQcowState *s = bs->opaque; if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { + int ret; + + s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY; + + ret = bdrv_flush(bs); + if (ret < 0) { + return ret; + } + + return qcow2_update_header(bs); + } + return 0; +} + +/* + * Marks the image as corrupt. + */ +int qcow2_mark_corrupt(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + + s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT; + return qcow2_update_header(bs); +} + +/* + * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes + * before if necessary. + */ +int qcow2_mark_consistent(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + + if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) { int ret = bdrv_flush(bs); if (ret < 0) { return ret; } - s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY; + s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT; return qcow2_update_header(bs); } return 0; @@ -281,11 +323,41 @@ static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, } if (fix && result->check_errors == 0 && result->corruptions == 0) { - return qcow2_mark_clean(bs); + ret = qcow2_mark_clean(bs); + if (ret < 0) { + return ret; + } + return qcow2_mark_consistent(bs); } return ret; } +static int validate_table_offset(BlockDriverState *bs, uint64_t offset, + uint64_t entries, size_t entry_len) +{ + BDRVQcowState *s = bs->opaque; + uint64_t size; + + /* Use signed INT64_MAX as the maximum even for uint64_t header fields, + * because values will be passed to qemu functions taking int64_t. 
*/ + if (entries > INT64_MAX / entry_len) { + return -EINVAL; + } + + size = entries * entry_len; + + if (INT64_MAX - size < offset) { + return -EINVAL; + } + + /* Tables must be cluster aligned */ + if (offset & (s->cluster_size - 1)) { + return -EINVAL; + } + + return 0; +} + static QemuOptsList qcow2_runtime_opts = { .name = "qcow2", .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head), @@ -311,22 +383,84 @@ static QemuOptsList qcow2_runtime_opts = { .type = QEMU_OPT_BOOL, .help = "Generate discard requests when other clusters are freed", }, + { + .name = QCOW2_OPT_OVERLAP, + .type = QEMU_OPT_STRING, + .help = "Selects which overlap checks to perform from a range of " + "templates (none, constant, cached, all)", + }, + { + .name = QCOW2_OPT_OVERLAP_MAIN_HEADER, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into the main qcow2 header", + }, + { + .name = QCOW2_OPT_OVERLAP_ACTIVE_L1, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into the active L1 table", + }, + { + .name = QCOW2_OPT_OVERLAP_ACTIVE_L2, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into an active L2 table", + }, + { + .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into the refcount table", + }, + { + .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into a refcount block", + }, + { + .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into the snapshot table", + }, + { + .name = QCOW2_OPT_OVERLAP_INACTIVE_L1, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into an inactive L1 table", + }, + { + .name = QCOW2_OPT_OVERLAP_INACTIVE_L2, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into an inactive L2 table", + }, { /* end of list */ } }, }; -static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) +static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = { + [QCOW2_OL_MAIN_HEADER_BITNR] = QCOW2_OPT_OVERLAP_MAIN_HEADER, + [QCOW2_OL_ACTIVE_L1_BITNR] = QCOW2_OPT_OVERLAP_ACTIVE_L1, + [QCOW2_OL_ACTIVE_L2_BITNR] = QCOW2_OPT_OVERLAP_ACTIVE_L2, + [QCOW2_OL_REFCOUNT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE, + [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK, + [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE, + [QCOW2_OL_INACTIVE_L1_BITNR] = QCOW2_OPT_OVERLAP_INACTIVE_L1, + [QCOW2_OL_INACTIVE_L2_BITNR] = QCOW2_OPT_OVERLAP_INACTIVE_L2, +}; + +static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVQcowState *s = bs->opaque; - int len, i, ret = 0; + unsigned int len, i; + int ret = 0; QCowHeader header; QemuOpts *opts; Error *local_err = NULL; uint64_t ext_end; uint64_t l1_vm_state_index; + const char *opt_overlap_check; + int overlap_check_template = 0; ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read qcow2 header"); goto fail; } be32_to_cpus(&header.magic); @@ -344,17 +478,30 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) be32_to_cpus(&header.nb_snapshots); if (header.magic != QCOW_MAGIC) { - ret = -EMEDIUMTYPE; + error_setg(errp, "Image is not in qcow2 format"); + ret = -EINVAL; goto fail; } if (header.version < 2 || header.version > 3) { - report_unsupported(bs, "QCOW version %d", header.version); + report_unsupported(bs, errp, "QCOW version %d", header.version); ret = -ENOTSUP; 
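Editor's note: the overflow discipline in validate_table_offset() above generalizes; here is a standalone version with the cluster size passed as a parameter instead of read from BDRVQcowState (the values in main() are arbitrary).

    #include <stdint.h>
    #include <stdio.h>

    /* Reject entries * entry_len overflowing INT64_MAX, offset + size
     * overflowing, and offsets that are not cluster aligned -- the same
     * three conditions, in the same order, as validate_table_offset(). */
    static int validate_table(uint64_t offset, uint64_t entries,
                              size_t entry_len, uint64_t cluster_size)
    {
        if (entries > INT64_MAX / entry_len) return -1;  /* mul overflow */
        uint64_t size = entries * entry_len;
        if (INT64_MAX - size < offset) return -1;        /* add overflow */
        if (offset & (cluster_size - 1)) return -1;      /* misaligned */
        return 0;
    }

    int main(void)
    {
        printf("%d\n", validate_table(65536, 1 << 20, 8, 65536));    /* 0 */
        printf("%d\n", validate_table(65537, 1 << 20, 8, 65536));    /* -1 */
        printf("%d\n", validate_table(0, UINT64_MAX / 4, 8, 65536)); /* -1 */
        return 0;
    }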
goto fail; } s->qcow_version = header.version; + /* Initialise cluster size */ + if (header.cluster_bits < MIN_CLUSTER_BITS || + header.cluster_bits > MAX_CLUSTER_BITS) { + error_setg(errp, "Unsupported cluster size: 2^%i", header.cluster_bits); + ret = -EINVAL; + goto fail; + } + + s->cluster_bits = header.cluster_bits; + s->cluster_size = 1 << s->cluster_bits; + s->cluster_sectors = 1 << (s->cluster_bits - 9); + /* Initialise version 3 header fields */ if (header.version == 2) { header.incompatible_features = 0; @@ -368,6 +515,18 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) be64_to_cpus(&header.autoclear_features); be32_to_cpus(&header.refcount_order); be32_to_cpus(&header.header_length); + + if (header.header_length < 104) { + error_setg(errp, "qcow2 header too short"); + ret = -EINVAL; + goto fail; + } + } + + if (header.header_length > s->cluster_size) { + error_setg(errp, "qcow2 header exceeds cluster size"); + ret = -EINVAL; + goto fail; } if (header.header_length > sizeof(header)) { @@ -376,10 +535,18 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields, s->unknown_header_fields_size); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read unknown qcow2 header " + "fields"); goto fail; } } + if (header.backing_file_offset > s->cluster_size) { + error_setg(errp, "Invalid backing file offset"); + ret = -EINVAL; + goto fail; + } + if (header.backing_file_offset) { ext_end = header.backing_file_offset; } else { @@ -394,28 +561,38 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) { void *feature_table = NULL; qcow2_read_extensions(bs, header.header_length, ext_end, - &feature_table); - report_unsupported_feature(bs, feature_table, + &feature_table, NULL); + report_unsupported_feature(bs, errp, feature_table, s->incompatible_features & ~QCOW2_INCOMPAT_MASK); ret = -ENOTSUP; + g_free(feature_table); goto fail; } + if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) { + /* Corrupt images may not be written to unless they are being repaired + */ + if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) { + error_setg(errp, "qcow2: Image is corrupt; cannot be opened " + "read/write"); + ret = -EACCES; + goto fail; + } + } + /* Check support for various header values */ if (header.refcount_order != 4) { - report_unsupported(bs, "%d bit reference counts", + report_unsupported(bs, errp, "%d bit reference counts", 1 << header.refcount_order); ret = -ENOTSUP; goto fail; } + s->refcount_order = header.refcount_order; - if (header.cluster_bits < MIN_CLUSTER_BITS || - header.cluster_bits > MAX_CLUSTER_BITS) { - ret = -EINVAL; - goto fail; - } if (header.crypt_method > QCOW_CRYPT_AES) { + error_setg(errp, "Unsupported encryption method: %i", + header.crypt_method); ret = -EINVAL; goto fail; } @@ -423,27 +600,57 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) if (s->crypt_method_header) { bs->encrypted = 1; } - s->cluster_bits = header.cluster_bits; - s->cluster_size = 1 << s->cluster_bits; - s->cluster_sectors = 1 << (s->cluster_bits - 9); + s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */ s->l2_size = 1 << s->l2_bits; bs->total_sectors = header.size / 512; s->csize_shift = (62 - (s->cluster_bits - 8)); s->csize_mask = (1 << (s->cluster_bits - 8)) - 1; s->cluster_offset_mask = (1LL << s->csize_shift) - 1; + s->refcount_table_offset = 
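Editor's note: the hunk above moves the cluster-geometry computation ahead of every field that depends on it; the derived values are pure arithmetic on cluster_bits. A quick sketch, assuming the usual 9..21 range for MIN_CLUSTER_BITS/MAX_CLUSTER_BITS (those constants' values are not shown in this diff).

    #include <stdio.h>

    int main(void)
    {
        for (int cluster_bits = 9; cluster_bits <= 21; cluster_bits += 6) {
            int cluster_size = 1 << cluster_bits;
            int cluster_sectors = 1 << (cluster_bits - 9); /* 512 B sectors */
            int l2_bits = cluster_bits - 3;  /* one cluster of 8-byte entries */
            printf("2^%d: cluster %d bytes, %d sectors, L2 covers %lld bytes\n",
                   cluster_bits, cluster_size, cluster_sectors,
                   (long long)(1 << l2_bits) * cluster_size);
        }
        return 0;
    }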
header.refcount_table_offset; s->refcount_table_size = header.refcount_table_clusters << (s->cluster_bits - 3); - s->snapshots_offset = header.snapshots_offset; - s->nb_snapshots = header.nb_snapshots; + if (header.refcount_table_clusters > qcow2_max_refcount_clusters(s)) { + error_setg(errp, "Reference count table too large"); + ret = -EINVAL; + goto fail; + } + + ret = validate_table_offset(bs, s->refcount_table_offset, + s->refcount_table_size, sizeof(uint64_t)); + if (ret < 0) { + error_setg(errp, "Invalid reference count table offset"); + goto fail; + } + + /* Snapshot table offset/length */ + if (header.nb_snapshots > QCOW_MAX_SNAPSHOTS) { + error_setg(errp, "Too many snapshots"); + ret = -EINVAL; + goto fail; + } + + ret = validate_table_offset(bs, header.snapshots_offset, + header.nb_snapshots, + sizeof(QCowSnapshotHeader)); + if (ret < 0) { + error_setg(errp, "Invalid snapshot table offset"); + goto fail; + } /* read the level 1 table */ + if (header.l1_size > QCOW_MAX_L1_SIZE) { + error_setg(errp, "Active L1 table too large"); + ret = -EFBIG; + goto fail; + } s->l1_size = header.l1_size; l1_vm_state_index = size_to_l1(s, header.size); if (l1_vm_state_index > INT_MAX) { + error_setg(errp, "Image is too big"); ret = -EFBIG; goto fail; } @@ -452,16 +659,27 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) /* the L1 table must contain at least enough entries to put header.size bytes */ if (s->l1_size < s->l1_vm_state_index) { + error_setg(errp, "L1 table is too small"); ret = -EINVAL; goto fail; } + + ret = validate_table_offset(bs, header.l1_table_offset, + header.l1_size, sizeof(uint64_t)); + if (ret < 0) { + error_setg(errp, "Invalid L1 table offset"); + goto fail; + } s->l1_table_offset = header.l1_table_offset; + + if (s->l1_size > 0) { s->l1_table = g_malloc0( align_offset(s->l1_size * sizeof(uint64_t), 512)); ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read L1 table"); goto fail; } for(i = 0;i < s->l1_size; i++) { @@ -482,6 +700,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) ret = qcow2_refcount_init(bs); if (ret != 0) { + error_setg_errno(errp, -ret, "Could not initialize refcount handling"); goto fail; } @@ -489,7 +708,9 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) QTAILQ_INIT(&s->discards); /* read qcow2 extensions */ - if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) { + if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL, + &local_err)) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } @@ -497,27 +718,36 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) /* read the backing file name */ if (header.backing_file_offset != 0) { len = header.backing_file_size; - if (len > 1023) { - len = 1023; + if (len > MIN(1023, s->cluster_size - header.backing_file_offset)) { + error_setg(errp, "Backing file name too long"); + ret = -EINVAL; + goto fail; } ret = bdrv_pread(bs->file, header.backing_file_offset, bs->backing_file, len); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read backing file name"); goto fail; } bs->backing_file[len] = '\0'; } + /* Internal snapshots */ + s->snapshots_offset = header.snapshots_offset; + s->nb_snapshots = header.nb_snapshots; + ret = qcow2_read_snapshots(bs); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read snapshots"); goto fail; } /* Clear unknown autoclear feature bits 
*/ - if (!bs->read_only && s->autoclear_features != 0) { + if (!bs->read_only && !(flags & BDRV_O_INCOMING) && s->autoclear_features) { s->autoclear_features = 0; ret = qcow2_update_header(bs); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not update qcow2 header"); goto fail; } } @@ -526,22 +756,22 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) qemu_co_mutex_init(&s->lock); /* Repair image if dirty */ - if (!(flags & BDRV_O_CHECK) && !bs->read_only && + if (!(flags & (BDRV_O_CHECK | BDRV_O_INCOMING)) && !bs->read_only && (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) { BdrvCheckResult result = {0}; ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not repair dirty image"); goto fail; } } /* Enable lazy_refcounts according to image and command line options */ - opts = qemu_opts_create_nofail(&qcow2_runtime_opts); + opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } @@ -559,11 +789,38 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) s->discard_passthrough[QCOW2_DISCARD_OTHER] = qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false); + opt_overlap_check = qemu_opt_get(opts, "overlap-check") ?: "cached"; + if (!strcmp(opt_overlap_check, "none")) { + overlap_check_template = 0; + } else if (!strcmp(opt_overlap_check, "constant")) { + overlap_check_template = QCOW2_OL_CONSTANT; + } else if (!strcmp(opt_overlap_check, "cached")) { + overlap_check_template = QCOW2_OL_CACHED; + } else if (!strcmp(opt_overlap_check, "all")) { + overlap_check_template = QCOW2_OL_ALL; + } else { + error_setg(errp, "Unsupported value '%s' for qcow2 option " + "'overlap-check'. 
Allowed are either of the following: " + "none, constant, cached, all", opt_overlap_check); + qemu_opts_del(opts); + ret = -EINVAL; + goto fail; + } + + s->overlap_check = 0; + for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) { + /* overlap-check defines a template bitmask, but every flag may be + * overwritten through the associated boolean option */ + s->overlap_check |= + qemu_opt_get_bool(opts, overlap_bool_option_names[i], + overlap_check_template & (1 << i)) << i; + } + qemu_opts_del(opts); if (s->use_lazy_refcounts && s->qcow_version < 3) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, "Lazy refcounts require " - "a qcow2 image with at least qemu 1.1 compatibility level"); + error_setg(errp, "Lazy refcounts require a qcow2 image with at least " + "qemu 1.1 compatibility level"); ret = -EINVAL; goto fail; } @@ -582,14 +839,28 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) qcow2_free_snapshots(bs); qcow2_refcount_close(bs); g_free(s->l1_table); + /* else pre-write overlap checks in cache_destroy may crash */ + s->l1_table = NULL; if (s->l2_table_cache) { qcow2_cache_destroy(bs, s->l2_table_cache); } + if (s->refcount_block_cache) { + qcow2_cache_destroy(bs, s->refcount_block_cache); + } g_free(s->cluster_cache); qemu_vfree(s->cluster_data); return ret; } +static int qcow2_refresh_limits(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + + bs->bl.write_zeroes_alignment = s->cluster_sectors; + + return 0; +} + static int qcow2_set_key(BlockDriverState *bs, const char *key) { BDRVQcowState *s = bs->opaque; @@ -632,32 +903,56 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key) return 0; } -/* We have nothing to do for QCOW2 reopen, stubs just return - * success */ +/* We have no actual commit/abort logic for qcow2, but we need to write out any + * unwritten data if we reopen read-only. 
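Editor's note: how the overlap-check template composes with the per-check booleans parsed just above — a sketch with qemu_opt_get_bool() replaced by a plain tri-state array. The 0x7f value for the cached tier follows from the bit numbers in the qcow2.h hunk further down (bits 0 through 6).

    #include <stdio.h>

    #define NBITS 8  /* QCOW2_OL_MAX_BITNR in the qcow2.h hunk below */

    /* override[i]: -1 = option not given (keep the template bit),
     * 0/1 = explicit per-check boolean. */
    static int build_overlap_mask(int tmpl, const int *override)
    {
        int mask = 0;
        for (int i = 0; i < NBITS; i++) {
            int bit = override[i] >= 0 ? override[i] : !!(tmpl & (1 << i));
            mask |= bit << i;
        }
        return mask;
    }

    int main(void)
    {
        int cached = 0x7f;  /* QCOW2_OL_CACHED: bits 0-6 */
        int override[NBITS] = { -1, -1, -1, -1, -1, -1, -1, 1 };
        /* "overlap-check=cached" plus "overlap-check.inactive-l2=on" */
        printf("0x%02x\n", build_overlap_mask(cached, override));  /* 0xff */
        return 0;
    }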
*/ static int qcow2_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue, Error **errp) { + int ret; + + if ((state->flags & BDRV_O_RDWR) == 0) { + ret = bdrv_flush(state->bs); + if (ret < 0) { + return ret; + } + + ret = qcow2_mark_clean(state->bs); + if (ret < 0) { + return ret; + } + } + return 0; } -static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { BDRVQcowState *s = bs->opaque; uint64_t cluster_offset; - int ret; + int index_in_cluster, ret; + int64_t status = 0; *pnum = nb_sectors; - /* FIXME We can get errors here, but the bdrv_co_is_allocated interface - * can't pass them on today */ qemu_co_mutex_lock(&s->lock); ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset); qemu_co_mutex_unlock(&s->lock); if (ret < 0) { - *pnum = 0; + return ret; } - return (cluster_offset != 0) || (ret == QCOW2_CLUSTER_ZERO); + if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED && + !s->crypt_method) { + index_in_cluster = sector_num & (s->cluster_sectors - 1); + cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); + status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset; + } + if (ret == QCOW2_CLUSTER_ZERO) { + status |= BDRV_BLOCK_ZERO; + } else if (ret != QCOW2_CLUSTER_UNALLOCATED) { + status |= BDRV_BLOCK_DATA; + } + return status; } /* handle reading after the end of the backing file */ @@ -821,7 +1116,6 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, { BDRVQcowState *s = bs->opaque; int index_in_cluster; - int n_end; int ret; int cur_nr_sectors; /* number of sectors in current iteration */ uint64_t cluster_offset; @@ -845,14 +1139,16 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, trace_qcow2_writev_start_part(qemu_coroutine_self()); index_in_cluster = sector_num & (s->cluster_sectors - 1); - n_end = index_in_cluster + remaining_sectors; + cur_nr_sectors = remaining_sectors; if (s->crypt_method && - n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) { - n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; + cur_nr_sectors > + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster) { + cur_nr_sectors = + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster; } ret = qcow2_alloc_cluster_offset(bs, sector_num << 9, - index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta); + &cur_nr_sectors, &cluster_offset, &l2meta); if (ret < 0) { goto fail; } @@ -881,6 +1177,13 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, cur_nr_sectors * 512); } + ret = qcow2_pre_write_overlap_check(bs, 0, + cluster_offset + index_in_cluster * BDRV_SECTOR_SIZE, + cur_nr_sectors * BDRV_SECTOR_SIZE); + if (ret < 0) { + goto fail; + } + qemu_co_mutex_unlock(&s->lock); BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); trace_qcow2_writev_data(qemu_coroutine_self(), @@ -947,11 +1250,15 @@ static void qcow2_close(BlockDriverState *bs) { BDRVQcowState *s = bs->opaque; g_free(s->l1_table); + /* else pre-write overlap checks in cache_destroy may crash */ + s->l1_table = NULL; - qcow2_cache_flush(bs, s->l2_table_cache); - qcow2_cache_flush(bs, s->refcount_block_cache); + if (!(bs->open_flags & BDRV_O_INCOMING)) { + qcow2_cache_flush(bs, s->l2_table_cache); + qcow2_cache_flush(bs, s->refcount_block_cache); - qcow2_mark_clean(bs); + qcow2_mark_clean(bs); + } qcow2_cache_destroy(bs, s->l2_table_cache); qcow2_cache_destroy(bs, s->refcount_block_cache); @@ -965,7 +1272,7 @@ static void 
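Editor's note: the flag mapping in qcow2_co_get_block_status() above, reduced to a table-style sketch. The enum values here are illustrative, not QEMU's BDRV_BLOCK_* constants, and the real function additionally ORs the host cluster offset into the returned status.

    #include <stdio.h>
    #include <stdint.h>

    enum { BLOCK_DATA = 1, BLOCK_ZERO = 2, BLOCK_OFFSET_VALID = 4 };
    enum { CL_UNALLOCATED, CL_ZERO, CL_NORMAL, CL_COMPRESSED };

    /* A host offset is only reported for plain, unencrypted clusters;
     * zero clusters report ZERO; everything but unallocated reports DATA. */
    static int64_t status_for(int type, uint64_t host_offset, int encrypted)
    {
        int64_t status = 0;
        if (host_offset && type != CL_COMPRESSED && !encrypted) {
            status |= BLOCK_OFFSET_VALID;
        }
        if (type == CL_ZERO) {
            status |= BLOCK_ZERO;
        } else if (type != CL_UNALLOCATED) {
            status |= BLOCK_DATA;
        }
        return status;
    }

    int main(void)
    {
        printf("normal:  %lld\n", (long long)status_for(CL_NORMAL, 65536, 0));
        printf("zero:    %lld\n", (long long)status_for(CL_ZERO, 0, 0));
        printf("unalloc: %lld\n", (long long)status_for(CL_UNALLOCATED, 0, 0));
        return 0;
    }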
qcow2_close(BlockDriverState *bs) qcow2_free_snapshots(bs); } -static void qcow2_invalidate_cache(BlockDriverState *bs) +static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) { BDRVQcowState *s = bs->opaque; int flags = s->flags; @@ -973,6 +1280,8 @@ static void qcow2_invalidate_cache(BlockDriverState *bs) AES_KEY aes_decrypt_key; uint32_t crypt_method = 0; QDict *options; + Error *local_err = NULL; + int ret; /* * Backing files are read-only which makes all of their metadata immutable, @@ -987,12 +1296,25 @@ static void qcow2_invalidate_cache(BlockDriverState *bs) qcow2_close(bs); - options = qdict_new(); - qdict_put(options, QCOW2_OPT_LAZY_REFCOUNTS, - qbool_from_int(s->use_lazy_refcounts)); + bdrv_invalidate_cache(bs->file, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } memset(s, 0, sizeof(BDRVQcowState)); - qcow2_open(bs, options, flags); + options = qdict_clone_shallow(bs->options); + + ret = qcow2_open(bs, options, flags, &local_err); + if (local_err) { + error_setg(errp, "Could not reopen qcow2 layer: %s", + error_get_pretty(local_err)); + error_free(local_err); + return; + } else if (ret < 0) { + error_setg_errno(errp, -ret, "Could not reopen qcow2 layer"); + return; + } QDECREF(options); @@ -1076,7 +1398,7 @@ int qcow2_update_header(BlockDriverState *bs) .incompatible_features = cpu_to_be64(s->incompatible_features), .compatible_features = cpu_to_be64(s->compatible_features), .autoclear_features = cpu_to_be64(s->autoclear_features), - .refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT), + .refcount_order = cpu_to_be32(s->refcount_order), .header_length = cpu_to_be32(header_length), }; @@ -1130,6 +1452,11 @@ int qcow2_update_header(BlockDriverState *bs) .name = "dirty bit", }, { + .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, + .bit = QCOW2_INCOMPAT_CORRUPT_BITNR, + .name = "corrupt bit", + }, + { .type = QCOW2_FEAT_TYPE_COMPATIBLE, .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, .name = "lazy refcounts", @@ -1210,34 +1537,39 @@ static int preallocate(BlockDriverState *bs) int ret; QCowL2Meta *meta; - nb_sectors = bdrv_getlength(bs) >> 9; + nb_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS; offset = 0; while (nb_sectors) { - num = MIN(nb_sectors, INT_MAX >> 9); - ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, + num = MIN(nb_sectors, INT_MAX >> BDRV_SECTOR_BITS); + ret = qcow2_alloc_cluster_offset(bs, offset, &num, &host_offset, &meta); if (ret < 0) { return ret; } - ret = qcow2_alloc_cluster_link_l2(bs, meta); - if (ret < 0) { - qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters, - QCOW2_DISCARD_NEVER); - return ret; - } + while (meta) { + QCowL2Meta *next = meta->next; + + ret = qcow2_alloc_cluster_link_l2(bs, meta); + if (ret < 0) { + qcow2_free_any_clusters(bs, meta->alloc_offset, + meta->nb_clusters, QCOW2_DISCARD_NEVER); + return ret; + } - /* There are no dependent requests, but we need to remove our request - * from the list of in-flight requests */ - if (meta != NULL) { + /* There are no dependent requests, but we need to remove our + * request from the list of in-flight requests */ QLIST_REMOVE(meta, next_in_flight); + + g_free(meta); + meta = next; } /* TODO Preallocate data if requested */ nb_sectors -= num; - offset += num << 9; + offset += num << BDRV_SECTOR_BITS; } /* @@ -1246,9 +1578,10 @@ static int preallocate(BlockDriverState *bs) * EOF). Extend the image to the last allocated sector. 
*/ if (host_offset != 0) { - uint8_t buf[512]; - memset(buf, 0, 512); - ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1); + uint8_t buf[BDRV_SECTOR_SIZE]; + memset(buf, 0, BDRV_SECTOR_SIZE); + ret = bdrv_write(bs->file, (host_offset >> BDRV_SECTOR_BITS) + num - 1, + buf, 1); if (ret < 0) { return ret; } @@ -1260,7 +1593,8 @@ static int preallocate(BlockDriverState *bs) static int qcow2_create2(const char *filename, int64_t total_size, const char *backing_file, const char *backing_format, int flags, size_t cluster_size, int prealloc, - QEMUOptionParameter *options, int version) + QEMUOptionParameter *options, int version, + Error **errp) { /* Calculate cluster_bits */ int cluster_bits; @@ -1268,9 +1602,8 @@ static int qcow2_create2(const char *filename, int64_t total_size, if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || (1 << cluster_bits) != cluster_size) { - error_report( - "Cluster size must be a power of two between %d and %dk", - 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10)); + error_setg(errp, "Cluster size must be a power of two between %d and " + "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10)); return -EINVAL; } @@ -1287,59 +1620,72 @@ static int qcow2_create2(const char *filename, int64_t total_size, * size for any qcow2 image. */ BlockDriverState* bs; - QCowHeader header; - uint8_t* refcount_table; + QCowHeader *header; + uint64_t* refcount_table; + Error *local_err = NULL; int ret; - ret = bdrv_create_file(filename, options); + ret = bdrv_create_file(filename, options, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } - ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR); + bs = NULL; + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + NULL, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } /* Write the header */ - memset(&header, 0, sizeof(header)); - header.magic = cpu_to_be32(QCOW_MAGIC); - header.version = cpu_to_be32(version); - header.cluster_bits = cpu_to_be32(cluster_bits); - header.size = cpu_to_be64(0); - header.l1_table_offset = cpu_to_be64(0); - header.l1_size = cpu_to_be32(0); - header.refcount_table_offset = cpu_to_be64(cluster_size); - header.refcount_table_clusters = cpu_to_be32(1); - header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT); - header.header_length = cpu_to_be32(sizeof(header)); + QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header)); + header = g_malloc0(cluster_size); + *header = (QCowHeader) { + .magic = cpu_to_be32(QCOW_MAGIC), + .version = cpu_to_be32(version), + .cluster_bits = cpu_to_be32(cluster_bits), + .size = cpu_to_be64(0), + .l1_table_offset = cpu_to_be64(0), + .l1_size = cpu_to_be32(0), + .refcount_table_offset = cpu_to_be64(cluster_size), + .refcount_table_clusters = cpu_to_be32(1), + .refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT), + .header_length = cpu_to_be32(sizeof(*header)), + }; if (flags & BLOCK_FLAG_ENCRYPT) { - header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); + header->crypt_method = cpu_to_be32(QCOW_CRYPT_AES); } else { - header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); + header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); } if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) { - header.compatible_features |= + header->compatible_features |= cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); } - ret = bdrv_pwrite(bs, 0, &header, sizeof(header)); + ret = bdrv_pwrite(bs, 0, header, cluster_size); + g_free(header); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write qcow2 
header"); goto out; } - /* Write an empty refcount table */ - refcount_table = g_malloc0(cluster_size); - ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size); + /* Write a refcount table with one refcount block */ + refcount_table = g_malloc0(2 * cluster_size); + refcount_table[0] = cpu_to_be64(2 * cluster_size); + ret = bdrv_pwrite(bs, cluster_size, refcount_table, 2 * cluster_size); g_free(refcount_table); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write refcount table"); goto out; } - bdrv_close(bs); + bdrv_unref(bs); + bs = NULL; /* * And now open the image and make it consistent first (i.e. increase the @@ -1348,14 +1694,17 @@ static int qcow2_create2(const char *filename, int64_t total_size, */ BlockDriver* drv = bdrv_find_format("qcow2"); assert(drv != NULL); - ret = bdrv_open(bs, filename, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv); + ret = bdrv_open(&bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv, &local_err); if (ret < 0) { + error_propagate(errp, local_err); goto out; } - ret = qcow2_alloc_clusters(bs, 2 * cluster_size); + ret = qcow2_alloc_clusters(bs, 3 * cluster_size); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 " + "header and refcount table"); goto out; } else if (ret != 0) { @@ -1366,6 +1715,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, /* Okay, now that we have a valid image, let's give it the right size */ ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not resize image"); goto out; } @@ -1373,6 +1723,8 @@ static int qcow2_create2(const char *filename, int64_t total_size, if (backing_file) { ret = bdrv_change_backing_file(bs, backing_file, backing_format); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not assign backing file '%s' " + "with format '%s'", backing_file, backing_format); goto out; } } @@ -1384,17 +1736,33 @@ static int qcow2_create2(const char *filename, int64_t total_size, ret = preallocate(bs); qemu_co_mutex_unlock(&s->lock); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not preallocate metadata"); goto out; } } + bdrv_unref(bs); + bs = NULL; + + /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning */ + ret = bdrv_open(&bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_BACKING, + drv, &local_err); + if (local_err) { + error_propagate(errp, local_err); + goto out; + } + ret = 0; out: - bdrv_delete(bs); + if (bs) { + bdrv_unref(bs); + } return ret; } -static int qcow2_create(const char *filename, QEMUOptionParameter *options) +static int qcow2_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { const char *backing_file = NULL; const char *backing_fmt = NULL; @@ -1402,7 +1770,9 @@ static int qcow2_create(const char *filename, QEMUOptionParameter *options) int flags = 0; size_t cluster_size = DEFAULT_CLUSTER_SIZE; int prealloc = 0; - int version = 2; + int version = 3; + Error *local_err = NULL; + int ret; /* Read out options */ while (options && options->name) { @@ -1424,18 +1794,20 @@ static int qcow2_create(const char *filename, QEMUOptionParameter *options) } else if (!strcmp(options->value.s, "metadata")) { prealloc = 1; } else { - fprintf(stderr, "Invalid preallocation mode: '%s'\n", - options->value.s); + error_setg(errp, "Invalid preallocation mode: '%s'", + options->value.s); return -EINVAL; } } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) { - if 
(!options->value.s || !strcmp(options->value.s, "0.10")) { + if (!options->value.s) { + /* keep the default */ + } else if (!strcmp(options->value.s, "0.10")) { version = 2; } else if (!strcmp(options->value.s, "1.1")) { version = 3; } else { - fprintf(stderr, "Invalid compatibility level: '%s'\n", - options->value.s); + error_setg(errp, "Invalid compatibility level: '%s'", + options->value.s); return -EINVAL; } } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) { @@ -1445,43 +1817,27 @@ static int qcow2_create(const char *filename, QEMUOptionParameter *options) } if (backing_file && prealloc) { - fprintf(stderr, "Backing file and preallocation cannot be used at " - "the same time\n"); + error_setg(errp, "Backing file and preallocation cannot be used at " + "the same time"); return -EINVAL; } if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) { - fprintf(stderr, "Lazy refcounts only supported with compatibility " - "level 1.1 and above (use compat=1.1 or greater)\n"); + error_setg(errp, "Lazy refcounts only supported with compatibility " + "level 1.1 and above (use compat=1.1 or greater)"); return -EINVAL; } - return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags, - cluster_size, prealloc, options, version); -} - -static int qcow2_make_empty(BlockDriverState *bs) -{ -#if 0 - /* XXX: not correct */ - BDRVQcowState *s = bs->opaque; - uint32_t l1_length = s->l1_size * sizeof(uint64_t); - int ret; - - memset(s->l1_table, 0, l1_length); - if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0) - return -1; - ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length); - if (ret < 0) - return ret; - - l2_cache_reset(bs); -#endif - return 0; + ret = qcow2_create2(filename, sectors, backing_file, backing_fmt, flags, + cluster_size, prealloc, options, version, &local_err); + if (local_err) { + error_propagate(errp, local_err); + } + return ret; } static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) { int ret; BDRVQcowState *s = bs->opaque; @@ -1508,7 +1864,7 @@ static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, qemu_co_mutex_lock(&s->lock); ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors); + nb_sectors, QCOW2_DISCARD_REQUEST); qemu_co_mutex_unlock(&s->lock); return ret; } @@ -1631,6 +1987,12 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, goto fail; } cluster_offset &= s->cluster_offset_mask; + + ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); if (ret < 0) { @@ -1668,19 +2030,43 @@ static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) return 0; } -static int64_t qcow2_vm_state_offset(BDRVQcowState *s) -{ - return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); -} - static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { BDRVQcowState *s = bs->opaque; + bdi->unallocated_blocks_are_zero = true; + bdi->can_write_zeroes_with_unmap = (s->qcow_version >= 3); bdi->cluster_size = s->cluster_size; bdi->vm_state_offset = qcow2_vm_state_offset(s); return 0; } +static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + ImageInfoSpecific *spec_info = g_new(ImageInfoSpecific, 1); + + *spec_info = 
(ImageInfoSpecific){ + .kind = IMAGE_INFO_SPECIFIC_KIND_QCOW2, + { + .qcow2 = g_new(ImageInfoSpecificQCow2, 1), + }, + }; + if (s->qcow_version == 2) { + *spec_info->qcow2 = (ImageInfoSpecificQCow2){ + .compat = g_strdup("0.10"), + }; + } else if (s->qcow_version == 3) { + *spec_info->qcow2 = (ImageInfoSpecificQCow2){ + .compat = g_strdup("1.1"), + .lazy_refcounts = s->compatible_features & + QCOW2_COMPAT_LAZY_REFCOUNTS, + .has_lazy_refcounts = true, + }; + } + + return spec_info; +} + #if 0 static void dump_refcounts(BlockDriverState *bs) { @@ -1706,13 +2092,22 @@ static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) { BDRVQcowState *s = bs->opaque; + int64_t total_sectors = bs->total_sectors; int growable = bs->growable; + bool zero_beyond_eof = bs->zero_beyond_eof; int ret; BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); bs->growable = 1; + bs->zero_beyond_eof = false; ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov); bs->growable = growable; + bs->zero_beyond_eof = zero_beyond_eof; + + /* bdrv_co_do_writev will have increased the total_sectors value to include + * the VM state - the VM state is however not an actual part of the block + * device, therefore, we need to restore the old value. */ + bs->total_sectors = total_sectors; return ret; } @@ -1722,16 +2117,212 @@ static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf, { BDRVQcowState *s = bs->opaque; int growable = bs->growable; + bool zero_beyond_eof = bs->zero_beyond_eof; int ret; BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); bs->growable = 1; + bs->zero_beyond_eof = false; ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size); bs->growable = growable; + bs->zero_beyond_eof = zero_beyond_eof; return ret; } +/* + * Downgrades an image's version. To achieve this, any incompatible features + * have to be removed. 
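Editor's note: the compat string reported by the new specific-info hook above is a direct mapping from the on-disk version number to the value users pass at creation time:

    #include <stdio.h>

    /* qcow_version -> "compat" string, as in qcow2_get_specific_info(). */
    static const char *compat_str(int qcow_version)
    {
        return qcow_version == 2 ? "0.10"
             : qcow_version == 3 ? "1.1"
             : "unknown";
    }

    int main(void)
    {
        printf("%s %s\n", compat_str(2), compat_str(3));  /* 0.10 1.1 */
        return 0;
    }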
+ */ +static int qcow2_downgrade(BlockDriverState *bs, int target_version) +{ + BDRVQcowState *s = bs->opaque; + int current_version = s->qcow_version; + int ret; + + if (target_version == current_version) { + return 0; + } else if (target_version > current_version) { + return -EINVAL; + } else if (target_version != 2) { + return -EINVAL; + } + + if (s->refcount_order != 4) { + /* we would have to convert the image to a refcount_order == 4 image + * here; however, since qemu (at the time of writing this) does not + * support anything different than 4 anyway, there is no point in doing + * so right now; however, we should error out (if qemu supports this in + * the future and this code has not been adapted) */ + error_report("qcow2_downgrade: Image refcount orders other than 4 are " + "currently not supported."); + return -ENOTSUP; + } + + /* clear incompatible features */ + if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { + ret = qcow2_mark_clean(bs); + if (ret < 0) { + return ret; + } + } + + /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in + * the first place; if that happens nonetheless, returning -ENOTSUP is the + * best thing to do anyway */ + + if (s->incompatible_features) { + return -ENOTSUP; + } + + /* since we can ignore compatible features, we can set them to 0 as well */ + s->compatible_features = 0; + /* if lazy refcounts have been used, they have already been fixed through + * clearing the dirty flag */ + + /* clearing autoclear features is trivial */ + s->autoclear_features = 0; + + ret = qcow2_expand_zero_clusters(bs); + if (ret < 0) { + return ret; + } + + s->qcow_version = target_version; + ret = qcow2_update_header(bs); + if (ret < 0) { + s->qcow_version = current_version; + return ret; + } + return 0; +} + +static int qcow2_amend_options(BlockDriverState *bs, + QEMUOptionParameter *options) +{ + BDRVQcowState *s = bs->opaque; + int old_version = s->qcow_version, new_version = old_version; + uint64_t new_size = 0; + const char *backing_file = NULL, *backing_format = NULL; + bool lazy_refcounts = s->use_lazy_refcounts; + int ret; + int i; + + for (i = 0; options[i].name; i++) + { + if (!options[i].assigned) { + /* only change explicitly defined options */ + continue; + } + + if (!strcmp(options[i].name, "compat")) { + if (!options[i].value.s) { + /* preserve default */ + } else if (!strcmp(options[i].value.s, "0.10")) { + new_version = 2; + } else if (!strcmp(options[i].value.s, "1.1")) { + new_version = 3; + } else { + fprintf(stderr, "Unknown compatibility level %s.\n", + options[i].value.s); + return -EINVAL; + } + } else if (!strcmp(options[i].name, "preallocation")) { + fprintf(stderr, "Cannot change preallocation mode.\n"); + return -ENOTSUP; + } else if (!strcmp(options[i].name, "size")) { + new_size = options[i].value.n; + } else if (!strcmp(options[i].name, "backing_file")) { + backing_file = options[i].value.s; + } else if (!strcmp(options[i].name, "backing_fmt")) { + backing_format = options[i].value.s; + } else if (!strcmp(options[i].name, "encryption")) { + if ((options[i].value.n != !!s->crypt_method)) { + fprintf(stderr, "Changing the encryption flag is not " + "supported.\n"); + return -ENOTSUP; + } + } else if (!strcmp(options[i].name, "cluster_size")) { + if (options[i].value.n != s->cluster_size) { + fprintf(stderr, "Changing the cluster size is not " + "supported.\n"); + return -ENOTSUP; + } + } else if (!strcmp(options[i].name, "lazy_refcounts")) { + lazy_refcounts = options[i].value.n; + } else { + /* if this 
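Editor's note: the early-outs at the top of qcow2_downgrade() above reduce to a small decision table; a sketch (-EINVAL prints as -22 on Linux):

    #include <errno.h>
    #include <stdio.h>

    /* 0 = nothing to do, -EINVAL = upgrade request or unsupported target,
     * 1 = proceed -- matching the function's early-out logic. */
    static int check_downgrade(int current, int target)
    {
        if (target == current) return 0;
        if (target > current)  return -EINVAL;  /* not a downgrade */
        if (target != 2)       return -EINVAL;  /* only v3 -> v2 supported */
        return 1;
    }

    int main(void)
    {
        printf("%d %d %d\n", check_downgrade(3, 3), check_downgrade(2, 3),
               check_downgrade(3, 2));  /* 0 -22 1 */
        return 0;
    }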
assertion fails, this probably means a new option was + * added without having it covered here */ + assert(false); + } + } + + if (new_version != old_version) { + if (new_version > old_version) { + /* Upgrade */ + s->qcow_version = new_version; + ret = qcow2_update_header(bs); + if (ret < 0) { + s->qcow_version = old_version; + return ret; + } + } else { + ret = qcow2_downgrade(bs, new_version); + if (ret < 0) { + return ret; + } + } + } + + if (backing_file || backing_format) { + ret = qcow2_change_backing_file(bs, backing_file ?: bs->backing_file, + backing_format ?: bs->backing_format); + if (ret < 0) { + return ret; + } + } + + if (s->use_lazy_refcounts != lazy_refcounts) { + if (lazy_refcounts) { + if (s->qcow_version < 3) { + fprintf(stderr, "Lazy refcounts only supported with compatibility " + "level 1.1 and above (use compat=1.1 or greater)\n"); + return -EINVAL; + } + s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS; + ret = qcow2_update_header(bs); + if (ret < 0) { + s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS; + return ret; + } + s->use_lazy_refcounts = true; + } else { + /* make image clean first */ + ret = qcow2_mark_clean(bs); + if (ret < 0) { + return ret; + } + /* now disallow lazy refcounts */ + s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS; + ret = qcow2_update_header(bs); + if (ret < 0) { + s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS; + return ret; + } + s->use_lazy_refcounts = false; + } + } + + if (new_size) { + ret = bdrv_truncate(bs, new_size); + if (ret < 0) { + return ret; + } + } + + return 0; +} + static QEMUOptionParameter qcow2_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -1786,9 +2377,8 @@ static BlockDriver bdrv_qcow2 = { .bdrv_reopen_prepare = qcow2_reopen_prepare, .bdrv_create = qcow2_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_is_allocated = qcow2_co_is_allocated, + .bdrv_co_get_block_status = qcow2_co_get_block_status, .bdrv_set_key = qcow2_set_key, - .bdrv_make_empty = qcow2_make_empty, .bdrv_co_readv = qcow2_co_readv, .bdrv_co_writev = qcow2_co_writev, @@ -1805,16 +2395,19 @@ static BlockDriver bdrv_qcow2 = { .bdrv_snapshot_list = qcow2_snapshot_list, .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp, .bdrv_get_info = qcow2_get_info, + .bdrv_get_specific_info = qcow2_get_specific_info, .bdrv_save_vmstate = qcow2_save_vmstate, .bdrv_load_vmstate = qcow2_load_vmstate, .bdrv_change_backing_file = qcow2_change_backing_file, + .bdrv_refresh_limits = qcow2_refresh_limits, .bdrv_invalidate_cache = qcow2_invalidate_cache, .create_options = qcow2_create_options, .bdrv_check = qcow2_check, + .bdrv_amend_options = qcow2_amend_options, }; static void bdrv_qcow2_init(void) diff --git a/block/qcow2.h b/block/qcow2.h index dba977141..b49424b85 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -38,13 +38,26 @@ #define QCOW_CRYPT_AES 1 #define QCOW_MAX_CRYPT_CLUSTERS 32 +#define QCOW_MAX_SNAPSHOTS 65536 + +/* 8 MB refcount table is enough for 2 PB images at 64k cluster size + * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */ +#define QCOW_MAX_REFTABLE_SIZE 0x800000 + +/* 32 MB L1 table is enough for 2 PB images at 64k cluster size + * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */ +#define QCOW_MAX_L1_SIZE 0x2000000 + +/* Allow for an average of 1k per snapshot table entry, should be plenty of + * space for snapshot names and IDs */ +#define QCOW_MAX_SNAPSHOTS_SIZE (1024 * QCOW_MAX_SNAPSHOTS) /* indicate that the refcount of the referenced cluster is exactly one. 
*/ -#define QCOW_OFLAG_COPIED (1LL << 63) +#define QCOW_OFLAG_COPIED (1ULL << 63) /* indicate that the cluster is compressed (they never have the copied flag) */ -#define QCOW_OFLAG_COMPRESSED (1LL << 62) +#define QCOW_OFLAG_COMPRESSED (1ULL << 62) /* The cluster reads as all zeros */ -#define QCOW_OFLAG_ZERO (1LL << 0) +#define QCOW_OFLAG_ZERO (1ULL << 0) #define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */ @@ -63,6 +76,15 @@ #define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request" #define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot" #define QCOW2_OPT_DISCARD_OTHER "pass-discard-other" +#define QCOW2_OPT_OVERLAP "overlap-check" +#define QCOW2_OPT_OVERLAP_MAIN_HEADER "overlap-check.main-header" +#define QCOW2_OPT_OVERLAP_ACTIVE_L1 "overlap-check.active-l1" +#define QCOW2_OPT_OVERLAP_ACTIVE_L2 "overlap-check.active-l2" +#define QCOW2_OPT_OVERLAP_REFCOUNT_TABLE "overlap-check.refcount-table" +#define QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK "overlap-check.refcount-block" +#define QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE "overlap-check.snapshot-table" +#define QCOW2_OPT_OVERLAP_INACTIVE_L1 "overlap-check.inactive-l1" +#define QCOW2_OPT_OVERLAP_INACTIVE_L2 "overlap-check.inactive-l2" typedef struct QCowHeader { uint32_t magic; @@ -86,7 +108,33 @@ typedef struct QCowHeader { uint32_t refcount_order; uint32_t header_length; -} QCowHeader; +} QEMU_PACKED QCowHeader; + +typedef struct QEMU_PACKED QCowSnapshotHeader { + /* header is 8 byte aligned */ + uint64_t l1_table_offset; + + uint32_t l1_size; + uint16_t id_str_size; + uint16_t name_size; + + uint32_t date_sec; + uint32_t date_nsec; + + uint64_t vm_clock_nsec; + + uint32_t vm_state_size; + uint32_t extra_data_size; /* for extension */ + /* extra data follows */ + /* id_str follows */ + /* name follows */ +} QCowSnapshotHeader; + +typedef struct QEMU_PACKED QCowSnapshotExtraData { + uint64_t vm_state_size_large; + uint64_t disk_size; +} QCowSnapshotExtraData; + typedef struct QCowSnapshot { uint64_t l1_table_offset; @@ -119,9 +167,12 @@ enum { /* Incompatible feature bits */ enum { QCOW2_INCOMPAT_DIRTY_BITNR = 0, + QCOW2_INCOMPAT_CORRUPT_BITNR = 1, QCOW2_INCOMPAT_DIRTY = 1 << QCOW2_INCOMPAT_DIRTY_BITNR, + QCOW2_INCOMPAT_CORRUPT = 1 << QCOW2_INCOMPAT_CORRUPT_BITNR, - QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY, + QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY + | QCOW2_INCOMPAT_CORRUPT, }; /* Compatible feature bits */ @@ -179,8 +230,8 @@ typedef struct BDRVQcowState { uint64_t *refcount_table; uint64_t refcount_table_offset; uint32_t refcount_table_size; - int64_t free_cluster_index; - int64_t free_byte_offset; + uint64_t free_cluster_index; + uint64_t free_byte_offset; CoMutex lock; @@ -190,15 +241,18 @@ typedef struct BDRVQcowState { AES_KEY aes_decrypt_key; uint64_t snapshots_offset; int snapshots_size; - int nb_snapshots; + unsigned int nb_snapshots; QCowSnapshot *snapshots; int flags; int qcow_version; bool use_lazy_refcounts; + int refcount_order; bool discard_passthrough[QCOW2_DISCARD_MAX]; + int overlap_check; /* bitmask of Qcow2MetadataOverlap values */ + uint64_t incompatible_features; uint64_t compatible_features; uint64_t autoclear_features; @@ -286,11 +340,50 @@ enum { QCOW2_CLUSTER_ZERO }; -#define L1E_OFFSET_MASK 0x00ffffffffffff00ULL -#define L2E_OFFSET_MASK 0x00ffffffffffff00ULL +typedef enum QCow2MetadataOverlap { + QCOW2_OL_MAIN_HEADER_BITNR = 0, + QCOW2_OL_ACTIVE_L1_BITNR = 1, + QCOW2_OL_ACTIVE_L2_BITNR = 2, + QCOW2_OL_REFCOUNT_TABLE_BITNR = 3, + QCOW2_OL_REFCOUNT_BLOCK_BITNR = 4, + QCOW2_OL_SNAPSHOT_TABLE_BITNR = 5, + 
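Editor's note: the coverage figures in the new limit comments check out arithmetically — a QCOW_MAX_REFTABLE_SIZE table holds size/8 entries, each naming a refcount block of cluster_size/2 two-byte refcounts, each refcount covering one cluster:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        const uint64_t reftable_size = 0x800000;  /* QCOW_MAX_REFTABLE_SIZE */
        const uint64_t cluster_sizes[] = { 512, 65536, 2 * 1024 * 1024 };

        for (int i = 0; i < 3; i++) {
            uint64_t cs = cluster_sizes[i];
            uint64_t entries = reftable_size / 8;  /* 8-byte table entries */
            uint64_t refs_per_block = cs / 2;      /* 16-bit refcounts */
            uint64_t image_bytes = entries * refs_per_block * cs;
            /* prints 128, 2097152 (2 PiB), 2147483648 (2 EiB) GiB --
             * the 128 GB / 2 PB / 2 EB figures in the comment above */
            printf("%7llu-byte clusters -> %llu GiB\n",
                   (unsigned long long)cs,
                   (unsigned long long)(image_bytes >> 30));
        }
        return 0;
    }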
QCOW2_OL_INACTIVE_L1_BITNR = 6, + QCOW2_OL_INACTIVE_L2_BITNR = 7, + + QCOW2_OL_MAX_BITNR = 8, + + QCOW2_OL_NONE = 0, + QCOW2_OL_MAIN_HEADER = (1 << QCOW2_OL_MAIN_HEADER_BITNR), + QCOW2_OL_ACTIVE_L1 = (1 << QCOW2_OL_ACTIVE_L1_BITNR), + QCOW2_OL_ACTIVE_L2 = (1 << QCOW2_OL_ACTIVE_L2_BITNR), + QCOW2_OL_REFCOUNT_TABLE = (1 << QCOW2_OL_REFCOUNT_TABLE_BITNR), + QCOW2_OL_REFCOUNT_BLOCK = (1 << QCOW2_OL_REFCOUNT_BLOCK_BITNR), + QCOW2_OL_SNAPSHOT_TABLE = (1 << QCOW2_OL_SNAPSHOT_TABLE_BITNR), + QCOW2_OL_INACTIVE_L1 = (1 << QCOW2_OL_INACTIVE_L1_BITNR), + /* NOTE: Checking overlaps with inactive L2 tables will result in bdrv + * reads. */ + QCOW2_OL_INACTIVE_L2 = (1 << QCOW2_OL_INACTIVE_L2_BITNR), +} QCow2MetadataOverlap; + +/* Perform all overlap checks which can be done in constant time */ +#define QCOW2_OL_CONSTANT \ + (QCOW2_OL_MAIN_HEADER | QCOW2_OL_ACTIVE_L1 | QCOW2_OL_REFCOUNT_TABLE | \ + QCOW2_OL_SNAPSHOT_TABLE) + +/* Perform all overlap checks which don't require disk access */ +#define QCOW2_OL_CACHED \ + (QCOW2_OL_CONSTANT | QCOW2_OL_ACTIVE_L2 | QCOW2_OL_REFCOUNT_BLOCK | \ + QCOW2_OL_INACTIVE_L1) + +/* Perform all overlap checks */ +#define QCOW2_OL_ALL \ + (QCOW2_OL_CACHED | QCOW2_OL_INACTIVE_L2) + +#define L1E_OFFSET_MASK 0x00fffffffffffe00ULL +#define L2E_OFFSET_MASK 0x00fffffffffffe00ULL #define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL -#define REFT_OFFSET_MASK 0xffffffffffffff00ULL +#define REFT_OFFSET_MASK 0xfffffffffffffe00ULL static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset) { @@ -324,6 +417,16 @@ static inline int64_t align_offset(int64_t offset, int n) return offset; } +static inline int64_t qcow2_vm_state_offset(BDRVQcowState *s) +{ + return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); +} + +static inline uint64_t qcow2_max_refcount_clusters(BDRVQcowState *s) +{ + return QCOW_MAX_REFTABLE_SIZE >> s->cluster_bits; +} + static inline int qcow2_get_cluster_type(uint64_t l2_entry) { if (l2_entry & QCOW_OFLAG_COMPRESSED) { @@ -361,13 +464,18 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, int64_t sector_num, int nb_sectors); int qcow2_mark_dirty(BlockDriverState *bs); +int qcow2_mark_corrupt(BlockDriverState *bs); +int qcow2_mark_consistent(BlockDriverState *bs); int qcow2_update_header(BlockDriverState *bs); /* qcow2-refcount.c functions */ int qcow2_refcount_init(BlockDriverState *bs); void qcow2_refcount_close(BlockDriverState *bs); -int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size); +int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index, + int addend, enum qcow2_discard_type type); + +int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size); int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, int nb_clusters); int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size); @@ -385,9 +493,15 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, void qcow2_process_discards(BlockDriverState *bs, int ret); +int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, + int64_t size); +int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, + int64_t size); + /* qcow2-cluster.c functions */ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, bool exact_size); +int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index); void qcow2_l2_cache_reset(BlockDriverState *bs); int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); void 
qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, @@ -398,22 +512,30 @@ void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, int *num, uint64_t *cluster_offset); int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, - int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m); + int *num, uint64_t *host_offset, QCowL2Meta **m); uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset, int compressed_size); int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m); int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, - int nb_sectors); + int nb_sectors, enum qcow2_discard_type type); int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors); +int qcow2_expand_zero_clusters(BlockDriverState *bs); + /* qcow2-snapshot.c functions */ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info); int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id); -int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id); +int qcow2_snapshot_delete(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp); int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab); -int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name); +int qcow2_snapshot_load_tmp(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp); void qcow2_free_snapshots(BlockDriverState *bs); int qcow2_read_snapshots(BlockDriverState *bs); @@ -428,6 +550,8 @@ int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, Qcow2Cache *dependency); void qcow2_cache_depends_on_flush(Qcow2Cache *c); +int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c); + int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, void **table); int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, diff --git a/block/qed.c b/block/qed.c index f767b0528..3bd9db9c8 100644 --- a/block/qed.c +++ b/block/qed.c @@ -353,10 +353,10 @@ static void qed_start_need_check_timer(BDRVQEDState *s) { trace_qed_start_need_check_timer(s); - /* Use vm_clock so we don't alter the image file while suspended for + /* Use QEMU_CLOCK_VIRTUAL so we don't alter the image file while suspended for * migration. 
*/ - qemu_mod_timer(s->need_check_timer, qemu_get_clock_ns(vm_clock) + + timer_mod(s->need_check_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + get_ticks_per_sec() * QED_NEED_CHECK_TIMEOUT); } @@ -364,7 +364,7 @@ static void qed_start_need_check_timer(BDRVQEDState *s) static void qed_cancel_need_check_timer(BDRVQEDState *s) { trace_qed_cancel_need_check_timer(s); - qemu_del_timer(s->need_check_timer); + timer_del(s->need_check_timer); } static void bdrv_qed_rebind(BlockDriverState *bs) @@ -373,7 +373,8 @@ static void bdrv_qed_rebind(BlockDriverState *bs) s->bs = bs; } -static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags) +static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVQEDState *s = bs->opaque; QEDHeader le_header; @@ -390,14 +391,15 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags) qed_header_le_to_cpu(&le_header, &s->header); if (s->header.magic != QED_MAGIC) { - return -EMEDIUMTYPE; + error_setg(errp, "Image not in QED format"); + return -EINVAL; } if (s->header.features & ~QED_FEATURE_MASK) { /* image uses unsupported feature bits */ char buf[64]; snprintf(buf, sizeof(buf), "%" PRIx64, s->header.features & ~QED_FEATURE_MASK); - qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, bs->device_name, "QED", buf); return -ENOTSUP; } @@ -494,7 +496,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags) } } - s->need_check_timer = qemu_new_timer_ns(vm_clock, + s->need_check_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, qed_need_check_timer_cb, s); out: @@ -505,6 +507,15 @@ out: return ret; } +static int bdrv_qed_refresh_limits(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + + bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS; + + return 0; +} + /* We have nothing to do for QED reopen, stubs just return * success */ static int bdrv_qed_reopen_prepare(BDRVReopenState *state, @@ -518,7 +529,7 @@ static void bdrv_qed_close(BlockDriverState *bs) BDRVQEDState *s = bs->opaque; qed_cancel_need_check_timer(s); - qemu_free_timer(s->need_check_timer); + timer_free(s->need_check_timer); /* Ensure writes reach stable storage */ bdrv_flush(bs->file); @@ -535,7 +546,8 @@ static void bdrv_qed_close(BlockDriverState *bs) static int qed_create(const char *filename, uint32_t cluster_size, uint64_t image_size, uint32_t table_size, - const char *backing_file, const char *backing_fmt) + const char *backing_file, const char *backing_fmt, + Error **errp) { QEDHeader header = { .magic = QED_MAGIC, @@ -550,16 +562,22 @@ static int qed_create(const char *filename, uint32_t cluster_size, QEDHeader le_header; uint8_t *l1_table = NULL; size_t l1_size = header.cluster_size * header.table_size; + Error *local_err = NULL; int ret = 0; - BlockDriverState *bs = NULL; + BlockDriverState *bs; - ret = bdrv_create_file(filename, NULL); + ret = bdrv_create_file(filename, NULL, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } - ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB); + bs = NULL; + ret = bdrv_open(&bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, NULL, + &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } @@ -599,11 +617,12 @@ static int qed_create(const char *filename, uint32_t cluster_size, ret = 0; /* success */ out: g_free(l1_table); - bdrv_delete(bs); + bdrv_unref(bs); return ret; } -static int bdrv_qed_create(const 
char *filename, QEMUOptionParameter *options) +static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { uint64_t image_size = 0; uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE; @@ -648,54 +667,70 @@ static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options) } return qed_create(filename, cluster_size, image_size, table_size, - backing_file, backing_fmt); + backing_file, backing_fmt, errp); } typedef struct { + BlockDriverState *bs; Coroutine *co; - int is_allocated; + uint64_t pos; + int64_t status; int *pnum; } QEDIsAllocatedCB; static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len) { QEDIsAllocatedCB *cb = opaque; + BDRVQEDState *s = cb->bs->opaque; *cb->pnum = len / BDRV_SECTOR_SIZE; - cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO); + switch (ret) { + case QED_CLUSTER_FOUND: + offset |= qed_offset_into_cluster(s, cb->pos); + cb->status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; + break; + case QED_CLUSTER_ZERO: + cb->status = BDRV_BLOCK_ZERO; + break; + case QED_CLUSTER_L2: + case QED_CLUSTER_L1: + cb->status = 0; + break; + default: + assert(ret < 0); + cb->status = ret; + break; + } + if (cb->co) { qemu_coroutine_enter(cb->co, NULL); } } -static int coroutine_fn bdrv_qed_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { BDRVQEDState *s = bs->opaque; - uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE; QEDIsAllocatedCB cb = { - .is_allocated = -1, + .bs = bs, + .pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE, + .status = BDRV_BLOCK_OFFSET_MASK, .pnum = pnum, }; QEDRequest request = { .l2_table = NULL }; - qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb); + qed_find_cluster(s, &request, cb.pos, len, qed_is_allocated_cb, &cb); /* Now sleep if the callback wasn't invoked immediately */ - while (cb.is_allocated == -1) { + while (cb.status == BDRV_BLOCK_OFFSET_MASK) { cb.co = qemu_coroutine_self(); qemu_coroutine_yield(); } qed_unref_l2_cache_entry(request.l2_table); - return cb.is_allocated; -} - -static int bdrv_qed_make_empty(BlockDriverState *bs) -{ - return -ENOTSUP; + return cb.status; } static BDRVQEDState *acb_to_s(QEDAIOCB *acb) @@ -1368,7 +1403,8 @@ static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret) static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) + int nb_sectors, + BdrvRequestFlags flags) { BlockDriverAIOCB *blockacb; BDRVQEDState *s = bs->opaque; @@ -1445,6 +1481,8 @@ static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) memset(bdi, 0, sizeof(*bdi)); bdi->cluster_size = s->header.cluster_size; bdi->is_dirty = s->header.features & QED_F_NEED_CHECK; + bdi->unallocated_blocks_are_zero = true; + bdi->can_write_zeroes_with_unmap = true; return 0; } @@ -1520,13 +1558,31 @@ static int bdrv_qed_change_backing_file(BlockDriverState *bs, return ret; } -static void bdrv_qed_invalidate_cache(BlockDriverState *bs) +static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp) { BDRVQEDState *s = bs->opaque; + Error *local_err = NULL; + int ret; bdrv_qed_close(bs); + + bdrv_invalidate_cache(bs->file, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + memset(s, 0, sizeof(BDRVQEDState)); - bdrv_qed_open(bs, NULL, bs->open_flags); + ret = 
bdrv_qed_open(bs, NULL, bs->open_flags, &local_err); + if (local_err) { + error_setg(errp, "Could not reopen qed layer: %s", + error_get_pretty(local_err)); + error_free(local_err); + return; + } else if (ret < 0) { + error_setg_errno(errp, -ret, "Could not reopen qed layer"); + return; + } } static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result, @@ -1575,14 +1631,14 @@ static BlockDriver bdrv_qed = { .bdrv_reopen_prepare = bdrv_qed_reopen_prepare, .bdrv_create = bdrv_qed_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_is_allocated = bdrv_qed_co_is_allocated, - .bdrv_make_empty = bdrv_qed_make_empty, + .bdrv_co_get_block_status = bdrv_qed_co_get_block_status, .bdrv_aio_readv = bdrv_qed_aio_readv, .bdrv_aio_writev = bdrv_qed_aio_writev, .bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes, .bdrv_truncate = bdrv_qed_truncate, .bdrv_getlength = bdrv_qed_getlength, .bdrv_get_info = bdrv_qed_get_info, + .bdrv_refresh_limits = bdrv_qed_refresh_limits, .bdrv_change_backing_file = bdrv_qed_change_backing_file, .bdrv_invalidate_cache = bdrv_qed_invalidate_cache, .bdrv_check = bdrv_qed_check, diff --git a/block/qed.h b/block/qed.h index 2b4ddedf3..5d65bea07 100644 --- a/block/qed.h +++ b/block/qed.h @@ -100,7 +100,7 @@ typedef struct { /* if (features & QED_F_BACKING_FILE) */ uint32_t backing_filename_offset; /* in bytes from start of header */ uint32_t backing_filename_size; /* in bytes */ -} QEDHeader; +} QEMU_PACKED QEDHeader; typedef struct { uint64_t offsets[0]; /* in bytes */ diff --git a/block/quorum.c b/block/quorum.c new file mode 100644 index 000000000..7f580a83b --- /dev/null +++ b/block/quorum.c @@ -0,0 +1,877 @@ +/* + * Quorum Block filter + * + * Copyright (C) 2012-2014 Nodalink, EURL. + * + * Author: + * Benoît Canet <benoit.canet@irqsave.net> + * + * Based on the design and code of blkverify.c (Copyright (C) 2010 IBM, Corp) + * and blkmirror.c (Copyright (C) 2011 Red Hat, Inc). + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include <gnutls/gnutls.h> +#include <gnutls/crypto.h> +#include "block/block_int.h" +#include "qapi/qmp/qjson.h" + +#define HASH_LENGTH 32 + +#define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold" +#define QUORUM_OPT_BLKVERIFY "blkverify" + +/* This union holds a vote hash value */ +typedef union QuorumVoteValue { + char h[HASH_LENGTH]; /* SHA-256 hash */ + int64_t l; /* simpler 64 bits hash */ +} QuorumVoteValue; + +/* A vote item */ +typedef struct QuorumVoteItem { + int index; + QLIST_ENTRY(QuorumVoteItem) next; +} QuorumVoteItem; + +/* this structure is a vote version. A version is the set of votes sharing the + * same vote value. + * The set of votes will be tracked with the items field and its cardinality is + * vote_count. + */ +typedef struct QuorumVoteVersion { + QuorumVoteValue value; + int index; + int vote_count; + QLIST_HEAD(, QuorumVoteItem) items; + QLIST_ENTRY(QuorumVoteVersion) next; +} QuorumVoteVersion; + +/* this structure holds a group of vote versions together */ +typedef struct QuorumVotes { + QLIST_HEAD(, QuorumVoteVersion) vote_list; + bool (*compare)(QuorumVoteValue *a, QuorumVoteValue *b); +} QuorumVotes; + +/* the following structure holds the state of one quorum instance */ +typedef struct BDRVQuorumState { + BlockDriverState **bs; /* children BlockDriverStates */ + int num_children; /* children count */ + int threshold; /* if less than threshold children reads gave the + * same result a quorum error occurs. 
+ */ + bool is_blkverify; /* true if the driver is in blkverify mode + * Writes are mirrored on two children devices. + * On reads the two children devices' contents are + * compared and if a difference is spotted its + * location is printed and the code aborts. + * It is useful to debug other block drivers by + * comparing them with a reference one. + */ +} BDRVQuorumState; + +typedef struct QuorumAIOCB QuorumAIOCB; + +/* Quorum will create one instance of the following structure per operation it + * performs on its children. + * So for each read/write operation coming from the upper layer there will be + * $children_count QuorumChildRequest. + */ +typedef struct QuorumChildRequest { + BlockDriverAIOCB *aiocb; + QEMUIOVector qiov; + uint8_t *buf; + int ret; + QuorumAIOCB *parent; +} QuorumChildRequest; + +/* Quorum will use the following structure to track progress of each read/write + * operation received by the upper layer. + * This structure hold pointers to the QuorumChildRequest structures instances + * used to do operations on each children and track overall progress. + */ +struct QuorumAIOCB { + BlockDriverAIOCB common; + + /* Request metadata */ + uint64_t sector_num; + int nb_sectors; + + QEMUIOVector *qiov; /* calling IOV */ + + QuorumChildRequest *qcrs; /* individual child requests */ + int count; /* number of completed AIOCB */ + int success_count; /* number of successfully completed AIOCB */ + + QuorumVotes votes; + + bool is_read; + int vote_ret; +}; + +static void quorum_vote(QuorumAIOCB *acb); + +static void quorum_aio_cancel(BlockDriverAIOCB *blockacb) +{ + QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common); + BDRVQuorumState *s = acb->common.bs->opaque; + int i; + + /* cancel all callbacks */ + for (i = 0; i < s->num_children; i++) { + bdrv_aio_cancel(acb->qcrs[i].aiocb); + } + + g_free(acb->qcrs); + qemu_aio_release(acb); +} + +static AIOCBInfo quorum_aiocb_info = { + .aiocb_size = sizeof(QuorumAIOCB), + .cancel = quorum_aio_cancel, +}; + +static void quorum_aio_finalize(QuorumAIOCB *acb) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + int i, ret = 0; + + if (acb->vote_ret) { + ret = acb->vote_ret; + } + + acb->common.cb(acb->common.opaque, ret); + + if (acb->is_read) { + for (i = 0; i < s->num_children; i++) { + qemu_vfree(acb->qcrs[i].buf); + qemu_iovec_destroy(&acb->qcrs[i].qiov); + } + } + + g_free(acb->qcrs); + qemu_aio_release(acb); +} + +static bool quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b) +{ + return !memcmp(a->h, b->h, HASH_LENGTH); +} + +static bool quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b) +{ + return a->l == b->l; +} + +static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, + BlockDriverState *bs, + QEMUIOVector *qiov, + uint64_t sector_num, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + QuorumAIOCB *acb = qemu_aio_get(&quorum_aiocb_info, bs, cb, opaque); + int i; + + acb->common.bs->opaque = s; + acb->sector_num = sector_num; + acb->nb_sectors = nb_sectors; + acb->qiov = qiov; + acb->qcrs = g_new0(QuorumChildRequest, s->num_children); + acb->count = 0; + acb->success_count = 0; + acb->votes.compare = quorum_sha256_compare; + QLIST_INIT(&acb->votes.vote_list); + acb->is_read = false; + acb->vote_ret = 0; + + for (i = 0; i < s->num_children; i++) { + acb->qcrs[i].buf = NULL; + acb->qcrs[i].ret = 0; + acb->qcrs[i].parent = acb; + } + + return acb; +} + +static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret) +{ + QObject *data; + assert(node_name); + data = 
qobject_from_jsonf("{ 'node-name': %s" + ", 'sector-num': %" PRId64 + ", 'sectors-count': %d }", + node_name, acb->sector_num, acb->nb_sectors); + if (ret < 0) { + QDict *dict = qobject_to_qdict(data); + qdict_put(dict, "error", qstring_from_str(strerror(-ret))); + } + monitor_protocol_event(QEVENT_QUORUM_REPORT_BAD, data); + qobject_decref(data); +} + +static void quorum_report_failure(QuorumAIOCB *acb) +{ + QObject *data; + const char *reference = acb->common.bs->device_name[0] ? + acb->common.bs->device_name : + acb->common.bs->node_name; + data = qobject_from_jsonf("{ 'reference': %s" + ", 'sector-num': %" PRId64 + ", 'sectors-count': %d }", + reference, acb->sector_num, acb->nb_sectors); + monitor_protocol_event(QEVENT_QUORUM_FAILURE, data); + qobject_decref(data); +} + +static int quorum_vote_error(QuorumAIOCB *acb); + +static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + + if (acb->success_count < s->threshold) { + acb->vote_ret = quorum_vote_error(acb); + quorum_report_failure(acb); + return true; + } + + return false; +} + +static void quorum_aio_cb(void *opaque, int ret) +{ + QuorumChildRequest *sacb = opaque; + QuorumAIOCB *acb = sacb->parent; + BDRVQuorumState *s = acb->common.bs->opaque; + + sacb->ret = ret; + acb->count++; + if (ret == 0) { + acb->success_count++; + } else { + quorum_report_bad(acb, sacb->aiocb->bs->node_name, ret); + } + assert(acb->count <= s->num_children); + assert(acb->success_count <= s->num_children); + if (acb->count < s->num_children) { + return; + } + + /* Do the vote on read */ + if (acb->is_read) { + quorum_vote(acb); + } else { + quorum_has_too_much_io_failed(acb); + } + + quorum_aio_finalize(acb); +} + +static void quorum_report_bad_versions(BDRVQuorumState *s, + QuorumAIOCB *acb, + QuorumVoteValue *value) +{ + QuorumVoteVersion *version; + QuorumVoteItem *item; + + QLIST_FOREACH(version, &acb->votes.vote_list, next) { + if (acb->votes.compare(&version->value, value)) { + continue; + } + QLIST_FOREACH(item, &version->items, next) { + quorum_report_bad(acb, s->bs[item->index]->node_name, 0); + } + } +} + +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) +{ + int i; + assert(dest->niov == source->niov); + assert(dest->size == source->size); + for (i = 0; i < source->niov; i++) { + assert(dest->iov[i].iov_len == source->iov[i].iov_len); + memcpy(dest->iov[i].iov_base, + source->iov[i].iov_base, + source->iov[i].iov_len); + } +} + +static void quorum_count_vote(QuorumVotes *votes, + QuorumVoteValue *value, + int index) +{ + QuorumVoteVersion *v = NULL, *version = NULL; + QuorumVoteItem *item; + + /* look if we have something with this hash */ + QLIST_FOREACH(v, &votes->vote_list, next) { + if (votes->compare(&v->value, value)) { + version = v; + break; + } + } + + /* It's a version not yet in the list add it */ + if (!version) { + version = g_new0(QuorumVoteVersion, 1); + QLIST_INIT(&version->items); + memcpy(&version->value, value, sizeof(version->value)); + version->index = index; + version->vote_count = 0; + QLIST_INSERT_HEAD(&votes->vote_list, version, next); + } + + version->vote_count++; + + item = g_new0(QuorumVoteItem, 1); + item->index = index; + QLIST_INSERT_HEAD(&version->items, item, next); +} + +static void quorum_free_vote_list(QuorumVotes *votes) +{ + QuorumVoteVersion *version, *next_version; + QuorumVoteItem *item, *next_item; + + QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version) { + QLIST_REMOVE(version, next); + 
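For orientation, the vote bookkeeping in quorum_count_vote() and quorum_free_vote_list() above groups identical results into "versions", counts how many children produced each one, and later accepts the most popular version only if its count reaches the configured threshold. Below is a minimal, self-contained sketch of that counting idea, using plain arrays and integer values instead of QEMU's QLIST and SHA-256 hashes; everything in it is illustrative, not QEMU code.

#include <stdio.h>

#define MAX_CHILDREN 8

static int vote_winner(const int *results, int n, int threshold, int *winner)
{
    int values[MAX_CHILDREN];   /* distinct result values ("versions") */
    int counts[MAX_CHILDREN];   /* how many children returned each value */
    int nversions = 0;

    for (int i = 0; i < n; i++) {
        int v;
        for (v = 0; v < nversions; v++) {
            if (values[v] == results[i]) {
                counts[v]++;            /* existing version: one more vote */
                break;
            }
        }
        if (v == nversions) {           /* new version: add it to the list */
            values[nversions] = results[i];
            counts[nversions] = 1;
            nversions++;
        }
    }

    if (nversions == 0) {
        return -1;
    }

    int best = 0;
    for (int v = 1; v < nversions; v++) {
        if (counts[v] > counts[best]) {
            best = v;                   /* most represented version so far */
        }
    }

    if (counts[best] < threshold) {
        return -1;                      /* no quorum reached */
    }
    *winner = values[best];
    return 0;
}

int main(void)
{
    int results[] = { 42, 42, 7 };      /* two children agree, one differs */
    int winner;

    if (vote_winner(results, 3, 2, &winner) == 0) {
        printf("quorum reached, winning value: %d\n", winner);
    } else {
        printf("quorum failure\n");
    }
    return 0;
}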
QLIST_FOREACH_SAFE(item, &version->items, next, next_item) { + QLIST_REMOVE(item, next); + g_free(item); + } + g_free(version); + } +} + +static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue *hash) +{ + int j, ret; + gnutls_hash_hd_t dig; + QEMUIOVector *qiov = &acb->qcrs[i].qiov; + + ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256); + + if (ret < 0) { + return ret; + } + + for (j = 0; j < qiov->niov; j++) { + ret = gnutls_hash(dig, qiov->iov[j].iov_base, qiov->iov[j].iov_len); + if (ret < 0) { + break; + } + } + + gnutls_hash_deinit(dig, (void *) hash); + return ret; +} + +static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes) +{ + int max = 0; + QuorumVoteVersion *candidate, *winner = NULL; + + QLIST_FOREACH(candidate, &votes->vote_list, next) { + if (candidate->vote_count > max) { + max = candidate->vote_count; + winner = candidate; + } + } + + return winner; +} + +/* qemu_iovec_compare is handy for blkverify mode because it returns the first + * differing byte location. Yet it is handcoded to compare vectors one byte + * after another so it does not benefit from the libc SIMD optimizations. + * quorum_iovec_compare is written for speed and should be used in the non + * blkverify mode of quorum. + */ +static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) +{ + int i; + int result; + + assert(a->niov == b->niov); + for (i = 0; i < a->niov; i++) { + assert(a->iov[i].iov_len == b->iov[i].iov_len); + result = memcmp(a->iov[i].iov_base, + b->iov[i].iov_base, + a->iov[i].iov_len); + if (result) { + return false; + } + } + + return true; +} + +static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb, + const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ", + acb->sector_num, acb->nb_sectors); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + va_end(ap); + exit(1); +} + +static bool quorum_compare(QuorumAIOCB *acb, + QEMUIOVector *a, + QEMUIOVector *b) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + ssize_t offset; + + /* This driver will replace blkverify in this particular case */ + if (s->is_blkverify) { + offset = qemu_iovec_compare(a, b); + if (offset != -1) { + quorum_err(acb, "contents mismatch in sector %" PRId64, + acb->sector_num + + (uint64_t)(offset / BDRV_SECTOR_SIZE)); + } + return true; + } + + return quorum_iovec_compare(a, b); +} + +/* Do a vote to get the error code */ +static int quorum_vote_error(QuorumAIOCB *acb) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + QuorumVoteVersion *winner = NULL; + QuorumVotes error_votes; + QuorumVoteValue result_value; + int i, ret = 0; + bool error = false; + + QLIST_INIT(&error_votes.vote_list); + error_votes.compare = quorum_64bits_compare; + + for (i = 0; i < s->num_children; i++) { + ret = acb->qcrs[i].ret; + if (ret) { + error = true; + result_value.l = ret; + quorum_count_vote(&error_votes, &result_value, i); + } + } + + if (error) { + winner = quorum_get_vote_winner(&error_votes); + ret = winner->value.l; + } + + quorum_free_vote_list(&error_votes); + + return ret; +} + +static void quorum_vote(QuorumAIOCB *acb) +{ + bool quorum = true; + int i, j, ret; + QuorumVoteValue hash; + BDRVQuorumState *s = acb->common.bs->opaque; + QuorumVoteVersion *winner; + + if (quorum_has_too_much_io_failed(acb)) { + return; + } + + /* get the index of the first successful read */ + for (i = 0; i < s->num_children; i++) { + if (!acb->qcrs[i].ret) { + break; + } + } + + assert(i < s->num_children); + + /* compare this 
read with all other successful reads stopping at quorum + * failure + */ + for (j = i + 1; j < s->num_children; j++) { + if (acb->qcrs[j].ret) { + continue; + } + quorum = quorum_compare(acb, &acb->qcrs[i].qiov, &acb->qcrs[j].qiov); + if (!quorum) { + break; + } + } + + /* Every successful read agrees */ + if (quorum) { + quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov); + return; + } + + /* compute hashes for each successful read, also store indexes */ + for (i = 0; i < s->num_children; i++) { + if (acb->qcrs[i].ret) { + continue; + } + ret = quorum_compute_hash(acb, i, &hash); + /* if ever the hash computation failed */ + if (ret < 0) { + acb->vote_ret = ret; + goto free_exit; + } + quorum_count_vote(&acb->votes, &hash, i); + } + + /* vote to select the most represented version */ + winner = quorum_get_vote_winner(&acb->votes); + + /* if the winner count is smaller than threshold the read fails */ + if (winner->vote_count < s->threshold) { + quorum_report_failure(acb); + acb->vote_ret = -EIO; + goto free_exit; + } + + /* we have a winner: copy it */ + quorum_copy_qiov(acb->qiov, &acb->qcrs[winner->index].qiov); + + /* some versions are bad print them */ + quorum_report_bad_versions(s, acb, &winner->value); + +free_exit: + /* free lists */ + quorum_free_vote_list(&acb->votes); +} + +static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + BDRVQuorumState *s = bs->opaque; + QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, + nb_sectors, cb, opaque); + int i; + + acb->is_read = true; + + for (i = 0; i < s->num_children; i++) { + acb->qcrs[i].buf = qemu_blockalign(s->bs[i], qiov->size); + qemu_iovec_init(&acb->qcrs[i].qiov, qiov->niov); + qemu_iovec_clone(&acb->qcrs[i].qiov, qiov, acb->qcrs[i].buf); + } + + for (i = 0; i < s->num_children; i++) { + bdrv_aio_readv(s->bs[i], sector_num, &acb->qcrs[i].qiov, nb_sectors, + quorum_aio_cb, &acb->qcrs[i]); + } + + return &acb->common; +} + +static BlockDriverAIOCB *quorum_aio_writev(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + BDRVQuorumState *s = bs->opaque; + QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, nb_sectors, + cb, opaque); + int i; + + for (i = 0; i < s->num_children; i++) { + acb->qcrs[i].aiocb = bdrv_aio_writev(s->bs[i], sector_num, qiov, + nb_sectors, &quorum_aio_cb, + &acb->qcrs[i]); + } + + return &acb->common; +} + +static int64_t quorum_getlength(BlockDriverState *bs) +{ + BDRVQuorumState *s = bs->opaque; + int64_t result; + int i; + + /* check that all file have the same length */ + result = bdrv_getlength(s->bs[0]); + if (result < 0) { + return result; + } + for (i = 1; i < s->num_children; i++) { + int64_t value = bdrv_getlength(s->bs[i]); + if (value < 0) { + return value; + } + if (value != result) { + return -EIO; + } + } + + return result; +} + +static void quorum_invalidate_cache(BlockDriverState *bs, Error **errp) +{ + BDRVQuorumState *s = bs->opaque; + Error *local_err = NULL; + int i; + + for (i = 0; i < s->num_children; i++) { + bdrv_invalidate_cache(s->bs[i], &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + } +} + +static coroutine_fn int quorum_co_flush(BlockDriverState *bs) +{ + BDRVQuorumState *s = bs->opaque; + QuorumVoteVersion *winner = NULL; + QuorumVotes error_votes; + QuorumVoteValue result_value; + int i; + int result = 0; + + 
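As an aside, quorum_compute_hash() earlier drives gnutls' incremental hash interface over each element of a child's I/O vector. The following standalone sketch shows that gnutls pattern in isolation, assuming only the public gnutls_hash_init()/gnutls_hash()/gnutls_hash_deinit() API (build with -lgnutls); it is an editorial example, not QEMU code.

#include <stdio.h>
#include <string.h>
#include <gnutls/gnutls.h>
#include <gnutls/crypto.h>

int main(void)
{
    const char *fragments[] = { "hello, ", "world" };   /* stand-in iovec */
    unsigned char digest[32];                           /* SHA-256 size */
    gnutls_hash_hd_t dig;
    int ret;

    ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256);
    if (ret < 0) {
        return 1;
    }

    /* One gnutls_hash() call per vector element, as in the driver code */
    for (int i = 0; i < 2; i++) {
        ret = gnutls_hash(dig, fragments[i], strlen(fragments[i]));
        if (ret < 0) {
            break;
        }
    }

    /* Deinit collects the digest and frees the handle */
    gnutls_hash_deinit(dig, digest);

    for (int i = 0; i < 32; i++) {
        printf("%02x", digest[i]);
    }
    printf("\n");
    return ret < 0 ? 1 : 0;
}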
QLIST_INIT(&error_votes.vote_list); + error_votes.compare = quorum_64bits_compare; + + for (i = 0; i < s->num_children; i++) { + result = bdrv_co_flush(s->bs[i]); + result_value.l = result; + quorum_count_vote(&error_votes, &result_value, i); + } + + winner = quorum_get_vote_winner(&error_votes); + result = winner->value.l; + + quorum_free_vote_list(&error_votes); + + return result; +} + +static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate) +{ + BDRVQuorumState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_children; i++) { + bool perm = bdrv_recurse_is_first_non_filter(s->bs[i], + candidate); + if (perm) { + return true; + } + } + + return false; +} + +static int quorum_valid_threshold(int threshold, int num_children, Error **errp) +{ + + if (threshold < 1) { + error_set(errp, QERR_INVALID_PARAMETER_VALUE, + "vote-threshold", "value >= 1"); + return -ERANGE; + } + + if (threshold > num_children) { + error_setg(errp, "threshold may not exceed children count"); + return -ERANGE; + } + + return 0; +} + +static QemuOptsList quorum_runtime_opts = { + .name = "quorum", + .head = QTAILQ_HEAD_INITIALIZER(quorum_runtime_opts.head), + .desc = { + { + .name = QUORUM_OPT_VOTE_THRESHOLD, + .type = QEMU_OPT_NUMBER, + .help = "The number of vote needed for reaching quorum", + }, + { + .name = QUORUM_OPT_BLKVERIFY, + .type = QEMU_OPT_BOOL, + .help = "Trigger block verify mode if set", + }, + { /* end of list */ } + }, +}; + +static int quorum_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVQuorumState *s = bs->opaque; + Error *local_err = NULL; + QemuOpts *opts; + bool *opened; + QDict *sub = NULL; + QList *list = NULL; + const QListEntry *lentry; + int i; + int ret = 0; + + qdict_flatten(options); + qdict_extract_subqdict(options, &sub, "children."); + qdict_array_split(sub, &list); + + if (qdict_size(sub)) { + error_setg(&local_err, "Invalid option children.%s", + qdict_first(sub)->key); + ret = -EINVAL; + goto exit; + } + + /* count how many different children are present */ + s->num_children = qlist_size(list); + if (s->num_children < 2) { + error_setg(&local_err, + "Number of provided children must be greater than 1"); + ret = -EINVAL; + goto exit; + } + + opts = qemu_opts_create(&quorum_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + ret = -EINVAL; + goto exit; + } + + s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0); + + /* and validate it against s->num_children */ + ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err); + if (ret < 0) { + goto exit; + } + + /* is the driver in blkverify mode */ + if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) && + s->num_children == 2 && s->threshold == 2) { + s->is_blkverify = true; + } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) { + fprintf(stderr, "blkverify mode is set by setting blkverify=on " + "and using two files with vote_threshold=2\n"); + } + + /* allocate the children BlockDriverState array */ + s->bs = g_new0(BlockDriverState *, s->num_children); + opened = g_new0(bool, s->num_children); + + for (i = 0, lentry = qlist_first(list); lentry; + lentry = qlist_next(lentry), i++) { + QDict *d; + QString *string; + + switch (qobject_type(lentry->value)) + { + /* List of options */ + case QTYPE_QDICT: + d = qobject_to_qdict(lentry->value); + QINCREF(d); + ret = bdrv_open(&s->bs[i], NULL, NULL, d, flags, NULL, + &local_err); + 
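Further down, quorum_open() relies on qdict_flatten(), qdict_extract_subqdict() and qdict_array_split() to turn flat "children.<n>.<option>" keys into one option dictionary per child. A self-contained sketch of that key-splitting idea follows, with a hypothetical child_index() helper standing in for the QDict machinery (illustrative only; not the QEMU API):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Return the child index encoded in a "children.<n>.<rest>" key, or -1 if
 * the key is not a child option; *rest points at the per-child option name */
static int child_index(const char *key, const char **rest)
{
    char *end;
    long n;

    if (strncmp(key, "children.", 9) != 0) {
        return -1;
    }
    n = strtol(key + 9, &end, 10);
    if (end == key + 9 || *end != '.') {
        return -1;                       /* no numeric index after the dot */
    }
    *rest = end + 1;
    return (int)n;
}

int main(void)
{
    const char *keys[] = {
        "children.0.file.filename",
        "children.1.file.filename",
        "vote-threshold",
    };

    for (int i = 0; i < 3; i++) {
        const char *rest;
        int n = child_index(keys[i], &rest);
        if (n >= 0) {
            printf("child %d option: %s\n", n, rest);
        } else {
            printf("top-level option: %s\n", keys[i]);
        }
    }
    return 0;
}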
break; + + /* QMP reference */ + case QTYPE_QSTRING: + string = qobject_to_qstring(lentry->value); + ret = bdrv_open(&s->bs[i], NULL, qstring_get_str(string), NULL, + flags, NULL, &local_err); + break; + + default: + error_setg(&local_err, "Specification of child block device %i " + "is invalid", i); + ret = -EINVAL; + } + + if (ret < 0) { + goto close_exit; + } + opened[i] = true; + } + + g_free(opened); + goto exit; + +close_exit: + /* cleanup on error */ + for (i = 0; i < s->num_children; i++) { + if (!opened[i]) { + continue; + } + bdrv_unref(s->bs[i]); + } + g_free(s->bs); + g_free(opened); +exit: + /* propagate error */ + if (error_is_set(&local_err)) { + error_propagate(errp, local_err); + } + QDECREF(list); + QDECREF(sub); + return ret; +} + +static void quorum_close(BlockDriverState *bs) +{ + BDRVQuorumState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_children; i++) { + bdrv_unref(s->bs[i]); + } + + g_free(s->bs); +} + +static BlockDriver bdrv_quorum = { + .format_name = "quorum", + .protocol_name = "quorum", + + .instance_size = sizeof(BDRVQuorumState), + + .bdrv_file_open = quorum_open, + .bdrv_close = quorum_close, + + .bdrv_co_flush_to_disk = quorum_co_flush, + + .bdrv_getlength = quorum_getlength, + + .bdrv_aio_readv = quorum_aio_readv, + .bdrv_aio_writev = quorum_aio_writev, + .bdrv_invalidate_cache = quorum_invalidate_cache, + + .is_filter = true, + .bdrv_recurse_is_first_non_filter = quorum_recurse_is_first_non_filter, +}; + +static void bdrv_quorum_init(void) +{ + bdrv_register(&bdrv_quorum); +} + +block_init(bdrv_quorum_init); diff --git a/block/raw-aio.h b/block/raw-aio.h index c61f1595d..7ad0a8a0a 100644 --- a/block/raw-aio.h +++ b/block/raw-aio.h @@ -21,9 +21,10 @@ #define QEMU_AIO_IOCTL 0x0004 #define QEMU_AIO_FLUSH 0x0008 #define QEMU_AIO_DISCARD 0x0010 +#define QEMU_AIO_WRITE_ZEROES 0x0020 #define QEMU_AIO_TYPE_MASK \ (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \ - QEMU_AIO_DISCARD) + QEMU_AIO_DISCARD|QEMU_AIO_WRITE_ZEROES) /* AIO flags */ #define QEMU_AIO_MISALIGNED 0x1000 diff --git a/block/raw-posix.c b/block/raw-posix.c index ba721d3f5..1688e16c6 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -127,6 +127,8 @@ typedef struct BDRVRawState { int fd; int type; int open_flags; + size_t buf_align; + #if defined(__linux__) /* linux floppy specific */ int64_t fd_open_time; @@ -139,9 +141,11 @@ typedef struct BDRVRawState { void *aio_ctx; #endif #ifdef CONFIG_XFS - bool is_xfs : 1; + bool is_xfs:1; #endif - bool has_discard : 1; + bool has_discard:1; + bool has_write_zeroes:1; + bool discard_zeroes:1; } BDRVRawState; typedef struct BDRVRawReopenState { @@ -211,6 +215,76 @@ static int raw_normalize_devicepath(const char **filename) } #endif +static void raw_probe_alignment(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + char *buf; + unsigned int sector_size; + + /* For /dev/sg devices the alignment is not really used. + With buffered I/O, we don't have any restrictions. 
*/ + if (bs->sg || !(s->open_flags & O_DIRECT)) { + bs->request_alignment = 1; + s->buf_align = 1; + return; + } + + /* Try a few ioctls to get the right size */ + bs->request_alignment = 0; + s->buf_align = 0; + +#ifdef BLKSSZGET + if (ioctl(s->fd, BLKSSZGET, &sector_size) >= 0) { + bs->request_alignment = sector_size; + } +#endif +#ifdef DKIOCGETBLOCKSIZE + if (ioctl(s->fd, DKIOCGETBLOCKSIZE, &sector_size) >= 0) { + bs->request_alignment = sector_size; + } +#endif +#ifdef DIOCGSECTORSIZE + if (ioctl(s->fd, DIOCGSECTORSIZE, &sector_size) >= 0) { + bs->request_alignment = sector_size; + } +#endif +#ifdef CONFIG_XFS + if (s->is_xfs) { + struct dioattr da; + if (xfsctl(NULL, s->fd, XFS_IOC_DIOINFO, &da) >= 0) { + bs->request_alignment = da.d_miniosz; + /* The kernel returns wrong information for d_mem */ + /* s->buf_align = da.d_mem; */ + } + } +#endif + + /* If we could not get the sizes so far, we can only guess them */ + if (!s->buf_align) { + size_t align; + buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE); + for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { + if (pread(s->fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) { + s->buf_align = align; + break; + } + } + qemu_vfree(buf); + } + + if (!bs->request_alignment) { + size_t align; + buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE); + for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { + if (pread(s->fd, buf, align, 0) >= 0) { + bs->request_alignment = align; + break; + } + } + qemu_vfree(buf); + } +} + static void raw_parse_flags(int bdrv_flags, int *open_flags) { assert(open_flags != NULL); @@ -262,6 +336,17 @@ error: } #endif +static void raw_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The filename does not have to be prefixed by the protocol name, since + * "file" is the default protocol; therefore, the return value of this + * function call can be ignored.
*/ + strstart(filename, "file:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + static QemuOptsList raw_runtime_opts = { .name = "raw", .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), @@ -276,19 +361,19 @@ static QemuOptsList raw_runtime_opts = { }; static int raw_open_common(BlockDriverState *bs, QDict *options, - int bdrv_flags, int open_flags) + int bdrv_flags, int open_flags, Error **errp) { BDRVRawState *s = bs->opaque; QemuOpts *opts; Error *local_err = NULL; const char *filename; int fd, ret; + struct stat st; - opts = qemu_opts_create_nofail(&raw_runtime_opts); + opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } @@ -297,6 +382,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, ret = raw_normalize_devicepath(&filename); if (ret != 0) { + error_setg_errno(errp, -ret, "Could not normalize device path"); goto fail; } @@ -318,14 +404,43 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) { qemu_close(fd); ret = -errno; + error_setg_errno(errp, -ret, "Could not set AIO state"); + goto fail; + } +#endif + + s->has_discard = true; + s->has_write_zeroes = true; + + if (fstat(s->fd, &st) < 0) { + error_setg_errno(errp, errno, "Could not stat file"); goto fail; } + if (S_ISREG(st.st_mode)) { + s->discard_zeroes = true; + } + if (S_ISBLK(st.st_mode)) { +#ifdef BLKDISCARDZEROES + unsigned int arg; + if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) { + s->discard_zeroes = true; + } +#endif +#ifdef __linux__ + /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do + * not rely on the contents of discarded blocks unless using O_DIRECT. + * Same for BLKZEROOUT. 
+ */ + if (!(bs->open_flags & BDRV_O_NOCACHE)) { + s->discard_zeroes = false; + s->has_write_zeroes = false; + } #endif + } - s->has_discard = 1; #ifdef CONFIG_XFS if (platform_test_xfs_fd(s->fd)) { - s->is_xfs = 1; + s->is_xfs = true; } #endif @@ -335,12 +450,19 @@ fail: return ret; } -static int raw_open(BlockDriverState *bs, QDict *options, int flags) +static int raw_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; + Error *local_err = NULL; + int ret; s->type = FTYPE_FILE; - return raw_open_common(bs, options, flags, 0); + ret = raw_open_common(bs, options, flags, 0, &local_err); + if (local_err) { + error_propagate(errp, local_err); + } + return ret; } static int raw_reopen_prepare(BDRVReopenState *state, @@ -365,6 +487,7 @@ static int raw_reopen_prepare(BDRVReopenState *state, * valid in the 'false' condition even if aio_ctx is set, and raw_set_aio() * won't override aio_ctx if aio_ctx is non-NULL */ if (raw_set_aio(&s->aio_ctx, &raw_s->use_aio, state->flags)) { + error_setg(errp, "Could not set AIO state"); return -1; } #endif @@ -416,13 +539,13 @@ static int raw_reopen_prepare(BDRVReopenState *state, assert(!(raw_s->open_flags & O_CREAT)); raw_s->fd = qemu_open(state->bs->filename, raw_s->open_flags); if (raw_s->fd == -1) { + error_setg_errno(errp, errno, "Could not reopen file"); ret = -1; } } return ret; } - static void raw_reopen_commit(BDRVReopenState *state) { BDRVRawReopenState *raw_s = state->opaque; @@ -458,23 +581,15 @@ static void raw_reopen_abort(BDRVReopenState *state) state->opaque = NULL; } +static int raw_refresh_limits(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; -/* XXX: use host sector size if necessary with: -#ifdef DIOCGSECTORSIZE - { - unsigned int sectorsize = 512; - if (!ioctl(fd, DIOCGSECTORSIZE, &sectorsize) && - sectorsize > bufsize) - bufsize = sectorsize; - } -#endif -#ifdef CONFIG_COCOA - uint32_t blockSize = 512; - if ( !ioctl( fd, DKIOCGETBLOCKSIZE, &blockSize ) && blockSize > bufsize) { - bufsize = blockSize; - } -#endif -*/ + raw_probe_alignment(bs); + bs->bl.opt_mem_alignment = s->buf_align; + + return 0; +} static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) { @@ -665,6 +780,23 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) } #ifdef CONFIG_XFS +static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes) +{ + struct xfs_flock64 fl; + + memset(&fl, 0, sizeof(fl)); + fl.l_whence = SEEK_SET; + fl.l_start = offset; + fl.l_len = bytes; + + if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) { + DEBUG_BLOCK_PRINT("cannot write zero range (%s)\n", strerror(errno)); + return -errno; + } + + return 0; +} + static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) { struct xfs_flock64 fl; @@ -683,13 +815,49 @@ static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) } #endif +static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) +{ + int ret = -EOPNOTSUPP; + BDRVRawState *s = aiocb->bs->opaque; + + if (s->has_write_zeroes == 0) { + return -ENOTSUP; + } + + if (aiocb->aio_type & QEMU_AIO_BLKDEV) { +#ifdef BLKZEROOUT + do { + uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; + if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) { + return 0; + } + } while (errno == EINTR); + + ret = -errno; +#endif + } else { +#ifdef CONFIG_XFS + if (s->is_xfs) { + return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes); + } +#endif + } + + if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP || + ret == -ENOTTY)
{ + s->has_write_zeroes = false; + ret = -ENOTSUP; + } + return ret; +} + static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) { int ret = -EOPNOTSUPP; BDRVRawState *s = aiocb->bs->opaque; - if (s->has_discard == 0) { - return 0; + if (!s->has_discard) { + return -ENOTSUP; } if (aiocb->aio_type & QEMU_AIO_BLKDEV) { @@ -724,8 +892,8 @@ static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP || ret == -ENOTTY) { - s->has_discard = 0; - ret = 0; + s->has_discard = false; + ret = -ENOTSUP; } return ret; } @@ -767,6 +935,9 @@ static int aio_worker(void *arg) case QEMU_AIO_DISCARD: ret = handle_aiocb_discard(aiocb); break; + case QEMU_AIO_WRITE_ZEROES: + ret = handle_aiocb_write_zeroes(aiocb); + break; default: fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); ret = -EINVAL; @@ -777,6 +948,29 @@ static int aio_worker(void *arg) return ret; } +static int paio_submit_co(BlockDriverState *bs, int fd, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + int type) +{ + RawPosixAIOData *acb = g_slice_new(RawPosixAIOData); + ThreadPool *pool; + + acb->bs = bs; + acb->aio_type = type; + acb->aio_fildes = fd; + + if (qiov) { + acb->aio_iov = qiov->iov; + acb->aio_niov = qiov->niov; + } + acb->aio_nbytes = nb_sectors * 512; + acb->aio_offset = sector_num * 512; + + trace_paio_submit_co(sector_num, nb_sectors, type); + pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); + return thread_pool_submit_co(pool, aio_worker, acb); +} + static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque, int type) @@ -1040,12 +1234,15 @@ static int64_t raw_get_allocated_file_size(BlockDriverState *bs) return (int64_t)st.st_blocks * 512; } -static int raw_create(const char *filename, QEMUOptionParameter *options) +static int raw_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int fd; int result = 0; int64_t total_size = 0; + strstart(filename, "file:", &filename); + /* Read out options */ while (options && options->name) { if (!strcmp(options->name, BLOCK_OPT_SIZE)) { @@ -1058,12 +1255,15 @@ static int raw_create(const char *filename, QEMUOptionParameter *options) 0644); if (fd < 0) { result = -errno; + error_setg_errno(errp, -result, "Could not create file"); } else { if (ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) { result = -errno; + error_setg_errno(errp, -result, "Could not resize file"); } if (qemu_close(fd) != 0) { result = -errno; + error_setg_errno(errp, -result, "Could not close the new file"); } } return result; @@ -1084,12 +1284,12 @@ static int raw_create(const char *filename, QEMUOptionParameter *options) * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes * beyond the end of the disk image it will be clamped. 
*/ -static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { off_t start, data, hole; - int ret; + int64_t ret; ret = fd_open(bs); if (ret < 0) { @@ -1097,6 +1297,7 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, } start = sector_num * BDRV_SECTOR_SIZE; + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; #ifdef CONFIG_FIEMAP @@ -1114,7 +1315,7 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) { /* Assume everything is allocated. */ *pnum = nb_sectors; - return 1; + return ret; } if (f.fm.fm_mapped_extents == 0) { @@ -1127,6 +1328,9 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, } else { data = f.fe.fe_logical; hole = f.fe.fe_logical + f.fe.fe_length; + if (f.fe.fe_flags & FIEMAP_EXTENT_UNWRITTEN) { + ret |= BDRV_BLOCK_ZERO; + } } #elif defined SEEK_HOLE && defined SEEK_DATA @@ -1141,7 +1345,7 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, /* Most likely EINVAL. Assume everything is allocated. */ *pnum = nb_sectors; - return 1; + return ret; } if (hole > start) { @@ -1154,19 +1358,21 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, } } #else - *pnum = nb_sectors; - return 1; + data = 0; + hole = start + nb_sectors * BDRV_SECTOR_SIZE; #endif if (data <= start) { /* On a data extent, compute sectors to the end of the extent. */ *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE); - return 1; } else { /* On a hole, compute sectors to the beginning of the next extent. */ *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); - return 0; + ret &= ~BDRV_BLOCK_DATA; + ret |= BDRV_BLOCK_ZERO; } + + return ret; } static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs, @@ -1179,6 +1385,31 @@ static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs, cb, opaque, QEMU_AIO_DISCARD); } +static int coroutine_fn raw_co_write_zeroes( + BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + BDRVRawState *s = bs->opaque; + + if (!(flags & BDRV_REQ_MAY_UNMAP)) { + return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + QEMU_AIO_WRITE_ZEROES); + } else if (s->discard_zeroes) { + return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + QEMU_AIO_DISCARD); + } + return -ENOTSUP; +} + +static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVRawState *s = bs->opaque; + + bdi->unallocated_blocks_are_zero = s->discard_zeroes; + bdi->can_write_zeroes_with_unmap = s->discard_zeroes; + return 0; +} + static QEMUOptionParameter raw_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -1192,7 +1423,9 @@ static BlockDriver bdrv_file = { .format_name = "file", .protocol_name = "file", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, .bdrv_probe = NULL, /* no probe for protocols */ + .bdrv_parse_filename = raw_parse_filename, .bdrv_file_open = raw_open, .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, @@ -1200,15 +1433,18 @@ static BlockDriver bdrv_file = { .bdrv_close = raw_close, .bdrv_create = raw_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_is_allocated = raw_co_is_allocated, + .bdrv_co_get_block_status = raw_co_get_block_status, + .bdrv_co_write_zeroes = raw_co_write_zeroes, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, 
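The raw_co_get_block_status() code above prefers FIEMAP where available and otherwise probes with lseek(SEEK_HOLE)/lseek(SEEK_DATA), falling back to "everything is allocated" when neither works. Below is a standalone Linux sketch of the lseek-based probe; it is an editorial example whose error handling mirrors the driver's fallback, not QEMU code.

#define _GNU_SOURCE     /* for SEEK_HOLE/SEEK_DATA on glibc */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    if (argc != 2) {
        fprintf(stderr, "usage: %s <file>\n", argv[0]);
        return 1;
    }

    int fd = open(argv[1], O_RDONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    off_t start = 0;
    off_t hole = lseek(fd, start, SEEK_HOLE);   /* end of data after start */
    off_t data = lseek(fd, start, SEEK_DATA);   /* first data after start */

    if (hole < 0 || data < 0) {
        /* Most likely EINVAL or ENXIO: treat the file as fully allocated,
         * just as the driver above assumes everything is data on failure */
        perror("lseek");
        close(fd);
        return 1;
    }

    if (data <= start) {
        printf("offset 0 is data, extent runs to %lld\n", (long long)hole);
    } else {
        printf("offset 0 is a hole, next data at %lld\n", (long long)data);
    }

    close(fd);
    return 0;
}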
.bdrv_aio_flush = raw_aio_flush, .bdrv_aio_discard = raw_aio_discard, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, + .bdrv_get_info = raw_get_info, .bdrv_get_allocated_file_size = raw_get_allocated_file_size, @@ -1325,9 +1561,20 @@ static int check_hdev_writable(BDRVRawState *s) return 0; } -static int hdev_open(BlockDriverState *bs, QDict *options, int flags) +static void hdev_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The prefix is optional, just as for "file". */ + strstart(filename, "host_device:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + +static int hdev_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; + Error *local_err = NULL; int ret; const char *filename = qdict_get_str(options, "filename"); @@ -1371,8 +1618,11 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags) } #endif - ret = raw_open_common(bs, options, flags, 0); + ret = raw_open_common(bs, options, flags, 0, &local_err); if (ret < 0) { + if (local_err) { + error_propagate(errp, local_err); + } return ret; } @@ -1380,6 +1630,7 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags) ret = check_hdev_writable(s); if (ret < 0) { raw_close(bs); + error_setg_errno(errp, -ret, "The device is not writable"); return ret; } } @@ -1498,12 +1749,45 @@ static coroutine_fn BlockDriverAIOCB *hdev_aio_discard(BlockDriverState *bs, cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); } -static int hdev_create(const char *filename, QEMUOptionParameter *options) +static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ + BDRVRawState *s = bs->opaque; + int rc; + + rc = fd_open(bs); + if (rc < 0) { + return rc; + } + if (!(flags & BDRV_REQ_MAY_UNMAP)) { + return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV); + } else if (s->discard_zeroes) { + return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); + } + return -ENOTSUP; +} + +static int hdev_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int fd; int ret = 0; struct stat stat_buf; int64_t total_size = 0; + bool has_prefix; + + /* This function is used by all three protocol block drivers and therefore + * any of these three prefixes may be given. + * The return value has to be stored somewhere, otherwise this is an error + * due to -Werror=unused-value. 
*/ + has_prefix = + strstart(filename, "host_device:", &filename) || + strstart(filename, "host_cdrom:" , &filename) || + strstart(filename, "host_floppy:", &filename); + + (void)has_prefix; /* Read out options */ while (options && options->name) { @@ -1514,15 +1798,23 @@ static int hdev_create(const char *filename, QEMUOptionParameter *options) } fd = qemu_open(filename, O_WRONLY | O_BINARY); - if (fd < 0) - return -errno; + if (fd < 0) { + ret = -errno; + error_setg_errno(errp, -ret, "Could not open device"); + return ret; + } - if (fstat(fd, &stat_buf) < 0) + if (fstat(fd, &stat_buf) < 0) { ret = -errno; - else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) + error_setg_errno(errp, -ret, "Could not stat device"); + } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) { + error_setg(errp, + "The given file is neither a block nor a character device"); ret = -ENODEV; - else if (lseek(fd, 0, SEEK_END) < total_size * BDRV_SECTOR_SIZE) + } else if (lseek(fd, 0, SEEK_END) < total_size * BDRV_SECTOR_SIZE) { + error_setg(errp, "Device is too small"); ret = -ENOSPC; + } qemu_close(fd); return ret; @@ -1532,7 +1824,9 @@ static BlockDriver bdrv_host_device = { .format_name = "host_device", .protocol_name = "host_device", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, .bdrv_probe_device = hdev_probe_device, + .bdrv_parse_filename = hdev_parse_filename, .bdrv_file_open = hdev_open, .bdrv_close = raw_close, .bdrv_reopen_prepare = raw_reopen_prepare, @@ -1540,14 +1834,17 @@ static BlockDriver bdrv_host_device = { .bdrv_reopen_abort = raw_reopen_abort, .bdrv_create = hdev_create, .create_options = raw_create_options, + .bdrv_co_write_zeroes = hdev_co_write_zeroes, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, .bdrv_aio_discard = hdev_aio_discard, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, + .bdrv_get_info = raw_get_info, .bdrv_get_allocated_file_size = raw_get_allocated_file_size, @@ -1559,17 +1856,32 @@ static BlockDriver bdrv_host_device = { }; #ifdef __linux__ -static int floppy_open(BlockDriverState *bs, QDict *options, int flags) +static void floppy_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The prefix is optional, just as for "file". 
*/ + strstart(filename, "host_floppy:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + +static int floppy_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; + Error *local_err = NULL; int ret; s->type = FTYPE_FD; /* open will not fail even if no floppy is inserted, so add O_NONBLOCK */ - ret = raw_open_common(bs, options, flags, O_NONBLOCK); - if (ret) + ret = raw_open_common(bs, options, flags, O_NONBLOCK, &local_err); + if (ret) { + if (local_err) { + error_propagate(errp, local_err); + } return ret; + } /* close fd so that we can reopen it as needed */ qemu_close(s->fd); @@ -1656,7 +1968,9 @@ static BlockDriver bdrv_host_floppy = { .format_name = "host_floppy", .protocol_name = "host_floppy", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, .bdrv_probe_device = floppy_probe_device, + .bdrv_parse_filename = floppy_parse_filename, .bdrv_file_open = floppy_open, .bdrv_close = raw_close, .bdrv_reopen_prepare = raw_reopen_prepare, @@ -1668,9 +1982,11 @@ static BlockDriver bdrv_host_floppy = { .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, - .bdrv_getlength = raw_getlength, + .bdrv_getlength = raw_getlength, + .has_variable_length = true, .bdrv_get_allocated_file_size = raw_get_allocated_file_size, @@ -1679,15 +1995,35 @@ static BlockDriver bdrv_host_floppy = { .bdrv_media_changed = floppy_media_changed, .bdrv_eject = floppy_eject, }; +#endif + +#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) +static void cdrom_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The prefix is optional, just as for "file". 
*/ + strstart(filename, "host_cdrom:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} +#endif -static int cdrom_open(BlockDriverState *bs, QDict *options, int flags) +#ifdef __linux__ +static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; + Error *local_err = NULL; + int ret; s->type = FTYPE_CD; /* open will not fail even if no CD is inserted, so add O_NONBLOCK */ - return raw_open_common(bs, options, flags, O_NONBLOCK); + ret = raw_open_common(bs, options, flags, O_NONBLOCK, &local_err); + if (local_err) { + error_propagate(errp, local_err); + } + return ret; } static int cdrom_probe_device(const char *filename) @@ -1757,7 +2093,9 @@ static BlockDriver bdrv_host_cdrom = { .format_name = "host_cdrom", .protocol_name = "host_cdrom", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, .bdrv_probe_device = cdrom_probe_device, + .bdrv_parse_filename = cdrom_parse_filename, .bdrv_file_open = cdrom_open, .bdrv_close = raw_close, .bdrv_reopen_prepare = raw_reopen_prepare, @@ -1769,9 +2107,11 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, - .bdrv_getlength = raw_getlength, + .bdrv_getlength = raw_getlength, + .has_variable_length = true, .bdrv_get_allocated_file_size = raw_get_allocated_file_size, @@ -1787,16 +2127,22 @@ static BlockDriver bdrv_host_cdrom = { #endif /* __linux__ */ #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) -static int cdrom_open(BlockDriverState *bs, QDict *options, int flags) +static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; + Error *local_err = NULL; int ret; s->type = FTYPE_CD; - ret = raw_open_common(bs, options, flags, 0); - if (ret) + ret = raw_open_common(bs, options, flags, 0, &local_err); + if (ret) { + if (local_err) { + error_propagate(errp, local_err); + } return ret; + } /* make sure the door isn't locked at this time */ ioctl(s->fd, CDIOCALLOW); @@ -1878,7 +2224,9 @@ static BlockDriver bdrv_host_cdrom = { .format_name = "host_cdrom", .protocol_name = "host_cdrom", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, .bdrv_probe_device = cdrom_probe_device, + .bdrv_parse_filename = cdrom_parse_filename, .bdrv_file_open = cdrom_open, .bdrv_close = raw_close, .bdrv_reopen_prepare = raw_reopen_prepare, @@ -1890,9 +2238,11 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, - .bdrv_getlength = raw_getlength, + .bdrv_getlength = raw_getlength, + .has_variable_length = true, .bdrv_get_allocated_file_size = raw_get_allocated_file_size, diff --git a/block/raw-win32.c b/block/raw-win32.c index 9b5b2af4e..48cb2c225 100644 --- a/block/raw-win32.c +++ b/block/raw-win32.c @@ -85,6 +85,7 @@ static size_t handle_aiocb_rw(RawWin32AIOData *aiocb) ret_count = 0; } if (ret_count != len) { + offset += ret_count; break; } offset += len; @@ -201,6 +202,35 @@ static int set_sparse(int fd) NULL, 0, NULL, 0, &returned, NULL); } +static void raw_probe_alignment(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + DWORD sectorsPerCluster, freeClusters, totalClusters, count; + DISK_GEOMETRY_EX dg; + BOOL status; + + if (s->type == 
FTYPE_CD) { + bs->request_alignment = 2048; + return; + } + if (s->type == FTYPE_HARDDISK) { + status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX, + NULL, 0, &dg, sizeof(dg), &count, NULL); + if (status != 0) { + bs->request_alignment = dg.Geometry.BytesPerSector; + return; + } + /* try GetDiskFreeSpace too */ + } + + if (s->drive_path[0]) { + GetDiskFreeSpace(s->drive_path, &sectorsPerCluster, + &dg.Geometry.BytesPerSector, + &freeClusters, &totalClusters); + bs->request_alignment = dg.Geometry.BytesPerSector; + } +} + static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped) { assert(access_flags != NULL); @@ -221,6 +251,17 @@ static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped) } } +static void raw_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The filename does not have to be prefixed by the protocol name, since + * "file" is the default protocol; therefore, the return value of this + * function call can be ignored. */ + strstart(filename, "file:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + static QemuOptsList raw_runtime_opts = { .name = "raw", .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), @@ -234,7 +275,8 @@ static QemuOptsList raw_runtime_opts = { }, }; -static int raw_open(BlockDriverState *bs, QDict *options, int flags) +static int raw_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; int access_flags; @@ -246,11 +288,10 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags) s->type = FTYPE_FILE; - opts = qemu_opts_create_nofail(&raw_runtime_opts); + opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } @@ -262,11 +303,23 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags) if ((flags & BDRV_O_NATIVE_AIO) && aio == NULL) { aio = win32_aio_init(); if (aio == NULL) { + error_setg(errp, "Could not initialize AIO"); ret = -EINVAL; goto fail; } } + if (filename[0] && filename[1] == ':') { + snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", filename[0]); + } else if (filename[0] == '\\' && filename[1] == '\\') { + s->drive_path[0] = 0; + } else { + /* Relative path.
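raw_probe_alignment() above tries the drive geometry ioctl first and, failing that, asks GetDiskFreeSpace() about the volume root remembered in s->drive_path. A standalone Windows sketch of that fallback (assumes an ANSI build, where GetDiskFreeSpace() resolves to GetDiskFreeSpaceA(); the "C:\\" root is a placeholder):

    #include <stdio.h>
    #include <windows.h>

    int main(void)
    {
        DWORD sectorsPerCluster, bytesPerSector, freeClusters, totalClusters;

        /* The first argument must name the volume root, which is what
         * raw_open() stores in s->drive_path ("C:\\" is a placeholder). */
        if (GetDiskFreeSpace("C:\\", &sectorsPerCluster, &bytesPerSector,
                             &freeClusters, &totalClusters)) {
            /* The value raw_probe_alignment() would feed into
             * bs->request_alignment when the geometry ioctl is unavailable. */
            printf("bytes per sector: %lu\n", (unsigned long)bytesPerSector);
        } else {
            fprintf(stderr, "GetDiskFreeSpace failed: %lu\n",
                    (unsigned long)GetLastError());
        }
        return 0;
    }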
*/ + char buf[MAX_PATH]; + GetCurrentDirectory(MAX_PATH, buf); + snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", buf[0]); + } + s->hfile = CreateFile(filename, access_flags, FILE_SHARE_READ, NULL, OPEN_EXISTING, overlapped, NULL); @@ -285,11 +338,13 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags) ret = win32_aio_attach(aio, s->hfile); if (ret < 0) { CloseHandle(s->hfile); + error_setg_errno(errp, -ret, "Could not enable AIO"); goto fail; } s->aio = aio; } + raw_probe_alignment(bs); ret = 0; fail: qemu_opts_del(opts); @@ -420,11 +475,14 @@ static int64_t raw_get_allocated_file_size(BlockDriverState *bs) return st.st_size; } -static int raw_create(const char *filename, QEMUOptionParameter *options) +static int raw_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int fd; int64_t total_size = 0; + strstart(filename, "file:", &filename); + /* Read out options */ while (options && options->name) { if (!strcmp(options->name, BLOCK_OPT_SIZE)) { @@ -435,8 +493,10 @@ static int raw_create(const char *filename, QEMUOptionParameter *options) fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); - if (fd < 0) + if (fd < 0) { + error_setg_errno(errp, errno, "Could not create file"); return -EIO; + } set_sparse(fd); ftruncate(fd, total_size * 512); qemu_close(fd); @@ -456,6 +516,8 @@ static BlockDriver bdrv_file = { .format_name = "file", .protocol_name = "file", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, + .bdrv_parse_filename = raw_parse_filename, .bdrv_file_open = raw_open, .bdrv_close = raw_close, .bdrv_create = raw_create, @@ -531,17 +593,44 @@ static int hdev_probe_device(const char *filename) return 0; } -static int hdev_open(BlockDriverState *bs, QDict *options, int flags) +static void hdev_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The prefix is optional, just as for "file". 
*/ + strstart(filename, "host_device:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + +static int hdev_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; int access_flags, create_flags; + int ret = 0; DWORD overlapped; char device_name[64]; - const char *filename = qdict_get_str(options, "filename"); + + Error *local_err = NULL; + const char *filename; + + QemuOpts *opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, + &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto done; + } + + filename = qemu_opt_get(opts, "filename"); if (strstart(filename, "/dev/cdrom", NULL)) { - if (find_cdrom(device_name, sizeof(device_name)) < 0) - return -ENOENT; + if (find_cdrom(device_name, sizeof(device_name)) < 0) { + error_setg(errp, "Could not open CD-ROM drive"); + ret = -ENOENT; + goto done; + } filename = device_name; } else { /* transform drive letters into device name */ @@ -564,17 +653,26 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags) if (s->hfile == INVALID_HANDLE_VALUE) { int err = GetLastError(); - if (err == ERROR_ACCESS_DENIED) - return -EACCES; - return -1; + if (err == ERROR_ACCESS_DENIED) { + ret = -EACCES; + } else { + ret = -EINVAL; + } + error_setg_errno(errp, -ret, "Could not open device"); + goto done; } - return 0; + +done: + qemu_opts_del(opts); + return ret; } static BlockDriver bdrv_host_device = { .format_name = "host_device", .protocol_name = "host_device", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, + .bdrv_parse_filename = hdev_parse_filename, .bdrv_probe_device = hdev_probe_device, .bdrv_file_open = hdev_open, .bdrv_close = raw_close, @@ -583,7 +681,9 @@ static BlockDriver bdrv_host_device = { .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, - .bdrv_getlength = raw_getlength, + .bdrv_getlength = raw_getlength, + .has_variable_length = true, + .bdrv_get_allocated_file_size = raw_get_allocated_file_size, }; diff --git a/block/raw.c b/block/raw_bsd.c index 47518253f..01ea692a4 100644 --- a/block/raw.c +++ b/block/raw_bsd.c @@ -1,13 +1,17 @@ -/* - * Block driver for RAW format +/* BlockDriver implementation for "raw" * - * Copyright (c) 2006 Fabrice Bellard + * Copyright (C) 2010, 2013, Red Hat, Inc. 
+ * Copyright (C) 2010, Blue Swirl <blauwirbel@gmail.com> + * Copyright (C) 2009, Anthony Liguori <aliguori@us.ibm.com> + * + * Author: + * Laszlo Ersek <lersek@redhat.com> * * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in @@ -15,27 +19,27 @@ * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
*/ -#include "qemu-common.h" #include "block/block_int.h" -#include "qemu/module.h" +#include "qemu/option.h" -static int raw_open(BlockDriverState *bs, QDict *options, int flags) -{ - bs->sg = bs->file->sg; - return 0; -} +static QEMUOptionParameter raw_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { 0 } +}; -/* We have nothing to do for raw reopen, stubs just return - * success */ -static int raw_reopen_prepare(BDRVReopenState *state, - BlockReopenQueue *queue, Error **errp) +static int raw_reopen_prepare(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error **errp) { return 0; } @@ -54,22 +58,26 @@ static int coroutine_fn raw_co_writev(BlockDriverState *bs, int64_t sector_num, return bdrv_co_writev(bs->file, sector_num, nb_sectors, qiov); } -static void raw_close(BlockDriverState *bs) -{ -} - -static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { - return bdrv_co_is_allocated(bs->file, sector_num, nb_sectors, pnum); + *pnum = nb_sectors; + return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA | + (sector_num << BDRV_SECTOR_BITS); } static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors) + int64_t sector_num, int nb_sectors, + BdrvRequestFlags flags) { - return bdrv_co_write_zeroes(bs->file, sector_num, nb_sectors); + return bdrv_co_write_zeroes(bs->file, sector_num, nb_sectors, flags); +} + +static int coroutine_fn raw_co_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + return bdrv_co_discard(bs->file, sector_num, nb_sectors); } static int64_t raw_getlength(BlockDriverState *bs) @@ -77,20 +85,20 @@ static int64_t raw_getlength(BlockDriverState *bs) return bdrv_getlength(bs->file); } -static int raw_truncate(BlockDriverState *bs, int64_t offset) +static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { - return bdrv_truncate(bs->file, offset); + return bdrv_get_info(bs->file, bdi); } -static int raw_probe(const uint8_t *buf, int buf_size, const char *filename) +static int raw_refresh_limits(BlockDriverState *bs) { - return 1; /* everything can be opened as raw image */ + bs->bl = bs->file->bl; + return 0; } -static int coroutine_fn raw_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) +static int raw_truncate(BlockDriverState *bs, int64_t offset) { - return bdrv_co_discard(bs->file, sector_num, nb_sectors); + return bdrv_truncate(bs->file, offset); } static int raw_is_inserted(BlockDriverState *bs) @@ -115,73 +123,79 @@ static void raw_lock_medium(BlockDriverState *bs, bool locked) static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) { - return bdrv_ioctl(bs->file, req, buf); + return bdrv_ioctl(bs->file, req, buf); } static BlockDriverAIOCB *raw_aio_ioctl(BlockDriverState *bs, - unsigned long int req, void *buf, - BlockDriverCompletionFunc *cb, void *opaque) -{ - return bdrv_aio_ioctl(bs->file, req, buf, cb, opaque); -} - -static int raw_create(const char *filename, QEMUOptionParameter *options) + unsigned long int req, void *buf, + BlockDriverCompletionFunc *cb, + void *opaque) { - return bdrv_create_file(filename, options); + return bdrv_aio_ioctl(bs->file, req, buf, cb, opaque); } -static QEMUOptionParameter raw_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { NULL } -}; - 
static int raw_has_zero_init(BlockDriverState *bs) { return bdrv_has_zero_init(bs->file); } -static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +static int raw_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { - return bdrv_get_info(bs->file, bdi); -} - -static BlockDriver bdrv_raw = { - .format_name = "raw", - - /* It's really 0, but we need to make g_malloc() happy */ - .instance_size = 1, - - .bdrv_open = raw_open, - .bdrv_close = raw_close, - - .bdrv_reopen_prepare = raw_reopen_prepare, + Error *local_err = NULL; + int ret; - .bdrv_co_readv = raw_co_readv, - .bdrv_co_writev = raw_co_writev, - .bdrv_co_is_allocated = raw_co_is_allocated, - .bdrv_co_write_zeroes = raw_co_write_zeroes, - .bdrv_co_discard = raw_co_discard, + ret = bdrv_create_file(filename, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + } + return ret; +} - .bdrv_probe = raw_probe, - .bdrv_getlength = raw_getlength, - .bdrv_get_info = raw_get_info, - .bdrv_truncate = raw_truncate, +static int raw_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + bs->sg = bs->file->sg; + return 0; +} - .bdrv_is_inserted = raw_is_inserted, - .bdrv_media_changed = raw_media_changed, - .bdrv_eject = raw_eject, - .bdrv_lock_medium = raw_lock_medium, +static void raw_close(BlockDriverState *bs) +{ +} - .bdrv_ioctl = raw_ioctl, - .bdrv_aio_ioctl = raw_aio_ioctl, +static int raw_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + /* smallest possible positive score so that raw is used if and only if no + * other block driver works + */ + return 1; +} - .bdrv_create = raw_create, - .create_options = raw_create_options, - .bdrv_has_zero_init = raw_has_zero_init, +static BlockDriver bdrv_raw = { + .format_name = "raw", + .bdrv_probe = &raw_probe, + .bdrv_reopen_prepare = &raw_reopen_prepare, + .bdrv_open = &raw_open, + .bdrv_close = &raw_close, + .bdrv_create = &raw_create, + .bdrv_co_readv = &raw_co_readv, + .bdrv_co_writev = &raw_co_writev, + .bdrv_co_write_zeroes = &raw_co_write_zeroes, + .bdrv_co_discard = &raw_co_discard, + .bdrv_co_get_block_status = &raw_co_get_block_status, + .bdrv_truncate = &raw_truncate, + .bdrv_getlength = &raw_getlength, + .has_variable_length = true, + .bdrv_get_info = &raw_get_info, + .bdrv_refresh_limits = &raw_refresh_limits, + .bdrv_is_inserted = &raw_is_inserted, + .bdrv_media_changed = &raw_media_changed, + .bdrv_eject = &raw_eject, + .bdrv_lock_medium = &raw_lock_medium, + .bdrv_ioctl = &raw_ioctl, + .bdrv_aio_ioctl = &raw_aio_ioctl, + .create_options = &raw_create_options[0], + .bdrv_has_zero_init = &raw_has_zero_init }; static void bdrv_raw_init(void) diff --git a/block/rbd.c b/block/rbd.c index cb7175121..dbc79f452 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -95,19 +95,13 @@ typedef struct RADOSCB { #define RBD_FD_WRITE 1 typedef struct BDRVRBDState { - int fds[2]; rados_t cluster; rados_ioctx_t io_ctx; rbd_image_t image; char name[RBD_MAX_IMAGE_NAME_SIZE]; - int qemu_aio_count; char *snap; - int event_reader_pos; - RADOSCB *event_rcb; } BDRVRBDState; -static void rbd_aio_bh_cb(void *opaque); - static int qemu_rbd_next_tok(char *dst, int dst_len, char *src, char delim, const char *name, @@ -288,7 +282,8 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf) return ret; } -static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options) +static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int64_t bytes = 0; int64_t objsize; 
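The rbd hunks that follow retire the driver's self-pipe: a completion arriving on a librados thread no longer writes an rcb pointer into a pipe drained by qemu_rbd_aio_event_reader(); rbd_finish_aiocb() now only schedules a bottom half, and the real work runs in rbd_finish_bh() on the QEMU side. A standalone model of that handoff, with a toy mutex-protected BH queue standing in for qemu_bh_new()/qemu_bh_schedule() (build with -lpthread):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Toy bottom half: foreign threads may only enqueue; the callback runs
     * later in the "main loop", where shared state is safe to touch. */
    typedef struct BH {
        void (*cb)(void *opaque);
        void *opaque;
        struct BH *next;
    } BH;

    static BH *bh_list;
    static pthread_mutex_t bh_lock = PTHREAD_MUTEX_INITIALIZER;

    static void bh_schedule(void (*cb)(void *), void *opaque)
    {
        BH *bh = malloc(sizeof(*bh));
        bh->cb = cb;
        bh->opaque = opaque;
        pthread_mutex_lock(&bh_lock);
        bh->next = bh_list;
        bh_list = bh;
        pthread_mutex_unlock(&bh_lock);
    }

    static void completion_cb(void *opaque)
    {
        printf("completed in main loop: ret=%d\n", *(int *)opaque);
    }

    /* Like rbd_finish_aiocb(): record the result, schedule, return. */
    static void *librados_like_thread(void *arg)
    {
        *(int *)arg = 4096;                  /* pretend 4 KiB were written */
        bh_schedule(completion_cb, arg);
        return NULL;
    }

    int main(void)
    {
        int ret = 0;
        pthread_t t;
        pthread_create(&t, NULL, librados_like_thread, &ret);
        pthread_join(t, NULL);

        /* One main-loop iteration: drain the scheduled bottom halves. */
        pthread_mutex_lock(&bh_lock);
        while (bh_list) {
            BH *bh = bh_list;
            bh_list = bh->next;
            bh->cb(bh->opaque);
            free(bh);
        }
        pthread_mutex_unlock(&bh_lock);
        return 0;
    }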
@@ -369,9 +364,8 @@ static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options) } /* - * This aio completion is being called from qemu_rbd_aio_event_reader() - * and runs in qemu context. It schedules a bh, but just in case the aio - * was not cancelled before. + * This aio completion is being called from rbd_finish_bh() and runs in qemu + * BH context. */ static void qemu_rbd_complete_aio(RADOSCB *rcb) { @@ -401,44 +395,19 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb) acb->ret = r; } } - /* Note that acb->bh can be NULL in case where the aio was cancelled */ - acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb); - qemu_bh_schedule(acb->bh); - g_free(rcb); -} - -/* - * aio fd read handler. It runs in the qemu context and calls the - * completion handling of completed rados aio operations. - */ -static void qemu_rbd_aio_event_reader(void *opaque) -{ - BDRVRBDState *s = opaque; - - ssize_t ret; - do { - char *p = (char *)&s->event_rcb; - - /* now read the rcb pointer that was sent from a non qemu thread */ - ret = read(s->fds[RBD_FD_READ], p + s->event_reader_pos, - sizeof(s->event_rcb) - s->event_reader_pos); - if (ret > 0) { - s->event_reader_pos += ret; - if (s->event_reader_pos == sizeof(s->event_rcb)) { - s->event_reader_pos = 0; - qemu_rbd_complete_aio(s->event_rcb); - s->qemu_aio_count--; - } - } - } while (ret < 0 && errno == EINTR); -} + g_free(rcb); -static int qemu_rbd_aio_flush_cb(void *opaque) -{ - BDRVRBDState *s = opaque; + if (acb->cmd == RBD_AIO_READ) { + qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + } + qemu_vfree(acb->bounce); + acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); + acb->status = 0; - return (s->qemu_aio_count > 0); + if (!acb->cancelled) { + qemu_aio_release(acb); + } } /* TODO Convert to fine grained options */ @@ -455,7 +424,8 @@ static QemuOptsList runtime_opts = { }, }; -static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags) +static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRBDState *s = bs->opaque; char pool[RBD_MAX_POOL_NAME_SIZE]; @@ -468,9 +438,9 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags) const char *filename; int r; - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { qerror_report_err(local_err); error_free(local_err); qemu_opts_del(opts); @@ -545,23 +515,9 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags) bs->read_only = (s->snap != NULL); - s->event_reader_pos = 0; - r = qemu_pipe(s->fds); - if (r < 0) { - error_report("error opening eventfd"); - goto failed; - } - fcntl(s->fds[0], F_SETFL, O_NONBLOCK); - fcntl(s->fds[1], F_SETFL, O_NONBLOCK); - qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], qemu_rbd_aio_event_reader, - NULL, qemu_rbd_aio_flush_cb, s); - - qemu_opts_del(opts); return 0; -failed: - rbd_close(s->image); failed_open: rados_ioctx_destroy(s->io_ctx); failed_shutdown: @@ -576,10 +532,6 @@ static void qemu_rbd_close(BlockDriverState *bs) { BDRVRBDState *s = bs->opaque; - close(s->fds[0]); - close(s->fds[1]); - qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], NULL, NULL, NULL, NULL); - rbd_close(s->image); rados_ioctx_destroy(s->io_ctx); g_free(s->snap); @@ -607,34 +559,11 @@ static const AIOCBInfo rbd_aiocb_info = { .cancel = qemu_rbd_aio_cancel, }; -static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB 
*rcb) +static void rbd_finish_bh(void *opaque) { - int ret = 0; - while (1) { - fd_set wfd; - int fd = s->fds[RBD_FD_WRITE]; - - /* send the op pointer to the qemu thread that is responsible - for the aio/op completion. Must do it in a qemu thread context */ - ret = write(fd, (void *)&rcb, sizeof(rcb)); - if (ret >= 0) { - break; - } - if (errno == EINTR) { - continue; - } - if (errno != EAGAIN) { - break; - } - - FD_ZERO(&wfd); - FD_SET(fd, &wfd); - do { - ret = select(fd + 1, NULL, &wfd, NULL, NULL); - } while (ret < 0 && errno == EINTR); - } - - return ret; + RADOSCB *rcb = opaque; + qemu_bh_delete(rcb->acb->bh); + qemu_rbd_complete_aio(rcb); } /* @@ -642,40 +571,18 @@ static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB *rcb) * * Note: this function is being called from a non qemu thread so * we need to be careful about what we do here. Generally we only - * write to the block notification pipe, and do the rest of the - * io completion handling from qemu_rbd_aio_event_reader() which - * runs in a qemu context. + * schedule a BH, and do the rest of the io completion handling + * from rbd_finish_bh() which runs in a qemu context. */ static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb) { - int ret; + RBDAIOCB *acb = rcb->acb; + rcb->ret = rbd_aio_get_return_value(c); rbd_aio_release(c); - ret = qemu_rbd_send_pipe(rcb->s, rcb); - if (ret < 0) { - error_report("failed writing to acb->s->fds"); - g_free(rcb); - } -} - -/* Callback when all queued rbd_aio requests are complete */ - -static void rbd_aio_bh_cb(void *opaque) -{ - RBDAIOCB *acb = opaque; - if (acb->cmd == RBD_AIO_READ) { - qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); - } - qemu_vfree(acb->bounce); - acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); - qemu_bh_delete(acb->bh); - acb->bh = NULL; - acb->status = 0; - - if (!acb->cancelled) { - qemu_aio_release(acb); - } + acb->bh = qemu_bh_new(rbd_finish_bh, rcb); + qemu_bh_schedule(acb->bh); } static int rbd_aio_discard_wrapper(rbd_image_t image, @@ -741,8 +648,6 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs, off = sector_num * BDRV_SECTOR_SIZE; size = nb_sectors * BDRV_SECTOR_SIZE; - s->qemu_aio_count++; /* All the RADOSCB */ - rcb = g_malloc(sizeof(RADOSCB)); rcb->done = 0; rcb->acb = acb; @@ -779,7 +684,6 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs, failed: g_free(rcb); - s->qemu_aio_count--; qemu_aio_release(acb); return NULL; } @@ -903,12 +807,31 @@ static int qemu_rbd_snap_create(BlockDriverState *bs, } static int qemu_rbd_snap_remove(BlockDriverState *bs, - const char *snapshot_name) + const char *snapshot_id, + const char *snapshot_name, + Error **errp) { BDRVRBDState *s = bs->opaque; int r; + if (!snapshot_name) { + error_setg(errp, "rbd need a valid snapshot name"); + return -EINVAL; + } + + /* If snapshot_id is specified, it must be equal to name, see + qemu_rbd_snap_list() */ + if (snapshot_id && strcmp(snapshot_id, snapshot_name)) { + error_setg(errp, + "rbd do not support snapshot id, it should be NULL or " + "equal to snapshot name"); + return -EINVAL; + } + r = rbd_snap_remove(s->image, snapshot_name); + if (r < 0) { + error_setg_errno(errp, -r, "Failed to remove the snapshot"); + } return r; } @@ -934,7 +857,7 @@ static int qemu_rbd_snap_list(BlockDriverState *bs, do { snaps = g_malloc(sizeof(*snaps) * max_snaps); snap_count = rbd_snap_list(s->image, snaps, &max_snaps); - if (snap_count < 0) { + if (snap_count <= 0) { g_free(snaps); } } while (snap_count == -ERANGE); @@ -958,6 
+881,7 @@ static int qemu_rbd_snap_list(BlockDriverState *bs, sn_info->vm_clock_nsec = 0; } rbd_snap_list_end(snaps); + g_free(snaps); done: *psn_tab = sn_tab; @@ -993,6 +917,7 @@ static QEMUOptionParameter qemu_rbd_create_options[] = { static BlockDriver bdrv_rbd = { .format_name = "rbd", .instance_size = sizeof(BDRVRBDState), + .bdrv_needs_filename = true, .bdrv_file_open = qemu_rbd_open, .bdrv_close = qemu_rbd_close, .bdrv_create = qemu_rbd_create, diff --git a/block/sheepdog.c b/block/sheepdog.c index afe053376..0eb33ee80 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -91,6 +91,14 @@ #define SD_NR_VDIS (1U << 24) #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22) #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS) +/* + * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and + * (SD_EC_MAX_STRIP - 1) for parity strips + * + * SD_MAX_COPIES is sum of number of data strips and parity strips. + */ +#define SD_EC_MAX_STRIP 16 +#define SD_MAX_COPIES (SD_EC_MAX_STRIP * 2 - 1) #define SD_INODE_SIZE (sizeof(SheepdogInode)) #define CURRENT_VDI_ID 0 @@ -125,8 +133,9 @@ typedef struct SheepdogObjReq { uint32_t data_length; uint64_t oid; uint64_t cow_oid; - uint32_t copies; - uint32_t rsvd; + uint8_t copies; + uint8_t copy_policy; + uint8_t reserved[6]; uint64_t offset; } SheepdogObjReq; @@ -138,7 +147,9 @@ typedef struct SheepdogObjRsp { uint32_t id; uint32_t data_length; uint32_t result; - uint32_t copies; + uint8_t copies; + uint8_t copy_policy; + uint8_t reserved[2]; uint32_t pad[6]; } SheepdogObjRsp; @@ -150,8 +161,10 @@ typedef struct SheepdogVdiReq { uint32_t id; uint32_t data_length; uint64_t vdi_size; - uint32_t vdi_id; - uint32_t copies; + uint32_t base_vdi_id; + uint8_t copies; + uint8_t copy_policy; + uint8_t reserved[2]; uint32_t snapid; uint32_t pad[3]; } SheepdogVdiReq; @@ -222,6 +235,11 @@ static inline uint64_t data_oid_to_idx(uint64_t oid) return oid & (MAX_DATA_OBJS - 1); } +static inline uint32_t oid_to_vid(uint64_t oid) +{ + return (oid & ~VDI_BIT) >> VDI_SPACE_SHIFT; +} + static inline uint64_t vid_to_vdi_oid(uint32_t vid) { return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT); @@ -289,11 +307,14 @@ struct SheepdogAIOCB { Coroutine *coroutine; void (*aio_done_func)(SheepdogAIOCB *); - bool canceled; + bool cancelable; + bool *finished; int nr_pending; }; typedef struct BDRVSheepdogState { + BlockDriverState *bs; + SheepdogInode inode; uint32_t min_dirty_data_idx; @@ -313,8 +334,11 @@ typedef struct BDRVSheepdogState { Coroutine *co_recv; uint32_t aioreq_seq_num; + + /* Every aio request must be linked to either of these queues. 
*/ QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head; QLIST_HEAD(pending_aio_head, AIOReq) pending_aio_head; + QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head; } BDRVSheepdogState; static const char * sd_strerror(int err) @@ -403,6 +427,7 @@ static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req) { SheepdogAIOCB *acb = aio_req->aiocb; + acb->cancelable = false; QLIST_REMOVE(aio_req, aio_siblings); g_free(aio_req); @@ -411,23 +436,68 @@ static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req) static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb) { - if (!acb->canceled) { - qemu_coroutine_enter(acb->coroutine, NULL); + qemu_coroutine_enter(acb->coroutine, NULL); + if (acb->finished) { + *acb->finished = true; } qemu_aio_release(acb); } +/* + * Check whether the specified acb can be canceled + * + * We can cancel aio when any request belonging to the acb is: + * - Not processed by the sheepdog server. + * - Not linked to the inflight queue. + */ +static bool sd_acb_cancelable(const SheepdogAIOCB *acb) +{ + BDRVSheepdogState *s = acb->common.bs->opaque; + AIOReq *aioreq; + + if (!acb->cancelable) { + return false; + } + + QLIST_FOREACH(aioreq, &s->inflight_aio_head, aio_siblings) { + if (aioreq->aiocb == acb) { + return false; + } + } + + return true; +} + static void sd_aio_cancel(BlockDriverAIOCB *blockacb) { SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb; + BDRVSheepdogState *s = acb->common.bs->opaque; + AIOReq *aioreq, *next; + bool finished = false; + + acb->finished = &finished; + while (!finished) { + if (sd_acb_cancelable(acb)) { + /* Remove outstanding requests from pending and failed queues. */ + QLIST_FOREACH_SAFE(aioreq, &s->pending_aio_head, aio_siblings, + next) { + if (aioreq->aiocb == acb) { + free_aio_req(s, aioreq); + } + } + QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings, + next) { + if (aioreq->aiocb == acb) { + free_aio_req(s, aioreq); + } + } - /* - * Sheepdog cannot cancel the requests which are already sent to - * the servers, so we just complete the request with -EIO here. - */ - acb->ret = -EIO; - qemu_coroutine_enter(acb->coroutine, NULL); - acb->canceled = true; + assert(acb->nr_pending == 0); + sd_finish_aiocb(acb); + return; + } + qemu_aio_wait(); + } } static const AIOCBInfo sd_aiocb_info = { @@ -448,7 +518,8 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, acb->nb_sectors = nb_sectors; acb->aio_done_func = NULL; - acb->canceled = false; + acb->cancelable = true; + acb->finished = NULL; acb->coroutine = qemu_coroutine_self(); acb->ret = 0; acb->nr_pending = 0; @@ -489,13 +560,13 @@ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data, int ret; ret = qemu_co_send(sockfd, hdr, sizeof(*hdr)); - if (ret < sizeof(*hdr)) { + if (ret != sizeof(*hdr)) { error_report("failed to send a req, %s", strerror(errno)); return ret; } ret = qemu_co_send(sockfd, data, *wlen); - if (ret < *wlen) { + if (ret != *wlen) { error_report("failed to send a req, %s", strerror(errno)); } @@ -509,13 +580,6 @@ static void restart_co_req(void *opaque) qemu_coroutine_enter(co, NULL); } -static int have_co_req(void *opaque) -{ - /* this handler is set only when there is a pending request, so - * always returns 1. 
*/ - return 1; -} - typedef struct SheepdogReqCo { int sockfd; SheepdogReq *hdr; @@ -538,17 +602,17 @@ static coroutine_fn void do_co_req(void *opaque) unsigned int *rlen = srco->rlen; co = qemu_coroutine_self(); - qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, have_co_req, co); + qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, co); ret = send_co_req(sockfd, hdr, data, wlen); if (ret < 0) { goto out; } - qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, have_co_req, co); + qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, co); ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr)); - if (ret < sizeof(*hdr)) { + if (ret != sizeof(*hdr)) { error_report("failed to get a rsp, %s", strerror(errno)); ret = -errno; goto out; @@ -560,7 +624,7 @@ static coroutine_fn void do_co_req(void *opaque) if (*rlen) { ret = qemu_co_recv(sockfd, data, *rlen); - if (ret < *rlen) { + if (ret != *rlen) { error_report("failed to get the data, %s", strerror(errno)); ret = -errno; goto out; @@ -570,7 +634,7 @@ static coroutine_fn void do_co_req(void *opaque) out: /* there is at most one request for this sockfd, so it is safe to * set each handler to NULL. */ - qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL, NULL); + qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL); srco->ret = ret; srco->finished = true; @@ -603,11 +667,13 @@ static int do_req(int sockfd, SheepdogReq *hdr, void *data, return srco.ret; } -static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, +static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, struct iovec *iov, int niov, bool create, enum AIOCBState aiocb_type); -static int coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req); - +static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req); +static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag); +static int get_sheep_fd(BDRVSheepdogState *s); +static void co_write_request(void *opaque); static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid) { @@ -630,22 +696,59 @@ static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid) { AIOReq *aio_req; SheepdogAIOCB *acb; - int ret; while ((aio_req = find_pending_req(s, oid)) != NULL) { acb = aio_req->aiocb; /* move aio_req from pending list to inflight one */ QLIST_REMOVE(aio_req, aio_siblings); QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - ret = add_aio_request(s, aio_req, acb->qiov->iov, - acb->qiov->niov, false, acb->aiocb_type); - if (ret < 0) { - error_report("add_aio_request is failed"); - free_aio_req(s, aio_req); - if (!acb->nr_pending) { - sd_finish_aiocb(acb); - } + add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, false, + acb->aiocb_type); + } +} + +static coroutine_fn void reconnect_to_sdog(void *opaque) +{ + BDRVSheepdogState *s = opaque; + AIOReq *aio_req, *next; + + qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL); + close(s->fd); + s->fd = -1; + + /* Wait for outstanding write requests to be completed. */ + while (s->co_send != NULL) { + co_write_request(opaque); + } + + /* Try to reconnect to the sheepdog server every second. */ + while (s->fd < 0) { + s->fd = get_sheep_fd(s); + if (s->fd < 0) { + DPRINTF("Wait for connection to be established\n"); + co_aio_sleep_ns(bdrv_get_aio_context(s->bs), QEMU_CLOCK_REALTIME, + 1000000000ULL); } + }; + + /* + * Now we have to resend all the requests in the inflight queue.
However, + * resend_aioreq() can yield and newly created requests can be added to the + * inflight queue before the coroutine is resumed. To avoid mixing them, we + * have to move all the inflight requests to the failed queue before + * resend_aioreq() is called. + */ + QLIST_FOREACH_SAFE(aio_req, &s->inflight_aio_head, aio_siblings, next) { + QLIST_REMOVE(aio_req, aio_siblings); + QLIST_INSERT_HEAD(&s->failed_aio_head, aio_req, aio_siblings); + } + + /* Resend all the failed aio requests. */ + while (!QLIST_EMPTY(&s->failed_aio_head)) { + aio_req = QLIST_FIRST(&s->failed_aio_head); + QLIST_REMOVE(aio_req, aio_siblings); + QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); + resend_aioreq(s, aio_req); } } @@ -665,15 +768,11 @@ static void coroutine_fn aio_read_response(void *opaque) SheepdogAIOCB *acb; uint64_t idx; - if (QLIST_EMPTY(&s->inflight_aio_head)) { - goto out; - } - /* read a header */ ret = qemu_co_recv(fd, &rsp, sizeof(rsp)); - if (ret < 0) { + if (ret != sizeof(rsp)) { error_report("failed to get the header, %s", strerror(errno)); - goto out; + goto err; } /* find the right aio_req from the inflight aio list */ @@ -684,7 +783,7 @@ static void coroutine_fn aio_read_response(void *opaque) } if (!aio_req) { error_report("cannot find aio_req %x", rsp.id); - goto out; + goto err; } acb = aio_req->aiocb; @@ -722,9 +821,9 @@ static void coroutine_fn aio_read_response(void *opaque) case AIOCB_READ_UDATA: ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov, aio_req->iov_offset, rsp.data_length); - if (ret < 0) { + if (ret != rsp.data_length) { error_report("failed to get the data, %s", strerror(errno)); - goto out; + goto err; } break; case AIOCB_FLUSH_CACHE: @@ -755,11 +854,20 @@ static void coroutine_fn aio_read_response(void *opaque) case SD_RES_SUCCESS: break; case SD_RES_READONLY: - ret = resend_aioreq(s, aio_req); - if (ret == SD_RES_SUCCESS) { - goto out; + if (s->inode.vdi_id == oid_to_vid(aio_req->oid)) { + ret = reload_inode(s, 0, ""); + if (ret < 0) { + goto err; + } } - /* fall through */ + if (is_data_obj(aio_req->oid)) { + aio_req->oid = vid_to_data_oid(s->inode.vdi_id, + data_oid_to_idx(aio_req->oid)); + } else { + aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id); + } + resend_aioreq(s, aio_req); + goto out; default: acb->ret = -EIO; error_report("%s", sd_strerror(rsp.result)); @@ -776,6 +884,10 @@ static void coroutine_fn aio_read_response(void *opaque) } out: s->co_recv = NULL; + return; +err: + s->co_recv = NULL; + reconnect_to_sdog(opaque); } static void co_read_response(void *opaque) @@ -796,18 +908,10 @@ static void co_write_request(void *opaque) qemu_coroutine_enter(s->co_send, NULL); } -static int aio_flush_request(void *opaque) -{ - BDRVSheepdogState *s = opaque; - - return !QLIST_EMPTY(&s->inflight_aio_head) || - !QLIST_EMPTY(&s->pending_aio_head); -} - /* - * Return a socket discriptor to read/write objects. + * Return a socket descriptor to read/write objects. * - * We cannot use this discriptor for other operations because + * We cannot use this descriptor for other operations because * the block driver may be on waiting response from the server. 
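reconnect_to_sdog() above retries once per second via co_aio_sleep_ns() until get_sheep_fd() succeeds, then replays the outstanding requests through the failed queue. The retry loop reduced to a plain POSIX sketch (try_connect() is hypothetical, and nanosleep() stands in for the coroutine sleep):

    #include <stdio.h>
    #include <time.h>

    /* Hypothetical connect attempt: fd >= 0 on success, -1 on failure. */
    static int try_connect(void)
    {
        static int attempts;
        return ++attempts < 3 ? -1 : 42;
    }

    int main(void)
    {
        int fd = -1;

        /* Same shape as reconnect_to_sdog(): retry at a fixed one-second
         * interval until the connection is re-established. */
        while (fd < 0) {
            fd = try_connect();
            if (fd < 0) {
                struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
                nanosleep(&ts, NULL);   /* stands in for co_aio_sleep_ns() */
            }
        }
        printf("reconnected, fd=%d\n", fd);
        return 0;
    }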
*/ static int get_sheep_fd(BDRVSheepdogState *s) @@ -819,7 +923,7 @@ static int get_sheep_fd(BDRVSheepdogState *s) return fd; } - qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request, s); + qemu_aio_set_fd_handler(fd, co_read_response, NULL, s); return fd; } @@ -1012,7 +1116,7 @@ out: return ret; } -static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, +static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, struct iovec *iov, int niov, bool create, enum AIOCBState aiocb_type) { @@ -1069,36 +1173,30 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, qemu_co_mutex_lock(&s->lock); s->co_send = qemu_coroutine_self(); - qemu_aio_set_fd_handler(s->fd, co_read_response, co_write_request, - aio_flush_request, s); + qemu_aio_set_fd_handler(s->fd, co_read_response, co_write_request, s); socket_set_cork(s->fd, 1); /* send a header */ ret = qemu_co_send(s->fd, &hdr, sizeof(hdr)); - if (ret < 0) { - qemu_co_mutex_unlock(&s->lock); + if (ret != sizeof(hdr)) { error_report("failed to send a req, %s", strerror(errno)); - return -errno; + goto out; } if (wlen) { ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset, wlen); - if (ret < 0) { - qemu_co_mutex_unlock(&s->lock); + if (ret != wlen) { error_report("failed to send a data, %s", strerror(errno)); - return -errno; } } - +out: socket_set_cork(s->fd, 0); - qemu_aio_set_fd_handler(s->fd, co_read_response, NULL, - aio_flush_request, s); + qemu_aio_set_fd_handler(s->fd, co_read_response, NULL, s); + s->co_send = NULL; qemu_co_mutex_unlock(&s->lock); - - return 0; } -static int read_write_object(int fd, char *buf, uint64_t oid, int copies, +static int read_write_object(int fd, char *buf, uint64_t oid, uint8_t copies, unsigned int datalen, uint64_t offset, bool write, bool create, uint32_t cache_flags) { @@ -1146,7 +1244,7 @@ static int read_write_object(int fd, char *buf, uint64_t oid, int copies, } } -static int read_object(int fd, char *buf, uint64_t oid, int copies, +static int read_object(int fd, char *buf, uint64_t oid, uint8_t copies, unsigned int datalen, uint64_t offset, uint32_t cache_flags) { @@ -1154,7 +1252,7 @@ static int read_object(int fd, char *buf, uint64_t oid, int copies, false, cache_flags); } -static int write_object(int fd, char *buf, uint64_t oid, int copies, +static int write_object(int fd, char *buf, uint64_t oid, uint8_t copies, unsigned int datalen, uint64_t offset, bool create, uint32_t cache_flags) { @@ -1198,51 +1296,62 @@ out: return ret; } -static int coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req) +/* Return true if the specified request is linked to the pending list. */ +static bool check_simultaneous_create(BDRVSheepdogState *s, AIOReq *aio_req) { - SheepdogAIOCB *acb = aio_req->aiocb; - bool create = false; - int ret; - - ret = reload_inode(s, 0, ""); - if (ret < 0) { - return ret; + AIOReq *areq; + QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) { + if (areq != aio_req && areq->oid == aio_req->oid) { + /* + * Sheepdog cannot handle simultaneous create requests to the same + * object, so we cannot send the request until the previous request + * finishes. 
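The comment continuing below belongs to check_simultaneous_create(): a create aimed at an object that already has a request in flight is parked on the pending queue, because Sheepdog cannot handle two simultaneous creates of one object. A toy standalone model of that dedup-by-oid test (a fixed array instead of the QLIST):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy model of the check: a request must be deferred while another
     * request to the same object ID is still in flight. */
    #define INFLIGHT_MAX 4
    static uint64_t inflight[INFLIGHT_MAX];
    static int n_inflight;

    static bool must_defer(uint64_t oid)
    {
        for (int i = 0; i < n_inflight; i++) {
            if (inflight[i] == oid) {
                return true;    /* park it on the pending queue */
            }
        }
        return false;
    }

    int main(void)
    {
        inflight[n_inflight++] = 0x8000000100000001ULL;
        printf("%d\n", must_defer(0x8000000100000001ULL));  /* prints 1 */
        printf("%d\n", must_defer(0x8000000100000002ULL));  /* prints 0 */
        return 0;
    }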
+ */ + DPRINTF("simultaneous create to %" PRIx64 "\n", aio_req->oid); + aio_req->flags = 0; + aio_req->base_oid = 0; + QLIST_REMOVE(aio_req, aio_siblings); + QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings); + return true; + } } - aio_req->oid = vid_to_data_oid(s->inode.vdi_id, - data_oid_to_idx(aio_req->oid)); + return false; +} + +static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req) +{ + SheepdogAIOCB *acb = aio_req->aiocb; + bool create = false; /* check whether this request becomes a CoW one */ - if (acb->aiocb_type == AIOCB_WRITE_UDATA) { + if (acb->aiocb_type == AIOCB_WRITE_UDATA && is_data_obj(aio_req->oid)) { int idx = data_oid_to_idx(aio_req->oid); - AIOReq *areq; - if (s->inode.data_vdi_id[idx] == 0) { - create = true; - goto out; - } if (is_data_obj_writable(&s->inode, idx)) { goto out; } - /* link to the pending list if there is another CoW request to - * the same object */ - QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) { - if (areq != aio_req && areq->oid == aio_req->oid) { - DPRINTF("simultaneous CoW to %" PRIx64 "\n", aio_req->oid); - QLIST_REMOVE(aio_req, aio_siblings); - QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings); - return SD_RES_SUCCESS; - } + if (check_simultaneous_create(s, aio_req)) { + return; } - aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx); - aio_req->flags |= SD_FLAG_CMD_COW; + if (s->inode.data_vdi_id[idx]) { + aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx); + aio_req->flags |= SD_FLAG_CMD_COW; + } create = true; } out: - return add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, - create, acb->aiocb_type); + if (is_data_obj(aio_req->oid)) { + add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, create, + acb->aiocb_type); + } else { + struct iovec iov; + iov.iov_base = &s->inode; + iov.iov_len = sizeof(s->inode); + add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA); + } } /* TODO Convert to fine grained options */ @@ -1259,7 +1368,8 @@ static QemuOptsList runtime_opts = { }, }; -static int sd_open(BlockDriverState *bs, QDict *options, int flags) +static int sd_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { int ret, fd; uint32_t vid = 0; @@ -1271,9 +1381,11 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags) Error *local_err = NULL; const char *filename; - opts = qemu_opts_create_nofail(&runtime_opts); + s->bs = bs; + + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { qerror_report_err(local_err); error_free(local_err); ret = -EINVAL; @@ -1284,6 +1396,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags) QLIST_INIT(&s->inflight_aio_head); QLIST_INIT(&s->pending_aio_head); + QLIST_INIT(&s->failed_aio_head); s->fd = -1; memset(vdi, 0, sizeof(vdi)); @@ -1350,7 +1463,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags) g_free(buf); return 0; out: - qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL); + qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL); if (s->fd >= 0) { closesocket(s->fd); } @@ -1359,8 +1472,7 @@ out: return ret; } -static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size, - uint32_t base_vid, uint32_t *vdi_id, int snapshot) +static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot) { SheepdogVdiReq hdr; SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; @@ -1377,11 +1489,11 
@@ static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size, * does not fit in buf? For now, just truncate and avoid buffer overrun. */ memset(buf, 0, sizeof(buf)); - pstrcpy(buf, sizeof(buf), filename); + pstrcpy(buf, sizeof(buf), s->name); memset(&hdr, 0, sizeof(hdr)); hdr.opcode = SD_OP_NEW_VDI; - hdr.vdi_id = base_vid; + hdr.base_vdi_id = s->inode.vdi_id; wlen = SD_MAX_VDI_LEN; @@ -1389,7 +1501,9 @@ static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size, hdr.snapid = snapshot; hdr.data_length = wlen; - hdr.vdi_size = vdi_size; + hdr.vdi_size = s->inode.vdi_size; + hdr.copy_policy = s->inode.copy_policy; + hdr.copies = s->inode.nr_copies; ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen); @@ -1400,7 +1514,7 @@ static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size, } if (rsp->result != SD_RES_SUCCESS) { - error_report("%s, %s", sd_strerror(rsp->result), filename); + error_report("%s, %s", sd_strerror(rsp->result), s->inode.name); return -EIO; } @@ -1417,10 +1531,14 @@ static int sd_prealloc(const char *filename) uint32_t idx, max_idx; int64_t vdi_size; void *buf = g_malloc0(SD_DATA_OBJ_SIZE); + Error *local_err = NULL; int ret; - ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR); + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + NULL, &local_err); if (ret < 0) { + qerror_report_err(local_err); + error_free(local_err); goto out; } @@ -1447,32 +1565,86 @@ static int sd_prealloc(const char *filename) } out: if (bs) { - bdrv_delete(bs); + bdrv_unref(bs); } g_free(buf); return ret; } -static int sd_create(const char *filename, QEMUOptionParameter *options) +/* + * Sheepdog supports two kinds of redundancy, full replication and erasure + * coding. + * + * # create a fully replicated vdi with x copies + * -o redundancy=x (1 <= x <= SD_MAX_COPIES) + * + * # create an erasure coded vdi with x data strips and y parity strips + * -o redundancy=x:y (x must be one of {2,4,8,16} and 1 <= y < SD_EC_MAX_STRIP) + */ +static int parse_redundancy(BDRVSheepdogState *s, const char *opt) +{ + struct SheepdogInode *inode = &s->inode; + const char *n1, *n2; + long copy, parity; + char p[10]; + + pstrcpy(p, sizeof(p), opt); + n1 = strtok(p, ":"); + n2 = strtok(NULL, ":"); + + if (!n1) { + return -EINVAL; + } + + copy = strtol(n1, NULL, 10); + if (copy > SD_MAX_COPIES || copy < 1) { + return -EINVAL; + } + if (!n2) { + inode->copy_policy = 0; + inode->nr_copies = copy; + return 0; + } + + if (copy != 2 && copy != 4 && copy != 8 && copy != 16) { + return -EINVAL; + } + + parity = strtol(n2, NULL, 10); + if (parity >= SD_EC_MAX_STRIP || parity < 1) { + return -EINVAL; + } + + /* + * 4 bits for parity and 4 bits for data.
+ * We have to compress upper data bits because it can't represent 16 + */ + inode->copy_policy = ((copy / 2) << 4) + parity; + inode->nr_copies = copy + parity; + + return 0; +} + +static int sd_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int ret = 0; - uint32_t vid = 0, base_vid = 0; - int64_t vdi_size = 0; + uint32_t vid = 0; char *backing_file = NULL; BDRVSheepdogState *s; - char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN]; + char tag[SD_MAX_VDI_TAG_LEN]; uint32_t snapid; bool prealloc = false; + Error *local_err = NULL; s = g_malloc0(sizeof(BDRVSheepdogState)); - memset(vdi, 0, sizeof(vdi)); memset(tag, 0, sizeof(tag)); if (strstr(filename, "://")) { - ret = sd_parse_uri(s, filename, vdi, &snapid, tag); + ret = sd_parse_uri(s, filename, s->name, &snapid, tag); } else { - ret = parse_vdiname(s, filename, vdi, &snapid, tag); + ret = parse_vdiname(s, filename, s->name, &snapid, tag); } if (ret < 0) { goto out; @@ -1480,7 +1652,7 @@ static int sd_create(const char *filename, QEMUOptionParameter *options) while (options && options->name) { if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - vdi_size = options->value.n; + s->inode.vdi_size = options->value.n; } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { backing_file = options->value.s; } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) { @@ -1494,11 +1666,18 @@ static int sd_create(const char *filename, QEMUOptionParameter *options) ret = -EINVAL; goto out; } + } else if (!strcmp(options->name, BLOCK_OPT_REDUNDANCY)) { + if (options->value.s) { + ret = parse_redundancy(s, options->value.s); + if (ret < 0) { + goto out; + } + } } options++; } - if (vdi_size > SD_MAX_VDI_SIZE) { + if (s->inode.vdi_size > SD_MAX_VDI_SIZE) { error_report("too big image size"); ret = -EINVAL; goto out; @@ -1506,7 +1685,7 @@ static int sd_create(const char *filename, QEMUOptionParameter *options) if (backing_file) { BlockDriverState *bs; - BDRVSheepdogState *s; + BDRVSheepdogState *base; BlockDriver *drv; /* Currently, only Sheepdog backing image is supported. 
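That packing squeezes both strip counts into one byte: the data-strip count, always a power of two up to 16, is halved into the high nibble, and the parity count occupies the low nibble. A worked example for "-o redundancy=4:2":

    #include <stdio.h>

    int main(void)
    {
        /* "-o redundancy=4:2": 4 data strips, 2 parity strips. */
        long copy = 4, parity = 2;

        /* Same packing as parse_redundancy(): 4 bits of parity, and the
         * data strip count stored halved so that 16 still fits in 4 bits. */
        unsigned char copy_policy = ((copy / 2) << 4) + parity;
        unsigned char nr_copies = copy + parity;

        printf("copy_policy=0x%02x nr_copies=%d\n",
               (unsigned)copy_policy, (int)nr_copies);
        /* prints: copy_policy=0x22 nr_copies=6 */
        return 0;
    }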
*/ @@ -1517,25 +1696,28 @@ static int sd_create(const char *filename, QEMUOptionParameter *options) goto out; } - ret = bdrv_file_open(&bs, backing_file, NULL, 0); + bs = NULL; + ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_PROTOCOL, NULL, + &local_err); if (ret < 0) { + qerror_report_err(local_err); + error_free(local_err); goto out; } - s = bs->opaque; + base = bs->opaque; - if (!is_snapshot(&s->inode)) { + if (!is_snapshot(&base->inode)) { error_report("cannot clone from a non snapshot vdi"); - bdrv_delete(bs); + bdrv_unref(bs); ret = -EINVAL; goto out; } - - base_vid = s->inode.vdi_id; - bdrv_delete(bs); + s->inode.vdi_id = base->inode.vdi_id; + bdrv_unref(bs); } - ret = do_sd_create(s, vdi, vdi_size, base_vid, &vid, 0); + ret = do_sd_create(s, &vid, 0); if (!prealloc || ret) { goto out; } @@ -1564,7 +1746,7 @@ static void sd_close(BlockDriverState *bs) memset(&hdr, 0, sizeof(hdr)); hdr.opcode = SD_OP_RELEASE_VDI; - hdr.vdi_id = s->inode.vdi_id; + hdr.base_vdi_id = s->inode.vdi_id; wlen = strlen(s->name) + 1; hdr.data_length = wlen; hdr.flags = SD_FLAG_CMD_WRITE; @@ -1578,7 +1760,7 @@ static void sd_close(BlockDriverState *bs) error_report("%s, %s", sd_strerror(rsp->result), s->name); } - qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL); + qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL); closesocket(s->fd); g_free(s->host_spec); } @@ -1630,7 +1812,6 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset) */ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb) { - int ret; BDRVSheepdogState *s = acb->common.bs->opaque; struct iovec iov; AIOReq *aio_req; @@ -1652,18 +1833,13 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb) aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), data_len, offset, 0, 0, offset); QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - ret = add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA); - if (ret) { - free_aio_req(s, aio_req); - acb->ret = -EIO; - goto out; - } + add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA); acb->aio_done_func = sd_finish_aiocb; acb->aiocb_type = AIOCB_WRITE_UDATA; return; } -out: + sd_finish_aiocb(acb); } @@ -1673,7 +1849,7 @@ static bool sd_delete(BDRVSheepdogState *s) unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0; SheepdogVdiReq hdr = { .opcode = SD_OP_DEL_VDI, - .vdi_id = s->inode.vdi_id, + .base_vdi_id = s->inode.vdi_id, .data_length = wlen, .flags = SD_FLAG_CMD_WRITE, }; @@ -1720,12 +1896,11 @@ static int sd_create_branch(BDRVSheepdogState *s) /* * Even If deletion fails, we will just create extra snapshot based on - * the workding VDI which was supposed to be deleted. So no need to + * the working VDI which was supposed to be deleted. So no need to * false bail out. */ deleted = sd_delete(s); - ret = do_sd_create(s, s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, - !deleted); + ret = do_sd_create(s, &vid, !deleted); if (ret) { goto out; } @@ -1849,35 +2024,16 @@ static int coroutine_fn sd_co_rw_vector(void *p) } aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done); + QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); if (create) { - AIOReq *areq; - QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) { - if (areq->oid == oid) { - /* - * Sheepdog cannot handle simultaneous create - * requests to the same object. So we cannot send - * the request until the previous request - * finishes. 
- */ - aio_req->flags = 0; - aio_req->base_oid = 0; - QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, - aio_siblings); - goto done; - } + if (check_simultaneous_create(s, aio_req)) { + goto done; } } - QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, - create, acb->aiocb_type); - if (ret < 0) { - error_report("add_aio_request is failed"); - free_aio_req(s, aio_req); - acb->ret = -EIO; - goto out; - } + add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, create, + acb->aiocb_type); done: offset = 0; idx++; @@ -1895,13 +2051,14 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, { SheepdogAIOCB *acb; int ret; + int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE; + BDRVSheepdogState *s = bs->opaque; - if (bs->growable && sector_num + nb_sectors > bs->total_sectors) { - ret = sd_truncate(bs, (sector_num + nb_sectors) * BDRV_SECTOR_SIZE); + if (bs->growable && offset > s->inode.vdi_size) { + ret = sd_truncate(bs, offset); if (ret < 0) { return ret; } - bs->total_sectors = sector_num + nb_sectors; } acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors); @@ -1945,7 +2102,6 @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs) BDRVSheepdogState *s = bs->opaque; SheepdogAIOCB *acb; AIOReq *aio_req; - int ret; if (s->cache_flags != SD_FLAG_CMD_CACHE) { return 0; @@ -1958,13 +2114,7 @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs) aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), 0, 0, 0, 0, 0); QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - ret = add_aio_request(s, aio_req, NULL, 0, false, acb->aiocb_type); - if (ret < 0) { - error_report("add_aio_request is failed"); - free_aio_req(s, aio_req); - qemu_aio_release(acb); - return ret; - } + add_aio_request(s, aio_req, NULL, 0, false, acb->aiocb_type); qemu_coroutine_yield(); return acb->ret; @@ -2014,8 +2164,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) goto cleanup; } - ret = do_sd_create(s, s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, - 1); + ret = do_sd_create(s, &new_vid, 1); if (ret < 0) { error_report("failed to create inode for snapshot. %s", strerror(errno)); @@ -2045,7 +2194,7 @@ cleanup: * We implement rollback(loadvm) operation to the specified snapshot by * 1) switch to the snapshot * 2) rely on sd_create_branch to delete working VDI and - * 3) create a new working VDI based on the speicified snapshot + * 3) create a new working VDI based on the specified snapshot */ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) { @@ -2089,7 +2238,10 @@ out: return ret; } -static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +static int sd_snapshot_delete(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp) { /* FIXME: Delete specified snapshot id. 
*/ return 0; @@ -2287,17 +2439,18 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, return acb->ret; } -static coroutine_fn int -sd_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - int *pnum) +static coroutine_fn int64_t +sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, + int *pnum) { BDRVSheepdogState *s = bs->opaque; SheepdogInode *inode = &s->inode; - unsigned long start = sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE, + uint64_t offset = sector_num * BDRV_SECTOR_SIZE; + unsigned long start = offset / SD_DATA_OBJ_SIZE, end = DIV_ROUND_UP((sector_num + nb_sectors) * BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE); unsigned long idx; - int ret = 1; + int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; for (idx = start; idx < end; idx++) { if (inode->data_vdi_id[idx] == 0) { @@ -2321,6 +2474,22 @@ sd_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, return ret; } +static int64_t sd_get_allocated_file_size(BlockDriverState *bs) +{ + BDRVSheepdogState *s = bs->opaque; + SheepdogInode *inode = &s->inode; + unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE); + uint64_t size = 0; + + for (i = 0; i < last; i++) { + if (inode->data_vdi_id[i] == 0) { + continue; + } + size += SD_DATA_OBJ_SIZE; + } + return size; +} + static QEMUOptionParameter sd_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -2337,6 +2506,11 @@ static QEMUOptionParameter sd_create_options[] = { .type = OPT_STRING, .help = "Preallocation mode (allowed values: off, full)" }, + { + .name = BLOCK_OPT_REDUNDANCY, + .type = OPT_STRING, + .help = "Redundancy of the image" + }, { NULL } }; @@ -2344,18 +2518,20 @@ static BlockDriver bdrv_sheepdog = { .format_name = "sheepdog", .protocol_name = "sheepdog", .instance_size = sizeof(BDRVSheepdogState), + .bdrv_needs_filename = true, .bdrv_file_open = sd_open, .bdrv_close = sd_close, .bdrv_create = sd_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_getlength = sd_getlength, + .bdrv_get_allocated_file_size = sd_get_allocated_file_size, .bdrv_truncate = sd_truncate, .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, .bdrv_co_discard = sd_co_discard, - .bdrv_co_is_allocated = sd_co_is_allocated, + .bdrv_co_get_block_status = sd_co_get_block_status, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, @@ -2372,18 +2548,20 @@ static BlockDriver bdrv_sheepdog_tcp = { .format_name = "sheepdog", .protocol_name = "sheepdog+tcp", .instance_size = sizeof(BDRVSheepdogState), + .bdrv_needs_filename = true, .bdrv_file_open = sd_open, .bdrv_close = sd_close, .bdrv_create = sd_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_getlength = sd_getlength, + .bdrv_get_allocated_file_size = sd_get_allocated_file_size, .bdrv_truncate = sd_truncate, .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, .bdrv_co_discard = sd_co_discard, - .bdrv_co_is_allocated = sd_co_is_allocated, + .bdrv_co_get_block_status = sd_co_get_block_status, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, @@ -2400,18 +2578,20 @@ static BlockDriver bdrv_sheepdog_unix = { .format_name = "sheepdog", .protocol_name = "sheepdog+unix", .instance_size = sizeof(BDRVSheepdogState), + .bdrv_needs_filename = true, .bdrv_file_open = sd_open, .bdrv_close = sd_close, .bdrv_create = sd_create, .bdrv_has_zero_init = 
bdrv_has_zero_init_1, .bdrv_getlength = sd_getlength, + .bdrv_get_allocated_file_size = sd_get_allocated_file_size, .bdrv_truncate = sd_truncate, .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, .bdrv_co_discard = sd_co_discard, - .bdrv_co_is_allocated = sd_co_is_allocated, + .bdrv_co_get_block_status = sd_co_get_block_status, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, diff --git a/block/snapshot.c b/block/snapshot.c index 6c6d9deea..85c52ff45 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -25,6 +25,24 @@ #include "block/snapshot.h" #include "block/block_int.h" +QemuOptsList internal_snapshot_opts = { + .name = "snapshot", + .head = QTAILQ_HEAD_INITIALIZER(internal_snapshot_opts.head), + .desc = { + { + .name = SNAPSHOT_OPT_ID, + .type = QEMU_OPT_STRING, + .help = "snapshot id" + },{ + .name = SNAPSHOT_OPT_NAME, + .type = QEMU_OPT_STRING, + .help = "snapshot name" + },{ + /* end of list */ + } + }, +}; + int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info, const char *name) { @@ -48,6 +66,79 @@ int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info, return ret; } +/** + * Look up an internal snapshot by @id and @name. + * @bs: block device to search + * @id: unique snapshot ID, or NULL + * @name: snapshot name, or NULL + * @sn_info: location to store information on the snapshot found + * @errp: location to store error; set only on real failures, not when no + * snapshot matches + * + * This function traverses the snapshot list in @bs to find a matching + * snapshot; @id and @name are the matching conditions: + * If both @id and @name are specified, find the first one with id @id and + * name @name. + * If only @id is specified, find the first one with id @id. + * If only @name is specified, find the first one with name @name. + * If none is specified, abort(). + * + * Returns: true when a snapshot is found and @sn_info has been filled, false + * on error or when no snapshot matches. If all operations succeed but no + * matching snapshot is found, @errp will NOT be set.
+ */ +bool bdrv_snapshot_find_by_id_and_name(BlockDriverState *bs, + const char *id, + const char *name, + QEMUSnapshotInfo *sn_info, + Error **errp) +{ + QEMUSnapshotInfo *sn_tab, *sn; + int nb_sns, i; + bool ret = false; + + assert(id || name); + + nb_sns = bdrv_snapshot_list(bs, &sn_tab); + if (nb_sns < 0) { + error_setg_errno(errp, -nb_sns, "Failed to get a snapshot list"); + return false; + } else if (nb_sns == 0) { + return false; + } + + if (id && name) { + for (i = 0; i < nb_sns; i++) { + sn = &sn_tab[i]; + if (!strcmp(sn->id_str, id) && !strcmp(sn->name, name)) { + *sn_info = *sn; + ret = true; + break; + } + } + } else if (id) { + for (i = 0; i < nb_sns; i++) { + sn = &sn_tab[i]; + if (!strcmp(sn->id_str, id)) { + *sn_info = *sn; + ret = true; + break; + } + } + } else if (name) { + for (i = 0; i < nb_sns; i++) { + sn = &sn_tab[i]; + if (!strcmp(sn->name, name)) { + *sn_info = *sn; + ret = true; + break; + } + } + } + + g_free(sn_tab); + return ret; +} + int bdrv_can_snapshot(BlockDriverState *bs) { BlockDriver *drv = bs->drv; @@ -97,9 +188,9 @@ int bdrv_snapshot_goto(BlockDriverState *bs, if (bs->file) { drv->bdrv_close(bs); ret = bdrv_snapshot_goto(bs->file, snapshot_id); - open_ret = drv->bdrv_open(bs, NULL, bs->open_flags); + open_ret = drv->bdrv_open(bs, NULL, bs->open_flags, NULL); if (open_ret < 0) { - bdrv_delete(bs->file); + bdrv_unref(bs->file); bs->drv = NULL; return open_ret; } @@ -109,21 +200,73 @@ int bdrv_snapshot_goto(BlockDriverState *bs, return -ENOTSUP; } -int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +/** + * Delete an internal snapshot by @snapshot_id and @name. + * @bs: block device used in the operation + * @snapshot_id: unique snapshot ID, or NULL + * @name: snapshot name, or NULL + * @errp: location to store error + * + * If both @snapshot_id and @name are specified, delete the first one with + * id @snapshot_id and name @name. + * If only @snapshot_id is specified, delete the first one with id + * @snapshot_id. + * If only @name is specified, delete the first one with name @name. + * If none is specified, return -EINVAL. + * + * Returns: 0 on success, -errno on failure. If @bs is not inserted, return + * -ENOMEDIUM. If @snapshot_id and @name are both NULL, return -EINVAL. If @bs + * does not support internal snapshot deletion, return -ENOTSUP. If @bs does + * not support parameter @snapshot_id or @name, or one of them is not correctly + * specified, return -EINVAL. If @bs can't find one matching @id and @name, + * return -ENOENT. If @errp != NULL, it will always be filled with an error + * message on failure.
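For illustration, a hypothetical caller (not part of this patch) could resolve a user-supplied string that may be either a snapshot ID or a snapshot name by trying both, mirroring the *_by_id_or_name helpers added below; the -EIO return here is arbitrary:

static int resolve_snapshot(BlockDriverState *bs, const char *id_or_name,
                            QEMUSnapshotInfo *sn, Error **errp)
{
    Error *local_err = NULL;

    /* Try the string as an ID first, then fall back to matching by name. */
    if (bdrv_snapshot_find_by_id_and_name(bs, id_or_name, NULL, sn,
                                          &local_err)) {
        return 0;
    }
    if (local_err) {
        error_propagate(errp, local_err);
        return -EIO;    /* real error during the lookup */
    }
    if (bdrv_snapshot_find_by_id_and_name(bs, NULL, id_or_name, sn,
                                          &local_err)) {
        return 0;
    }
    if (local_err) {
        error_propagate(errp, local_err);
        return -EIO;
    }
    return -ENOENT;     /* lookup succeeded but nothing matched */
}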
+ */ +int bdrv_snapshot_delete(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp) { BlockDriver *drv = bs->drv; if (!drv) { + error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); return -ENOMEDIUM; } + if (!snapshot_id && !name) { + error_setg(errp, "snapshot_id and name are both NULL"); + return -EINVAL; + } if (drv->bdrv_snapshot_delete) { - return drv->bdrv_snapshot_delete(bs, snapshot_id); + return drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp); } if (bs->file) { - return bdrv_snapshot_delete(bs->file, snapshot_id); + return bdrv_snapshot_delete(bs->file, snapshot_id, name, errp); } + error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + drv->format_name, bdrv_get_device_name(bs), + "internal snapshot deletion"); return -ENOTSUP; } +void bdrv_snapshot_delete_by_id_or_name(BlockDriverState *bs, + const char *id_or_name, + Error **errp) +{ + int ret; + Error *local_err = NULL; + + ret = bdrv_snapshot_delete(bs, id_or_name, NULL, &local_err); + if (ret == -ENOENT || ret == -EINVAL) { + error_free(local_err); + local_err = NULL; + ret = bdrv_snapshot_delete(bs, NULL, id_or_name, &local_err); + } + + if (ret < 0) { + error_propagate(errp, local_err); + } +} + int bdrv_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_info) { @@ -140,18 +283,71 @@ int bdrv_snapshot_list(BlockDriverState *bs, return -ENOTSUP; } +/** + * Temporarily load an internal snapshot by @snapshot_id and @name. + * @bs: block device used in the operation + * @snapshot_id: unique snapshot ID, or NULL + * @name: snapshot name, or NULL + * @errp: location to store error + * + * If both @snapshot_id and @name are specified, load the first one with + * id @snapshot_id and name @name. + * If only @snapshot_id is specified, load the first one with id + * @snapshot_id. + * If only @name is specified, load the first one with name @name. + * If none is specified, return -EINVAL. + * + * Returns: 0 on success, -errno on failure. If @bs is not inserted, return + * -ENOMEDIUM. If @bs is not readonly, return -EINVAL. If @bs does not support + * internal snapshots, return -ENOTSUP. If qemu can't find a matching @id and + * @name, return -ENOENT. If @errp != NULL, it will always be filled on + * failure.
+ */ int bdrv_snapshot_load_tmp(BlockDriverState *bs, - const char *snapshot_name) + const char *snapshot_id, + const char *name, + Error **errp) { BlockDriver *drv = bs->drv; + if (!drv) { + error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); return -ENOMEDIUM; } + if (!snapshot_id && !name) { + error_setg(errp, "snapshot_id and name are both NULL"); + return -EINVAL; + } if (!bs->read_only) { + error_setg(errp, "Device is not readonly"); return -EINVAL; } if (drv->bdrv_snapshot_load_tmp) { - return drv->bdrv_snapshot_load_tmp(bs, snapshot_name); + return drv->bdrv_snapshot_load_tmp(bs, snapshot_id, name, errp); } + error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + drv->format_name, bdrv_get_device_name(bs), + "temporarily load internal snapshot"); return -ENOTSUP; } + +int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, + const char *id_or_name, + Error **errp) +{ + int ret; + Error *local_err = NULL; + + ret = bdrv_snapshot_load_tmp(bs, id_or_name, NULL, &local_err); + if (ret == -ENOENT || ret == -EINVAL) { + error_free(local_err); + local_err = NULL; + ret = bdrv_snapshot_load_tmp(bs, NULL, id_or_name, &local_err); + } + + if (local_err) { + error_propagate(errp, local_err); + } + + return ret; +} diff --git a/block/ssh.c b/block/ssh.c index d7e7bf8dd..aa63c9d20 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -608,7 +608,8 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options, return ret; } -static int ssh_file_open(BlockDriverState *bs, QDict *options, int bdrv_flags) +static int ssh_file_open(BlockDriverState *bs, QDict *options, int bdrv_flags, + Error **errp) { BDRVSSHState *s = bs->opaque; int ret; @@ -650,7 +651,8 @@ static QEMUOptionParameter ssh_create_options[] = { { NULL } }; -static int ssh_create(const char *filename, QEMUOptionParameter *options) +static int ssh_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int r, ret; Error *local_err = NULL; @@ -740,14 +742,6 @@ static void restart_coroutine(void *opaque) qemu_coroutine_enter(co, NULL); } -/* Always true because when we have called set_fd_handler there is - * always a request being processed. - */ -static int return_true(void *opaque) -{ - return 1; -} - static coroutine_fn void set_fd_handler(BDRVSSHState *s) { int r; @@ -766,13 +760,13 @@ static coroutine_fn void set_fd_handler(BDRVSSHState *s) DPRINTF("s->sock=%d rd_handler=%p wr_handler=%p", s->sock, rd_handler, wr_handler); - qemu_aio_set_fd_handler(s->sock, rd_handler, wr_handler, return_true, co); + qemu_aio_set_fd_handler(s->sock, rd_handler, wr_handler, co); } static coroutine_fn void clear_fd_handler(BDRVSSHState *s) { DPRINTF("s->sock=%d", s->sock); - qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL, NULL); + qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL); } /* A non-blocking call returned EAGAIN, so yield, ensuring the diff --git a/block/stream.c b/block/stream.c index 7fe9e486b..dd0b4ac3d 100644 --- a/block/stream.c +++ b/block/stream.c @@ -57,6 +57,11 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base, BlockDriverState *intermediate; intermediate = top->backing_hd; + /* Must assign before bdrv_delete() to prevent traversing a dangling pointer + * while we delete backing image instances.
+ */ + top->backing_hd = base; + while (intermediate) { BlockDriverState *unused; @@ -68,9 +73,10 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base, unused = intermediate; intermediate = intermediate->backing_hd; unused->backing_hd = NULL; - bdrv_delete(unused); + bdrv_unref(unused); } - top->backing_hd = base; + + bdrv_refresh_limits(top); } static void coroutine_fn stream_run(void *opaque) @@ -84,6 +90,11 @@ static void coroutine_fn stream_run(void *opaque) int n = 0; void *buf; + if (!bs->backing_hd) { + block_job_completed(&s->common, 0); + return; + } + s->common.len = bdrv_getlength(bs); if (s->common.len < 0) { block_job_completed(&s->common, s->common.len); @@ -110,21 +121,22 @@ wait: /* Note that even when no rate limit is applied we need to yield * with no pending I/O here so that bdrv_drain_all() returns. */ - block_job_sleep_ns(&s->common, rt_clock, delay_ns); + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); if (block_job_is_cancelled(&s->common)) { break; } - ret = bdrv_co_is_allocated(bs, sector_num, - STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n); + copy = false; + + ret = bdrv_is_allocated(bs, sector_num, + STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n); if (ret == 1) { /* Allocated in the top, no need to copy. */ - copy = false; - } else { + } else if (ret >= 0) { /* Copy if allocated in the intermediate images. Limit to the * known-unallocated area [sector_num, sector_num+n). */ - ret = bdrv_co_is_allocated_above(bs->backing_hd, base, - sector_num, n, &n); + ret = bdrv_is_allocated_above(bs->backing_hd, base, + sector_num, n, &n); /* Finish early if end of backing file has been reached */ if (ret == 0 && n == 0) { @@ -134,7 +146,7 @@ wait: copy = (ret == 1); } trace_stream_one_iteration(s, sector_num, n, ret); - if (ret >= 0 && copy) { + if (copy) { if (s->common.speed) { delay_ns = ratelimit_calculate_delay(&s->limit, n); if (delay_ns > 0) { @@ -198,9 +210,9 @@ static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp) ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); } -static const BlockJobType stream_job_type = { +static const BlockJobDriver stream_job_driver = { .instance_size = sizeof(StreamBlockJob), - .job_type = "stream", + .job_type = BLOCK_JOB_TYPE_STREAM, .set_speed = stream_set_speed, }; @@ -219,7 +231,7 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base, return; } - s = block_job_create(&stream_job_type, bs, speed, cb, opaque, errp); + s = block_job_create(&stream_job_driver, bs, speed, cb, opaque, errp); if (!s) { return; } diff --git a/block/vdi.c b/block/vdi.c index 8a915257e..820cd376b 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -31,7 +31,7 @@ * Allocation of blocks could be optimized (less writes to block map and * header). * - * Read and write of adjacents blocks could be done in one operation + * Read and write of adjacent blocks could be done in one operation * (current code uses one operation per block (1 MiB). 
* * The code is not thread safe (missing locks for changes in header and @@ -120,6 +120,11 @@ typedef unsigned char uuid_t[16]; #define VDI_IS_ALLOCATED(X) ((X) < VDI_DISCARDED) +/* max blocks in image is (0xffffffff / 4) */ +#define VDI_BLOCKS_IN_IMAGE_MAX 0x3fffffff +#define VDI_DISK_SIZE_MAX ((uint64_t)VDI_BLOCKS_IN_IMAGE_MAX * \ + (uint64_t)DEFAULT_CLUSTER_SIZE) + #if !defined(CONFIG_UUID) static inline void uuid_generate(uuid_t out) { @@ -165,7 +170,7 @@ typedef struct { uuid_t uuid_link; uuid_t uuid_parent; uint64_t unused2[7]; -} VdiHeader; +} QEMU_PACKED VdiHeader; typedef struct { /* The block map entries are little endian (even in memory). */ @@ -331,6 +336,7 @@ static int vdi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) logout("\n"); bdi->cluster_size = s->block_size; bdi->vm_state_offset = 0; + bdi->unallocated_blocks_are_zero = true; return 0; } @@ -364,7 +370,8 @@ static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename) return result; } -static int vdi_open(BlockDriverState *bs, QDict *options, int flags) +static int vdi_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVVdiState *s = bs->opaque; VdiHeader header; @@ -383,6 +390,14 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags) vdi_header_print(&header); #endif + if (header.disk_size > VDI_DISK_SIZE_MAX) { + error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64 + ", max supported is 0x%" PRIx64 ")", + header.disk_size, VDI_DISK_SIZE_MAX); + ret = -ENOTSUP; + goto fail; + } + if (header.disk_size % SECTOR_SIZE != 0) { /* 'VBoxManage convertfromraw' can create images with odd disk sizes. We accept them but round the disk size to the next multiple of @@ -393,43 +408,56 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags) } if (header.signature != VDI_SIGNATURE) { - logout("bad vdi signature %08x\n", header.signature); - ret = -EMEDIUMTYPE; + error_setg(errp, "Image not in VDI format (bad signature %08x)", header.signature); + ret = -EINVAL; goto fail; } else if (header.version != VDI_VERSION_1_1) { - logout("unsupported version %u.%u\n", - header.version >> 16, header.version & 0xffff); + error_setg(errp, "unsupported VDI image (version %u.%u)", + header.version >> 16, header.version & 0xffff); ret = -ENOTSUP; goto fail; } else if (header.offset_bmap % SECTOR_SIZE != 0) { /* We only support block maps which start on a sector boundary. */ - logout("unsupported block map offset 0x%x B\n", header.offset_bmap); + error_setg(errp, "unsupported VDI image (unaligned block map offset " + "0x%x)", header.offset_bmap); ret = -ENOTSUP; goto fail; } else if (header.offset_data % SECTOR_SIZE != 0) { /* We only support data blocks which start on a sector boundary. 
*/ - logout("unsupported data offset 0x%x B\n", header.offset_data); + error_setg(errp, "unsupported VDI image (unaligned data offset 0x%x)", + header.offset_data); ret = -ENOTSUP; goto fail; } else if (header.sector_size != SECTOR_SIZE) { - logout("unsupported sector size %u B\n", header.sector_size); + error_setg(errp, "unsupported VDI image (sector size %u is not %u)", + header.sector_size, SECTOR_SIZE); ret = -ENOTSUP; goto fail; - } else if (header.block_size != 1 * MiB) { - logout("unsupported block size %u B\n", header.block_size); + } else if (header.block_size != DEFAULT_CLUSTER_SIZE) { + error_setg(errp, "unsupported VDI image (block size %u is not %u)", + header.block_size, DEFAULT_CLUSTER_SIZE); ret = -ENOTSUP; goto fail; } else if (header.disk_size > (uint64_t)header.blocks_in_image * header.block_size) { - logout("unsupported disk size %" PRIu64 " B\n", header.disk_size); + error_setg(errp, "unsupported VDI image (disk size %" PRIu64 ", " + "image bitmap has room for %" PRIu64 ")", + header.disk_size, + (uint64_t)header.blocks_in_image * header.block_size); ret = -ENOTSUP; goto fail; } else if (!uuid_is_null(header.uuid_link)) { - logout("link uuid != 0, unsupported\n"); + error_setg(errp, "unsupported VDI image (non-NULL link UUID)"); ret = -ENOTSUP; goto fail; } else if (!uuid_is_null(header.uuid_parent)) { - logout("parent uuid != 0, unsupported\n"); + error_setg(errp, "unsupported VDI image (non-NULL parent UUID)"); + ret = -ENOTSUP; + goto fail; + } else if (header.blocks_in_image > VDI_BLOCKS_IN_IMAGE_MAX) { + error_setg(errp, "unsupported VDI image " + "(too many blocks %u, max is %u)", + header.blocks_in_image, VDI_BLOCKS_IN_IMAGE_MAX); ret = -ENOTSUP; goto fail; } @@ -470,7 +498,7 @@ static int vdi_reopen_prepare(BDRVReopenState *state, return 0; } -static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { /* TODO: Check for too large sector_num (in bdrv_is_allocated or here). 
*/ @@ -479,12 +507,23 @@ static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs, size_t sector_in_block = sector_num % s->block_sectors; int n_sectors = s->block_sectors - sector_in_block; uint32_t bmap_entry = le32_to_cpu(s->bmap[bmap_index]); + uint64_t offset; + int result; + logout("%p, %" PRId64 ", %d, %p\n", bs, sector_num, nb_sectors, pnum); if (n_sectors > nb_sectors) { n_sectors = nb_sectors; } *pnum = n_sectors; - return VDI_IS_ALLOCATED(bmap_entry); + result = VDI_IS_ALLOCATED(bmap_entry); + if (!result) { + return 0; + } + + offset = s->header.offset_data + + (uint64_t)bmap_entry * s->block_size + + sector_in_block * SECTOR_SIZE; + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; } static int vdi_co_read(BlockDriverState *bs, @@ -633,7 +672,8 @@ static int vdi_co_write(BlockDriverState *bs, return ret; } -static int vdi_create(const char *filename, QEMUOptionParameter *options) +static int vdi_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int fd; int result = 0; @@ -668,11 +708,20 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options) options++; } + if (bytes > VDI_DISK_SIZE_MAX) { + result = -ENOTSUP; + error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64 + ", max supported is 0x%" PRIx64 ")", + bytes, VDI_DISK_SIZE_MAX); + goto exit; + } + fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, 0644); if (fd < 0) { - return -errno; + result = -errno; + goto exit; } /* We need enough blocks to store the given disk size, @@ -733,6 +782,7 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options) result = -errno; } +exit: return result; } @@ -780,7 +830,7 @@ static BlockDriver bdrv_vdi = { .bdrv_reopen_prepare = vdi_reopen_prepare, .bdrv_create = vdi_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_is_allocated = vdi_co_is_allocated, + .bdrv_co_get_block_status = vdi_co_get_block_status, .bdrv_make_empty = vdi_make_empty, .bdrv_read = vdi_co_read, diff --git a/block/vhdx-endian.c b/block/vhdx-endian.c new file mode 100644 index 000000000..fe879ed99 --- /dev/null +++ b/block/vhdx-endian.c @@ -0,0 +1,216 @@ +/* + * Block driver for Hyper-V VHDX Images + * + * Copyright (c) 2013 Red Hat, Inc., + * + * Authors: + * Jeff Cody <jcody@redhat.com> + * + * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 + * by Microsoft: + * https://www.microsoft.com/en-us/download/details.aspx?id=34750 + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/vhdx.h" + +#include <uuid/uuid.h> + + +/* + * All the VHDX formats on disk are little endian - the following + * are helper import/export functions to correctly convert + * endianness from disk read to native cpu format, and back again. 
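To illustrate the intended usage of these helpers (hypothetical function, for illustration only): structures are read from disk in little-endian form, converted once on import, and converted back on export, so the rest of the driver works purely in native CPU endianness:

static int example_read_vhdx_header(BlockDriverState *bs, uint64_t offset,
                                    VHDXHeader *header)
{
    /* On-disk data is little endian; convert to CPU order after reading. */
    int ret = bdrv_pread(bs->file, offset, header, sizeof(*header));
    if (ret < 0) {
        return ret;
    }
    vhdx_header_le_import(header);
    /* header->sequence_number etc. are now in native CPU endianness. */
    return 0;
}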
+ */ + + +/* VHDX File Header */ + + +void vhdx_header_le_import(VHDXHeader *h) +{ + assert(h != NULL); + + le32_to_cpus(&h->signature); + le32_to_cpus(&h->checksum); + le64_to_cpus(&h->sequence_number); + + leguid_to_cpus(&h->file_write_guid); + leguid_to_cpus(&h->data_write_guid); + leguid_to_cpus(&h->log_guid); + + le16_to_cpus(&h->log_version); + le16_to_cpus(&h->version); + le32_to_cpus(&h->log_length); + le64_to_cpus(&h->log_offset); +} + +void vhdx_header_le_export(VHDXHeader *orig_h, VHDXHeader *new_h) +{ + assert(orig_h != NULL); + assert(new_h != NULL); + + new_h->signature = cpu_to_le32(orig_h->signature); + new_h->checksum = cpu_to_le32(orig_h->checksum); + new_h->sequence_number = cpu_to_le64(orig_h->sequence_number); + + new_h->file_write_guid = orig_h->file_write_guid; + new_h->data_write_guid = orig_h->data_write_guid; + new_h->log_guid = orig_h->log_guid; + + cpu_to_leguids(&new_h->file_write_guid); + cpu_to_leguids(&new_h->data_write_guid); + cpu_to_leguids(&new_h->log_guid); + + new_h->log_version = cpu_to_le16(orig_h->log_version); + new_h->version = cpu_to_le16(orig_h->version); + new_h->log_length = cpu_to_le32(orig_h->log_length); + new_h->log_offset = cpu_to_le64(orig_h->log_offset); +} + + +/* VHDX Log Headers */ + + +void vhdx_log_desc_le_import(VHDXLogDescriptor *d) +{ + assert(d != NULL); + + le32_to_cpus(&d->signature); + le32_to_cpus(&d->trailing_bytes); + le64_to_cpus(&d->leading_bytes); + le64_to_cpus(&d->file_offset); + le64_to_cpus(&d->sequence_number); +} + +void vhdx_log_desc_le_export(VHDXLogDescriptor *d) +{ + assert(d != NULL); + + cpu_to_le32s(&d->signature); + cpu_to_le32s(&d->trailing_bytes); + cpu_to_le64s(&d->leading_bytes); + cpu_to_le64s(&d->file_offset); + cpu_to_le64s(&d->sequence_number); +} + +void vhdx_log_data_le_export(VHDXLogDataSector *d) +{ + assert(d != NULL); + + cpu_to_le32s(&d->data_signature); + cpu_to_le32s(&d->sequence_high); + cpu_to_le32s(&d->sequence_low); +} + +void vhdx_log_entry_hdr_le_import(VHDXLogEntryHeader *hdr) +{ + assert(hdr != NULL); + + le32_to_cpus(&hdr->signature); + le32_to_cpus(&hdr->checksum); + le32_to_cpus(&hdr->entry_length); + le32_to_cpus(&hdr->tail); + le64_to_cpus(&hdr->sequence_number); + le32_to_cpus(&hdr->descriptor_count); + leguid_to_cpus(&hdr->log_guid); + le64_to_cpus(&hdr->flushed_file_offset); + le64_to_cpus(&hdr->last_file_offset); +} + +void vhdx_log_entry_hdr_le_export(VHDXLogEntryHeader *hdr) +{ + assert(hdr != NULL); + + cpu_to_le32s(&hdr->signature); + cpu_to_le32s(&hdr->checksum); + cpu_to_le32s(&hdr->entry_length); + cpu_to_le32s(&hdr->tail); + cpu_to_le64s(&hdr->sequence_number); + cpu_to_le32s(&hdr->descriptor_count); + cpu_to_leguids(&hdr->log_guid); + cpu_to_le64s(&hdr->flushed_file_offset); + cpu_to_le64s(&hdr->last_file_offset); +} + + +/* Region table entries */ +void vhdx_region_header_le_import(VHDXRegionTableHeader *hdr) +{ + assert(hdr != NULL); + + le32_to_cpus(&hdr->signature); + le32_to_cpus(&hdr->checksum); + le32_to_cpus(&hdr->entry_count); +} + +void vhdx_region_header_le_export(VHDXRegionTableHeader *hdr) +{ + assert(hdr != NULL); + + cpu_to_le32s(&hdr->signature); + cpu_to_le32s(&hdr->checksum); + cpu_to_le32s(&hdr->entry_count); +} + +void vhdx_region_entry_le_import(VHDXRegionTableEntry *e) +{ + assert(e != NULL); + + leguid_to_cpus(&e->guid); + le64_to_cpus(&e->file_offset); + le32_to_cpus(&e->length); + le32_to_cpus(&e->data_bits); +} + +void vhdx_region_entry_le_export(VHDXRegionTableEntry *e) +{ + assert(e != NULL); + + cpu_to_leguids(&e->guid); + 
cpu_to_le64s(&e->file_offset); + cpu_to_le32s(&e->length); + cpu_to_le32s(&e->data_bits); +} + + +/* Metadata headers & table */ +void vhdx_metadata_header_le_import(VHDXMetadataTableHeader *hdr) +{ + assert(hdr != NULL); + + le64_to_cpus(&hdr->signature); + le16_to_cpus(&hdr->entry_count); +} + +void vhdx_metadata_header_le_export(VHDXMetadataTableHeader *hdr) +{ + assert(hdr != NULL); + + cpu_to_le64s(&hdr->signature); + cpu_to_le16s(&hdr->entry_count); +} + +void vhdx_metadata_entry_le_import(VHDXMetadataTableEntry *e) +{ + assert(e != NULL); + + leguid_to_cpus(&e->item_id); + le32_to_cpus(&e->offset); + le32_to_cpus(&e->length); + le32_to_cpus(&e->data_bits); +} +void vhdx_metadata_entry_le_export(VHDXMetadataTableEntry *e) +{ + assert(e != NULL); + + cpu_to_leguids(&e->item_id); + cpu_to_le32s(&e->offset); + cpu_to_le32s(&e->length); + cpu_to_le32s(&e->data_bits); +} diff --git a/block/vhdx-log.c b/block/vhdx-log.c new file mode 100644 index 000000000..a77c040ee --- /dev/null +++ b/block/vhdx-log.c @@ -0,0 +1,1021 @@ +/* + * Block driver for Hyper-V VHDX Images + * + * Copyright (c) 2013 Red Hat, Inc., + * + * Authors: + * Jeff Cody <jcody@redhat.com> + * + * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 + * by Microsoft: + * https://www.microsoft.com/en-us/download/details.aspx?id=34750 + * + * This file covers the functionality of the metadata log writing, parsing, and + * replay. + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "block/vhdx.h" + + +typedef struct VHDXLogSequence { + bool valid; + uint32_t count; + VHDXLogEntries log; + VHDXLogEntryHeader hdr; +} VHDXLogSequence; + +typedef struct VHDXLogDescEntries { + VHDXLogEntryHeader hdr; + VHDXLogDescriptor desc[]; +} VHDXLogDescEntries; + +static const MSGUID zero_guid = { 0 }; + +/* The log located on the disk is a circular buffer containing + * sectors of 4096 bytes each. + * + * It is assumed for the read/write functions below that the + * circular buffer scheme uses a 'one sector open' convention to indicate + * the buffer is full. Given the validation methods used for each + * sector, this method should be compatible with other methods that + * do not waste a sector. + */ + + +/* Allow peeking at the hdr entry at the beginning of the current + * read index, without advancing the read index */ +static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log, + VHDXLogEntryHeader *hdr) +{ + int ret = 0; + uint64_t offset; + uint32_t read; + + assert(hdr != NULL); + + /* peek is only supported on sector boundaries */ + if (log->read % VHDX_LOG_SECTOR_SIZE) { + ret = -EFAULT; + goto exit; + } + + read = log->read; + /* we are guaranteed that a) log sectors are 4096 bytes, + * and b) the log length is a multiple of 1MB.
So, there + * is always a round number of sectors in the buffer */ + if ((read + sizeof(VHDXLogEntryHeader)) > log->length) { + read = 0; + } + + if (read == log->write) { + ret = -EINVAL; + goto exit; + } + + offset = log->offset + read; + + ret = bdrv_pread(bs->file, offset, hdr, sizeof(VHDXLogEntryHeader)); + if (ret < 0) { + goto exit; + } + +exit: + return ret; +} + +/* Index increment for log, based on sector boundaries */ +static int vhdx_log_inc_idx(uint32_t idx, uint64_t length) +{ + idx += VHDX_LOG_SECTOR_SIZE; + /* we are guaranteed that a) log sectors are 4096 bytes, + * and b) the log length is a multiple of 1MB. So, there + * is always a round number of sectors in the buffer */ + return idx >= length ? 0 : idx; +} + + +/* Reset the log to empty */ +static void vhdx_log_reset(BlockDriverState *bs, BDRVVHDXState *s) +{ + MSGUID guid = { 0 }; + s->log.read = s->log.write = 0; + /* a log guid of 0 indicates an empty log to any parser of v0 + * VHDX logs */ + vhdx_update_headers(bs, s, false, &guid); +} + +/* Reads num_sectors from the log (all log sectors are 4096 bytes), + * into buffer 'buffer'. Upon return, *sectors_read will contain + * the number of sectors successfully read. + * + * It is assumed that 'buffer' is already allocated, and of sufficient + * size (i.e. >= 4096*num_sectors). + * + * If 'peek' is true, then the tail (read) pointer for the circular buffer is + * not modified. + * + * 0 is returned on success, -errno otherwise. */ +static int vhdx_log_read_sectors(BlockDriverState *bs, VHDXLogEntries *log, + uint32_t *sectors_read, void *buffer, + uint32_t num_sectors, bool peek) +{ + int ret = 0; + uint64_t offset; + uint32_t read; + + read = log->read; + + *sectors_read = 0; + while (num_sectors) { + if (read == log->write) { + /* empty */ + break; + } + offset = log->offset + read; + + ret = bdrv_pread(bs->file, offset, buffer, VHDX_LOG_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + read = vhdx_log_inc_idx(read, log->length); + + *sectors_read = *sectors_read + 1; + num_sectors--; + } + +exit: + if (!peek) { + log->read = read; + } + return ret; +} + +/* Writes num_sectors to the log (all log sectors are 4096 bytes), + * from buffer 'buffer'. Upon return, *sectors_written will contain + * the number of sectors successfully written. + * + * It is assumed that 'buffer' is at least 4096*num_sectors large. 
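As a side note on the 'one sector open' convention described at the top of this file, the empty/full tests reduce to simple comparisons of the read and write indices (illustrative helpers, not part of the patch; they rely on vhdx_log_inc_idx() defined above):

static inline bool example_log_is_empty(const VHDXLogEntries *log)
{
    /* read catching up to write means there is nothing left to consume */
    return log->read == log->write;
}

static inline bool example_log_is_full(const VHDXLogEntries *log)
{
    /* one sector is kept open: full when advancing write would hit read */
    return vhdx_log_inc_idx(log->write, log->length) == log->read;
}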
+ * + * 0 is returned on success, -errno otherwise */ +static int vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log, + uint32_t *sectors_written, void *buffer, + uint32_t num_sectors) +{ + int ret = 0; + uint64_t offset; + uint32_t write; + void *buffer_tmp; + BDRVVHDXState *s = bs->opaque; + + ret = vhdx_user_visible_write(bs, s); + if (ret < 0) { + goto exit; + } + + write = log->write; + + buffer_tmp = buffer; + while (num_sectors) { + + offset = log->offset + write; + write = vhdx_log_inc_idx(write, log->length); + if (write == log->read) { + /* full */ + break; + } + ret = bdrv_pwrite(bs->file, offset, buffer_tmp, VHDX_LOG_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + buffer_tmp += VHDX_LOG_SECTOR_SIZE; + + log->write = write; + *sectors_written = *sectors_written + 1; + num_sectors--; + } + +exit: + return ret; +} + + +/* Validates a log entry header */ +static bool vhdx_log_hdr_is_valid(VHDXLogEntries *log, VHDXLogEntryHeader *hdr, + BDRVVHDXState *s) +{ + bool valid = false; + + if (memcmp(&hdr->signature, "loge", 4)) { + goto exit; + } + + /* if the individual entry length is larger than the whole log + * buffer, that is obviously invalid */ + if (log->length < hdr->entry_length) { + goto exit; + } + + /* length of entire entry must be in units of 4KB (log sector size) */ + if (hdr->entry_length % (VHDX_LOG_SECTOR_SIZE)) { + goto exit; + } + + /* per spec, sequence # must be > 0 */ + if (hdr->sequence_number == 0) { + goto exit; + } + + /* log entries are only valid if they match the file-wide log guid + * found in the active header */ + if (!guid_eq(hdr->log_guid, s->headers[s->curr_header]->log_guid)) { + goto exit; + } + + if (hdr->descriptor_count * sizeof(VHDXLogDescriptor) > hdr->entry_length) { + goto exit; + } + + valid = true; + +exit: + return valid; +} + +/* + * Given a log header, this will validate the descriptors and the + * corresponding data sectors (if applicable) + * + * Validation consists of: + * 1. Making sure the sequence numbers match the entry header + * 2. Verifying a valid signature ('zero' or 'desc' for descriptors) + * 3. File offset field is a multiple of 4KB + * 4. If a data descriptor, the corresponding data sector + * has its signature ('data') and matching sequence number + * + * @desc: the data buffer containing the descriptor + * @hdr: the log entry header + * + * Returns true if valid + */ +static bool vhdx_log_desc_is_valid(VHDXLogDescriptor *desc, + VHDXLogEntryHeader *hdr) +{ + bool ret = false; + + if (desc->sequence_number != hdr->sequence_number) { + goto exit; + } + if (desc->file_offset % VHDX_LOG_SECTOR_SIZE) { + goto exit; + } + + if (!memcmp(&desc->signature, "zero", 4)) { + if (desc->zero_length % VHDX_LOG_SECTOR_SIZE == 0) { + /* valid */ + ret = true; + } + } else if (!memcmp(&desc->signature, "desc", 4)) { + /* valid */ + ret = true; + } + +exit: + return ret; +} + + +/* Prior to sector data for a log entry, there is the header + * and the descriptors referenced in the header: + * + * [] = 4KB sector + * + * [ hdr, desc ][ desc ][ ... ][ data ][ ... ] + * + * The first sector in a log entry has a 64 byte header, and + * up to 126 32-byte descriptors. If more descriptors than + * 126 are required, then subsequent sectors can have up to 128 + * descriptors. Each sector is 4KB. Data follows the descriptor + * sectors. + * + * This will return the number of sectors needed to encompass + * the passed number of descriptors in desc_cnt. + * + * This will never return 0, even if desc_cnt is 0.
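A few worked examples may help here (the '+= 2' in the implementation below reserves two 32-byte descriptor slots for the 64-byte header in the first sector):

    desc_cnt = 0   ->   2 slots -> 1 sector  (never returns 0)
    desc_cnt = 126 -> 128 slots -> 1 sector  (exactly fills the first sector)
    desc_cnt = 127 -> 129 slots -> 2 sectors
    desc_cnt = 254 -> 256 slots -> 2 sectors (126 in the first, 128 in the second)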
+ */ +static int vhdx_compute_desc_sectors(uint32_t desc_cnt) +{ + uint32_t desc_sectors; + + desc_cnt += 2; /* account for header in first sector */ + desc_sectors = desc_cnt / 128; + if (desc_cnt % 128) { + desc_sectors++; + } + + return desc_sectors; +} + + +/* Reads the log header, and subsequent descriptors (if any). This + * will allocate all the space for buffer, which must be NULL when + * passed into this function. Each descriptor will also be validated, + * and an error returned if any are invalid. */ +static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s, + VHDXLogEntries *log, VHDXLogDescEntries **buffer) +{ + int ret = 0; + uint32_t desc_sectors; + uint32_t sectors_read; + VHDXLogEntryHeader hdr; + VHDXLogDescEntries *desc_entries = NULL; + int i; + + assert(*buffer == NULL); + + ret = vhdx_log_peek_hdr(bs, log, &hdr); + if (ret < 0) { + goto exit; + } + vhdx_log_entry_hdr_le_import(&hdr); + if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) { + ret = -EINVAL; + goto exit; + } + + desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count); + desc_entries = qemu_blockalign(bs, desc_sectors * VHDX_LOG_SECTOR_SIZE); + + ret = vhdx_log_read_sectors(bs, log, &sectors_read, desc_entries, + desc_sectors, false); + if (ret < 0) { + goto free_and_exit; + } + if (sectors_read != desc_sectors) { + ret = -EINVAL; + goto free_and_exit; + } + + /* put in proper endianness, and validate each desc */ + for (i = 0; i < hdr.descriptor_count; i++) { + vhdx_log_desc_le_import(&desc_entries->desc[i]); + if (vhdx_log_desc_is_valid(&desc_entries->desc[i], &hdr) == false) { + ret = -EINVAL; + goto free_and_exit; + } + } + + *buffer = desc_entries; + goto exit; + +free_and_exit: + qemu_vfree(desc_entries); +exit: + return ret; +} + + +/* Flushes the descriptor described by desc to the VHDX image file. + * If the descriptor is a data descriptor, then 'data' must be non-NULL, + * and >= 4096 bytes (VHDX_LOG_SECTOR_SIZE), containing the data to be + * written. + * + * Verification is performed to make sure the sequence numbers of a data + * descriptor match the sequence number in the desc. + * + * For a zero descriptor, it may describe multiple sectors to fill with zeroes. + * In this case, it should be noted that zeroes are written to disk, and the + * image file is not extended as a sparse file.
*/ +static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc, + VHDXLogDataSector *data) +{ + int ret = 0; + uint64_t seq, file_offset; + uint32_t offset = 0; + void *buffer = NULL; + uint64_t count = 1; + int i; + + buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + + if (!memcmp(&desc->signature, "desc", 4)) { + /* data sector */ + if (data == NULL) { + ret = -EFAULT; + goto exit; + } + + /* The sequence number of the data sector must match that + * in the descriptor */ + seq = data->sequence_high; + seq <<= 32; + seq |= data->sequence_low & 0xffffffff; + + if (seq != desc->sequence_number) { + ret = -EINVAL; + goto exit; + } + + /* Each data sector is in total 4096 bytes, however the first + * 8 bytes, and last 4 bytes, are located in the descriptor */ + memcpy(buffer, &desc->leading_bytes, 8); + offset += 8; + + memcpy(buffer+offset, data->data, 4084); + offset += 4084; + + memcpy(buffer+offset, &desc->trailing_bytes, 4); + + } else if (!memcmp(&desc->signature, "zero", 4)) { + /* write 'count' sectors of zeroes */ + memset(buffer, 0, VHDX_LOG_SECTOR_SIZE); + count = desc->zero_length / VHDX_LOG_SECTOR_SIZE; + } + + file_offset = desc->file_offset; + + /* count is only > 1 if we are writing zeroes */ + for (i = 0; i < count; i++) { + ret = bdrv_pwrite_sync(bs->file, file_offset, buffer, + VHDX_LOG_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + file_offset += VHDX_LOG_SECTOR_SIZE; + } + +exit: + qemu_vfree(buffer); + return ret; +} + +/* Flush the entire log (as described by 'logs') to the VHDX image + * file, and then set the log to 'empty' status once complete. + * + * The log entries should be validated prior to flushing */ +static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s, + VHDXLogSequence *logs) +{ + int ret = 0; + int i; + uint32_t cnt, sectors_read; + uint64_t new_file_size; + void *data = NULL; + VHDXLogDescEntries *desc_entries = NULL; + VHDXLogEntryHeader hdr_tmp = { 0 }; + + cnt = logs->count; + + data = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + + ret = vhdx_user_visible_write(bs, s); + if (ret < 0) { + goto exit; + } + + /* each iteration represents one log sequence, which may span multiple + * sectors */ + while (cnt--) { + ret = vhdx_log_peek_hdr(bs, &logs->log, &hdr_tmp); + if (ret < 0) { + goto exit; + } + /* if the log shows a FlushedFileOffset larger than our current file + * size, then that means the file has been truncated / corrupted, and + * we must refuse to open it / use it */ + if (hdr_tmp.flushed_file_offset > bdrv_getlength(bs->file)) { + ret = -EINVAL; + goto exit; + } + + ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries); + if (ret < 0) { + goto exit; + } + + for (i = 0; i < desc_entries->hdr.descriptor_count; i++) { + if (!memcmp(&desc_entries->desc[i].signature, "desc", 4)) { + /* data sector, so read a sector to flush */ + ret = vhdx_log_read_sectors(bs, &logs->log, &sectors_read, + data, 1, false); + if (ret < 0) { + goto exit; + } + if (sectors_read != 1) { + ret = -EINVAL; + goto exit; + } + } + + ret = vhdx_log_flush_desc(bs, &desc_entries->desc[i], data); + if (ret < 0) { + goto exit; + } + } + if (bdrv_getlength(bs->file) < desc_entries->hdr.last_file_offset) { + new_file_size = desc_entries->hdr.last_file_offset; + if (new_file_size % (1024*1024)) { + /* round up to nearest 1MB boundary */ + new_file_size = ((new_file_size >> 20) + 1) << 20; + bdrv_truncate(bs->file, new_file_size); + } + } + qemu_vfree(desc_entries); + desc_entries = NULL; + } + + bdrv_flush(bs); + /* once the log is fully
flushed, indicate that we have an empty log + * now. This also sets the log guid to 0, to indicate an empty log */ + vhdx_log_reset(bs, s); + +exit: + qemu_vfree(data); + qemu_vfree(desc_entries); + return ret; +} + +static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s, + VHDXLogEntries *log, uint64_t seq, + bool *valid, VHDXLogEntryHeader *entry) +{ + int ret = 0; + VHDXLogEntryHeader hdr; + void *buffer = NULL; + uint32_t i, desc_sectors, total_sectors, crc; + uint32_t sectors_read = 0; + VHDXLogDescEntries *desc_buffer = NULL; + + *valid = false; + + ret = vhdx_log_peek_hdr(bs, log, &hdr); + if (ret < 0) { + goto inc_and_exit; + } + + vhdx_log_entry_hdr_le_import(&hdr); + + + if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) { + goto inc_and_exit; + } + + if (seq > 0) { + if (hdr.sequence_number != seq + 1) { + goto inc_and_exit; + } + } + + desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count); + + /* Read desc sectors, and calculate log checksum */ + + total_sectors = hdr.entry_length / VHDX_LOG_SECTOR_SIZE; + + + /* read_desc() will increment the read idx */ + ret = vhdx_log_read_desc(bs, s, log, &desc_buffer); + if (ret < 0) { + goto free_and_exit; + } + + crc = vhdx_checksum_calc(0xffffffff, (void *)desc_buffer, + desc_sectors * VHDX_LOG_SECTOR_SIZE, 4); + crc ^= 0xffffffff; + + buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + if (total_sectors > desc_sectors) { + for (i = 0; i < total_sectors - desc_sectors; i++) { + sectors_read = 0; + ret = vhdx_log_read_sectors(bs, log, &sectors_read, buffer, + 1, false); + if (ret < 0 || sectors_read != 1) { + goto free_and_exit; + } + crc = vhdx_checksum_calc(crc, buffer, VHDX_LOG_SECTOR_SIZE, -1); + crc ^= 0xffffffff; + } + } + crc ^= 0xffffffff; + if (crc != desc_buffer->hdr.checksum) { + goto free_and_exit; + } + + *valid = true; + *entry = hdr; + goto free_and_exit; + +inc_and_exit: + log->read = vhdx_log_inc_idx(log->read, log->length); + +free_and_exit: + qemu_vfree(buffer); + qemu_vfree(desc_buffer); + return ret; +} + +/* Search through the log circular buffer, and find the valid, active + * log sequence, if any exists */ +static int vhdx_log_search(BlockDriverState *bs, BDRVVHDXState *s, + VHDXLogSequence *logs) +{ + int ret = 0; + uint32_t tail; + bool seq_valid = false; + VHDXLogSequence candidate = { 0 }; + VHDXLogEntryHeader hdr = { 0 }; + VHDXLogEntries curr_log; + + memcpy(&curr_log, &s->log, sizeof(VHDXLogEntries)); + curr_log.write = curr_log.length; /* assume log is full */ + curr_log.read = 0; + + + /* now we will go through the whole log sector by sector, until + * we find a valid, active log sequence, or reach the end of the + * log buffer */ + for (;;) { + uint64_t curr_seq = 0; + VHDXLogSequence current = { 0 }; + + tail = curr_log.read; + + ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq, + &seq_valid, &hdr); + if (ret < 0) { + goto exit; + } + + if (seq_valid) { + current.valid = true; + current.log = curr_log; + current.log.read = tail; + current.log.write = curr_log.read; + current.count = 1; + current.hdr = hdr; + + + for (;;) { + ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq, + &seq_valid, &hdr); + if (ret < 0) { + goto exit; + } + if (seq_valid == false) { + break; + } + current.log.write = curr_log.read; + current.count++; + + curr_seq = hdr.sequence_number; + } + } + + if (current.valid) { + if (candidate.valid == false || + current.hdr.sequence_number > candidate.hdr.sequence_number) { + candidate = current; + } + } + + if (curr_log.read < tail) { + break;
+ } + } + + *logs = candidate; + + if (candidate.valid) { + /* this is the next sequence number, for writes */ + s->log.sequence = candidate.hdr.sequence_number + 1; + } + + +exit: + return ret; +} + +/* Parse the replay log. Per the VHDX spec, if the log is present + * it must be replayed prior to opening the file, even read-only. + * + * If read-only, we must replay the log in RAM (or refuse to open + * a dirty VHDX file read-only) */ +int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed, + Error **errp) +{ + int ret = 0; + VHDXHeader *hdr; + VHDXLogSequence logs = { 0 }; + + hdr = s->headers[s->curr_header]; + + *flushed = false; + + /* s->log.hdr is freed in vhdx_close() */ + if (s->log.hdr == NULL) { + s->log.hdr = qemu_blockalign(bs, sizeof(VHDXLogEntryHeader)); + } + + s->log.offset = hdr->log_offset; + s->log.length = hdr->log_length; + + if (s->log.offset < VHDX_LOG_MIN_SIZE || + s->log.offset % VHDX_LOG_MIN_SIZE) { + ret = -EINVAL; + goto exit; + } + + /* per spec, only log version of 0 is supported */ + if (hdr->log_version != 0) { + ret = -EINVAL; + goto exit; + } + + /* If either the log guid, or log length is zero, + * then a replay log is not present */ + if (guid_eq(hdr->log_guid, zero_guid)) { + goto exit; + } + + if (hdr->log_length == 0) { + goto exit; + } + + if (hdr->log_length % VHDX_LOG_MIN_SIZE) { + ret = -EINVAL; + goto exit; + } + + + /* The log is present, we need to find if and where there is an active + * sequence of valid entries present in the log. */ + + ret = vhdx_log_search(bs, s, &logs); + if (ret < 0) { + goto exit; + } + + if (logs.valid) { + if (bs->read_only) { + ret = -EPERM; + error_setg_errno(errp, EPERM, + "VHDX image file '%s' opened read-only, but " + "contains a log that needs to be replayed. To " + "replay the log, execute:\n qemu-img check -r " + "all '%s'", + bs->filename, bs->filename); + goto exit; + } + /* now flush the log */ + ret = vhdx_log_flush(bs, s, &logs); + if (ret < 0) { + goto exit; + } + *flushed = true; + } + + +exit: + return ret; +} + + + +static void vhdx_log_raw_to_le_sector(VHDXLogDescriptor *desc, + VHDXLogDataSector *sector, void *data, + uint64_t seq) +{ + /* 8 + 4084 + 4 = 4096, 1 log sector */ + memcpy(&desc->leading_bytes, data, 8); + data += 8; + cpu_to_le64s(&desc->leading_bytes); + memcpy(sector->data, data, 4084); + data += 4084; + memcpy(&desc->trailing_bytes, data, 4); + cpu_to_le32s(&desc->trailing_bytes); + data += 4; + + sector->sequence_high = (uint32_t) (seq >> 32); + sector->sequence_low = (uint32_t) (seq & 0xffffffff); + sector->data_signature = VHDX_LOG_DATA_SIGNATURE; + + vhdx_log_desc_le_export(desc); + vhdx_log_data_le_export(sector); +} + + +static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, + void *data, uint32_t length, uint64_t offset) +{ + int ret = 0; + void *buffer = NULL; + void *merged_sector = NULL; + void *data_tmp, *sector_write; + unsigned int i; + int sector_offset; + uint32_t desc_sectors, sectors, total_length; + uint32_t sectors_written = 0; + uint32_t aligned_length; + uint32_t leading_length = 0; + uint32_t trailing_length = 0; + uint32_t partial_sectors = 0; + uint32_t bytes_written = 0; + uint64_t file_offset; + VHDXHeader *header; + VHDXLogEntryHeader new_hdr; + VHDXLogDescriptor *new_desc = NULL; + VHDXLogDataSector *data_sector = NULL; + MSGUID new_guid = { 0 }; + + header = s->headers[s->curr_header]; + + /* need to have offset read data, and be on 4096 byte boundary */ + + if (length > header->log_length) { + /* no log present. 
we could create a log here instead of failing */ + ret = -EINVAL; + goto exit; + } + + if (guid_eq(header->log_guid, zero_guid)) { + vhdx_guid_generate(&new_guid); + vhdx_update_headers(bs, s, false, &new_guid); + } else { + /* currently, we require that the log be flushed after + * every write. */ + ret = -ENOTSUP; + goto exit; + } + + /* 0 is an invalid sequence number, but may also represent the first + * log write (or a wrapped seq) */ + if (s->log.sequence == 0) { + s->log.sequence = 1; + } + + sector_offset = offset % VHDX_LOG_SECTOR_SIZE; + file_offset = (offset / VHDX_LOG_SECTOR_SIZE) * VHDX_LOG_SECTOR_SIZE; + + aligned_length = length; + + /* add in the unaligned head and tail bytes */ + if (sector_offset) { + leading_length = (VHDX_LOG_SECTOR_SIZE - sector_offset); + leading_length = leading_length > length ? length : leading_length; + aligned_length -= leading_length; + partial_sectors++; + } + + sectors = aligned_length / VHDX_LOG_SECTOR_SIZE; + trailing_length = aligned_length - (sectors * VHDX_LOG_SECTOR_SIZE); + if (trailing_length) { + partial_sectors++; + } + + sectors += partial_sectors; + + /* sectors is now how many sectors the data itself takes, not + * including the header and descriptor metadata */ + + new_hdr = (VHDXLogEntryHeader) { + .signature = VHDX_LOG_SIGNATURE, + .tail = s->log.tail, + .sequence_number = s->log.sequence, + .descriptor_count = sectors, + .reserved = 0, + .flushed_file_offset = bdrv_getlength(bs->file), + .last_file_offset = bdrv_getlength(bs->file), + }; + + new_hdr.log_guid = header->log_guid; + + desc_sectors = vhdx_compute_desc_sectors(new_hdr.descriptor_count); + + total_length = (desc_sectors + sectors) * VHDX_LOG_SECTOR_SIZE; + new_hdr.entry_length = total_length; + + vhdx_log_entry_hdr_le_export(&new_hdr); + + buffer = qemu_blockalign(bs, total_length); + memcpy(buffer, &new_hdr, sizeof(new_hdr)); + + new_desc = (VHDXLogDescriptor *) (buffer + sizeof(new_hdr)); + data_sector = buffer + (desc_sectors * VHDX_LOG_SECTOR_SIZE); + data_tmp = data; + + /* All log sectors are 4KB, so for any partial sectors we must + * merge the data with preexisting data from the final file + * destination */ + merged_sector = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + + for (i = 0; i < sectors; i++) { + new_desc->signature = VHDX_LOG_DESC_SIGNATURE; + new_desc->sequence_number = s->log.sequence; + new_desc->file_offset = file_offset; + + if (i == 0 && leading_length) { + /* partial sector at the front of the buffer */ + ret = bdrv_pread(bs->file, file_offset, merged_sector, + VHDX_LOG_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + memcpy(merged_sector + sector_offset, data_tmp, leading_length); + bytes_written = leading_length; + sector_write = merged_sector; + } else if (i == sectors - 1 && trailing_length) { + /* partial sector at the end of the buffer */ + ret = bdrv_pread(bs->file, + file_offset, + merged_sector + trailing_length, + VHDX_LOG_SECTOR_SIZE - trailing_length); + if (ret < 0) { + goto exit; + } + memcpy(merged_sector, data_tmp, trailing_length); + bytes_written = trailing_length; + sector_write = merged_sector; + } else { + bytes_written = VHDX_LOG_SECTOR_SIZE; + sector_write = data_tmp; + } + + /* populate the raw sector data into the proper structures, + * as well as update the descriptor, and convert to proper + * endianness */ + vhdx_log_raw_to_le_sector(new_desc, data_sector, sector_write, + s->log.sequence); + + data_tmp += bytes_written; + data_sector++; + new_desc++; + file_offset += VHDX_LOG_SECTOR_SIZE; + } + + /* checksum 
covers entire entry, from the log header through the + * last data sector */ + vhdx_update_checksum(buffer, total_length, + offsetof(VHDXLogEntryHeader, checksum)); + cpu_to_le32s((uint32_t *)(buffer + 4)); + + /* now write to the log */ + ret = vhdx_log_write_sectors(bs, &s->log, &sectors_written, buffer, + desc_sectors + sectors); + if (ret < 0) { + goto exit; + } + + if (sectors_written != desc_sectors + sectors) { + /* instead of failing, we could flush the log here */ + ret = -EINVAL; + goto exit; + } + + s->log.sequence++; + /* write new tail */ + s->log.tail = s->log.write; + +exit: + qemu_vfree(buffer); + qemu_vfree(merged_sector); + return ret; +} + +/* Perform a log write, and then immediately flush the entire log */ +int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, + void *data, uint32_t length, uint64_t offset) +{ + int ret = 0; + VHDXLogSequence logs = { .valid = true, + .count = 1, + .hdr = { 0 } }; + + + /* Make sure data written (new and/or changed blocks) is stable + * on disk, before creating log entry */ + bdrv_flush(bs); + ret = vhdx_log_write(bs, s, data, length, offset); + if (ret < 0) { + goto exit; + } + logs.log = s->log; + + /* Make sure log is stable on disk */ + bdrv_flush(bs); + ret = vhdx_log_flush(bs, s, &logs); + if (ret < 0) { + goto exit; + } + + s->log = logs.log; + +exit: + return ret; +} + diff --git a/block/vhdx.c b/block/vhdx.c index e9704b1fd..509baaf48 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -6,9 +6,9 @@ * Authors: * Jeff Cody <jcody@redhat.com> * - * This is based on the "VHDX Format Specification v0.95", published 4/12/2012 + * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 * by Microsoft: - * https://www.microsoft.com/en-us/download/details.aspx?id=29681 + * https://www.microsoft.com/en-us/download/details.aspx?id=34750 * * This work is licensed under the terms of the GNU LGPL, version 2 or later. * See the COPYING.LIB file in the top-level directory. @@ -20,7 +20,22 @@ #include "qemu/module.h" #include "qemu/crc32c.h" #include "block/vhdx.h" +#include "migration/migration.h" +#include <uuid/uuid.h> +#include <glib.h> + +/* Options for VHDX creation */ + +#define VHDX_BLOCK_OPT_LOG_SIZE "log_size" +#define VHDX_BLOCK_OPT_BLOCK_SIZE "block_size" +#define VHDX_BLOCK_OPT_ZERO "block_state_zero" + +typedef enum VHDXImageType { + VHDX_TYPE_DYNAMIC = 0, + VHDX_TYPE_FIXED, + VHDX_TYPE_DIFFERENCING, /* Currently unsupported */ +} VHDXImageType; /* Several metadata and region table data entries are identified by * guids in a MS-specific GUID format. */ @@ -103,16 +118,6 @@ static const MSGUID parent_vhdx_guid = { .data1 = 0xb04aefb7, META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \ META_PHYS_SECTOR_SIZE_PRESENT) -typedef struct VHDXMetadataEntries { - VHDXMetadataTableEntry file_parameters_entry; - VHDXMetadataTableEntry virtual_disk_size_entry; - VHDXMetadataTableEntry page83_data_entry; - VHDXMetadataTableEntry logical_sector_size_entry; - VHDXMetadataTableEntry phys_sector_size_entry; - VHDXMetadataTableEntry parent_locator_entry; - uint16_t present; -} VHDXMetadataEntries; - typedef struct VHDXSectorInfo { uint32_t bat_idx; /* BAT entry index */ @@ -123,43 +128,31 @@ typedef struct VHDXSectorInfo { uint64_t block_offset; /* block offset, in bytes */ } VHDXSectorInfo; +/* Calculates new checksum.
+ * + * Zero is substituted during crc calculation for the original crc field + * crc_offset: byte offset in buf of the buffer crc + * buf: buffer pointer + * size: size of buffer (must be > crc_offset+4) + * + * Note: The resulting checksum is in the CPU endianness, not necessarily + * in the file format endianness (LE). Any header export to disk should + * make sure that vhdx_header_le_export() is used to convert to the + * correct endianness + */ +uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset) +{ + uint32_t crc; + assert(buf != NULL); + assert(size > (crc_offset + sizeof(crc))); -typedef struct BDRVVHDXState { - CoMutex lock; - - int curr_header; - VHDXHeader *headers[2]; - - VHDXRegionTableHeader rt; - VHDXRegionTableEntry bat_rt; /* region table for the BAT */ - VHDXRegionTableEntry metadata_rt; /* region table for the metadata */ - - VHDXMetadataTableHeader metadata_hdr; - VHDXMetadataEntries metadata_entries; - - VHDXFileParameters params; - uint32_t block_size; - uint32_t block_size_bits; - uint32_t sectors_per_block; - uint32_t sectors_per_block_bits; - - uint64_t virtual_disk_size; - uint32_t logical_sector_size; - uint32_t physical_sector_size; - - uint64_t chunk_ratio; - uint32_t chunk_ratio_bits; - uint32_t logical_sector_size_bits; - - uint32_t bat_entries; - VHDXBatEntry *bat; - uint64_t bat_offset; - - VHDXParentLocatorHeader parent_header; - VHDXParentLocatorEntry *parent_entries; + memset(buf + crc_offset, 0, sizeof(crc)); + crc = crc32c(0xffffffff, buf, size); + memcpy(buf + crc_offset, &crc, sizeof(crc)); -} BDRVVHDXState; + return crc; +} uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size, int crc_offset) @@ -212,6 +205,71 @@ bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset) /* + * This generates a UUID that is compliant with the MS GUIDs used + * in the VHDX spec (and elsewhere). 
+ */ +void vhdx_guid_generate(MSGUID *guid) +{ + uuid_t uuid; + assert(guid != NULL); + + uuid_generate(uuid); + memcpy(guid, uuid, sizeof(MSGUID)); +} + +/* Check for region overlaps inside the VHDX image */ +static int vhdx_region_check(BDRVVHDXState *s, uint64_t start, uint64_t length) +{ + int ret = 0; + uint64_t end; + VHDXRegionEntry *r; + + end = start + length; + QLIST_FOREACH(r, &s->regions, entries) { + if (!((start >= r->end) || (end <= r->start))) { + ret = -EINVAL; + goto exit; + } + } + +exit: + return ret; +} + +/* Register a region for future checks */ +static void vhdx_region_register(BDRVVHDXState *s, + uint64_t start, uint64_t length) +{ + VHDXRegionEntry *r; + + r = g_malloc0(sizeof(*r)); + + r->start = start; + r->end = start + length; + + QLIST_INSERT_HEAD(&s->regions, r, entries); +} + +/* Free all registered regions */ +static void vhdx_region_unregister_all(BDRVVHDXState *s) +{ + VHDXRegionEntry *r, *r_next; + + QLIST_FOREACH_SAFE(r, &s->regions, entries, r_next) { + QLIST_REMOVE(r, entries); + g_free(r); + } +} + +static void vhdx_set_shift_bits(BDRVVHDXState *s) +{ + s->logical_sector_size_bits = 31 - clz32(s->logical_sector_size); + s->sectors_per_block_bits = 31 - clz32(s->sectors_per_block); + s->chunk_ratio_bits = 63 - clz64(s->chunk_ratio); + s->block_size_bits = 31 - clz32(s->block_size); +} + +/* * Per the MS VHDX Specification, for every VHDX file: * - The header section is fixed size - 1 MB * - The header section is always the first "object" @@ -230,30 +288,124 @@ static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -/* All VHDX structures on disk are little endian */ -static void vhdx_header_le_import(VHDXHeader *h) +/* + * Writes the header to the specified offset. + * + * This will optionally read in buffer data from disk (otherwise zero-fill), + * and then update the header checksum. Header is converted to proper + * endianness before being written to the specified file offset + */ +static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr, + uint64_t offset, bool read) +{ + uint8_t *buffer = NULL; + int ret; + VHDXHeader header_le; + + assert(bs_file != NULL); + assert(hdr != NULL); + + /* the header checksum is not over just the packed size of VHDXHeader, + * but rather over the entire 'reserved' range for the header, which is + * 4KB (VHDX_HEADER_SIZE). */ + + buffer = qemu_blockalign(bs_file, VHDX_HEADER_SIZE); + if (read) { + /* if true, we can't assume the extra reserved bytes are 0 */ + ret = bdrv_pread(bs_file, offset, buffer, VHDX_HEADER_SIZE); + if (ret < 0) { + goto exit; + } + } else { + memset(buffer, 0, VHDX_HEADER_SIZE); + } + + /* overwrite the actual VHDXHeader portion */ + memcpy(buffer, hdr, sizeof(VHDXHeader)); + hdr->checksum = vhdx_update_checksum(buffer, VHDX_HEADER_SIZE, + offsetof(VHDXHeader, checksum)); + vhdx_header_le_export(hdr, &header_le); + ret = bdrv_pwrite_sync(bs_file, offset, &header_le, sizeof(VHDXHeader)); + +exit: + qemu_vfree(buffer); + return ret; +} + +/* Update the VHDX headers + * + * This follows the VHDX spec procedures for header updates. 
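+ * (sequence numbers decide which header is current, so writing the
+ * inactive copy first means a torn update leaves the previous header
+ * intact, still selectable by its checksum and sequence number)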
+ * + * - non-current header is updated with largest sequence number + */ +static int vhdx_update_header(BlockDriverState *bs, BDRVVHDXState *s, + bool generate_data_write_guid, MSGUID *log_guid) { - assert(h != NULL); + int ret = 0; + int hdr_idx = 0; + uint64_t header_offset = VHDX_HEADER1_OFFSET; + + VHDXHeader *active_header; + VHDXHeader *inactive_header; + + /* operate on the non-current header */ + if (s->curr_header == 0) { + hdr_idx = 1; + header_offset = VHDX_HEADER2_OFFSET; + } + + active_header = s->headers[s->curr_header]; + inactive_header = s->headers[hdr_idx]; - le32_to_cpus(&h->signature); - le32_to_cpus(&h->checksum); - le64_to_cpus(&h->sequence_number); + inactive_header->sequence_number = active_header->sequence_number + 1; - leguid_to_cpus(&h->file_write_guid); - leguid_to_cpus(&h->data_write_guid); - leguid_to_cpus(&h->log_guid); + /* a new file guid must be generated before any file write, including + * headers */ + inactive_header->file_write_guid = s->session_guid; + + /* a new data guid only needs to be generated before any guest-visible + * writes (i.e. something observable via virtual disk read) */ + if (generate_data_write_guid) { + vhdx_guid_generate(&inactive_header->data_write_guid); + } - le16_to_cpus(&h->log_version); - le16_to_cpus(&h->version); - le32_to_cpus(&h->log_length); - le64_to_cpus(&h->log_offset); + /* update the log guid if present */ + if (log_guid) { + inactive_header->log_guid = *log_guid; + } + + ret = vhdx_write_header(bs->file, inactive_header, header_offset, true); + if (ret < 0) { + goto exit; + } + s->curr_header = hdr_idx; + +exit: + return ret; } +/* + * The VHDX spec calls for header updates to be performed twice, so that both + * the current and non-current header have valid info + */ +int vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s, + bool generate_data_write_guid, MSGUID *log_guid) +{ + int ret; + + ret = vhdx_update_header(bs, s, generate_data_write_guid, log_guid); + if (ret < 0) { + return ret; + } + ret = vhdx_update_header(bs, s, generate_data_write_guid, log_guid); + return ret; +} /* opens the specified header block from the VHDX file header section */ -static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s) +static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s, + Error **errp) { - int ret = 0; + int ret; VHDXHeader *header1; VHDXHeader *header2; bool h1_valid = false; @@ -262,6 +414,7 @@ static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s) uint64_t h2_seq = 0; uint8_t *buffer; + /* header1 & header2 are freed in vhdx_close() */ header1 = qemu_blockalign(bs, sizeof(VHDXHeader)); header2 = qemu_blockalign(bs, sizeof(VHDXHeader)); @@ -310,7 +463,6 @@ static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s) } else if (!h1_valid && h2_valid) { s->curr_header = 1; } else if (!h1_valid && !h2_valid) { - ret = -EINVAL; goto fail; } else { /* If both headers are valid, then we choose the active one by the @@ -321,24 +473,22 @@ static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s) } else if (h2_seq > h1_seq) { s->curr_header = 1; } else { - ret = -EINVAL; goto fail; } } - ret = 0; - + vhdx_region_register(s, s->headers[s->curr_header]->log_offset, + s->headers[s->curr_header]->log_length); goto exit; fail: - qerror_report(ERROR_CLASS_GENERIC_ERROR, "No valid VHDX header found"); + error_setg_errno(errp, -ret, "No valid VHDX header found"); qemu_vfree(header1); qemu_vfree(header2); s->headers[0] = NULL; s->headers[1] = NULL; exit: 
qemu_vfree(buffer); - return ret; } @@ -362,10 +512,7 @@ static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s) goto fail; } memcpy(&s->rt, buffer, sizeof(s->rt)); - le32_to_cpus(&s->rt.signature); - le32_to_cpus(&s->rt.checksum); - le32_to_cpus(&s->rt.entry_count); - le32_to_cpus(&s->rt.reserved); + vhdx_region_header_le_import(&s->rt); offset += sizeof(s->rt); if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) || @@ -384,10 +531,16 @@ static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s) memcpy(&rt_entry, buffer + offset, sizeof(rt_entry)); offset += sizeof(rt_entry); - leguid_to_cpus(&rt_entry.guid); - le64_to_cpus(&rt_entry.file_offset); - le32_to_cpus(&rt_entry.length); - le32_to_cpus(&rt_entry.data_bits); + vhdx_region_entry_le_import(&rt_entry); + + /* check for region overlap between these entries, and any + * other memory regions in the file */ + ret = vhdx_region_check(s, rt_entry.file_offset, rt_entry.length); + if (ret < 0) { + goto fail; + } + + vhdx_region_register(s, rt_entry.file_offset, rt_entry.length); /* see if we recognize the entry */ if (guid_eq(rt_entry.guid, bat_guid)) { @@ -419,6 +572,12 @@ static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s) goto fail; } } + + if (!bat_rt_found || !metadata_rt_found) { + ret = -EINVAL; + goto fail; + } + ret = 0; fail: @@ -462,9 +621,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) memcpy(&s->metadata_hdr, buffer, sizeof(s->metadata_hdr)); offset += sizeof(s->metadata_hdr); - le64_to_cpus(&s->metadata_hdr.signature); - le16_to_cpus(&s->metadata_hdr.reserved); - le16_to_cpus(&s->metadata_hdr.entry_count); + vhdx_metadata_header_le_import(&s->metadata_hdr); if (memcmp(&s->metadata_hdr.signature, "metadata", 8)) { ret = -EINVAL; @@ -483,11 +640,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) memcpy(&md_entry, buffer + offset, sizeof(md_entry)); offset += sizeof(md_entry); - leguid_to_cpus(&md_entry.item_id); - le32_to_cpus(&md_entry.offset); - le32_to_cpus(&md_entry.length); - le32_to_cpus(&md_entry.data_bits); - le32_to_cpus(&md_entry.reserved2); + vhdx_metadata_entry_le_import(&md_entry); if (guid_eq(md_entry.item_id, file_param_guid)) { if (s->metadata_entries.present & META_FILE_PARAMETER_PRESENT) { @@ -627,12 +780,20 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) le32_to_cpus(&s->logical_sector_size); le32_to_cpus(&s->physical_sector_size); - if (s->logical_sector_size == 0 || s->params.block_size == 0) { + if (s->params.block_size < VHDX_BLOCK_SIZE_MIN || + s->params.block_size > VHDX_BLOCK_SIZE_MAX) { ret = -EINVAL; goto exit; } - /* both block_size and sector_size are guaranteed powers of 2 */ + /* only 2 supported sector sizes */ + if (s->logical_sector_size != 512 && s->logical_sector_size != 4096) { + ret = -EINVAL; + goto exit; + } + + /* Both block_size and sector_size are guaranteed powers of 2, below. 
+ Due to range checks above, s->sectors_per_block can never be < 256 */ s->sectors_per_block = s->params.block_size / s->logical_sector_size; s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) * (uint64_t)s->logical_sector_size / @@ -660,10 +821,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) goto exit; } - s->logical_sector_size_bits = 31 - clz32(s->logical_sector_size); - s->sectors_per_block_bits = 31 - clz32(s->sectors_per_block); - s->chunk_ratio_bits = 63 - clz64(s->chunk_ratio); - s->block_size_bits = 31 - clz32(s->block_size); + vhdx_set_shift_bits(s); ret = 0; @@ -672,61 +830,64 @@ exit: return ret; } -/* Parse the replay log. Per the VHDX spec, if the log is present - * it must be replayed prior to opening the file, even read-only. - * - * If read-only, we must replay the log in RAM (or refuse to open - * a dirty VHDX file read-only */ -static int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s) +/* + * Calculate the number of BAT entries, including sector + * bitmap entries. + */ +static void vhdx_calc_bat_entries(BDRVVHDXState *s) { - int ret = 0; - int i; - VHDXHeader *hdr; - - hdr = s->headers[s->curr_header]; + uint32_t data_blocks_cnt, bitmap_blocks_cnt; - /* either the log guid, or log length is zero, - * then a replay log is present */ - for (i = 0; i < sizeof(hdr->log_guid.data4); i++) { - ret |= hdr->log_guid.data4[i]; - } - if (hdr->log_guid.data1 == 0 && - hdr->log_guid.data2 == 0 && - hdr->log_guid.data3 == 0 && - ret == 0) { - goto exit; + data_blocks_cnt = s->virtual_disk_size >> s->block_size_bits; + if (s->virtual_disk_size - (data_blocks_cnt << s->block_size_bits)) { + data_blocks_cnt++; } - - /* per spec, only log version of 0 is supported */ - if (hdr->log_version != 0) { - ret = -EINVAL; - goto exit; + bitmap_blocks_cnt = data_blocks_cnt >> s->chunk_ratio_bits; + if (data_blocks_cnt - (bitmap_blocks_cnt << s->chunk_ratio_bits)) { + bitmap_blocks_cnt++; } - if (hdr->log_length == 0) { - goto exit; + if (s->parent_entries) { + s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1); + } else { + s->bat_entries = data_blocks_cnt + + ((data_blocks_cnt - 1) >> s->chunk_ratio_bits); } - /* We currently do not support images with logs to replay */ - ret = -ENOTSUP; - -exit: - return ret; } +static void vhdx_close(BlockDriverState *bs) +{ + BDRVVHDXState *s = bs->opaque; + qemu_vfree(s->headers[0]); + s->headers[0] = NULL; + qemu_vfree(s->headers[1]); + s->headers[1] = NULL; + qemu_vfree(s->bat); + s->bat = NULL; + qemu_vfree(s->parent_entries); + s->parent_entries = NULL; + migrate_del_blocker(s->migration_blocker); + error_free(s->migration_blocker); + qemu_vfree(s->log.hdr); + s->log.hdr = NULL; + vhdx_region_unregister_all(s); +} -static int vhdx_open(BlockDriverState *bs, QDict *options, int flags) +static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVVHDXState *s = bs->opaque; int ret = 0; uint32_t i; uint64_t signature; - uint32_t data_blocks_cnt, bitmap_blocks_cnt; - + Error *local_err = NULL; s->bat = NULL; + s->first_visible_write = true; qemu_co_mutex_init(&s->lock); + QLIST_INIT(&s->regions); /* validate the file signature */ ret = bdrv_pread(bs->file, 0, &signature, sizeof(uint64_t)); @@ -738,46 +899,40 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } - ret = vhdx_parse_header(bs, s); - if (ret) { + /* This is used for any header updates, for the file_write_guid. 
+ * The spec dictates that a new value should be used for the first + * header update */ + vhdx_guid_generate(&s->session_guid); + + vhdx_parse_header(bs, s, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); + ret = -EINVAL; goto fail; } - ret = vhdx_parse_log(bs, s); - if (ret) { + ret = vhdx_parse_log(bs, s, &s->log_replayed_on_open, errp); + if (ret < 0) { goto fail; } ret = vhdx_open_region_tables(bs, s); - if (ret) { + if (ret < 0) { goto fail; } ret = vhdx_parse_metadata(bs, s); - if (ret) { + if (ret < 0) { goto fail; } + s->block_size = s->params.block_size; /* the VHDX spec dictates that virtual_disk_size is always a multiple of * logical_sector_size */ bs->total_sectors = s->virtual_disk_size >> s->logical_sector_size_bits; - data_blocks_cnt = s->virtual_disk_size >> s->block_size_bits; - if (s->virtual_disk_size - (data_blocks_cnt << s->block_size_bits)) { - data_blocks_cnt++; - } - bitmap_blocks_cnt = data_blocks_cnt >> s->chunk_ratio_bits; - if (data_blocks_cnt - (bitmap_blocks_cnt << s->chunk_ratio_bits)) { - bitmap_blocks_cnt++; - } - - if (s->parent_entries) { - s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1); - } else { - s->bat_entries = data_blocks_cnt + - ((data_blocks_cnt - 1) >> s->chunk_ratio_bits); - } + vhdx_calc_bat_entries(s); s->bat_offset = s->bat_rt.file_offset; @@ -787,6 +942,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } + /* s->bat is freed in vhdx_close() */ s->bat = qemu_blockalign(bs, s->bat_rt.length); ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length); @@ -794,23 +950,46 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } + uint64_t payblocks = s->chunk_ratio; + /* endian convert, and verify populated BAT field file offsets against + * region table and log entries */ for (i = 0; i < s->bat_entries; i++) { le64_to_cpus(&s->bat[i]); + if (payblocks--) { + /* payload bat entries */ + if ((s->bat[i] & VHDX_BAT_STATE_BIT_MASK) == + PAYLOAD_BLOCK_FULLY_PRESENT) { + ret = vhdx_region_check(s, s->bat[i] & VHDX_BAT_FILE_OFF_MASK, + s->block_size); + if (ret < 0) { + goto fail; + } + } + } else { + payblocks = s->chunk_ratio; + /* Once differencing files are supported, verify sector bitmap + * blocks here */ + } } if (flags & BDRV_O_RDWR) { - ret = -ENOTSUP; - goto fail; + ret = vhdx_update_headers(bs, s, false, NULL); + if (ret < 0) { + goto fail; + } } - /* TODO: differencing files, write */ + /* TODO: differencing files */ + + /* Disable migration when VHDX images are used */ + error_set(&s->migration_blocker, + QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + "vhdx", bs->device_name, "live migration"); + migrate_add_blocker(s->migration_blocker); return 0; fail: - qemu_vfree(s->headers[0]); - qemu_vfree(s->headers[1]); - qemu_vfree(s->bat); - qemu_vfree(s->parent_entries); + vhdx_close(bs); return ret; } @@ -850,7 +1029,7 @@ static void vhdx_block_translate(BDRVVHDXState *s, int64_t sector_num, sinfo->bytes_avail = sinfo->sectors_avail << s->logical_sector_size_bits; - sinfo->file_offset = s->bat[sinfo->bat_idx] >> VHDX_BAT_FILE_OFF_BITS; + sinfo->file_offset = s->bat[sinfo->bat_idx] & VHDX_BAT_FILE_OFF_MASK; sinfo->block_offset = block_offset << s->logical_sector_size_bits; @@ -864,11 +1043,22 @@ static void vhdx_block_translate(BDRVVHDXState *s, int64_t sector_num, * in the block, and add in the payload data block offset * in the file, in bytes, to get the final read address */ - sinfo->file_offset <<= 20; /* now in bytes, rather than 
1MB units */ sinfo->file_offset += sinfo->block_offset; } +static int vhdx_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVVHDXState *s = bs->opaque; + + bdi->cluster_size = s->block_size; + + bdi->unallocated_blocks_are_zero = + (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) == 0; + + return 0; +} + static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) @@ -905,7 +1095,7 @@ static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num, /* return zero */ qemu_iovec_memset(&hd_qiov, 0, 0, sinfo.bytes_avail); break; - case PAYLOAD_BLOCK_FULL_PRESENT: + case PAYLOAD_BLOCK_FULLY_PRESENT: qemu_co_mutex_unlock(&s->lock); ret = bdrv_co_readv(bs->file, sinfo.file_offset >> BDRV_SECTOR_BITS, @@ -935,24 +1125,792 @@ exit: return ret; } +/* + * Allocate a new payload block at the end of the file. + * + * Allocation will happen at 1MB alignment inside the file + * + * Returns the file offset start of the new payload block + */ +static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s, + uint64_t *new_offset) +{ + *new_offset = bdrv_getlength(bs->file); + + /* per the spec, the address for a block is in units of 1MB */ + *new_offset = ROUND_UP(*new_offset, 1024 * 1024); + + return bdrv_truncate(bs->file, *new_offset + s->block_size); +} + +/* + * Update the BAT table entry with the new file offset, and the new entry + * state */ +static void vhdx_update_bat_table_entry(BlockDriverState *bs, BDRVVHDXState *s, + VHDXSectorInfo *sinfo, + uint64_t *bat_entry_le, + uint64_t *bat_offset, int state) +{ + /* The BAT entry is a uint64, with 44 bits for the file offset in units of + * 1MB, and 3 bits for the block state. */ + s->bat[sinfo->bat_idx] = sinfo->file_offset; + + s->bat[sinfo->bat_idx] |= state & VHDX_BAT_STATE_BIT_MASK; + *bat_entry_le = cpu_to_le64(s->bat[sinfo->bat_idx]); + *bat_offset = s->bat_offset + sinfo->bat_idx * sizeof(VHDXBatEntry); + +} + +/* Per the spec, on the first write of guest-visible data to the file the + * data write guid must be updated in the header */ +int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s) +{ + int ret = 0; + if (s->first_visible_write) { + s->first_visible_write = false; + ret = vhdx_update_headers(bs, s, true, NULL); + } + return ret; +} static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return -ENOTSUP; + int ret = -ENOTSUP; + BDRVVHDXState *s = bs->opaque; + VHDXSectorInfo sinfo; + uint64_t bytes_done = 0; + uint64_t bat_entry = 0; + uint64_t bat_entry_offset = 0; + QEMUIOVector hd_qiov; + struct iovec iov1 = { 0 }; + struct iovec iov2 = { 0 }; + int sectors_to_write; + int bat_state; + uint64_t bat_prior_offset = 0; + bool bat_update = false; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + qemu_co_mutex_lock(&s->lock); + + ret = vhdx_user_visible_write(bs, s); + if (ret < 0) { + goto exit; + } + + while (nb_sectors > 0) { + bool use_zero_buffers = false; + bat_update = false; + if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) { + /* not supported yet */ + ret = -ENOTSUP; + goto exit; + } else { + vhdx_block_translate(s, sector_num, nb_sectors, &sinfo); + sectors_to_write = sinfo.sectors_avail; + + qemu_iovec_reset(&hd_qiov); + /* check the payload block state */ + bat_state = s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK; + switch (bat_state) { + case PAYLOAD_BLOCK_ZERO: + /* in this case, we need to preserve zero writes for + * data that is not part of this write, so 
we must pad + * the rest of the buffer to zeroes */ + + /* if we are on a posix system with ftruncate() that extends + * a file, then it is zero-filled for us. On Win32, the raw + * layer uses SetFilePointer and SetFileEnd, which does not + * zero fill AFAIK */ + + /* Queue another write of zero buffers if the underlying file + * does not zero-fill on file extension */ + + if (bdrv_has_zero_init(bs->file) == 0) { + use_zero_buffers = true; + + /* zero fill the front, if any */ + if (sinfo.block_offset) { + iov1.iov_len = sinfo.block_offset; + iov1.iov_base = qemu_blockalign(bs, iov1.iov_len); + memset(iov1.iov_base, 0, iov1.iov_len); + qemu_iovec_concat_iov(&hd_qiov, &iov1, 1, 0, + sinfo.block_offset); + sectors_to_write += iov1.iov_len >> BDRV_SECTOR_BITS; + } + + /* our actual data */ + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, + sinfo.bytes_avail); + + /* zero fill the back, if any */ + if ((sinfo.bytes_avail - sinfo.block_offset) < + s->block_size) { + iov2.iov_len = s->block_size - + (sinfo.bytes_avail + sinfo.block_offset); + iov2.iov_base = qemu_blockalign(bs, iov2.iov_len); + memset(iov2.iov_base, 0, iov2.iov_len); + qemu_iovec_concat_iov(&hd_qiov, &iov2, 1, 0, + sinfo.block_offset); + sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS; + } + } + + /* fall through */ + case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */ + case PAYLOAD_BLOCK_UNMAPPED: /* fall through */ + case PAYLOAD_BLOCK_UNDEFINED: /* fall through */ + bat_prior_offset = sinfo.file_offset; + ret = vhdx_allocate_block(bs, s, &sinfo.file_offset); + if (ret < 0) { + goto exit; + } + /* once we support differencing files, this may also be + * partially present */ + /* update block state to the newly specified state */ + vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry, + &bat_entry_offset, + PAYLOAD_BLOCK_FULLY_PRESENT); + bat_update = true; + /* since we just allocated a block, file_offset is the + * beginning of the payload block. It needs to be the + * write address, which includes the offset into the block */ + if (!use_zero_buffers) { + sinfo.file_offset += sinfo.block_offset; + } + /* fall through */ + case PAYLOAD_BLOCK_FULLY_PRESENT: + /* if the file offset address is in the header zone, + * there is a problem */ + if (sinfo.file_offset < (1024 * 1024)) { + ret = -EFAULT; + goto error_bat_restore; + } + + if (!use_zero_buffers) { + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, + sinfo.bytes_avail); + } + /* block exists, so we can just overwrite it */ + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_writev(bs->file, + sinfo.file_offset >> BDRV_SECTOR_BITS, + sectors_to_write, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto error_bat_restore; + } + break; + case PAYLOAD_BLOCK_PARTIALLY_PRESENT: + /* we don't yet support difference files, fall through + * to error */ + default: + ret = -EIO; + goto exit; + break; + } + + if (bat_update) { + /* this will update the BAT entry into the log journal, and + * then flush the log journal out to disk */ + ret = vhdx_log_write_and_flush(bs, s, &bat_entry, + sizeof(VHDXBatEntry), + bat_entry_offset); + if (ret < 0) { + goto exit; + } + } + + nb_sectors -= sinfo.sectors_avail; + sector_num += sinfo.sectors_avail; + bytes_done += sinfo.bytes_avail; + + } + } + + goto exit; + +error_bat_restore: + if (bat_update) { + /* keep metadata in sync, and restore the bat entry state + * if error. 
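+         * bat_prior_offset holds the file offset saved before the block
+         * allocation, and bat_state the entry state read at the top of the
+         * loop, so rewriting the entry here undoes the aborted update.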
*/
+        sinfo.file_offset = bat_prior_offset;
+        vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry,
+                                    &bat_entry_offset, bat_state);
+    }
+exit:
+    qemu_vfree(iov1.iov_base);
+    qemu_vfree(iov2.iov_base);
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&hd_qiov);
+    return ret;
 }
 
-static void vhdx_close(BlockDriverState *bs)
+
+/*
+ * Create VHDX Headers
+ *
+ * There are 2 headers, and the highest sequence number will represent
+ * the active header
+ */
+static int vhdx_create_new_headers(BlockDriverState *bs, uint64_t image_size,
+                                   uint32_t log_size)
+{
+    int ret = 0;
+    VHDXHeader *hdr = NULL;
+
+    hdr = g_malloc0(sizeof(VHDXHeader));
+
+    hdr->signature = VHDX_HEADER_SIGNATURE;
+    hdr->sequence_number = g_random_int();
+    hdr->log_version = 0;
+    hdr->version = 1;
+    hdr->log_length = log_size;
+    hdr->log_offset = VHDX_HEADER_SECTION_END;
+    vhdx_guid_generate(&hdr->file_write_guid);
+    vhdx_guid_generate(&hdr->data_write_guid);
+
+    ret = vhdx_write_header(bs, hdr, VHDX_HEADER1_OFFSET, false);
+    if (ret < 0) {
+        goto exit;
+    }
+    hdr->sequence_number++;
+    ret = vhdx_write_header(bs, hdr, VHDX_HEADER2_OFFSET, false);
+    if (ret < 0) {
+        goto exit;
+    }
+
+exit:
+    g_free(hdr);
+    return ret;
+}
+
+
+/*
+ * Create the Metadata entries.
+ *
+ * For more details on the entries, see section 3.5 (pg 29) in the
+ * VHDX 1.00 specification.
+ *
+ * We support 5 metadata entries (all required by spec):
+ *      File Parameters,
+ *      Virtual Disk Size,
+ *      Page 83 Data,
+ *      Logical Sector Size,
+ *      Physical Sector Size
+ *
+ * The first 64KB of the Metadata section is reserved for the metadata
+ * header and entries; beyond that, the metadata items themselves reside.
+ */
+static int vhdx_create_new_metadata(BlockDriverState *bs,
+                                    uint64_t image_size,
+                                    uint32_t block_size,
+                                    uint32_t sector_size,
+                                    uint64_t metadata_offset,
+                                    VHDXImageType type)
+{
+    int ret = 0;
+    uint32_t offset = 0;
+    void *buffer = NULL;
+    void *entry_buffer;
+    VHDXMetadataTableHeader *md_table;
+    VHDXMetadataTableEntry *md_table_entry;
+
+    /* Metadata entries */
+    VHDXFileParameters *mt_file_params;
+    VHDXVirtualDiskSize *mt_virtual_size;
+    VHDXPage83Data *mt_page83;
+    VHDXVirtualDiskLogicalSectorSize *mt_log_sector_size;
+    VHDXVirtualDiskPhysicalSectorSize *mt_phys_sector_size;
+
+    entry_buffer = g_malloc0(sizeof(VHDXFileParameters) +
+                             sizeof(VHDXVirtualDiskSize) +
+                             sizeof(VHDXPage83Data) +
+                             sizeof(VHDXVirtualDiskLogicalSectorSize) +
+                             sizeof(VHDXVirtualDiskPhysicalSectorSize));
+
+    mt_file_params = entry_buffer;
+    offset += sizeof(VHDXFileParameters);
+    mt_virtual_size = entry_buffer + offset;
+    offset += sizeof(VHDXVirtualDiskSize);
+    mt_page83 = entry_buffer + offset;
+    offset += sizeof(VHDXPage83Data);
+    mt_log_sector_size = entry_buffer + offset;
+    offset += sizeof(VHDXVirtualDiskLogicalSectorSize);
+    mt_phys_sector_size = entry_buffer + offset;
+
+    mt_file_params->block_size = cpu_to_le32(block_size);
+    if (type == VHDX_TYPE_FIXED) {
+        mt_file_params->data_bits |= VHDX_PARAMS_LEAVE_BLOCKS_ALLOCED;
+        cpu_to_le32s(&mt_file_params->data_bits);
+    }
+
+    vhdx_guid_generate(&mt_page83->page_83_data);
+    cpu_to_leguids(&mt_page83->page_83_data);
+    mt_virtual_size->virtual_disk_size = cpu_to_le64(image_size);
+    mt_log_sector_size->logical_sector_size = cpu_to_le32(sector_size);
+    mt_phys_sector_size->physical_sector_size = cpu_to_le32(sector_size);
+
+    buffer = g_malloc0(VHDX_HEADER_BLOCK_SIZE);
+    md_table = buffer;
+
+    md_table->signature = VHDX_METADATA_SIGNATURE;
+    md_table->entry_count = 5;
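+    /* For orientation: the layout written out below is, roughly,
+     *   metadata_offset + 0                               : table header
+     *   metadata_offset + sizeof(VHDXMetadataTableHeader) : 5 table entries
+     *   metadata_offset + 64KB                            : item data
+     *       (file parameters, virtual disk size, page 83 data,
+     *        logical and physical sector sizes)
+     */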
+    vhdx_metadata_header_le_export(md_table);
+
+
+    /* This will reference beyond the reserved table portion */
+    offset = 64 * KiB;
+
+    md_table_entry = buffer + sizeof(VHDXMetadataTableHeader);
+
+    md_table_entry[0].item_id = file_param_guid;
+    md_table_entry[0].offset = offset;
+    md_table_entry[0].length = sizeof(VHDXFileParameters);
+    md_table_entry[0].data_bits |= VHDX_META_FLAGS_IS_REQUIRED;
+    offset += md_table_entry[0].length;
+    vhdx_metadata_entry_le_export(&md_table_entry[0]);
+
+    md_table_entry[1].item_id = virtual_size_guid;
+    md_table_entry[1].offset = offset;
+    md_table_entry[1].length = sizeof(VHDXVirtualDiskSize);
+    md_table_entry[1].data_bits |= VHDX_META_FLAGS_IS_REQUIRED |
+                                   VHDX_META_FLAGS_IS_VIRTUAL_DISK;
+    offset += md_table_entry[1].length;
+    vhdx_metadata_entry_le_export(&md_table_entry[1]);
+
+    md_table_entry[2].item_id = page83_guid;
+    md_table_entry[2].offset = offset;
+    md_table_entry[2].length = sizeof(VHDXPage83Data);
+    md_table_entry[2].data_bits |= VHDX_META_FLAGS_IS_REQUIRED |
+                                   VHDX_META_FLAGS_IS_VIRTUAL_DISK;
+    offset += md_table_entry[2].length;
+    vhdx_metadata_entry_le_export(&md_table_entry[2]);
+
+    md_table_entry[3].item_id = logical_sector_guid;
+    md_table_entry[3].offset = offset;
+    md_table_entry[3].length = sizeof(VHDXVirtualDiskLogicalSectorSize);
+    md_table_entry[3].data_bits |= VHDX_META_FLAGS_IS_REQUIRED |
+                                   VHDX_META_FLAGS_IS_VIRTUAL_DISK;
+    offset += md_table_entry[3].length;
+    vhdx_metadata_entry_le_export(&md_table_entry[3]);
+
+    md_table_entry[4].item_id = phys_sector_guid;
+    md_table_entry[4].offset = offset;
+    md_table_entry[4].length = sizeof(VHDXVirtualDiskPhysicalSectorSize);
+    md_table_entry[4].data_bits |= VHDX_META_FLAGS_IS_REQUIRED |
+                                   VHDX_META_FLAGS_IS_VIRTUAL_DISK;
+    vhdx_metadata_entry_le_export(&md_table_entry[4]);
+
+    ret = bdrv_pwrite(bs, metadata_offset, buffer, VHDX_HEADER_BLOCK_SIZE);
+    if (ret < 0) {
+        goto exit;
+    }
+
+    ret = bdrv_pwrite(bs, metadata_offset + (64 * KiB), entry_buffer,
+                      VHDX_HEADER_BLOCK_SIZE);
+    if (ret < 0) {
+        goto exit;
+    }
+
+
+exit:
+    g_free(buffer);
+    g_free(entry_buffer);
+    return ret;
+}
+
+/* This creates the actual BAT itself.  We currently only support
+ * 'Dynamic' and 'Fixed' image types.
+ *
+ *  Dynamic images: default state of the BAT is all zeroes.
+ *
+ *  Fixed images: default state of the BAT is fully populated, with
+ *                file offsets and state PAYLOAD_BLOCK_FULLY_PRESENT.
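+ *
+ * (in the fixed case the data area is preallocated up front via
+ * bdrv_truncate(), so every BAT entry can immediately point at its
+ * 1 MB aligned payload block)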
+ */
+static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s,
+                           uint64_t image_size, VHDXImageType type,
+                           bool use_zero_blocks, VHDXRegionTableEntry *rt_bat)
+{
+    int ret = 0;
+    uint64_t data_file_offset;
+    uint64_t total_sectors = 0;
+    uint64_t sector_num = 0;
+    uint64_t unused;
+    int block_state;
+    VHDXSectorInfo sinfo;
+
+    assert(s->bat == NULL);
+
+    /* this gives a data start after BAT/bitmap entries, and well
+     * past any metadata entries (with a 4 MB buffer for future
+     * expansion) */
+    data_file_offset = rt_bat->file_offset + rt_bat->length + 5 * MiB;
+    total_sectors = image_size >> s->logical_sector_size_bits;
+
+    if (type == VHDX_TYPE_DYNAMIC) {
+        /* All zeroes, so we can just extend the file - the end of the BAT
+         * is the furthest thing we have written yet */
+        ret = bdrv_truncate(bs, data_file_offset);
+        if (ret < 0) {
+            goto exit;
+        }
+    } else if (type == VHDX_TYPE_FIXED) {
+        ret = bdrv_truncate(bs, data_file_offset + image_size);
+        if (ret < 0) {
+            goto exit;
+        }
+    } else {
+        ret = -ENOTSUP;
+        goto exit;
+    }
+
+    if (type == VHDX_TYPE_FIXED ||
+                use_zero_blocks ||
+                bdrv_has_zero_init(bs) == 0) {
+        /* for a fixed file, the default BAT entry is not zero */
+        s->bat = g_malloc0(rt_bat->length);
+        block_state = type == VHDX_TYPE_FIXED ? PAYLOAD_BLOCK_FULLY_PRESENT :
+                                                PAYLOAD_BLOCK_NOT_PRESENT;
+        block_state = use_zero_blocks ? PAYLOAD_BLOCK_ZERO : block_state;
+        /* fill the BAT by emulating sector writes of sectors_per_block size */
+        while (sector_num < total_sectors) {
+            vhdx_block_translate(s, sector_num, s->sectors_per_block, &sinfo);
+            sinfo.file_offset = data_file_offset +
+                                (sector_num << s->logical_sector_size_bits);
+            sinfo.file_offset = ROUND_UP(sinfo.file_offset, MiB);
+            vhdx_update_bat_table_entry(bs, s, &sinfo, &unused, &unused,
+                                        block_state);
+            cpu_to_le64s(&s->bat[sinfo.bat_idx]);
+            sector_num += s->sectors_per_block;
+        }
+        ret = bdrv_pwrite(bs, rt_bat->file_offset, s->bat, rt_bat->length);
+        if (ret < 0) {
+            goto exit;
+        }
+    }
+
+
+
+exit:
+    g_free(s->bat);
+    return ret;
+}
+
+/* Creates the region table header, and region table entries.
+ * There are 2 supported region table entries: BAT, and Metadata.
+ *
+ * As the calculations for the BAT region table are also needed
+ * to create the BAT itself, we will also cause the BAT to be
+ * created.
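+ *
+ * (rt_bat->length is the BAT entry count rounded up to a 1 MB multiple,
+ * and the metadata region starts at the next 1 MB boundary past the BAT)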
+ */
+static int vhdx_create_new_region_table(BlockDriverState *bs,
+                                        uint64_t image_size,
+                                        uint32_t block_size,
+                                        uint32_t sector_size,
+                                        uint32_t log_size,
+                                        bool use_zero_blocks,
+                                        VHDXImageType type,
+                                        uint64_t *metadata_offset)
+{
+    int ret = 0;
+    uint32_t offset = 0;
+    void *buffer = NULL;
+    BDRVVHDXState *s = NULL;
+    VHDXRegionTableHeader *region_table;
+    VHDXRegionTableEntry *rt_bat;
+    VHDXRegionTableEntry *rt_metadata;
+
+    assert(metadata_offset != NULL);
+
+    /* Populate enough of the BDRVVHDXState to be able to use the
+     * pre-existing BAT calculation, translation, and update functions */
+    s = g_malloc0(sizeof(BDRVVHDXState));
+
+    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
+                     (uint64_t) sector_size / (uint64_t) block_size;
+
+    s->sectors_per_block = block_size / sector_size;
+    s->virtual_disk_size = image_size;
+    s->block_size = block_size;
+    s->logical_sector_size = sector_size;
+
+    vhdx_set_shift_bits(s);
+
+    vhdx_calc_bat_entries(s);
+
+    /* At this point the VHDX state is populated enough for creation */
+
+    /* a single buffer is used so we can calculate the checksum over the
+     * entire 64KB block */
+    buffer = g_malloc0(VHDX_HEADER_BLOCK_SIZE);
+    region_table = buffer;
+    offset += sizeof(VHDXRegionTableHeader);
+    rt_bat = buffer + offset;
+    offset += sizeof(VHDXRegionTableEntry);
+    rt_metadata = buffer + offset;
+
+    region_table->signature = VHDX_REGION_SIGNATURE;
+    region_table->entry_count = 2;   /* BAT and Metadata */
+
+    rt_bat->guid = bat_guid;
+    rt_bat->length = ROUND_UP(s->bat_entries * sizeof(VHDXBatEntry), MiB);
+    rt_bat->file_offset = ROUND_UP(VHDX_HEADER_SECTION_END + log_size, MiB);
+    s->bat_offset = rt_bat->file_offset;
+
+    rt_metadata->guid = metadata_guid;
+    rt_metadata->file_offset = ROUND_UP(rt_bat->file_offset + rt_bat->length,
+                                        MiB);
+    rt_metadata->length = 1 * MiB;   /* min size, and more than enough */
+    *metadata_offset = rt_metadata->file_offset;
+
+    vhdx_update_checksum(buffer, VHDX_HEADER_BLOCK_SIZE,
+                         offsetof(VHDXRegionTableHeader, checksum));
+
+
+    /* The region table gives us the data we need to create the BAT,
+     * so do that now */
+    ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks, rt_bat);
+    if (ret < 0) {
+        goto exit;
+    }
+
+    /* Now write out the region headers to disk */
+    vhdx_region_header_le_export(region_table);
+    vhdx_region_entry_le_export(rt_bat);
+    vhdx_region_entry_le_export(rt_metadata);
+
+    ret = bdrv_pwrite(bs, VHDX_REGION_TABLE_OFFSET, buffer,
+                      VHDX_HEADER_BLOCK_SIZE);
+    if (ret < 0) {
+        goto exit;
+    }
+
+    ret = bdrv_pwrite(bs, VHDX_REGION_TABLE2_OFFSET, buffer,
+                      VHDX_HEADER_BLOCK_SIZE);
+    if (ret < 0) {
+        goto exit;
+    }
+
+
+exit:
+    g_free(s);
+    g_free(buffer);
+    return ret;
+}
+
+/* We need to create the following elements:
+ *
+ *    .-----------------------------------------------------------------.
+ *    |   (A)    |   (B)    |    (C)    |     (D)      |     (E)        |
+ *    |  File ID |  Header1 |  Header 2 | Region Tbl 1 | Region Tbl 2   |
+ *    |          |          |           |              |                |
+ *    .-----------------------------------------------------------------.
+ *    0         64KB      128KB       192KB          256KB            320KB
+ *
+ *
+ *    .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------.
+ *    |     (F)     |      (G)      |    (H)    |                        |
+ *    | Journal Log |  BAT / Bitmap |  Metadata |  .... data ......      |
+ *    |             |               |           |                        |
+ *    .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------.
+ * 1MB + */ +static int vhdx_create(const char *filename, QEMUOptionParameter *options, + Error **errp) +{ + int ret = 0; + uint64_t image_size = (uint64_t) 2 * GiB; + uint32_t log_size = 1 * MiB; + uint32_t block_size = 0; + uint64_t signature; + uint64_t metadata_offset; + bool use_zero_blocks = false; + + gunichar2 *creator = NULL; + glong creator_items; + BlockDriverState *bs; + const char *type = NULL; + VHDXImageType image_type; + Error *local_err = NULL; + + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + image_size = options->value.n; + } else if (!strcmp(options->name, VHDX_BLOCK_OPT_LOG_SIZE)) { + log_size = options->value.n; + } else if (!strcmp(options->name, VHDX_BLOCK_OPT_BLOCK_SIZE)) { + block_size = options->value.n; + } else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) { + type = options->value.s; + } else if (!strcmp(options->name, VHDX_BLOCK_OPT_ZERO)) { + use_zero_blocks = options->value.n != 0; + } + options++; + } + + if (image_size > VHDX_MAX_IMAGE_SIZE) { + error_setg_errno(errp, EINVAL, "Image size too large; max of 64TB"); + ret = -EINVAL; + goto exit; + } + + if (type == NULL) { + type = "dynamic"; + } + + if (!strcmp(type, "dynamic")) { + image_type = VHDX_TYPE_DYNAMIC; + } else if (!strcmp(type, "fixed")) { + image_type = VHDX_TYPE_FIXED; + } else if (!strcmp(type, "differencing")) { + error_setg_errno(errp, ENOTSUP, + "Differencing files not yet supported"); + ret = -ENOTSUP; + goto exit; + } else { + ret = -EINVAL; + goto exit; + } + + /* These are pretty arbitrary, and mainly designed to keep the BAT + * size reasonable to load into RAM */ + if (block_size == 0) { + if (image_size > 32 * TiB) { + block_size = 64 * MiB; + } else if (image_size > (uint64_t) 100 * GiB) { + block_size = 32 * MiB; + } else if (image_size > 1 * GiB) { + block_size = 16 * MiB; + } else { + block_size = 8 * MiB; + } + } + + + /* make the log size close to what was specified, but must be + * min 1MB, and multiple of 1MB */ + log_size = ROUND_UP(log_size, MiB); + + block_size = ROUND_UP(block_size, MiB); + block_size = block_size > VHDX_BLOCK_SIZE_MAX ? VHDX_BLOCK_SIZE_MAX : + block_size; + + ret = bdrv_create_file(filename, options, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; + } + + bs = NULL; + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + NULL, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; + } + + /* Create (A) */ + + /* The creator field is optional, but may be useful for + * debugging / diagnostics */ + creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL, + &creator_items, NULL); + signature = cpu_to_le64(VHDX_FILE_SIGNATURE); + ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature)); + if (ret < 0) { + goto delete_and_exit; + } + if (creator) { + ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET + sizeof(signature), + creator, creator_items * sizeof(gunichar2)); + if (ret < 0) { + goto delete_and_exit; + } + } + + + /* Creates (B),(C) */ + ret = vhdx_create_new_headers(bs, image_size, log_size); + if (ret < 0) { + goto delete_and_exit; + } + + /* Creates (D),(E),(G) explicitly. 
(F) created as by-product */ + ret = vhdx_create_new_region_table(bs, image_size, block_size, 512, + log_size, use_zero_blocks, image_type, + &metadata_offset); + if (ret < 0) { + goto delete_and_exit; + } + + /* Creates (H) */ + ret = vhdx_create_new_metadata(bs, image_size, block_size, 512, + metadata_offset, image_type); + if (ret < 0) { + goto delete_and_exit; + } + + + +delete_and_exit: + bdrv_unref(bs); +exit: + g_free(creator); + return ret; +} + +/* If opened r/w, the VHDX driver will automatically replay the log, + * if one is present, inside the vhdx_open() call. + * + * If qemu-img check -r all is called, the image is automatically opened + * r/w and any log has already been replayed, so there is nothing (currently) + * for us to do here + */ +static int vhdx_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) { BDRVVHDXState *s = bs->opaque; - qemu_vfree(s->headers[0]); - qemu_vfree(s->headers[1]); - qemu_vfree(s->bat); - qemu_vfree(s->parent_entries); + + if (s->log_replayed_on_open) { + result->corruptions_fixed++; + } + return 0; } +static QEMUOptionParameter vhdx_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size; max of 64TB." + }, + { + .name = VHDX_BLOCK_OPT_LOG_SIZE, + .type = OPT_SIZE, + .value.n = 1 * MiB, + .help = "Log size; min 1MB." + }, + { + .name = VHDX_BLOCK_OPT_BLOCK_SIZE, + .type = OPT_SIZE, + .value.n = 0, + .help = "Block Size; min 1MB, max 256MB. " \ + "0 means auto-calculate based on image size." + }, + { + .name = BLOCK_OPT_SUBFMT, + .type = OPT_STRING, + .help = "VHDX format type, can be either 'dynamic' or 'fixed'. "\ + "Default is 'dynamic'." + }, + { + .name = VHDX_BLOCK_OPT_ZERO, + .type = OPT_FLAG, + .help = "Force use of payload blocks of type 'ZERO'. Non-standard." + }, + { NULL } +}; + static BlockDriver bdrv_vhdx = { .format_name = "vhdx", .instance_size = sizeof(BDRVVHDXState), @@ -962,6 +1920,11 @@ static BlockDriver bdrv_vhdx = { .bdrv_reopen_prepare = vhdx_reopen_prepare, .bdrv_co_readv = vhdx_co_readv, .bdrv_co_writev = vhdx_co_writev, + .bdrv_create = vhdx_create, + .bdrv_get_info = vhdx_get_info, + .bdrv_check = vhdx_check, + + .create_options = vhdx_create_options, }; static void bdrv_vhdx_init(void) diff --git a/block/vhdx.h b/block/vhdx.h index fb687ed2d..8103d4c44 100644 --- a/block/vhdx.h +++ b/block/vhdx.h @@ -6,9 +6,9 @@ * Authors: * Jeff Cody <jcody@redhat.com> * - * This is based on the "VHDX Format Specification v0.95", published 4/12/2012 + * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 * by Microsoft: - * https://www.microsoft.com/en-us/download/details.aspx?id=29681 + * https://www.microsoft.com/en-us/download/details.aspx?id=34750 * * This work is licensed under the terms of the GNU LGPL, version 2 or later. * See the COPYING.LIB file in the top-level directory. 
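The creation options listed in vhdx_create_options above surface through the
standard qemu-img interface. As a usage sketch (image name and sizes are
illustrative), a fixed-subformat image with a 16 MB block size and a 2 MB log
could be created with:

    qemu-img create -f vhdx -o subformat=fixed,block_size=16M,log_size=2M disk.vhdx 10G

Leaving block_size at its default of 0 lets vhdx_create() derive a block size
from the image size, per the ladder in that function.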
@@ -18,6 +18,11 @@ #ifndef BLOCK_VHDX_H #define BLOCK_VHDX_H +#define KiB (1 * 1024) +#define MiB (KiB * 1024) +#define GiB (MiB * 1024) +#define TiB ((uint64_t) GiB * 1024) + /* Structures and fields present in the VHDX file */ /* The header section has the following blocks, @@ -30,14 +35,15 @@ * 0.........64KB...........128KB........192KB..........256KB................1MB */ -#define VHDX_HEADER_BLOCK_SIZE (64*1024) +#define VHDX_HEADER_BLOCK_SIZE (64 * 1024) #define VHDX_FILE_ID_OFFSET 0 -#define VHDX_HEADER1_OFFSET (VHDX_HEADER_BLOCK_SIZE*1) -#define VHDX_HEADER2_OFFSET (VHDX_HEADER_BLOCK_SIZE*2) -#define VHDX_REGION_TABLE_OFFSET (VHDX_HEADER_BLOCK_SIZE*3) - +#define VHDX_HEADER1_OFFSET (VHDX_HEADER_BLOCK_SIZE * 1) +#define VHDX_HEADER2_OFFSET (VHDX_HEADER_BLOCK_SIZE * 2) +#define VHDX_REGION_TABLE_OFFSET (VHDX_HEADER_BLOCK_SIZE * 3) +#define VHDX_REGION_TABLE2_OFFSET (VHDX_HEADER_BLOCK_SIZE * 4) +#define VHDX_HEADER_SECTION_END (1 * MiB) /* * A note on the use of MS-GUID fields. For more details on the GUID, * please see: https://en.wikipedia.org/wiki/Globally_unique_identifier. @@ -55,10 +61,11 @@ /* These structures are ones that are defined in the VHDX specification * document */ +#define VHDX_FILE_SIGNATURE 0x656C696678646876ULL /* "vhdxfile" in ASCII */ typedef struct VHDXFileIdentifier { uint64_t signature; /* "vhdxfile" in ASCII */ uint16_t creator[256]; /* optional; utf-16 string to identify - the vhdx file creator. Diagnotistic + the vhdx file creator. Diagnostic only */ } VHDXFileIdentifier; @@ -67,7 +74,7 @@ typedef struct VHDXFileIdentifier { * Microsoft is not just 16 bytes though - it is a structure that is defined, * so we need to follow it here so that endianness does not trip us up */ -typedef struct MSGUID { +typedef struct QEMU_PACKED MSGUID { uint32_t data1; uint16_t data2; uint16_t data3; @@ -77,14 +84,15 @@ typedef struct MSGUID { #define guid_eq(a, b) \ (memcmp(&(a), &(b), sizeof(MSGUID)) == 0) -#define VHDX_HEADER_SIZE (4*1024) /* although the vhdx_header struct in disk - is only 582 bytes, for purposes of crc - the header is the first 4KB of the 64KB - block */ +#define VHDX_HEADER_SIZE (4 * 1024) /* although the vhdx_header struct in disk + is only 582 bytes, for purposes of crc + the header is the first 4KB of the 64KB + block */ /* The full header is 4KB, although the actual header data is much smaller. * But for the checksum calculation, it is over the entire 4KB structure, * not just the defined portion of it */ +#define VHDX_HEADER_SIGNATURE 0x64616568 typedef struct QEMU_PACKED VHDXHeader { uint32_t signature; /* "head" in ASCII */ uint32_t checksum; /* CRC-32C hash of the whole header */ @@ -92,7 +100,7 @@ typedef struct QEMU_PACKED VHDXHeader { VHDX file has 2 of these headers, and only the header with the highest sequence number is valid */ - MSGUID file_write_guid; /* 128 bit unique identifier. Must be + MSGUID file_write_guid; /* 128 bit unique identifier. Must be updated to new, unique value before the first modification is made to file */ @@ -114,9 +122,9 @@ typedef struct QEMU_PACKED VHDXHeader { there is no valid log. If non-zero, log entries with this guid are valid. */ - uint16_t log_version; /* version of the log format. Mustn't be - zero, unless log_guid is also zero */ - uint16_t version; /* version of th evhdx file. Currently, + uint16_t log_version; /* version of the log format. Must be + set to zero */ + uint16_t version; /* version of the vhdx file. Currently, only supported version is "1" */ uint32_t log_length; /* length of the log. 
Must be multiple of 1MB */ @@ -125,6 +133,7 @@ typedef struct QEMU_PACKED VHDXHeader { } VHDXHeader; /* Header for the region table block */ +#define VHDX_REGION_SIGNATURE 0x69676572 /* "regi" in ASCII */ typedef struct QEMU_PACKED VHDXRegionTableHeader { uint32_t signature; /* "regi" in ASCII */ uint32_t checksum; /* CRC-32C hash of the 64KB table */ @@ -151,7 +160,10 @@ typedef struct QEMU_PACKED VHDXRegionTableEntry { /* ---- LOG ENTRY STRUCTURES ---- */ +#define VHDX_LOG_MIN_SIZE (1024 * 1024) +#define VHDX_LOG_SECTOR_SIZE 4096 #define VHDX_LOG_HDR_SIZE 64 +#define VHDX_LOG_SIGNATURE 0x65676f6c typedef struct QEMU_PACKED VHDXLogEntryHeader { uint32_t signature; /* "loge" in ASCII */ uint32_t checksum; /* CRC-32C hash of the 64KB table */ @@ -174,7 +186,8 @@ typedef struct QEMU_PACKED VHDXLogEntryHeader { } VHDXLogEntryHeader; #define VHDX_LOG_DESC_SIZE 32 - +#define VHDX_LOG_DESC_SIGNATURE 0x63736564 +#define VHDX_LOG_ZERO_SIGNATURE 0x6f72657a typedef struct QEMU_PACKED VHDXLogDescriptor { uint32_t signature; /* "zero" or "desc" in ASCII */ union { @@ -194,6 +207,7 @@ typedef struct QEMU_PACKED VHDXLogDescriptor { vhdx_log_entry_header */ } VHDXLogDescriptor; +#define VHDX_LOG_DATA_SIGNATURE 0x61746164 typedef struct QEMU_PACKED VHDXLogDataSector { uint32_t data_signature; /* "data" in ASCII */ uint32_t sequence_high; /* 4 MSB of 8 byte sequence_number */ @@ -212,19 +226,19 @@ typedef struct QEMU_PACKED VHDXLogDataSector { #define PAYLOAD_BLOCK_UNDEFINED 1 #define PAYLOAD_BLOCK_ZERO 2 #define PAYLOAD_BLOCK_UNMAPPED 5 -#define PAYLOAD_BLOCK_FULL_PRESENT 6 +#define PAYLOAD_BLOCK_FULLY_PRESENT 6 #define PAYLOAD_BLOCK_PARTIALLY_PRESENT 7 #define SB_BLOCK_NOT_PRESENT 0 #define SB_BLOCK_PRESENT 6 /* per the spec */ -#define VHDX_MAX_SECTORS_PER_BLOCK (1<<23) +#define VHDX_MAX_SECTORS_PER_BLOCK (1 << 23) /* upper 44 bits are the file offset in 1MB units lower 3 bits are the state other bits are reserved */ #define VHDX_BAT_STATE_BIT_MASK 0x07 -#define VHDX_BAT_FILE_OFF_BITS (64-44) +#define VHDX_BAT_FILE_OFF_MASK 0xFFFFFFFFFFF00000ULL /* upper 44 bits */ typedef uint64_t VHDXBatEntry; /* ---- METADATA REGION STRUCTURES ---- */ @@ -233,6 +247,7 @@ typedef uint64_t VHDXBatEntry; #define VHDX_METADATA_MAX_ENTRIES 2047 /* not including the header */ #define VHDX_METADATA_TABLE_MAX_SIZE \ (VHDX_METADATA_ENTRY_SIZE * (VHDX_METADATA_MAX_ENTRIES+1)) +#define VHDX_METADATA_SIGNATURE 0x617461646174656DULL /* "metadata" in ASCII */ typedef struct QEMU_PACKED VHDXMetadataTableHeader { uint64_t signature; /* "metadata" in ASCII */ uint16_t reserved; @@ -252,8 +267,8 @@ typedef struct QEMU_PACKED VHDXMetadataTableEntry { metadata region */ /* note: if length = 0, so is offset */ uint32_t length; /* length of metadata. <= 1MB. */ - uint32_t data_bits; /* least-significant 3 bits are flags, the - rest are reserved (see above) */ + uint32_t data_bits; /* least-significant 3 bits are flags, + the rest are reserved (see above) */ uint32_t reserved2; } VHDXMetadataTableEntry; @@ -262,13 +277,16 @@ typedef struct QEMU_PACKED VHDXMetadataTableEntry { If set indicates a fixed size VHDX file */ #define VHDX_PARAMS_HAS_PARENT 0x02 /* has parent / backing file */ +#define VHDX_BLOCK_SIZE_MIN (1 * MiB) +#define VHDX_BLOCK_SIZE_MAX (256 * MiB) typedef struct QEMU_PACKED VHDXFileParameters { uint32_t block_size; /* size of each payload block, always power of 2, <= 256MB and >= 1MB. 
*/ - uint32_t data_bits; /* least-significant 2 bits are flags, the rest - are reserved (see above) */ + uint32_t data_bits; /* least-significant 2 bits are flags, + the rest are reserved (see above) */ } VHDXFileParameters; +#define VHDX_MAX_IMAGE_SIZE ((uint64_t) 64 * TiB) typedef struct QEMU_PACKED VHDXVirtualDiskSize { uint64_t virtual_disk_size; /* Size of the virtual disk, in bytes. Must be multiple of the sector size, @@ -276,7 +294,7 @@ typedef struct QEMU_PACKED VHDXVirtualDiskSize { } VHDXVirtualDiskSize; typedef struct QEMU_PACKED VHDXPage83Data { - MSGUID page_83_data[16]; /* unique id for scsi devices that + MSGUID page_83_data; /* unique id for scsi devices that support page 0x83 */ } VHDXPage83Data; @@ -291,7 +309,7 @@ typedef struct QEMU_PACKED VHDXVirtualDiskPhysicalSectorSize { } VHDXVirtualDiskPhysicalSectorSize; typedef struct QEMU_PACKED VHDXParentLocatorHeader { - MSGUID locator_type[16]; /* type of the parent virtual disk. */ + MSGUID locator_type; /* type of the parent virtual disk. */ uint16_t reserved; uint16_t key_value_count; /* number of key/value pairs for this locator */ @@ -308,18 +326,125 @@ typedef struct QEMU_PACKED VHDXParentLocatorEntry { /* ----- END VHDX SPECIFICATION STRUCTURES ---- */ +typedef struct VHDXMetadataEntries { + VHDXMetadataTableEntry file_parameters_entry; + VHDXMetadataTableEntry virtual_disk_size_entry; + VHDXMetadataTableEntry page83_data_entry; + VHDXMetadataTableEntry logical_sector_size_entry; + VHDXMetadataTableEntry phys_sector_size_entry; + VHDXMetadataTableEntry parent_locator_entry; + uint16_t present; +} VHDXMetadataEntries; + +typedef struct VHDXLogEntries { + uint64_t offset; + uint64_t length; + uint32_t write; + uint32_t read; + VHDXLogEntryHeader *hdr; + void *desc_buffer; + uint64_t sequence; + uint32_t tail; +} VHDXLogEntries; + +typedef struct VHDXRegionEntry { + uint64_t start; + uint64_t end; + QLIST_ENTRY(VHDXRegionEntry) entries; +} VHDXRegionEntry; + +typedef struct BDRVVHDXState { + CoMutex lock; + + int curr_header; + VHDXHeader *headers[2]; + + VHDXRegionTableHeader rt; + VHDXRegionTableEntry bat_rt; /* region table for the BAT */ + VHDXRegionTableEntry metadata_rt; /* region table for the metadata */ + + VHDXMetadataTableHeader metadata_hdr; + VHDXMetadataEntries metadata_entries; + + VHDXFileParameters params; + uint32_t block_size; + uint32_t block_size_bits; + uint32_t sectors_per_block; + uint32_t sectors_per_block_bits; + + uint64_t virtual_disk_size; + uint32_t logical_sector_size; + uint32_t physical_sector_size; + + uint64_t chunk_ratio; + uint32_t chunk_ratio_bits; + uint32_t logical_sector_size_bits; + + uint32_t bat_entries; + VHDXBatEntry *bat; + uint64_t bat_offset; + bool first_visible_write; + MSGUID session_guid; + + VHDXLogEntries log; + + VHDXParentLocatorHeader parent_header; + VHDXParentLocatorEntry *parent_entries; + + Error *migration_blocker; + + bool log_replayed_on_open; + + QLIST_HEAD(VHDXRegionHead, VHDXRegionEntry) regions; +} BDRVVHDXState; + +void vhdx_guid_generate(MSGUID *guid); + +int vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s, bool rw, + MSGUID *log_guid); + +uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset); uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size, int crc_offset); bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset); +int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed, + Error **errp); + +int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, + void 
*data, uint32_t length, uint64_t offset); -static void leguid_to_cpus(MSGUID *guid) +static inline void leguid_to_cpus(MSGUID *guid) { le32_to_cpus(&guid->data1); le16_to_cpus(&guid->data2); le16_to_cpus(&guid->data3); } +static inline void cpu_to_leguids(MSGUID *guid) +{ + cpu_to_le32s(&guid->data1); + cpu_to_le16s(&guid->data2); + cpu_to_le16s(&guid->data3); +} + +void vhdx_header_le_import(VHDXHeader *h); +void vhdx_header_le_export(VHDXHeader *orig_h, VHDXHeader *new_h); +void vhdx_log_desc_le_import(VHDXLogDescriptor *d); +void vhdx_log_desc_le_export(VHDXLogDescriptor *d); +void vhdx_log_data_le_export(VHDXLogDataSector *d); +void vhdx_log_entry_hdr_le_import(VHDXLogEntryHeader *hdr); +void vhdx_log_entry_hdr_le_export(VHDXLogEntryHeader *hdr); +void vhdx_region_header_le_import(VHDXRegionTableHeader *hdr); +void vhdx_region_header_le_export(VHDXRegionTableHeader *hdr); +void vhdx_region_entry_le_import(VHDXRegionTableEntry *e); +void vhdx_region_entry_le_export(VHDXRegionTableEntry *e); +void vhdx_metadata_header_le_import(VHDXMetadataTableHeader *hdr); +void vhdx_metadata_header_le_export(VHDXMetadataTableHeader *hdr); +void vhdx_metadata_entry_le_import(VHDXMetadataTableEntry *e); +void vhdx_metadata_entry_le_export(VHDXMetadataTableEntry *e); +int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s); + #endif diff --git a/block/vmdk.c b/block/vmdk.c index 346bb5cad..b69988d16 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -105,18 +105,22 @@ typedef struct VmdkExtent { uint32_t l2_cache_offsets[L2_CACHE_SIZE]; uint32_t l2_cache_counts[L2_CACHE_SIZE]; - unsigned int cluster_sectors; + int64_t cluster_sectors; + char *type; } VmdkExtent; typedef struct BDRVVmdkState { CoMutex lock; uint64_t desc_offset; bool cid_updated; + bool cid_checked; + uint32_t cid; uint32_t parent_cid; int num_extents; /* Extent array with num_extents entries, ascend ordered by address */ VmdkExtent *extents; Error *migration_blocker; + char *create_type; } BDRVVmdkState; typedef struct VmdkMetaData { @@ -197,8 +201,6 @@ static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename) } } -#define CHECK_CID 1 - #define SECTOR_SIZE 512 #define DESC_SIZE (20 * SECTOR_SIZE) /* 20 sectors of 512 bytes each */ #define BUF_SIZE 4096 @@ -215,8 +217,9 @@ static void vmdk_free_extents(BlockDriverState *bs) g_free(e->l1_table); g_free(e->l2_cache); g_free(e->l1_backup_table); + g_free(e->type); if (e->file != bs->file) { - bdrv_delete(e->file); + bdrv_unref(e->file); } } g_free(s->extents); @@ -301,19 +304,18 @@ static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) static int vmdk_is_cid_valid(BlockDriverState *bs) { -#ifdef CHECK_CID BDRVVmdkState *s = bs->opaque; BlockDriverState *p_bs = bs->backing_hd; uint32_t cur_pcid; - if (p_bs) { + if (!s->cid_checked && p_bs) { cur_pcid = vmdk_read_cid(p_bs, 0); if (s->parent_cid != cur_pcid) { /* CID not valid */ return 0; } } -#endif + s->cid_checked = true; /* CID valid */ return 1; } @@ -331,8 +333,7 @@ static int vmdk_reopen_prepare(BDRVReopenState *state, assert(state->bs != NULL); if (queue == NULL) { - error_set(errp, ERROR_CLASS_GENERIC_ERROR, - "No reopen queue for VMDK extents"); + error_setg(errp, "No reopen queue for VMDK extents"); goto exit; } @@ -391,15 +392,24 @@ static int vmdk_add_extent(BlockDriverState *bs, int64_t l1_offset, int64_t l1_backup_offset, uint32_t l1_size, int l2_size, uint64_t cluster_sectors, - VmdkExtent **new_extent) + VmdkExtent **new_extent, + Error **errp) { VmdkExtent *extent; BDRVVmdkState *s = 
bs->opaque; if (cluster_sectors > 0x200000) { /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */ - error_report("invalid granularity, image may be corrupt"); - return -EINVAL; + error_setg(errp, "Invalid granularity, image may be corrupt"); + return -EFBIG; + } + if (l1_size > 512 * 1024 * 1024) { + /* Although with big capacity and small l1_entry_sectors, we can get a + * big l1_size, we don't want unbounded value to allocate the table. + * Limit it to 512M, which is 16PB for default cluster and L2 table + * size */ + error_setg(errp, "L1 size too big"); + return -EFBIG; } s->extents = g_realloc(s->extents, @@ -416,7 +426,7 @@ static int vmdk_add_extent(BlockDriverState *bs, extent->l1_size = l1_size; extent->l1_entry_sectors = l2_size * cluster_sectors; extent->l2_size = l2_size; - extent->cluster_sectors = cluster_sectors; + extent->cluster_sectors = flat ? sectors : cluster_sectors; if (s->num_extents > 1) { extent->end_sector = (*(extent - 1)).end_sector + extent->sectors; @@ -430,7 +440,8 @@ static int vmdk_add_extent(BlockDriverState *bs, return 0; } -static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent) +static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, + Error **errp) { int ret; int l1_size, i; @@ -439,10 +450,13 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent) l1_size = extent->l1_size * sizeof(uint32_t); extent->l1_table = g_malloc(l1_size); ret = bdrv_pread(extent->file, - extent->l1_table_offset, - extent->l1_table, - l1_size); + extent->l1_table_offset, + extent->l1_table, + l1_size); if (ret < 0) { + error_setg_errno(errp, -ret, + "Could not read l1 table from extent '%s'", + extent->file->filename); goto fail_l1; } for (i = 0; i < extent->l1_size; i++) { @@ -452,10 +466,13 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent) if (extent->l1_backup_table_offset) { extent->l1_backup_table = g_malloc(l1_size); ret = bdrv_pread(extent->file, - extent->l1_backup_table_offset, - extent->l1_backup_table, - l1_size); + extent->l1_backup_table_offset, + extent->l1_backup_table, + l1_size); if (ret < 0) { + error_setg_errno(errp, -ret, + "Could not read l1 backup table from extent '%s'", + extent->file->filename); goto fail_l1b; } for (i = 0; i < extent->l1_size; i++) { @@ -473,9 +490,9 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent) return ret; } -static int vmdk_open_vmdk3(BlockDriverState *bs, - BlockDriverState *file, - int flags) +static int vmdk_open_vmfs_sparse(BlockDriverState *bs, + BlockDriverState *file, + int flags, Error **errp) { int ret; uint32_t magic; @@ -484,20 +501,24 @@ static int vmdk_open_vmdk3(BlockDriverState *bs, ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header)); if (ret < 0) { + error_setg_errno(errp, -ret, + "Could not read header from file '%s'", + file->filename); return ret; } - - ret = vmdk_add_extent(bs, - bs->file, false, - le32_to_cpu(header.disk_sectors), - le32_to_cpu(header.l1dir_offset) << 9, - 0, 1 << 6, 1 << 9, - le32_to_cpu(header.granularity), - &extent); + ret = vmdk_add_extent(bs, file, false, + le32_to_cpu(header.disk_sectors), + le32_to_cpu(header.l1dir_offset) << 9, + 0, + le32_to_cpu(header.l1dir_size), + 4096, + le32_to_cpu(header.granularity), + &extent, + errp); if (ret < 0) { return ret; } - ret = vmdk_init_tables(bs, extent); + ret = vmdk_init_tables(bs, extent, errp); if (ret) { /* free extent allocated by vmdk_add_extent */ vmdk_free_last_extent(bs); @@ -505,31 +526,71 @@ static int 
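The 512M-entry bound added to vmdk_add_extent here (and dropped from vmdk_open_vmdk4 further down) is easy to sanity-check: with the defaults used elsewhere in this file (granularity of 128 sectors, 512 grain-table entries), each L1 entry maps 512 x 128 sectors = 32 MiB, so 512Mi entries cover the 16 PB the comment mentions while the in-memory table stays at 2 GiB. A throwaway verification:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t granularity = 128;        /* default: sectors per grain */
        uint64_t num_gtes_per_gt = 512;    /* default L2 table entries */
        uint64_t l1_entry_sectors = num_gtes_per_gt * granularity;
        uint64_t max_entries = 512ULL * 1024 * 1024;   /* the cap above */

        uint64_t covered = max_entries * l1_entry_sectors * 512;
        uint64_t table_bytes = max_entries * sizeof(uint32_t);

        printf("covered: %" PRIu64 " PiB, L1 table: %" PRIu64 " GiB\n",
               covered >> 50, table_bytes >> 30);      /* 16 PiB, 2 GiB */
        return 0;
    }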
vmdk_open_vmdk3(BlockDriverState *bs, return ret; } -static int vmdk_open_desc_file(BlockDriverState *bs, int flags, - uint64_t desc_offset); +static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf, + Error **errp); + +static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset, + Error **errp) +{ + int64_t size; + char *buf; + int ret; + + size = bdrv_getlength(file); + if (size < 0) { + error_setg_errno(errp, -size, "Could not access file"); + return NULL; + } + + size = MIN(size, 1 << 20); /* avoid unbounded allocation */ + buf = g_malloc0(size + 1); + + ret = bdrv_pread(file, desc_offset, buf, size); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read from file"); + g_free(buf); + return NULL; + } + + return buf; +} static int vmdk_open_vmdk4(BlockDriverState *bs, BlockDriverState *file, - int flags) + int flags, Error **errp) { int ret; uint32_t magic; uint32_t l1_size, l1_entry_sectors; VMDK4Header header; VmdkExtent *extent; + BDRVVmdkState *s = bs->opaque; int64_t l1_backup_offset = 0; ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header)); if (ret < 0) { - return ret; + error_setg_errno(errp, -ret, + "Could not read header from file '%s'", + file->filename); + return -EINVAL; } if (header.capacity == 0) { uint64_t desc_offset = le64_to_cpu(header.desc_offset); if (desc_offset) { - return vmdk_open_desc_file(bs, flags, desc_offset << 9); + char *buf = vmdk_read_desc(file, desc_offset << 9, errp); + if (!buf) { + return -EINVAL; + } + ret = vmdk_open_desc_file(bs, flags, buf, errp); + g_free(buf); + return ret; } } + if (!s->create_type) { + s->create_type = g_strdup("monolithicSparse"); + } + if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) { /* * The footer takes precedence over the header, so read it in. The @@ -577,17 +638,24 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, header = footer.header; } - if (le32_to_cpu(header.version) >= 3) { + if (le32_to_cpu(header.version) > 3) { char buf[64]; snprintf(buf, sizeof(buf), "VMDK version %d", le32_to_cpu(header.version)); - qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bs->device_name, "vmdk", buf); + error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bs->device_name, "vmdk", buf); return -ENOTSUP; + } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) { + /* VMware KB 2064959 explains that version 3 added support for + * persistent changed block tracking (CBT), and backup software can + * read it as version=1 if it doesn't care about the changed area + * information. So we are safe to enable read only. */ + error_setg(errp, "VMDK version 3 must be read only"); + return -EINVAL; } if (le32_to_cpu(header.num_gtes_per_gt) > 512) { - error_report("L2 table size too big"); + error_setg(errp, "L2 table size too big"); return -EINVAL; } @@ -598,17 +666,16 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, } l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1) / l1_entry_sectors; - if (l1_size > 512 * 1024 * 1024) { - /* although with big capacity and small l1_entry_sectors, we can get a - * big l1_size, we don't want unbounded value to allocate the table. 
- * Limit it to 512M, which is 16PB for default cluster and L2 table - * size */ - error_report("L1 size too big"); - return -EFBIG; - } if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) { l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9; } + if (bdrv_getlength(file) < + le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE) { + error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes", + (int64_t)(le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE)); + return -EINVAL; + } + ret = vmdk_add_extent(bs, file, false, le64_to_cpu(header.capacity), le64_to_cpu(header.gd_offset) << 9, @@ -616,16 +683,21 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, l1_size, le32_to_cpu(header.num_gtes_per_gt), le64_to_cpu(header.granularity), - &extent); + &extent, + errp); if (ret < 0) { return ret; } extent->compressed = le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE; + if (extent->compressed) { + g_free(s->create_type); + s->create_type = g_strdup("streamOptimized"); + } extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER; extent->version = le32_to_cpu(header.version); extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN; - ret = vmdk_init_tables(bs, extent); + ret = vmdk_init_tables(bs, extent, errp); if (ret) { /* free extent allocated by vmdk_add_extent */ vmdk_free_last_extent(bs); @@ -662,31 +734,28 @@ static int vmdk_parse_description(const char *desc, const char *opt_name, /* Open an extent file and append to bs array */ static int vmdk_open_sparse(BlockDriverState *bs, - BlockDriverState *file, - int flags) + BlockDriverState *file, int flags, + char *buf, Error **errp) { uint32_t magic; - if (bdrv_pread(file, 0, &magic, sizeof(magic)) != sizeof(magic)) { - return -EIO; - } - - magic = be32_to_cpu(magic); + magic = ldl_be_p(buf); switch (magic) { case VMDK3_MAGIC: - return vmdk_open_vmdk3(bs, file, flags); + return vmdk_open_vmfs_sparse(bs, file, flags, errp); break; case VMDK4_MAGIC: - return vmdk_open_vmdk4(bs, file, flags); + return vmdk_open_vmdk4(bs, file, flags, errp); break; default: - return -EMEDIUMTYPE; + error_setg(errp, "Image not in VMDK format"); + return -EINVAL; break; } } static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, - const char *desc_file_path) + const char *desc_file_path, Error **errp) { int ret; char access[11]; @@ -697,6 +766,8 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, int64_t flat_offset; char extent_path[PATH_MAX]; BlockDriverState *extent_file; + BDRVVmdkState *s = bs->opaque; + VmdkExtent *extent; while (*p) { /* parse extent line: @@ -711,116 +782,141 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, goto next_line; } else if (!strcmp(type, "FLAT")) { if (ret != 5 || flat_offset < 0) { + error_setg(errp, "Invalid extent lines:\n%s", p); + return -EINVAL; + } + } else if (!strcmp(type, "VMFS")) { + if (ret == 4) { + flat_offset = 0; + } else { + error_setg(errp, "Invalid extent lines:\n%s", p); return -EINVAL; } } else if (ret != 4) { + error_setg(errp, "Invalid extent lines:\n%s", p); return -EINVAL; } if (sectors <= 0 || - (strcmp(type, "FLAT") && strcmp(type, "SPARSE")) || + (strcmp(type, "FLAT") && strcmp(type, "SPARSE") && + strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE")) || (strcmp(access, "RW"))) { goto next_line; } path_combine(extent_path, sizeof(extent_path), desc_file_path, fname); - ret = bdrv_file_open(&extent_file, extent_path, NULL, bs->open_flags); + extent_file = NULL; + ret = bdrv_open(&extent_file, extent_path, 
NULL, NULL, + bs->open_flags | BDRV_O_PROTOCOL, NULL, errp); if (ret) { return ret; } /* save to extents array */ - if (!strcmp(type, "FLAT")) { + if (!strcmp(type, "FLAT") || !strcmp(type, "VMFS")) { /* FLAT extent */ - VmdkExtent *extent; ret = vmdk_add_extent(bs, extent_file, true, sectors, - 0, 0, 0, 0, sectors, &extent); + 0, 0, 0, 0, 0, &extent, errp); if (ret < 0) { return ret; } extent->flat_start_offset = flat_offset << 9; - } else if (!strcmp(type, "SPARSE")) { - /* SPARSE extent */ - ret = vmdk_open_sparse(bs, extent_file, bs->open_flags); + } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) { + /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/ + char *buf = vmdk_read_desc(extent_file, 0, errp); + if (!buf) { + ret = -EINVAL; + } else { + ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf, errp); + } if (ret) { - bdrv_delete(extent_file); + g_free(buf); + bdrv_unref(extent_file); return ret; } + extent = &s->extents[s->num_extents - 1]; } else { - fprintf(stderr, - "VMDK: Not supported extent type \"%s\""".\n", type); + error_setg(errp, "Unsupported extent type '%s'", type); return -ENOTSUP; } + extent->type = g_strdup(type); next_line: /* move to next line */ - while (*p && *p != '\n') { + while (*p) { + if (*p == '\n') { + p++; + break; + } p++; } - p++; } return 0; } -static int vmdk_open_desc_file(BlockDriverState *bs, int flags, - uint64_t desc_offset) +static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf, + Error **errp) { int ret; - char *buf = NULL; char ct[128]; BDRVVmdkState *s = bs->opaque; - int64_t size; - - size = bdrv_getlength(bs->file); - if (size < 0) { - return -EINVAL; - } - size = MIN(size, 1 << 20); /* avoid unbounded allocation */ - buf = g_malloc0(size + 1); - - ret = bdrv_pread(bs->file, desc_offset, buf, size); - if (ret < 0) { - goto exit; - } if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) { - ret = -EMEDIUMTYPE; + error_setg(errp, "invalid VMDK image descriptor"); + ret = -EINVAL; goto exit; } if (strcmp(ct, "monolithicFlat") && + strcmp(ct, "vmfs") && + strcmp(ct, "vmfsSparse") && strcmp(ct, "twoGbMaxExtentSparse") && strcmp(ct, "twoGbMaxExtentFlat")) { - fprintf(stderr, - "VMDK: Not supported image type \"%s\""".\n", ct); + error_setg(errp, "Unsupported image type '%s'", ct); ret = -ENOTSUP; goto exit; } + s->create_type = g_strdup(ct); s->desc_offset = 0; - ret = vmdk_parse_extents(buf, bs, bs->file->filename); + ret = vmdk_parse_extents(buf, bs, bs->file->filename, errp); exit: - g_free(buf); return ret; } -static int vmdk_open(BlockDriverState *bs, QDict *options, int flags) +static int vmdk_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { + char *buf = NULL; int ret; BDRVVmdkState *s = bs->opaque; + uint32_t magic; - if (vmdk_open_sparse(bs, bs->file, flags) == 0) { - s->desc_offset = 0x200; - } else { - ret = vmdk_open_desc_file(bs, flags, 0); - if (ret) { - goto fail; - } + buf = vmdk_read_desc(bs->file, 0, errp); + if (!buf) { + return -EINVAL; + } + + magic = ldl_be_p(buf); + switch (magic) { + case VMDK3_MAGIC: + case VMDK4_MAGIC: + ret = vmdk_open_sparse(bs, bs->file, flags, buf, errp); + s->desc_offset = 0x200; + break; + default: + ret = vmdk_open_desc_file(bs, flags, buf, errp); + break; } + if (ret) { + goto fail; + } + /* try to open parent images, if exist */ ret = vmdk_parent_open(bs); if (ret) { goto fail; } + s->cid = vmdk_read_cid(bs, 0); s->parent_cid = vmdk_read_cid(bs, 1); qemu_co_mutex_init(&s->lock); @@ -829,14 +925,34 @@ 
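vmdk_open now reads the descriptor buffer once and dispatches on its first four bytes, and ldl_be_p() is simply an alignment- and endian-safe big-endian 32-bit load. A sketch of the dispatch, with the magic values written out as the big-endian loads of their on-disk signatures ("COWD" for the ESX/VMDK3 sparse header, "KDMV" for VMDK4):

    #include <stdint.h>

    #define SK_VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
    #define SK_VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')

    /* What ldl_be_p() amounts to: load 4 bytes big-endian, regardless of
     * host byte order or buffer alignment. */
    static uint32_t load_be32(const void *p)
    {
        const uint8_t *b = p;
        return ((uint32_t)b[0] << 24) | ((uint32_t)b[1] << 16) |
               ((uint32_t)b[2] << 8) | (uint32_t)b[3];
    }

    static const char *classify(const uint8_t *buf)
    {
        switch (load_be32(buf)) {
        case SK_VMDK3_MAGIC: return "COWD sparse extent";
        case SK_VMDK4_MAGIC: return "VMDK4 sparse extent";
        default:             return "descriptor file (or not VMDK at all)";
        }
    }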
static int vmdk_open(BlockDriverState *bs, QDict *options, int flags) QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, "vmdk", bs->device_name, "live migration"); migrate_add_blocker(s->migration_blocker); - + g_free(buf); return 0; fail: + g_free(buf); + g_free(s->create_type); + s->create_type = NULL; vmdk_free_extents(bs); return ret; } + +static int vmdk_refresh_limits(BlockDriverState *bs) +{ + BDRVVmdkState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_extents; i++) { + if (!s->extents[i].flat) { + bs->bl.write_zeroes_alignment = + MAX(bs->bl.write_zeroes_alignment, + s->extents[i].cluster_sectors); + } + } + + return 0; +} + static int get_whole_cluster(BlockDriverState *bs, VmdkExtent *extent, uint64_t cluster_offset, @@ -1039,7 +1155,7 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { BDRVVmdkState *s = bs->opaque; @@ -1056,7 +1172,24 @@ static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs, sector_num * 512, 0, &offset); qemu_co_mutex_unlock(&s->lock); - ret = (ret == VMDK_OK || ret == VMDK_ZEROED); + switch (ret) { + case VMDK_ERROR: + ret = -EIO; + break; + case VMDK_UNALLOC: + ret = 0; + break; + case VMDK_ZEROED: + ret = BDRV_BLOCK_ZERO; + break; + case VMDK_OK: + ret = BDRV_BLOCK_DATA; + if (extent->file == bs->file && !extent->compressed) { + ret |= BDRV_BLOCK_OFFSET_VALID | offset; + } + + break; + } index_in_cluster = sector_num % extent->cluster_sectors; n = extent->cluster_sectors - index_in_cluster; @@ -1254,15 +1387,14 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, { BDRVVmdkState *s = bs->opaque; VmdkExtent *extent = NULL; - int n, ret; - int64_t index_in_cluster; + int ret; + int64_t index_in_cluster, n; uint64_t extent_begin_sector, extent_relative_sector_num; uint64_t cluster_offset; VmdkMetaData m_data; if (sector_num > bs->total_sectors) { - fprintf(stderr, - "(VMDK) Wrong offset: sector_num=0x%" PRIx64 + error_report("Wrong offset: sector_num=0x%" PRIx64 " total_sectors=0x%" PRIx64 "\n", sector_num, bs->total_sectors); return -EIO; @@ -1282,9 +1414,8 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ - fprintf(stderr, - "VMDK: can't write to allocated cluster" - " for streamOptimized\n"); + error_report("Could not write to allocated cluster" + " for streamOptimized"); return -EIO; } else { /* allocate */ @@ -1366,7 +1497,8 @@ static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num, static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) + int nb_sectors, + BdrvRequestFlags flags) { int ret; BDRVVmdkState *s = bs->opaque; @@ -1381,25 +1513,36 @@ static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, return ret; } - static int vmdk_create_extent(const char *filename, int64_t filesize, - bool flat, bool compress, bool zeroed_grain) + bool flat, bool compress, bool zeroed_grain, + Error **errp) { int ret, i; - int fd = 0; + BlockDriverState *bs = NULL; VMDK4Header header; - uint32_t tmp, magic, grains, gd_size, gt_size, gt_count; + Error *local_err; + uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count; + uint32_t *gd_buf = NULL; + int gd_buf_size; + + ret = bdrv_create_file(filename, NULL, &local_err); + if (ret < 0) 
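The switch above maps the driver's internal states onto the bdrv_co_get_block_status() return value, which packs flag bits into the low bits of an int64_t and, when the offset-valid flag is set, the 512-byte-aligned host offset into the rest; that is why `ret |= BDRV_BLOCK_OFFSET_VALID | offset` is safe only for sector-aligned offsets. The exact flag values belong to QEMU's block.h; the ones below are illustrative, but the packing trick is the point:

    #include <assert.h>
    #include <stdint.h>

    enum {
        SK_BLOCK_DATA         = 0x01,   /* illustrative values */
        SK_BLOCK_ZERO         = 0x02,
        SK_BLOCK_OFFSET_VALID = 0x04,
    };

    /* Flags fit below bit 9 because a valid host offset is always a
     * multiple of the 512-byte sector size. */
    static int64_t pack_status(int64_t host_offset, int flags)
    {
        assert((host_offset & 511) == 0);
        return host_offset | flags;
    }

    static int64_t status_offset(int64_t status)
    {
        return status & ~(int64_t)511;  /* strip the flag bits */
    }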
{ + error_propagate(errp, local_err); + goto exit; + } - fd = qemu_open(filename, - O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, - 0644); - if (fd < 0) { - return -errno; + assert(bs == NULL); + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + NULL, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; } + if (flat) { - ret = ftruncate(fd, filesize); + ret = bdrv_truncate(bs, filesize); if (ret < 0) { - ret = -errno; + error_setg_errno(errp, -ret, "Could not truncate file"); } goto exit; } @@ -1410,24 +1553,23 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0) | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0); header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0; - header.capacity = filesize / 512; + header.capacity = filesize / BDRV_SECTOR_SIZE; header.granularity = 128; - header.num_gtes_per_gt = 512; + header.num_gtes_per_gt = BDRV_SECTOR_SIZE; - grains = (filesize / 512 + header.granularity - 1) / header.granularity; - gt_size = ((header.num_gtes_per_gt * sizeof(uint32_t)) + 511) >> 9; - gt_count = - (grains + header.num_gtes_per_gt - 1) / header.num_gtes_per_gt; - gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9; + grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity); + gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t), + BDRV_SECTOR_SIZE); + gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt); + gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE); header.desc_offset = 1; header.desc_size = 20; header.rgd_offset = header.desc_offset + header.desc_size; - header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count); + header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count); header.grain_offset = - ((header.gd_offset + gd_size + (gt_size * gt_count) + - header.granularity - 1) / header.granularity) * - header.granularity; + ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count), + header.granularity); /* swap endianness for all header fields */ header.version = cpu_to_le32(header.version); header.flags = cpu_to_le32(header.flags); @@ -1447,58 +1589,65 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, header.check_bytes[3] = 0xa; /* write all the data */ - ret = qemu_write_full(fd, &magic, sizeof(magic)); - if (ret != sizeof(magic)) { - ret = -errno; + ret = bdrv_pwrite(bs, 0, &magic, sizeof(magic)); + if (ret < 0) { + error_set(errp, QERR_IO_ERROR); goto exit; } - ret = qemu_write_full(fd, &header, sizeof(header)); - if (ret != sizeof(header)) { - ret = -errno; + ret = bdrv_pwrite(bs, sizeof(magic), &header, sizeof(header)); + if (ret < 0) { + error_set(errp, QERR_IO_ERROR); goto exit; } - ret = ftruncate(fd, le64_to_cpu(header.grain_offset) << 9); + ret = bdrv_truncate(bs, le64_to_cpu(header.grain_offset) << 9); if (ret < 0) { - ret = -errno; + error_setg_errno(errp, -ret, "Could not truncate file"); goto exit; } /* write grain directory */ - lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET); - for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_size; + gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE; + gd_buf = g_malloc0(gd_buf_size); + for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors; i < gt_count; i++, tmp += gt_size) { - ret = qemu_write_full(fd, &tmp, sizeof(tmp)); - if (ret != sizeof(tmp)) { - ret = -errno; - goto exit; - } + gd_buf[i] = cpu_to_le32(tmp); + } + ret = bdrv_pwrite(bs, 
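The DIV_ROUND_UP/ROUND_UP rewrite above keeps the original layout arithmetic but makes it readable, and plugging in a 1 GiB image gives concrete numbers: 16384 grains, 32 grain tables of 4 sectors each, a 1-sector grain directory at sector 150, and the first data grain rounded up to sector 384. A throwaway check of that layout (macros inlined so it compiles stand-alone):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
    #define ROUND_UP(n, d)     (DIV_ROUND_UP(n, d) * (d))

    int main(void)
    {
        uint64_t filesize = 1ULL << 30;            /* 1 GiB example image */
        uint64_t granularity = 128;                /* sectors per grain */
        uint64_t num_gtes_per_gt = 512;

        uint64_t grains     = DIV_ROUND_UP(filesize / 512, granularity);
        uint64_t gt_size    = DIV_ROUND_UP(num_gtes_per_gt * 4, 512);
        uint64_t gt_count   = DIV_ROUND_UP(grains, num_gtes_per_gt);
        uint64_t gd_sectors = DIV_ROUND_UP(gt_count * 4, 512);

        uint64_t rgd_offset = 1 + 20;              /* desc_offset + desc_size */
        uint64_t gd_offset  = rgd_offset + gd_sectors + gt_size * gt_count;
        uint64_t grain_offset =
            ROUND_UP(gd_offset + gd_sectors + gt_size * gt_count, granularity);

        printf("grains=%" PRIu64 " gt_count=%" PRIu64
               " gd@%" PRIu64 " data@%" PRIu64 "\n",
               grains, gt_count, gd_offset, grain_offset);
        return 0;
    }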
le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, + gd_buf, gd_buf_size); + if (ret < 0) { + error_set(errp, QERR_IO_ERROR); + goto exit; } /* write backup grain directory */ - lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET); - for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_size; + for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors; i < gt_count; i++, tmp += gt_size) { - ret = qemu_write_full(fd, &tmp, sizeof(tmp)); - if (ret != sizeof(tmp)) { - ret = -errno; - goto exit; - } + gd_buf[i] = cpu_to_le32(tmp); + } + ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, + gd_buf, gd_buf_size); + if (ret < 0) { + error_set(errp, QERR_IO_ERROR); + goto exit; } ret = 0; - exit: - qemu_close(fd); +exit: + if (bs) { + bdrv_unref(bs); + } + g_free(gd_buf); return ret; } static int filename_decompose(const char *filename, char *path, char *prefix, - char *postfix, size_t buf_len) + char *postfix, size_t buf_len, Error **errp) { const char *p, *q; if (filename == NULL || !strlen(filename)) { - fprintf(stderr, "Vmdk: no filename provided.\n"); + error_setg(errp, "No filename provided"); return VMDK_ERROR; } p = strrchr(filename, '/'); @@ -1532,10 +1681,13 @@ static int filename_decompose(const char *filename, char *path, char *prefix, return VMDK_OK; } -static int vmdk_create(const char *filename, QEMUOptionParameter *options) +static int vmdk_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { - int fd, idx = 0; - char desc[BUF_SIZE]; + int idx = 0; + BlockDriverState *new_bs = NULL; + Error *local_err; + char *desc = NULL; int64_t total_size = 0, filesize; const char *adapter_type = NULL; const char *backing_file = NULL; @@ -1543,7 +1695,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) int flags = 0; int ret = 0; bool flat, split, compress; - char ext_desc_lines[BUF_SIZE] = ""; + GString *ext_desc_lines; char path[PATH_MAX], prefix[PATH_MAX], postfix[PATH_MAX]; const int64_t split_size = 0x80000000; /* VMDK has constant split size */ const char *desc_extent_line; @@ -1551,6 +1703,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) uint32_t parent_cid = 0xffffffff; uint32_t number_heads = 16; bool zeroed_grain = false; + uint32_t desc_offset = 0, desc_len; const char desc_template[] = "# Disk DescriptorFile\n" "version=1\n" @@ -1571,8 +1724,11 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) "ddb.geometry.sectors = \"63\"\n" "ddb.adapterType = \"%s\"\n"; - if (filename_decompose(filename, path, prefix, postfix, PATH_MAX)) { - return -EINVAL; + ext_desc_lines = g_string_new(NULL); + + if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) { + ret = -EINVAL; + goto exit; } /* Read out options */ while (options && options->name) { @@ -1597,8 +1753,9 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) strcmp(adapter_type, "buslogic") && strcmp(adapter_type, "lsilogic") && strcmp(adapter_type, "legacyESX")) { - fprintf(stderr, "VMDK: Unknown adapter type: '%s'.\n", adapter_type); - return -EINVAL; + error_setg(errp, "Unknown adapter type: '%s'", adapter_type); + ret = -EINVAL; + goto exit; } if (strcmp(adapter_type, "ide") != 0) { /* that's the number of heads with which vmware operates when @@ -1613,8 +1770,9 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) strcmp(fmt, "twoGbMaxExtentSparse") && strcmp(fmt, "twoGbMaxExtentFlat") && strcmp(fmt, "streamOptimized")) { - 
fprintf(stderr, "VMDK: Unknown subformat: %s\n", fmt); - return -EINVAL; + error_setg(errp, "Unknown subformat: '%s'", fmt); + ret = -EINVAL; + goto exit; } split = !(strcmp(fmt, "twoGbMaxExtentFlat") && strcmp(fmt, "twoGbMaxExtentSparse")); @@ -1627,22 +1785,29 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) desc_extent_line = "RW %lld SPARSE \"%s\"\n"; } if (flat && backing_file) { - /* not supporting backing file for flat image */ - return -ENOTSUP; + error_setg(errp, "Flat image can't have backing file"); + ret = -ENOTSUP; + goto exit; + } + if (flat && zeroed_grain) { + error_setg(errp, "Flat image can't enable zeroed grain"); + ret = -ENOTSUP; + goto exit; } if (backing_file) { - BlockDriverState *bs = bdrv_new(""); - ret = bdrv_open(bs, backing_file, NULL, 0, NULL); + BlockDriverState *bs = NULL; + ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_NO_BACKING, NULL, + errp); if (ret != 0) { - bdrv_delete(bs); - return ret; + goto exit; } if (strcmp(bs->drv->format_name, "vmdk")) { - bdrv_delete(bs); - return -EINVAL; + bdrv_unref(bs); + ret = -EINVAL; + goto exit; } parent_cid = vmdk_read_cid(bs, 0); - bdrv_delete(bs); + bdrv_unref(bs); snprintf(parent_desc_line, sizeof(parent_desc_line), "parentFileNameHint=\"%s\"", backing_file); } @@ -1672,51 +1837,66 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) path, desc_filename); if (vmdk_create_extent(ext_filename, size, - flat, compress, zeroed_grain)) { - return -EINVAL; + flat, compress, zeroed_grain, errp)) { + ret = -EINVAL; + goto exit; } filesize -= size; /* Format description line */ snprintf(desc_line, sizeof(desc_line), - desc_extent_line, size / 512, desc_filename); - pstrcat(ext_desc_lines, sizeof(ext_desc_lines), desc_line); + desc_extent_line, size / BDRV_SECTOR_SIZE, desc_filename); + g_string_append(ext_desc_lines, desc_line); } /* generate descriptor file */ - snprintf(desc, sizeof(desc), desc_template, - (unsigned int)time(NULL), - parent_cid, - fmt, - parent_desc_line, - ext_desc_lines, - (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4), - total_size / (int64_t)(63 * number_heads * 512), number_heads, - adapter_type); - if (split || flat) { - fd = qemu_open(filename, - O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, - 0644); + desc = g_strdup_printf(desc_template, + (unsigned int)time(NULL), + parent_cid, + fmt, + parent_desc_line, + ext_desc_lines->str, + (flags & BLOCK_FLAG_COMPAT6 ? 
6 : 4), + total_size / + (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE), + number_heads, + adapter_type); + desc_len = strlen(desc); + /* the descriptor offset = 0x200 */ + if (!split && !flat) { + desc_offset = 0x200; } else { - fd = qemu_open(filename, - O_WRONLY | O_BINARY | O_LARGEFILE, - 0644); - } - if (fd < 0) { - return -errno; + ret = bdrv_create_file(filename, options, &local_err); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not create image file"); + goto exit; + } } - /* the descriptor offset = 0x200 */ - if (!split && !flat && 0x200 != lseek(fd, 0x200, SEEK_SET)) { - ret = -errno; + assert(new_bs == NULL); + ret = bdrv_open(&new_bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write description"); goto exit; } - ret = qemu_write_full(fd, desc, strlen(desc)); - if (ret != strlen(desc)) { - ret = -errno; + ret = bdrv_pwrite(new_bs, desc_offset, desc, desc_len); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write description"); goto exit; } - ret = 0; + /* bdrv_pwrite write padding zeros to align to sector, we don't need that + * for description file */ + if (desc_offset == 0) { + ret = bdrv_truncate(new_bs, desc_len); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not truncate file"); + } + } exit: - qemu_close(fd); + if (new_bs) { + bdrv_unref(new_bs); + } + g_free(desc); + g_string_free(ext_desc_lines, true); return ret; } @@ -1725,6 +1905,7 @@ static void vmdk_close(BlockDriverState *bs) BDRVVmdkState *s = bs->opaque; vmdk_free_extents(bs); + g_free(s->create_type); migrate_del_blocker(s->migration_blocker); error_free(s->migration_blocker); @@ -1786,6 +1967,101 @@ static int vmdk_has_zero_init(BlockDriverState *bs) return 1; } +static ImageInfo *vmdk_get_extent_info(VmdkExtent *extent) +{ + ImageInfo *info = g_new0(ImageInfo, 1); + + *info = (ImageInfo){ + .filename = g_strdup(extent->file->filename), + .format = g_strdup(extent->type), + .virtual_size = extent->sectors * BDRV_SECTOR_SIZE, + .compressed = extent->compressed, + .has_compressed = extent->compressed, + .cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE, + .has_cluster_size = !extent->flat, + }; + + return info; +} + +static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) +{ + BDRVVmdkState *s = bs->opaque; + VmdkExtent *extent = NULL; + int64_t sector_num = 0; + int64_t total_sectors = bdrv_getlength(bs) / BDRV_SECTOR_SIZE; + int ret; + uint64_t cluster_offset; + + if (fix) { + return -ENOTSUP; + } + + for (;;) { + if (sector_num >= total_sectors) { + return 0; + } + extent = find_extent(s, sector_num, extent); + if (!extent) { + fprintf(stderr, + "ERROR: could not find extent for sector %" PRId64 "\n", + sector_num); + break; + } + ret = get_cluster_offset(bs, extent, NULL, + sector_num << BDRV_SECTOR_BITS, + 0, &cluster_offset); + if (ret == VMDK_ERROR) { + fprintf(stderr, + "ERROR: could not get cluster_offset for sector %" + PRId64 "\n", sector_num); + break; + } + if (ret == VMDK_OK && cluster_offset >= bdrv_getlength(extent->file)) { + fprintf(stderr, + "ERROR: cluster offset for sector %" + PRId64 " points after EOF\n", sector_num); + break; + } + sector_num += extent->cluster_sectors; + } + + result->corruptions++; + return 0; +} + +static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs) +{ + int i; + BDRVVmdkState *s = bs->opaque; + ImageInfoSpecific *spec_info = g_new0(ImageInfoSpecific, 1); + ImageInfoList **next; + + 
*spec_info = (ImageInfoSpecific){ + .kind = IMAGE_INFO_SPECIFIC_KIND_VMDK, + { + .vmdk = g_new0(ImageInfoSpecificVmdk, 1), + }, + }; + + *spec_info->vmdk = (ImageInfoSpecificVmdk) { + .create_type = g_strdup(s->create_type), + .cid = s->cid, + .parent_cid = s->parent_cid, + }; + + next = &spec_info->vmdk->extents; + for (i = 0; i < s->num_extents; i++) { + *next = g_new0(ImageInfoList, 1); + (*next)->value = vmdk_get_extent_info(&s->extents[i]); + (*next)->next = NULL; + next = &(*next)->next; + } + + return spec_info; +} + static QEMUOptionParameter vmdk_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -1828,6 +2104,7 @@ static BlockDriver bdrv_vmdk = { .instance_size = sizeof(BDRVVmdkState), .bdrv_probe = vmdk_probe, .bdrv_open = vmdk_open, + .bdrv_check = vmdk_check, .bdrv_reopen_prepare = vmdk_reopen_prepare, .bdrv_read = vmdk_co_read, .bdrv_write = vmdk_co_write, @@ -1835,9 +2112,11 @@ static BlockDriver bdrv_vmdk = { .bdrv_close = vmdk_close, .bdrv_create = vmdk_create, .bdrv_co_flush_to_disk = vmdk_co_flush, - .bdrv_co_is_allocated = vmdk_co_is_allocated, + .bdrv_co_get_block_status = vmdk_co_get_block_status, .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size, .bdrv_has_zero_init = vmdk_has_zero_init, + .bdrv_get_specific_info = vmdk_get_specific_info, + .bdrv_refresh_limits = vmdk_refresh_limits, .create_options = vmdk_create_options, }; diff --git a/block/vpc.c b/block/vpc.c index fe4f311d5..2e25f5723 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -45,8 +45,10 @@ enum vhd_type { // Seconds since Jan 1, 2000 0:00:00 (UTC) #define VHD_TIMESTAMP_BASE 946684800 +#define VHD_MAX_SECTORS (65535LL * 255 * 255) + // always big-endian -struct vhd_footer { +typedef struct vhd_footer { char creator[8]; // "conectix" uint32_t features; uint32_t version; @@ -79,9 +81,9 @@ struct vhd_footer { uint8_t uuid[16]; uint8_t in_saved_state; -}; +} QEMU_PACKED VHDFooter; -struct vhd_dyndisk_header { +typedef struct vhd_dyndisk_header { char magic[8]; // "cxsparse" // Offset of next header structure, 0xFFFFFFFF if none @@ -111,7 +113,7 @@ struct vhd_dyndisk_header { uint32_t reserved; uint64_t data_offset; } parent_locator[8]; -}; +} QEMU_PACKED VHDDynDiskHeader; typedef struct BDRVVPCState { CoMutex lock; @@ -155,14 +157,16 @@ static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int vpc_open(BlockDriverState *bs, QDict *options, int flags) +static int vpc_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVVPCState *s = bs->opaque; int i; - struct vhd_footer* footer; - struct vhd_dyndisk_header* dyndisk_header; + VHDFooter *footer; + VHDDynDiskHeader *dyndisk_header; uint8_t buf[HEADER_SIZE]; uint32_t checksum; + uint64_t computed_size; int disk_type = VHD_DYNAMIC; int ret; @@ -171,7 +175,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } - footer = (struct vhd_footer*) s->footer_buf; + footer = (VHDFooter *) s->footer_buf; if (strncmp(footer->creator, "conectix", 8)) { int64_t offset = bdrv_getlength(bs->file); if (offset < 0) { @@ -189,7 +193,8 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } if (strncmp(footer->creator, "conectix", 8)) { - ret = -EMEDIUMTYPE; + error_setg(errp, "invalid VPC image"); + ret = -EINVAL; goto fail; } disk_type = VHD_FIXED; @@ -210,8 +215,17 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) bs->total_sectors = (int64_t) be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl; + /* images 
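The extents loop just above uses the pointer-to-pointer idiom for building a singly linked list in order: `next` always names the link field to patch, so the empty-list case needs no special handling and no reversal pass is required afterwards. Reduced to its essentials:

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct Node {
        int value;
        struct Node *next;
    } Node;

    int main(void)
    {
        Node *head = NULL;
        Node **tail = &head;        /* points at the link to patch next */

        for (int i = 0; i < 3; i++) {
            *tail = calloc(1, sizeof(Node));
            (*tail)->value = i;
            tail = &(*tail)->next;  /* advance to the new tail link */
        }
        for (Node *n = head; n; n = n->next) {
            printf("%d\n", n->value);   /* 0 1 2, in insertion order */
        }
        return 0;                   /* cleanup omitted in this sketch */
    }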
created with disk2vhd report a far higher virtual size + * than expected with the cyls * heads * sectors_per_cyl formula. + * use the footer->size instead if the image was created with + * disk2vhd. + */ + if (!strncmp(footer->creator_app, "d2v", 4)) { + bs->total_sectors = be64_to_cpu(footer->size) / BDRV_SECTOR_SIZE; + } + /* Allow a maximum disk size of approximately 2 TB */ - if (bs->total_sectors >= 65535LL * 255 * 255) { + if (bs->total_sectors >= VHD_MAX_SECTORS) { ret = -EFBIG; goto fail; } @@ -223,7 +237,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } - dyndisk_header = (struct vhd_dyndisk_header *) buf; + dyndisk_header = (VHDDynDiskHeader *) buf; if (strncmp(dyndisk_header->magic, "cxsparse", 8)) { ret = -EINVAL; @@ -231,10 +245,31 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) } s->block_size = be32_to_cpu(dyndisk_header->block_size); + if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) { + error_setg(errp, "Invalid block size %" PRIu32, s->block_size); + ret = -EINVAL; + goto fail; + } s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511; s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries); - s->pagetable = g_malloc(s->max_table_entries * 4); + + if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) { + ret = -EINVAL; + goto fail; + } + if (s->max_table_entries > (VHD_MAX_SECTORS * 512) / s->block_size) { + ret = -EINVAL; + goto fail; + } + + computed_size = (uint64_t) s->max_table_entries * s->block_size; + if (computed_size < bs->total_sectors * 512) { + ret = -EINVAL; + goto fail; + } + + s->pagetable = qemu_blockalign(bs, s->max_table_entries * 4); s->bat_offset = be64_to_cpu(dyndisk_header->table_offset); @@ -259,6 +294,13 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) } } + if (s->free_data_block_offset > bdrv_getlength(bs->file)) { + error_setg(errp, "block-vpc: free_data_block_offset points after " + "the end of file. 
The image has been truncated."); + ret = -EINVAL; + goto fail; + } + s->last_bitmap_offset = (int64_t) -1; #ifdef CACHE @@ -280,7 +322,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) return 0; fail: - g_free(s->pagetable); + qemu_vfree(s->pagetable); #ifdef CACHE g_free(s->pageentry_u8); #endif @@ -438,6 +480,19 @@ fail: return -1; } +static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVVPCState *s = (BDRVVPCState *)bs->opaque; + VHDFooter *footer = (VHDFooter *) s->footer_buf; + + if (cpu_to_be32(footer->type) != VHD_FIXED) { + bdi->cluster_size = s->block_size; + } + + bdi->unallocated_blocks_are_zero = true; + return 0; +} + static int vpc_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors) { @@ -445,7 +500,7 @@ static int vpc_read(BlockDriverState *bs, int64_t sector_num, int ret; int64_t offset; int64_t sectors, sectors_per_block; - struct vhd_footer *footer = (struct vhd_footer *) s->footer_buf; + VHDFooter *footer = (VHDFooter *) s->footer_buf; if (cpu_to_be32(footer->type) == VHD_FIXED) { return bdrv_read(bs->file, sector_num, buf, nb_sectors); @@ -494,7 +549,7 @@ static int vpc_write(BlockDriverState *bs, int64_t sector_num, int64_t offset; int64_t sectors, sectors_per_block; int ret; - struct vhd_footer *footer = (struct vhd_footer *) s->footer_buf; + VHDFooter *footer = (VHDFooter *) s->footer_buf; if (cpu_to_be32(footer->type) == VHD_FIXED) { return bdrv_write(bs->file, sector_num, buf, nb_sectors); @@ -596,8 +651,8 @@ static int calculate_geometry(int64_t total_sectors, uint16_t* cyls, static int create_dynamic_disk(int fd, uint8_t *buf, int64_t total_sectors) { - struct vhd_dyndisk_header* dyndisk_header = - (struct vhd_dyndisk_header*) buf; + VHDDynDiskHeader *dyndisk_header = + (VHDDynDiskHeader *) buf; size_t block_size, num_bat_entries; int i; int ret = -EIO; @@ -683,10 +738,11 @@ static int create_fixed_disk(int fd, uint8_t *buf, int64_t total_size) return ret; } -static int vpc_create(const char *filename, QEMUOptionParameter *options) +static int vpc_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { uint8_t buf[1024]; - struct vhd_footer *footer = (struct vhd_footer *) buf; + VHDFooter *footer = (VHDFooter *) buf; QEMUOptionParameter *disk_type_param; int fd, i; uint16_t cyls = 0; @@ -789,7 +845,7 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options) static int vpc_has_zero_init(BlockDriverState *bs) { BDRVVPCState *s = bs->opaque; - struct vhd_footer *footer = (struct vhd_footer *) s->footer_buf; + VHDFooter *footer = (VHDFooter *) s->footer_buf; if (cpu_to_be32(footer->type) == VHD_FIXED) { return bdrv_has_zero_init(bs->file); @@ -801,7 +857,7 @@ static int vpc_has_zero_init(BlockDriverState *bs) static void vpc_close(BlockDriverState *bs) { BDRVVPCState *s = bs->opaque; - g_free(s->pagetable); + qemu_vfree(s->pagetable); #ifdef CACHE g_free(s->pageentry_u8); #endif @@ -839,6 +895,8 @@ static BlockDriver bdrv_vpc = { .bdrv_read = vpc_co_read, .bdrv_write = vpc_co_write, + .bdrv_get_info = vpc_get_info, + .create_options = vpc_create_options, .bdrv_has_zero_init = vpc_has_zero_init, }; diff --git a/block/vvfat.c b/block/vvfat.c index cd3b8edd9..1978c9ed6 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -266,8 +266,7 @@ typedef struct mbr_t { } QEMU_PACKED mbr_t; typedef struct direntry_t { - uint8_t name[8]; - uint8_t extension[3]; + uint8_t name[8 + 3]; uint8_t attributes; uint8_t reserved[2]; uint16_t ctime; @@ -518,11 +517,9 @@ static inline 
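The vpc.c bounds above are all small arithmetic facts: the CHS footer fields cap a VHD at 65535 x 255 x 255 sectors (the "approximately 2 TB" in the comment), and each dynamic-disk block carries a sector bitmap padded to a 512-byte boundary, which for a 2 MiB block works out to exactly one sector. A quick verification (2 MiB is the common default block size, an assumption here rather than something this hunk states):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t max_sectors = 65535ULL * 255 * 255;   /* VHD_MAX_SECTORS */
        printf("cap: %" PRIu64 " sectors = %.2f TiB\n",
               max_sectors, max_sectors * 512.0 / (1ULL << 40));

        uint32_t block_size = 2 * 1024 * 1024;         /* assumed default */
        uint32_t bitmap_size = ((block_size / (8 * 512)) + 511) & ~511u;
        printf("bitmap per block: %" PRIu32 " bytes\n", bitmap_size);
        return 0;
    }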
uint8_t fat_chksum(const direntry_t* entry) uint8_t chksum=0; int i; - for(i=0;i<11;i++) { - unsigned char c; - - c = (i < 8) ? entry->name[i] : entry->extension[i-8]; - chksum=(((chksum&0xfe)>>1)|((chksum&0x01)?0x80:0)) + c; + for (i = 0; i < ARRAY_SIZE(entry->name); i++) { + chksum = (((chksum & 0xfe) >> 1) | + ((chksum & 0x01) ? 0x80 : 0)) + entry->name[i]; } return chksum; @@ -617,7 +614,7 @@ static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s, if(is_dot) { entry=array_get_next(&(s->directory)); - memset(entry->name,0x20,11); + memset(entry->name, 0x20, sizeof(entry->name)); memcpy(entry->name,filename,strlen(filename)); return entry; } @@ -632,12 +629,14 @@ static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s, i = 8; entry=array_get_next(&(s->directory)); - memset(entry->name,0x20,11); + memset(entry->name, 0x20, sizeof(entry->name)); memcpy(entry->name, filename, i); - if(j > 0) - for (i = 0; i < 3 && filename[j+1+i]; i++) - entry->extension[i] = filename[j+1+i]; + if (j > 0) { + for (i = 0; i < 3 && filename[j + 1 + i]; i++) { + entry->name[8 + i] = filename[j + 1 + i]; + } + } /* upcase & remove unwanted characters */ for(i=10;i>=0;i--) { @@ -861,8 +860,7 @@ static int init_directories(BDRVVVFATState* s, { direntry_t* entry=array_get_next(&(s->directory)); entry->attributes=0x28; /* archive | volume label */ - memcpy(entry->name,"QEMU VVF",8); - memcpy(entry->extension,"AT ",3); + memcpy(entry->name, "QEMU VVFAT ", sizeof(entry->name)); } /* Now build FAT, and write back information into directory */ @@ -1065,7 +1063,8 @@ static void vvfat_parse_filename(const char *filename, QDict *options, qdict_put(options, "rw", qbool_from_int(rw)); } -static int vvfat_open(BlockDriverState *bs, QDict *options, int flags) +static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVVVFATState *s = bs->opaque; int cyls, heads, secs; @@ -1084,19 +1083,17 @@ DLOG(if (stderr == NULL) { setbuf(stderr, NULL); }) - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } dirname = qemu_opt_get(opts, "dir"); if (!dirname) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, "vvfat block driver requires " - "a 'dir' option"); + error_setg(errp, "vvfat block driver requires a 'dir' option"); ret = -EINVAL; goto fail; } @@ -1122,6 +1119,7 @@ DLOG(if (stderr == NULL) { if (!s->fat_type) { s->fat_type = 16; } + s->first_sectors_number = 0x40; cyls = s->fat_type == 12 ? 
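Merging name[8] and extension[3] into a single name[8 + 3] field is what lets the checksum loop above run over ARRAY_SIZE(entry->name); the algorithm itself is the standard VFAT short-name checksum, a rotate-right-by-one followed by adding the next of the 11 bytes. Stand-alone:

    #include <stdint.h>
    #include <stdio.h>

    static uint8_t fat_chksum_sketch(const uint8_t name[11])
    {
        uint8_t sum = 0;

        for (int i = 0; i < 11; i++) {
            /* truncation to uint8_t turns this into an 8-bit rotate */
            sum = ((sum >> 1) | (sum << 7)) + name[i];
        }
        return sum;
    }

    int main(void)
    {
        /* 8.3 form of "README.TXT": name padded to 8, extension to 3 */
        const uint8_t name[11 + 1] = "README  TXT";

        printf("checksum: 0x%02x\n", fat_chksum_sketch(name));
        return 0;
    }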
64 : 1024; heads = 16; secs = 63; @@ -1136,8 +1134,7 @@ DLOG(if (stderr == NULL) { case 12: break; default: - qerror_report(ERROR_CLASS_GENERIC_ERROR, "Valid FAT types are only " - "12, 16 and 32"); + error_setg(errp, "Valid FAT types are only 12, 16 and 32"); ret = -EINVAL; goto fail; } @@ -1150,7 +1147,6 @@ DLOG(if (stderr == NULL) { s->current_cluster=0xffffffff; - s->first_sectors_number=0x40; /* read only is the default for safety */ bs->read_only = 1; s->qcow = s->write_target = NULL; @@ -1590,17 +1586,20 @@ static int parse_short_name(BDRVVVFATState* s, lfn->name[i] = direntry->name[i]; } - for (j = 2; j >= 0 && direntry->extension[j] == ' '; j--); + for (j = 2; j >= 0 && direntry->name[8 + j] == ' '; j--) { + } if (j >= 0) { lfn->name[i++] = '.'; lfn->name[i + j + 1] = '\0'; for (;j >= 0; j--) { - if (direntry->extension[j] <= ' ' || direntry->extension[j] > 0x7f) - return -2; - else if (s->downcase_short_names) - lfn->name[i + j] = qemu_tolower(direntry->extension[j]); - else - lfn->name[i + j] = direntry->extension[j]; + uint8_t c = direntry->name[8 + j]; + if (c <= ' ' || c > 0x7f) { + return -2; + } else if (s->downcase_short_names) { + lfn->name[i + j] = qemu_tolower(c); + } else { + lfn->name[i + j] = c; + } } } else lfn->name[i + j + 1] = '\0'; @@ -2874,16 +2873,17 @@ static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num, return ret; } -static int coroutine_fn vvfat_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn vvfat_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int* n) { BDRVVVFATState* s = bs->opaque; *n = s->sector_count - sector_num; - if (*n > nb_sectors) - *n = nb_sectors; - else if (*n < 0) - return 0; - return 1; + if (*n > nb_sectors) { + *n = nb_sectors; + } else if (*n < 0) { + return 0; + } + return BDRV_BLOCK_DATA; } static int write_target_commit(BlockDriverState *bs, int64_t sector_num, @@ -2894,7 +2894,7 @@ static int write_target_commit(BlockDriverState *bs, int64_t sector_num, static void write_target_close(BlockDriverState *bs) { BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); - bdrv_delete(s->qcow); + bdrv_unref(s->qcow); g_free(s->qcow_filename); } @@ -2908,6 +2908,7 @@ static int enable_write_target(BDRVVVFATState *s) { BlockDriver *bdrv_qcow; QEMUOptionParameter *options; + Error *local_err = NULL; int ret; int size = sector2cluster(s, s->sector_count); s->used_clusters = calloc(size, 1); @@ -2925,17 +2926,20 @@ static int enable_write_target(BDRVVVFATState *s) set_option_parameter_int(options, BLOCK_OPT_SIZE, s->sector_count * 512); set_option_parameter(options, BLOCK_OPT_BACKING_FILE, "fat:"); - ret = bdrv_create(bdrv_qcow, s->qcow_filename, options); + ret = bdrv_create(bdrv_qcow, s->qcow_filename, options, &local_err); if (ret < 0) { + qerror_report_err(local_err); + error_free(local_err); goto err; } - s->qcow = bdrv_new(""); - - ret = bdrv_open(s->qcow, s->qcow_filename, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow); + s->qcow = NULL; + ret = bdrv_open(&s->qcow, s->qcow_filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow, + &local_err); if (ret < 0) { - bdrv_delete(s->qcow); + qerror_report_err(local_err); + error_free(local_err); goto err; } @@ -2943,7 +2947,7 @@ static int enable_write_target(BDRVVVFATState *s) unlink(s->qcow_filename); #endif - s->bs->backing_hd = calloc(sizeof(BlockDriverState), 1); + s->bs->backing_hd = bdrv_new(""); s->bs->backing_hd->drv = &vvfat_write_target; s->bs->backing_hd->opaque = 
g_malloc(sizeof(void*)); *(void**)s->bs->backing_hd->opaque = s; @@ -2984,7 +2988,7 @@ static BlockDriver bdrv_vvfat = { .bdrv_read = vvfat_co_read, .bdrv_write = vvfat_co_write, - .bdrv_co_is_allocated = vvfat_co_is_allocated, + .bdrv_co_get_block_status = vvfat_co_get_block_status, }; static void bdrv_vvfat_init(void) diff --git a/block/win32-aio.c b/block/win32-aio.c index fcb7c754d..5d1d199b6 100644 --- a/block/win32-aio.c +++ b/block/win32-aio.c @@ -105,13 +105,6 @@ static void win32_aio_completion_cb(EventNotifier *e) } } -static int win32_aio_flush_cb(EventNotifier *e) -{ - QEMUWin32AIOState *s = container_of(e, QEMUWin32AIOState, e); - - return (s->count > 0) ? 1 : 0; -} - static void win32_aio_cancel(BlockDriverAIOCB *blockacb) { QEMUWin32AIOCB *waiocb = (QEMUWin32AIOCB *)blockacb; @@ -201,8 +194,7 @@ QEMUWin32AIOState *win32_aio_init(void) goto out_close_efd; } - qemu_aio_set_event_notifier(&s->e, win32_aio_completion_cb, - win32_aio_flush_cb); + qemu_aio_set_event_notifier(&s->e, win32_aio_completion_cb); return s;
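The deleted flush callback illustrates the pattern the remaining completion callback still relies on: the EventNotifier is embedded in the AIO state, and container_of() walks back from the member pointer to the enclosing structure. In generic terms (QEMU defines its own container_of; the expansion below is the textbook one):

    #include <stddef.h>
    #include <stdio.h>

    #define container_of_sketch(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    typedef struct Notifier { int dummy; } Notifier;

    typedef struct AioState {
        int count;       /* in-flight requests, as in QEMUWin32AIOState */
        Notifier e;      /* embedded notifier handed to the event loop */
    } AioState;

    /* The callback receives only &state->e; recover the state from it. */
    static int pending(Notifier *n)
    {
        AioState *s = container_of_sketch(n, AioState, e);
        return s->count > 0;
    }

    int main(void)
    {
        AioState st = { .count = 2 };
        printf("%d\n", pending(&st.e));   /* prints 1 */
        return 0;
    }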