author     Anas Nashif <anas.nashif@intel.com>   2012-11-06 07:50:24 -0800
committer  Anas Nashif <anas.nashif@intel.com>   2012-11-06 07:50:24 -0800
commit     060629c6ef0b7e5c267d84c91600113264d33120 (patch)
tree       18fcb144ac71b9c4d08ee5d1dc58e2b16c109a5a /block
Imported Upstream version 1.2.0   (upstream/1.2.0)
Diffstat (limited to 'block')
-rw-r--r--  block/Makefile.objs        11
-rw-r--r--  block/blkdebug.c          473
-rw-r--r--  block/blkverify.c         366
-rw-r--r--  block/bochs.c             243
-rw-r--r--  block/cloop.c             195
-rw-r--r--  block/cow.c               356
-rw-r--r--  block/curl.c              624
-rw-r--r--  block/dmg.c               325
-rw-r--r--  block/iscsi.c            1086
-rw-r--r--  block/nbd.c               517
-rw-r--r--  block/parallels.c         170
-rw-r--r--  block/qcow.c              890
-rw-r--r--  block/qcow2-cache.c       323
-rw-r--r--  block/qcow2-cluster.c    1199
-rw-r--r--  block/qcow2-refcount.c   1240
-rw-r--r--  block/qcow2-snapshot.c    660
-rw-r--r--  block/qcow2.c            1719
-rw-r--r--  block/qcow2.h             336
-rw-r--r--  block/qed-check.c         248
-rw-r--r--  block/qed-cluster.c       165
-rw-r--r--  block/qed-gencb.c          32
-rw-r--r--  block/qed-l2-cache.c      187
-rw-r--r--  block/qed-table.c         297
-rw-r--r--  block/qed.c              1586
-rw-r--r--  block/qed.h               344
-rw-r--r--  block/raw-posix-aio.h      45
-rw-r--r--  block/raw-posix.c        1383
-rw-r--r--  block/raw-win32.c         431
-rw-r--r--  block/raw.c               145
-rw-r--r--  block/rbd.c               970
-rw-r--r--  block/sheepdog.c         2083
-rw-r--r--  block/stream.c            209
-rw-r--r--  block/vdi.c               786
-rw-r--r--  block/vmdk.c             1665
-rw-r--r--  block/vpc.c               799
-rw-r--r--  block/vvfat.c            2911
36 files changed, 25019 insertions, 0 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs
new file mode 100644
index 000000000..b5754d39b
--- /dev/null
+++ b/block/Makefile.objs
@@ -0,0 +1,11 @@
+block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
+block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
+block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
+block-obj-y += qed-check.o
+block-obj-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
+block-obj-y += stream.o
+block-obj-$(CONFIG_WIN32) += raw-win32.o
+block-obj-$(CONFIG_POSIX) += raw-posix.o
+block-obj-$(CONFIG_LIBISCSI) += iscsi.o
+block-obj-$(CONFIG_CURL) += curl.o
+block-obj-$(CONFIG_RBD) += rbd.o
diff --git a/block/blkdebug.c b/block/blkdebug.c
new file mode 100644
index 000000000..59dcea065
--- /dev/null
+++ b/block/blkdebug.c
@@ -0,0 +1,473 @@
+/*
+ * Block protocol for I/O error injection
+ *
+ * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+typedef struct BDRVBlkdebugState {
+ int state;
+ QLIST_HEAD(, BlkdebugRule) rules[BLKDBG_EVENT_MAX];
+ QSIMPLEQ_HEAD(, BlkdebugRule) active_rules;
+} BDRVBlkdebugState;
+
+typedef struct BlkdebugAIOCB {
+ BlockDriverAIOCB common;
+ QEMUBH *bh;
+ int ret;
+} BlkdebugAIOCB;
+
+static void blkdebug_aio_cancel(BlockDriverAIOCB *blockacb);
+
+static AIOPool blkdebug_aio_pool = {
+ .aiocb_size = sizeof(BlkdebugAIOCB),
+ .cancel = blkdebug_aio_cancel,
+};
+
+enum {
+ ACTION_INJECT_ERROR,
+ ACTION_SET_STATE,
+};
+
+typedef struct BlkdebugRule {
+ BlkDebugEvent event;
+ int action;
+ int state;
+ union {
+ struct {
+ int error;
+ int immediately;
+ int once;
+ int64_t sector;
+ } inject;
+ struct {
+ int new_state;
+ } set_state;
+ } options;
+ QLIST_ENTRY(BlkdebugRule) next;
+ QSIMPLEQ_ENTRY(BlkdebugRule) active_next;
+} BlkdebugRule;
+
+static QemuOptsList inject_error_opts = {
+ .name = "inject-error",
+ .head = QTAILQ_HEAD_INITIALIZER(inject_error_opts.head),
+ .desc = {
+ {
+ .name = "event",
+ .type = QEMU_OPT_STRING,
+ },
+ {
+ .name = "state",
+ .type = QEMU_OPT_NUMBER,
+ },
+ {
+ .name = "errno",
+ .type = QEMU_OPT_NUMBER,
+ },
+ {
+ .name = "sector",
+ .type = QEMU_OPT_NUMBER,
+ },
+ {
+ .name = "once",
+ .type = QEMU_OPT_BOOL,
+ },
+ {
+ .name = "immediately",
+ .type = QEMU_OPT_BOOL,
+ },
+ { /* end of list */ }
+ },
+};
+
+static QemuOptsList set_state_opts = {
+ .name = "set-state",
+ .head = QTAILQ_HEAD_INITIALIZER(set_state_opts.head),
+ .desc = {
+ {
+ .name = "event",
+ .type = QEMU_OPT_STRING,
+ },
+ {
+ .name = "state",
+ .type = QEMU_OPT_NUMBER,
+ },
+ {
+ .name = "new_state",
+ .type = QEMU_OPT_NUMBER,
+ },
+ { /* end of list */ }
+ },
+};
+
+static QemuOptsList *config_groups[] = {
+ &inject_error_opts,
+ &set_state_opts,
+ NULL
+};
+
+static const char *event_names[BLKDBG_EVENT_MAX] = {
+ [BLKDBG_L1_UPDATE] = "l1_update",
+ [BLKDBG_L1_GROW_ALLOC_TABLE] = "l1_grow.alloc_table",
+ [BLKDBG_L1_GROW_WRITE_TABLE] = "l1_grow.write_table",
+ [BLKDBG_L1_GROW_ACTIVATE_TABLE] = "l1_grow.activate_table",
+
+ [BLKDBG_L2_LOAD] = "l2_load",
+ [BLKDBG_L2_UPDATE] = "l2_update",
+ [BLKDBG_L2_UPDATE_COMPRESSED] = "l2_update_compressed",
+ [BLKDBG_L2_ALLOC_COW_READ] = "l2_alloc.cow_read",
+ [BLKDBG_L2_ALLOC_WRITE] = "l2_alloc.write",
+
+ [BLKDBG_READ_AIO] = "read_aio",
+ [BLKDBG_READ_BACKING_AIO] = "read_backing_aio",
+ [BLKDBG_READ_COMPRESSED] = "read_compressed",
+
+ [BLKDBG_WRITE_AIO] = "write_aio",
+ [BLKDBG_WRITE_COMPRESSED] = "write_compressed",
+
+ [BLKDBG_VMSTATE_LOAD] = "vmstate_load",
+ [BLKDBG_VMSTATE_SAVE] = "vmstate_save",
+
+ [BLKDBG_COW_READ] = "cow_read",
+ [BLKDBG_COW_WRITE] = "cow_write",
+
+ [BLKDBG_REFTABLE_LOAD] = "reftable_load",
+ [BLKDBG_REFTABLE_GROW] = "reftable_grow",
+
+ [BLKDBG_REFBLOCK_LOAD] = "refblock_load",
+ [BLKDBG_REFBLOCK_UPDATE] = "refblock_update",
+ [BLKDBG_REFBLOCK_UPDATE_PART] = "refblock_update_part",
+ [BLKDBG_REFBLOCK_ALLOC] = "refblock_alloc",
+ [BLKDBG_REFBLOCK_ALLOC_HOOKUP] = "refblock_alloc.hookup",
+ [BLKDBG_REFBLOCK_ALLOC_WRITE] = "refblock_alloc.write",
+ [BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS] = "refblock_alloc.write_blocks",
+ [BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE] = "refblock_alloc.write_table",
+ [BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE] = "refblock_alloc.switch_table",
+
+ [BLKDBG_CLUSTER_ALLOC] = "cluster_alloc",
+ [BLKDBG_CLUSTER_ALLOC_BYTES] = "cluster_alloc_bytes",
+ [BLKDBG_CLUSTER_FREE] = "cluster_free",
+};
+
+static int get_event_by_name(const char *name, BlkDebugEvent *event)
+{
+ int i;
+
+ for (i = 0; i < BLKDBG_EVENT_MAX; i++) {
+ if (!strcmp(event_names[i], name)) {
+ *event = i;
+ return 0;
+ }
+ }
+
+ return -1;
+}
+
+struct add_rule_data {
+ BDRVBlkdebugState *s;
+ int action;
+};
+
+static int add_rule(QemuOpts *opts, void *opaque)
+{
+ struct add_rule_data *d = opaque;
+ BDRVBlkdebugState *s = d->s;
+ const char* event_name;
+ BlkDebugEvent event;
+ struct BlkdebugRule *rule;
+
+ /* Find the right event for the rule */
+ event_name = qemu_opt_get(opts, "event");
+ if (!event_name || get_event_by_name(event_name, &event) < 0) {
+ return -1;
+ }
+
+ /* Set attributes common for all actions */
+ rule = g_malloc0(sizeof(*rule));
+ *rule = (struct BlkdebugRule) {
+ .event = event,
+ .action = d->action,
+ .state = qemu_opt_get_number(opts, "state", 0),
+ };
+
+ /* Parse action-specific options */
+ switch (d->action) {
+ case ACTION_INJECT_ERROR:
+ rule->options.inject.error = qemu_opt_get_number(opts, "errno", EIO);
+ rule->options.inject.once = qemu_opt_get_bool(opts, "once", 0);
+ rule->options.inject.immediately =
+ qemu_opt_get_bool(opts, "immediately", 0);
+ rule->options.inject.sector = qemu_opt_get_number(opts, "sector", -1);
+ break;
+
+ case ACTION_SET_STATE:
+ rule->options.set_state.new_state =
+ qemu_opt_get_number(opts, "new_state", 0);
+ break;
+ };
+
+ /* Add the rule */
+ QLIST_INSERT_HEAD(&s->rules[event], rule, next);
+
+ return 0;
+}
+
+static int read_config(BDRVBlkdebugState *s, const char *filename)
+{
+ FILE *f;
+ int ret;
+ struct add_rule_data d;
+
+ f = fopen(filename, "r");
+ if (f == NULL) {
+ return -errno;
+ }
+
+ ret = qemu_config_parse(f, config_groups, filename);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ d.s = s;
+ d.action = ACTION_INJECT_ERROR;
+ qemu_opts_foreach(&inject_error_opts, add_rule, &d, 0);
+
+ d.action = ACTION_SET_STATE;
+ qemu_opts_foreach(&set_state_opts, add_rule, &d, 0);
+
+ ret = 0;
+fail:
+ qemu_opts_reset(&inject_error_opts);
+ qemu_opts_reset(&set_state_opts);
+ fclose(f);
+ return ret;
+}
+
+/* Valid blkdebug filenames look like blkdebug:path/to/config:path/to/image */
+static int blkdebug_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ BDRVBlkdebugState *s = bs->opaque;
+ int ret;
+ char *config, *c;
+
+ /* Parse the blkdebug: prefix */
+ if (strncmp(filename, "blkdebug:", strlen("blkdebug:"))) {
+ return -EINVAL;
+ }
+ filename += strlen("blkdebug:");
+
+ /* Read rules from config file */
+ c = strchr(filename, ':');
+ if (c == NULL) {
+ return -EINVAL;
+ }
+
+ config = g_strdup(filename);
+ config[c - filename] = '\0';
+ ret = read_config(s, config);
+ g_free(config);
+ if (ret < 0) {
+ return ret;
+ }
+ filename = c + 1;
+
+ /* Set initial state */
+ s->state = 1;
+
+ /* Open the backing file */
+ ret = bdrv_file_open(&bs->file, filename, flags);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
+static void error_callback_bh(void *opaque)
+{
+ struct BlkdebugAIOCB *acb = opaque;
+ qemu_bh_delete(acb->bh);
+ acb->common.cb(acb->common.opaque, acb->ret);
+ qemu_aio_release(acb);
+}
+
+static void blkdebug_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+ BlkdebugAIOCB *acb = container_of(blockacb, BlkdebugAIOCB, common);
+ qemu_aio_release(acb);
+}
+
+static BlockDriverAIOCB *inject_error(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque, BlkdebugRule *rule)
+{
+ BDRVBlkdebugState *s = bs->opaque;
+ int error = rule->options.inject.error;
+ struct BlkdebugAIOCB *acb;
+ QEMUBH *bh;
+
+ if (rule->options.inject.once) {
+ QSIMPLEQ_INIT(&s->active_rules);
+ }
+
+ if (rule->options.inject.immediately) {
+ return NULL;
+ }
+
+ acb = qemu_aio_get(&blkdebug_aio_pool, bs, cb, opaque);
+ acb->ret = -error;
+
+ bh = qemu_bh_new(error_callback_bh, acb);
+ acb->bh = bh;
+ qemu_bh_schedule(bh);
+
+ return &acb->common;
+}
+
+static BlockDriverAIOCB *blkdebug_aio_readv(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BDRVBlkdebugState *s = bs->opaque;
+ BlkdebugRule *rule = NULL;
+
+ QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
+ if (rule->options.inject.sector == -1 ||
+ (rule->options.inject.sector >= sector_num &&
+ rule->options.inject.sector < sector_num + nb_sectors)) {
+ break;
+ }
+ }
+
+ if (rule && rule->options.inject.error) {
+ return inject_error(bs, cb, opaque, rule);
+ }
+
+ return bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
+}
+
+static BlockDriverAIOCB *blkdebug_aio_writev(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BDRVBlkdebugState *s = bs->opaque;
+ BlkdebugRule *rule = NULL;
+
+ QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
+ if (rule->options.inject.sector == -1 ||
+ (rule->options.inject.sector >= sector_num &&
+ rule->options.inject.sector < sector_num + nb_sectors)) {
+ break;
+ }
+ }
+
+ if (rule && rule->options.inject.error) {
+ return inject_error(bs, cb, opaque, rule);
+ }
+
+ return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
+}
+
+static void blkdebug_close(BlockDriverState *bs)
+{
+ BDRVBlkdebugState *s = bs->opaque;
+ BlkdebugRule *rule, *next;
+ int i;
+
+ for (i = 0; i < BLKDBG_EVENT_MAX; i++) {
+ QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) {
+ QLIST_REMOVE(rule, next);
+ g_free(rule);
+ }
+ }
+}
+
+static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
+ int old_state, bool injected)
+{
+ BDRVBlkdebugState *s = bs->opaque;
+
+ /* Only process rules for the current state */
+ if (rule->state && rule->state != old_state) {
+ return injected;
+ }
+
+ /* Take the action */
+ switch (rule->action) {
+ case ACTION_INJECT_ERROR:
+ if (!injected) {
+ QSIMPLEQ_INIT(&s->active_rules);
+ injected = true;
+ }
+ QSIMPLEQ_INSERT_HEAD(&s->active_rules, rule, active_next);
+ break;
+
+ case ACTION_SET_STATE:
+ s->state = rule->options.set_state.new_state;
+ break;
+ }
+ return injected;
+}
+
+static void blkdebug_debug_event(BlockDriverState *bs, BlkDebugEvent event)
+{
+ BDRVBlkdebugState *s = bs->opaque;
+ struct BlkdebugRule *rule;
+ int old_state = s->state;
+ bool injected;
+
+ assert((int)event >= 0 && event < BLKDBG_EVENT_MAX);
+
+ injected = false;
+ QLIST_FOREACH(rule, &s->rules[event], next) {
+ injected = process_rule(bs, rule, old_state, injected);
+ }
+}
+
+static int64_t blkdebug_getlength(BlockDriverState *bs)
+{
+ return bdrv_getlength(bs->file);
+}
+
+static BlockDriver bdrv_blkdebug = {
+ .format_name = "blkdebug",
+ .protocol_name = "blkdebug",
+
+ .instance_size = sizeof(BDRVBlkdebugState),
+
+ .bdrv_file_open = blkdebug_open,
+ .bdrv_close = blkdebug_close,
+ .bdrv_getlength = blkdebug_getlength,
+
+ .bdrv_aio_readv = blkdebug_aio_readv,
+ .bdrv_aio_writev = blkdebug_aio_writev,
+
+ .bdrv_debug_event = blkdebug_debug_event,
+};
+
+static void bdrv_blkdebug_init(void)
+{
+ bdrv_register(&bdrv_blkdebug);
+}
+
+block_init(bdrv_blkdebug_init);
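
As a usage sketch (paths are illustrative; the event name and option keys come from event_names[] and inject_error_opts above), a config file for read_config() and the matching -drive filename might look like:

    [inject-error]
    event = "read_aio"
    errno = "5"
    once = "on"

    qemu-system-x86_64 -drive file=blkdebug:/tmp/blkdebug.conf:/tmp/test.qcow2

Here errno 5 corresponds to EIO, and once = "on" makes inject_error() reset the active rule list after a single injection.
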
diff --git a/block/blkverify.c b/block/blkverify.c
new file mode 100644
index 000000000..9d5f1ec5b
--- /dev/null
+++ b/block/blkverify.c
@@ -0,0 +1,366 @@
+/*
+ * Block protocol for block driver correctness testing
+ *
+ * Copyright (C) 2010 IBM, Corp.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include <stdarg.h>
+#include "qemu_socket.h" /* for EINPROGRESS on Windows */
+#include "block_int.h"
+
+typedef struct {
+ BlockDriverState *test_file;
+} BDRVBlkverifyState;
+
+typedef struct BlkverifyAIOCB BlkverifyAIOCB;
+struct BlkverifyAIOCB {
+ BlockDriverAIOCB common;
+ QEMUBH *bh;
+
+ /* Request metadata */
+ bool is_write;
+ int64_t sector_num;
+ int nb_sectors;
+
+ int ret; /* first completed request's result */
+ unsigned int done; /* completion counter */
+ bool *finished; /* completion signal for cancel */
+
+ QEMUIOVector *qiov; /* user I/O vector */
+ QEMUIOVector raw_qiov; /* cloned I/O vector for raw file */
+ void *buf; /* buffer for raw file I/O */
+
+ void (*verify)(BlkverifyAIOCB *acb);
+};
+
+static void blkverify_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+ BlkverifyAIOCB *acb = (BlkverifyAIOCB *)blockacb;
+ bool finished = false;
+
+ /* Wait until request completes, invokes its callback, and frees itself */
+ acb->finished = &finished;
+ while (!finished) {
+ qemu_aio_wait();
+ }
+}
+
+static AIOPool blkverify_aio_pool = {
+ .aiocb_size = sizeof(BlkverifyAIOCB),
+ .cancel = blkverify_aio_cancel,
+};
+
+static void GCC_FMT_ATTR(2, 3) blkverify_err(BlkverifyAIOCB *acb,
+ const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ fprintf(stderr, "blkverify: %s sector_num=%" PRId64 " nb_sectors=%d ",
+ acb->is_write ? "write" : "read", acb->sector_num,
+ acb->nb_sectors);
+ vfprintf(stderr, fmt, ap);
+ fprintf(stderr, "\n");
+ va_end(ap);
+ exit(1);
+}
+
+/* Valid blkverify filenames look like blkverify:path/to/raw_image:path/to/image */
+static int blkverify_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ BDRVBlkverifyState *s = bs->opaque;
+ int ret;
+ char *raw, *c;
+
+ /* Parse the blkverify: prefix */
+ if (strncmp(filename, "blkverify:", strlen("blkverify:"))) {
+ return -EINVAL;
+ }
+ filename += strlen("blkverify:");
+
+ /* Parse the raw image filename */
+ c = strchr(filename, ':');
+ if (c == NULL) {
+ return -EINVAL;
+ }
+
+ raw = g_strdup(filename);
+ raw[c - filename] = '\0';
+ ret = bdrv_file_open(&bs->file, raw, flags);
+ g_free(raw);
+ if (ret < 0) {
+ return ret;
+ }
+ filename = c + 1;
+
+ /* Open the test file */
+ s->test_file = bdrv_new("");
+ ret = bdrv_open(s->test_file, filename, flags, NULL);
+ if (ret < 0) {
+ bdrv_delete(s->test_file);
+ s->test_file = NULL;
+ return ret;
+ }
+
+ return 0;
+}
+
+static void blkverify_close(BlockDriverState *bs)
+{
+ BDRVBlkverifyState *s = bs->opaque;
+
+ bdrv_delete(s->test_file);
+ s->test_file = NULL;
+}
+
+static int64_t blkverify_getlength(BlockDriverState *bs)
+{
+ BDRVBlkverifyState *s = bs->opaque;
+
+ return bdrv_getlength(s->test_file);
+}
+
+/**
+ * Check that I/O vector contents are identical
+ *
+ * @a: I/O vector
+ * @b: I/O vector
+ * @ret: Offset of the first mismatching byte, or -1 if the vectors match
+ */
+static ssize_t blkverify_iovec_compare(QEMUIOVector *a, QEMUIOVector *b)
+{
+ int i;
+ ssize_t offset = 0;
+
+ assert(a->niov == b->niov);
+ for (i = 0; i < a->niov; i++) {
+ size_t len = 0;
+ uint8_t *p = (uint8_t *)a->iov[i].iov_base;
+ uint8_t *q = (uint8_t *)b->iov[i].iov_base;
+
+ assert(a->iov[i].iov_len == b->iov[i].iov_len);
+ while (len < a->iov[i].iov_len && *p++ == *q++) {
+ len++;
+ }
+
+ offset += len;
+
+ if (len != a->iov[i].iov_len) {
+ return offset;
+ }
+ }
+ return -1;
+}
+
+typedef struct {
+ int src_index;
+ struct iovec *src_iov;
+ void *dest_base;
+} IOVectorSortElem;
+
+static int sortelem_cmp_src_base(const void *a, const void *b)
+{
+ const IOVectorSortElem *elem_a = a;
+ const IOVectorSortElem *elem_b = b;
+
+ /* Don't overflow */
+ if (elem_a->src_iov->iov_base < elem_b->src_iov->iov_base) {
+ return -1;
+ } else if (elem_a->src_iov->iov_base > elem_b->src_iov->iov_base) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+static int sortelem_cmp_src_index(const void *a, const void *b)
+{
+ const IOVectorSortElem *elem_a = a;
+ const IOVectorSortElem *elem_b = b;
+
+ return elem_a->src_index - elem_b->src_index;
+}
+
+/**
+ * Copy contents of I/O vector
+ *
+ * The relative relationships of overlapping iovecs are preserved. This is
+ * necessary to ensure identical semantics in the cloned I/O vector.
+ */
+static void blkverify_iovec_clone(QEMUIOVector *dest, const QEMUIOVector *src,
+ void *buf)
+{
+ IOVectorSortElem sortelems[src->niov];
+ void *last_end;
+ int i;
+
+ /* Sort the source iovecs by base address */
+ for (i = 0; i < src->niov; i++) {
+ sortelems[i].src_index = i;
+ sortelems[i].src_iov = &src->iov[i];
+ }
+ qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_base);
+
+ /* Allocate buffer space taking into account overlapping iovecs */
+ last_end = NULL;
+ for (i = 0; i < src->niov; i++) {
+ struct iovec *cur = sortelems[i].src_iov;
+ ptrdiff_t rewind = 0;
+
+ /* Detect overlap */
+ if (last_end && last_end > cur->iov_base) {
+ rewind = last_end - cur->iov_base;
+ }
+
+ sortelems[i].dest_base = buf - rewind;
+ buf += cur->iov_len - MIN(rewind, cur->iov_len);
+ last_end = MAX(cur->iov_base + cur->iov_len, last_end);
+ }
+
+ /* Sort by source iovec index and build destination iovec */
+ qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_index);
+ for (i = 0; i < src->niov; i++) {
+ qemu_iovec_add(dest, sortelems[i].dest_base, src->iov[i].iov_len);
+ }
+}
+
+static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write,
+ int64_t sector_num, QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aio_pool, bs, cb, opaque);
+
+ acb->bh = NULL;
+ acb->is_write = is_write;
+ acb->sector_num = sector_num;
+ acb->nb_sectors = nb_sectors;
+ acb->ret = -EINPROGRESS;
+ acb->done = 0;
+ acb->qiov = qiov;
+ acb->buf = NULL;
+ acb->verify = NULL;
+ acb->finished = NULL;
+ return acb;
+}
+
+static void blkverify_aio_bh(void *opaque)
+{
+ BlkverifyAIOCB *acb = opaque;
+
+ qemu_bh_delete(acb->bh);
+ if (acb->buf) {
+ qemu_iovec_destroy(&acb->raw_qiov);
+ qemu_vfree(acb->buf);
+ }
+ acb->common.cb(acb->common.opaque, acb->ret);
+ if (acb->finished) {
+ *acb->finished = true;
+ }
+ qemu_aio_release(acb);
+}
+
+static void blkverify_aio_cb(void *opaque, int ret)
+{
+ BlkverifyAIOCB *acb = opaque;
+
+ switch (++acb->done) {
+ case 1:
+ acb->ret = ret;
+ break;
+
+ case 2:
+ if (acb->ret != ret) {
+ blkverify_err(acb, "return value mismatch %d != %d", acb->ret, ret);
+ }
+
+ if (acb->verify) {
+ acb->verify(acb);
+ }
+
+ acb->bh = qemu_bh_new(blkverify_aio_bh, acb);
+ qemu_bh_schedule(acb->bh);
+ break;
+ }
+}
+
+static void blkverify_verify_readv(BlkverifyAIOCB *acb)
+{
+ ssize_t offset = blkverify_iovec_compare(acb->qiov, &acb->raw_qiov);
+ if (offset != -1) {
+ blkverify_err(acb, "contents mismatch in sector %" PRId64,
+ acb->sector_num + (int64_t)(offset / BDRV_SECTOR_SIZE));
+ }
+}
+
+static BlockDriverAIOCB *blkverify_aio_readv(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BDRVBlkverifyState *s = bs->opaque;
+ BlkverifyAIOCB *acb = blkverify_aio_get(bs, false, sector_num, qiov,
+ nb_sectors, cb, opaque);
+
+ acb->verify = blkverify_verify_readv;
+ acb->buf = qemu_blockalign(bs->file, qiov->size);
+ qemu_iovec_init(&acb->raw_qiov, acb->qiov->niov);
+ blkverify_iovec_clone(&acb->raw_qiov, qiov, acb->buf);
+
+ bdrv_aio_readv(s->test_file, sector_num, qiov, nb_sectors,
+ blkverify_aio_cb, acb);
+ bdrv_aio_readv(bs->file, sector_num, &acb->raw_qiov, nb_sectors,
+ blkverify_aio_cb, acb);
+ return &acb->common;
+}
+
+static BlockDriverAIOCB *blkverify_aio_writev(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BDRVBlkverifyState *s = bs->opaque;
+ BlkverifyAIOCB *acb = blkverify_aio_get(bs, true, sector_num, qiov,
+ nb_sectors, cb, opaque);
+
+ bdrv_aio_writev(s->test_file, sector_num, qiov, nb_sectors,
+ blkverify_aio_cb, acb);
+ bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors,
+ blkverify_aio_cb, acb);
+ return &acb->common;
+}
+
+static BlockDriverAIOCB *blkverify_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ BDRVBlkverifyState *s = bs->opaque;
+
+ /* Only flush test file, the raw file is not important */
+ return bdrv_aio_flush(s->test_file, cb, opaque);
+}
+
+static BlockDriver bdrv_blkverify = {
+ .format_name = "blkverify",
+ .protocol_name = "blkverify",
+
+ .instance_size = sizeof(BDRVBlkverifyState),
+
+ .bdrv_getlength = blkverify_getlength,
+
+ .bdrv_file_open = blkverify_open,
+ .bdrv_close = blkverify_close,
+
+ .bdrv_aio_readv = blkverify_aio_readv,
+ .bdrv_aio_writev = blkverify_aio_writev,
+ .bdrv_aio_flush = blkverify_aio_flush,
+};
+
+static void bdrv_blkverify_init(void)
+{
+ bdrv_register(&bdrv_blkverify);
+}
+
+block_init(bdrv_blkverify_init);
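
As a usage sketch (paths are illustrative), blkverify_open() above takes the raw reference image first and the image under test second:

    qemu-system-x86_64 -drive file=blkverify:/tmp/reference.raw:/tmp/test.qcow2

Every read is issued to both files and compared with blkverify_iovec_compare(); any mismatch terminates the process through blkverify_err().
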
diff --git a/block/bochs.c b/block/bochs.c
new file mode 100644
index 000000000..ab7944dc4
--- /dev/null
+++ b/block/bochs.c
@@ -0,0 +1,243 @@
+/*
+ * Block driver for the various disk image formats used by Bochs
+ * Currently only for "growing" type in read-only mode
+ *
+ * Copyright (c) 2005 Alex Beregszaszi
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+/**************************************************************/
+
+#define HEADER_MAGIC "Bochs Virtual HD Image"
+#define HEADER_VERSION 0x00020000
+#define HEADER_V1 0x00010000
+#define HEADER_SIZE 512
+
+#define REDOLOG_TYPE "Redolog"
+#define GROWING_TYPE "Growing"
+
+// not allocated: 0xffffffff
+
+// always little-endian
+struct bochs_header_v1 {
+ char magic[32]; // "Bochs Virtual HD Image"
+ char type[16]; // "Redolog"
+ char subtype[16]; // "Undoable" / "Volatile" / "Growing"
+ uint32_t version;
+ uint32_t header; // size of header
+
+ union {
+ struct {
+ uint32_t catalog; // num of entries
+ uint32_t bitmap; // bitmap size
+ uint32_t extent; // extent size
+ uint64_t disk; // disk size
+ char padding[HEADER_SIZE - 64 - 8 - 20];
+ } redolog;
+ char padding[HEADER_SIZE - 64 - 8];
+ } extra;
+};
+
+// always little-endian
+struct bochs_header {
+ char magic[32]; // "Bochs Virtual HD Image"
+ char type[16]; // "Redolog"
+ char subtype[16]; // "Undoable" / "Volatile" / "Growing"
+ uint32_t version;
+ uint32_t header; // size of header
+
+ union {
+ struct {
+ uint32_t catalog; // num of entries
+ uint32_t bitmap; // bitmap size
+ uint32_t extent; // extent size
+ uint32_t reserved; // for ???
+ uint64_t disk; // disk size
+ char padding[HEADER_SIZE - 64 - 8 - 24];
+ } redolog;
+ char padding[HEADER_SIZE - 64 - 8];
+ } extra;
+};
+
+typedef struct BDRVBochsState {
+ CoMutex lock;
+ uint32_t *catalog_bitmap;
+ int catalog_size;
+
+ int data_offset;
+
+ int bitmap_blocks;
+ int extent_blocks;
+ int extent_size;
+} BDRVBochsState;
+
+static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+ const struct bochs_header *bochs = (const void *)buf;
+
+ if (buf_size < HEADER_SIZE)
+ return 0;
+
+ if (!strcmp(bochs->magic, HEADER_MAGIC) &&
+ !strcmp(bochs->type, REDOLOG_TYPE) &&
+ !strcmp(bochs->subtype, GROWING_TYPE) &&
+ ((le32_to_cpu(bochs->version) == HEADER_VERSION) ||
+ (le32_to_cpu(bochs->version) == HEADER_V1)))
+ return 100;
+
+ return 0;
+}
+
+static int bochs_open(BlockDriverState *bs, int flags)
+{
+ BDRVBochsState *s = bs->opaque;
+ int i;
+ struct bochs_header bochs;
+ struct bochs_header_v1 header_v1;
+
+ bs->read_only = 1; // no write support yet
+
+ if (bdrv_pread(bs->file, 0, &bochs, sizeof(bochs)) != sizeof(bochs)) {
+ goto fail;
+ }
+
+ if (strcmp(bochs.magic, HEADER_MAGIC) ||
+ strcmp(bochs.type, REDOLOG_TYPE) ||
+ strcmp(bochs.subtype, GROWING_TYPE) ||
+ ((le32_to_cpu(bochs.version) != HEADER_VERSION) &&
+ (le32_to_cpu(bochs.version) != HEADER_V1))) {
+ goto fail;
+ }
+
+ if (le32_to_cpu(bochs.version) == HEADER_V1) {
+ memcpy(&header_v1, &bochs, sizeof(bochs));
+ bs->total_sectors = le64_to_cpu(header_v1.extra.redolog.disk) / 512;
+ } else {
+ bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512;
+ }
+
+ s->catalog_size = le32_to_cpu(bochs.extra.redolog.catalog);
+ s->catalog_bitmap = g_malloc(s->catalog_size * 4);
+ if (bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap,
+ s->catalog_size * 4) != s->catalog_size * 4)
+ goto fail;
+ for (i = 0; i < s->catalog_size; i++)
+ le32_to_cpus(&s->catalog_bitmap[i]);
+
+ s->data_offset = le32_to_cpu(bochs.header) + (s->catalog_size * 4);
+
+ s->bitmap_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.bitmap) - 1) / 512;
+ s->extent_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.extent) - 1) / 512;
+
+ s->extent_size = le32_to_cpu(bochs.extra.redolog.extent);
+
+ qemu_co_mutex_init(&s->lock);
+ return 0;
+ fail:
+ return -1;
+}
+
+static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
+{
+ BDRVBochsState *s = bs->opaque;
+ int64_t offset = sector_num * 512;
+ int64_t extent_index, extent_offset, bitmap_offset;
+ char bitmap_entry;
+
+ // seek to sector
+ extent_index = offset / s->extent_size;
+ extent_offset = (offset % s->extent_size) / 512;
+
+ if (s->catalog_bitmap[extent_index] == 0xffffffff) {
+ return -1; /* not allocated */
+ }
+
+ bitmap_offset = s->data_offset + (512 * s->catalog_bitmap[extent_index] *
+ (s->extent_blocks + s->bitmap_blocks));
+
+ /* read in bitmap for current extent */
+ if (bdrv_pread(bs->file, bitmap_offset + (extent_offset / 8),
+ &bitmap_entry, 1) != 1) {
+ return -1;
+ }
+
+ if (!((bitmap_entry >> (extent_offset % 8)) & 1)) {
+ return -1; /* not allocated */
+ }
+
+ return bitmap_offset + (512 * (s->bitmap_blocks + extent_offset));
+}
+
+static int bochs_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ int ret;
+
+ while (nb_sectors > 0) {
+ int64_t block_offset = seek_to_sector(bs, sector_num);
+ if (block_offset >= 0) {
+ ret = bdrv_pread(bs->file, block_offset, buf, 512);
+ if (ret != 512) {
+ return -1;
+ }
+ } else
+ memset(buf, 0, 512);
+ nb_sectors--;
+ sector_num++;
+ buf += 512;
+ }
+ return 0;
+}
+
+static coroutine_fn int bochs_co_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ int ret;
+ BDRVBochsState *s = bs->opaque;
+ qemu_co_mutex_lock(&s->lock);
+ ret = bochs_read(bs, sector_num, buf, nb_sectors);
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+}
+
+static void bochs_close(BlockDriverState *bs)
+{
+ BDRVBochsState *s = bs->opaque;
+ g_free(s->catalog_bitmap);
+}
+
+static BlockDriver bdrv_bochs = {
+ .format_name = "bochs",
+ .instance_size = sizeof(BDRVBochsState),
+ .bdrv_probe = bochs_probe,
+ .bdrv_open = bochs_open,
+ .bdrv_read = bochs_co_read,
+ .bdrv_close = bochs_close,
+};
+
+static void bdrv_bochs_init(void)
+{
+ bdrv_register(&bdrv_bochs);
+}
+
+block_init(bdrv_bochs_init);
diff --git a/block/cloop.c b/block/cloop.c
new file mode 100644
index 000000000..7570eb8e7
--- /dev/null
+++ b/block/cloop.c
@@ -0,0 +1,195 @@
+/*
+ * QEMU Block driver for CLOOP images
+ *
+ * Copyright (c) 2004 Johannes E. Schindelin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include <zlib.h>
+
+typedef struct BDRVCloopState {
+ CoMutex lock;
+ uint32_t block_size;
+ uint32_t n_blocks;
+ uint64_t *offsets;
+ uint32_t sectors_per_block;
+ uint32_t current_block;
+ uint8_t *compressed_block;
+ uint8_t *uncompressed_block;
+ z_stream zstream;
+} BDRVCloopState;
+
+static int cloop_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+ const char *magic_version_2_0 = "#!/bin/sh\n"
+ "#V2.0 Format\n"
+ "modprobe cloop file=$0 && mount -r -t iso9660 /dev/cloop $1\n";
+ int length = strlen(magic_version_2_0);
+ if (length > buf_size) {
+ length = buf_size;
+ }
+ if (!memcmp(magic_version_2_0, buf, length)) {
+ return 2;
+ }
+ return 0;
+}
+
+static int cloop_open(BlockDriverState *bs, int flags)
+{
+ BDRVCloopState *s = bs->opaque;
+ uint32_t offsets_size, max_compressed_block_size = 1, i;
+
+ bs->read_only = 1;
+
+ /* read header */
+ if (bdrv_pread(bs->file, 128, &s->block_size, 4) < 4) {
+ goto cloop_close;
+ }
+ s->block_size = be32_to_cpu(s->block_size);
+
+ if (bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4) < 4) {
+ goto cloop_close;
+ }
+ s->n_blocks = be32_to_cpu(s->n_blocks);
+
+ /* read offsets */
+ offsets_size = s->n_blocks * sizeof(uint64_t);
+ s->offsets = g_malloc(offsets_size);
+ if (bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size) <
+ offsets_size) {
+ goto cloop_close;
+ }
+ for(i=0;i<s->n_blocks;i++) {
+ s->offsets[i] = be64_to_cpu(s->offsets[i]);
+ if (i > 0) {
+ uint32_t size = s->offsets[i] - s->offsets[i - 1];
+ if (size > max_compressed_block_size) {
+ max_compressed_block_size = size;
+ }
+ }
+ }
+
+ /* initialize zlib engine */
+ s->compressed_block = g_malloc(max_compressed_block_size + 1);
+ s->uncompressed_block = g_malloc(s->block_size);
+ if (inflateInit(&s->zstream) != Z_OK) {
+ goto cloop_close;
+ }
+ s->current_block = s->n_blocks;
+
+ s->sectors_per_block = s->block_size/512;
+ bs->total_sectors = s->n_blocks * s->sectors_per_block;
+ qemu_co_mutex_init(&s->lock);
+ return 0;
+
+cloop_close:
+ return -1;
+}
+
+static inline int cloop_read_block(BlockDriverState *bs, int block_num)
+{
+ BDRVCloopState *s = bs->opaque;
+
+ if (s->current_block != block_num) {
+ int ret;
+ uint32_t bytes = s->offsets[block_num + 1] - s->offsets[block_num];
+
+ ret = bdrv_pread(bs->file, s->offsets[block_num], s->compressed_block,
+ bytes);
+ if (ret != bytes) {
+ return -1;
+ }
+
+ s->zstream.next_in = s->compressed_block;
+ s->zstream.avail_in = bytes;
+ s->zstream.next_out = s->uncompressed_block;
+ s->zstream.avail_out = s->block_size;
+ ret = inflateReset(&s->zstream);
+ if (ret != Z_OK) {
+ return -1;
+ }
+ ret = inflate(&s->zstream, Z_FINISH);
+ if (ret != Z_STREAM_END || s->zstream.total_out != s->block_size) {
+ return -1;
+ }
+
+ s->current_block = block_num;
+ }
+ return 0;
+}
+
+static int cloop_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ BDRVCloopState *s = bs->opaque;
+ int i;
+
+ for (i = 0; i < nb_sectors; i++) {
+ uint32_t sector_offset_in_block =
+ ((sector_num + i) % s->sectors_per_block),
+ block_num = (sector_num + i) / s->sectors_per_block;
+ if (cloop_read_block(bs, block_num) != 0) {
+ return -1;
+ }
+ memcpy(buf + i * 512,
+ s->uncompressed_block + sector_offset_in_block * 512, 512);
+ }
+ return 0;
+}
+
+static coroutine_fn int cloop_co_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ int ret;
+ BDRVCloopState *s = bs->opaque;
+ qemu_co_mutex_lock(&s->lock);
+ ret = cloop_read(bs, sector_num, buf, nb_sectors);
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+}
+
+static void cloop_close(BlockDriverState *bs)
+{
+ BDRVCloopState *s = bs->opaque;
+ if (s->n_blocks > 0) {
+ g_free(s->offsets);
+ }
+ g_free(s->compressed_block);
+ g_free(s->uncompressed_block);
+ inflateEnd(&s->zstream);
+}
+
+static BlockDriver bdrv_cloop = {
+ .format_name = "cloop",
+ .instance_size = sizeof(BDRVCloopState),
+ .bdrv_probe = cloop_probe,
+ .bdrv_open = cloop_open,
+ .bdrv_read = cloop_co_read,
+ .bdrv_close = cloop_close,
+};
+
+static void bdrv_cloop_init(void)
+{
+ bdrv_register(&bdrv_cloop);
+}
+
+block_init(bdrv_cloop_init);
diff --git a/block/cow.c b/block/cow.c
new file mode 100644
index 000000000..a5a00eb9c
--- /dev/null
+++ b/block/cow.c
@@ -0,0 +1,356 @@
+/*
+ * Block driver for the COW format
+ *
+ * Copyright (c) 2004 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+/**************************************************************/
+/* COW block driver using file system holes */
+
+/* user mode linux compatible COW file */
+#define COW_MAGIC 0x4f4f4f4d /* MOOO */
+#define COW_VERSION 2
+
+struct cow_header_v2 {
+ uint32_t magic;
+ uint32_t version;
+ char backing_file[1024];
+ int32_t mtime;
+ uint64_t size;
+ uint32_t sectorsize;
+};
+
+typedef struct BDRVCowState {
+ CoMutex lock;
+ int64_t cow_sectors_offset;
+} BDRVCowState;
+
+static int cow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+ const struct cow_header_v2 *cow_header = (const void *)buf;
+
+ if (buf_size >= sizeof(struct cow_header_v2) &&
+ be32_to_cpu(cow_header->magic) == COW_MAGIC &&
+ be32_to_cpu(cow_header->version) == COW_VERSION)
+ return 100;
+ else
+ return 0;
+}
+
+static int cow_open(BlockDriverState *bs, int flags)
+{
+ BDRVCowState *s = bs->opaque;
+ struct cow_header_v2 cow_header;
+ int bitmap_size;
+ int64_t size;
+ int ret;
+
+ /* see if it is a cow image */
+ ret = bdrv_pread(bs->file, 0, &cow_header, sizeof(cow_header));
+ if (ret < 0) {
+ goto fail;
+ }
+
+ if (be32_to_cpu(cow_header.magic) != COW_MAGIC) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ if (be32_to_cpu(cow_header.version) != COW_VERSION) {
+ char version[64];
+ snprintf(version, sizeof(version),
+ "COW version %d", cow_header.version);
+ qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+ bs->device_name, "cow", version);
+ ret = -ENOTSUP;
+ goto fail;
+ }
+
+ /* cow image found */
+ size = be64_to_cpu(cow_header.size);
+ bs->total_sectors = size / 512;
+
+ pstrcpy(bs->backing_file, sizeof(bs->backing_file),
+ cow_header.backing_file);
+
+ bitmap_size = ((bs->total_sectors + 7) >> 3) + sizeof(cow_header);
+ s->cow_sectors_offset = (bitmap_size + 511) & ~511;
+ qemu_co_mutex_init(&s->lock);
+ return 0;
+ fail:
+ return ret;
+}
+
+/*
+ * XXX(hch): right now these functions are extremely inefficient.
+ * We should just read the whole bitmap we'll need in one go instead.
+ */
+static inline int cow_set_bit(BlockDriverState *bs, int64_t bitnum)
+{
+ uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8;
+ uint8_t bitmap;
+ int ret;
+
+ ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap));
+ if (ret < 0) {
+ return ret;
+ }
+
+ bitmap |= (1 << (bitnum % 8));
+
+ ret = bdrv_pwrite_sync(bs->file, offset, &bitmap, sizeof(bitmap));
+ if (ret < 0) {
+ return ret;
+ }
+ return 0;
+}
+
+static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
+{
+ uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8;
+ uint8_t bitmap;
+ int ret;
+
+ ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap));
+ if (ret < 0) {
+ return ret;
+ }
+
+ return !!(bitmap & (1 << (bitnum % 8)));
+}
+
+/* Return true if first block has been changed (ie. current version is
+ * in COW file). Set the number of continuous blocks for which that
+ * is true. */
+static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, int *num_same)
+{
+ int changed;
+
+ if (nb_sectors == 0) {
+ *num_same = nb_sectors;
+ return 0;
+ }
+
+ changed = is_bit_set(bs, sector_num);
+ if (changed < 0) {
+ return 0; /* XXX: how to return I/O errors? */
+ }
+
+ for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
+ if (is_bit_set(bs, sector_num + *num_same) != changed)
+ break;
+ }
+
+ return changed;
+}
+
+static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors)
+{
+ int error = 0;
+ int i;
+
+ for (i = 0; i < nb_sectors; i++) {
+ error = cow_set_bit(bs, sector_num + i);
+ if (error) {
+ break;
+ }
+ }
+
+ return error;
+}
+
+static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ BDRVCowState *s = bs->opaque;
+ int ret, n;
+
+ while (nb_sectors > 0) {
+ if (bdrv_co_is_allocated(bs, sector_num, nb_sectors, &n)) {
+ ret = bdrv_pread(bs->file,
+ s->cow_sectors_offset + sector_num * 512,
+ buf, n * 512);
+ if (ret < 0) {
+ return ret;
+ }
+ } else {
+ if (bs->backing_hd) {
+ /* read from the base image */
+ ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
+ if (ret < 0) {
+ return ret;
+ }
+ } else {
+ memset(buf, 0, n * 512);
+ }
+ }
+ nb_sectors -= n;
+ sector_num += n;
+ buf += n * 512;
+ }
+ return 0;
+}
+
+static coroutine_fn int cow_co_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ int ret;
+ BDRVCowState *s = bs->opaque;
+ qemu_co_mutex_lock(&s->lock);
+ ret = cow_read(bs, sector_num, buf, nb_sectors);
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+}
+
+static int cow_write(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ BDRVCowState *s = bs->opaque;
+ int ret;
+
+ ret = bdrv_pwrite(bs->file, s->cow_sectors_offset + sector_num * 512,
+ buf, nb_sectors * 512);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return cow_update_bitmap(bs, sector_num, nb_sectors);
+}
+
+static coroutine_fn int cow_co_write(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ int ret;
+ BDRVCowState *s = bs->opaque;
+ qemu_co_mutex_lock(&s->lock);
+ ret = cow_write(bs, sector_num, buf, nb_sectors);
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+}
+
+static void cow_close(BlockDriverState *bs)
+{
+}
+
+static int cow_create(const char *filename, QEMUOptionParameter *options)
+{
+ struct cow_header_v2 cow_header;
+ struct stat st;
+ int64_t image_sectors = 0;
+ const char *image_filename = NULL;
+ int ret;
+ BlockDriverState *cow_bs;
+
+ /* Read out options */
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ image_sectors = options->value.n / 512;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+ image_filename = options->value.s;
+ }
+ options++;
+ }
+
+ ret = bdrv_create_file(filename, options);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = bdrv_file_open(&cow_bs, filename, BDRV_O_RDWR);
+ if (ret < 0) {
+ return ret;
+ }
+
+ memset(&cow_header, 0, sizeof(cow_header));
+ cow_header.magic = cpu_to_be32(COW_MAGIC);
+ cow_header.version = cpu_to_be32(COW_VERSION);
+ if (image_filename) {
+ /* Note: if no file, we put a dummy mtime */
+ cow_header.mtime = cpu_to_be32(0);
+
+ if (stat(image_filename, &st) != 0) {
+ goto mtime_fail;
+ }
+ cow_header.mtime = cpu_to_be32(st.st_mtime);
+ mtime_fail:
+ pstrcpy(cow_header.backing_file, sizeof(cow_header.backing_file),
+ image_filename);
+ }
+ cow_header.sectorsize = cpu_to_be32(512);
+ cow_header.size = cpu_to_be64(image_sectors * 512);
+ ret = bdrv_pwrite(cow_bs, 0, &cow_header, sizeof(cow_header));
+ if (ret < 0) {
+ goto exit;
+ }
+
+ /* resize to include at least all the bitmap */
+ ret = bdrv_truncate(cow_bs,
+ sizeof(cow_header) + ((image_sectors + 7) >> 3));
+ if (ret < 0) {
+ goto exit;
+ }
+
+exit:
+ bdrv_delete(cow_bs);
+ return ret;
+}
+
+static QEMUOptionParameter cow_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ {
+ .name = BLOCK_OPT_BACKING_FILE,
+ .type = OPT_STRING,
+ .help = "File name of a base image"
+ },
+ { NULL }
+};
+
+static BlockDriver bdrv_cow = {
+ .format_name = "cow",
+ .instance_size = sizeof(BDRVCowState),
+
+ .bdrv_probe = cow_probe,
+ .bdrv_open = cow_open,
+ .bdrv_close = cow_close,
+ .bdrv_create = cow_create,
+
+ .bdrv_read = cow_co_read,
+ .bdrv_write = cow_co_write,
+ .bdrv_co_is_allocated = cow_co_is_allocated,
+
+ .create_options = cow_create_options,
+};
+
+static void bdrv_cow_init(void)
+{
+ bdrv_register(&bdrv_cow);
+}
+
+block_init(bdrv_cow_init);
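
As a usage sketch (file names are illustrative), both create_options of cow_create() above can be set through qemu-img:

    qemu-img create -f cow -o backing_file=base.raw overlay.cow 1G

The requested size is stored rounded down to whole 512-byte sectors (image_sectors = options->value.n / 512), and the file is then truncated to cover the header plus the allocation bitmap.
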
diff --git a/block/curl.c b/block/curl.c
new file mode 100644
index 000000000..e7c3634d3
--- /dev/null
+++ b/block/curl.c
@@ -0,0 +1,624 @@
+/*
+ * QEMU Block driver for CURL images
+ *
+ * Copyright (c) 2009 Alexander Graf <agraf@suse.de>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include <curl/curl.h>
+
+// #define DEBUG_CURL
+// #define DEBUG_VERBOSE
+
+#ifdef DEBUG_CURL
+#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) do { } while (0)
+#endif
+
+#define CURL_NUM_STATES 8
+#define CURL_NUM_ACB 8
+#define SECTOR_SIZE 512
+#define READ_AHEAD_SIZE (256 * 1024)
+
+#define FIND_RET_NONE 0
+#define FIND_RET_OK 1
+#define FIND_RET_WAIT 2
+
+struct BDRVCURLState;
+
+typedef struct CURLAIOCB {
+ BlockDriverAIOCB common;
+ QEMUBH *bh;
+ QEMUIOVector *qiov;
+
+ int64_t sector_num;
+ int nb_sectors;
+
+ size_t start;
+ size_t end;
+} CURLAIOCB;
+
+typedef struct CURLState
+{
+ struct BDRVCURLState *s;
+ CURLAIOCB *acb[CURL_NUM_ACB];
+ CURL *curl;
+ char *orig_buf;
+ size_t buf_start;
+ size_t buf_off;
+ size_t buf_len;
+ char range[128];
+ char errmsg[CURL_ERROR_SIZE];
+ char in_use;
+} CURLState;
+
+typedef struct BDRVCURLState {
+ CURLM *multi;
+ size_t len;
+ CURLState states[CURL_NUM_STATES];
+ char *url;
+ size_t readahead_size;
+} BDRVCURLState;
+
+static void curl_clean_state(CURLState *s);
+static void curl_multi_do(void *arg);
+static int curl_aio_flush(void *opaque);
+
+static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
+ void *s, void *sp)
+{
+ DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, fd);
+ switch (action) {
+ case CURL_POLL_IN:
+ qemu_aio_set_fd_handler(fd, curl_multi_do, NULL, curl_aio_flush, s);
+ break;
+ case CURL_POLL_OUT:
+ qemu_aio_set_fd_handler(fd, NULL, curl_multi_do, curl_aio_flush, s);
+ break;
+ case CURL_POLL_INOUT:
+ qemu_aio_set_fd_handler(fd, curl_multi_do, curl_multi_do,
+ curl_aio_flush, s);
+ break;
+ case CURL_POLL_REMOVE:
+ qemu_aio_set_fd_handler(fd, NULL, NULL, NULL, NULL);
+ break;
+ }
+
+ return 0;
+}
+
+static size_t curl_size_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
+{
+ CURLState *s = ((CURLState*)opaque);
+ size_t realsize = size * nmemb;
+ size_t fsize;
+
+ if(sscanf(ptr, "Content-Length: %zd", &fsize) == 1) {
+ s->s->len = fsize;
+ }
+
+ return realsize;
+}
+
+static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
+{
+ CURLState *s = ((CURLState*)opaque);
+ size_t realsize = size * nmemb;
+ int i;
+
+ DPRINTF("CURL: Just reading %zd bytes\n", realsize);
+
+ if (!s || !s->orig_buf)
+ goto read_end;
+
+ memcpy(s->orig_buf + s->buf_off, ptr, realsize);
+ s->buf_off += realsize;
+
+ for(i=0; i<CURL_NUM_ACB; i++) {
+ CURLAIOCB *acb = s->acb[i];
+
+ if (!acb)
+ continue;
+
+ if ((s->buf_off >= acb->end)) {
+ qemu_iovec_from_buf(acb->qiov, 0, s->orig_buf + acb->start,
+ acb->end - acb->start);
+ acb->common.cb(acb->common.opaque, 0);
+ qemu_aio_release(acb);
+ s->acb[i] = NULL;
+ }
+ }
+
+read_end:
+ return realsize;
+}
+
+static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len,
+ CURLAIOCB *acb)
+{
+ int i;
+ size_t end = start + len;
+
+ for (i=0; i<CURL_NUM_STATES; i++) {
+ CURLState *state = &s->states[i];
+ size_t buf_end = (state->buf_start + state->buf_off);
+ size_t buf_fend = (state->buf_start + state->buf_len);
+
+ if (!state->orig_buf)
+ continue;
+ if (!state->buf_off)
+ continue;
+
+ // Does the existing buffer cover our section?
+ if ((start >= state->buf_start) &&
+ (start <= buf_end) &&
+ (end >= state->buf_start) &&
+ (end <= buf_end))
+ {
+ char *buf = state->orig_buf + (start - state->buf_start);
+
+ qemu_iovec_from_buf(acb->qiov, 0, buf, len);
+ acb->common.cb(acb->common.opaque, 0);
+
+ return FIND_RET_OK;
+ }
+
+ // Wait for unfinished chunks
+ if ((start >= state->buf_start) &&
+ (start <= buf_fend) &&
+ (end >= state->buf_start) &&
+ (end <= buf_fend))
+ {
+ int j;
+
+ acb->start = start - state->buf_start;
+ acb->end = acb->start + len;
+
+ for (j=0; j<CURL_NUM_ACB; j++) {
+ if (!state->acb[j]) {
+ state->acb[j] = acb;
+ return FIND_RET_WAIT;
+ }
+ }
+ }
+ }
+
+ return FIND_RET_NONE;
+}
+
+static void curl_multi_do(void *arg)
+{
+ BDRVCURLState *s = (BDRVCURLState *)arg;
+ int running;
+ int r;
+ int msgs_in_queue;
+
+ if (!s->multi)
+ return;
+
+ do {
+ r = curl_multi_socket_all(s->multi, &running);
+ } while(r == CURLM_CALL_MULTI_PERFORM);
+
+ /* Try to find done transfers, so we can free the easy
+ * handle again. */
+ do {
+ CURLMsg *msg;
+ msg = curl_multi_info_read(s->multi, &msgs_in_queue);
+
+ if (!msg)
+ break;
+ if (msg->msg == CURLMSG_NONE)
+ break;
+
+ switch (msg->msg) {
+ case CURLMSG_DONE:
+ {
+ CURLState *state = NULL;
+ curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, (char**)&state);
+
+ /* ACBs for successful messages get completed in curl_read_cb */
+ if (msg->data.result != CURLE_OK) {
+ int i;
+ for (i = 0; i < CURL_NUM_ACB; i++) {
+ CURLAIOCB *acb = state->acb[i];
+
+ if (acb == NULL) {
+ continue;
+ }
+
+ acb->common.cb(acb->common.opaque, -EIO);
+ qemu_aio_release(acb);
+ state->acb[i] = NULL;
+ }
+ }
+
+ curl_clean_state(state);
+ break;
+ }
+ default:
+ msgs_in_queue = 0;
+ break;
+ }
+ } while(msgs_in_queue);
+}
+
+static CURLState *curl_init_state(BDRVCURLState *s)
+{
+ CURLState *state = NULL;
+ int i, j;
+
+ do {
+ for (i=0; i<CURL_NUM_STATES; i++) {
+ for (j=0; j<CURL_NUM_ACB; j++)
+ if (s->states[i].acb[j])
+ continue;
+ if (s->states[i].in_use)
+ continue;
+
+ state = &s->states[i];
+ state->in_use = 1;
+ break;
+ }
+ if (!state) {
+ g_usleep(100);
+ curl_multi_do(s);
+ }
+ } while(!state);
+
+ if (state->curl)
+ goto has_curl;
+
+ state->curl = curl_easy_init();
+ if (!state->curl)
+ return NULL;
+ curl_easy_setopt(state->curl, CURLOPT_URL, s->url);
+ curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, 5);
+ curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, (void *)curl_read_cb);
+ curl_easy_setopt(state->curl, CURLOPT_WRITEDATA, (void *)state);
+ curl_easy_setopt(state->curl, CURLOPT_PRIVATE, (void *)state);
+ curl_easy_setopt(state->curl, CURLOPT_AUTOREFERER, 1);
+ curl_easy_setopt(state->curl, CURLOPT_FOLLOWLOCATION, 1);
+ curl_easy_setopt(state->curl, CURLOPT_NOSIGNAL, 1);
+ curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg);
+ curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1);
+
+#ifdef DEBUG_VERBOSE
+ curl_easy_setopt(state->curl, CURLOPT_VERBOSE, 1);
+#endif
+
+has_curl:
+
+ state->s = s;
+
+ return state;
+}
+
+static void curl_clean_state(CURLState *s)
+{
+ if (s->s->multi)
+ curl_multi_remove_handle(s->s->multi, s->curl);
+ s->in_use = 0;
+}
+
+static int curl_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ BDRVCURLState *s = bs->opaque;
+ CURLState *state = NULL;
+ double d;
+
+ #define RA_OPTSTR ":readahead="
+ char *file;
+ char *ra;
+ const char *ra_val;
+ int parse_state = 0;
+
+ static int inited = 0;
+
+ file = g_strdup(filename);
+ s->readahead_size = READ_AHEAD_SIZE;
+
+ /* Parse a trailing ":readahead=#:" param, if present. */
+ ra = file + strlen(file) - 1;
+ while (ra >= file) {
+ if (parse_state == 0) {
+ if (*ra == ':')
+ parse_state++;
+ else
+ break;
+ } else if (parse_state == 1) {
+ if (*ra > '9' || *ra < '0') {
+ char *opt_start = ra - strlen(RA_OPTSTR) + 1;
+ if (opt_start > file &&
+ strncmp(opt_start, RA_OPTSTR, strlen(RA_OPTSTR)) == 0) {
+ ra_val = ra + 1;
+ ra -= strlen(RA_OPTSTR) - 1;
+ *ra = '\0';
+ s->readahead_size = atoi(ra_val);
+ break;
+ } else {
+ break;
+ }
+ }
+ }
+ ra--;
+ }
+
+ if ((s->readahead_size & 0x1ff) != 0) {
+ fprintf(stderr, "HTTP_READAHEAD_SIZE %zd is not a multiple of 512\n",
+ s->readahead_size);
+ goto out_noclean;
+ }
+
+ if (!inited) {
+ curl_global_init(CURL_GLOBAL_ALL);
+ inited = 1;
+ }
+
+ DPRINTF("CURL: Opening %s\n", file);
+ s->url = file;
+ state = curl_init_state(s);
+ if (!state)
+ goto out_noclean;
+
+ // Get file size
+
+ curl_easy_setopt(state->curl, CURLOPT_NOBODY, 1);
+ curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, (void *)curl_size_cb);
+ if (curl_easy_perform(state->curl))
+ goto out;
+ curl_easy_getinfo(state->curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &d);
+ curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, (void *)curl_read_cb);
+ curl_easy_setopt(state->curl, CURLOPT_NOBODY, 0);
+ if (d)
+ s->len = (size_t)d;
+ else if(!s->len)
+ goto out;
+ DPRINTF("CURL: Size = %zd\n", s->len);
+
+ curl_clean_state(state);
+ curl_easy_cleanup(state->curl);
+ state->curl = NULL;
+
+ // Now we know the file exists and its size, so let's
+ // initialize the multi interface!
+
+ s->multi = curl_multi_init();
+ curl_multi_setopt( s->multi, CURLMOPT_SOCKETDATA, s);
+ curl_multi_setopt( s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb );
+ curl_multi_do(s);
+
+ return 0;
+
+out:
+ fprintf(stderr, "CURL: Error opening file: %s\n", state->errmsg);
+ curl_easy_cleanup(state->curl);
+ state->curl = NULL;
+out_noclean:
+ g_free(file);
+ return -EINVAL;
+}
+
+static int curl_aio_flush(void *opaque)
+{
+ BDRVCURLState *s = opaque;
+ int i, j;
+
+ for (i=0; i < CURL_NUM_STATES; i++) {
+ for(j=0; j < CURL_NUM_ACB; j++) {
+ if (s->states[i].acb[j]) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+static void curl_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+ // Do we have to implement canceling? Seems to work without...
+}
+
+static AIOPool curl_aio_pool = {
+ .aiocb_size = sizeof(CURLAIOCB),
+ .cancel = curl_aio_cancel,
+};
+
+
+static void curl_readv_bh_cb(void *p)
+{
+ CURLState *state;
+
+ CURLAIOCB *acb = p;
+ BDRVCURLState *s = acb->common.bs->opaque;
+
+ qemu_bh_delete(acb->bh);
+ acb->bh = NULL;
+
+ size_t start = acb->sector_num * SECTOR_SIZE;
+ size_t end;
+
+ // In case we have the requested data already (e.g. read-ahead),
+ // we can just call the callback and be done.
+ switch (curl_find_buf(s, start, acb->nb_sectors * SECTOR_SIZE, acb)) {
+ case FIND_RET_OK:
+ qemu_aio_release(acb);
+ // fall through
+ case FIND_RET_WAIT:
+ return;
+ default:
+ break;
+ }
+
+ // No cache found, so let's start a new request
+ state = curl_init_state(s);
+ if (!state) {
+ acb->common.cb(acb->common.opaque, -EIO);
+ qemu_aio_release(acb);
+ return;
+ }
+
+ acb->start = 0;
+ acb->end = (acb->nb_sectors * SECTOR_SIZE);
+
+ state->buf_off = 0;
+ if (state->orig_buf)
+ g_free(state->orig_buf);
+ state->buf_start = start;
+ state->buf_len = acb->end + s->readahead_size;
+ end = MIN(start + state->buf_len, s->len) - 1;
+ state->orig_buf = g_malloc(state->buf_len);
+ state->acb[0] = acb;
+
+ snprintf(state->range, 127, "%zd-%zd", start, end);
+ DPRINTF("CURL (AIO): Reading %d at %zd (%s)\n",
+ (acb->nb_sectors * SECTOR_SIZE), start, state->range);
+ curl_easy_setopt(state->curl, CURLOPT_RANGE, state->range);
+
+ curl_multi_add_handle(s->multi, state->curl);
+ curl_multi_do(s);
+
+}
+
+static BlockDriverAIOCB *curl_aio_readv(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ CURLAIOCB *acb;
+
+ acb = qemu_aio_get(&curl_aio_pool, bs, cb, opaque);
+
+ acb->qiov = qiov;
+ acb->sector_num = sector_num;
+ acb->nb_sectors = nb_sectors;
+
+ acb->bh = qemu_bh_new(curl_readv_bh_cb, acb);
+
+ if (!acb->bh) {
+ DPRINTF("CURL: qemu_bh_new failed\n");
+ return NULL;
+ }
+
+ qemu_bh_schedule(acb->bh);
+ return &acb->common;
+}
+
+static void curl_close(BlockDriverState *bs)
+{
+ BDRVCURLState *s = bs->opaque;
+ int i;
+
+ DPRINTF("CURL: Close\n");
+ for (i=0; i<CURL_NUM_STATES; i++) {
+ if (s->states[i].in_use)
+ curl_clean_state(&s->states[i]);
+ if (s->states[i].curl) {
+ curl_easy_cleanup(s->states[i].curl);
+ s->states[i].curl = NULL;
+ }
+ if (s->states[i].orig_buf) {
+ g_free(s->states[i].orig_buf);
+ s->states[i].orig_buf = NULL;
+ }
+ }
+ if (s->multi)
+ curl_multi_cleanup(s->multi);
+ if (s->url)
+ free(s->url);
+}
+
+static int64_t curl_getlength(BlockDriverState *bs)
+{
+ BDRVCURLState *s = bs->opaque;
+ return s->len;
+}
+
+static BlockDriver bdrv_http = {
+ .format_name = "http",
+ .protocol_name = "http",
+
+ .instance_size = sizeof(BDRVCURLState),
+ .bdrv_file_open = curl_open,
+ .bdrv_close = curl_close,
+ .bdrv_getlength = curl_getlength,
+
+ .bdrv_aio_readv = curl_aio_readv,
+};
+
+static BlockDriver bdrv_https = {
+ .format_name = "https",
+ .protocol_name = "https",
+
+ .instance_size = sizeof(BDRVCURLState),
+ .bdrv_file_open = curl_open,
+ .bdrv_close = curl_close,
+ .bdrv_getlength = curl_getlength,
+
+ .bdrv_aio_readv = curl_aio_readv,
+};
+
+static BlockDriver bdrv_ftp = {
+ .format_name = "ftp",
+ .protocol_name = "ftp",
+
+ .instance_size = sizeof(BDRVCURLState),
+ .bdrv_file_open = curl_open,
+ .bdrv_close = curl_close,
+ .bdrv_getlength = curl_getlength,
+
+ .bdrv_aio_readv = curl_aio_readv,
+};
+
+static BlockDriver bdrv_ftps = {
+ .format_name = "ftps",
+ .protocol_name = "ftps",
+
+ .instance_size = sizeof(BDRVCURLState),
+ .bdrv_file_open = curl_open,
+ .bdrv_close = curl_close,
+ .bdrv_getlength = curl_getlength,
+
+ .bdrv_aio_readv = curl_aio_readv,
+};
+
+static BlockDriver bdrv_tftp = {
+ .format_name = "tftp",
+ .protocol_name = "tftp",
+
+ .instance_size = sizeof(BDRVCURLState),
+ .bdrv_file_open = curl_open,
+ .bdrv_close = curl_close,
+ .bdrv_getlength = curl_getlength,
+
+ .bdrv_aio_readv = curl_aio_readv,
+};
+
+static void curl_block_init(void)
+{
+ bdrv_register(&bdrv_http);
+ bdrv_register(&bdrv_https);
+ bdrv_register(&bdrv_ftp);
+ bdrv_register(&bdrv_ftps);
+ bdrv_register(&bdrv_tftp);
+}
+
+block_init(curl_block_init);
diff --git a/block/dmg.c b/block/dmg.c
new file mode 100644
index 000000000..37902a434
--- /dev/null
+++ b/block/dmg.c
@@ -0,0 +1,325 @@
+/*
+ * QEMU Block driver for DMG images
+ *
+ * Copyright (c) 2004 Johannes E. Schindelin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "bswap.h"
+#include "module.h"
+#include <zlib.h>
+
+typedef struct BDRVDMGState {
+ CoMutex lock;
+ /* each chunk contains a certain number of sectors,
+ * offsets[i] is the offset of chunk i in the .dmg file,
+ * lengths[i] is the length of the compressed chunk,
+ * sectors[i] is the first guest sector stored in chunk i,
+ * sectorcounts[i] is the number of sectors in that chunk,
+ * the sectors array is sorted,
+ * for 0 <= i < n_chunks */
+
+ uint32_t n_chunks;
+ uint32_t* types;
+ uint64_t* offsets;
+ uint64_t* lengths;
+ uint64_t* sectors;
+ uint64_t* sectorcounts;
+ uint32_t current_chunk;
+ uint8_t *compressed_chunk;
+ uint8_t *uncompressed_chunk;
+ z_stream zstream;
+} BDRVDMGState;
+
+static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+ int len=strlen(filename);
+ if(len>4 && !strcmp(filename+len-4,".dmg"))
+ return 2;
+ return 0;
+}
+
+static off_t read_off(BlockDriverState *bs, int64_t offset)
+{
+ uint64_t buffer;
+ if (bdrv_pread(bs->file, offset, &buffer, 8) < 8)
+ return 0;
+ return be64_to_cpu(buffer);
+}
+
+static off_t read_uint32(BlockDriverState *bs, int64_t offset)
+{
+ uint32_t buffer;
+ if (bdrv_pread(bs->file, offset, &buffer, 4) < 4)
+ return 0;
+ return be32_to_cpu(buffer);
+}
+
+static int dmg_open(BlockDriverState *bs, int flags)
+{
+ BDRVDMGState *s = bs->opaque;
+ off_t info_begin,info_end,last_in_offset,last_out_offset;
+ uint32_t count;
+ uint32_t max_compressed_size=1,max_sectors_per_chunk=1,i;
+ int64_t offset;
+
+ bs->read_only = 1;
+ s->n_chunks = 0;
+ s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL;
+
+ /* read offset of info blocks */
+ offset = bdrv_getlength(bs->file);
+ if (offset < 0) {
+ goto fail;
+ }
+ offset -= 0x1d8;
+
+ info_begin = read_off(bs, offset);
+ if (info_begin == 0) {
+ goto fail;
+ }
+
+ if (read_uint32(bs, info_begin) != 0x100) {
+ goto fail;
+ }
+
+ count = read_uint32(bs, info_begin + 4);
+ if (count == 0) {
+ goto fail;
+ }
+ info_end = info_begin + count;
+
+ offset = info_begin + 0x100;
+
+ /* read offsets */
+ last_in_offset = last_out_offset = 0;
+ while (offset < info_end) {
+ uint32_t type;
+
+ count = read_uint32(bs, offset);
+ if(count==0)
+ goto fail;
+ offset += 4;
+
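+ /* 0x6d697368 is the ASCII tag "mish", marking a block-list entry */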
+ type = read_uint32(bs, offset);
+ if (type == 0x6d697368 && count >= 244) {
+ int new_size, chunk_count;
+
+ offset += 4;
+ offset += 200;
+
+ chunk_count = (count-204)/40;
+ new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count);
+ s->types = g_realloc(s->types, new_size/2);
+ s->offsets = g_realloc(s->offsets, new_size);
+ s->lengths = g_realloc(s->lengths, new_size);
+ s->sectors = g_realloc(s->sectors, new_size);
+ s->sectorcounts = g_realloc(s->sectorcounts, new_size);
+
+ for(i=s->n_chunks;i<s->n_chunks+chunk_count;i++) {
+ s->types[i] = read_uint32(bs, offset);
+ offset += 4;
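+ /* Entries other than zlib (0x80000005), raw copy (1) and zero (2) are
+ * skipped; a 0xffffffff terminator records the running offsets that are
+ * used as the base for the following entries. */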
+ if(s->types[i]!=0x80000005 && s->types[i]!=1 && s->types[i]!=2) {
+ if(s->types[i]==0xffffffff) {
+ last_in_offset = s->offsets[i-1]+s->lengths[i-1];
+ last_out_offset = s->sectors[i-1]+s->sectorcounts[i-1];
+ }
+ chunk_count--;
+ i--;
+ offset += 36;
+ continue;
+ }
+ offset += 4;
+
+ s->sectors[i] = last_out_offset+read_off(bs, offset);
+ offset += 8;
+
+ s->sectorcounts[i] = read_off(bs, offset);
+ offset += 8;
+
+ s->offsets[i] = last_in_offset+read_off(bs, offset);
+ offset += 8;
+
+ s->lengths[i] = read_off(bs, offset);
+ offset += 8;
+
+ if(s->lengths[i]>max_compressed_size)
+ max_compressed_size = s->lengths[i];
+ if(s->sectorcounts[i]>max_sectors_per_chunk)
+ max_sectors_per_chunk = s->sectorcounts[i];
+ }
+ s->n_chunks+=chunk_count;
+ }
+ }
+
+ /* initialize zlib engine */
+ s->compressed_chunk = g_malloc(max_compressed_size+1);
+ s->uncompressed_chunk = g_malloc(512*max_sectors_per_chunk);
+ if(inflateInit(&s->zstream) != Z_OK)
+ goto fail;
+
+ s->current_chunk = s->n_chunks;
+
+ qemu_co_mutex_init(&s->lock);
+ return 0;
+fail:
+ return -1;
+}
+
+static inline int is_sector_in_chunk(BDRVDMGState* s,
+ uint32_t chunk_num,int sector_num)
+{
+ if(chunk_num>=s->n_chunks || s->sectors[chunk_num]>sector_num ||
+ s->sectors[chunk_num]+s->sectorcounts[chunk_num]<=sector_num)
+ return 0;
+ else
+ return -1;
+}
+
+static inline uint32_t search_chunk(BDRVDMGState* s,int sector_num)
+{
+ /* binary search */
+ uint32_t chunk1=0,chunk2=s->n_chunks,chunk3;
+ while(chunk1!=chunk2) {
+ chunk3 = (chunk1+chunk2)/2;
+ if(s->sectors[chunk3]>sector_num)
+ chunk2 = chunk3;
+ else if(s->sectors[chunk3]+s->sectorcounts[chunk3]>sector_num)
+ return chunk3;
+ else
+ chunk1 = chunk3;
+ }
+ return s->n_chunks; /* error */
+}
+
+static inline int dmg_read_chunk(BlockDriverState *bs, int sector_num)
+{
+ BDRVDMGState *s = bs->opaque;
+
+ if(!is_sector_in_chunk(s,s->current_chunk,sector_num)) {
+ int ret;
+ uint32_t chunk = search_chunk(s,sector_num);
+
+ if(chunk>=s->n_chunks)
+ return -1;
+
+ s->current_chunk = s->n_chunks;
+ switch(s->types[chunk]) {
+ case 0x80000005: { /* zlib compressed */
+ int i;
+
+ /* we need to buffer, because only the chunk as a whole can be
+ * inflated. */
+ i=0;
+ do {
+ ret = bdrv_pread(bs->file, s->offsets[chunk] + i,
+ s->compressed_chunk+i, s->lengths[chunk]-i);
+ if(ret<0 && errno==EINTR)
+ ret=0;
+ i+=ret;
+ } while(ret>=0 && ret+i<s->lengths[chunk]);
+
+ if (ret != s->lengths[chunk])
+ return -1;
+
+ s->zstream.next_in = s->compressed_chunk;
+ s->zstream.avail_in = s->lengths[chunk];
+ s->zstream.next_out = s->uncompressed_chunk;
+ s->zstream.avail_out = 512*s->sectorcounts[chunk];
+ ret = inflateReset(&s->zstream);
+ if(ret != Z_OK)
+ return -1;
+ ret = inflate(&s->zstream, Z_FINISH);
+ if(ret != Z_STREAM_END || s->zstream.total_out != 512*s->sectorcounts[chunk])
+ return -1;
+ break; }
+ case 1: /* copy */
+ ret = bdrv_pread(bs->file, s->offsets[chunk],
+ s->uncompressed_chunk, s->lengths[chunk]);
+ if (ret != s->lengths[chunk])
+ return -1;
+ break;
+ case 2: /* zero */
+ memset(s->uncompressed_chunk, 0, 512*s->sectorcounts[chunk]);
+ break;
+ }
+ s->current_chunk = chunk;
+ }
+ return 0;
+}
+
+static int dmg_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ BDRVDMGState *s = bs->opaque;
+ int i;
+
+ for(i=0;i<nb_sectors;i++) {
+ uint32_t sector_offset_in_chunk;
+ if(dmg_read_chunk(bs, sector_num+i) != 0)
+ return -1;
+ sector_offset_in_chunk = sector_num+i-s->sectors[s->current_chunk];
+ memcpy(buf+i*512,s->uncompressed_chunk+sector_offset_in_chunk*512,512);
+ }
+ return 0;
+}
+
+static coroutine_fn int dmg_co_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ int ret;
+ BDRVDMGState *s = bs->opaque;
+ qemu_co_mutex_lock(&s->lock);
+ ret = dmg_read(bs, sector_num, buf, nb_sectors);
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+}
+
+static void dmg_close(BlockDriverState *bs)
+{
+ BDRVDMGState *s = bs->opaque;
+ if(s->n_chunks>0) {
+ free(s->types);
+ free(s->offsets);
+ free(s->lengths);
+ free(s->sectors);
+ free(s->sectorcounts);
+ }
+ free(s->compressed_chunk);
+ free(s->uncompressed_chunk);
+ inflateEnd(&s->zstream);
+}
+
+static BlockDriver bdrv_dmg = {
+ .format_name = "dmg",
+ .instance_size = sizeof(BDRVDMGState),
+ .bdrv_probe = dmg_probe,
+ .bdrv_open = dmg_open,
+ .bdrv_read = dmg_co_read,
+ .bdrv_close = dmg_close,
+};
+
+static void bdrv_dmg_init(void)
+{
+ bdrv_register(&bdrv_dmg);
+}
+
+block_init(bdrv_dmg_init);
diff --git a/block/iscsi.c b/block/iscsi.c
new file mode 100644
index 000000000..0b96165ec
--- /dev/null
+++ b/block/iscsi.c
@@ -0,0 +1,1086 @@
+/*
+ * QEMU Block driver for iSCSI images
+ *
+ * Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "config-host.h"
+
+#include <poll.h>
+#include <arpa/inet.h>
+#include "qemu-common.h"
+#include "qemu-error.h"
+#include "block_int.h"
+#include "trace.h"
+#include "hw/scsi-defs.h"
+
+#include <iscsi/iscsi.h>
+#include <iscsi/scsi-lowlevel.h>
+
+#ifdef __linux__
+#include <scsi/sg.h>
+#include <hw/scsi-defs.h>
+#endif
+
+typedef struct IscsiLun {
+ struct iscsi_context *iscsi;
+ int lun;
+ enum scsi_inquiry_peripheral_device_type type;
+ int block_size;
+ uint64_t num_blocks;
+ int events;
+} IscsiLun;
+
+typedef struct IscsiAIOCB {
+ BlockDriverAIOCB common;
+ QEMUIOVector *qiov;
+ QEMUBH *bh;
+ IscsiLun *iscsilun;
+ struct scsi_task *task;
+ uint8_t *buf;
+ int status;
+ int canceled;
+ size_t read_size;
+ size_t read_offset;
+#ifdef __linux__
+ sg_io_hdr_t *ioh;
+#endif
+} IscsiAIOCB;
+
+struct IscsiTask {
+ IscsiLun *iscsilun;
+ BlockDriverState *bs;
+ int status;
+ int complete;
+};
+
+static void
+iscsi_bh_cb(void *p)
+{
+ IscsiAIOCB *acb = p;
+
+ qemu_bh_delete(acb->bh);
+
+ if (acb->canceled == 0) {
+ acb->common.cb(acb->common.opaque, acb->status);
+ }
+
+ if (acb->task != NULL) {
+ scsi_free_scsi_task(acb->task);
+ acb->task = NULL;
+ }
+
+ qemu_aio_release(acb);
+}
+
+static void
+iscsi_schedule_bh(IscsiAIOCB *acb)
+{
+ if (acb->bh) {
+ return;
+ }
+ acb->bh = qemu_bh_new(iscsi_bh_cb, acb);
+ qemu_bh_schedule(acb->bh);
+}
+
+
+static void
+iscsi_abort_task_cb(struct iscsi_context *iscsi, int status, void *command_data,
+ void *private_data)
+{
+ IscsiAIOCB *acb = private_data;
+
+ acb->status = -ECANCELED;
+ iscsi_schedule_bh(acb);
+}
+
+static void
+iscsi_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+ IscsiAIOCB *acb = (IscsiAIOCB *)blockacb;
+ IscsiLun *iscsilun = acb->iscsilun;
+
+ if (acb->status != -EINPROGRESS) {
+ return;
+ }
+
+ acb->canceled = 1;
+
+ /* send a task mgmt call to the target to cancel the task on the target */
+ iscsi_task_mgmt_abort_task_async(iscsilun->iscsi, acb->task,
+ iscsi_abort_task_cb, acb);
+
+ while (acb->status == -EINPROGRESS) {
+ qemu_aio_wait();
+ }
+}
+
+static AIOPool iscsi_aio_pool = {
+ .aiocb_size = sizeof(IscsiAIOCB),
+ .cancel = iscsi_aio_cancel,
+};
+
+
+static void iscsi_process_read(void *arg);
+static void iscsi_process_write(void *arg);
+
+static int iscsi_process_flush(void *arg)
+{
+ IscsiLun *iscsilun = arg;
+
+ return iscsi_queue_length(iscsilun->iscsi) > 0;
+}
+
+static void
+iscsi_set_events(IscsiLun *iscsilun)
+{
+ struct iscsi_context *iscsi = iscsilun->iscsi;
+ int ev;
+
+ /* We always register a read handler. */
+ ev = POLLIN;
+ ev |= iscsi_which_events(iscsi);
+ if (ev != iscsilun->events) {
+ qemu_aio_set_fd_handler(iscsi_get_fd(iscsi),
+ iscsi_process_read,
+ (ev & POLLOUT) ? iscsi_process_write : NULL,
+ iscsi_process_flush,
+ iscsilun);
+
+ }
+
+ /* If we just added an event, the callback might be delayed
+ * unless we call qemu_notify_event().
+ */
+ if (ev & ~iscsilun->events) {
+ qemu_notify_event();
+ }
+ iscsilun->events = ev;
+}
+
+static void
+iscsi_process_read(void *arg)
+{
+ IscsiLun *iscsilun = arg;
+ struct iscsi_context *iscsi = iscsilun->iscsi;
+
+ iscsi_service(iscsi, POLLIN);
+ iscsi_set_events(iscsilun);
+}
+
+static void
+iscsi_process_write(void *arg)
+{
+ IscsiLun *iscsilun = arg;
+ struct iscsi_context *iscsi = iscsilun->iscsi;
+
+ iscsi_service(iscsi, POLLOUT);
+ iscsi_set_events(iscsilun);
+}
+
+
+static void
+iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status,
+ void *command_data, void *opaque)
+{
+ IscsiAIOCB *acb = opaque;
+
+ trace_iscsi_aio_write16_cb(iscsi, status, acb, acb->canceled);
+
+ g_free(acb->buf);
+
+ if (acb->canceled != 0) {
+ return;
+ }
+
+ acb->status = 0;
+ if (status < 0) {
+ error_report("Failed to write16 data to iSCSI lun. %s",
+ iscsi_get_error(iscsi));
+ acb->status = -EIO;
+ }
+
+ iscsi_schedule_bh(acb);
+}
+
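+/* Convert a QEMU 512-byte sector number into an LBA expressed in the LUN's
+ * native block size. */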
+static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun)
+{
+ return sector * BDRV_SECTOR_SIZE / iscsilun->block_size;
+}
+
+static BlockDriverAIOCB *
+iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ IscsiLun *iscsilun = bs->opaque;
+ struct iscsi_context *iscsi = iscsilun->iscsi;
+ IscsiAIOCB *acb;
+ size_t size;
+ uint32_t num_sectors;
+ uint64_t lba;
+ struct iscsi_data data;
+
+ acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+ trace_iscsi_aio_writev(iscsi, sector_num, nb_sectors, opaque, acb);
+
+ acb->iscsilun = iscsilun;
+ acb->qiov = qiov;
+
+ acb->canceled = 0;
+ acb->bh = NULL;
+ acb->status = -EINPROGRESS;
+
+ /* XXX we should pass the iovec to write16 to avoid the extra copy */
+ /* this will allow us to get rid of 'buf' completely */
+ size = nb_sectors * BDRV_SECTOR_SIZE;
+ acb->buf = g_malloc(size);
+ qemu_iovec_to_buf(acb->qiov, 0, acb->buf, size);
+
+ acb->task = malloc(sizeof(struct scsi_task));
+ if (acb->task == NULL) {
+ error_report("iSCSI: Failed to allocate task for scsi WRITE16 "
+ "command. %s", iscsi_get_error(iscsi));
+ qemu_aio_release(acb);
+ return NULL;
+ }
+ memset(acb->task, 0, sizeof(struct scsi_task));
+
+ acb->task->xfer_dir = SCSI_XFER_WRITE;
+ acb->task->cdb_size = 16;
+ acb->task->cdb[0] = 0x8a;
+ if (!(bs->open_flags & BDRV_O_CACHE_WB)) {
+ /* set FUA on writes when cache mode is write through */
+ acb->task->cdb[1] |= 0x04;
+ }
+ lba = sector_qemu2lun(sector_num, iscsilun);
+ *(uint32_t *)&acb->task->cdb[2] = htonl(lba >> 32);
+ *(uint32_t *)&acb->task->cdb[6] = htonl(lba & 0xffffffff);
+ num_sectors = size / iscsilun->block_size;
+ *(uint32_t *)&acb->task->cdb[10] = htonl(num_sectors);
+ acb->task->expxferlen = size;
+
+ data.data = acb->buf;
+ data.size = size;
+
+ if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task,
+ iscsi_aio_write16_cb,
+ &data,
+ acb) != 0) {
+ scsi_free_scsi_task(acb->task);
+ g_free(acb->buf);
+ qemu_aio_release(acb);
+ return NULL;
+ }
+
+ iscsi_set_events(iscsilun);
+
+ return &acb->common;
+}
+
+static void
+iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status,
+ void *command_data, void *opaque)
+{
+ IscsiAIOCB *acb = opaque;
+
+ trace_iscsi_aio_read16_cb(iscsi, status, acb, acb->canceled);
+
+ if (acb->canceled != 0) {
+ return;
+ }
+
+ acb->status = 0;
+ if (status != 0) {
+ error_report("Failed to read16 data from iSCSI lun. %s",
+ iscsi_get_error(iscsi));
+ acb->status = -EIO;
+ }
+
+ iscsi_schedule_bh(acb);
+}
+
+static BlockDriverAIOCB *
+iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ IscsiLun *iscsilun = bs->opaque;
+ struct iscsi_context *iscsi = iscsilun->iscsi;
+ IscsiAIOCB *acb;
+ size_t qemu_read_size;
+ int i;
+ uint64_t lba;
+ uint32_t num_sectors;
+
+ qemu_read_size = BDRV_SECTOR_SIZE * (size_t)nb_sectors;
+
+ acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+ trace_iscsi_aio_readv(iscsi, sector_num, nb_sectors, opaque, acb);
+
+ acb->iscsilun = iscsilun;
+ acb->qiov = qiov;
+
+ acb->canceled = 0;
+ acb->bh = NULL;
+ acb->status = -EINPROGRESS;
+ acb->read_size = qemu_read_size;
+ acb->buf = NULL;
+
+ /* If the LUN blocksize is bigger than BDRV_SECTOR_SIZE, a read from QEMU
+ * may be misaligned to the LUN, so we may need to read some extra
+ * data.
+ */
+ acb->read_offset = 0;
+ if (iscsilun->block_size > BDRV_SECTOR_SIZE) {
+ uint64_t bdrv_offset = BDRV_SECTOR_SIZE * sector_num;
+
+ acb->read_offset = bdrv_offset % iscsilun->block_size;
+ }
+
+ num_sectors = (qemu_read_size + iscsilun->block_size
+ + acb->read_offset - 1)
+ / iscsilun->block_size;
+
+ acb->task = malloc(sizeof(struct scsi_task));
+ if (acb->task == NULL) {
+ error_report("iSCSI: Failed to allocate task for scsi READ16 "
+ "command. %s", iscsi_get_error(iscsi));
+ qemu_aio_release(acb);
+ return NULL;
+ }
+ memset(acb->task, 0, sizeof(struct scsi_task));
+
+ acb->task->xfer_dir = SCSI_XFER_READ;
+ lba = sector_qemu2lun(sector_num, iscsilun);
+ acb->task->expxferlen = qemu_read_size;
+
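+ /* Disks get a 16-byte READ(16) CDB; other device types (e.g. CD-ROMs) use
+ * the 10-byte READ(10) form with a 32-bit LBA. */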
+ switch (iscsilun->type) {
+ case TYPE_DISK:
+ acb->task->cdb_size = 16;
+ acb->task->cdb[0] = 0x88;
+ *(uint32_t *)&acb->task->cdb[2] = htonl(lba >> 32);
+ *(uint32_t *)&acb->task->cdb[6] = htonl(lba & 0xffffffff);
+ *(uint32_t *)&acb->task->cdb[10] = htonl(num_sectors);
+ break;
+ default:
+ acb->task->cdb_size = 10;
+ acb->task->cdb[0] = 0x28;
+ *(uint32_t *)&acb->task->cdb[2] = htonl(lba);
+ *(uint16_t *)&acb->task->cdb[7] = htons(num_sectors);
+ break;
+ }
+
+ if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task,
+ iscsi_aio_read16_cb,
+ NULL,
+ acb) != 0) {
+ scsi_free_scsi_task(acb->task);
+ qemu_aio_release(acb);
+ return NULL;
+ }
+
+ for (i = 0; i < acb->qiov->niov; i++) {
+ scsi_task_add_data_in_buffer(acb->task,
+ acb->qiov->iov[i].iov_len,
+ acb->qiov->iov[i].iov_base);
+ }
+
+ iscsi_set_events(iscsilun);
+
+ return &acb->common;
+}
+
+
+static void
+iscsi_synccache10_cb(struct iscsi_context *iscsi, int status,
+ void *command_data, void *opaque)
+{
+ IscsiAIOCB *acb = opaque;
+
+ if (acb->canceled != 0) {
+ return;
+ }
+
+ acb->status = 0;
+ if (status < 0) {
+ error_report("Failed to sync10 data on iSCSI lun. %s",
+ iscsi_get_error(iscsi));
+ acb->status = -EIO;
+ }
+
+ iscsi_schedule_bh(acb);
+}
+
+static BlockDriverAIOCB *
+iscsi_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ IscsiLun *iscsilun = bs->opaque;
+ struct iscsi_context *iscsi = iscsilun->iscsi;
+ IscsiAIOCB *acb;
+
+ acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+
+ acb->iscsilun = iscsilun;
+ acb->canceled = 0;
+ acb->bh = NULL;
+ acb->status = -EINPROGRESS;
+
+ acb->task = iscsi_synchronizecache10_task(iscsi, iscsilun->lun,
+ 0, 0, 0, 0,
+ iscsi_synccache10_cb,
+ acb);
+ if (acb->task == NULL) {
+ error_report("iSCSI: Failed to send synchronizecache10 command. %s",
+ iscsi_get_error(iscsi));
+ qemu_aio_release(acb);
+ return NULL;
+ }
+
+ iscsi_set_events(iscsilun);
+
+ return &acb->common;
+}
+
+static void
+iscsi_unmap_cb(struct iscsi_context *iscsi, int status,
+ void *command_data, void *opaque)
+{
+ IscsiAIOCB *acb = opaque;
+
+ if (acb->canceled != 0) {
+ return;
+ }
+
+ acb->status = 0;
+ if (status < 0) {
+ error_report("Failed to unmap data on iSCSI lun. %s",
+ iscsi_get_error(iscsi));
+ acb->status = -EIO;
+ }
+
+ iscsi_schedule_bh(acb);
+}
+
+static BlockDriverAIOCB *
+iscsi_aio_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ IscsiLun *iscsilun = bs->opaque;
+ struct iscsi_context *iscsi = iscsilun->iscsi;
+ IscsiAIOCB *acb;
+ struct unmap_list list[1];
+
+ acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+
+ acb->iscsilun = iscsilun;
+ acb->canceled = 0;
+ acb->bh = NULL;
+ acb->status = -EINPROGRESS;
+
+ list[0].lba = sector_qemu2lun(sector_num, iscsilun);
+ list[0].num = nb_sectors * BDRV_SECTOR_SIZE / iscsilun->block_size;
+
+ acb->task = iscsi_unmap_task(iscsi, iscsilun->lun,
+ 0, 0, &list[0], 1,
+ iscsi_unmap_cb,
+ acb);
+ if (acb->task == NULL) {
+ error_report("iSCSI: Failed to send unmap command. %s",
+ iscsi_get_error(iscsi));
+ qemu_aio_release(acb);
+ return NULL;
+ }
+
+ iscsi_set_events(iscsilun);
+
+ return &acb->common;
+}
+
+#ifdef __linux__
+static void
+iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
+ void *command_data, void *opaque)
+{
+ IscsiAIOCB *acb = opaque;
+
+ if (acb->canceled != 0) {
+ return;
+ }
+
+ acb->status = 0;
+ if (status < 0) {
+ error_report("Failed to ioctl(SG_IO) to iSCSI lun. %s",
+ iscsi_get_error(iscsi));
+ acb->status = -EIO;
+ }
+
+ acb->ioh->driver_status = 0;
+ acb->ioh->host_status = 0;
+ acb->ioh->resid = 0;
+
+#define SG_ERR_DRIVER_SENSE 0x08
+
+ if (status == SCSI_STATUS_CHECK_CONDITION && acb->task->datain.size >= 2) {
+ int ss;
+
+ acb->ioh->driver_status |= SG_ERR_DRIVER_SENSE;
+
+ acb->ioh->sb_len_wr = acb->task->datain.size - 2;
+ ss = (acb->ioh->mx_sb_len >= acb->ioh->sb_len_wr) ?
+ acb->ioh->mx_sb_len : acb->ioh->sb_len_wr;
+ memcpy(acb->ioh->sbp, &acb->task->datain.data[2], ss);
+ }
+
+ iscsi_schedule_bh(acb);
+}
+
+static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
+ unsigned long int req, void *buf,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ IscsiLun *iscsilun = bs->opaque;
+ struct iscsi_context *iscsi = iscsilun->iscsi;
+ struct iscsi_data data;
+ IscsiAIOCB *acb;
+
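+ /* SG_IO passthrough: the guest-supplied sg_io_hdr provides the CDB, the
+ * transfer direction and the data buffer for the iSCSI command. */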
+ assert(req == SG_IO);
+
+ acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+
+ acb->iscsilun = iscsilun;
+ acb->canceled = 0;
+ acb->bh = NULL;
+ acb->status = -EINPROGRESS;
+ acb->buf = NULL;
+ acb->ioh = buf;
+
+ acb->task = malloc(sizeof(struct scsi_task));
+ if (acb->task == NULL) {
+ error_report("iSCSI: Failed to allocate task for scsi command. %s",
+ iscsi_get_error(iscsi));
+ qemu_aio_release(acb);
+ return NULL;
+ }
+ memset(acb->task, 0, sizeof(struct scsi_task));
+
+ switch (acb->ioh->dxfer_direction) {
+ case SG_DXFER_TO_DEV:
+ acb->task->xfer_dir = SCSI_XFER_WRITE;
+ break;
+ case SG_DXFER_FROM_DEV:
+ acb->task->xfer_dir = SCSI_XFER_READ;
+ break;
+ default:
+ acb->task->xfer_dir = SCSI_XFER_NONE;
+ break;
+ }
+
+ acb->task->cdb_size = acb->ioh->cmd_len;
+ memcpy(&acb->task->cdb[0], acb->ioh->cmdp, acb->ioh->cmd_len);
+ acb->task->expxferlen = acb->ioh->dxfer_len;
+
+ if (acb->task->xfer_dir == SCSI_XFER_WRITE) {
+ data.data = acb->ioh->dxferp;
+ data.size = acb->ioh->dxfer_len;
+ }
+ if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task,
+ iscsi_aio_ioctl_cb,
+ (acb->task->xfer_dir == SCSI_XFER_WRITE) ?
+ &data : NULL,
+ acb) != 0) {
+ scsi_free_scsi_task(acb->task);
+ qemu_aio_release(acb);
+ return NULL;
+ }
+
+ /* tell libiscsi to read straight into the buffer we got from ioctl */
+ if (acb->task->xfer_dir == SCSI_XFER_READ) {
+ scsi_task_add_data_in_buffer(acb->task,
+ acb->ioh->dxfer_len,
+ acb->ioh->dxferp);
+ }
+
+ iscsi_set_events(iscsilun);
+
+ return &acb->common;
+}
+
+static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+ IscsiLun *iscsilun = bs->opaque;
+
+ switch (req) {
+ case SG_GET_VERSION_NUM:
+ *(int *)buf = 30000;
+ break;
+ case SG_GET_SCSI_ID:
+ ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type;
+ break;
+ default:
+ return -1;
+ }
+ return 0;
+}
+#endif
+
+static int64_t
+iscsi_getlength(BlockDriverState *bs)
+{
+ IscsiLun *iscsilun = bs->opaque;
+ int64_t len;
+
+ len = iscsilun->num_blocks;
+ len *= iscsilun->block_size;
+
+ return len;
+}
+
+static void
+iscsi_readcapacity16_cb(struct iscsi_context *iscsi, int status,
+ void *command_data, void *opaque)
+{
+ struct IscsiTask *itask = opaque;
+ struct scsi_readcapacity16 *rc16;
+ struct scsi_task *task = command_data;
+
+ if (status != 0) {
+ error_report("iSCSI: Failed to read capacity of iSCSI lun. %s",
+ iscsi_get_error(iscsi));
+ itask->status = 1;
+ itask->complete = 1;
+ scsi_free_scsi_task(task);
+ return;
+ }
+
+ rc16 = scsi_datain_unmarshall(task);
+ if (rc16 == NULL) {
+ error_report("iSCSI: Failed to unmarshall readcapacity16 data.");
+ itask->status = 1;
+ itask->complete = 1;
+ scsi_free_scsi_task(task);
+ return;
+ }
+
+ itask->iscsilun->block_size = rc16->block_length;
+ itask->iscsilun->num_blocks = rc16->returned_lba + 1;
+ itask->bs->total_sectors = itask->iscsilun->num_blocks *
+ itask->iscsilun->block_size / BDRV_SECTOR_SIZE ;
+
+ itask->status = 0;
+ itask->complete = 1;
+ scsi_free_scsi_task(task);
+}
+
+static void
+iscsi_readcapacity10_cb(struct iscsi_context *iscsi, int status,
+ void *command_data, void *opaque)
+{
+ struct IscsiTask *itask = opaque;
+ struct scsi_readcapacity10 *rc10;
+ struct scsi_task *task = command_data;
+
+ if (status != 0) {
+ error_report("iSCSI: Failed to read capacity of iSCSI lun. %s",
+ iscsi_get_error(iscsi));
+ itask->status = 1;
+ itask->complete = 1;
+ scsi_free_scsi_task(task);
+ return;
+ }
+
+ rc10 = scsi_datain_unmarshall(task);
+ if (rc10 == NULL) {
+ error_report("iSCSI: Failed to unmarshall readcapacity10 data.");
+ itask->status = 1;
+ itask->complete = 1;
+ scsi_free_scsi_task(task);
+ return;
+ }
+
+ itask->iscsilun->block_size = rc10->block_size;
+ if (rc10->lba == 0) {
+ /* blank disk loaded */
+ itask->iscsilun->num_blocks = 0;
+ } else {
+ itask->iscsilun->num_blocks = rc10->lba + 1;
+ }
+ itask->bs->total_sectors = itask->iscsilun->num_blocks *
+ itask->iscsilun->block_size / BDRV_SECTOR_SIZE ;
+
+ itask->status = 0;
+ itask->complete = 1;
+ scsi_free_scsi_task(task);
+}
+
+static void
+iscsi_inquiry_cb(struct iscsi_context *iscsi, int status, void *command_data,
+ void *opaque)
+{
+ struct IscsiTask *itask = opaque;
+ struct scsi_task *task = command_data;
+ struct scsi_inquiry_standard *inq;
+
+ if (status != 0) {
+ itask->status = 1;
+ itask->complete = 1;
+ scsi_free_scsi_task(task);
+ return;
+ }
+
+ inq = scsi_datain_unmarshall(task);
+ if (inq == NULL) {
+ error_report("iSCSI: Failed to unmarshall inquiry data.");
+ itask->status = 1;
+ itask->complete = 1;
+ scsi_free_scsi_task(task);
+ return;
+ }
+
+ itask->iscsilun->type = inq->periperal_device_type;
+
+ scsi_free_scsi_task(task);
+
+ switch (itask->iscsilun->type) {
+ case TYPE_DISK:
+ task = iscsi_readcapacity16_task(iscsi, itask->iscsilun->lun,
+ iscsi_readcapacity16_cb, opaque);
+ if (task == NULL) {
+ error_report("iSCSI: failed to send readcapacity16 command.");
+ itask->status = 1;
+ itask->complete = 1;
+ return;
+ }
+ break;
+ case TYPE_ROM:
+ task = iscsi_readcapacity10_task(iscsi, itask->iscsilun->lun,
+ 0, 0,
+ iscsi_readcapacity10_cb, opaque);
+ if (task == NULL) {
+ error_report("iSCSI: failed to send readcapacity16 command.");
+ itask->status = 1;
+ itask->complete = 1;
+ return;
+ }
+ break;
+ default:
+ itask->status = 0;
+ itask->complete = 1;
+ }
+}
+
+static void
+iscsi_connect_cb(struct iscsi_context *iscsi, int status, void *command_data,
+ void *opaque)
+{
+ struct IscsiTask *itask = opaque;
+ struct scsi_task *task;
+
+ if (status != 0) {
+ itask->status = 1;
+ itask->complete = 1;
+ return;
+ }
+
+ task = iscsi_inquiry_task(iscsi, itask->iscsilun->lun,
+ 0, 0, 36,
+ iscsi_inquiry_cb, opaque);
+ if (task == NULL) {
+ error_report("iSCSI: failed to send inquiry command.");
+ itask->status = 1;
+ itask->complete = 1;
+ return;
+ }
+}
+
+static int parse_chap(struct iscsi_context *iscsi, const char *target)
+{
+ QemuOptsList *list;
+ QemuOpts *opts;
+ const char *user = NULL;
+ const char *password = NULL;
+
+ list = qemu_find_opts("iscsi");
+ if (!list) {
+ return 0;
+ }
+
+ opts = qemu_opts_find(list, target);
+ if (opts == NULL) {
+ opts = QTAILQ_FIRST(&list->head);
+ if (!opts) {
+ return 0;
+ }
+ }
+
+ user = qemu_opt_get(opts, "user");
+ if (!user) {
+ return 0;
+ }
+
+ password = qemu_opt_get(opts, "password");
+ if (!password) {
+ error_report("CHAP username specified but no password was given");
+ return -1;
+ }
+
+ if (iscsi_set_initiator_username_pwd(iscsi, user, password)) {
+ error_report("Failed to set initiator username and password");
+ return -1;
+ }
+
+ return 0;
+}
+
+static void parse_header_digest(struct iscsi_context *iscsi, const char *target)
+{
+ QemuOptsList *list;
+ QemuOpts *opts;
+ const char *digest = NULL;
+
+ list = qemu_find_opts("iscsi");
+ if (!list) {
+ return;
+ }
+
+ opts = qemu_opts_find(list, target);
+ if (opts == NULL) {
+ opts = QTAILQ_FIRST(&list->head);
+ if (!opts) {
+ return;
+ }
+ }
+
+ digest = qemu_opt_get(opts, "header-digest");
+ if (!digest) {
+ return;
+ }
+
+ if (!strcmp(digest, "CRC32C")) {
+ iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_CRC32C);
+ } else if (!strcmp(digest, "NONE")) {
+ iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE);
+ } else if (!strcmp(digest, "CRC32C-NONE")) {
+ iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_CRC32C_NONE);
+ } else if (!strcmp(digest, "NONE-CRC32C")) {
+ iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C);
+ } else {
+ error_report("Invalid header-digest setting : %s", digest);
+ }
+}
+
+static char *parse_initiator_name(const char *target)
+{
+ QemuOptsList *list;
+ QemuOpts *opts;
+ const char *name = NULL;
+ const char *iscsi_name = qemu_get_vm_name();
+
+ list = qemu_find_opts("iscsi");
+ if (list) {
+ opts = qemu_opts_find(list, target);
+ if (!opts) {
+ opts = QTAILQ_FIRST(&list->head);
+ }
+ if (opts) {
+ name = qemu_opt_get(opts, "initiator-name");
+ }
+ }
+
+ if (name) {
+ return g_strdup(name);
+ } else {
+ return g_strdup_printf("iqn.2008-11.org.linux-kvm%s%s",
+ iscsi_name ? ":" : "",
+ iscsi_name ? iscsi_name : "");
+ }
+}
+
+/*
+ * We support iSCSI URLs of the form
+ * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun>
+ */
+static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ IscsiLun *iscsilun = bs->opaque;
+ struct iscsi_context *iscsi = NULL;
+ struct iscsi_url *iscsi_url = NULL;
+ struct IscsiTask task;
+ char *initiator_name = NULL;
+ int ret;
+
+ if ((BDRV_SECTOR_SIZE % 512) != 0) {
+ error_report("iSCSI: Invalid BDRV_SECTOR_SIZE. "
+ "BDRV_SECTOR_SIZE(%lld) is not a multiple "
+ "of 512", BDRV_SECTOR_SIZE);
+ return -EINVAL;
+ }
+
+ iscsi_url = iscsi_parse_full_url(iscsi, filename);
+ if (iscsi_url == NULL) {
+ error_report("Failed to parse URL : %s %s", filename,
+ iscsi_get_error(iscsi));
+ ret = -EINVAL;
+ goto out;
+ }
+
+ memset(iscsilun, 0, sizeof(IscsiLun));
+
+ initiator_name = parse_initiator_name(iscsi_url->target);
+
+ iscsi = iscsi_create_context(initiator_name);
+ if (iscsi == NULL) {
+ error_report("iSCSI: Failed to create iSCSI context.");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (iscsi_set_targetname(iscsi, iscsi_url->target)) {
+ error_report("iSCSI: Failed to set target name.");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (iscsi_url->user != NULL) {
+ ret = iscsi_set_initiator_username_pwd(iscsi, iscsi_url->user,
+ iscsi_url->passwd);
+ if (ret != 0) {
+ error_report("Failed to set initiator username and password");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /* check if we got CHAP username/password via the options */
+ if (parse_chap(iscsi, iscsi_url->target) != 0) {
+ error_report("iSCSI: Failed to set CHAP user/password");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (iscsi_set_session_type(iscsi, ISCSI_SESSION_NORMAL) != 0) {
+ error_report("iSCSI: Failed to set session type to normal.");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C);
+
+ /* check if we got HEADER_DIGEST via the options */
+ parse_header_digest(iscsi, iscsi_url->target);
+
+ task.iscsilun = iscsilun;
+ task.status = 0;
+ task.complete = 0;
+ task.bs = bs;
+
+ iscsilun->iscsi = iscsi;
+ iscsilun->lun = iscsi_url->lun;
+
+ if (iscsi_full_connect_async(iscsi, iscsi_url->portal, iscsi_url->lun,
+ iscsi_connect_cb, &task)
+ != 0) {
+ error_report("iSCSI: Failed to start async connect.");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ while (!task.complete) {
+ iscsi_set_events(iscsilun);
+ qemu_aio_wait();
+ }
+ if (task.status != 0) {
+ error_report("iSCSI: Failed to connect to LUN : %s",
+ iscsi_get_error(iscsi));
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Medium changer or tape. We don't have any emulation for these, so they
+ * must be sg ioctl compatible. We force them to be sg; otherwise qemu will
+ * try to read from the device to guess the image format.
+ */
+ if (iscsilun->type == TYPE_MEDIUM_CHANGER ||
+ iscsilun->type == TYPE_TAPE) {
+ bs->sg = 1;
+ }
+
+ ret = 0;
+
+out:
+ if (initiator_name != NULL) {
+ g_free(initiator_name);
+ }
+ if (iscsi_url != NULL) {
+ iscsi_destroy_url(iscsi_url);
+ }
+
+ if (ret) {
+ if (iscsi != NULL) {
+ iscsi_destroy_context(iscsi);
+ }
+ memset(iscsilun, 0, sizeof(IscsiLun));
+ }
+ return ret;
+}
+
+static void iscsi_close(BlockDriverState *bs)
+{
+ IscsiLun *iscsilun = bs->opaque;
+ struct iscsi_context *iscsi = iscsilun->iscsi;
+
+ qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL, NULL);
+ iscsi_destroy_context(iscsi);
+ memset(iscsilun, 0, sizeof(IscsiLun));
+}
+
+static BlockDriver bdrv_iscsi = {
+ .format_name = "iscsi",
+ .protocol_name = "iscsi",
+
+ .instance_size = sizeof(IscsiLun),
+ .bdrv_file_open = iscsi_open,
+ .bdrv_close = iscsi_close,
+
+ .bdrv_getlength = iscsi_getlength,
+
+ .bdrv_aio_readv = iscsi_aio_readv,
+ .bdrv_aio_writev = iscsi_aio_writev,
+ .bdrv_aio_flush = iscsi_aio_flush,
+
+ .bdrv_aio_discard = iscsi_aio_discard,
+
+#ifdef __linux__
+ .bdrv_ioctl = iscsi_ioctl,
+ .bdrv_aio_ioctl = iscsi_aio_ioctl,
+#endif
+};
+
+static void iscsi_block_init(void)
+{
+ bdrv_register(&bdrv_iscsi);
+}
+
+block_init(iscsi_block_init);
diff --git a/block/nbd.c b/block/nbd.c
new file mode 100644
index 000000000..2bce47bf7
--- /dev/null
+++ b/block/nbd.c
@@ -0,0 +1,517 @@
+/*
+ * QEMU Block driver for NBD
+ *
+ * Copyright (C) 2008 Bull S.A.S.
+ * Author: Laurent Vivier <Laurent.Vivier@bull.net>
+ *
+ * Some parts:
+ * Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "nbd.h"
+#include "block_int.h"
+#include "module.h"
+#include "qemu_socket.h"
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#define EN_OPTSTR ":exportname="
+
+/* #define DEBUG_NBD */
+
+#if defined(DEBUG_NBD)
+#define logout(fmt, ...) \
+ fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__)
+#else
+#define logout(fmt, ...) ((void)0)
+#endif
+
+#define MAX_NBD_REQUESTS 16
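+/* Request handles encode the BDRVNBDState pointer XORed with a slot index,
+ * so a reply's handle can be mapped back to the coroutine waiting in that
+ * slot. */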
+#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs))
+#define INDEX_TO_HANDLE(bs, index) ((index) ^ ((uint64_t)(intptr_t)bs))
+
+typedef struct BDRVNBDState {
+ int sock;
+ uint32_t nbdflags;
+ off_t size;
+ size_t blocksize;
+ char *export_name; /* An NBD server may export several devices */
+
+ CoMutex send_mutex;
+ CoMutex free_sema;
+ Coroutine *send_coroutine;
+ int in_flight;
+
+ Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
+ struct nbd_reply reply;
+
+ /* If it begins with '/', this is a UNIX domain socket. Otherwise,
+ * it's a string of the form <hostname|ip4|\[ip6\]>:port
+ */
+ char *host_spec;
+} BDRVNBDState;
+
+static int nbd_config(BDRVNBDState *s, const char *filename, int flags)
+{
+ char *file;
+ char *export_name;
+ const char *host_spec;
+ const char *unixpath;
+ int err = -EINVAL;
+
+ file = g_strdup(filename);
+
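+ /* Split off an optional ":exportname=<name>" suffix, then require an
+ * "nbd:" prefix followed by either "unix:<absolute path>" or a
+ * host:port spec. */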
+ export_name = strstr(file, EN_OPTSTR);
+ if (export_name) {
+ if (export_name[strlen(EN_OPTSTR)] == 0) {
+ goto out;
+ }
+ export_name[0] = 0; /* truncate 'file' */
+ export_name += strlen(EN_OPTSTR);
+ s->export_name = g_strdup(export_name);
+ }
+
+ /* extract the host_spec - fail if it's not nbd:... */
+ if (!strstart(file, "nbd:", &host_spec)) {
+ goto out;
+ }
+
+ /* are we a UNIX or TCP socket? */
+ if (strstart(host_spec, "unix:", &unixpath)) {
+ if (unixpath[0] != '/') { /* We demand an absolute path */
+ goto out;
+ }
+ s->host_spec = g_strdup(unixpath);
+ } else {
+ s->host_spec = g_strdup(host_spec);
+ }
+
+ err = 0;
+
+out:
+ g_free(file);
+ if (err != 0) {
+ g_free(s->export_name);
+ g_free(s->host_spec);
+ }
+ return err;
+}
+
+static void nbd_coroutine_start(BDRVNBDState *s, struct nbd_request *request)
+{
+ int i;
+
+ /* Poor man's semaphore. The free_sema is locked when no other request
+ * can be accepted, and unlocked after receiving one reply. */
+ if (s->in_flight >= MAX_NBD_REQUESTS - 1) {
+ qemu_co_mutex_lock(&s->free_sema);
+ assert(s->in_flight < MAX_NBD_REQUESTS);
+ }
+ s->in_flight++;
+
+ for (i = 0; i < MAX_NBD_REQUESTS; i++) {
+ if (s->recv_coroutine[i] == NULL) {
+ s->recv_coroutine[i] = qemu_coroutine_self();
+ break;
+ }
+ }
+
+ assert(i < MAX_NBD_REQUESTS);
+ request->handle = INDEX_TO_HANDLE(s, i);
+}
+
+static int nbd_have_request(void *opaque)
+{
+ BDRVNBDState *s = opaque;
+
+ return s->in_flight > 0;
+}
+
+static void nbd_reply_ready(void *opaque)
+{
+ BDRVNBDState *s = opaque;
+ uint64_t i;
+ int ret;
+
+ if (s->reply.handle == 0) {
+ /* No reply already in flight. Fetch a header. It is possible
+ * that another thread has done the same thing in parallel, so
+ * the socket is not readable anymore.
+ */
+ ret = nbd_receive_reply(s->sock, &s->reply);
+ if (ret == -EAGAIN) {
+ return;
+ }
+ if (ret < 0) {
+ s->reply.handle = 0;
+ goto fail;
+ }
+ }
+
+ /* There's no need for a mutex on the receive side, because the
+ * handler acts as a synchronization point and ensures that only
+ * one coroutine is called until the reply finishes. */
+ i = HANDLE_TO_INDEX(s, s->reply.handle);
+ if (i >= MAX_NBD_REQUESTS) {
+ goto fail;
+ }
+
+ if (s->recv_coroutine[i]) {
+ qemu_coroutine_enter(s->recv_coroutine[i], NULL);
+ return;
+ }
+
+fail:
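+ /* On error, wake every pending coroutine so each request can observe the
+ * failure (the handle mismatch is turned into EIO on the receive side). */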
+ for (i = 0; i < MAX_NBD_REQUESTS; i++) {
+ if (s->recv_coroutine[i]) {
+ qemu_coroutine_enter(s->recv_coroutine[i], NULL);
+ }
+ }
+}
+
+static void nbd_restart_write(void *opaque)
+{
+ BDRVNBDState *s = opaque;
+ qemu_coroutine_enter(s->send_coroutine, NULL);
+}
+
+static int nbd_co_send_request(BDRVNBDState *s, struct nbd_request *request,
+ QEMUIOVector *qiov, int offset)
+{
+ int rc, ret;
+
+ qemu_co_mutex_lock(&s->send_mutex);
+ s->send_coroutine = qemu_coroutine_self();
+ qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write,
+ nbd_have_request, s);
+ rc = nbd_send_request(s->sock, request);
+ if (rc >= 0 && qiov) {
+ ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov,
+ offset, request->len);
+ if (ret != request->len) {
+ return -EIO;
+ }
+ }
+ qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL,
+ nbd_have_request, s);
+ s->send_coroutine = NULL;
+ qemu_co_mutex_unlock(&s->send_mutex);
+ return rc;
+}
+
+static void nbd_co_receive_reply(BDRVNBDState *s, struct nbd_request *request,
+ struct nbd_reply *reply,
+ QEMUIOVector *qiov, int offset)
+{
+ int ret;
+
+ /* Wait until we're woken up by the read handler. TODO: perhaps
+ * peek at the next reply and avoid yielding if it's ours? */
+ qemu_coroutine_yield();
+ *reply = s->reply;
+ if (reply->handle != request->handle) {
+ reply->error = EIO;
+ } else {
+ if (qiov && reply->error == 0) {
+ ret = qemu_co_recvv(s->sock, qiov->iov, qiov->niov,
+ offset, request->len);
+ if (ret != request->len) {
+ reply->error = EIO;
+ }
+ }
+
+ /* Tell the read handler to read another header. */
+ s->reply.handle = 0;
+ }
+}
+
+static void nbd_coroutine_end(BDRVNBDState *s, struct nbd_request *request)
+{
+ int i = HANDLE_TO_INDEX(s, request->handle);
+ s->recv_coroutine[i] = NULL;
+ if (s->in_flight-- == MAX_NBD_REQUESTS) {
+ qemu_co_mutex_unlock(&s->free_sema);
+ }
+}
+
+static int nbd_establish_connection(BlockDriverState *bs)
+{
+ BDRVNBDState *s = bs->opaque;
+ int sock;
+ int ret;
+ off_t size;
+ size_t blocksize;
+
+ if (s->host_spec[0] == '/') {
+ sock = unix_socket_outgoing(s->host_spec);
+ } else {
+ sock = tcp_socket_outgoing_spec(s->host_spec);
+ }
+
+ /* Failed to establish connection */
+ if (sock < 0) {
+ logout("Failed to establish connection to NBD server\n");
+ return -errno;
+ }
+
+ /* NBD handshake */
+ ret = nbd_receive_negotiate(sock, s->export_name, &s->nbdflags, &size,
+ &blocksize);
+ if (ret < 0) {
+ logout("Failed to negotiate with the NBD server\n");
+ closesocket(sock);
+ return ret;
+ }
+
+ /* Now that we're connected, set the socket to be non-blocking and
+ * kick the reply mechanism. */
+ socket_set_nonblock(sock);
+ qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL,
+ nbd_have_request, s);
+
+ s->sock = sock;
+ s->size = size;
+ s->blocksize = blocksize;
+
+ logout("Established connection with NBD server\n");
+ return 0;
+}
+
+static void nbd_teardown_connection(BlockDriverState *bs)
+{
+ BDRVNBDState *s = bs->opaque;
+ struct nbd_request request;
+
+ request.type = NBD_CMD_DISC;
+ request.from = 0;
+ request.len = 0;
+ nbd_send_request(s->sock, &request);
+
+ qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL, NULL);
+ closesocket(s->sock);
+}
+
+static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
+{
+ BDRVNBDState *s = bs->opaque;
+ int result;
+
+ qemu_co_mutex_init(&s->send_mutex);
+ qemu_co_mutex_init(&s->free_sema);
+
+ /* Pop the config into our state object. Exit if invalid. */
+ result = nbd_config(s, filename, flags);
+ if (result != 0) {
+ return result;
+ }
+
+ /* establish TCP connection, return error if it fails
+ * TODO: Configurable retry-until-timeout behaviour.
+ */
+ result = nbd_establish_connection(bs);
+
+ return result;
+}
+
+static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov,
+ int offset)
+{
+ BDRVNBDState *s = bs->opaque;
+ struct nbd_request request;
+ struct nbd_reply reply;
+ ssize_t ret;
+
+ request.type = NBD_CMD_READ;
+ request.from = sector_num * 512;
+ request.len = nb_sectors * 512;
+
+ nbd_coroutine_start(s, &request);
+ ret = nbd_co_send_request(s, &request, NULL, 0);
+ if (ret < 0) {
+ reply.error = -ret;
+ } else {
+ nbd_co_receive_reply(s, &request, &reply, qiov, offset);
+ }
+ nbd_coroutine_end(s, &request);
+ return -reply.error;
+
+}
+
+static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov,
+ int offset)
+{
+ BDRVNBDState *s = bs->opaque;
+ struct nbd_request request;
+ struct nbd_reply reply;
+ ssize_t ret;
+
+ request.type = NBD_CMD_WRITE;
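+ /* In write-through mode, ask the server to persist the write immediately
+ * by setting FUA, provided the export advertises support for it. */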
+ if (!bdrv_enable_write_cache(bs) && (s->nbdflags & NBD_FLAG_SEND_FUA)) {
+ request.type |= NBD_CMD_FLAG_FUA;
+ }
+
+ request.from = sector_num * 512;
+ request.len = nb_sectors * 512;
+
+ nbd_coroutine_start(s, &request);
+ ret = nbd_co_send_request(s, &request, qiov, offset);
+ if (ret < 0) {
+ reply.error = -ret;
+ } else {
+ nbd_co_receive_reply(s, &request, &reply, NULL, 0);
+ }
+ nbd_coroutine_end(s, &request);
+ return -reply.error;
+}
+
+/* qemu-nbd has a limit of slightly less than 1M per request. Try to
+ * remain aligned to 4K. */
+#define NBD_MAX_SECTORS 2040
+
+static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ int offset = 0;
+ int ret;
+ while (nb_sectors > NBD_MAX_SECTORS) {
+ ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset);
+ if (ret < 0) {
+ return ret;
+ }
+ offset += NBD_MAX_SECTORS * 512;
+ sector_num += NBD_MAX_SECTORS;
+ nb_sectors -= NBD_MAX_SECTORS;
+ }
+ return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset);
+}
+
+static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ int offset = 0;
+ int ret;
+ while (nb_sectors > NBD_MAX_SECTORS) {
+ ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset);
+ if (ret < 0) {
+ return ret;
+ }
+ offset += NBD_MAX_SECTORS * 512;
+ sector_num += NBD_MAX_SECTORS;
+ nb_sectors -= NBD_MAX_SECTORS;
+ }
+ return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset);
+}
+
+static int nbd_co_flush(BlockDriverState *bs)
+{
+ BDRVNBDState *s = bs->opaque;
+ struct nbd_request request;
+ struct nbd_reply reply;
+ ssize_t ret;
+
+ if (!(s->nbdflags & NBD_FLAG_SEND_FLUSH)) {
+ return 0;
+ }
+
+ request.type = NBD_CMD_FLUSH;
+ if (s->nbdflags & NBD_FLAG_SEND_FUA) {
+ request.type |= NBD_CMD_FLAG_FUA;
+ }
+
+ request.from = 0;
+ request.len = 0;
+
+ nbd_coroutine_start(s, &request);
+ ret = nbd_co_send_request(s, &request, NULL, 0);
+ if (ret < 0) {
+ reply.error = -ret;
+ } else {
+ nbd_co_receive_reply(s, &request, &reply, NULL, 0);
+ }
+ nbd_coroutine_end(s, &request);
+ return -reply.error;
+}
+
+static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors)
+{
+ BDRVNBDState *s = bs->opaque;
+ struct nbd_request request;
+ struct nbd_reply reply;
+ ssize_t ret;
+
+ if (!(s->nbdflags & NBD_FLAG_SEND_TRIM)) {
+ return 0;
+ }
+ request.type = NBD_CMD_TRIM;
+ request.from = sector_num * 512;
+ request.len = nb_sectors * 512;
+
+ nbd_coroutine_start(s, &request);
+ ret = nbd_co_send_request(s, &request, NULL, 0);
+ if (ret < 0) {
+ reply.error = -ret;
+ } else {
+ nbd_co_receive_reply(s, &request, &reply, NULL, 0);
+ }
+ nbd_coroutine_end(s, &request);
+ return -reply.error;
+}
+
+static void nbd_close(BlockDriverState *bs)
+{
+ BDRVNBDState *s = bs->opaque;
+ g_free(s->export_name);
+ g_free(s->host_spec);
+
+ nbd_teardown_connection(bs);
+}
+
+static int64_t nbd_getlength(BlockDriverState *bs)
+{
+ BDRVNBDState *s = bs->opaque;
+
+ return s->size;
+}
+
+static BlockDriver bdrv_nbd = {
+ .format_name = "nbd",
+ .instance_size = sizeof(BDRVNBDState),
+ .bdrv_file_open = nbd_open,
+ .bdrv_co_readv = nbd_co_readv,
+ .bdrv_co_writev = nbd_co_writev,
+ .bdrv_close = nbd_close,
+ .bdrv_co_flush_to_os = nbd_co_flush,
+ .bdrv_co_discard = nbd_co_discard,
+ .bdrv_getlength = nbd_getlength,
+ .protocol_name = "nbd",
+};
+
+static void bdrv_nbd_init(void)
+{
+ bdrv_register(&bdrv_nbd);
+}
+
+block_init(bdrv_nbd_init);
diff --git a/block/parallels.c b/block/parallels.c
new file mode 100644
index 000000000..d30f0ecf7
--- /dev/null
+++ b/block/parallels.c
@@ -0,0 +1,170 @@
+/*
+ * Block driver for Parallels disk image format
+ *
+ * Copyright (c) 2007 Alex Beregszaszi
+ *
+ * This code is based on comparing different disk images created by Parallels.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+/**************************************************************/
+
+#define HEADER_MAGIC "WithoutFreeSpace"
+#define HEADER_VERSION 2
+#define HEADER_SIZE 64
+
+// always little-endian
+struct parallels_header {
+ char magic[16]; // "WithoutFreeSpace"
+ uint32_t version;
+ uint32_t heads;
+ uint32_t cylinders;
+ uint32_t tracks;
+ uint32_t catalog_entries;
+ uint32_t nb_sectors;
+ char padding[24];
+} QEMU_PACKED;
+
+typedef struct BDRVParallelsState {
+ CoMutex lock;
+
+ uint32_t *catalog_bitmap;
+ int catalog_size;
+
+ int tracks;
+} BDRVParallelsState;
+
+static int parallels_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+ const struct parallels_header *ph = (const void *)buf;
+
+ if (buf_size < HEADER_SIZE)
+ return 0;
+
+ if (!memcmp(ph->magic, HEADER_MAGIC, 16) &&
+ (le32_to_cpu(ph->version) == HEADER_VERSION))
+ return 100;
+
+ return 0;
+}
+
+static int parallels_open(BlockDriverState *bs, int flags)
+{
+ BDRVParallelsState *s = bs->opaque;
+ int i;
+ struct parallels_header ph;
+
+ bs->read_only = 1; // no write support yet
+
+ if (bdrv_pread(bs->file, 0, &ph, sizeof(ph)) != sizeof(ph))
+ goto fail;
+
+ if (memcmp(ph.magic, HEADER_MAGIC, 16) ||
+ (le32_to_cpu(ph.version) != HEADER_VERSION)) {
+ goto fail;
+ }
+
+ bs->total_sectors = le32_to_cpu(ph.nb_sectors);
+
+ s->tracks = le32_to_cpu(ph.tracks);
+
+ s->catalog_size = le32_to_cpu(ph.catalog_entries);
+ s->catalog_bitmap = g_malloc(s->catalog_size * 4);
+ if (bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4) !=
+ s->catalog_size * 4)
+ goto fail;
+ for (i = 0; i < s->catalog_size; i++)
+ le32_to_cpus(&s->catalog_bitmap[i]);
+
+ qemu_co_mutex_init(&s->lock);
+ return 0;
+fail:
+ if (s->catalog_bitmap)
+ g_free(s->catalog_bitmap);
+ return -1;
+}
+
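+/* Translate a guest sector into a byte offset in the image via the catalog;
+ * returns -1 for sectors in unallocated tracks, which read back as zeroes. */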
+static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
+{
+ BDRVParallelsState *s = bs->opaque;
+ uint32_t index, offset;
+
+ index = sector_num / s->tracks;
+ offset = sector_num % s->tracks;
+
+ /* not allocated */
+ if ((index > s->catalog_size) || (s->catalog_bitmap[index] == 0))
+ return -1;
+ return (uint64_t)(s->catalog_bitmap[index] + offset) * 512;
+}
+
+static int parallels_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ while (nb_sectors > 0) {
+ int64_t position = seek_to_sector(bs, sector_num);
+ if (position >= 0) {
+ if (bdrv_pread(bs->file, position, buf, 512) != 512)
+ return -1;
+ } else {
+ memset(buf, 0, 512);
+ }
+ nb_sectors--;
+ sector_num++;
+ buf += 512;
+ }
+ return 0;
+}
+
+static coroutine_fn int parallels_co_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ int ret;
+ BDRVParallelsState *s = bs->opaque;
+ qemu_co_mutex_lock(&s->lock);
+ ret = parallels_read(bs, sector_num, buf, nb_sectors);
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+}
+
+static void parallels_close(BlockDriverState *bs)
+{
+ BDRVParallelsState *s = bs->opaque;
+ g_free(s->catalog_bitmap);
+}
+
+static BlockDriver bdrv_parallels = {
+ .format_name = "parallels",
+ .instance_size = sizeof(BDRVParallelsState),
+ .bdrv_probe = parallels_probe,
+ .bdrv_open = parallels_open,
+ .bdrv_read = parallels_co_read,
+ .bdrv_close = parallels_close,
+};
+
+static void bdrv_parallels_init(void)
+{
+ bdrv_register(&bdrv_parallels);
+}
+
+block_init(bdrv_parallels_init);
diff --git a/block/qcow.c b/block/qcow.c
new file mode 100644
index 000000000..7b5ab87d2
--- /dev/null
+++ b/block/qcow.c
@@ -0,0 +1,890 @@
+/*
+ * Block driver for the QCOW format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include <zlib.h>
+#include "aes.h"
+#include "migration.h"
+
+/**************************************************************/
+/* QEMU COW block driver with compression and encryption support */
+
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+#define QCOW_VERSION 1
+
+#define QCOW_CRYPT_NONE 0
+#define QCOW_CRYPT_AES 1
+
+#define QCOW_OFLAG_COMPRESSED (1LL << 63)
+
+typedef struct QCowHeader {
+ uint32_t magic;
+ uint32_t version;
+ uint64_t backing_file_offset;
+ uint32_t backing_file_size;
+ uint32_t mtime;
+ uint64_t size; /* in bytes */
+ uint8_t cluster_bits;
+ uint8_t l2_bits;
+ uint32_t crypt_method;
+ uint64_t l1_table_offset;
+} QCowHeader;
+
+#define L2_CACHE_SIZE 16
+
+typedef struct BDRVQcowState {
+ int cluster_bits;
+ int cluster_size;
+ int cluster_sectors;
+ int l2_bits;
+ int l2_size;
+ int l1_size;
+ uint64_t cluster_offset_mask;
+ uint64_t l1_table_offset;
+ uint64_t *l1_table;
+ uint64_t *l2_cache;
+ uint64_t l2_cache_offsets[L2_CACHE_SIZE];
+ uint32_t l2_cache_counts[L2_CACHE_SIZE];
+ uint8_t *cluster_cache;
+ uint8_t *cluster_data;
+ uint64_t cluster_cache_offset;
+ uint32_t crypt_method; /* current crypt method, 0 if no key yet */
+ uint32_t crypt_method_header;
+ AES_KEY aes_encrypt_key;
+ AES_KEY aes_decrypt_key;
+ CoMutex lock;
+ Error *migration_blocker;
+} BDRVQcowState;
+
+static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
+
+static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+ const QCowHeader *cow_header = (const void *)buf;
+
+ if (buf_size >= sizeof(QCowHeader) &&
+ be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
+ be32_to_cpu(cow_header->version) == QCOW_VERSION)
+ return 100;
+ else
+ return 0;
+}
+
+static int qcow_open(BlockDriverState *bs, int flags)
+{
+ BDRVQcowState *s = bs->opaque;
+ int len, i, shift, ret;
+ QCowHeader header;
+
+ ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
+ if (ret < 0) {
+ goto fail;
+ }
+ be32_to_cpus(&header.magic);
+ be32_to_cpus(&header.version);
+ be64_to_cpus(&header.backing_file_offset);
+ be32_to_cpus(&header.backing_file_size);
+ be32_to_cpus(&header.mtime);
+ be64_to_cpus(&header.size);
+ be32_to_cpus(&header.crypt_method);
+ be64_to_cpus(&header.l1_table_offset);
+
+ if (header.magic != QCOW_MAGIC) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ if (header.version != QCOW_VERSION) {
+ char version[64];
+ snprintf(version, sizeof(version), "QCOW version %d", header.version);
+ qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+ bs->device_name, "qcow", version);
+ ret = -ENOTSUP;
+ goto fail;
+ }
+
+ if (header.size <= 1 || header.cluster_bits < 9) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ if (header.crypt_method > QCOW_CRYPT_AES) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ s->crypt_method_header = header.crypt_method;
+ if (s->crypt_method_header) {
+ bs->encrypted = 1;
+ }
+ s->cluster_bits = header.cluster_bits;
+ s->cluster_size = 1 << s->cluster_bits;
+ s->cluster_sectors = 1 << (s->cluster_bits - 9);
+ s->l2_bits = header.l2_bits;
+ s->l2_size = 1 << s->l2_bits;
+ bs->total_sectors = header.size / 512;
+ s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
+
+ /* read the level 1 table */
+ shift = s->cluster_bits + s->l2_bits;
+ s->l1_size = (header.size + (1LL << shift) - 1) >> shift;
+
+ s->l1_table_offset = header.l1_table_offset;
+ s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
+
+ ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
+ s->l1_size * sizeof(uint64_t));
+ if (ret < 0) {
+ goto fail;
+ }
+
+ for(i = 0;i < s->l1_size; i++) {
+ be64_to_cpus(&s->l1_table[i]);
+ }
+ /* alloc L2 cache */
+ s->l2_cache = g_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+ s->cluster_cache = g_malloc(s->cluster_size);
+ s->cluster_data = g_malloc(s->cluster_size);
+ s->cluster_cache_offset = -1;
+
+ /* read the backing file name */
+ if (header.backing_file_offset != 0) {
+ len = header.backing_file_size;
+ if (len > 1023) {
+ len = 1023;
+ }
+ ret = bdrv_pread(bs->file, header.backing_file_offset,
+ bs->backing_file, len);
+ if (ret < 0) {
+ goto fail;
+ }
+ bs->backing_file[len] = '\0';
+ }
+
+ /* Disable migration when qcow images are used */
+ error_set(&s->migration_blocker,
+ QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
+ "qcow", bs->device_name, "live migration");
+ migrate_add_blocker(s->migration_blocker);
+
+ qemu_co_mutex_init(&s->lock);
+ return 0;
+
+ fail:
+ g_free(s->l1_table);
+ g_free(s->l2_cache);
+ g_free(s->cluster_cache);
+ g_free(s->cluster_data);
+ return ret;
+}
+
+static int qcow_set_key(BlockDriverState *bs, const char *key)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint8_t keybuf[16];
+ int len, i;
+
+ memset(keybuf, 0, 16);
+ len = strlen(key);
+ if (len > 16)
+ len = 16;
+ /* XXX: we could compress the chars to 7 bits to increase
+ entropy */
+ for(i = 0;i < len;i++) {
+ keybuf[i] = key[i];
+ }
+ s->crypt_method = s->crypt_method_header;
+
+ if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
+ return -1;
+ if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
+ return -1;
+ return 0;
+}
+
+/* The crypt function is compatible with the linux cryptoloop
+ algorithm for < 4 GB images. NOTE: out_buf == in_buf is
+ supported */
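+/*
+ * Illustration of the IV scheme used below: each 512-byte sector gets an
+ * independent AES-CBC pass whose IV is simply the sector number.  For
+ * example, sector 5 is processed with the 16-byte IV
+ * 05 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (the 64-bit little-endian
+ * sector number followed by eight zero bytes), which is what keeps the
+ * format interoperable with cryptoloop for images below 4 GB.
+ */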
+static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+ uint8_t *out_buf, const uint8_t *in_buf,
+ int nb_sectors, int enc,
+ const AES_KEY *key)
+{
+ union {
+ uint64_t ll[2];
+ uint8_t b[16];
+ } ivec;
+ int i;
+
+ for(i = 0; i < nb_sectors; i++) {
+ ivec.ll[0] = cpu_to_le64(sector_num);
+ ivec.ll[1] = 0;
+ AES_cbc_encrypt(in_buf, out_buf, 512, key,
+ ivec.b, enc);
+ sector_num++;
+ in_buf += 512;
+ out_buf += 512;
+ }
+}
+
+/* 'allocate' is:
+ *
+ * 0 to not allocate.
+ *
+ * 1 to allocate a normal cluster (for sector indexes 'n_start' to
+ * 'n_end')
+ *
+ * 2 to allocate a compressed cluster of size
+ * 'compressed_size'. 'compressed_size' must be > 0 and <
+ * cluster_size
+ *
+ * return 0 if not allocated.
+ */
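+/*
+ * For example (see the callers further down in this file): the read and
+ * is_allocated paths call get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0)
+ * to merely look a cluster up, qcow_co_writev() passes allocate == 1 with the
+ * sector range that is about to be written, and qcow_write_compressed()
+ * passes allocate == 2 together with the deflated size of the cluster.
+ */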
+static uint64_t get_cluster_offset(BlockDriverState *bs,
+ uint64_t offset, int allocate,
+ int compressed_size,
+ int n_start, int n_end)
+{
+ BDRVQcowState *s = bs->opaque;
+ int min_index, i, j, l1_index, l2_index;
+ uint64_t l2_offset, *l2_table, cluster_offset, tmp;
+ uint32_t min_count;
+ int new_l2_table;
+
+ l1_index = offset >> (s->l2_bits + s->cluster_bits);
+ l2_offset = s->l1_table[l1_index];
+ new_l2_table = 0;
+ if (!l2_offset) {
+ if (!allocate)
+ return 0;
+ /* allocate a new l2 entry */
+ l2_offset = bdrv_getlength(bs->file);
+ /* round to cluster size */
+ l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1);
+ /* update the L1 entry */
+ s->l1_table[l1_index] = l2_offset;
+ tmp = cpu_to_be64(l2_offset);
+ if (bdrv_pwrite_sync(bs->file,
+ s->l1_table_offset + l1_index * sizeof(tmp),
+ &tmp, sizeof(tmp)) < 0)
+ return 0;
+ new_l2_table = 1;
+ }
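+ /* Look the L2 table up in the small per-image cache: a hit bumps the
+ * slot's usage counter (halving all counters when one saturates), while
+ * a miss below evicts the least used slot and reads or initializes the
+ * table there. */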
+ for(i = 0; i < L2_CACHE_SIZE; i++) {
+ if (l2_offset == s->l2_cache_offsets[i]) {
+ /* increment the hit count */
+ if (++s->l2_cache_counts[i] == 0xffffffff) {
+ for(j = 0; j < L2_CACHE_SIZE; j++) {
+ s->l2_cache_counts[j] >>= 1;
+ }
+ }
+ l2_table = s->l2_cache + (i << s->l2_bits);
+ goto found;
+ }
+ }
+ /* not found: load a new entry in the least used one */
+ min_index = 0;
+ min_count = 0xffffffff;
+ for(i = 0; i < L2_CACHE_SIZE; i++) {
+ if (s->l2_cache_counts[i] < min_count) {
+ min_count = s->l2_cache_counts[i];
+ min_index = i;
+ }
+ }
+ l2_table = s->l2_cache + (min_index << s->l2_bits);
+ if (new_l2_table) {
+ memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+ if (bdrv_pwrite_sync(bs->file, l2_offset, l2_table,
+ s->l2_size * sizeof(uint64_t)) < 0)
+ return 0;
+ } else {
+ if (bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
+ s->l2_size * sizeof(uint64_t))
+ return 0;
+ }
+ s->l2_cache_offsets[min_index] = l2_offset;
+ s->l2_cache_counts[min_index] = 1;
+ found:
+ l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+ cluster_offset = be64_to_cpu(l2_table[l2_index]);
+ if (!cluster_offset ||
+ ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
+ if (!allocate)
+ return 0;
+ /* allocate a new cluster */
+ if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
+ (n_end - n_start) < s->cluster_sectors) {
+ /* if the cluster is already compressed, we must
+ decompress it in case it is not completely
+ overwritten */
+ if (decompress_cluster(bs, cluster_offset) < 0)
+ return 0;
+ cluster_offset = bdrv_getlength(bs->file);
+ cluster_offset = (cluster_offset + s->cluster_size - 1) &
+ ~(s->cluster_size - 1);
+ /* write the cluster content */
+ if (bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache, s->cluster_size) !=
+ s->cluster_size)
+ return -1;
+ } else {
+ cluster_offset = bdrv_getlength(bs->file);
+ if (allocate == 1) {
+ /* round to cluster size */
+ cluster_offset = (cluster_offset + s->cluster_size - 1) &
+ ~(s->cluster_size - 1);
+ bdrv_truncate(bs->file, cluster_offset + s->cluster_size);
+ /* if encrypted, we must initialize the cluster
+ content which won't be written */
+ if (s->crypt_method &&
+ (n_end - n_start) < s->cluster_sectors) {
+ uint64_t start_sect;
+ start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
+ memset(s->cluster_data + 512, 0x00, 512);
+ for(i = 0; i < s->cluster_sectors; i++) {
+ if (i < n_start || i >= n_end) {
+ encrypt_sectors(s, start_sect + i,
+ s->cluster_data,
+ s->cluster_data + 512, 1, 1,
+ &s->aes_encrypt_key);
+ if (bdrv_pwrite(bs->file, cluster_offset + i * 512,
+ s->cluster_data, 512) != 512)
+ return -1;
+ }
+ }
+ }
+ } else if (allocate == 2) {
+ cluster_offset |= QCOW_OFLAG_COMPRESSED |
+ (uint64_t)compressed_size << (63 - s->cluster_bits);
+ }
+ }
+ /* update L2 table */
+ tmp = cpu_to_be64(cluster_offset);
+ l2_table[l2_index] = tmp;
+ if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp),
+ &tmp, sizeof(tmp)) < 0)
+ return 0;
+ }
+ return cluster_offset;
+}
+
+static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, int *pnum)
+{
+ BDRVQcowState *s = bs->opaque;
+ int index_in_cluster, n;
+ uint64_t cluster_offset;
+
+ qemu_co_mutex_lock(&s->lock);
+ cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
+ qemu_co_mutex_unlock(&s->lock);
+ index_in_cluster = sector_num & (s->cluster_sectors - 1);
+ n = s->cluster_sectors - index_in_cluster;
+ if (n > nb_sectors)
+ n = nb_sectors;
+ *pnum = n;
+ return (cluster_offset != 0);
+}
+
+static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+ const uint8_t *buf, int buf_size)
+{
+ z_stream strm1, *strm = &strm1;
+ int ret, out_len;
+
+ memset(strm, 0, sizeof(*strm));
+
+ strm->next_in = (uint8_t *)buf;
+ strm->avail_in = buf_size;
+ strm->next_out = out_buf;
+ strm->avail_out = out_buf_size;
+
+ ret = inflateInit2(strm, -12);
+ if (ret != Z_OK)
+ return -1;
+ ret = inflate(strm, Z_FINISH);
+ out_len = strm->next_out - out_buf;
+ if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
+ out_len != out_buf_size) {
+ inflateEnd(strm);
+ return -1;
+ }
+ inflateEnd(strm);
+ return 0;
+}
+
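+/*
+ * Layout of a compressed cluster descriptor, as produced by the
+ * allocate == 2 case in get_cluster_offset() above: bit 63 is
+ * QCOW_OFLAG_COMPRESSED, the next cluster_bits bits hold the compressed
+ * size, and the remaining low bits (covered by s->cluster_offset_mask)
+ * hold the byte offset of the compressed data in the image file.
+ * decompress_cluster() below unpacks the descriptor again and caches the
+ * inflated cluster in s->cluster_cache.
+ */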
+static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret, csize;
+ uint64_t coffset;
+
+ coffset = cluster_offset & s->cluster_offset_mask;
+ if (s->cluster_cache_offset != coffset) {
+ csize = cluster_offset >> (63 - s->cluster_bits);
+ csize &= (s->cluster_size - 1);
+ ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize);
+ if (ret != csize)
+ return -1;
+ if (decompress_buffer(s->cluster_cache, s->cluster_size,
+ s->cluster_data, csize) < 0) {
+ return -1;
+ }
+ s->cluster_cache_offset = coffset;
+ }
+ return 0;
+}
+
+static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ BDRVQcowState *s = bs->opaque;
+ int index_in_cluster;
+ int ret = 0, n;
+ uint64_t cluster_offset;
+ struct iovec hd_iov;
+ QEMUIOVector hd_qiov;
+ uint8_t *buf;
+ void *orig_buf;
+
+ if (qiov->niov > 1) {
+ buf = orig_buf = qemu_blockalign(bs, qiov->size);
+ } else {
+ orig_buf = NULL;
+ buf = (uint8_t *)qiov->iov->iov_base;
+ }
+
+ qemu_co_mutex_lock(&s->lock);
+
+ while (nb_sectors != 0) {
+ /* prepare next request */
+ cluster_offset = get_cluster_offset(bs, sector_num << 9,
+ 0, 0, 0, 0);
+ index_in_cluster = sector_num & (s->cluster_sectors - 1);
+ n = s->cluster_sectors - index_in_cluster;
+ if (n > nb_sectors) {
+ n = nb_sectors;
+ }
+
+ if (!cluster_offset) {
+ if (bs->backing_hd) {
+ /* read from the base image */
+ hd_iov.iov_base = (void *)buf;
+ hd_iov.iov_len = n * 512;
+ qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
+ qemu_co_mutex_unlock(&s->lock);
+ ret = bdrv_co_readv(bs->backing_hd, sector_num,
+ n, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ goto fail;
+ }
+ } else {
+ /* Note: in this case, no need to wait */
+ memset(buf, 0, 512 * n);
+ }
+ } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
+ /* add AIO support for compressed blocks ? */
+ if (decompress_cluster(bs, cluster_offset) < 0) {
+ goto fail;
+ }
+ memcpy(buf,
+ s->cluster_cache + index_in_cluster * 512, 512 * n);
+ } else {
+ if ((cluster_offset & 511) != 0) {
+ goto fail;
+ }
+ hd_iov.iov_base = (void *)buf;
+ hd_iov.iov_len = n * 512;
+ qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
+ qemu_co_mutex_unlock(&s->lock);
+ ret = bdrv_co_readv(bs->file,
+ (cluster_offset >> 9) + index_in_cluster,
+ n, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ break;
+ }
+ if (s->crypt_method) {
+ encrypt_sectors(s, sector_num, buf, buf,
+ n, 0,
+ &s->aes_decrypt_key);
+ }
+ }
+ ret = 0;
+
+ nb_sectors -= n;
+ sector_num += n;
+ buf += n * 512;
+ }
+
+done:
+ qemu_co_mutex_unlock(&s->lock);
+
+ if (qiov->niov > 1) {
+ qemu_iovec_from_buf(qiov, 0, orig_buf, qiov->size);
+ qemu_vfree(orig_buf);
+ }
+
+ return ret;
+
+fail:
+ ret = -EIO;
+ goto done;
+}
+
+static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ BDRVQcowState *s = bs->opaque;
+ int index_in_cluster;
+ uint64_t cluster_offset;
+ const uint8_t *src_buf;
+ int ret = 0, n;
+ uint8_t *cluster_data = NULL;
+ struct iovec hd_iov;
+ QEMUIOVector hd_qiov;
+ uint8_t *buf;
+ void *orig_buf;
+
+ s->cluster_cache_offset = -1; /* disable compressed cache */
+
+ if (qiov->niov > 1) {
+ buf = orig_buf = qemu_blockalign(bs, qiov->size);
+ qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
+ } else {
+ orig_buf = NULL;
+ buf = (uint8_t *)qiov->iov->iov_base;
+ }
+
+ qemu_co_mutex_lock(&s->lock);
+
+ while (nb_sectors != 0) {
+
+ index_in_cluster = sector_num & (s->cluster_sectors - 1);
+ n = s->cluster_sectors - index_in_cluster;
+ if (n > nb_sectors) {
+ n = nb_sectors;
+ }
+ cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0,
+ index_in_cluster,
+ index_in_cluster + n);
+ if (!cluster_offset || (cluster_offset & 511) != 0) {
+ ret = -EIO;
+ break;
+ }
+ if (s->crypt_method) {
+ if (!cluster_data) {
+ cluster_data = g_malloc0(s->cluster_size);
+ }
+ encrypt_sectors(s, sector_num, cluster_data, buf,
+ n, 1, &s->aes_encrypt_key);
+ src_buf = cluster_data;
+ } else {
+ src_buf = buf;
+ }
+
+ hd_iov.iov_base = (void *)src_buf;
+ hd_iov.iov_len = n * 512;
+ qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
+ qemu_co_mutex_unlock(&s->lock);
+ ret = bdrv_co_writev(bs->file,
+ (cluster_offset >> 9) + index_in_cluster,
+ n, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ break;
+ }
+ ret = 0;
+
+ nb_sectors -= n;
+ sector_num += n;
+ buf += n * 512;
+ }
+ qemu_co_mutex_unlock(&s->lock);
+
+ if (qiov->niov > 1) {
+ qemu_vfree(orig_buf);
+ }
+ g_free(cluster_data);
+
+ return ret;
+}
+
+static void qcow_close(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+
+ g_free(s->l1_table);
+ g_free(s->l2_cache);
+ g_free(s->cluster_cache);
+ g_free(s->cluster_data);
+
+ migrate_del_blocker(s->migration_blocker);
+ error_free(s->migration_blocker);
+}
+
+static int qcow_create(const char *filename, QEMUOptionParameter *options)
+{
+ int header_size, backing_filename_len, l1_size, shift, i;
+ QCowHeader header;
+ uint8_t *tmp;
+ int64_t total_size = 0;
+ const char *backing_file = NULL;
+ int flags = 0;
+ int ret;
+ BlockDriverState *qcow_bs;
+
+ /* Read out options */
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ total_size = options->value.n / 512;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+ backing_file = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) {
+ flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
+ }
+ options++;
+ }
+
+ ret = bdrv_create_file(filename, options);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = bdrv_file_open(&qcow_bs, filename, BDRV_O_RDWR);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = bdrv_truncate(qcow_bs, 0);
+ if (ret < 0) {
+ goto exit;
+ }
+
+ memset(&header, 0, sizeof(header));
+ header.magic = cpu_to_be32(QCOW_MAGIC);
+ header.version = cpu_to_be32(QCOW_VERSION);
+ header.size = cpu_to_be64(total_size * 512);
+ header_size = sizeof(header);
+ backing_filename_len = 0;
+ if (backing_file) {
+ if (strcmp(backing_file, "fat:")) {
+ header.backing_file_offset = cpu_to_be64(header_size);
+ backing_filename_len = strlen(backing_file);
+ header.backing_file_size = cpu_to_be32(backing_filename_len);
+ header_size += backing_filename_len;
+ } else {
+ /* special backing file for vvfat */
+ backing_file = NULL;
+ }
+ header.cluster_bits = 9; /* 512 byte cluster to avoid copying
+ unmodified sectors */
+ header.l2_bits = 12; /* 32 KB L2 tables */
+ } else {
+ header.cluster_bits = 12; /* 4 KB clusters */
+ header.l2_bits = 9; /* 4 KB L2 tables */
+ }
+ header_size = (header_size + 7) & ~7;
+ shift = header.cluster_bits + header.l2_bits;
+ l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift;
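+ /* For illustration: with the default 4 KB clusters and 512-entry (4 KB)
+ * L2 tables, shift is 21, so each L1 entry covers 2 MB of virtual disk
+ * and a 1 GB image needs an L1 table of 512 entries. */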
+
+ header.l1_table_offset = cpu_to_be64(header_size);
+ if (flags & BLOCK_FLAG_ENCRYPT) {
+ header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
+ } else {
+ header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+ }
+
+ /* write all the data */
+ ret = bdrv_pwrite(qcow_bs, 0, &header, sizeof(header));
+ if (ret != sizeof(header)) {
+ goto exit;
+ }
+
+ if (backing_file) {
+ ret = bdrv_pwrite(qcow_bs, sizeof(header),
+ backing_file, backing_filename_len);
+ if (ret != backing_filename_len) {
+ goto exit;
+ }
+ }
+
+ tmp = g_malloc0(BDRV_SECTOR_SIZE);
+ for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/
+ BDRV_SECTOR_SIZE); i++) {
+ ret = bdrv_pwrite(qcow_bs, header_size +
+ BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE);
+ if (ret != BDRV_SECTOR_SIZE) {
+ g_free(tmp);
+ goto exit;
+ }
+ }
+
+ g_free(tmp);
+ ret = 0;
+exit:
+ bdrv_delete(qcow_bs);
+ return ret;
+}
+
+static int qcow_make_empty(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint32_t l1_length = s->l1_size * sizeof(uint64_t);
+ int ret;
+
+ memset(s->l1_table, 0, l1_length);
+ if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table,
+ l1_length) < 0)
+ return -1;
+ ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
+ if (ret < 0)
+ return ret;
+
+ memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+ memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
+ memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
+
+ return 0;
+}
+
+/* XXX: put compressed sectors first, then all the cluster aligned
+ tables to avoid losing bytes in alignment */
+static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ BDRVQcowState *s = bs->opaque;
+ z_stream strm;
+ int ret, out_len;
+ uint8_t *out_buf;
+ uint64_t cluster_offset;
+
+ if (nb_sectors != s->cluster_sectors)
+ return -EINVAL;
+
+ out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+
+ /* default compression level, small window, no zlib header */
+ memset(&strm, 0, sizeof(strm));
+ ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+ Z_DEFLATED, -12,
+ 9, Z_DEFAULT_STRATEGY);
+ if (ret != 0) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ strm.avail_in = s->cluster_size;
+ strm.next_in = (uint8_t *)buf;
+ strm.avail_out = s->cluster_size;
+ strm.next_out = out_buf;
+
+ ret = deflate(&strm, Z_FINISH);
+ if (ret != Z_STREAM_END && ret != Z_OK) {
+ deflateEnd(&strm);
+ ret = -EINVAL;
+ goto fail;
+ }
+ out_len = strm.next_out - out_buf;
+
+ deflateEnd(&strm);
+
+ if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
+ /* could not compress: write normal cluster */
+ ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
+ if (ret < 0) {
+ goto fail;
+ }
+ } else {
+ cluster_offset = get_cluster_offset(bs, sector_num << 9, 2,
+ out_len, 0, 0);
+ if (cluster_offset == 0) {
+ ret = -EIO;
+ goto fail;
+ }
+
+ cluster_offset &= s->cluster_offset_mask;
+ ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ ret = 0;
+fail:
+ g_free(out_buf);
+ return ret;
+}
+
+static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+ BDRVQcowState *s = bs->opaque;
+ bdi->cluster_size = s->cluster_size;
+ return 0;
+}
+
+
+static QEMUOptionParameter qcow_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ {
+ .name = BLOCK_OPT_BACKING_FILE,
+ .type = OPT_STRING,
+ .help = "File name of a base image"
+ },
+ {
+ .name = BLOCK_OPT_ENCRYPT,
+ .type = OPT_FLAG,
+ .help = "Encrypt the image"
+ },
+ { NULL }
+};
+
+static BlockDriver bdrv_qcow = {
+ .format_name = "qcow",
+ .instance_size = sizeof(BDRVQcowState),
+ .bdrv_probe = qcow_probe,
+ .bdrv_open = qcow_open,
+ .bdrv_close = qcow_close,
+ .bdrv_create = qcow_create,
+
+ .bdrv_co_readv = qcow_co_readv,
+ .bdrv_co_writev = qcow_co_writev,
+ .bdrv_co_is_allocated = qcow_co_is_allocated,
+
+ .bdrv_set_key = qcow_set_key,
+ .bdrv_make_empty = qcow_make_empty,
+ .bdrv_write_compressed = qcow_write_compressed,
+ .bdrv_get_info = qcow_get_info,
+
+ .create_options = qcow_create_options,
+};
+
+static void bdrv_qcow_init(void)
+{
+ bdrv_register(&bdrv_qcow);
+}
+
+block_init(bdrv_qcow_init);
diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
new file mode 100644
index 000000000..2d4322a8d
--- /dev/null
+++ b/block/qcow2-cache.c
@@ -0,0 +1,323 @@
+/*
+ * L2/refcount table cache for the QCOW2 format
+ *
+ * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "block_int.h"
+#include "qemu-common.h"
+#include "qcow2.h"
+#include "trace.h"
+
+typedef struct Qcow2CachedTable {
+ void* table;
+ int64_t offset;
+ bool dirty;
+ int cache_hits;
+ int ref;
+} Qcow2CachedTable;
+
+struct Qcow2Cache {
+ Qcow2CachedTable* entries;
+ struct Qcow2Cache* depends;
+ int size;
+ bool depends_on_flush;
+};
+
+Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
+{
+ BDRVQcowState *s = bs->opaque;
+ Qcow2Cache *c;
+ int i;
+
+ c = g_malloc0(sizeof(*c));
+ c->size = num_tables;
+ c->entries = g_malloc0(sizeof(*c->entries) * num_tables);
+
+ for (i = 0; i < c->size; i++) {
+ c->entries[i].table = qemu_blockalign(bs, s->cluster_size);
+ }
+
+ return c;
+}
+
+int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c)
+{
+ int i;
+
+ for (i = 0; i < c->size; i++) {
+ assert(c->entries[i].ref == 0);
+ qemu_vfree(c->entries[i].table);
+ }
+
+ g_free(c->entries);
+ g_free(c);
+
+ return 0;
+}
+
+static int qcow2_cache_flush_dependency(BlockDriverState *bs, Qcow2Cache *c)
+{
+ int ret;
+
+ ret = qcow2_cache_flush(bs, c->depends);
+ if (ret < 0) {
+ return ret;
+ }
+
+ c->depends = NULL;
+ c->depends_on_flush = false;
+
+ return 0;
+}
+
+static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret = 0;
+
+ if (!c->entries[i].dirty || !c->entries[i].offset) {
+ return 0;
+ }
+
+ trace_qcow2_cache_entry_flush(qemu_coroutine_self(),
+ c == s->l2_table_cache, i);
+
+ if (c->depends) {
+ ret = qcow2_cache_flush_dependency(bs, c);
+ } else if (c->depends_on_flush) {
+ ret = bdrv_flush(bs->file);
+ if (ret >= 0) {
+ c->depends_on_flush = false;
+ }
+ }
+
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (c == s->refcount_block_cache) {
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART);
+ } else if (c == s->l2_table_cache) {
+ BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE);
+ }
+
+ ret = bdrv_pwrite(bs->file, c->entries[i].offset, c->entries[i].table,
+ s->cluster_size);
+ if (ret < 0) {
+ return ret;
+ }
+
+ c->entries[i].dirty = false;
+
+ return 0;
+}
+
+int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c)
+{
+ BDRVQcowState *s = bs->opaque;
+ int result = 0;
+ int ret;
+ int i;
+
+ trace_qcow2_cache_flush(qemu_coroutine_self(), c == s->l2_table_cache);
+
+ for (i = 0; i < c->size; i++) {
+ ret = qcow2_cache_entry_flush(bs, c, i);
+ if (ret < 0 && result != -ENOSPC) {
+ result = ret;
+ }
+ }
+
+ if (result == 0) {
+ ret = bdrv_flush(bs->file);
+ if (ret < 0) {
+ result = ret;
+ }
+ }
+
+ return result;
+}
+
+int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
+ Qcow2Cache *dependency)
+{
+ int ret;
+
+ if (dependency->depends) {
+ ret = qcow2_cache_flush_dependency(bs, dependency);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ if (c->depends && (c->depends != dependency)) {
+ ret = qcow2_cache_flush_dependency(bs, c);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ c->depends = dependency;
+ return 0;
+}
+
+void qcow2_cache_depends_on_flush(Qcow2Cache *c)
+{
+ c->depends_on_flush = true;
+}
+
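+/*
+ * Pick a victim entry for replacement: among the entries that are not
+ * currently referenced, choose the one with the fewest cache hits.  Each
+ * scan also halves every candidate's hit counter, so old hits age out and
+ * recently used tables are preferred over tables that were popular long ago.
+ */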
+static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c)
+{
+ int i;
+ int min_count = INT_MAX;
+ int min_index = -1;
+
+
+ for (i = 0; i < c->size; i++) {
+ if (c->entries[i].ref) {
+ continue;
+ }
+
+ if (c->entries[i].cache_hits < min_count) {
+ min_index = i;
+ min_count = c->entries[i].cache_hits;
+ }
+
+ /* Give newer hits priority */
+ /* TODO Check how to optimize the replacement strategy */
+ c->entries[i].cache_hits /= 2;
+ }
+
+ if (min_index == -1) {
+ /* This can't happen in current synchronous code, but leave the check
+ * here as a reminder for whoever starts using AIO with the cache */
+ abort();
+ }
+ return min_index;
+}
+
+static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
+ uint64_t offset, void **table, bool read_from_disk)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i;
+ int ret;
+
+ trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache,
+ offset, read_from_disk);
+
+ /* Check if the table is already cached */
+ for (i = 0; i < c->size; i++) {
+ if (c->entries[i].offset == offset) {
+ goto found;
+ }
+ }
+
+ /* If not, write a table back and replace it */
+ i = qcow2_cache_find_entry_to_replace(c);
+ trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(),
+ c == s->l2_table_cache, i);
+ if (i < 0) {
+ return i;
+ }
+
+ ret = qcow2_cache_entry_flush(bs, c, i);
+ if (ret < 0) {
+ return ret;
+ }
+
+ trace_qcow2_cache_get_read(qemu_coroutine_self(),
+ c == s->l2_table_cache, i);
+ c->entries[i].offset = 0;
+ if (read_from_disk) {
+ if (c == s->l2_table_cache) {
+ BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD);
+ }
+
+ ret = bdrv_pread(bs->file, offset, c->entries[i].table, s->cluster_size);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ /* Give the table some hits to start with so that it won't be replaced
+ * immediately. The number 32 is completely arbitrary. */
+ c->entries[i].cache_hits = 32;
+ c->entries[i].offset = offset;
+
+ /* And return the right table */
+found:
+ c->entries[i].cache_hits++;
+ c->entries[i].ref++;
+ *table = c->entries[i].table;
+
+ trace_qcow2_cache_get_done(qemu_coroutine_self(),
+ c == s->l2_table_cache, i);
+
+ return 0;
+}
+
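+/*
+ * Typical usage, as seen in qcow2-cluster.c: fetch a table with
+ * qcow2_cache_get() (or qcow2_cache_get_empty() for a table that will be
+ * fully initialized by the caller), modify it, mark it dirty with
+ * qcow2_cache_entry_mark_dirty(), and release the reference with
+ * qcow2_cache_put().  Dirty tables only reach the image file on
+ * qcow2_cache_flush() or when their slot is reused for another table.
+ */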
+int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+ void **table)
+{
+ return qcow2_cache_do_get(bs, c, offset, table, true);
+}
+
+int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+ void **table)
+{
+ return qcow2_cache_do_get(bs, c, offset, table, false);
+}
+
+int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table)
+{
+ int i;
+
+ for (i = 0; i < c->size; i++) {
+ if (c->entries[i].table == *table) {
+ goto found;
+ }
+ }
+ return -ENOENT;
+
+found:
+ c->entries[i].ref--;
+ *table = NULL;
+
+ assert(c->entries[i].ref >= 0);
+ return 0;
+}
+
+void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table)
+{
+ int i;
+
+ for (i = 0; i < c->size; i++) {
+ if (c->entries[i].table == table) {
+ goto found;
+ }
+ }
+ abort();
+
+found:
+ c->entries[i].dirty = true;
+}
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
new file mode 100644
index 000000000..e179211c5
--- /dev/null
+++ b/block/qcow2-cluster.c
@@ -0,0 +1,1199 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <zlib.h>
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "block/qcow2.h"
+#include "trace.h"
+
+int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int new_l1_size, new_l1_size2, ret, i;
+ uint64_t *new_l1_table;
+ int64_t new_l1_table_offset;
+ uint8_t data[12];
+
+ if (min_size <= s->l1_size)
+ return 0;
+
+ if (exact_size) {
+ new_l1_size = min_size;
+ } else {
+ /* Bump size up to reduce the number of times we have to grow */
+ new_l1_size = s->l1_size;
+ if (new_l1_size == 0) {
+ new_l1_size = 1;
+ }
+ while (min_size > new_l1_size) {
+ new_l1_size = (new_l1_size * 3 + 1) / 2;
+ }
+ }
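+ /* For illustration: an L1 table of 4 entries that needs to reach at least
+ * 10 entries grows through 6 and 9 to 14 entries, i.e. roughly by a factor
+ * of 1.5 per step rather than one entry at a time. */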
+
+#ifdef DEBUG_ALLOC2
+ fprintf(stderr, "grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
+#endif
+
+ new_l1_size2 = sizeof(uint64_t) * new_l1_size;
+ new_l1_table = g_malloc0(align_offset(new_l1_size2, 512));
+ memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
+
+ /* write new table (align to cluster) */
+ BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
+ new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
+ if (new_l1_table_offset < 0) {
+ g_free(new_l1_table);
+ return new_l1_table_offset;
+ }
+
+ ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
+ for(i = 0; i < s->l1_size; i++)
+ new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
+ ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_table, new_l1_size2);
+ if (ret < 0)
+ goto fail;
+ for(i = 0; i < s->l1_size; i++)
+ new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
+
+ /* set new table */
+ BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
+ cpu_to_be32w((uint32_t*)data, new_l1_size);
+ cpu_to_be64wu((uint64_t*)(data + 4), new_l1_table_offset);
+ ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data,sizeof(data));
+ if (ret < 0) {
+ goto fail;
+ }
+ g_free(s->l1_table);
+ qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
+ s->l1_table_offset = new_l1_table_offset;
+ s->l1_table = new_l1_table;
+ s->l1_size = new_l1_size;
+ return 0;
+ fail:
+ g_free(new_l1_table);
+ qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2);
+ return ret;
+}
+
+/*
+ * l2_load
+ *
+ * Loads an L2 table into memory. If the table is in the cache, the cache
+ * is used; otherwise the L2 table is loaded from the image file.
+ *
+ * Returns 0 on success and a negative errno on failure; on success,
+ * *l2_table points to the (cached) table.
+ */
+
+static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
+ uint64_t **l2_table)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret;
+
+ ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table);
+
+ return ret;
+}
+
+/*
+ * Writes one sector of the L1 table to the disk (can't update single entries
+ * and we really don't want bdrv_pread to perform a read-modify-write)
+ */
+#define L1_ENTRIES_PER_SECTOR (512 / 8)
+static int write_l1_entry(BlockDriverState *bs, int l1_index)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t buf[L1_ENTRIES_PER_SECTOR];
+ int l1_start_index;
+ int i, ret;
+
+ l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
+ for (i = 0; i < L1_ENTRIES_PER_SECTOR; i++) {
+ buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
+ ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index,
+ buf, sizeof(buf));
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * l2_allocate
+ *
+ * Allocate a new L2 table in the file. If l1_index points to an already
+ * used entry in the L1 table (i.e. we are doing a copy on write for the L2
+ * table), copy the contents of the old L2 table into the newly allocated one.
+ * Otherwise the new table is initialized with zeros.
+ *
+ */
+
+static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t old_l2_offset;
+ uint64_t *l2_table;
+ int64_t l2_offset;
+ int ret;
+
+ old_l2_offset = s->l1_table[l1_index];
+
+ trace_qcow2_l2_allocate(bs, l1_index);
+
+ /* allocate a new l2 entry */
+
+ l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
+ if (l2_offset < 0) {
+ return l2_offset;
+ }
+
+ ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* allocate a new entry in the l2 cache */
+
+ trace_qcow2_l2_allocate_get_empty(bs, l1_index);
+ ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ l2_table = *table;
+
+ if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
+ /* if there was no old l2 table, clear the new table */
+ memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+ } else {
+ uint64_t* old_table;
+
+ /* if there was an old l2 table, read it from the disk */
+ BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
+ ret = qcow2_cache_get(bs, s->l2_table_cache,
+ old_l2_offset & L1E_OFFSET_MASK,
+ (void**) &old_table);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ memcpy(l2_table, old_table, s->cluster_size);
+
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &old_table);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ /* write the l2 table to the file */
+ BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
+
+ trace_qcow2_l2_allocate_write_l2(bs, l1_index);
+ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ ret = qcow2_cache_flush(bs, s->l2_table_cache);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* update the L1 entry */
+ trace_qcow2_l2_allocate_write_l1(bs, l1_index);
+ s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
+ ret = write_l1_entry(bs, l1_index);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ *table = l2_table;
+ trace_qcow2_l2_allocate_done(bs, l1_index, 0);
+ return 0;
+
+fail:
+ trace_qcow2_l2_allocate_done(bs, l1_index, ret);
+ qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
+ s->l1_table[l1_index] = old_l2_offset;
+ return ret;
+}
+
+/*
+ * Checks how many clusters in a given L2 table are contiguous in the image
+ * file. As soon as one of the flags in the bitmask stop_flags changes compared
+ * to the first cluster, the search is stopped and the cluster is not counted
+ * as contiguous. (This allows it, for example, to stop at the first compressed
+ * cluster, which may require different handling.)
+ */
+static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
+ uint64_t *l2_table, uint64_t start, uint64_t stop_flags)
+{
+ int i;
+ uint64_t mask = stop_flags | L2E_OFFSET_MASK;
+ uint64_t offset = be64_to_cpu(l2_table[0]) & mask;
+
+ if (!offset)
+ return 0;
+
+ for (i = start; i < start + nb_clusters; i++) {
+ uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
+ if (offset + (uint64_t) i * cluster_size != l2_entry) {
+ break;
+ }
+ }
+
+ return (i - start);
+}
+
+static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
+{
+ int i;
+
+ for (i = 0; i < nb_clusters; i++) {
+ int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i]));
+
+ if (type != QCOW2_CLUSTER_UNALLOCATED) {
+ break;
+ }
+ }
+
+ return i;
+}
+
+/* The crypt function is compatible with the linux cryptoloop
+ algorithm for < 4 GB images. NOTE: out_buf == in_buf is
+ supported */
+void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+ uint8_t *out_buf, const uint8_t *in_buf,
+ int nb_sectors, int enc,
+ const AES_KEY *key)
+{
+ union {
+ uint64_t ll[2];
+ uint8_t b[16];
+ } ivec;
+ int i;
+
+ for(i = 0; i < nb_sectors; i++) {
+ ivec.ll[0] = cpu_to_le64(sector_num);
+ ivec.ll[1] = 0;
+ AES_cbc_encrypt(in_buf, out_buf, 512, key,
+ ivec.b, enc);
+ sector_num++;
+ in_buf += 512;
+ out_buf += 512;
+ }
+}
+
+static int coroutine_fn copy_sectors(BlockDriverState *bs,
+ uint64_t start_sect,
+ uint64_t cluster_offset,
+ int n_start, int n_end)
+{
+ BDRVQcowState *s = bs->opaque;
+ QEMUIOVector qiov;
+ struct iovec iov;
+ int n, ret;
+
+ /*
+ * If this is the last cluster and it is only partially used, we must only
+ * copy until the end of the image, or bdrv_check_request will fail for the
+ * bdrv_read/write calls below.
+ */
+ if (start_sect + n_end > bs->total_sectors) {
+ n_end = bs->total_sectors - start_sect;
+ }
+
+ n = n_end - n_start;
+ if (n <= 0) {
+ return 0;
+ }
+
+ iov.iov_len = n * BDRV_SECTOR_SIZE;
+ iov.iov_base = qemu_blockalign(bs, iov.iov_len);
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+
+ BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
+
+ /* Call .bdrv_co_readv() directly instead of using the public block-layer
+ * interface. This avoids double I/O throttling and request tracking,
+ * which can lead to deadlock when block layer copy-on-read is enabled.
+ */
+ ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov);
+ if (ret < 0) {
+ goto out;
+ }
+
+ if (s->crypt_method) {
+ qcow2_encrypt_sectors(s, start_sect + n_start,
+ iov.iov_base, iov.iov_base, n, 1,
+ &s->aes_encrypt_key);
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
+ ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = 0;
+out:
+ qemu_vfree(iov.iov_base);
+ return ret;
+}
+
+
+/*
+ * get_cluster_offset
+ *
+ * For a given offset of the disk image, find the cluster offset in
+ * qcow2 file. The offset is stored in *cluster_offset.
+ *
+ * on entry, *num is the number of contiguous sectors we'd like to
+ * access following offset.
+ *
+ * on exit, *num is the number of contiguous sectors we can read.
+ *
+ * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
+ * cases.
+ */
+int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
+ int *num, uint64_t *cluster_offset)
+{
+ BDRVQcowState *s = bs->opaque;
+ unsigned int l1_index, l2_index;
+ uint64_t l2_offset, *l2_table;
+ int l1_bits, c;
+ unsigned int index_in_cluster, nb_clusters;
+ uint64_t nb_available, nb_needed;
+ int ret;
+
+ index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
+ nb_needed = *num + index_in_cluster;
+
+ l1_bits = s->l2_bits + s->cluster_bits;
+
+ /* compute how many bytes there are between the offset and
+ * the end of the l1 entry
+ */
+
+ nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1));
+
+ /* compute the number of available sectors */
+
+ nb_available = (nb_available >> 9) + index_in_cluster;
+
+ if (nb_needed > nb_available) {
+ nb_needed = nb_available;
+ }
+
+ *cluster_offset = 0;
+
+ /* seek to the L2 offset in the L1 table */
+
+ l1_index = offset >> l1_bits;
+ if (l1_index >= s->l1_size) {
+ ret = QCOW2_CLUSTER_UNALLOCATED;
+ goto out;
+ }
+
+ l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
+ if (!l2_offset) {
+ ret = QCOW2_CLUSTER_UNALLOCATED;
+ goto out;
+ }
+
+ /* load the l2 table in memory */
+
+ ret = l2_load(bs, l2_offset, &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* find the cluster offset for the given disk offset */
+
+ l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+ *cluster_offset = be64_to_cpu(l2_table[l2_index]);
+ nb_clusters = size_to_clusters(s, nb_needed << 9);
+
+ ret = qcow2_get_cluster_type(*cluster_offset);
+ switch (ret) {
+ case QCOW2_CLUSTER_COMPRESSED:
+ /* Compressed clusters can only be processed one by one */
+ c = 1;
+ *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
+ break;
+ case QCOW2_CLUSTER_ZERO:
+ c = count_contiguous_clusters(nb_clusters, s->cluster_size,
+ &l2_table[l2_index], 0,
+ QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO);
+ *cluster_offset = 0;
+ break;
+ case QCOW2_CLUSTER_UNALLOCATED:
+ /* how many empty clusters ? */
+ c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
+ *cluster_offset = 0;
+ break;
+ case QCOW2_CLUSTER_NORMAL:
+ /* how many allocated clusters ? */
+ c = count_contiguous_clusters(nb_clusters, s->cluster_size,
+ &l2_table[l2_index], 0,
+ QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO);
+ *cluster_offset &= L2E_OFFSET_MASK;
+ break;
+ default:
+ abort();
+ }
+
+ qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+
+ nb_available = (c * s->cluster_sectors);
+
+out:
+ if (nb_available > nb_needed)
+ nb_available = nb_needed;
+
+ *num = nb_available - index_in_cluster;
+
+ return ret;
+}
+
+/*
+ * get_cluster_table
+ *
+ * for a given disk offset, load (and allocate if needed)
+ * the l2 table.
+ *
+ * the l2 table offset in the qcow2 file and the cluster index
+ * in the l2 table are given to the caller.
+ *
+ * Returns 0 on success, -errno in failure case
+ */
+static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
+ uint64_t **new_l2_table,
+ int *new_l2_index)
+{
+ BDRVQcowState *s = bs->opaque;
+ unsigned int l1_index, l2_index;
+ uint64_t l2_offset;
+ uint64_t *l2_table = NULL;
+ int ret;
+
+ /* seek to the L2 offset in the L1 table */
+
+ l1_index = offset >> (s->l2_bits + s->cluster_bits);
+ if (l1_index >= s->l1_size) {
+ ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
+
+ /* seek the l2 table of the given l2 offset */
+
+ if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) {
+ /* load the l2 table in memory */
+ ret = l2_load(bs, l2_offset, &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+ } else {
+ /* First allocate a new L2 table (and do COW if needed) */
+ ret = l2_allocate(bs, l1_index, &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Then decrease the refcount of the old table */
+ if (l2_offset) {
+ qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
+ }
+ }
+
+ /* find the cluster offset for the given disk offset */
+
+ l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+
+ *new_l2_table = l2_table;
+ *new_l2_index = l2_index;
+
+ return 0;
+}
+
+/*
+ * alloc_compressed_cluster_offset
+ *
+ * For a given offset of the disk image, return cluster offset in
+ * qcow2 file.
+ *
+ * If the offset is not found, allocate a new compressed cluster.
+ *
+ * Return the cluster offset if successful,
+ * Return 0, otherwise.
+ *
+ */
+
+uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
+ uint64_t offset,
+ int compressed_size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int l2_index, ret;
+ uint64_t *l2_table;
+ int64_t cluster_offset;
+ int nb_csectors;
+
+ ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ return 0;
+ }
+
+ /* Compression can't overwrite anything. Fail if the cluster was already
+ * allocated. */
+ cluster_offset = be64_to_cpu(l2_table[l2_index]);
+ if (cluster_offset & L2E_OFFSET_MASK) {
+ qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ return 0;
+ }
+
+ cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
+ if (cluster_offset < 0) {
+ qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ return 0;
+ }
+
+ nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
+ (cluster_offset >> 9);
+
+ cluster_offset |= QCOW_OFLAG_COMPRESSED |
+ ((uint64_t)nb_csectors << s->csize_shift);
+
+ /* update L2 table */
+
+ /* compressed clusters never have the copied flag */
+
+ BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
+ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ l2_table[l2_index] = cpu_to_be64(cluster_offset);
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ return 0;
+ }
+
+ return cluster_offset;
+}
+
+int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i, j = 0, l2_index, ret;
+ uint64_t *old_cluster, start_sect, *l2_table;
+ uint64_t cluster_offset = m->alloc_offset;
+ bool cow = false;
+
+ trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
+
+ if (m->nb_clusters == 0)
+ return 0;
+
+ old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t));
+
+ /* copy content of unmodified sectors */
+ start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
+ if (m->n_start) {
+ cow = true;
+ qemu_co_mutex_unlock(&s->lock);
+ ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0)
+ goto err;
+ }
+
+ if (m->nb_available & (s->cluster_sectors - 1)) {
+ cow = true;
+ qemu_co_mutex_unlock(&s->lock);
+ ret = copy_sectors(bs, start_sect, cluster_offset, m->nb_available,
+ align_offset(m->nb_available, s->cluster_sectors));
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0)
+ goto err;
+ }
+
+ /*
+ * Update L2 table.
+ *
+ * Before we update the L2 table to actually point to the new cluster, we
+ * need to be sure that the refcounts have been increased and COW was
+ * handled.
+ */
+ if (cow) {
+ qcow2_cache_depends_on_flush(s->l2_table_cache);
+ }
+
+ if (qcow2_need_accurate_refcounts(s)) {
+ qcow2_cache_set_dependency(bs, s->l2_table_cache,
+ s->refcount_block_cache);
+ }
+ ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ goto err;
+ }
+ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+
+ for (i = 0; i < m->nb_clusters; i++) {
+ /* If two concurrent writes happen to the same unallocated cluster,
+ * each write allocates a separate cluster and writes its data concurrently.
+ * The first one to complete updates the L2 table with a pointer to its
+ * cluster; the second one has to do RMW (which is done above by
+ * copy_sectors()), update the L2 table with its own cluster pointer, and
+ * free the old cluster. This is what this loop does. */
+ if(l2_table[l2_index + i] != 0)
+ old_cluster[j++] = l2_table[l2_index + i];
+
+ l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
+ (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
+ }
+
+
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ goto err;
+ }
+
+ /*
+ * If this was a COW, we need to decrease the refcount of the old cluster.
+ * Also flush bs->file to get the right order for L2 and refcount update.
+ */
+ if (j != 0) {
+ for (i = 0; i < j; i++) {
+ qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1);
+ }
+ }
+
+ ret = 0;
+err:
+ g_free(old_cluster);
+ return ret;
+}
+
+/*
+ * Returns the number of contiguous clusters that can be used for an allocating
+ * write, but require COW to be performed (this includes yet unallocated space,
+ * which must be copied from the backing file).
+ */
+static int count_cow_clusters(BDRVQcowState *s, int nb_clusters,
+ uint64_t *l2_table, int l2_index)
+{
+ int i;
+
+ for (i = 0; i < nb_clusters; i++) {
+ uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]);
+ int cluster_type = qcow2_get_cluster_type(l2_entry);
+
+ switch(cluster_type) {
+ case QCOW2_CLUSTER_NORMAL:
+ if (l2_entry & QCOW_OFLAG_COPIED) {
+ goto out;
+ }
+ break;
+ case QCOW2_CLUSTER_UNALLOCATED:
+ case QCOW2_CLUSTER_COMPRESSED:
+ case QCOW2_CLUSTER_ZERO:
+ break;
+ default:
+ abort();
+ }
+ }
+
+out:
+ assert(i <= nb_clusters);
+ return i;
+}
+
+/*
+ * Allocates new clusters for the given guest_offset.
+ *
+ * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
+ * contain the number of clusters that have been allocated and are contiguous
+ * in the image file.
+ *
+ * If *host_offset is non-zero, it specifies the offset in the image file at
+ * which the new clusters must start. *nb_clusters can be 0 on return in this
+ * case if the cluster at host_offset is already in use. If *host_offset is
+ * zero, the clusters can be allocated anywhere in the image file.
+ *
+ * *host_offset is updated to contain the offset into the image file at which
+ * the first allocated cluster starts.
+ *
+ * Return 0 on success and -errno in error cases. -EAGAIN means that the
+ * function has been waiting for another request and the allocation must be
+ * restarted, but the whole request should not be failed.
+ */
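+/*
+ * For example, if a request wants to allocate clusters 10..19 and another
+ * allocation for clusters 15..17 is already in flight, *nb_clusters is
+ * trimmed to 5 so that only clusters 10..14 are allocated now; if the
+ * in-flight allocation already covers cluster 10 itself, the request waits
+ * on its dependent_requests queue and returns -EAGAIN to be restarted.
+ */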
+static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
+ uint64_t *host_offset, unsigned int *nb_clusters)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowL2Meta *old_alloc;
+
+ trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
+ *host_offset, *nb_clusters);
+
+ /*
+ * Check if there already is an AIO write request in flight which allocates
+ * the same cluster. In this case we need to wait until the previous
+ * request has completed and updated the L2 table accordingly.
+ */
+ QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
+
+ uint64_t start = guest_offset >> s->cluster_bits;
+ uint64_t end = start + *nb_clusters;
+ uint64_t old_start = old_alloc->offset >> s->cluster_bits;
+ uint64_t old_end = old_start + old_alloc->nb_clusters;
+
+ if (end < old_start || start > old_end) {
+ /* No intersection */
+ } else {
+ if (start < old_start) {
+ /* Stop at the start of a running allocation */
+ *nb_clusters = old_start - start;
+ } else {
+ *nb_clusters = 0;
+ }
+
+ if (*nb_clusters == 0) {
+ /* Wait for the dependency to complete. We need to recheck
+ * the free/allocated clusters when we continue. */
+ qemu_co_mutex_unlock(&s->lock);
+ qemu_co_queue_wait(&old_alloc->dependent_requests);
+ qemu_co_mutex_lock(&s->lock);
+ return -EAGAIN;
+ }
+ }
+ }
+
+ if (!*nb_clusters) {
+ abort();
+ }
+
+ /* Allocate new clusters */
+ trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
+ if (*host_offset == 0) {
+ int64_t cluster_offset =
+ qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
+ if (cluster_offset < 0) {
+ return cluster_offset;
+ }
+ *host_offset = cluster_offset;
+ return 0;
+ } else {
+ int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
+ if (ret < 0) {
+ return ret;
+ }
+ *nb_clusters = ret;
+ return 0;
+ }
+}
+
+/*
+ * alloc_cluster_offset
+ *
+ * For a given offset on the virtual disk, find the cluster offset in qcow2
+ * file. If the offset is not found, allocate a new cluster.
+ *
+ * If the cluster was already allocated, m->nb_clusters is set to 0 and
+ * other fields in m are meaningless.
+ *
+ * If the cluster is newly allocated, m->nb_clusters is set to the number of
+ * contiguous clusters that have been allocated. In this case, the other
+ * fields of m are valid and contain information about the first allocated
+ * cluster.
+ *
+ * If the request conflicts with another write request in flight, the coroutine
+ * is queued and will be reentered when the dependency has completed.
+ *
+ * Return 0 on success and -errno in error cases
+ */
+int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
+ int n_start, int n_end, int *num, QCowL2Meta *m)
+{
+ BDRVQcowState *s = bs->opaque;
+ int l2_index, ret, sectors;
+ uint64_t *l2_table;
+ unsigned int nb_clusters, keep_clusters;
+ uint64_t cluster_offset;
+
+ trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset,
+ n_start, n_end);
+
+ /* Find L2 entry for the first involved cluster */
+again:
+ ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /*
+ * Calculate the number of clusters to look for. We stop at L2 table
+ * boundaries to keep things simple.
+ */
+ nb_clusters = MIN(size_to_clusters(s, n_end << BDRV_SECTOR_BITS),
+ s->l2_size - l2_index);
+
+ cluster_offset = be64_to_cpu(l2_table[l2_index]);
+
+ /*
+ * Check how many clusters are already allocated and don't need COW, and how
+ * many need a new allocation.
+ */
+ if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
+ && (cluster_offset & QCOW_OFLAG_COPIED))
+ {
+ /* We keep all QCOW_OFLAG_COPIED clusters */
+ keep_clusters =
+ count_contiguous_clusters(nb_clusters, s->cluster_size,
+ &l2_table[l2_index], 0,
+ QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
+ assert(keep_clusters <= nb_clusters);
+ nb_clusters -= keep_clusters;
+ } else {
+ keep_clusters = 0;
+ cluster_offset = 0;
+ }
+
+ if (nb_clusters > 0) {
+ /* For the moment, overwrite compressed clusters one by one */
+ uint64_t entry = be64_to_cpu(l2_table[l2_index + keep_clusters]);
+ if (entry & QCOW_OFLAG_COMPRESSED) {
+ nb_clusters = 1;
+ } else {
+ nb_clusters = count_cow_clusters(s, nb_clusters, l2_table,
+ l2_index + keep_clusters);
+ }
+ }
+
+ cluster_offset &= L2E_OFFSET_MASK;
+
+ /*
+ * The L2 table isn't used any more after this. As long as the cache works
+ * synchronously, it's important to release it before calling
+ * do_alloc_cluster_offset, which may yield if we need to wait for another
+ * request to complete. If we still had the reference, we could use up the
+ * whole cache with sleeping requests.
+ */
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* If there is something left to allocate, do that now */
+ *m = (QCowL2Meta) {
+ .cluster_offset = cluster_offset,
+ .nb_clusters = 0,
+ };
+ qemu_co_queue_init(&m->dependent_requests);
+
+ if (nb_clusters > 0) {
+ uint64_t alloc_offset;
+ uint64_t alloc_cluster_offset;
+ uint64_t keep_bytes = keep_clusters * s->cluster_size;
+
+ /* Calculate start and size of allocation */
+ alloc_offset = offset + keep_bytes;
+
+ if (keep_clusters == 0) {
+ alloc_cluster_offset = 0;
+ } else {
+ alloc_cluster_offset = cluster_offset + keep_bytes;
+ }
+
+ /* Allocate, if necessary at a given offset in the image file */
+ ret = do_alloc_cluster_offset(bs, alloc_offset, &alloc_cluster_offset,
+ &nb_clusters);
+ if (ret == -EAGAIN) {
+ goto again;
+ } else if (ret < 0) {
+ goto fail;
+ }
+
+ /* save info needed for meta data update */
+ if (nb_clusters > 0) {
+ /*
+ * requested_sectors: Number of sectors from the start of the first
+ * newly allocated cluster to the end of the (possibly shortened
+ * before) write request.
+ *
+ * avail_sectors: Number of sectors from the start of the first
+ * newly allocated to the end of the last newly allocated cluster.
+ */
+ int requested_sectors = n_end - keep_clusters * s->cluster_sectors;
+ int avail_sectors = nb_clusters
+ << (s->cluster_bits - BDRV_SECTOR_BITS);
+
+ *m = (QCowL2Meta) {
+ .cluster_offset = keep_clusters == 0 ?
+ alloc_cluster_offset : cluster_offset,
+ .alloc_offset = alloc_cluster_offset,
+ .offset = alloc_offset,
+ .n_start = keep_clusters == 0 ? n_start : 0,
+ .nb_clusters = nb_clusters,
+ .nb_available = MIN(requested_sectors, avail_sectors),
+ };
+ qemu_co_queue_init(&m->dependent_requests);
+ QLIST_INSERT_HEAD(&s->cluster_allocs, m, next_in_flight);
+ }
+ }
+
+ /* Some cleanup work */
+ sectors = (keep_clusters + nb_clusters) << (s->cluster_bits - 9);
+ if (sectors > n_end) {
+ sectors = n_end;
+ }
+
+ assert(sectors > n_start);
+ *num = sectors - n_start;
+
+ return 0;
+
+fail:
+ if (m->nb_clusters > 0) {
+ QLIST_REMOVE(m, next_in_flight);
+ }
+ return ret;
+}
+
+static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+ const uint8_t *buf, int buf_size)
+{
+ z_stream strm1, *strm = &strm1;
+ int ret, out_len;
+
+ memset(strm, 0, sizeof(*strm));
+
+ strm->next_in = (uint8_t *)buf;
+ strm->avail_in = buf_size;
+ strm->next_out = out_buf;
+ strm->avail_out = out_buf_size;
+
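+    /* A negative windowBits value makes zlib expect raw deflate data without
+     * a zlib header or checksum; qcow2 stores compressed clusters as such
+     * raw deflate streams. */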
+ ret = inflateInit2(strm, -12);
+ if (ret != Z_OK)
+ return -1;
+ ret = inflate(strm, Z_FINISH);
+ out_len = strm->next_out - out_buf;
+ if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
+ out_len != out_buf_size) {
+ inflateEnd(strm);
+ return -1;
+ }
+ inflateEnd(strm);
+ return 0;
+}
+
+int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret, csize, nb_csectors, sector_offset;
+ uint64_t coffset;
+
+ coffset = cluster_offset & s->cluster_offset_mask;
+ if (s->cluster_cache_offset != coffset) {
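+        /* The compressed size is stored in the high bits of the L2 entry,
+         * above the host offset; csize_shift/csize_mask extract it as a
+         * count of 512-byte sectors (stored minus one, hence the +1). */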
+ nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
+ sector_offset = coffset & 511;
+ csize = nb_csectors * 512 - sector_offset;
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
+ ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, nb_csectors);
+ if (ret < 0) {
+ return ret;
+ }
+ if (decompress_buffer(s->cluster_cache, s->cluster_size,
+ s->cluster_data + sector_offset, csize) < 0) {
+ return -EIO;
+ }
+ s->cluster_cache_offset = coffset;
+ }
+ return 0;
+}
+
+/*
+ * This discards as many clusters of nb_clusters as possible at once (i.e.
+ * all clusters in the same L2 table) and returns the number of discarded
+ * clusters.
+ */
+static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
+ unsigned int nb_clusters)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t *l2_table;
+ int l2_index;
+ int ret;
+ int i;
+
+ ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Limit nb_clusters to one L2 table */
+ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+
+ for (i = 0; i < nb_clusters; i++) {
+ uint64_t old_offset;
+
+ old_offset = be64_to_cpu(l2_table[l2_index + i]);
+ if ((old_offset & L2E_OFFSET_MASK) == 0) {
+ continue;
+ }
+
+ /* First remove L2 entries */
+ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ l2_table[l2_index + i] = cpu_to_be64(0);
+
+ /* Then decrease the refcount */
+ qcow2_free_any_clusters(bs, old_offset, 1);
+ }
+
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return nb_clusters;
+}
+
+int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
+ int nb_sectors)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t end_offset;
+ unsigned int nb_clusters;
+ int ret;
+
+ end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);
+
+ /* Round start up and end down */
+ offset = align_offset(offset, s->cluster_size);
+ end_offset &= ~(s->cluster_size - 1);
+
+ if (offset > end_offset) {
+ return 0;
+ }
+
+ nb_clusters = size_to_clusters(s, end_offset - offset);
+
+ /* Each L2 table is handled by its own loop iteration */
+ while (nb_clusters > 0) {
+ ret = discard_single_l2(bs, offset, nb_clusters);
+ if (ret < 0) {
+ return ret;
+ }
+
+ nb_clusters -= ret;
+ offset += (ret * s->cluster_size);
+ }
+
+ return 0;
+}
+
+/*
+ * This zeroes as many clusters of nb_clusters as possible at once (i.e.
+ * all clusters in the same L2 table) and returns the number of zeroed
+ * clusters.
+ */
+static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
+ unsigned int nb_clusters)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t *l2_table;
+ int l2_index;
+ int ret;
+ int i;
+
+ ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Limit nb_clusters to one L2 table */
+ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+
+ for (i = 0; i < nb_clusters; i++) {
+ uint64_t old_offset;
+
+ old_offset = be64_to_cpu(l2_table[l2_index + i]);
+
+        /* Update the L2 entry: a compressed cluster can't be marked zero in
+         * place, so free it and install a plain zero entry; other clusters
+         * just get the zero flag added on top of their current entry. */
+ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ if (old_offset & QCOW_OFLAG_COMPRESSED) {
+ l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
+ qcow2_free_any_clusters(bs, old_offset, 1);
+ } else {
+ l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
+ }
+ }
+
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return nb_clusters;
+}
+
+int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
+{
+ BDRVQcowState *s = bs->opaque;
+ unsigned int nb_clusters;
+ int ret;
+
+ /* The zero flag is only supported by version 3 and newer */
+ if (s->qcow_version < 3) {
+ return -ENOTSUP;
+ }
+
+ /* Each L2 table is handled by its own loop iteration */
+ nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS);
+
+ while (nb_clusters > 0) {
+ ret = zero_single_l2(bs, offset, nb_clusters);
+ if (ret < 0) {
+ return ret;
+ }
+
+ nb_clusters -= ret;
+ offset += (ret * s->cluster_size);
+ }
+
+ return 0;
+}
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
new file mode 100644
index 000000000..5e3f9153f
--- /dev/null
+++ b/block/qcow2-refcount.c
@@ -0,0 +1,1240 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "block/qcow2.h"
+
+static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
+static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
+ int64_t offset, int64_t length,
+ int addend);
+
+
+/*********************************************************/
+/* refcount handling */
+
+int qcow2_refcount_init(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret, refcount_table_size2, i;
+
+ refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
+ s->refcount_table = g_malloc(refcount_table_size2);
+ if (s->refcount_table_size > 0) {
+ BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
+ ret = bdrv_pread(bs->file, s->refcount_table_offset,
+ s->refcount_table, refcount_table_size2);
+ if (ret != refcount_table_size2)
+ goto fail;
+ for(i = 0; i < s->refcount_table_size; i++)
+ be64_to_cpus(&s->refcount_table[i]);
+ }
+ return 0;
+ fail:
+ return -ENOMEM;
+}
+
+void qcow2_refcount_close(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ g_free(s->refcount_table);
+}
+
+
+static int load_refcount_block(BlockDriverState *bs,
+ int64_t refcount_block_offset,
+ void **refcount_block)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
+ ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
+ refcount_block);
+
+ return ret;
+}
+
+/*
+ * Returns the refcount of the cluster given by its index. Any non-negative
+ * return value is the refcount of the cluster, negative values are -errno
+ * and indicate an error.
+ */
+static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
+{
+ BDRVQcowState *s = bs->opaque;
+ int refcount_table_index, block_index;
+ int64_t refcount_block_offset;
+ int ret;
+ uint16_t *refcount_block;
+ uint16_t refcount;
+
+ refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
+ if (refcount_table_index >= s->refcount_table_size)
+ return 0;
+ refcount_block_offset = s->refcount_table[refcount_table_index];
+ if (!refcount_block_offset)
+ return 0;
+
+ ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
+ (void**) &refcount_block);
+ if (ret < 0) {
+ return ret;
+ }
+
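+    /* A refcount block holds one 16-bit entry per cluster, so the low
+     * (cluster_bits - REFCOUNT_SHIFT) bits of the cluster index select the
+     * entry within the block. */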
+ block_index = cluster_index &
+ ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
+ refcount = be16_to_cpu(refcount_block[block_index]);
+
+ ret = qcow2_cache_put(bs, s->refcount_block_cache,
+ (void**) &refcount_block);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return refcount;
+}
+
+/*
+ * Rounds the refcount table size up to avoid growing the table for each single
+ * refcount block that is allocated.
+ */
+static unsigned int next_refcount_table_size(BDRVQcowState *s,
+ unsigned int min_size)
+{
+ unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
+ unsigned int refcount_table_clusters =
+ MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
+
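+    /* Grow by roughly 50% per iteration so that frequent refcount block
+     * allocations don't each force a full refcount table rewrite. */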
+ while (min_clusters > refcount_table_clusters) {
+ refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
+ }
+
+ return refcount_table_clusters << (s->cluster_bits - 3);
+}
+
+
+/* Checks if two offsets are described by the same refcount block */
+static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
+ uint64_t offset_b)
+{
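+    /* One refcount block covers 2^(cluster_bits - REFCOUNT_SHIFT) clusters of
+     * 2^cluster_bits bytes each, hence the combined shift. */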
+ uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
+ uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
+
+ return (block_a == block_b);
+}
+
+/*
+ * Loads a refcount block. If it doesn't exist yet, it is allocated first
+ * (including growing the refcount table if needed).
+ *
+ * Returns 0 on success or -errno in error case
+ */
+static int alloc_refcount_block(BlockDriverState *bs,
+ int64_t cluster_index, uint16_t **refcount_block)
+{
+ BDRVQcowState *s = bs->opaque;
+ unsigned int refcount_table_index;
+ int ret;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
+
+ /* Find the refcount block for the given cluster */
+ refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
+
+ if (refcount_table_index < s->refcount_table_size) {
+
+ uint64_t refcount_block_offset =
+ s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
+
+ /* If it's already there, we're done */
+ if (refcount_block_offset) {
+ return load_refcount_block(bs, refcount_block_offset,
+ (void**) refcount_block);
+ }
+ }
+
+ /*
+ * If we came here, we need to allocate something. Something is at least
+ * a cluster for the new refcount block. It may also include a new refcount
+ * table if the old refcount table is too small.
+ *
+ * Note that allocating clusters here needs some special care:
+ *
+ * - We can't use the normal qcow2_alloc_clusters(), it would try to
+ * increase the refcount and very likely we would end up with an endless
+ * recursion. Instead we must place the refcount blocks in a way that
+     *   they describe themselves.
+ *
+     * - We need to consider that at this point we are inside update_refcount
+ * and doing the initial refcount increase. This means that some clusters
+ * have already been allocated by the caller, but their refcount isn't
+ * accurate yet. free_cluster_index tells us where this allocation ends
+ * as long as we don't overwrite it by freeing clusters.
+ *
+ * - alloc_clusters_noref and qcow2_free_clusters may load a different
+ * refcount block into the cache
+ */
+
+ *refcount_block = NULL;
+
+ /* We write to the refcount table, so we might depend on L2 tables */
+ qcow2_cache_flush(bs, s->l2_table_cache);
+
+ /* Allocate the refcount block itself and mark it as used */
+ int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
+ if (new_block < 0) {
+ return new_block;
+ }
+
+#ifdef DEBUG_ALLOC2
+ fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
+ " at %" PRIx64 "\n",
+ refcount_table_index, cluster_index << s->cluster_bits, new_block);
+#endif
+
+ if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
+ /* Zero the new refcount block before updating it */
+ ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
+ (void**) refcount_block);
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ memset(*refcount_block, 0, s->cluster_size);
+
+ /* The block describes itself, need to update the cache */
+ int block_index = (new_block >> s->cluster_bits) &
+ ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
+ (*refcount_block)[block_index] = cpu_to_be16(1);
+ } else {
+ /* Described somewhere else. This can recurse at most twice before we
+ * arrive at a block that describes itself. */
+ ret = update_refcount(bs, new_block, s->cluster_size, 1);
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ bdrv_flush(bs->file);
+
+ /* Initialize the new refcount block only after updating its refcount,
+ * update_refcount uses the refcount cache itself */
+ ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
+ (void**) refcount_block);
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ memset(*refcount_block, 0, s->cluster_size);
+ }
+
+ /* Now the new refcount block needs to be written to disk */
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
+ qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
+ ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ /* If the refcount table is big enough, just hook the block up there */
+ if (refcount_table_index < s->refcount_table_size) {
+ uint64_t data64 = cpu_to_be64(new_block);
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
+ ret = bdrv_pwrite_sync(bs->file,
+ s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
+ &data64, sizeof(data64));
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ s->refcount_table[refcount_table_index] = new_block;
+ return 0;
+ }
+
+ ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ /*
+ * If we come here, we need to grow the refcount table. Again, a new
+     * refcount table needs some space, and we can't simply allocate it with
+     * qcow2_alloc_clusters() without risking endless recursion.
+ *
+ * Therefore let's grab new refcount blocks at the end of the image, which
+ * will describe themselves and the new refcount table. This way we can
+ * reference them only in the new table and do the switch to the new
+ * refcount table at once without producing an inconsistent state in
+ * between.
+ */
+ BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
+
+ /* Calculate the number of refcount blocks needed so far */
+ uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT);
+ uint64_t blocks_used = (s->free_cluster_index +
+ refcount_block_clusters - 1) / refcount_block_clusters;
+
+ /* And now we need at least one block more for the new metadata */
+ uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
+ uint64_t last_table_size;
+ uint64_t blocks_clusters;
+ do {
+ uint64_t table_clusters = size_to_clusters(s, table_size);
+ blocks_clusters = 1 +
+ ((table_clusters + refcount_block_clusters - 1)
+ / refcount_block_clusters);
+ uint64_t meta_clusters = table_clusters + blocks_clusters;
+
+ last_table_size = table_size;
+ table_size = next_refcount_table_size(s, blocks_used +
+ ((meta_clusters + refcount_block_clusters - 1)
+ / refcount_block_clusters));
+
+ } while (last_table_size != table_size);
+
+#ifdef DEBUG_ALLOC2
+ fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
+ s->refcount_table_size, table_size);
+#endif
+
+ /* Create the new refcount table and blocks */
+ uint64_t meta_offset = (blocks_used * refcount_block_clusters) *
+ s->cluster_size;
+ uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
+ uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size);
+ uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t));
+
+ assert(meta_offset >= (s->free_cluster_index * s->cluster_size));
+
+ /* Fill the new refcount table */
+ memcpy(new_table, s->refcount_table,
+ s->refcount_table_size * sizeof(uint64_t));
+ new_table[refcount_table_index] = new_block;
+
+ int i;
+ for (i = 0; i < blocks_clusters; i++) {
+ new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
+ }
+
+ /* Fill the refcount blocks */
+ uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
+ int block = 0;
+ for (i = 0; i < table_clusters + blocks_clusters; i++) {
+ new_blocks[block++] = cpu_to_be16(1);
+ }
+
+ /* Write refcount blocks to disk */
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
+ ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
+ blocks_clusters * s->cluster_size);
+ g_free(new_blocks);
+ if (ret < 0) {
+ goto fail_table;
+ }
+
+ /* Write refcount table to disk */
+ for(i = 0; i < table_size; i++) {
+ cpu_to_be64s(&new_table[i]);
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
+ ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
+ table_size * sizeof(uint64_t));
+ if (ret < 0) {
+ goto fail_table;
+ }
+
+ for(i = 0; i < table_size; i++) {
+ be64_to_cpus(&new_table[i]);
+ }
+
+ /* Hook up the new refcount table in the qcow2 header */
+ uint8_t data[12];
+ cpu_to_be64w((uint64_t*)data, table_offset);
+ cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
+ ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset),
+ data, sizeof(data));
+ if (ret < 0) {
+ goto fail_table;
+ }
+
+ /* And switch it in memory */
+ uint64_t old_table_offset = s->refcount_table_offset;
+ uint64_t old_table_size = s->refcount_table_size;
+
+ g_free(s->refcount_table);
+ s->refcount_table = new_table;
+ s->refcount_table_size = table_size;
+ s->refcount_table_offset = table_offset;
+
+ /* Free old table. Remember, we must not change free_cluster_index */
+ uint64_t old_free_cluster_index = s->free_cluster_index;
+ qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
+ s->free_cluster_index = old_free_cluster_index;
+
+ ret = load_refcount_block(bs, new_block, (void**) refcount_block);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+
+fail_table:
+ g_free(new_table);
+fail_block:
+ if (*refcount_block != NULL) {
+ qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
+ }
+ return ret;
+}
+
+/* XXX: cache several refcount block clusters ? */
+static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
+ int64_t offset, int64_t length, int addend)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t start, last, cluster_offset;
+ uint16_t *refcount_block = NULL;
+ int64_t old_table_index = -1;
+ int ret;
+
+#ifdef DEBUG_ALLOC2
+ fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
+ offset, length, addend);
+#endif
+ if (length < 0) {
+ return -EINVAL;
+ } else if (length == 0) {
+ return 0;
+ }
+
+ if (addend < 0) {
+ qcow2_cache_set_dependency(bs, s->refcount_block_cache,
+ s->l2_table_cache);
+ }
+
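+    /* Walk the range cluster by cluster, switching (and if necessary
+     * allocating) the refcount block whenever a block boundary is crossed. */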
+ start = offset & ~(s->cluster_size - 1);
+ last = (offset + length - 1) & ~(s->cluster_size - 1);
+ for(cluster_offset = start; cluster_offset <= last;
+ cluster_offset += s->cluster_size)
+ {
+ int block_index, refcount;
+ int64_t cluster_index = cluster_offset >> s->cluster_bits;
+ int64_t table_index =
+ cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
+
+ /* Load the refcount block and allocate it if needed */
+ if (table_index != old_table_index) {
+ if (refcount_block) {
+ ret = qcow2_cache_put(bs, s->refcount_block_cache,
+ (void**) &refcount_block);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+ old_table_index = table_index;
+
+ qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
+
+ /* we can update the count and save it */
+ block_index = cluster_index &
+ ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
+
+ refcount = be16_to_cpu(refcount_block[block_index]);
+ refcount += addend;
+ if (refcount < 0 || refcount > 0xffff) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ if (refcount == 0 && cluster_index < s->free_cluster_index) {
+ s->free_cluster_index = cluster_index;
+ }
+ refcount_block[block_index] = cpu_to_be16(refcount);
+ }
+
+ ret = 0;
+fail:
+ /* Write last changed block to disk */
+ if (refcount_block) {
+ int wret;
+ wret = qcow2_cache_put(bs, s->refcount_block_cache,
+ (void**) &refcount_block);
+ if (wret < 0) {
+ return ret < 0 ? ret : wret;
+ }
+ }
+
+ /*
+     * Try to undo any updates if an error is returned (this may succeed in
+     * some cases, e.g. ENOSPC while allocating a new refcount block).
+ */
+ if (ret < 0) {
+ int dummy;
+ dummy = update_refcount(bs, offset, cluster_offset - offset, -addend);
+ (void)dummy;
+ }
+
+ return ret;
+}
+
+/*
+ * Increases or decreases the refcount of a given cluster by one.
+ * addend must be 1 or -1.
+ *
+ * If the return value is non-negative, it is the new refcount of the cluster.
+ * If it is negative, it is -errno and indicates an error.
+ */
+static int update_cluster_refcount(BlockDriverState *bs,
+ int64_t cluster_index,
+ int addend)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret;
+
+ ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend);
+ if (ret < 0) {
+ return ret;
+ }
+
+ bdrv_flush(bs->file);
+
+ return get_refcount(bs, cluster_index);
+}
+
+
+
+/*********************************************************/
+/* cluster allocation functions */
+
+
+
+/* return < 0 if error */
+static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i, nb_clusters, refcount;
+
+ nb_clusters = size_to_clusters(s, size);
+retry:
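+    /* Scan for nb_clusters contiguous free clusters. If a used cluster is
+     * found mid-scan, free_cluster_index has already advanced past it, so
+     * restarting the scan keeps the returned range contiguous. */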
+ for(i = 0; i < nb_clusters; i++) {
+ int64_t next_cluster_index = s->free_cluster_index++;
+ refcount = get_refcount(bs, next_cluster_index);
+
+ if (refcount < 0) {
+ return refcount;
+ } else if (refcount != 0) {
+ goto retry;
+ }
+ }
+#ifdef DEBUG_ALLOC2
+ fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
+ size,
+ (s->free_cluster_index - nb_clusters) << s->cluster_bits);
+#endif
+ return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
+}
+
+int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
+{
+ int64_t offset;
+ int ret;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
+ offset = alloc_clusters_noref(bs, size);
+ if (offset < 0) {
+ return offset;
+ }
+
+ ret = update_refcount(bs, offset, size, 1);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return offset;
+}
+
+int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
+ int nb_clusters)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t cluster_index;
+ uint64_t old_free_cluster_index;
+ int i, refcount, ret;
+
+ /* Check how many clusters there are free */
+ cluster_index = offset >> s->cluster_bits;
+ for(i = 0; i < nb_clusters; i++) {
+ refcount = get_refcount(bs, cluster_index++);
+
+ if (refcount < 0) {
+ return refcount;
+ } else if (refcount != 0) {
+ break;
+ }
+ }
+
+ /* And then allocate them */
+ old_free_cluster_index = s->free_cluster_index;
+ s->free_cluster_index = cluster_index + i;
+
+ ret = update_refcount(bs, offset, i << s->cluster_bits, 1);
+ if (ret < 0) {
+ return ret;
+ }
+
+ s->free_cluster_index = old_free_cluster_index;
+
+ return i;
+}
+
+/* only used to allocate compressed sectors. We try to allocate
+ contiguous sectors. size must be <= cluster_size */
+int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t offset, cluster_offset;
+ int free_in_cluster;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
+ assert(size > 0 && size <= s->cluster_size);
+ if (s->free_byte_offset == 0) {
+ offset = qcow2_alloc_clusters(bs, s->cluster_size);
+ if (offset < 0) {
+ return offset;
+ }
+ s->free_byte_offset = offset;
+ }
+ redo:
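+    /* free_byte_offset points into the cluster currently used for sub-cluster
+     * allocations. Serve the request from its remaining space if it fits;
+     * otherwise a fresh cluster is allocated below. */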
+ free_in_cluster = s->cluster_size -
+ (s->free_byte_offset & (s->cluster_size - 1));
+ if (size <= free_in_cluster) {
+ /* enough space in current cluster */
+ offset = s->free_byte_offset;
+ s->free_byte_offset += size;
+ free_in_cluster -= size;
+ if (free_in_cluster == 0)
+ s->free_byte_offset = 0;
+ if ((offset & (s->cluster_size - 1)) != 0)
+ update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
+ } else {
+ offset = qcow2_alloc_clusters(bs, s->cluster_size);
+ if (offset < 0) {
+ return offset;
+ }
+ cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
+ if ((cluster_offset + s->cluster_size) == offset) {
+ /* we are lucky: contiguous data */
+ offset = s->free_byte_offset;
+ update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
+ s->free_byte_offset += size;
+ } else {
+ s->free_byte_offset = offset;
+ goto redo;
+ }
+ }
+
+ bdrv_flush(bs->file);
+ return offset;
+}
+
+void qcow2_free_clusters(BlockDriverState *bs,
+ int64_t offset, int64_t size)
+{
+ int ret;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
+ ret = update_refcount(bs, offset, size, -1);
+ if (ret < 0) {
+ fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
+ /* TODO Remember the clusters to free them later and avoid leaking */
+ }
+}
+
+/*
+ * Free a cluster using its L2 entry (handles clusters of all types, e.g.
+ * normal cluster, compressed cluster, etc.)
+ */
+void qcow2_free_any_clusters(BlockDriverState *bs,
+ uint64_t l2_entry, int nb_clusters)
+{
+ BDRVQcowState *s = bs->opaque;
+
+ switch (qcow2_get_cluster_type(l2_entry)) {
+ case QCOW2_CLUSTER_COMPRESSED:
+ {
+ int nb_csectors;
+ nb_csectors = ((l2_entry >> s->csize_shift) &
+ s->csize_mask) + 1;
+ qcow2_free_clusters(bs,
+ (l2_entry & s->cluster_offset_mask) & ~511,
+ nb_csectors * 512);
+ }
+ break;
+ case QCOW2_CLUSTER_NORMAL:
+ qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
+ nb_clusters << s->cluster_bits);
+ break;
+ case QCOW2_CLUSTER_UNALLOCATED:
+ case QCOW2_CLUSTER_ZERO:
+ break;
+ default:
+ abort();
+ }
+}
+
+
+
+/*********************************************************/
+/* snapshots and image creation */
+
+
+
+/* update the refcounts of snapshots and the copied flag */
+int qcow2_update_snapshot_refcount(BlockDriverState *bs,
+ int64_t l1_table_offset, int l1_size, int addend)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
+ int64_t old_offset, old_l2_offset;
+ int i, j, l1_modified = 0, nb_csectors, refcount;
+ int ret;
+
+ l2_table = NULL;
+ l1_table = NULL;
+ l1_size2 = l1_size * sizeof(uint64_t);
+
+ /* WARNING: qcow2_snapshot_goto relies on this function not using the
+ * l1_table_offset when it is the current s->l1_table_offset! Be careful
+ * when changing this! */
+ if (l1_table_offset != s->l1_table_offset) {
+ if (l1_size2 != 0) {
+ l1_table = g_malloc0(align_offset(l1_size2, 512));
+ } else {
+ l1_table = NULL;
+ }
+ l1_allocated = 1;
+ if (bdrv_pread(bs->file, l1_table_offset,
+ l1_table, l1_size2) != l1_size2)
+ {
+ ret = -EIO;
+ goto fail;
+ }
+
+ for(i = 0;i < l1_size; i++)
+ be64_to_cpus(&l1_table[i]);
+ } else {
+ assert(l1_size == s->l1_size);
+ l1_table = s->l1_table;
+ l1_allocated = 0;
+ }
+
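+    /* For each L2 table referenced by the L1 table, adjust each allocated
+     * cluster's refcount by addend (if non-zero) and keep QCOW_OFLAG_COPIED
+     * in sync with the resulting refcounts. */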
+ for(i = 0; i < l1_size; i++) {
+ l2_offset = l1_table[i];
+ if (l2_offset) {
+ old_l2_offset = l2_offset;
+ l2_offset &= L1E_OFFSET_MASK;
+
+ ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
+ (void**) &l2_table);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ for(j = 0; j < s->l2_size; j++) {
+ offset = be64_to_cpu(l2_table[j]);
+ if (offset != 0) {
+ old_offset = offset;
+ offset &= ~QCOW_OFLAG_COPIED;
+ if (offset & QCOW_OFLAG_COMPRESSED) {
+ nb_csectors = ((offset >> s->csize_shift) &
+ s->csize_mask) + 1;
+ if (addend != 0) {
+ int ret;
+ ret = update_refcount(bs,
+ (offset & s->cluster_offset_mask) & ~511,
+ nb_csectors * 512, addend);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* TODO Flushing once for the whole function should
+ * be enough */
+ bdrv_flush(bs->file);
+ }
+ /* compressed clusters are never modified */
+ refcount = 2;
+ } else {
+ uint64_t cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
+ if (addend != 0) {
+ refcount = update_cluster_refcount(bs, cluster_index, addend);
+ } else {
+ refcount = get_refcount(bs, cluster_index);
+ }
+
+ if (refcount < 0) {
+ ret = -EIO;
+ goto fail;
+ }
+ }
+
+ if (refcount == 1) {
+ offset |= QCOW_OFLAG_COPIED;
+ }
+ if (offset != old_offset) {
+ if (addend > 0) {
+ qcow2_cache_set_dependency(bs, s->l2_table_cache,
+ s->refcount_block_cache);
+ }
+ l2_table[j] = cpu_to_be64(offset);
+ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ }
+ }
+ }
+
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ goto fail;
+ }
+
+
+ if (addend != 0) {
+ refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
+ } else {
+ refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
+ }
+ if (refcount < 0) {
+ ret = -EIO;
+ goto fail;
+ } else if (refcount == 1) {
+ l2_offset |= QCOW_OFLAG_COPIED;
+ }
+ if (l2_offset != old_l2_offset) {
+ l1_table[i] = l2_offset;
+ l1_modified = 1;
+ }
+ }
+ }
+
+ ret = 0;
+fail:
+ if (l2_table) {
+ qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ }
+
+ /* Update L1 only if it isn't deleted anyway (addend = -1) */
+ if (addend >= 0 && l1_modified) {
+ for(i = 0; i < l1_size; i++)
+ cpu_to_be64s(&l1_table[i]);
+ if (bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table,
+ l1_size2) < 0)
+ goto fail;
+ for(i = 0; i < l1_size; i++)
+ be64_to_cpus(&l1_table[i]);
+ }
+ if (l1_allocated)
+ g_free(l1_table);
+ return ret;
+}
+
+
+
+
+/*********************************************************/
+/* refcount checking functions */
+
+
+
+/*
+ * Increases the refcount for a range of clusters in a given refcount table.
+ * This is used to construct a temporary refcount table out of L1 and L2 tables
+ * which can be compared to the refcount table saved in the image.
+ *
+ * Modifies the number of errors in res.
+ */
+static void inc_refcounts(BlockDriverState *bs,
+ BdrvCheckResult *res,
+ uint16_t *refcount_table,
+ int refcount_table_size,
+ int64_t offset, int64_t size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t start, last, cluster_offset;
+ int k;
+
+ if (size <= 0)
+ return;
+
+ start = offset & ~(s->cluster_size - 1);
+ last = (offset + size - 1) & ~(s->cluster_size - 1);
+ for(cluster_offset = start; cluster_offset <= last;
+ cluster_offset += s->cluster_size) {
+ k = cluster_offset >> s->cluster_bits;
+ if (k < 0) {
+ fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n",
+ cluster_offset);
+ res->corruptions++;
+ } else if (k >= refcount_table_size) {
+ fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after "
+ "the end of the image file, can't properly check refcounts.\n",
+ cluster_offset);
+ res->check_errors++;
+ } else {
+ if (++refcount_table[k] == 0) {
+ fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
+ "\n", cluster_offset);
+ res->corruptions++;
+ }
+ }
+ }
+}
+
+/*
+ * Increases the refcount in the given refcount table for all clusters
+ * referenced in the L2 table. While doing so, performs some checks on L2
+ * entries.
+ *
+ * Returns the number of errors found by the checks or -errno if an internal
+ * error occurred.
+ */
+static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
+ uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
+ int check_copied)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t *l2_table, l2_entry;
+ int i, l2_size, nb_csectors, refcount;
+
+ /* Read L2 table from disk */
+ l2_size = s->l2_size * sizeof(uint64_t);
+ l2_table = g_malloc(l2_size);
+
+ if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
+ goto fail;
+
+ /* Do the actual checks */
+ for(i = 0; i < s->l2_size; i++) {
+ l2_entry = be64_to_cpu(l2_table[i]);
+
+ switch (qcow2_get_cluster_type(l2_entry)) {
+ case QCOW2_CLUSTER_COMPRESSED:
+ /* Compressed clusters don't have QCOW_OFLAG_COPIED */
+ if (l2_entry & QCOW_OFLAG_COPIED) {
+ fprintf(stderr, "ERROR: cluster %" PRId64 ": "
+ "copied flag must never be set for compressed "
+ "clusters\n", l2_entry >> s->cluster_bits);
+ l2_entry &= ~QCOW_OFLAG_COPIED;
+ res->corruptions++;
+ }
+
+ /* Mark cluster as used */
+ nb_csectors = ((l2_entry >> s->csize_shift) &
+ s->csize_mask) + 1;
+ l2_entry &= s->cluster_offset_mask;
+ inc_refcounts(bs, res, refcount_table, refcount_table_size,
+ l2_entry & ~511, nb_csectors * 512);
+ break;
+
+ case QCOW2_CLUSTER_ZERO:
+ if ((l2_entry & L2E_OFFSET_MASK) == 0) {
+ break;
+ }
+ /* fall through */
+
+ case QCOW2_CLUSTER_NORMAL:
+ {
+ /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
+ uint64_t offset = l2_entry & L2E_OFFSET_MASK;
+
+ if (check_copied) {
+ refcount = get_refcount(bs, offset >> s->cluster_bits);
+ if (refcount < 0) {
+ fprintf(stderr, "Can't get refcount for offset %"
+ PRIx64 ": %s\n", l2_entry, strerror(-refcount));
+ goto fail;
+ }
+ if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
+ fprintf(stderr, "ERROR OFLAG_COPIED: offset=%"
+ PRIx64 " refcount=%d\n", l2_entry, refcount);
+ res->corruptions++;
+ }
+ }
+
+ /* Mark cluster as used */
+            inc_refcounts(bs, res, refcount_table, refcount_table_size,
+ offset, s->cluster_size);
+
+ /* Correct offsets are cluster aligned */
+ if (offset & (s->cluster_size - 1)) {
+ fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
+ "properly aligned; L2 entry corrupted.\n", offset);
+ res->corruptions++;
+ }
+ break;
+ }
+
+ case QCOW2_CLUSTER_UNALLOCATED:
+ break;
+
+ default:
+ abort();
+ }
+ }
+
+ g_free(l2_table);
+ return 0;
+
+fail:
+ fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
+ g_free(l2_table);
+ return -EIO;
+}
+
+/*
+ * Increases the refcount for the L1 table, its L2 tables and all referenced
+ * clusters in the given refcount table. While doing so, performs some checks
+ * on L1 and L2 entries.
+ *
+ * Returns the number of errors found by the checks or -errno if an internal
+ * error occurred.
+ */
+static int check_refcounts_l1(BlockDriverState *bs,
+ BdrvCheckResult *res,
+ uint16_t *refcount_table,
+ int refcount_table_size,
+ int64_t l1_table_offset, int l1_size,
+ int check_copied)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t *l1_table, l2_offset, l1_size2;
+ int i, refcount, ret;
+
+ l1_size2 = l1_size * sizeof(uint64_t);
+
+ /* Mark L1 table as used */
+ inc_refcounts(bs, res, refcount_table, refcount_table_size,
+ l1_table_offset, l1_size2);
+
+ /* Read L1 table entries from disk */
+ if (l1_size2 == 0) {
+ l1_table = NULL;
+ } else {
+ l1_table = g_malloc(l1_size2);
+ if (bdrv_pread(bs->file, l1_table_offset,
+ l1_table, l1_size2) != l1_size2)
+ goto fail;
+ for(i = 0;i < l1_size; i++)
+ be64_to_cpus(&l1_table[i]);
+ }
+
+ /* Do the actual checks */
+ for(i = 0; i < l1_size; i++) {
+ l2_offset = l1_table[i];
+ if (l2_offset) {
+ /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
+ if (check_copied) {
+ refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED)
+ >> s->cluster_bits);
+ if (refcount < 0) {
+ fprintf(stderr, "Can't get refcount for l2_offset %"
+ PRIx64 ": %s\n", l2_offset, strerror(-refcount));
+ goto fail;
+ }
+ if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
+ fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64
+ " refcount=%d\n", l2_offset, refcount);
+ res->corruptions++;
+ }
+ }
+
+ /* Mark L2 table as used */
+ l2_offset &= L1E_OFFSET_MASK;
+ inc_refcounts(bs, res, refcount_table, refcount_table_size,
+ l2_offset, s->cluster_size);
+
+ /* L2 tables are cluster aligned */
+ if (l2_offset & (s->cluster_size - 1)) {
+ fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
+ "cluster aligned; L1 entry corrupted\n", l2_offset);
+ res->corruptions++;
+ }
+
+ /* Process and check L2 entries */
+ ret = check_refcounts_l2(bs, res, refcount_table,
+ refcount_table_size, l2_offset, check_copied);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+ }
+ g_free(l1_table);
+ return 0;
+
+fail:
+ fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
+ res->check_errors++;
+ g_free(l1_table);
+ return -EIO;
+}
+
+/*
+ * Checks an image for refcount consistency.
+ *
+ * Returns 0 if no errors are found, the number of errors in case the image is
+ * detected as corrupted, and -errno when an internal error occurred.
+ */
+int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
+ BdrvCheckMode fix)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t size, i;
+ int nb_clusters, refcount1, refcount2;
+ QCowSnapshot *sn;
+ uint16_t *refcount_table;
+ int ret;
+
+ size = bdrv_getlength(bs->file);
+ nb_clusters = size_to_clusters(s, size);
+ refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t));
+
+ /* header */
+ inc_refcounts(bs, res, refcount_table, nb_clusters,
+ 0, s->cluster_size);
+
+ /* current L1 table */
+ ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
+ s->l1_table_offset, s->l1_size, 1);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* snapshots */
+ for(i = 0; i < s->nb_snapshots; i++) {
+ sn = s->snapshots + i;
+ ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
+ sn->l1_table_offset, sn->l1_size, 0);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+ inc_refcounts(bs, res, refcount_table, nb_clusters,
+ s->snapshots_offset, s->snapshots_size);
+
+ /* refcount data */
+ inc_refcounts(bs, res, refcount_table, nb_clusters,
+ s->refcount_table_offset,
+ s->refcount_table_size * sizeof(uint64_t));
+
+ for(i = 0; i < s->refcount_table_size; i++) {
+ uint64_t offset, cluster;
+ offset = s->refcount_table[i];
+ cluster = offset >> s->cluster_bits;
+
+ /* Refcount blocks are cluster aligned */
+ if (offset & (s->cluster_size - 1)) {
+ fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
+ "cluster aligned; refcount table entry corrupted\n", i);
+ res->corruptions++;
+ continue;
+ }
+
+ if (cluster >= nb_clusters) {
+ fprintf(stderr, "ERROR refcount block %" PRId64
+ " is outside image\n", i);
+ res->corruptions++;
+ continue;
+ }
+
+ if (offset != 0) {
+ inc_refcounts(bs, res, refcount_table, nb_clusters,
+ offset, s->cluster_size);
+ if (refcount_table[cluster] != 1) {
+ fprintf(stderr, "ERROR refcount block %" PRId64
+ " refcount=%d\n",
+ i, refcount_table[cluster]);
+ res->corruptions++;
+ }
+ }
+ }
+
+ /* compare ref counts */
+ for(i = 0; i < nb_clusters; i++) {
+ refcount1 = get_refcount(bs, i);
+ if (refcount1 < 0) {
+ fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
+ i, strerror(-refcount1));
+ res->check_errors++;
+ continue;
+ }
+
+ refcount2 = refcount_table[i];
+ if (refcount1 != refcount2) {
+
+ /* Check if we're allowed to fix the mismatch */
+ int *num_fixed = NULL;
+ if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
+ num_fixed = &res->leaks_fixed;
+ } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
+ num_fixed = &res->corruptions_fixed;
+ }
+
+ fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n",
+ num_fixed != NULL ? "Repairing" :
+ refcount1 < refcount2 ? "ERROR" :
+ "Leaked",
+ i, refcount1, refcount2);
+
+ if (num_fixed) {
+ ret = update_refcount(bs, i << s->cluster_bits, 1,
+ refcount2 - refcount1);
+ if (ret >= 0) {
+ (*num_fixed)++;
+ continue;
+ }
+ }
+
+ /* And if we couldn't, print an error */
+ if (refcount1 < refcount2) {
+ res->corruptions++;
+ } else {
+ res->leaks++;
+ }
+ }
+ }
+
+ ret = 0;
+
+fail:
+ g_free(refcount_table);
+
+ return ret;
+}
+
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
new file mode 100644
index 000000000..4e7c93b8b
--- /dev/null
+++ b/block/qcow2-snapshot.c
@@ -0,0 +1,660 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "block/qcow2.h"
+
+typedef struct QEMU_PACKED QCowSnapshotHeader {
+ /* header is 8 byte aligned */
+ uint64_t l1_table_offset;
+
+ uint32_t l1_size;
+ uint16_t id_str_size;
+ uint16_t name_size;
+
+ uint32_t date_sec;
+ uint32_t date_nsec;
+
+ uint64_t vm_clock_nsec;
+
+ uint32_t vm_state_size;
+ uint32_t extra_data_size; /* for extension */
+ /* extra data follows */
+ /* id_str follows */
+ /* name follows */
+} QCowSnapshotHeader;
+
+typedef struct QEMU_PACKED QCowSnapshotExtraData {
+ uint64_t vm_state_size_large;
+ uint64_t disk_size;
+} QCowSnapshotExtraData;
+
+void qcow2_free_snapshots(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i;
+
+ for(i = 0; i < s->nb_snapshots; i++) {
+ g_free(s->snapshots[i].name);
+ g_free(s->snapshots[i].id_str);
+ }
+ g_free(s->snapshots);
+ s->snapshots = NULL;
+ s->nb_snapshots = 0;
+}
+
+int qcow2_read_snapshots(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshotHeader h;
+ QCowSnapshotExtraData extra;
+ QCowSnapshot *sn;
+ int i, id_str_size, name_size;
+ int64_t offset;
+ uint32_t extra_data_size;
+ int ret;
+
+ if (!s->nb_snapshots) {
+ s->snapshots = NULL;
+ s->snapshots_size = 0;
+ return 0;
+ }
+
+ offset = s->snapshots_offset;
+ s->snapshots = g_malloc0(s->nb_snapshots * sizeof(QCowSnapshot));
+
+ for(i = 0; i < s->nb_snapshots; i++) {
+ /* Read statically sized part of the snapshot header */
+ offset = align_offset(offset, 8);
+ ret = bdrv_pread(bs->file, offset, &h, sizeof(h));
+ if (ret < 0) {
+ goto fail;
+ }
+
+ offset += sizeof(h);
+ sn = s->snapshots + i;
+ sn->l1_table_offset = be64_to_cpu(h.l1_table_offset);
+ sn->l1_size = be32_to_cpu(h.l1_size);
+ sn->vm_state_size = be32_to_cpu(h.vm_state_size);
+ sn->date_sec = be32_to_cpu(h.date_sec);
+ sn->date_nsec = be32_to_cpu(h.date_nsec);
+ sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec);
+ extra_data_size = be32_to_cpu(h.extra_data_size);
+
+ id_str_size = be16_to_cpu(h.id_str_size);
+ name_size = be16_to_cpu(h.name_size);
+
+ /* Read extra data */
+ ret = bdrv_pread(bs->file, offset, &extra,
+ MIN(sizeof(extra), extra_data_size));
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += extra_data_size;
+
+ if (extra_data_size >= 8) {
+ sn->vm_state_size = be64_to_cpu(extra.vm_state_size_large);
+ }
+
+ if (extra_data_size >= 16) {
+ sn->disk_size = be64_to_cpu(extra.disk_size);
+ } else {
+ sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
+ }
+
+ /* Read snapshot ID */
+ sn->id_str = g_malloc(id_str_size + 1);
+ ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size);
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += id_str_size;
+ sn->id_str[id_str_size] = '\0';
+
+ /* Read snapshot name */
+ sn->name = g_malloc(name_size + 1);
+ ret = bdrv_pread(bs->file, offset, sn->name, name_size);
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += name_size;
+ sn->name[name_size] = '\0';
+ }
+
+ s->snapshots_size = offset - s->snapshots_offset;
+ return 0;
+
+fail:
+ qcow2_free_snapshots(bs);
+ return ret;
+}
+
+/* Append a new snapshot list at the end of the image file */
+static int qcow2_write_snapshots(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshot *sn;
+ QCowSnapshotHeader h;
+ QCowSnapshotExtraData extra;
+ int i, name_size, id_str_size, snapshots_size;
+ struct {
+ uint32_t nb_snapshots;
+ uint64_t snapshots_offset;
+ } QEMU_PACKED header_data;
+ int64_t offset, snapshots_offset;
+ int ret;
+
+ /* compute the size of the snapshots */
+ offset = 0;
+ for(i = 0; i < s->nb_snapshots; i++) {
+ sn = s->snapshots + i;
+ offset = align_offset(offset, 8);
+ offset += sizeof(h);
+ offset += sizeof(extra);
+ offset += strlen(sn->id_str);
+ offset += strlen(sn->name);
+ }
+ snapshots_size = offset;
+
+ /* Allocate space for the new snapshot list */
+ snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size);
+ bdrv_flush(bs->file);
+ offset = snapshots_offset;
+ if (offset < 0) {
+ return offset;
+ }
+
+ /* Write all snapshots to the new list */
+ for(i = 0; i < s->nb_snapshots; i++) {
+ sn = s->snapshots + i;
+ memset(&h, 0, sizeof(h));
+ h.l1_table_offset = cpu_to_be64(sn->l1_table_offset);
+ h.l1_size = cpu_to_be32(sn->l1_size);
+ /* If it doesn't fit in 32 bit, older implementations should treat it
+ * as a disk-only snapshot rather than truncate the VM state */
+ if (sn->vm_state_size <= 0xffffffff) {
+ h.vm_state_size = cpu_to_be32(sn->vm_state_size);
+ }
+ h.date_sec = cpu_to_be32(sn->date_sec);
+ h.date_nsec = cpu_to_be32(sn->date_nsec);
+ h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec);
+ h.extra_data_size = cpu_to_be32(sizeof(extra));
+
+ memset(&extra, 0, sizeof(extra));
+ extra.vm_state_size_large = cpu_to_be64(sn->vm_state_size);
+ extra.disk_size = cpu_to_be64(sn->disk_size);
+
+ id_str_size = strlen(sn->id_str);
+ name_size = strlen(sn->name);
+ h.id_str_size = cpu_to_be16(id_str_size);
+ h.name_size = cpu_to_be16(name_size);
+ offset = align_offset(offset, 8);
+
+ ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h));
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += sizeof(h);
+
+ ret = bdrv_pwrite(bs->file, offset, &extra, sizeof(extra));
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += sizeof(extra);
+
+ ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size);
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += id_str_size;
+
+ ret = bdrv_pwrite(bs->file, offset, sn->name, name_size);
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += name_size;
+ }
+
+ /*
+ * Update the header to point to the new snapshot table. This requires the
+ * new table and its refcounts to be stable on disk.
+ */
+ ret = bdrv_flush(bs);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ QEMU_BUILD_BUG_ON(offsetof(QCowHeader, snapshots_offset) !=
+ offsetof(QCowHeader, nb_snapshots) + sizeof(header_data.nb_snapshots));
+
+ header_data.nb_snapshots = cpu_to_be32(s->nb_snapshots);
+ header_data.snapshots_offset = cpu_to_be64(snapshots_offset);
+
+ ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots),
+ &header_data, sizeof(header_data));
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* free the old snapshot table */
+ qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size);
+ s->snapshots_offset = snapshots_offset;
+ s->snapshots_size = snapshots_size;
+ return 0;
+
+fail:
+ return ret;
+}
+
+static void find_new_snapshot_id(BlockDriverState *bs,
+ char *id_str, int id_str_size)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshot *sn;
+ int i, id, id_max = 0;
+
+ for(i = 0; i < s->nb_snapshots; i++) {
+ sn = s->snapshots + i;
+ id = strtoul(sn->id_str, NULL, 10);
+ if (id > id_max)
+ id_max = id;
+ }
+ snprintf(id_str, id_str_size, "%d", id_max + 1);
+}
+
+static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i;
+
+ for(i = 0; i < s->nb_snapshots; i++) {
+ if (!strcmp(s->snapshots[i].id_str, id_str))
+ return i;
+ }
+ return -1;
+}
+
+static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i, ret;
+
+ ret = find_snapshot_by_id(bs, name);
+ if (ret >= 0)
+ return ret;
+ for(i = 0; i < s->nb_snapshots; i++) {
+ if (!strcmp(s->snapshots[i].name, name))
+ return i;
+ }
+ return -1;
+}
+
+/* if no id is provided, a new one is constructed */
+int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshot *new_snapshot_list = NULL;
+ QCowSnapshot *old_snapshot_list = NULL;
+ QCowSnapshot sn1, *sn = &sn1;
+ int i, ret;
+ uint64_t *l1_table = NULL;
+ int64_t l1_table_offset;
+
+ memset(sn, 0, sizeof(*sn));
+
+ /* Generate an ID if it wasn't passed */
+ if (sn_info->id_str[0] == '\0') {
+ find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
+ }
+
+ /* Check that the ID is unique */
+ if (find_snapshot_by_id(bs, sn_info->id_str) >= 0) {
+ return -EEXIST;
+ }
+
+ /* Populate sn with passed data */
+ sn->id_str = g_strdup(sn_info->id_str);
+ sn->name = g_strdup(sn_info->name);
+
+ sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
+ sn->vm_state_size = sn_info->vm_state_size;
+ sn->date_sec = sn_info->date_sec;
+ sn->date_nsec = sn_info->date_nsec;
+ sn->vm_clock_nsec = sn_info->vm_clock_nsec;
+
+ /* Allocate the L1 table of the snapshot and copy the current one there. */
+ l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
+ if (l1_table_offset < 0) {
+ ret = l1_table_offset;
+ goto fail;
+ }
+
+ sn->l1_table_offset = l1_table_offset;
+ sn->l1_size = s->l1_size;
+
+ l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
+ for(i = 0; i < s->l1_size; i++) {
+ l1_table[i] = cpu_to_be64(s->l1_table[i]);
+ }
+
+ ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table,
+ s->l1_size * sizeof(uint64_t));
+ if (ret < 0) {
+ goto fail;
+ }
+
+ g_free(l1_table);
+ l1_table = NULL;
+
+ /*
+ * Increase the refcounts of all clusters and make sure everything is
+ * stable on disk before updating the snapshot table to contain a pointer
+ * to the new L1 table.
+ */
+ ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ ret = bdrv_flush(bs);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Append the new snapshot to the snapshot list */
+ new_snapshot_list = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
+ if (s->snapshots) {
+ memcpy(new_snapshot_list, s->snapshots,
+ s->nb_snapshots * sizeof(QCowSnapshot));
+ old_snapshot_list = s->snapshots;
+ }
+ s->snapshots = new_snapshot_list;
+ s->snapshots[s->nb_snapshots++] = *sn;
+
+ ret = qcow2_write_snapshots(bs);
+ if (ret < 0) {
+ g_free(s->snapshots);
+ s->snapshots = old_snapshot_list;
+ goto fail;
+ }
+
+ g_free(old_snapshot_list);
+
+#ifdef DEBUG_ALLOC
+ {
+ BdrvCheckResult result = {0};
+ qcow2_check_refcounts(bs, &result, 0);
+ }
+#endif
+ return 0;
+
+fail:
+ g_free(sn->id_str);
+ g_free(sn->name);
+ g_free(l1_table);
+
+ return ret;
+}
+
+/* copy the snapshot identified by 'snapshot_id' into the current disk image */
+int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshot *sn;
+ int i, snapshot_index;
+ int cur_l1_bytes, sn_l1_bytes;
+ int ret;
+ uint64_t *sn_l1_table = NULL;
+
+ /* Search the snapshot */
+ snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
+ if (snapshot_index < 0) {
+ return -ENOENT;
+ }
+ sn = &s->snapshots[snapshot_index];
+
+ if (sn->disk_size != bs->total_sectors * BDRV_SECTOR_SIZE) {
+ error_report("qcow2: Loading snapshots with different disk "
+ "size is not implemented");
+ ret = -ENOTSUP;
+ goto fail;
+ }
+
+ /*
+ * Make sure that the current L1 table is big enough to contain the whole
+ * L1 table of the snapshot. If the snapshot L1 table is smaller, the
+ * current one must be padded with zeros.
+ */
+ ret = qcow2_grow_l1_table(bs, sn->l1_size, true);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ cur_l1_bytes = s->l1_size * sizeof(uint64_t);
+ sn_l1_bytes = sn->l1_size * sizeof(uint64_t);
+
+ /*
+ * Copy the snapshot L1 table to the current L1 table.
+ *
+ * Before overwriting the old current L1 table on disk, make sure to
+ * increase all refcounts for the clusters referenced by the new one.
+ * Decrease the refcount referenced by the old one only when the L1
+ * table is overwritten.
+ */
+ sn_l1_table = g_malloc0(cur_l1_bytes);
+
+ ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset,
+ sn->l1_size, 1);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table,
+ cur_l1_bytes);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /*
+ * Decrease refcount of clusters of current L1 table.
+ *
+ * At this point, the in-memory s->l1_table points to the old L1 table,
+ * whereas on disk we already have the new one.
+ *
+ * qcow2_update_snapshot_refcount special cases the current L1 table to use
+ * the in-memory data instead of really using the offset to load a new one,
+ * which is why this works.
+ */
+ ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset,
+ s->l1_size, -1);
+
+ /*
+ * Now update the in-memory L1 table to be in sync with the on-disk one. We
+ * need to do this even if updating refcounts failed.
+ */
+ for(i = 0;i < s->l1_size; i++) {
+ s->l1_table[i] = be64_to_cpu(sn_l1_table[i]);
+ }
+
+ if (ret < 0) {
+ goto fail;
+ }
+
+ g_free(sn_l1_table);
+ sn_l1_table = NULL;
+
+ /*
+ * Update QCOW_OFLAG_COPIED in the active L1 table (it may have changed
+     * when we decreased the refcount of the old snapshot).
+ */
+ ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
+ if (ret < 0) {
+ goto fail;
+ }
+
+#ifdef DEBUG_ALLOC
+ {
+ BdrvCheckResult result = {0};
+ qcow2_check_refcounts(bs, &result, 0);
+ }
+#endif
+ return 0;
+
+fail:
+ g_free(sn_l1_table);
+ return ret;
+}
+
+int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshot sn;
+ int snapshot_index, ret;
+
+ /* Search the snapshot */
+ snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
+ if (snapshot_index < 0) {
+ return -ENOENT;
+ }
+ sn = s->snapshots[snapshot_index];
+
+ /* Remove it from the snapshot list */
+ memmove(s->snapshots + snapshot_index,
+ s->snapshots + snapshot_index + 1,
+ (s->nb_snapshots - snapshot_index - 1) * sizeof(sn));
+ s->nb_snapshots--;
+ ret = qcow2_write_snapshots(bs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /*
+ * The snapshot is now unused, clean up. If we fail after this point, we
+ * won't recover but just leak clusters.
+ */
+ g_free(sn.id_str);
+ g_free(sn.name);
+
+ /*
+ * Now decrease the refcounts of clusters referenced by the snapshot and
+ * free the L1 table.
+ */
+ ret = qcow2_update_snapshot_refcount(bs, sn.l1_table_offset,
+ sn.l1_size, -1);
+ if (ret < 0) {
+ return ret;
+ }
+ qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t));
+
+ /* must update the copied flag on the current cluster offsets */
+ ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+#ifdef DEBUG_ALLOC
+ {
+ BdrvCheckResult result = {0};
+ qcow2_check_refcounts(bs, &result, 0);
+ }
+#endif
+ return 0;
+}
+
+int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
+{
+ BDRVQcowState *s = bs->opaque;
+ QEMUSnapshotInfo *sn_tab, *sn_info;
+ QCowSnapshot *sn;
+ int i;
+
+ if (!s->nb_snapshots) {
+ *psn_tab = NULL;
+ return s->nb_snapshots;
+ }
+
+ sn_tab = g_malloc0(s->nb_snapshots * sizeof(QEMUSnapshotInfo));
+ for(i = 0; i < s->nb_snapshots; i++) {
+ sn_info = sn_tab + i;
+ sn = s->snapshots + i;
+ pstrcpy(sn_info->id_str, sizeof(sn_info->id_str),
+ sn->id_str);
+ pstrcpy(sn_info->name, sizeof(sn_info->name),
+ sn->name);
+ sn_info->vm_state_size = sn->vm_state_size;
+ sn_info->date_sec = sn->date_sec;
+ sn_info->date_nsec = sn->date_nsec;
+ sn_info->vm_clock_nsec = sn->vm_clock_nsec;
+ }
+ *psn_tab = sn_tab;
+ return s->nb_snapshots;
+}
+
+int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name)
+{
+ int i, snapshot_index;
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshot *sn;
+ uint64_t *new_l1_table;
+ int new_l1_bytes;
+ int ret;
+
+ assert(bs->read_only);
+
+ /* Search the snapshot */
+ snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_name);
+ if (snapshot_index < 0) {
+ return -ENOENT;
+ }
+ sn = &s->snapshots[snapshot_index];
+
+ /* Allocate and read in the snapshot's L1 table */
+ new_l1_bytes = s->l1_size * sizeof(uint64_t);
+ new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512));
+
+ ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes);
+ if (ret < 0) {
+ g_free(new_l1_table);
+ return ret;
+ }
+
+ /* Switch the L1 table */
+ g_free(s->l1_table);
+
+ s->l1_size = sn->l1_size;
+ s->l1_table_offset = sn->l1_table_offset;
+ s->l1_table = new_l1_table;
+
+ for(i = 0;i < s->l1_size; i++) {
+ be64_to_cpus(&s->l1_table[i]);
+ }
+
+ return 0;
+}
diff --git a/block/qcow2.c b/block/qcow2.c
new file mode 100644
index 000000000..8f183f146
--- /dev/null
+++ b/block/qcow2.c
@@ -0,0 +1,1719 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include <zlib.h>
+#include "aes.h"
+#include "block/qcow2.h"
+#include "qemu-error.h"
+#include "qerror.h"
+#include "trace.h"
+
+/*
+ Differences with QCOW:
+
+ - Support for multiple incremental snapshots.
+ - Memory management by reference counts.
+ - Clusters which have a reference count of exactly one have the bit
+ QCOW_OFLAG_COPIED set to optimize write performance.
+ - Size of compressed clusters is stored in sectors to reduce bit usage
+ in the cluster offsets.
+ - Support for storing additional data (such as the VM state) in the
+ snapshots.
+ - If a backing store is used, the cluster size is not constrained
+ (could be backported to QCOW).
+ - L2 tables always have a size of one cluster.
+*/
+
+
+typedef struct {
+ uint32_t magic;
+ uint32_t len;
+} QCowExtension;
+#define QCOW2_EXT_MAGIC_END 0
+#define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
+#define QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
+
+static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+ const QCowHeader *cow_header = (const void *)buf;
+
+ if (buf_size >= sizeof(QCowHeader) &&
+ be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
+ be32_to_cpu(cow_header->version) >= 2)
+ return 100;
+ else
+ return 0;
+}
+
+
+/*
+ * Read qcow2 extensions and fill bs.
+ * Start reading from start_offset.
+ * Finish reading upon a magic of value 0 or when end_offset is reached.
+ * Unknown magics are skipped (future extensions this version knows nothing about).
+ * Return 0 upon success, non-0 otherwise.
+ */
+static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
+ uint64_t end_offset, void **p_feature_table)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowExtension ext;
+ uint64_t offset;
+ int ret;
+
+#ifdef DEBUG_EXT
+ printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
+#endif
+ offset = start_offset;
+ while (offset < end_offset) {
+
+#ifdef DEBUG_EXT
+ /* Sanity check */
+ if (offset > s->cluster_size)
+ printf("qcow2_read_extension: suspicious offset %lu\n", offset);
+
+ printf("attempting to read extended header in offset %lu\n", offset);
+#endif
+
+ if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) {
+ fprintf(stderr, "qcow2_read_extension: ERROR: "
+ "pread fail from offset %" PRIu64 "\n",
+ offset);
+ return 1;
+ }
+ be32_to_cpus(&ext.magic);
+ be32_to_cpus(&ext.len);
+ offset += sizeof(ext);
+#ifdef DEBUG_EXT
+ printf("ext.magic = 0x%x\n", ext.magic);
+#endif
+ if (ext.len > end_offset - offset) {
+ error_report("Header extension too large");
+ return -EINVAL;
+ }
+
+ switch (ext.magic) {
+ case QCOW2_EXT_MAGIC_END:
+ return 0;
+
+ case QCOW2_EXT_MAGIC_BACKING_FORMAT:
+ if (ext.len >= sizeof(bs->backing_format)) {
+ fprintf(stderr, "ERROR: ext_backing_format: len=%u too large"
+ " (>=%zu)\n",
+ ext.len, sizeof(bs->backing_format));
+ return 2;
+ }
+ if (bdrv_pread(bs->file, offset , bs->backing_format,
+ ext.len) != ext.len)
+ return 3;
+ bs->backing_format[ext.len] = '\0';
+#ifdef DEBUG_EXT
+ printf("Qcow2: Got format extension %s\n", bs->backing_format);
+#endif
+ break;
+
+ case QCOW2_EXT_MAGIC_FEATURE_TABLE:
+ if (p_feature_table != NULL) {
+ void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
+ ret = bdrv_pread(bs->file, offset , feature_table, ext.len);
+ if (ret < 0) {
+ return ret;
+ }
+
+ *p_feature_table = feature_table;
+ }
+ break;
+
+ default:
+ /* unknown magic - save it in case we need to rewrite the header */
+ {
+ Qcow2UnknownHeaderExtension *uext;
+
+ uext = g_malloc0(sizeof(*uext) + ext.len);
+ uext->magic = ext.magic;
+ uext->len = ext.len;
+ QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);
+
+ ret = bdrv_pread(bs->file, offset , uext->data, uext->len);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+ break;
+ }
+
+ offset += ((ext.len + 7) & ~7);
+ }
+
+ return 0;
+}
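+
+/*
+ * Illustration of the extension layout parsed above: each extension is a
+ * 4-byte big-endian magic, a 4-byte big-endian length and len data bytes,
+ * with the next extension starting at the following 8-byte boundary
+ * ((len + 7) & ~7).  A backing format extension carrying "qcow2" (5 bytes)
+ * thus occupies 8 + 8 = 16 bytes, and a magic of 0 (QCOW2_EXT_MAGIC_END)
+ * terminates the list.
+ */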
+
+static void cleanup_unknown_header_ext(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ Qcow2UnknownHeaderExtension *uext, *next;
+
+ QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) {
+ QLIST_REMOVE(uext, next);
+ g_free(uext);
+ }
+}
+
+static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs,
+ const char *fmt, ...)
+{
+ char msg[64];
+ va_list ap;
+
+ va_start(ap, fmt);
+ vsnprintf(msg, sizeof(msg), fmt, ap);
+ va_end(ap);
+
+ qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+ bs->device_name, "qcow2", msg);
+}
+
+static void report_unsupported_feature(BlockDriverState *bs,
+ Qcow2Feature *table, uint64_t mask)
+{
+ while (table && table->name[0] != '\0') {
+ if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) {
+ if (mask & (1 << table->bit)) {
+ report_unsupported(bs, "%.46s",table->name);
+ mask &= ~(1 << table->bit);
+ }
+ }
+ table++;
+ }
+
+ if (mask) {
+ report_unsupported(bs, "Unknown incompatible feature: %" PRIx64, mask);
+ }
+}
+
+/*
+ * Sets the dirty bit and flushes afterwards if necessary.
+ *
+ * The incompatible_features bit is only set if the image file header was
+ * updated successfully. Therefore it is not required to check the return
+ * value of this function.
+ */
+static int qcow2_mark_dirty(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t val;
+ int ret;
+
+ assert(s->qcow_version >= 3);
+
+ if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
+ return 0; /* already dirty */
+ }
+
+ val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY);
+ ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features),
+ &val, sizeof(val));
+ if (ret < 0) {
+ return ret;
+ }
+ ret = bdrv_flush(bs->file);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Only treat image as dirty if the header was updated successfully */
+ s->incompatible_features |= QCOW2_INCOMPAT_DIRTY;
+ return 0;
+}
+
+/*
+ * Clears the dirty bit, flushing the image to disk first if necessary. Only
+ * call this function when there are no pending requests; it does not guard
+ * against concurrent requests dirtying the image.
+ */
+static int qcow2_mark_clean(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+
+ if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
+ int ret = bdrv_flush(bs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;
+ return qcow2_update_header(bs);
+ }
+ return 0;
+}
+
+static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
+ BdrvCheckMode fix)
+{
+ int ret = qcow2_check_refcounts(bs, result, fix);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (fix && result->check_errors == 0 && result->corruptions == 0) {
+ return qcow2_mark_clean(bs);
+ }
+ return ret;
+}
+
+static int qcow2_open(BlockDriverState *bs, int flags)
+{
+ BDRVQcowState *s = bs->opaque;
+ int len, i, ret = 0;
+ QCowHeader header;
+ uint64_t ext_end;
+
+ ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
+ if (ret < 0) {
+ goto fail;
+ }
+ be32_to_cpus(&header.magic);
+ be32_to_cpus(&header.version);
+ be64_to_cpus(&header.backing_file_offset);
+ be32_to_cpus(&header.backing_file_size);
+ be64_to_cpus(&header.size);
+ be32_to_cpus(&header.cluster_bits);
+ be32_to_cpus(&header.crypt_method);
+ be64_to_cpus(&header.l1_table_offset);
+ be32_to_cpus(&header.l1_size);
+ be64_to_cpus(&header.refcount_table_offset);
+ be32_to_cpus(&header.refcount_table_clusters);
+ be64_to_cpus(&header.snapshots_offset);
+ be32_to_cpus(&header.nb_snapshots);
+
+ if (header.magic != QCOW_MAGIC) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ if (header.version < 2 || header.version > 3) {
+ report_unsupported(bs, "QCOW version %d", header.version);
+ ret = -ENOTSUP;
+ goto fail;
+ }
+
+ s->qcow_version = header.version;
+
+ /* Initialise version 3 header fields */
+ if (header.version == 2) {
+ header.incompatible_features = 0;
+ header.compatible_features = 0;
+ header.autoclear_features = 0;
+ header.refcount_order = 4;
+ header.header_length = 72;
+ } else {
+ be64_to_cpus(&header.incompatible_features);
+ be64_to_cpus(&header.compatible_features);
+ be64_to_cpus(&header.autoclear_features);
+ be32_to_cpus(&header.refcount_order);
+ be32_to_cpus(&header.header_length);
+ }
+
+ if (header.header_length > sizeof(header)) {
+ s->unknown_header_fields_size = header.header_length - sizeof(header);
+ s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
+ ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
+ s->unknown_header_fields_size);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ if (header.backing_file_offset) {
+ ext_end = header.backing_file_offset;
+ } else {
+ ext_end = 1 << header.cluster_bits;
+ }
+
+ /* Handle feature bits */
+ s->incompatible_features = header.incompatible_features;
+ s->compatible_features = header.compatible_features;
+ s->autoclear_features = header.autoclear_features;
+
+ if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
+ void *feature_table = NULL;
+ qcow2_read_extensions(bs, header.header_length, ext_end,
+ &feature_table);
+ report_unsupported_feature(bs, feature_table,
+ s->incompatible_features &
+ ~QCOW2_INCOMPAT_MASK);
+ ret = -ENOTSUP;
+ goto fail;
+ }
+
+ /* Check support for various header values */
+ if (header.refcount_order != 4) {
+ report_unsupported(bs, "%d bit reference counts",
+ 1 << header.refcount_order);
+ ret = -ENOTSUP;
+ goto fail;
+ }
+
+ if (header.cluster_bits < MIN_CLUSTER_BITS ||
+ header.cluster_bits > MAX_CLUSTER_BITS) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ if (header.crypt_method > QCOW_CRYPT_AES) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ s->crypt_method_header = header.crypt_method;
+ if (s->crypt_method_header) {
+ bs->encrypted = 1;
+ }
+ s->cluster_bits = header.cluster_bits;
+ s->cluster_size = 1 << s->cluster_bits;
+ s->cluster_sectors = 1 << (s->cluster_bits - 9);
+ s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
+ s->l2_size = 1 << s->l2_bits;
+ bs->total_sectors = header.size / 512;
+ s->csize_shift = (62 - (s->cluster_bits - 8));
+ s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
+ s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
+ s->refcount_table_offset = header.refcount_table_offset;
+ s->refcount_table_size =
+ header.refcount_table_clusters << (s->cluster_bits - 3);
+
+ s->snapshots_offset = header.snapshots_offset;
+ s->nb_snapshots = header.nb_snapshots;
+
+ /* read the level 1 table */
+ s->l1_size = header.l1_size;
+ s->l1_vm_state_index = size_to_l1(s, header.size);
+ /* the L1 table must contain at least enough entries to cover
+ header.size bytes */
+ if (s->l1_size < s->l1_vm_state_index) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ s->l1_table_offset = header.l1_table_offset;
+ if (s->l1_size > 0) {
+ s->l1_table = g_malloc0(
+ align_offset(s->l1_size * sizeof(uint64_t), 512));
+ ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
+ s->l1_size * sizeof(uint64_t));
+ if (ret < 0) {
+ goto fail;
+ }
+ for(i = 0;i < s->l1_size; i++) {
+ be64_to_cpus(&s->l1_table[i]);
+ }
+ }
+
+ /* alloc L2 table/refcount block cache */
+ s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE);
+ s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE);
+
+ s->cluster_cache = g_malloc(s->cluster_size);
+ /* one more sector for decompressed data alignment */
+ s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
+ + 512);
+ s->cluster_cache_offset = -1;
+ s->flags = flags;
+
+ ret = qcow2_refcount_init(bs);
+ if (ret != 0) {
+ goto fail;
+ }
+
+ QLIST_INIT(&s->cluster_allocs);
+
+ /* read qcow2 extensions */
+ if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ /* read the backing file name */
+ if (header.backing_file_offset != 0) {
+ len = header.backing_file_size;
+ if (len > 1023) {
+ len = 1023;
+ }
+ ret = bdrv_pread(bs->file, header.backing_file_offset,
+ bs->backing_file, len);
+ if (ret < 0) {
+ goto fail;
+ }
+ bs->backing_file[len] = '\0';
+ }
+
+ ret = qcow2_read_snapshots(bs);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Clear unknown autoclear feature bits */
+ if (!bs->read_only && s->autoclear_features != 0) {
+ s->autoclear_features = 0;
+ ret = qcow2_update_header(bs);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ /* Initialise locks */
+ qemu_co_mutex_init(&s->lock);
+
+ /* Repair image if dirty */
+ if (!(flags & BDRV_O_CHECK) && !bs->read_only &&
+ (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
+ BdrvCheckResult result = {0};
+
+ ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+#ifdef DEBUG_ALLOC
+ {
+ BdrvCheckResult result = {0};
+ qcow2_check_refcounts(bs, &result, 0);
+ }
+#endif
+ return ret;
+
+ fail:
+ g_free(s->unknown_header_fields);
+ cleanup_unknown_header_ext(bs);
+ qcow2_free_snapshots(bs);
+ qcow2_refcount_close(bs);
+ g_free(s->l1_table);
+ if (s->l2_table_cache) {
+ qcow2_cache_destroy(bs, s->l2_table_cache);
+ }
+ g_free(s->cluster_cache);
+ qemu_vfree(s->cluster_data);
+ return ret;
+}
+
+static int qcow2_set_key(BlockDriverState *bs, const char *key)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint8_t keybuf[16];
+ int len, i;
+
+ memset(keybuf, 0, 16);
+ len = strlen(key);
+ if (len > 16)
+ len = 16;
+ /* XXX: we could compress the chars to 7 bits to increase
+ entropy */
+ for(i = 0;i < len;i++) {
+ keybuf[i] = key[i];
+ }
+ s->crypt_method = s->crypt_method_header;
+
+ if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
+ return -1;
+ if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
+ return -1;
+#if 0
+ /* test */
+ {
+ uint8_t in[16];
+ uint8_t out[16];
+ uint8_t tmp[16];
+ for(i=0;i<16;i++)
+ in[i] = i;
+ AES_encrypt(in, tmp, &s->aes_encrypt_key);
+ AES_decrypt(tmp, out, &s->aes_decrypt_key);
+ for(i = 0; i < 16; i++)
+ printf(" %02x", tmp[i]);
+ printf("\n");
+ for(i = 0; i < 16; i++)
+ printf(" %02x", out[i]);
+ printf("\n");
+ }
+#endif
+ return 0;
+}
+
+static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, int *pnum)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t cluster_offset;
+ int ret;
+
+ *pnum = nb_sectors;
+ /* FIXME We can get errors here, but the bdrv_co_is_allocated interface
+ * can't pass them on today */
+ qemu_co_mutex_lock(&s->lock);
+ ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset);
+ qemu_co_mutex_unlock(&s->lock);
+ if (ret < 0) {
+ *pnum = 0;
+ }
+
+ return (cluster_offset != 0);
+}
+
+/* handle reading after the end of the backing file */
+int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
+ int64_t sector_num, int nb_sectors)
+{
+ int n1;
+ if ((sector_num + nb_sectors) <= bs->total_sectors)
+ return nb_sectors;
+ if (sector_num >= bs->total_sectors)
+ n1 = 0;
+ else
+ n1 = bs->total_sectors - sector_num;
+
+ qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1));
+
+ return n1;
+}
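+
+/*
+ * Example: with a 100-sector backing file and a read of sectors 90..109
+ * (nb_sectors = 20), n1 is 10; the last 10 sectors of the iovec are
+ * zero-filled here and the caller reads only the first 10 sectors from the
+ * backing file.
+ */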
+
+static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
+ int remaining_sectors, QEMUIOVector *qiov)
+{
+ BDRVQcowState *s = bs->opaque;
+ int index_in_cluster, n1;
+ int ret;
+ int cur_nr_sectors; /* number of sectors in current iteration */
+ uint64_t cluster_offset = 0;
+ uint64_t bytes_done = 0;
+ QEMUIOVector hd_qiov;
+ uint8_t *cluster_data = NULL;
+
+ qemu_iovec_init(&hd_qiov, qiov->niov);
+
+ qemu_co_mutex_lock(&s->lock);
+
+ while (remaining_sectors != 0) {
+
+ /* prepare next request */
+ cur_nr_sectors = remaining_sectors;
+ if (s->crypt_method) {
+ cur_nr_sectors = MIN(cur_nr_sectors,
+ QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
+ }
+
+ ret = qcow2_get_cluster_offset(bs, sector_num << 9,
+ &cur_nr_sectors, &cluster_offset);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ index_in_cluster = sector_num & (s->cluster_sectors - 1);
+
+ qemu_iovec_reset(&hd_qiov);
+ qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
+ cur_nr_sectors * 512);
+
+ switch (ret) {
+ case QCOW2_CLUSTER_UNALLOCATED:
+
+ if (bs->backing_hd) {
+ /* read from the base image */
+ n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov,
+ sector_num, cur_nr_sectors);
+ if (n1 > 0) {
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
+ qemu_co_mutex_unlock(&s->lock);
+ ret = bdrv_co_readv(bs->backing_hd, sector_num,
+ n1, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+ } else {
+ /* Note: in this case, no need to wait */
+ qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
+ }
+ break;
+
+ case QCOW2_CLUSTER_ZERO:
+ if (s->qcow_version < 3) {
+ ret = -EIO;
+ goto fail;
+ }
+ qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
+ break;
+
+ case QCOW2_CLUSTER_COMPRESSED:
+ /* add AIO support for compressed blocks ? */
+ ret = qcow2_decompress_cluster(bs, cluster_offset);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ qemu_iovec_from_buf(&hd_qiov, 0,
+ s->cluster_cache + index_in_cluster * 512,
+ 512 * cur_nr_sectors);
+ break;
+
+ case QCOW2_CLUSTER_NORMAL:
+ if ((cluster_offset & 511) != 0) {
+ ret = -EIO;
+ goto fail;
+ }
+
+ if (s->crypt_method) {
+ /*
+ * For encrypted images, read everything into a temporary
+ * contiguous buffer on which the AES functions can work.
+ */
+ if (!cluster_data) {
+ cluster_data =
+ qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+ }
+
+ assert(cur_nr_sectors <=
+ QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
+ qemu_iovec_reset(&hd_qiov);
+ qemu_iovec_add(&hd_qiov, cluster_data,
+ 512 * cur_nr_sectors);
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
+ qemu_co_mutex_unlock(&s->lock);
+ ret = bdrv_co_readv(bs->file,
+ (cluster_offset >> 9) + index_in_cluster,
+ cur_nr_sectors, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ goto fail;
+ }
+ if (s->crypt_method) {
+ qcow2_encrypt_sectors(s, sector_num, cluster_data,
+ cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key);
+ qemu_iovec_from_buf(qiov, bytes_done,
+ cluster_data, 512 * cur_nr_sectors);
+ }
+ break;
+
+ default:
+ g_assert_not_reached();
+ ret = -EIO;
+ goto fail;
+ }
+
+ remaining_sectors -= cur_nr_sectors;
+ sector_num += cur_nr_sectors;
+ bytes_done += cur_nr_sectors * 512;
+ }
+ ret = 0;
+
+fail:
+ qemu_co_mutex_unlock(&s->lock);
+
+ qemu_iovec_destroy(&hd_qiov);
+ qemu_vfree(cluster_data);
+
+ return ret;
+}
+
+static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m)
+{
+ /* Take the request off the list of running requests */
+ if (m->nb_clusters != 0) {
+ QLIST_REMOVE(m, next_in_flight);
+ }
+
+ /* Restart all dependent requests */
+ if (!qemu_co_queue_empty(&m->dependent_requests)) {
+ qemu_co_mutex_unlock(&s->lock);
+ qemu_co_queue_restart_all(&m->dependent_requests);
+ qemu_co_mutex_lock(&s->lock);
+ }
+}
+
+static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
+ int64_t sector_num,
+ int remaining_sectors,
+ QEMUIOVector *qiov)
+{
+ BDRVQcowState *s = bs->opaque;
+ int index_in_cluster;
+ int n_end;
+ int ret;
+ int cur_nr_sectors; /* number of sectors in current iteration */
+ uint64_t cluster_offset;
+ QEMUIOVector hd_qiov;
+ uint64_t bytes_done = 0;
+ uint8_t *cluster_data = NULL;
+ QCowL2Meta l2meta = {
+ .nb_clusters = 0,
+ };
+
+ trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
+ remaining_sectors);
+
+ qemu_co_queue_init(&l2meta.dependent_requests);
+
+ qemu_iovec_init(&hd_qiov, qiov->niov);
+
+ s->cluster_cache_offset = -1; /* disable compressed cache */
+
+ qemu_co_mutex_lock(&s->lock);
+
+ while (remaining_sectors != 0) {
+
+ trace_qcow2_writev_start_part(qemu_coroutine_self());
+ index_in_cluster = sector_num & (s->cluster_sectors - 1);
+ n_end = index_in_cluster + remaining_sectors;
+ if (s->crypt_method &&
+ n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) {
+ n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
+ }
+
+ ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
+ index_in_cluster, n_end, &cur_nr_sectors, &l2meta);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ if (l2meta.nb_clusters > 0 &&
+ (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)) {
+ qcow2_mark_dirty(bs);
+ }
+
+ cluster_offset = l2meta.cluster_offset;
+ assert((cluster_offset & 511) == 0);
+
+ qemu_iovec_reset(&hd_qiov);
+ qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
+ cur_nr_sectors * 512);
+
+ if (s->crypt_method) {
+ if (!cluster_data) {
+ cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS *
+ s->cluster_size);
+ }
+
+ assert(hd_qiov.size <=
+ QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+ qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);
+
+ qcow2_encrypt_sectors(s, sector_num, cluster_data,
+ cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key);
+
+ qemu_iovec_reset(&hd_qiov);
+ qemu_iovec_add(&hd_qiov, cluster_data,
+ cur_nr_sectors * 512);
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
+ qemu_co_mutex_unlock(&s->lock);
+ trace_qcow2_writev_data(qemu_coroutine_self(),
+ (cluster_offset >> 9) + index_in_cluster);
+ ret = bdrv_co_writev(bs->file,
+ (cluster_offset >> 9) + index_in_cluster,
+ cur_nr_sectors, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ ret = qcow2_alloc_cluster_link_l2(bs, &l2meta);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ run_dependent_requests(s, &l2meta);
+
+ remaining_sectors -= cur_nr_sectors;
+ sector_num += cur_nr_sectors;
+ bytes_done += cur_nr_sectors * 512;
+ trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors);
+ }
+ ret = 0;
+
+fail:
+ run_dependent_requests(s, &l2meta);
+
+ qemu_co_mutex_unlock(&s->lock);
+
+ qemu_iovec_destroy(&hd_qiov);
+ qemu_vfree(cluster_data);
+ trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
+
+ return ret;
+}
+
+static void qcow2_close(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ g_free(s->l1_table);
+
+ qcow2_cache_flush(bs, s->l2_table_cache);
+ qcow2_cache_flush(bs, s->refcount_block_cache);
+
+ qcow2_mark_clean(bs);
+
+ qcow2_cache_destroy(bs, s->l2_table_cache);
+ qcow2_cache_destroy(bs, s->refcount_block_cache);
+
+ g_free(s->unknown_header_fields);
+ cleanup_unknown_header_ext(bs);
+
+ g_free(s->cluster_cache);
+ qemu_vfree(s->cluster_data);
+ qcow2_refcount_close(bs);
+ qcow2_free_snapshots(bs);
+}
+
+static void qcow2_invalidate_cache(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ int flags = s->flags;
+ AES_KEY aes_encrypt_key;
+ AES_KEY aes_decrypt_key;
+ uint32_t crypt_method = 0;
+
+ /*
+ * Backing files are read-only, which makes all of their metadata immutable;
+ * that means we don't have to worry about reopening them here.
+ */
+
+ if (s->crypt_method) {
+ crypt_method = s->crypt_method;
+ memcpy(&aes_encrypt_key, &s->aes_encrypt_key, sizeof(aes_encrypt_key));
+ memcpy(&aes_decrypt_key, &s->aes_decrypt_key, sizeof(aes_decrypt_key));
+ }
+
+ qcow2_close(bs);
+
+ memset(s, 0, sizeof(BDRVQcowState));
+ qcow2_open(bs, flags);
+
+ if (crypt_method) {
+ s->crypt_method = crypt_method;
+ memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key));
+ memcpy(&s->aes_decrypt_key, &aes_decrypt_key, sizeof(aes_decrypt_key));
+ }
+}
+
+static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
+ size_t len, size_t buflen)
+{
+ QCowExtension *ext_backing_fmt = (QCowExtension*) buf;
+ size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7);
+
+ if (buflen < ext_len) {
+ return -ENOSPC;
+ }
+
+ *ext_backing_fmt = (QCowExtension) {
+ .magic = cpu_to_be32(magic),
+ .len = cpu_to_be32(len),
+ };
+ memcpy(buf + sizeof(QCowExtension), s, len);
+
+ return ext_len;
+}
+
+/*
+ * Updates the qcow2 header, including the variable length parts of it, i.e.
+ * the backing file name and all extensions. qcow2 was not designed to allow
+ * such changes, so if we run out of space (we can only use the first cluster)
+ * this function may fail.
+ *
+ * Returns 0 on success, -errno in error cases.
+ */
+int qcow2_update_header(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowHeader *header;
+ char *buf;
+ size_t buflen = s->cluster_size;
+ int ret;
+ uint64_t total_size;
+ uint32_t refcount_table_clusters;
+ size_t header_length;
+ Qcow2UnknownHeaderExtension *uext;
+
+ buf = qemu_blockalign(bs, buflen);
+
+ /* Header structure */
+ header = (QCowHeader*) buf;
+
+ if (buflen < sizeof(*header)) {
+ ret = -ENOSPC;
+ goto fail;
+ }
+
+ header_length = sizeof(*header) + s->unknown_header_fields_size;
+ total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
+ refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
+
+ *header = (QCowHeader) {
+ /* Version 2 fields */
+ .magic = cpu_to_be32(QCOW_MAGIC),
+ .version = cpu_to_be32(s->qcow_version),
+ .backing_file_offset = 0,
+ .backing_file_size = 0,
+ .cluster_bits = cpu_to_be32(s->cluster_bits),
+ .size = cpu_to_be64(total_size),
+ .crypt_method = cpu_to_be32(s->crypt_method_header),
+ .l1_size = cpu_to_be32(s->l1_size),
+ .l1_table_offset = cpu_to_be64(s->l1_table_offset),
+ .refcount_table_offset = cpu_to_be64(s->refcount_table_offset),
+ .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
+ .nb_snapshots = cpu_to_be32(s->nb_snapshots),
+ .snapshots_offset = cpu_to_be64(s->snapshots_offset),
+
+ /* Version 3 fields */
+ .incompatible_features = cpu_to_be64(s->incompatible_features),
+ .compatible_features = cpu_to_be64(s->compatible_features),
+ .autoclear_features = cpu_to_be64(s->autoclear_features),
+ .refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT),
+ .header_length = cpu_to_be32(header_length),
+ };
+
+ /* For older versions, write a shorter header */
+ switch (s->qcow_version) {
+ case 2:
+ ret = offsetof(QCowHeader, incompatible_features);
+ break;
+ case 3:
+ ret = sizeof(*header);
+ break;
+ default:
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ buf += ret;
+ buflen -= ret;
+ memset(buf, 0, buflen);
+
+ /* Preserve any unknown field in the header */
+ if (s->unknown_header_fields_size) {
+ if (buflen < s->unknown_header_fields_size) {
+ ret = -ENOSPC;
+ goto fail;
+ }
+
+ memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
+ buf += s->unknown_header_fields_size;
+ buflen -= s->unknown_header_fields_size;
+ }
+
+ /* Backing file format header extension */
+ if (*bs->backing_format) {
+ ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
+ bs->backing_format, strlen(bs->backing_format),
+ buflen);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ buf += ret;
+ buflen -= ret;
+ }
+
+ /* Feature table */
+ Qcow2Feature features[] = {
+ {
+ .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
+ .bit = QCOW2_INCOMPAT_DIRTY_BITNR,
+ .name = "dirty bit",
+ },
+ {
+ .type = QCOW2_FEAT_TYPE_COMPATIBLE,
+ .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
+ .name = "lazy refcounts",
+ },
+ };
+
+ ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
+ features, sizeof(features), buflen);
+ if (ret < 0) {
+ goto fail;
+ }
+ buf += ret;
+ buflen -= ret;
+
+ /* Keep unknown header extensions */
+ QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
+ ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ buf += ret;
+ buflen -= ret;
+ }
+
+ /* End of header extensions */
+ ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ buf += ret;
+ buflen -= ret;
+
+ /* Backing file name */
+ if (*bs->backing_file) {
+ size_t backing_file_len = strlen(bs->backing_file);
+
+ if (buflen < backing_file_len) {
+ ret = -ENOSPC;
+ goto fail;
+ }
+
+ strncpy(buf, bs->backing_file, buflen);
+
+ header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
+ header->backing_file_size = cpu_to_be32(backing_file_len);
+ }
+
+ /* Write the new header */
+ ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ ret = 0;
+fail:
+ qemu_vfree(header);
+ return ret;
+}
+
+static int qcow2_change_backing_file(BlockDriverState *bs,
+ const char *backing_file, const char *backing_fmt)
+{
+ pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
+ pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
+
+ return qcow2_update_header(bs);
+}
+
+static int preallocate(BlockDriverState *bs)
+{
+ uint64_t nb_sectors;
+ uint64_t offset;
+ int num;
+ int ret;
+ QCowL2Meta meta;
+
+ nb_sectors = bdrv_getlength(bs) >> 9;
+ offset = 0;
+ qemu_co_queue_init(&meta.dependent_requests);
+ meta.cluster_offset = 0;
+
+ while (nb_sectors) {
+ num = MIN(nb_sectors, INT_MAX >> 9);
+ ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, &meta);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = qcow2_alloc_cluster_link_l2(bs, &meta);
+ if (ret < 0) {
+ qcow2_free_any_clusters(bs, meta.cluster_offset, meta.nb_clusters);
+ return ret;
+ }
+
+ /* There are no dependent requests, but we need to remove our request
+ * from the list of in-flight requests */
+ run_dependent_requests(bs->opaque, &meta);
+
+ /* TODO Preallocate data if requested */
+
+ nb_sectors -= num;
+ offset += num << 9;
+ }
+
+ /*
+ * It is expected that the image file is large enough to actually contain
+ * all of the allocated clusters (otherwise we get failing reads after
+ * EOF). Extend the image to the last allocated sector.
+ */
+ if (meta.cluster_offset != 0) {
+ uint8_t buf[512];
+ memset(buf, 0, 512);
+ ret = bdrv_write(bs->file, (meta.cluster_offset >> 9) + num - 1, buf, 1);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int qcow2_create2(const char *filename, int64_t total_size,
+ const char *backing_file, const char *backing_format,
+ int flags, size_t cluster_size, int prealloc,
+ QEMUOptionParameter *options, int version)
+{
+ /* Calculate cluster_bits */
+ int cluster_bits;
+ cluster_bits = ffs(cluster_size) - 1;
+ if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
+ (1 << cluster_bits) != cluster_size)
+ {
+ error_report(
+ "Cluster size must be a power of two between %d and %dk",
+ 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
+ return -EINVAL;
+ }
+
+ /*
+ * Open the image file and write a minimal qcow2 header.
+ *
+ * We keep things simple and start with a zero-sized image. We also
+ * do without refcount blocks or an L1 table for now. We'll fix the
+ * inconsistency later.
+ *
+ * We do need a refcount table because growing the refcount table means
+ * allocating two new refcount blocks - the second of which would be at
+ * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
+ * size for any qcow2 image.
+ */
+ BlockDriverState* bs;
+ QCowHeader header;
+ uint8_t* refcount_table;
+ int ret;
+
+ ret = bdrv_create_file(filename, options);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Write the header */
+ memset(&header, 0, sizeof(header));
+ header.magic = cpu_to_be32(QCOW_MAGIC);
+ header.version = cpu_to_be32(version);
+ header.cluster_bits = cpu_to_be32(cluster_bits);
+ header.size = cpu_to_be64(0);
+ header.l1_table_offset = cpu_to_be64(0);
+ header.l1_size = cpu_to_be32(0);
+ header.refcount_table_offset = cpu_to_be64(cluster_size);
+ header.refcount_table_clusters = cpu_to_be32(1);
+ header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT);
+ header.header_length = cpu_to_be32(sizeof(header));
+
+ if (flags & BLOCK_FLAG_ENCRYPT) {
+ header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
+ } else {
+ header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+ }
+
+ if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) {
+ header.compatible_features |=
+ cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
+ }
+
+ ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
+ if (ret < 0) {
+ goto out;
+ }
+
+ /* Write an empty refcount table */
+ refcount_table = g_malloc0(cluster_size);
+ ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size);
+ g_free(refcount_table);
+
+ if (ret < 0) {
+ goto out;
+ }
+
+ bdrv_close(bs);
+
+ /*
+ * And now open the image and make it consistent first (i.e. increase the
+ * refcount of the clusters that are occupied by the header and the refcount
+ * table)
+ */
+ BlockDriver* drv = bdrv_find_format("qcow2");
+ assert(drv != NULL);
+ ret = bdrv_open(bs, filename,
+ BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = qcow2_alloc_clusters(bs, 2 * cluster_size);
+ if (ret < 0) {
+ goto out;
+
+ } else if (ret != 0) {
+ error_report("Huh, first cluster in empty image is already in use?");
+ abort();
+ }
+
+ /* Okay, now that we have a valid image, let's give it the right size */
+ ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE);
+ if (ret < 0) {
+ goto out;
+ }
+
+ /* Want a backing file? There you go. */
+ if (backing_file) {
+ ret = bdrv_change_backing_file(bs, backing_file, backing_format);
+ if (ret < 0) {
+ goto out;
+ }
+ }
+
+ /* And if we're supposed to preallocate metadata, do that now */
+ if (prealloc) {
+ BDRVQcowState *s = bs->opaque;
+ qemu_co_mutex_lock(&s->lock);
+ ret = preallocate(bs);
+ qemu_co_mutex_unlock(&s->lock);
+ if (ret < 0) {
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ bdrv_delete(bs);
+ return ret;
+}
+
+static int qcow2_create(const char *filename, QEMUOptionParameter *options)
+{
+ const char *backing_file = NULL;
+ const char *backing_fmt = NULL;
+ uint64_t sectors = 0;
+ int flags = 0;
+ size_t cluster_size = DEFAULT_CLUSTER_SIZE;
+ int prealloc = 0;
+ int version = 2;
+
+ /* Read out options */
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ sectors = options->value.n / 512;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+ backing_file = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
+ backing_fmt = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) {
+ flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
+ } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
+ if (options->value.n) {
+ cluster_size = options->value.n;
+ }
+ } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
+ if (!options->value.s || !strcmp(options->value.s, "off")) {
+ prealloc = 0;
+ } else if (!strcmp(options->value.s, "metadata")) {
+ prealloc = 1;
+ } else {
+ fprintf(stderr, "Invalid preallocation mode: '%s'\n",
+ options->value.s);
+ return -EINVAL;
+ }
+ } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) {
+ if (!options->value.s || !strcmp(options->value.s, "0.10")) {
+ version = 2;
+ } else if (!strcmp(options->value.s, "1.1")) {
+ version = 3;
+ } else {
+ fprintf(stderr, "Invalid compatibility level: '%s'\n",
+ options->value.s);
+ return -EINVAL;
+ }
+ } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
+ flags |= options->value.n ? BLOCK_FLAG_LAZY_REFCOUNTS : 0;
+ }
+ options++;
+ }
+
+ if (backing_file && prealloc) {
+ fprintf(stderr, "Backing file and preallocation cannot be used at "
+ "the same time\n");
+ return -EINVAL;
+ }
+
+ if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) {
+ fprintf(stderr, "Lazy refcounts only supported with compatibility "
+ "level 1.1 and above (use compat=1.1 or greater)\n");
+ return -EINVAL;
+ }
+
+ return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags,
+ cluster_size, prealloc, options, version);
+}
+
+static int qcow2_make_empty(BlockDriverState *bs)
+{
+#if 0
+ /* XXX: not correct */
+ BDRVQcowState *s = bs->opaque;
+ uint32_t l1_length = s->l1_size * sizeof(uint64_t);
+ int ret;
+
+ memset(s->l1_table, 0, l1_length);
+ if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0)
+ return -1;
+ ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
+ if (ret < 0)
+ return ret;
+
+ l2_cache_reset(bs);
+#endif
+ return 0;
+}
+
+static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+ int ret;
+ BDRVQcowState *s = bs->opaque;
+
+ /* Emulate misaligned zero writes */
+ if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) {
+ return -ENOTSUP;
+ }
+
+ /* Whatever is left can use real zero clusters */
+ qemu_co_mutex_lock(&s->lock);
+ ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS,
+ nb_sectors);
+ qemu_co_mutex_unlock(&s->lock);
+
+ return ret;
+}
+
+static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+ int ret;
+ BDRVQcowState *s = bs->opaque;
+
+ qemu_co_mutex_lock(&s->lock);
+ ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS,
+ nb_sectors);
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+}
+
+static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret, new_l1_size;
+
+ if (offset & 511) {
+ error_report("The new size must be a multiple of 512");
+ return -EINVAL;
+ }
+
+ /* cannot proceed if image has snapshots */
+ if (s->nb_snapshots) {
+ error_report("Can't resize an image which has snapshots");
+ return -ENOTSUP;
+ }
+
+ /* shrinking is currently not supported */
+ if (offset < bs->total_sectors * 512) {
+ error_report("qcow2 doesn't support shrinking images yet");
+ return -ENOTSUP;
+ }
+
+ new_l1_size = size_to_l1(s, offset);
+ ret = qcow2_grow_l1_table(bs, new_l1_size, true);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* write updated header.size */
+ offset = cpu_to_be64(offset);
+ ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
+ &offset, sizeof(uint64_t));
+ if (ret < 0) {
+ return ret;
+ }
+
+ s->l1_vm_state_index = new_l1_size;
+ return 0;
+}
+
+/* XXX: put compressed sectors first, then all the cluster aligned
+ tables to avoid losing bytes in alignment */
+static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ BDRVQcowState *s = bs->opaque;
+ z_stream strm;
+ int ret, out_len;
+ uint8_t *out_buf;
+ uint64_t cluster_offset;
+
+ if (nb_sectors == 0) {
+ /* align end of file to a sector boundary to ease reading with
+ sector based I/Os */
+ cluster_offset = bdrv_getlength(bs->file);
+ cluster_offset = (cluster_offset + 511) & ~511;
+ bdrv_truncate(bs->file, cluster_offset);
+ return 0;
+ }
+
+ if (nb_sectors != s->cluster_sectors)
+ return -EINVAL;
+
+ out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+
+ /* best compression, small window, no zlib header */
+ memset(&strm, 0, sizeof(strm));
+ ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+ Z_DEFLATED, -12,
+ 9, Z_DEFAULT_STRATEGY);
+ if (ret != 0) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ strm.avail_in = s->cluster_size;
+ strm.next_in = (uint8_t *)buf;
+ strm.avail_out = s->cluster_size;
+ strm.next_out = out_buf;
+
+ ret = deflate(&strm, Z_FINISH);
+ if (ret != Z_STREAM_END && ret != Z_OK) {
+ deflateEnd(&strm);
+ ret = -EINVAL;
+ goto fail;
+ }
+ out_len = strm.next_out - out_buf;
+
+ deflateEnd(&strm);
+
+ if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
+ /* could not compress: write normal cluster */
+ ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
+ if (ret < 0) {
+ goto fail;
+ }
+ } else {
+ cluster_offset = qcow2_alloc_compressed_cluster_offset(bs,
+ sector_num << 9, out_len);
+ if (!cluster_offset) {
+ ret = -EIO;
+ goto fail;
+ }
+ cluster_offset &= s->cluster_offset_mask;
+ BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
+ ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ ret = 0;
+fail:
+ g_free(out_buf);
+ return ret;
+}
+
+static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret;
+
+ qemu_co_mutex_lock(&s->lock);
+ ret = qcow2_cache_flush(bs, s->l2_table_cache);
+ if (ret < 0) {
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+ }
+
+ if (qcow2_need_accurate_refcounts(s)) {
+ ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+ if (ret < 0) {
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+ }
+ }
+ qemu_co_mutex_unlock(&s->lock);
+
+ return 0;
+}
+
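+/*
+ * The VM state is stored just past the guest-visible disk area, rounded up to
+ * a whole L1 entry.  For example, with the default 64 KiB clusters the shift
+ * below is 16 + 13 = 29, so a 10 GiB disk (l1_vm_state_index = 20) puts the
+ * VM state at offset 20 << 29 = 10 GiB.
+ */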
+static int64_t qcow2_vm_state_offset(BDRVQcowState *s)
+{
+ return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits);
+}
+
+static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+ BDRVQcowState *s = bs->opaque;
+ bdi->cluster_size = s->cluster_size;
+ bdi->vm_state_offset = qcow2_vm_state_offset(s);
+ return 0;
+}
+
+#if 0
+static void dump_refcounts(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t nb_clusters, k, k1, size;
+ int refcount;
+
+ size = bdrv_getlength(bs->file);
+ nb_clusters = size_to_clusters(s, size);
+ for(k = 0; k < nb_clusters;) {
+ k1 = k;
+ refcount = get_refcount(bs, k);
+ k++;
+ while (k < nb_clusters && get_refcount(bs, k) == refcount)
+ k++;
+ printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount,
+ k - k1);
+ }
+}
+#endif
+
+static int qcow2_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
+ int64_t pos, int size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int growable = bs->growable;
+ int ret;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
+ bs->growable = 1;
+ ret = bdrv_pwrite(bs, qcow2_vm_state_offset(s) + pos, buf, size);
+ bs->growable = growable;
+
+ return ret;
+}
+
+static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
+ int64_t pos, int size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int growable = bs->growable;
+ int ret;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
+ bs->growable = 1;
+ ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size);
+ bs->growable = growable;
+
+ return ret;
+}
+
+static QEMUOptionParameter qcow2_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ {
+ .name = BLOCK_OPT_COMPAT_LEVEL,
+ .type = OPT_STRING,
+ .help = "Compatibility level (0.10 or 1.1)"
+ },
+ {
+ .name = BLOCK_OPT_BACKING_FILE,
+ .type = OPT_STRING,
+ .help = "File name of a base image"
+ },
+ {
+ .name = BLOCK_OPT_BACKING_FMT,
+ .type = OPT_STRING,
+ .help = "Image format of the base image"
+ },
+ {
+ .name = BLOCK_OPT_ENCRYPT,
+ .type = OPT_FLAG,
+ .help = "Encrypt the image"
+ },
+ {
+ .name = BLOCK_OPT_CLUSTER_SIZE,
+ .type = OPT_SIZE,
+ .help = "qcow2 cluster size",
+ .value = { .n = DEFAULT_CLUSTER_SIZE },
+ },
+ {
+ .name = BLOCK_OPT_PREALLOC,
+ .type = OPT_STRING,
+ .help = "Preallocation mode (allowed values: off, metadata)"
+ },
+ {
+ .name = BLOCK_OPT_LAZY_REFCOUNTS,
+ .type = OPT_FLAG,
+ .help = "Postpone refcount updates",
+ },
+ { NULL }
+};
+
+static BlockDriver bdrv_qcow2 = {
+ .format_name = "qcow2",
+ .instance_size = sizeof(BDRVQcowState),
+ .bdrv_probe = qcow2_probe,
+ .bdrv_open = qcow2_open,
+ .bdrv_close = qcow2_close,
+ .bdrv_create = qcow2_create,
+ .bdrv_co_is_allocated = qcow2_co_is_allocated,
+ .bdrv_set_key = qcow2_set_key,
+ .bdrv_make_empty = qcow2_make_empty,
+
+ .bdrv_co_readv = qcow2_co_readv,
+ .bdrv_co_writev = qcow2_co_writev,
+ .bdrv_co_flush_to_os = qcow2_co_flush_to_os,
+
+ .bdrv_co_write_zeroes = qcow2_co_write_zeroes,
+ .bdrv_co_discard = qcow2_co_discard,
+ .bdrv_truncate = qcow2_truncate,
+ .bdrv_write_compressed = qcow2_write_compressed,
+
+ .bdrv_snapshot_create = qcow2_snapshot_create,
+ .bdrv_snapshot_goto = qcow2_snapshot_goto,
+ .bdrv_snapshot_delete = qcow2_snapshot_delete,
+ .bdrv_snapshot_list = qcow2_snapshot_list,
+ .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
+ .bdrv_get_info = qcow2_get_info,
+
+ .bdrv_save_vmstate = qcow2_save_vmstate,
+ .bdrv_load_vmstate = qcow2_load_vmstate,
+
+ .bdrv_change_backing_file = qcow2_change_backing_file,
+
+ .bdrv_invalidate_cache = qcow2_invalidate_cache,
+
+ .create_options = qcow2_create_options,
+ .bdrv_check = qcow2_check,
+};
+
+static void bdrv_qcow2_init(void)
+{
+ bdrv_register(&bdrv_qcow2);
+}
+
+block_init(bdrv_qcow2_init);
diff --git a/block/qcow2.h b/block/qcow2.h
new file mode 100644
index 000000000..b4eb65470
--- /dev/null
+++ b/block/qcow2.h
@@ -0,0 +1,336 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef BLOCK_QCOW2_H
+#define BLOCK_QCOW2_H
+
+#include "aes.h"
+#include "qemu-coroutine.h"
+
+//#define DEBUG_ALLOC
+//#define DEBUG_ALLOC2
+//#define DEBUG_EXT
+
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+
+#define QCOW_CRYPT_NONE 0
+#define QCOW_CRYPT_AES 1
+
+#define QCOW_MAX_CRYPT_CLUSTERS 32
+
+/* indicate that the refcount of the referenced cluster is exactly one. */
+#define QCOW_OFLAG_COPIED (1LL << 63)
+/* indicate that the cluster is compressed (they never have the copied flag) */
+#define QCOW_OFLAG_COMPRESSED (1LL << 62)
+/* The cluster reads as all zeros */
+#define QCOW_OFLAG_ZERO (1LL << 0)
+
+#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
+
+#define MIN_CLUSTER_BITS 9
+#define MAX_CLUSTER_BITS 21
+
+#define L2_CACHE_SIZE 16
+
+/* Must be at least 4 to cover all cases of refcount table growth */
+#define REFCOUNT_CACHE_SIZE 4
+
+#define DEFAULT_CLUSTER_SIZE 65536
+
+typedef struct QCowHeader {
+ uint32_t magic;
+ uint32_t version;
+ uint64_t backing_file_offset;
+ uint32_t backing_file_size;
+ uint32_t cluster_bits;
+ uint64_t size; /* in bytes */
+ uint32_t crypt_method;
+ uint32_t l1_size; /* XXX: save number of clusters instead ? */
+ uint64_t l1_table_offset;
+ uint64_t refcount_table_offset;
+ uint32_t refcount_table_clusters;
+ uint32_t nb_snapshots;
+ uint64_t snapshots_offset;
+
+ /* The following fields are only valid for version >= 3 */
+ uint64_t incompatible_features;
+ uint64_t compatible_features;
+ uint64_t autoclear_features;
+
+ uint32_t refcount_order;
+ uint32_t header_length;
+} QCowHeader;
+
+typedef struct QCowSnapshot {
+ uint64_t l1_table_offset;
+ uint32_t l1_size;
+ char *id_str;
+ char *name;
+ uint64_t disk_size;
+ uint64_t vm_state_size;
+ uint32_t date_sec;
+ uint32_t date_nsec;
+ uint64_t vm_clock_nsec;
+} QCowSnapshot;
+
+struct Qcow2Cache;
+typedef struct Qcow2Cache Qcow2Cache;
+
+typedef struct Qcow2UnknownHeaderExtension {
+ uint32_t magic;
+ uint32_t len;
+ QLIST_ENTRY(Qcow2UnknownHeaderExtension) next;
+ uint8_t data[];
+} Qcow2UnknownHeaderExtension;
+
+enum {
+ QCOW2_FEAT_TYPE_INCOMPATIBLE = 0,
+ QCOW2_FEAT_TYPE_COMPATIBLE = 1,
+ QCOW2_FEAT_TYPE_AUTOCLEAR = 2,
+};
+
+/* Incompatible feature bits */
+enum {
+ QCOW2_INCOMPAT_DIRTY_BITNR = 0,
+ QCOW2_INCOMPAT_DIRTY = 1 << QCOW2_INCOMPAT_DIRTY_BITNR,
+
+ QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY,
+};
+
+/* Compatible feature bits */
+enum {
+ QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR = 0,
+ QCOW2_COMPAT_LAZY_REFCOUNTS = 1 << QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
+
+ QCOW2_COMPAT_FEAT_MASK = QCOW2_COMPAT_LAZY_REFCOUNTS,
+};
+
+typedef struct Qcow2Feature {
+ uint8_t type;
+ uint8_t bit;
+ char name[46];
+} QEMU_PACKED Qcow2Feature;
+
+typedef struct BDRVQcowState {
+ int cluster_bits;
+ int cluster_size;
+ int cluster_sectors;
+ int l2_bits;
+ int l2_size;
+ int l1_size;
+ int l1_vm_state_index;
+ int csize_shift;
+ int csize_mask;
+ uint64_t cluster_offset_mask;
+ uint64_t l1_table_offset;
+ uint64_t *l1_table;
+
+ Qcow2Cache* l2_table_cache;
+ Qcow2Cache* refcount_block_cache;
+
+ uint8_t *cluster_cache;
+ uint8_t *cluster_data;
+ uint64_t cluster_cache_offset;
+ QLIST_HEAD(QCowClusterAlloc, QCowL2Meta) cluster_allocs;
+
+ uint64_t *refcount_table;
+ uint64_t refcount_table_offset;
+ uint32_t refcount_table_size;
+ int64_t free_cluster_index;
+ int64_t free_byte_offset;
+
+ CoMutex lock;
+
+ uint32_t crypt_method; /* current crypt method, 0 if no key yet */
+ uint32_t crypt_method_header;
+ AES_KEY aes_encrypt_key;
+ AES_KEY aes_decrypt_key;
+ uint64_t snapshots_offset;
+ int snapshots_size;
+ int nb_snapshots;
+ QCowSnapshot *snapshots;
+
+ int flags;
+ int qcow_version;
+
+ uint64_t incompatible_features;
+ uint64_t compatible_features;
+ uint64_t autoclear_features;
+
+ size_t unknown_header_fields_size;
+ void* unknown_header_fields;
+ QLIST_HEAD(, Qcow2UnknownHeaderExtension) unknown_header_ext;
+} BDRVQcowState;
+
+/* XXX: use std qcow open function ? */
+typedef struct QCowCreateState {
+ int cluster_size;
+ int cluster_bits;
+ uint16_t *refcount_block;
+ uint64_t *refcount_table;
+ int64_t l1_table_offset;
+ int64_t refcount_table_offset;
+ int64_t refcount_block_offset;
+} QCowCreateState;
+
+struct QCowAIOCB;
+
+/* XXX This could be private to qcow2-cluster.c */
+typedef struct QCowL2Meta
+{
+ uint64_t offset;
+ uint64_t cluster_offset;
+ uint64_t alloc_offset;
+ int n_start;
+ int nb_available;
+ int nb_clusters;
+ CoQueue dependent_requests;
+
+ QLIST_ENTRY(QCowL2Meta) next_in_flight;
+} QCowL2Meta;
+
+enum {
+ QCOW2_CLUSTER_UNALLOCATED,
+ QCOW2_CLUSTER_NORMAL,
+ QCOW2_CLUSTER_COMPRESSED,
+ QCOW2_CLUSTER_ZERO
+};
+
+#define L1E_OFFSET_MASK 0x00ffffffffffff00ULL
+#define L2E_OFFSET_MASK 0x00ffffffffffff00ULL
+#define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL
+
+#define REFT_OFFSET_MASK 0xffffffffffffff00ULL
+
+static inline int size_to_clusters(BDRVQcowState *s, int64_t size)
+{
+ return (size + (s->cluster_size - 1)) >> s->cluster_bits;
+}
+
+static inline int size_to_l1(BDRVQcowState *s, int64_t size)
+{
+ int shift = s->cluster_bits + s->l2_bits;
+ return (size + (1ULL << shift) - 1) >> shift;
+}
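+
+/*
+ * Example: with the default 64 KiB clusters (cluster_bits = 16), l2_bits is
+ * 13, so the shift is 29 and a single L1 entry maps 512 MiB of guest data;
+ * a 10 GiB image needs size_to_l1() = 20 L1 entries and
+ * size_to_clusters() = 163840 clusters.
+ */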
+
+static inline int64_t align_offset(int64_t offset, int n)
+{
+ offset = (offset + n - 1) & ~(n - 1);
+ return offset;
+}
+
+static inline int qcow2_get_cluster_type(uint64_t l2_entry)
+{
+ if (l2_entry & QCOW_OFLAG_COMPRESSED) {
+ return QCOW2_CLUSTER_COMPRESSED;
+ } else if (l2_entry & QCOW_OFLAG_ZERO) {
+ return QCOW2_CLUSTER_ZERO;
+ } else if (!(l2_entry & L2E_OFFSET_MASK)) {
+ return QCOW2_CLUSTER_UNALLOCATED;
+ } else {
+ return QCOW2_CLUSTER_NORMAL;
+ }
+}
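+
+/*
+ * Example: an L2 entry of 0x8000000000050000 has QCOW_OFLAG_COPIED set and a
+ * host offset of 0x50000, so it is QCOW2_CLUSTER_NORMAL; an all-zero entry is
+ * QCOW2_CLUSTER_UNALLOCATED, and an entry with only QCOW_OFLAG_ZERO (bit 0)
+ * set reads back as zeros without referencing any host cluster.
+ */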
+
+/* Check whether refcounts are eager or lazy */
+static inline bool qcow2_need_accurate_refcounts(BDRVQcowState *s)
+{
+ return !(s->incompatible_features & QCOW2_INCOMPAT_DIRTY);
+}
+
+// FIXME Need qcow2_ prefix to global functions
+
+/* qcow2.c functions */
+int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
+ int64_t sector_num, int nb_sectors);
+int qcow2_update_header(BlockDriverState *bs);
+
+/* qcow2-refcount.c functions */
+int qcow2_refcount_init(BlockDriverState *bs);
+void qcow2_refcount_close(BlockDriverState *bs);
+
+int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size);
+int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
+ int nb_clusters);
+int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size);
+void qcow2_free_clusters(BlockDriverState *bs,
+ int64_t offset, int64_t size);
+void qcow2_free_any_clusters(BlockDriverState *bs,
+ uint64_t cluster_offset, int nb_clusters);
+
+int qcow2_update_snapshot_refcount(BlockDriverState *bs,
+ int64_t l1_table_offset, int l1_size, int addend);
+
+int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
+ BdrvCheckMode fix);
+
+/* qcow2-cluster.c functions */
+int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size);
+void qcow2_l2_cache_reset(BlockDriverState *bs);
+int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
+void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+ uint8_t *out_buf, const uint8_t *in_buf,
+ int nb_sectors, int enc,
+ const AES_KEY *key);
+
+int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
+ int *num, uint64_t *cluster_offset);
+int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
+ int n_start, int n_end, int *num, QCowL2Meta *m);
+uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
+ uint64_t offset,
+ int compressed_size);
+
+int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m);
+int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
+ int nb_sectors);
+int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors);
+
+/* qcow2-snapshot.c functions */
+int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info);
+int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id);
+int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id);
+int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab);
+int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name);
+
+void qcow2_free_snapshots(BlockDriverState *bs);
+int qcow2_read_snapshots(BlockDriverState *bs);
+
+/* qcow2-cache.c functions */
+Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables);
+int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c);
+
+void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table);
+int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c);
+int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
+ Qcow2Cache *dependency);
+void qcow2_cache_depends_on_flush(Qcow2Cache *c);
+
+int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+ void **table);
+int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+ void **table);
+int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
+
+#endif
diff --git a/block/qed-check.c b/block/qed-check.c
new file mode 100644
index 000000000..b473dcd61
--- /dev/null
+++ b/block/qed-check.c
@@ -0,0 +1,248 @@
+/*
+ * QEMU Enhanced Disk Format Consistency Check
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qed.h"
+
+typedef struct {
+ BDRVQEDState *s;
+ BdrvCheckResult *result;
+ bool fix; /* whether to fix invalid offsets */
+
+ uint64_t nclusters;
+ uint32_t *used_clusters; /* referenced cluster bitmap */
+
+ QEDRequest request;
+} QEDCheck;
+
+static bool qed_test_bit(uint32_t *bitmap, uint64_t n) {
+ return !!(bitmap[n / 32] & (1 << (n % 32)));
+}
+
+static void qed_set_bit(uint32_t *bitmap, uint64_t n) {
+ bitmap[n / 32] |= 1 << (n % 32);
+}
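+
+/*
+ * Example: cluster 40 is tracked by bit 40 % 32 = 8 of word 40 / 32 = 1 in
+ * the used_clusters bitmap.
+ */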
+
+/**
+ * Set bitmap bits for clusters
+ *
+ * @check: Check structure
+ * @offset: Starting offset in bytes
+ * @n: Number of clusters
+ */
+static bool qed_set_used_clusters(QEDCheck *check, uint64_t offset,
+ unsigned int n)
+{
+ uint64_t cluster = qed_bytes_to_clusters(check->s, offset);
+ unsigned int corruptions = 0;
+
+ while (n-- != 0) {
+ /* Clusters should only be referenced once */
+ if (qed_test_bit(check->used_clusters, cluster)) {
+ corruptions++;
+ }
+
+ qed_set_bit(check->used_clusters, cluster);
+ cluster++;
+ }
+
+ check->result->corruptions += corruptions;
+ return corruptions == 0;
+}
+
+/**
+ * Check an L2 table
+ *
+ * @ret: Number of invalid cluster offsets
+ */
+static unsigned int qed_check_l2_table(QEDCheck *check, QEDTable *table)
+{
+ BDRVQEDState *s = check->s;
+ unsigned int i, num_invalid = 0;
+ uint64_t last_offset = 0;
+
+ for (i = 0; i < s->table_nelems; i++) {
+ uint64_t offset = table->offsets[i];
+
+ if (qed_offset_is_unalloc_cluster(offset) ||
+ qed_offset_is_zero_cluster(offset)) {
+ continue;
+ }
+ check->result->bfi.allocated_clusters++;
+ if (last_offset && (last_offset + s->header.cluster_size != offset)) {
+ check->result->bfi.fragmented_clusters++;
+ }
+ last_offset = offset;
+
+ /* Detect invalid cluster offset */
+ if (!qed_check_cluster_offset(s, offset)) {
+ if (check->fix) {
+ table->offsets[i] = 0;
+ check->result->corruptions_fixed++;
+ } else {
+ check->result->corruptions++;
+ }
+
+ num_invalid++;
+ continue;
+ }
+
+ qed_set_used_clusters(check, offset, 1);
+ }
+
+ return num_invalid;
+}
+
+/**
+ * Descend tables and check each cluster is referenced once only
+ */
+static int qed_check_l1_table(QEDCheck *check, QEDTable *table)
+{
+ BDRVQEDState *s = check->s;
+ unsigned int i, num_invalid_l1 = 0;
+ int ret, last_error = 0;
+
+ /* Mark L1 table clusters used */
+ qed_set_used_clusters(check, s->header.l1_table_offset,
+ s->header.table_size);
+
+ for (i = 0; i < s->table_nelems; i++) {
+ unsigned int num_invalid_l2;
+ uint64_t offset = table->offsets[i];
+
+ if (qed_offset_is_unalloc_cluster(offset)) {
+ continue;
+ }
+
+ /* Detect invalid L2 offset */
+ if (!qed_check_table_offset(s, offset)) {
+ /* Clear invalid offset */
+ if (check->fix) {
+ table->offsets[i] = 0;
+ check->result->corruptions_fixed++;
+ } else {
+ check->result->corruptions++;
+ }
+
+ num_invalid_l1++;
+ continue;
+ }
+
+ if (!qed_set_used_clusters(check, offset, s->header.table_size)) {
+ continue; /* skip an invalid table */
+ }
+
+ ret = qed_read_l2_table_sync(s, &check->request, offset);
+ if (ret) {
+ check->result->check_errors++;
+ last_error = ret;
+ continue;
+ }
+
+ num_invalid_l2 = qed_check_l2_table(check,
+ check->request.l2_table->table);
+
+ /* Write out fixed L2 table */
+ if (num_invalid_l2 > 0 && check->fix) {
+ ret = qed_write_l2_table_sync(s, &check->request, 0,
+ s->table_nelems, false);
+ if (ret) {
+ check->result->check_errors++;
+ last_error = ret;
+ continue;
+ }
+ }
+ }
+
+ /* Drop reference to final table */
+ qed_unref_l2_cache_entry(check->request.l2_table);
+ check->request.l2_table = NULL;
+
+ /* Write out fixed L1 table */
+ if (num_invalid_l1 > 0 && check->fix) {
+ ret = qed_write_l1_table_sync(s, 0, s->table_nelems);
+ if (ret) {
+ check->result->check_errors++;
+ last_error = ret;
+ }
+ }
+
+ return last_error;
+}
+
+/**
+ * Check for unreferenced (leaked) clusters
+ */
+static void qed_check_for_leaks(QEDCheck *check)
+{
+ BDRVQEDState *s = check->s;
+ uint64_t i;
+
+ for (i = s->header.header_size; i < check->nclusters; i++) {
+ if (!qed_test_bit(check->used_clusters, i)) {
+ check->result->leaks++;
+ }
+ }
+}
+
+/**
+ * Mark an image clean once it passes check or has been repaired
+ */
+static void qed_check_mark_clean(BDRVQEDState *s, BdrvCheckResult *result)
+{
+ /* Skip if there were unfixable corruptions or I/O errors */
+ if (result->corruptions > 0 || result->check_errors > 0) {
+ return;
+ }
+
+ /* Skip if image is already marked clean */
+ if (!(s->header.features & QED_F_NEED_CHECK)) {
+ return;
+ }
+
+ /* Ensure fixes reach storage before clearing check bit */
+ bdrv_flush(s->bs);
+
+ s->header.features &= ~QED_F_NEED_CHECK;
+ qed_write_header_sync(s);
+}
+
+int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix)
+{
+ QEDCheck check = {
+ .s = s,
+ .result = result,
+ .nclusters = qed_bytes_to_clusters(s, s->file_size),
+ .request = { .l2_table = NULL },
+ .fix = fix,
+ };
+ int ret;
+
+ check.used_clusters = g_malloc0(((check.nclusters + 31) / 32) *
+ sizeof(check.used_clusters[0]));
+
+ check.result->bfi.total_clusters =
+ (s->header.image_size + s->header.cluster_size - 1) /
+ s->header.cluster_size;
+ ret = qed_check_l1_table(&check, s->l1_table);
+ if (ret == 0) {
+ /* Only check for leaks if entire image was scanned successfully */
+ qed_check_for_leaks(&check);
+
+ if (fix) {
+ qed_check_mark_clean(s, result);
+ }
+ }
+
+ g_free(check.used_clusters);
+ return ret;
+}
diff --git a/block/qed-cluster.c b/block/qed-cluster.c
new file mode 100644
index 000000000..f64b2af8f
--- /dev/null
+++ b/block/qed-cluster.c
@@ -0,0 +1,165 @@
+/*
+ * QEMU Enhanced Disk Format Cluster functions
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qed.h"
+
+/**
+ * Count the number of contiguous data clusters
+ *
+ * @s: QED state
+ * @table: L2 table
+ * @index: First cluster index
+ * @n: Maximum number of clusters
+ * @offset: Set to first cluster offset
+ *
+ * This function scans tables for contiguous clusters. A contiguous run of
+ * clusters may be allocated, unallocated, or zero.
+ */
+static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s,
+ QEDTable *table,
+ unsigned int index,
+ unsigned int n,
+ uint64_t *offset)
+{
+ unsigned int end = MIN(index + n, s->table_nelems);
+ uint64_t last = table->offsets[index];
+ unsigned int i;
+
+ *offset = last;
+
+ for (i = index + 1; i < end; i++) {
+ if (qed_offset_is_unalloc_cluster(last)) {
+ /* Counting unallocated clusters */
+ if (!qed_offset_is_unalloc_cluster(table->offsets[i])) {
+ break;
+ }
+ } else if (qed_offset_is_zero_cluster(last)) {
+ /* Counting zero clusters */
+ if (!qed_offset_is_zero_cluster(table->offsets[i])) {
+ break;
+ }
+ } else {
+ /* Counting allocated clusters */
+ if (table->offsets[i] != last + s->header.cluster_size) {
+ break;
+ }
+ last = table->offsets[i];
+ }
+ }
+ return i - index;
+}
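+
+/* Illustrative example (hypothetical values): with cluster_size = 64 KiB and
+ * table->offsets[] = { 0x10000, 0x20000, 0x40000, ... }, a call starting at
+ * index 0 returns 2, because 0x20000 == 0x10000 + 64 KiB but 0x40000 breaks
+ * the contiguous run; *offset is set to 0x10000.
+ */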
+
+typedef struct {
+ BDRVQEDState *s;
+ uint64_t pos;
+ size_t len;
+
+ QEDRequest *request;
+
+ /* User callback */
+ QEDFindClusterFunc *cb;
+ void *opaque;
+} QEDFindClusterCB;
+
+static void qed_find_cluster_cb(void *opaque, int ret)
+{
+ QEDFindClusterCB *find_cluster_cb = opaque;
+ BDRVQEDState *s = find_cluster_cb->s;
+ QEDRequest *request = find_cluster_cb->request;
+ uint64_t offset = 0;
+ size_t len = 0;
+ unsigned int index;
+ unsigned int n;
+
+ if (ret) {
+ goto out;
+ }
+
+ index = qed_l2_index(s, find_cluster_cb->pos);
+ n = qed_bytes_to_clusters(s,
+ qed_offset_into_cluster(s, find_cluster_cb->pos) +
+ find_cluster_cb->len);
+ n = qed_count_contiguous_clusters(s, request->l2_table->table,
+ index, n, &offset);
+
+ if (qed_offset_is_unalloc_cluster(offset)) {
+ ret = QED_CLUSTER_L2;
+ } else if (qed_offset_is_zero_cluster(offset)) {
+ ret = QED_CLUSTER_ZERO;
+ } else if (qed_check_cluster_offset(s, offset)) {
+ ret = QED_CLUSTER_FOUND;
+ } else {
+ ret = -EINVAL;
+ }
+
+ len = MIN(find_cluster_cb->len, n * s->header.cluster_size -
+ qed_offset_into_cluster(s, find_cluster_cb->pos));
+
+out:
+ find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len);
+ g_free(find_cluster_cb);
+}
+
+/**
+ * Find the offset of a data cluster
+ *
+ * @s: QED state
+ * @request: L2 cache entry
+ * @pos: Byte position in device
+ * @len: Number of bytes
+ * @cb: Completion function
+ * @opaque: User data for completion function
+ *
+ * This function translates a position in the block device to an offset in the
+ * image file. It invokes the cb completion callback to report back the
+ * translated offset or unallocated range in the image file.
+ *
+ * If the L2 table exists, request->l2_table points to the L2 table cache entry
+ * and the caller must free the reference when they are finished. The cache
+ * entry is exposed in this way to avoid callers having to read the L2 table
+ * again later during request processing. If request->l2_table is non-NULL it
+ * will be unreferenced before taking on the new cache entry.
+ */
+void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
+ size_t len, QEDFindClusterFunc *cb, void *opaque)
+{
+ QEDFindClusterCB *find_cluster_cb;
+ uint64_t l2_offset;
+
+ /* Limit length to L2 boundary. Requests are broken up at the L2 boundary
+ * so that a request acts on one L2 table at a time.
+ */
+ len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos);
+
+ l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)];
+ if (qed_offset_is_unalloc_cluster(l2_offset)) {
+ cb(opaque, QED_CLUSTER_L1, 0, len);
+ return;
+ }
+ if (!qed_check_table_offset(s, l2_offset)) {
+ cb(opaque, -EINVAL, 0, 0);
+ return;
+ }
+
+ find_cluster_cb = g_malloc(sizeof(*find_cluster_cb));
+ find_cluster_cb->s = s;
+ find_cluster_cb->pos = pos;
+ find_cluster_cb->len = len;
+ find_cluster_cb->cb = cb;
+ find_cluster_cb->opaque = opaque;
+ find_cluster_cb->request = request;
+
+ qed_read_l2_table(s, request, l2_offset,
+ qed_find_cluster_cb, find_cluster_cb);
+}
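+
+/*
+ * Illustrative caller sketch (hypothetical names; mirrors how
+ * bdrv_qed_co_is_allocated() in qed.c uses this function):
+ *
+ *   static void example_cb(void *opaque, int ret, uint64_t offset, size_t len)
+ *   {
+ *       // ret is QED_CLUSTER_FOUND/L2/L1/ZERO or -errno; offset and len
+ *       // describe the translated range in the image file
+ *   }
+ *
+ *   QEDRequest request = { .l2_table = NULL };
+ *   qed_find_cluster(s, &request, pos, len, example_cb, opaque);
+ *   ...
+ *   qed_unref_l2_cache_entry(request.l2_table);  // drop the cached L2 reference
+ */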
diff --git a/block/qed-gencb.c b/block/qed-gencb.c
new file mode 100644
index 000000000..7d7ac1ffc
--- /dev/null
+++ b/block/qed-gencb.c
@@ -0,0 +1,32 @@
+/*
+ * QEMU Enhanced Disk Format
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qed.h"
+
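+/**
+ * Allocate a callback state structure whose first field is a GenericCB
+ *
+ * @len is the size of the caller's structure; the embedded GenericCB records
+ * the completion function and opaque pointer so that gencb_complete() can
+ * later free the structure and invoke the callback.
+ */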
+void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque)
+{
+ GenericCB *gencb = g_malloc(len);
+ gencb->cb = cb;
+ gencb->opaque = opaque;
+ return gencb;
+}
+
+void gencb_complete(void *opaque, int ret)
+{
+ GenericCB *gencb = opaque;
+ BlockDriverCompletionFunc *cb = gencb->cb;
+ void *user_opaque = gencb->opaque;
+
+ g_free(gencb);
+ cb(user_opaque, ret);
+}
diff --git a/block/qed-l2-cache.c b/block/qed-l2-cache.c
new file mode 100644
index 000000000..e9b2aae44
--- /dev/null
+++ b/block/qed-l2-cache.c
@@ -0,0 +1,187 @@
+/*
+ * QEMU Enhanced Disk Format L2 Cache
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+/*
+ * L2 table cache usage is as follows:
+ *
+ * An open image has one L2 table cache that is used to avoid accessing the
+ * image file for recently referenced L2 tables.
+ *
+ * Cluster offset lookup translates the logical offset within the block device
+ * to a cluster offset within the image file. This is done by indexing into
+ * the L1 and L2 tables which store cluster offsets. It is here where the L2
+ * table cache serves up recently referenced L2 tables.
+ *
+ * If there is a cache miss, that L2 table is read from the image file and
+ * committed to the cache. Subsequent accesses to that L2 table will be served
+ * from the cache until the table is evicted from the cache.
+ *
+ * L2 tables are also committed to the cache when new L2 tables are allocated
+ * in the image file. Since the L2 table cache is write-through, the new L2
+ * table is first written out to the image file and then committed to the
+ * cache.
+ *
+ * Multiple I/O requests may be using an L2 table cache entry at any given
+ * time. That means an entry may be in use across several requests and
+ * reference counting is needed to free the entry at the correct time. In
+ * particular, an entry evicted from the cache will only be freed once all
+ * references are dropped.
+ *
+ * An in-flight I/O request will hold a reference to a L2 table cache entry for
+ * the period during which it needs to access the L2 table. This includes
+ * cluster offset lookup, L2 table allocation, and L2 table update when a new
+ * data cluster has been allocated.
+ *
+ * An interesting case occurs when two requests need to access an L2 table that
+ * is not in the cache. Since the operation to read the table from the image
+ * file takes some time to complete, both requests may see a cache miss and
+ * start reading the L2 table from the image file. The first to finish will
+ * commit its L2 table into the cache.  When the second request tries to
+ * commit its table, it will be deleted in favor of the existing cache entry.
+ */
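+
+/*
+ * A simplified sketch of the miss path described above, for illustration only
+ * (the qed_example_load_l2() name is hypothetical and error handling is
+ * omitted; qed_read_l2_table() in qed-table.c is the real implementation):
+ *
+ *   CachedL2Table *qed_example_load_l2(BDRVQEDState *s, uint64_t offset)
+ *   {
+ *       CachedL2Table *l2 = qed_find_l2_cache_entry(&s->l2_cache, offset);
+ *       if (l2) {
+ *           return l2;                        // hit: reference already taken
+ *       }
+ *       l2 = qed_alloc_l2_cache_entry(&s->l2_cache);
+ *       l2->table = qed_alloc_table(s);
+ *       // ... read the table from the image file at 'offset' ...
+ *       l2->offset = offset;
+ *       qed_commit_l2_cache_entry(&s->l2_cache, l2);  // steals the reference
+ *       return qed_find_l2_cache_entry(&s->l2_cache, offset); // take a new one
+ *   }
+ */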
+
+#include "trace.h"
+#include "qed.h"
+
+/* Each L2 table covers 2 GB, so this lets us fully cache a 100 GB disk */
+#define MAX_L2_CACHE_SIZE 50
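+/* For illustration (assuming the QED defaults of a 64 KiB cluster size and a
+ * table_size of 4 clusters): each L2 table has (4 * 65536) / 8 = 32768 entries
+ * and therefore maps 32768 * 64 KiB = 2 GB, so 50 cached tables cover 100 GB.
+ */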
+
+/**
+ * Initialize the L2 cache
+ */
+void qed_init_l2_cache(L2TableCache *l2_cache)
+{
+ QTAILQ_INIT(&l2_cache->entries);
+ l2_cache->n_entries = 0;
+}
+
+/**
+ * Free the L2 cache
+ */
+void qed_free_l2_cache(L2TableCache *l2_cache)
+{
+ CachedL2Table *entry, *next_entry;
+
+ QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next_entry) {
+ qemu_vfree(entry->table);
+ g_free(entry);
+ }
+}
+
+/**
+ * Allocate an uninitialized entry from the cache
+ *
+ * The returned entry has a reference count of 1 and is owned by the caller.
+ * The caller must allocate the actual table field for this entry and it must
+ * be freeable using qemu_vfree().
+ */
+CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache)
+{
+ CachedL2Table *entry;
+
+ entry = g_malloc0(sizeof(*entry));
+ entry->ref++;
+
+ trace_qed_alloc_l2_cache_entry(l2_cache, entry);
+
+ return entry;
+}
+
+/**
+ * Decrease an entry's reference count and free if necessary when the reference
+ * count drops to zero.
+ */
+void qed_unref_l2_cache_entry(CachedL2Table *entry)
+{
+ if (!entry) {
+ return;
+ }
+
+ entry->ref--;
+ trace_qed_unref_l2_cache_entry(entry, entry->ref);
+ if (entry->ref == 0) {
+ qemu_vfree(entry->table);
+ g_free(entry);
+ }
+}
+
+/**
+ * Find an entry in the L2 cache. This may return NULL and it's up to the
+ * caller to satisfy the cache miss.
+ *
+ * For a cached entry, this function increases the reference count and returns
+ * the entry.
+ */
+CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset)
+{
+ CachedL2Table *entry;
+
+ QTAILQ_FOREACH(entry, &l2_cache->entries, node) {
+ if (entry->offset == offset) {
+ trace_qed_find_l2_cache_entry(l2_cache, entry, offset, entry->ref);
+ entry->ref++;
+ return entry;
+ }
+ }
+ return NULL;
+}
+
+/**
+ * Commit an L2 cache entry into the cache. This is meant to be used as part of
+ * the process to satisfy a cache miss. A caller would allocate an entry which
+ * is not actually in the L2 cache and then, once the entry is valid and
+ * present on disk, commit it into the cache.
+ *
+ * Since the cache is write-through, it's important that this function is not
+ * called until the entry is present on disk and the L1 has been updated to
+ * point to the entry.
+ *
+ * N.B. This function steals a reference to the l2_table from the caller so the
+ * caller must obtain a new reference by issuing a call to
+ * qed_find_l2_cache_entry().
+ */
+void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table)
+{
+ CachedL2Table *entry;
+
+ entry = qed_find_l2_cache_entry(l2_cache, l2_table->offset);
+ if (entry) {
+ qed_unref_l2_cache_entry(entry);
+ qed_unref_l2_cache_entry(l2_table);
+ return;
+ }
+
+ /* Evict an unused cache entry so we have space. If all entries are in use
+ * we can grow the cache temporarily and we try to shrink back down later.
+ */
+ if (l2_cache->n_entries >= MAX_L2_CACHE_SIZE) {
+ CachedL2Table *next;
+ QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next) {
+ if (entry->ref > 1) {
+ continue;
+ }
+
+ QTAILQ_REMOVE(&l2_cache->entries, entry, node);
+ l2_cache->n_entries--;
+ qed_unref_l2_cache_entry(entry);
+
+ /* Stop evicting when we've shrunk back to max size */
+ if (l2_cache->n_entries < MAX_L2_CACHE_SIZE) {
+ break;
+ }
+ }
+ }
+
+ l2_cache->n_entries++;
+ QTAILQ_INSERT_TAIL(&l2_cache->entries, l2_table, node);
+}
diff --git a/block/qed-table.c b/block/qed-table.c
new file mode 100644
index 000000000..ce07b0554
--- /dev/null
+++ b/block/qed-table.c
@@ -0,0 +1,297 @@
+/*
+ * QEMU Enhanced Disk Format Table I/O
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "qemu_socket.h" /* for EINPROGRESS on Windows */
+#include "qed.h"
+
+typedef struct {
+ GenericCB gencb;
+ BDRVQEDState *s;
+ QEDTable *table;
+
+ struct iovec iov;
+ QEMUIOVector qiov;
+} QEDReadTableCB;
+
+static void qed_read_table_cb(void *opaque, int ret)
+{
+ QEDReadTableCB *read_table_cb = opaque;
+ QEDTable *table = read_table_cb->table;
+ int noffsets = read_table_cb->qiov.size / sizeof(uint64_t);
+ int i;
+
+ /* Handle I/O error */
+ if (ret) {
+ goto out;
+ }
+
+ /* Byteswap offsets */
+ for (i = 0; i < noffsets; i++) {
+ table->offsets[i] = le64_to_cpu(table->offsets[i]);
+ }
+
+out:
+ /* Completion */
+ trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret);
+ gencb_complete(&read_table_cb->gencb, ret);
+}
+
+static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb),
+ cb, opaque);
+ QEMUIOVector *qiov = &read_table_cb->qiov;
+
+ trace_qed_read_table(s, offset, table);
+
+ read_table_cb->s = s;
+ read_table_cb->table = table;
+ read_table_cb->iov.iov_base = table->offsets;
+ read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size;
+
+ qemu_iovec_init_external(qiov, &read_table_cb->iov, 1);
+ bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov,
+ qiov->size / BDRV_SECTOR_SIZE,
+ qed_read_table_cb, read_table_cb);
+}
+
+typedef struct {
+ GenericCB gencb;
+ BDRVQEDState *s;
+ QEDTable *orig_table;
+ QEDTable *table;
+ bool flush; /* flush after write? */
+
+ struct iovec iov;
+ QEMUIOVector qiov;
+} QEDWriteTableCB;
+
+static void qed_write_table_cb(void *opaque, int ret)
+{
+ QEDWriteTableCB *write_table_cb = opaque;
+
+ trace_qed_write_table_cb(write_table_cb->s,
+ write_table_cb->orig_table,
+ write_table_cb->flush,
+ ret);
+
+ if (ret) {
+ goto out;
+ }
+
+ if (write_table_cb->flush) {
+ /* We still need to flush first */
+ write_table_cb->flush = false;
+ bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb,
+ write_table_cb);
+ return;
+ }
+
+out:
+ qemu_vfree(write_table_cb->table);
+ gencb_complete(&write_table_cb->gencb, ret);
+ return;
+}
+
+/**
+ * Write out an updated part or all of a table
+ *
+ * @s: QED state
+ * @offset: Offset of table in image file, in bytes
+ * @table: Table
+ * @index: Index of first element
+ * @n: Number of elements
+ * @flush: Whether or not to sync to disk
+ * @cb: Completion function
+ * @opaque: Argument for completion function
+ */
+static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
+ unsigned int index, unsigned int n, bool flush,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ QEDWriteTableCB *write_table_cb;
+ unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1;
+ unsigned int start, end, i;
+ size_t len_bytes;
+
+ trace_qed_write_table(s, offset, table, index, n);
+
+ /* Calculate indices of the first and one after last elements */
+ start = index & ~sector_mask;
+ end = (index + n + sector_mask) & ~sector_mask;
+
+ len_bytes = (end - start) * sizeof(uint64_t);
+
+ write_table_cb = gencb_alloc(sizeof(*write_table_cb), cb, opaque);
+ write_table_cb->s = s;
+ write_table_cb->orig_table = table;
+ write_table_cb->flush = flush;
+ write_table_cb->table = qemu_blockalign(s->bs, len_bytes);
+ write_table_cb->iov.iov_base = write_table_cb->table->offsets;
+ write_table_cb->iov.iov_len = len_bytes;
+ qemu_iovec_init_external(&write_table_cb->qiov, &write_table_cb->iov, 1);
+
+ /* Byteswap table */
+ for (i = start; i < end; i++) {
+ uint64_t le_offset = cpu_to_le64(table->offsets[i]);
+ write_table_cb->table->offsets[i - start] = le_offset;
+ }
+
+ /* Adjust for offset into table */
+ offset += start * sizeof(uint64_t);
+
+ bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
+ &write_table_cb->qiov,
+ write_table_cb->qiov.size / BDRV_SECTOR_SIZE,
+ qed_write_table_cb, write_table_cb);
+}
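+
+/* Worked example of the alignment above (hypothetical values): with 512-byte
+ * sectors and 8-byte entries, sector_mask = 63.  Updating index = 70, n = 3
+ * gives start = 64 and end = 128, i.e. one 512-byte sector (entries 64..127)
+ * is byteswapped and written at offset + 64 * 8.
+ */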
+
+/**
+ * Propagate return value from async callback
+ *
+ * The *_sync() helpers below initialize a local result to -EINPROGRESS, start
+ * the asynchronous operation with this callback, and loop in qemu_aio_wait()
+ * until the callback has overwritten the result with the real return value.
+ */
+static void qed_sync_cb(void *opaque, int ret)
+{
+ *(int *)opaque = ret;
+}
+
+int qed_read_l1_table_sync(BDRVQEDState *s)
+{
+ int ret = -EINPROGRESS;
+
+ qed_read_table(s, s->header.l1_table_offset,
+ s->l1_table, qed_sync_cb, &ret);
+ while (ret == -EINPROGRESS) {
+ qemu_aio_wait();
+ }
+
+ return ret;
+}
+
+void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE);
+ qed_write_table(s, s->header.l1_table_offset,
+ s->l1_table, index, n, false, cb, opaque);
+}
+
+int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
+ unsigned int n)
+{
+ int ret = -EINPROGRESS;
+
+ qed_write_l1_table(s, index, n, qed_sync_cb, &ret);
+ while (ret == -EINPROGRESS) {
+ qemu_aio_wait();
+ }
+
+ return ret;
+}
+
+typedef struct {
+ GenericCB gencb;
+ BDRVQEDState *s;
+ uint64_t l2_offset;
+ QEDRequest *request;
+} QEDReadL2TableCB;
+
+static void qed_read_l2_table_cb(void *opaque, int ret)
+{
+ QEDReadL2TableCB *read_l2_table_cb = opaque;
+ QEDRequest *request = read_l2_table_cb->request;
+ BDRVQEDState *s = read_l2_table_cb->s;
+ CachedL2Table *l2_table = request->l2_table;
+ uint64_t l2_offset = read_l2_table_cb->l2_offset;
+
+ if (ret) {
+ /* can't trust loaded L2 table anymore */
+ qed_unref_l2_cache_entry(l2_table);
+ request->l2_table = NULL;
+ } else {
+ l2_table->offset = l2_offset;
+
+ qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
+
+ /* This is guaranteed to succeed because we just committed the entry
+ * to the cache.
+ */
+ request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
+ assert(request->l2_table != NULL);
+ }
+
+ gencb_complete(&read_l2_table_cb->gencb, ret);
+}
+
+void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ QEDReadL2TableCB *read_l2_table_cb;
+
+ qed_unref_l2_cache_entry(request->l2_table);
+
+ /* Check for cached L2 entry */
+ request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset);
+ if (request->l2_table) {
+ cb(opaque, 0);
+ return;
+ }
+
+ request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
+ request->l2_table->table = qed_alloc_table(s);
+
+ read_l2_table_cb = gencb_alloc(sizeof(*read_l2_table_cb), cb, opaque);
+ read_l2_table_cb->s = s;
+ read_l2_table_cb->l2_offset = offset;
+ read_l2_table_cb->request = request;
+
+ BLKDBG_EVENT(s->bs->file, BLKDBG_L2_LOAD);
+ qed_read_table(s, offset, request->l2_table->table,
+ qed_read_l2_table_cb, read_l2_table_cb);
+}
+
+int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset)
+{
+ int ret = -EINPROGRESS;
+
+ qed_read_l2_table(s, request, offset, qed_sync_cb, &ret);
+ while (ret == -EINPROGRESS) {
+ qemu_aio_wait();
+ }
+
+ return ret;
+}
+
+void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
+ unsigned int index, unsigned int n, bool flush,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE);
+ qed_write_table(s, request->l2_table->offset,
+ request->l2_table->table, index, n, flush, cb, opaque);
+}
+
+int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
+ unsigned int index, unsigned int n, bool flush)
+{
+ int ret = -EINPROGRESS;
+
+ qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret);
+ while (ret == -EINPROGRESS) {
+ qemu_aio_wait();
+ }
+
+ return ret;
+}
diff --git a/block/qed.c b/block/qed.c
new file mode 100644
index 000000000..21cb23987
--- /dev/null
+++ b/block/qed.c
@@ -0,0 +1,1586 @@
+/*
+ * QEMU Enhanced Disk Format
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu-timer.h"
+#include "trace.h"
+#include "qed.h"
+#include "qerror.h"
+#include "migration.h"
+
+static void qed_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+ QEDAIOCB *acb = (QEDAIOCB *)blockacb;
+ bool finished = false;
+
+ /* Wait for the request to finish */
+ acb->finished = &finished;
+ while (!finished) {
+ qemu_aio_wait();
+ }
+}
+
+static AIOPool qed_aio_pool = {
+ .aiocb_size = sizeof(QEDAIOCB),
+ .cancel = qed_aio_cancel,
+};
+
+static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
+ const char *filename)
+{
+ const QEDHeader *header = (const QEDHeader *)buf;
+
+ if (buf_size < sizeof(*header)) {
+ return 0;
+ }
+ if (le32_to_cpu(header->magic) != QED_MAGIC) {
+ return 0;
+ }
+ return 100;
+}
+
+/**
+ * Check whether an image format is raw
+ *
+ * @fmt: Backing file format, may be NULL
+ */
+static bool qed_fmt_is_raw(const char *fmt)
+{
+ return fmt && strcmp(fmt, "raw") == 0;
+}
+
+static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu)
+{
+ cpu->magic = le32_to_cpu(le->magic);
+ cpu->cluster_size = le32_to_cpu(le->cluster_size);
+ cpu->table_size = le32_to_cpu(le->table_size);
+ cpu->header_size = le32_to_cpu(le->header_size);
+ cpu->features = le64_to_cpu(le->features);
+ cpu->compat_features = le64_to_cpu(le->compat_features);
+ cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
+ cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
+ cpu->image_size = le64_to_cpu(le->image_size);
+ cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
+ cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
+}
+
+static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le)
+{
+ le->magic = cpu_to_le32(cpu->magic);
+ le->cluster_size = cpu_to_le32(cpu->cluster_size);
+ le->table_size = cpu_to_le32(cpu->table_size);
+ le->header_size = cpu_to_le32(cpu->header_size);
+ le->features = cpu_to_le64(cpu->features);
+ le->compat_features = cpu_to_le64(cpu->compat_features);
+ le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
+ le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
+ le->image_size = cpu_to_le64(cpu->image_size);
+ le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
+ le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
+}
+
+int qed_write_header_sync(BDRVQEDState *s)
+{
+ QEDHeader le;
+ int ret;
+
+ qed_header_cpu_to_le(&s->header, &le);
+ ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le));
+ if (ret != sizeof(le)) {
+ return ret;
+ }
+ return 0;
+}
+
+typedef struct {
+ GenericCB gencb;
+ BDRVQEDState *s;
+ struct iovec iov;
+ QEMUIOVector qiov;
+ int nsectors;
+ uint8_t *buf;
+} QEDWriteHeaderCB;
+
+static void qed_write_header_cb(void *opaque, int ret)
+{
+ QEDWriteHeaderCB *write_header_cb = opaque;
+
+ qemu_vfree(write_header_cb->buf);
+ gencb_complete(write_header_cb, ret);
+}
+
+static void qed_write_header_read_cb(void *opaque, int ret)
+{
+ QEDWriteHeaderCB *write_header_cb = opaque;
+ BDRVQEDState *s = write_header_cb->s;
+
+ if (ret) {
+ qed_write_header_cb(write_header_cb, ret);
+ return;
+ }
+
+ /* Update header */
+ qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf);
+
+ bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov,
+ write_header_cb->nsectors, qed_write_header_cb,
+ write_header_cb);
+}
+
+/**
+ * Update header in-place (does not rewrite backing filename or other strings)
+ *
+ * This function only updates known header fields in-place and does not affect
+ * extra data after the QED header.
+ */
+static void qed_write_header(BDRVQEDState *s, BlockDriverCompletionFunc cb,
+ void *opaque)
+{
+ /* We must write full sectors for O_DIRECT but cannot necessarily generate
+ * the data following the header if an unrecognized compat feature is
+ * active. Therefore, first read the sectors containing the header, update
+ * them, and write back.
+ */
+
+ int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) /
+ BDRV_SECTOR_SIZE;
+ size_t len = nsectors * BDRV_SECTOR_SIZE;
+ QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb),
+ cb, opaque);
+
+ write_header_cb->s = s;
+ write_header_cb->nsectors = nsectors;
+ write_header_cb->buf = qemu_blockalign(s->bs, len);
+ write_header_cb->iov.iov_base = write_header_cb->buf;
+ write_header_cb->iov.iov_len = len;
+ qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1);
+
+ bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors,
+ qed_write_header_read_cb, write_header_cb);
+}
+
+static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
+{
+ uint64_t table_entries;
+ uint64_t l2_size;
+
+ table_entries = (table_size * cluster_size) / sizeof(uint64_t);
+ l2_size = table_entries * cluster_size;
+
+ return l2_size * table_entries;
+}
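+
+/* For illustration (assuming the QED defaults of cluster_size = 64 KiB and
+ * table_size = 4): table_entries = (4 * 65536) / 8 = 32768, so each L2 table
+ * maps 32768 * 64 KiB = 2 GB and the maximum image size is
+ * 2 GB * 32768 = 64 TB.
+ */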
+
+static bool qed_is_cluster_size_valid(uint32_t cluster_size)
+{
+ if (cluster_size < QED_MIN_CLUSTER_SIZE ||
+ cluster_size > QED_MAX_CLUSTER_SIZE) {
+ return false;
+ }
+ if (cluster_size & (cluster_size - 1)) {
+ return false; /* not power of 2 */
+ }
+ return true;
+}
+
+static bool qed_is_table_size_valid(uint32_t table_size)
+{
+ if (table_size < QED_MIN_TABLE_SIZE ||
+ table_size > QED_MAX_TABLE_SIZE) {
+ return false;
+ }
+ if (table_size & (table_size - 1)) {
+ return false; /* not power of 2 */
+ }
+ return true;
+}
+
+static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
+ uint32_t table_size)
+{
+ if (image_size % BDRV_SECTOR_SIZE != 0) {
+ return false; /* not multiple of sector size */
+ }
+ if (image_size > qed_max_image_size(cluster_size, table_size)) {
+ return false; /* image is too large */
+ }
+ return true;
+}
+
+/**
+ * Read a string of known length from the image file
+ *
+ * @file: Image file
+ * @offset: File offset to start of string, in bytes
+ * @n: String length in bytes
+ * @buf: Destination buffer
+ * @buflen: Destination buffer length in bytes
+ * @ret: 0 on success, -errno on failure
+ *
+ * The string is NUL-terminated.
+ */
+static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n,
+ char *buf, size_t buflen)
+{
+ int ret;
+ if (n >= buflen) {
+ return -EINVAL;
+ }
+ ret = bdrv_pread(file, offset, buf, n);
+ if (ret < 0) {
+ return ret;
+ }
+ buf[n] = '\0';
+ return 0;
+}
+
+/**
+ * Allocate new clusters
+ *
+ * @s: QED state
+ * @n: Number of contiguous clusters to allocate
+ * @ret: Offset of first allocated cluster
+ *
+ * This function only produces the offset where the new clusters should be
+ * written. It updates BDRVQEDState but does not make any changes to the image
+ * file.
+ */
+static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n)
+{
+ uint64_t offset = s->file_size;
+ s->file_size += n * s->header.cluster_size;
+ return offset;
+}
+
+QEDTable *qed_alloc_table(BDRVQEDState *s)
+{
+ /* Honor O_DIRECT memory alignment requirements */
+ return qemu_blockalign(s->bs,
+ s->header.cluster_size * s->header.table_size);
+}
+
+/**
+ * Allocate a new zeroed L2 table
+ */
+static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
+{
+ CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
+
+ l2_table->table = qed_alloc_table(s);
+ l2_table->offset = qed_alloc_clusters(s, s->header.table_size);
+
+ memset(l2_table->table->offsets, 0,
+ s->header.cluster_size * s->header.table_size);
+ return l2_table;
+}
+
+static void qed_aio_next_io(void *opaque, int ret);
+
+static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
+{
+ assert(!s->allocating_write_reqs_plugged);
+
+ s->allocating_write_reqs_plugged = true;
+}
+
+static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
+{
+ QEDAIOCB *acb;
+
+ assert(s->allocating_write_reqs_plugged);
+
+ s->allocating_write_reqs_plugged = false;
+
+ acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
+ if (acb) {
+ qed_aio_next_io(acb, 0);
+ }
+}
+
+static void qed_finish_clear_need_check(void *opaque, int ret)
+{
+ /* Do nothing */
+}
+
+static void qed_flush_after_clear_need_check(void *opaque, int ret)
+{
+ BDRVQEDState *s = opaque;
+
+ bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s);
+
+ /* No need to wait until flush completes */
+ qed_unplug_allocating_write_reqs(s);
+}
+
+static void qed_clear_need_check(void *opaque, int ret)
+{
+ BDRVQEDState *s = opaque;
+
+ if (ret) {
+ qed_unplug_allocating_write_reqs(s);
+ return;
+ }
+
+ s->header.features &= ~QED_F_NEED_CHECK;
+ qed_write_header(s, qed_flush_after_clear_need_check, s);
+}
+
+static void qed_need_check_timer_cb(void *opaque)
+{
+ BDRVQEDState *s = opaque;
+
+ /* The timer should only fire when allocating writes have drained */
+ assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs));
+
+ trace_qed_need_check_timer_cb(s);
+
+ qed_plug_allocating_write_reqs(s);
+
+ /* Ensure writes are on disk before clearing flag */
+ bdrv_aio_flush(s->bs, qed_clear_need_check, s);
+}
+
+static void qed_start_need_check_timer(BDRVQEDState *s)
+{
+ trace_qed_start_need_check_timer(s);
+
+ /* Use vm_clock so we don't alter the image file while suspended for
+ * migration.
+ */
+ qemu_mod_timer(s->need_check_timer, qemu_get_clock_ns(vm_clock) +
+ get_ticks_per_sec() * QED_NEED_CHECK_TIMEOUT);
+}
+
+/* It's okay to call this multiple times or when no timer is started */
+static void qed_cancel_need_check_timer(BDRVQEDState *s)
+{
+ trace_qed_cancel_need_check_timer(s);
+ qemu_del_timer(s->need_check_timer);
+}
+
+static void bdrv_qed_rebind(BlockDriverState *bs)
+{
+ BDRVQEDState *s = bs->opaque;
+ s->bs = bs;
+}
+
+static int bdrv_qed_open(BlockDriverState *bs, int flags)
+{
+ BDRVQEDState *s = bs->opaque;
+ QEDHeader le_header;
+ int64_t file_size;
+ int ret;
+
+ s->bs = bs;
+ QSIMPLEQ_INIT(&s->allocating_write_reqs);
+
+ ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
+ if (ret < 0) {
+ return ret;
+ }
+ qed_header_le_to_cpu(&le_header, &s->header);
+
+ if (s->header.magic != QED_MAGIC) {
+ return -EINVAL;
+ }
+ if (s->header.features & ~QED_FEATURE_MASK) {
+ /* image uses unsupported feature bits */
+ char buf[64];
+ snprintf(buf, sizeof(buf), "%" PRIx64,
+ s->header.features & ~QED_FEATURE_MASK);
+ qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+ bs->device_name, "QED", buf);
+ return -ENOTSUP;
+ }
+ if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
+ return -EINVAL;
+ }
+
+ /* Round down file size to the last cluster */
+ file_size = bdrv_getlength(bs->file);
+ if (file_size < 0) {
+ return file_size;
+ }
+ s->file_size = qed_start_of_cluster(s, file_size);
+
+ if (!qed_is_table_size_valid(s->header.table_size)) {
+ return -EINVAL;
+ }
+ if (!qed_is_image_size_valid(s->header.image_size,
+ s->header.cluster_size,
+ s->header.table_size)) {
+ return -EINVAL;
+ }
+ if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
+ return -EINVAL;
+ }
+
+ s->table_nelems = (s->header.cluster_size * s->header.table_size) /
+ sizeof(uint64_t);
+ s->l2_shift = ffs(s->header.cluster_size) - 1;
+ s->l2_mask = s->table_nelems - 1;
+ s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1;
+
+ if ((s->header.features & QED_F_BACKING_FILE)) {
+ if ((uint64_t)s->header.backing_filename_offset +
+ s->header.backing_filename_size >
+ s->header.cluster_size * s->header.header_size) {
+ return -EINVAL;
+ }
+
+ ret = qed_read_string(bs->file, s->header.backing_filename_offset,
+ s->header.backing_filename_size, bs->backing_file,
+ sizeof(bs->backing_file));
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
+ pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
+ }
+ }
+
+ /* Reset unknown autoclear feature bits. This is a backwards
+ * compatibility mechanism that allows images to be opened by older
+ * programs, which "knock out" unknown feature bits. When an image is
+ * opened by a newer program again it can detect that the autoclear
+ * feature is no longer valid.
+ */
+ if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
+ !bdrv_is_read_only(bs->file) && !(flags & BDRV_O_INCOMING)) {
+ s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
+
+ ret = qed_write_header_sync(s);
+ if (ret) {
+ return ret;
+ }
+
+ /* From here on only known autoclear feature bits are valid */
+ bdrv_flush(bs->file);
+ }
+
+ s->l1_table = qed_alloc_table(s);
+ qed_init_l2_cache(&s->l2_cache);
+
+ ret = qed_read_l1_table_sync(s);
+ if (ret) {
+ goto out;
+ }
+
+ /* If image was not closed cleanly, check consistency */
+ if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) {
+ /* Read-only images cannot be fixed. There is no risk of corruption
+ * since write operations are not possible. Therefore, allow
+ * potentially inconsistent images to be opened read-only. This can
+ * aid data recovery from an otherwise inconsistent image.
+ */
+ if (!bdrv_is_read_only(bs->file) &&
+ !(flags & BDRV_O_INCOMING)) {
+ BdrvCheckResult result = {0};
+
+ ret = qed_check(s, &result, true);
+ if (ret) {
+ goto out;
+ }
+ }
+ }
+
+ s->need_check_timer = qemu_new_timer_ns(vm_clock,
+ qed_need_check_timer_cb, s);
+
+out:
+ if (ret) {
+ qed_free_l2_cache(&s->l2_cache);
+ qemu_vfree(s->l1_table);
+ }
+ return ret;
+}
+
+static void bdrv_qed_close(BlockDriverState *bs)
+{
+ BDRVQEDState *s = bs->opaque;
+
+ qed_cancel_need_check_timer(s);
+ qemu_free_timer(s->need_check_timer);
+
+ /* Ensure writes reach stable storage */
+ bdrv_flush(bs->file);
+
+ /* Clean shutdown, no check required on next open */
+ if (s->header.features & QED_F_NEED_CHECK) {
+ s->header.features &= ~QED_F_NEED_CHECK;
+ qed_write_header_sync(s);
+ }
+
+ qed_free_l2_cache(&s->l2_cache);
+ qemu_vfree(s->l1_table);
+}
+
+static int qed_create(const char *filename, uint32_t cluster_size,
+ uint64_t image_size, uint32_t table_size,
+ const char *backing_file, const char *backing_fmt)
+{
+ QEDHeader header = {
+ .magic = QED_MAGIC,
+ .cluster_size = cluster_size,
+ .table_size = table_size,
+ .header_size = 1,
+ .features = 0,
+ .compat_features = 0,
+ .l1_table_offset = cluster_size,
+ .image_size = image_size,
+ };
+ QEDHeader le_header;
+ uint8_t *l1_table = NULL;
+ size_t l1_size = header.cluster_size * header.table_size;
+ int ret = 0;
+ BlockDriverState *bs = NULL;
+
+ ret = bdrv_create_file(filename, NULL);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR | BDRV_O_CACHE_WB);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* File must start empty and grow, check truncate is supported */
+ ret = bdrv_truncate(bs, 0);
+ if (ret < 0) {
+ goto out;
+ }
+
+ if (backing_file) {
+ header.features |= QED_F_BACKING_FILE;
+ header.backing_filename_offset = sizeof(le_header);
+ header.backing_filename_size = strlen(backing_file);
+
+ if (qed_fmt_is_raw(backing_fmt)) {
+ header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
+ }
+ }
+
+ qed_header_cpu_to_le(&header, &le_header);
+ ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header));
+ if (ret < 0) {
+ goto out;
+ }
+ ret = bdrv_pwrite(bs, sizeof(le_header), backing_file,
+ header.backing_filename_size);
+ if (ret < 0) {
+ goto out;
+ }
+
+ l1_table = g_malloc0(l1_size);
+ ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = 0; /* success */
+out:
+ g_free(l1_table);
+ bdrv_delete(bs);
+ return ret;
+}
+
+static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options)
+{
+ uint64_t image_size = 0;
+ uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE;
+ uint32_t table_size = QED_DEFAULT_TABLE_SIZE;
+ const char *backing_file = NULL;
+ const char *backing_fmt = NULL;
+
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ image_size = options->value.n;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+ backing_file = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
+ backing_fmt = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
+ if (options->value.n) {
+ cluster_size = options->value.n;
+ }
+ } else if (!strcmp(options->name, BLOCK_OPT_TABLE_SIZE)) {
+ if (options->value.n) {
+ table_size = options->value.n;
+ }
+ }
+ options++;
+ }
+
+ if (!qed_is_cluster_size_valid(cluster_size)) {
+ fprintf(stderr, "QED cluster size must be within range [%u, %u] and power of 2\n",
+ QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE);
+ return -EINVAL;
+ }
+ if (!qed_is_table_size_valid(table_size)) {
+ fprintf(stderr, "QED table size must be within range [%u, %u] and power of 2\n",
+ QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE);
+ return -EINVAL;
+ }
+ if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) {
+ fprintf(stderr, "QED image size must be a non-zero multiple of "
+ "cluster size and less than %" PRIu64 " bytes\n",
+ qed_max_image_size(cluster_size, table_size));
+ return -EINVAL;
+ }
+
+ return qed_create(filename, cluster_size, image_size, table_size,
+ backing_file, backing_fmt);
+}
+
+typedef struct {
+ Coroutine *co;
+ int is_allocated;
+ int *pnum;
+} QEDIsAllocatedCB;
+
+static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len)
+{
+ QEDIsAllocatedCB *cb = opaque;
+ *cb->pnum = len / BDRV_SECTOR_SIZE;
+ cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO);
+ if (cb->co) {
+ qemu_coroutine_enter(cb->co, NULL);
+ }
+}
+
+static int coroutine_fn bdrv_qed_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ BDRVQEDState *s = bs->opaque;
+ uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
+ size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE;
+ QEDIsAllocatedCB cb = {
+ .is_allocated = -1,
+ .pnum = pnum,
+ };
+ QEDRequest request = { .l2_table = NULL };
+
+ qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb);
+
+ /* Now sleep if the callback wasn't invoked immediately */
+ while (cb.is_allocated == -1) {
+ cb.co = qemu_coroutine_self();
+ qemu_coroutine_yield();
+ }
+
+ qed_unref_l2_cache_entry(request.l2_table);
+
+ return cb.is_allocated;
+}
+
+static int bdrv_qed_make_empty(BlockDriverState *bs)
+{
+ return -ENOTSUP;
+}
+
+static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
+{
+ return acb->common.bs->opaque;
+}
+
+/**
+ * Read from the backing file or zero-fill if no backing file
+ *
+ * @s: QED state
+ * @pos: Byte position in device
+ * @qiov: Destination I/O vector
+ * @cb: Completion function
+ * @opaque: User data for completion function
+ *
+ * This function reads qiov->size bytes starting at pos from the backing file.
+ * If there is no backing file then zeroes are read.
+ */
+static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
+ QEMUIOVector *qiov,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ uint64_t backing_length = 0;
+ size_t size;
+
+ /* If there is a backing file, get its length. Treat the absence of a
+ * backing file like a zero length backing file.
+ */
+ if (s->bs->backing_hd) {
+ int64_t l = bdrv_getlength(s->bs->backing_hd);
+ if (l < 0) {
+ cb(opaque, l);
+ return;
+ }
+ backing_length = l;
+ }
+
+ /* Zero all sectors if reading beyond the end of the backing file */
+ if (pos >= backing_length ||
+ pos + qiov->size > backing_length) {
+ qemu_iovec_memset(qiov, 0, 0, qiov->size);
+ }
+
+ /* Complete now if there are no backing file sectors to read */
+ if (pos >= backing_length) {
+ cb(opaque, 0);
+ return;
+ }
+
+ /* If the read straddles the end of the backing file, shorten it */
+ size = MIN((uint64_t)backing_length - pos, qiov->size);
+
+ BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
+ bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE,
+ qiov, size / BDRV_SECTOR_SIZE, cb, opaque);
+}
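+
+/* Illustrative example (hypothetical sizes): with backing_length = 1 MB,
+ * pos = 1 MB - 4 KB and qiov->size = 8 KB, the request straddles the end of
+ * the backing file: the whole qiov is zeroed first and then only the final
+ * 4 KB of the backing file are read into it.
+ */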
+
+typedef struct {
+ GenericCB gencb;
+ BDRVQEDState *s;
+ QEMUIOVector qiov;
+ struct iovec iov;
+ uint64_t offset;
+} CopyFromBackingFileCB;
+
+static void qed_copy_from_backing_file_cb(void *opaque, int ret)
+{
+ CopyFromBackingFileCB *copy_cb = opaque;
+ qemu_vfree(copy_cb->iov.iov_base);
+ gencb_complete(&copy_cb->gencb, ret);
+}
+
+static void qed_copy_from_backing_file_write(void *opaque, int ret)
+{
+ CopyFromBackingFileCB *copy_cb = opaque;
+ BDRVQEDState *s = copy_cb->s;
+
+ if (ret) {
+ qed_copy_from_backing_file_cb(copy_cb, ret);
+ return;
+ }
+
+ BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
+ bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE,
+ &copy_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE,
+ qed_copy_from_backing_file_cb, copy_cb);
+}
+
+/**
+ * Copy data from backing file into the image
+ *
+ * @s: QED state
+ * @pos: Byte position in device
+ * @len: Number of bytes
+ * @offset: Byte offset in image file
+ * @cb: Completion function
+ * @opaque: User data for completion function
+ */
+static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
+ uint64_t len, uint64_t offset,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ CopyFromBackingFileCB *copy_cb;
+
+ /* Skip copy entirely if there is no work to do */
+ if (len == 0) {
+ cb(opaque, 0);
+ return;
+ }
+
+ copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque);
+ copy_cb->s = s;
+ copy_cb->offset = offset;
+ copy_cb->iov.iov_base = qemu_blockalign(s->bs, len);
+ copy_cb->iov.iov_len = len;
+ qemu_iovec_init_external(&copy_cb->qiov, &copy_cb->iov, 1);
+
+ qed_read_backing_file(s, pos, &copy_cb->qiov,
+ qed_copy_from_backing_file_write, copy_cb);
+}
+
+/**
+ * Link one or more contiguous clusters into a table
+ *
+ * @s: QED state
+ * @table: L2 table
+ * @index: First cluster index
+ * @n: Number of contiguous clusters
+ * @cluster: First cluster offset
+ *
+ * The cluster offset may be an allocated byte offset in the image file, the
+ * zero cluster marker, or the unallocated cluster marker.
+ */
+static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
+ unsigned int n, uint64_t cluster)
+{
+ int i;
+ for (i = index; i < index + n; i++) {
+ table->offsets[i] = cluster;
+ if (!qed_offset_is_unalloc_cluster(cluster) &&
+ !qed_offset_is_zero_cluster(cluster)) {
+ cluster += s->header.cluster_size;
+ }
+ }
+}
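+
+/* For illustration (hypothetical values): with cluster_size = 64 KiB,
+ * index = 4, n = 3 and cluster = 0x50000, entries 4..6 become 0x50000,
+ * 0x60000, 0x70000.  If cluster is the zero or unallocated marker, all three
+ * entries are set to that same marker instead.
+ */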
+
+static void qed_aio_complete_bh(void *opaque)
+{
+ QEDAIOCB *acb = opaque;
+ BlockDriverCompletionFunc *cb = acb->common.cb;
+ void *user_opaque = acb->common.opaque;
+ int ret = acb->bh_ret;
+ bool *finished = acb->finished;
+
+ qemu_bh_delete(acb->bh);
+ qemu_aio_release(acb);
+
+ /* Invoke callback */
+ cb(user_opaque, ret);
+
+ /* Signal cancel completion */
+ if (finished) {
+ *finished = true;
+ }
+}
+
+static void qed_aio_complete(QEDAIOCB *acb, int ret)
+{
+ BDRVQEDState *s = acb_to_s(acb);
+
+ trace_qed_aio_complete(s, acb, ret);
+
+ /* Free resources */
+ qemu_iovec_destroy(&acb->cur_qiov);
+ qed_unref_l2_cache_entry(acb->request.l2_table);
+
+ /* Free the buffer we may have allocated for zero writes */
+ if (acb->flags & QED_AIOCB_ZERO) {
+ qemu_vfree(acb->qiov->iov[0].iov_base);
+ acb->qiov->iov[0].iov_base = NULL;
+ }
+
+ /* Arrange for a bh to invoke the completion function */
+ acb->bh_ret = ret;
+ acb->bh = qemu_bh_new(qed_aio_complete_bh, acb);
+ qemu_bh_schedule(acb->bh);
+
+ /* Start next allocating write request waiting behind this one. Note that
+ * requests enqueue themselves when they first hit an unallocated cluster
+ * but they wait until the entire request is finished before waking up the
+ * next request in the queue. This ensures that we don't cycle through
+ * requests multiple times but rather finish one at a time completely.
+ */
+ if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
+ QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
+ acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
+ if (acb) {
+ qed_aio_next_io(acb, 0);
+ } else if (s->header.features & QED_F_NEED_CHECK) {
+ qed_start_need_check_timer(s);
+ }
+ }
+}
+
+/**
+ * Commit the current L2 table to the cache
+ */
+static void qed_commit_l2_update(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ CachedL2Table *l2_table = acb->request.l2_table;
+ uint64_t l2_offset = l2_table->offset;
+
+ qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
+
+ /* This is guaranteed to succeed because we just committed the entry to the
+ * cache.
+ */
+ acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
+ assert(acb->request.l2_table != NULL);
+
+ qed_aio_next_io(opaque, ret);
+}
+
+/**
+ * Update L1 table with new L2 table offset and write it out
+ */
+static void qed_aio_write_l1_update(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ int index;
+
+ if (ret) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
+ index = qed_l1_index(s, acb->cur_pos);
+ s->l1_table->offsets[index] = acb->request.l2_table->offset;
+
+ qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb);
+}
+
+/**
+ * Update L2 table with new cluster offsets and write them out
+ */
+static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
+{
+ BDRVQEDState *s = acb_to_s(acb);
+ bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
+ int index;
+
+ if (ret) {
+ goto err;
+ }
+
+ if (need_alloc) {
+ qed_unref_l2_cache_entry(acb->request.l2_table);
+ acb->request.l2_table = qed_new_l2_table(s);
+ }
+
+ index = qed_l2_index(s, acb->cur_pos);
+ qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
+ offset);
+
+ if (need_alloc) {
+ /* Write out the whole new L2 table */
+ qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true,
+ qed_aio_write_l1_update, acb);
+ } else {
+ /* Write out only the updated part of the L2 table */
+ qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false,
+ qed_aio_next_io, acb);
+ }
+ return;
+
+err:
+ qed_aio_complete(acb, ret);
+}
+
+static void qed_aio_write_l2_update_cb(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
+}
+
+/**
+ * Flush new data clusters before updating the L2 table
+ *
+ * This flush is necessary when a backing file is in use. A crash during an
+ * allocating write could result in empty clusters in the image. If the write
+ * only touched a subregion of the cluster, then backing image sectors have
+ * been lost in the untouched region. The solution is to flush after writing a
+ * new data cluster and before updating the L2 table.
+ */
+static void qed_aio_write_flush_before_l2_update(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+
+ if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update_cb, opaque)) {
+ qed_aio_complete(acb, -EIO);
+ }
+}
+
+/**
+ * Write data to the image file
+ */
+static void qed_aio_write_main(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ uint64_t offset = acb->cur_cluster +
+ qed_offset_into_cluster(s, acb->cur_pos);
+ BlockDriverCompletionFunc *next_fn;
+
+ trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size);
+
+ if (ret) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
+ if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
+ next_fn = qed_aio_next_io;
+ } else {
+ if (s->bs->backing_hd) {
+ next_fn = qed_aio_write_flush_before_l2_update;
+ } else {
+ next_fn = qed_aio_write_l2_update_cb;
+ }
+ }
+
+ BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
+ bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
+ &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
+ next_fn, acb);
+}
+
+/**
+ * Populate the untouched region at the back of a new data cluster
+ */
+static void qed_aio_write_postfill(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ uint64_t start = acb->cur_pos + acb->cur_qiov.size;
+ uint64_t len =
+ qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
+ uint64_t offset = acb->cur_cluster +
+ qed_offset_into_cluster(s, acb->cur_pos) +
+ acb->cur_qiov.size;
+
+ if (ret) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
+ trace_qed_aio_write_postfill(s, acb, start, len, offset);
+ qed_copy_from_backing_file(s, start, len, offset,
+ qed_aio_write_main, acb);
+}
+
+/**
+ * Populate the untouched region at the front of a new data cluster
+ */
+static void qed_aio_write_prefill(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ uint64_t start = qed_start_of_cluster(s, acb->cur_pos);
+ uint64_t len = qed_offset_into_cluster(s, acb->cur_pos);
+
+ trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
+ qed_copy_from_backing_file(s, start, len, acb->cur_cluster,
+ qed_aio_write_postfill, acb);
+}
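+
+/* Worked example of the prefill/postfill arithmetic (hypothetical values):
+ * with cluster_size = 64 KiB, a write of 8 KiB starting 4 KiB into a newly
+ * allocated cluster triggers a prefill of the first 4 KiB (from the cluster
+ * start up to the write) and a postfill of the remaining 52 KiB (from the end
+ * of the write up to the end of the last touched cluster).
+ */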
+
+/**
+ * Check if the QED_F_NEED_CHECK bit should be set during allocating write
+ */
+static bool qed_should_set_need_check(BDRVQEDState *s)
+{
+ /* The flush before L2 update path ensures consistency */
+ if (s->bs->backing_hd) {
+ return false;
+ }
+
+ return !(s->header.features & QED_F_NEED_CHECK);
+}
+
+static void qed_aio_write_zero_cluster(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+
+ if (ret) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
+ qed_aio_write_l2_update(acb, 0, 1);
+}
+
+/**
+ * Write new data cluster
+ *
+ * @acb: Write request
+ * @len: Length in bytes
+ *
+ * This path is taken when writing to previously unallocated clusters.
+ */
+static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
+{
+ BDRVQEDState *s = acb_to_s(acb);
+ BlockDriverCompletionFunc *cb;
+
+ /* Cancel timer when the first allocating request comes in */
+ if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) {
+ qed_cancel_need_check_timer(s);
+ }
+
+ /* Freeze this request if another allocating write is in progress */
+ if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
+ QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next);
+ }
+ if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) ||
+ s->allocating_write_reqs_plugged) {
+ return; /* wait for existing request to finish */
+ }
+
+ acb->cur_nclusters = qed_bytes_to_clusters(s,
+ qed_offset_into_cluster(s, acb->cur_pos) + len);
+ qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
+
+ if (acb->flags & QED_AIOCB_ZERO) {
+ /* Skip ahead if the clusters are already zero */
+ if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
+ qed_aio_next_io(acb, 0);
+ return;
+ }
+
+ cb = qed_aio_write_zero_cluster;
+ } else {
+ cb = qed_aio_write_prefill;
+ acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
+ }
+
+ if (qed_should_set_need_check(s)) {
+ s->header.features |= QED_F_NEED_CHECK;
+ qed_write_header(s, cb, acb);
+ } else {
+ cb(acb, 0);
+ }
+}
+
+/**
+ * Write data cluster in place
+ *
+ * @acb: Write request
+ * @offset: Cluster offset in bytes
+ * @len: Length in bytes
+ *
+ * This path is taken when writing to already allocated clusters.
+ */
+static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
+{
+ /* Allocate buffer for zero writes */
+ if (acb->flags & QED_AIOCB_ZERO) {
+ struct iovec *iov = acb->qiov->iov;
+
+ if (!iov->iov_base) {
+ iov->iov_base = qemu_blockalign(acb->common.bs, iov->iov_len);
+ memset(iov->iov_base, 0, iov->iov_len);
+ }
+ }
+
+ /* Calculate the I/O vector */
+ acb->cur_cluster = offset;
+ qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
+
+ /* Do the actual write */
+ qed_aio_write_main(acb, 0);
+}
+
+/**
+ * Write data cluster
+ *
+ * @opaque: Write request
+ * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1,
+ * or -errno
+ * @offset: Cluster offset in bytes
+ * @len: Length in bytes
+ *
+ * Callback from qed_find_cluster().
+ */
+static void qed_aio_write_data(void *opaque, int ret,
+ uint64_t offset, size_t len)
+{
+ QEDAIOCB *acb = opaque;
+
+ trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len);
+
+ acb->find_cluster_ret = ret;
+
+ switch (ret) {
+ case QED_CLUSTER_FOUND:
+ qed_aio_write_inplace(acb, offset, len);
+ break;
+
+ case QED_CLUSTER_L2:
+ case QED_CLUSTER_L1:
+ case QED_CLUSTER_ZERO:
+ qed_aio_write_alloc(acb, len);
+ break;
+
+ default:
+ qed_aio_complete(acb, ret);
+ break;
+ }
+}
+
+/**
+ * Read data cluster
+ *
+ * @opaque: Read request
+ * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1,
+ * or -errno
+ * @offset: Cluster offset in bytes
+ * @len: Length in bytes
+ *
+ * Callback from qed_find_cluster().
+ */
+static void qed_aio_read_data(void *opaque, int ret,
+ uint64_t offset, size_t len)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ BlockDriverState *bs = acb->common.bs;
+
+ /* Adjust offset into cluster */
+ offset += qed_offset_into_cluster(s, acb->cur_pos);
+
+ trace_qed_aio_read_data(s, acb, ret, offset, len);
+
+ if (ret < 0) {
+ goto err;
+ }
+
+ qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
+
+ /* Handle zero cluster and backing file reads */
+ if (ret == QED_CLUSTER_ZERO) {
+ qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
+ qed_aio_next_io(acb, 0);
+ return;
+ } else if (ret != QED_CLUSTER_FOUND) {
+ qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
+ qed_aio_next_io, acb);
+ return;
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
+ bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE,
+ &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
+ qed_aio_next_io, acb);
+ return;
+
+err:
+ qed_aio_complete(acb, ret);
+}
+
+/**
+ * Begin next I/O or complete the request
+ */
+static void qed_aio_next_io(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
+ qed_aio_write_data : qed_aio_read_data;
+
+ trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size);
+
+ /* Handle I/O error */
+ if (ret) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
+ acb->qiov_offset += acb->cur_qiov.size;
+ acb->cur_pos += acb->cur_qiov.size;
+ qemu_iovec_reset(&acb->cur_qiov);
+
+ /* Complete request */
+ if (acb->cur_pos >= acb->end_pos) {
+ qed_aio_complete(acb, 0);
+ return;
+ }
+
+ /* Find next cluster and start I/O */
+ qed_find_cluster(s, &acb->request,
+ acb->cur_pos, acb->end_pos - acb->cur_pos,
+ io_fn, acb);
+}
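+
+/* Rough illustration of the loop above: a cluster-aligned 192 KB request with
+ * 64 KB clusters takes at most three qed_find_cluster()/I/O iterations, one
+ * per cluster; when the clusters are contiguous and share the same allocation
+ * state, a single iteration can cover the whole request.
+ */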
+
+static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque, int flags)
+{
+ QEDAIOCB *acb = qemu_aio_get(&qed_aio_pool, bs, cb, opaque);
+
+ trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors,
+ opaque, flags);
+
+ acb->flags = flags;
+ acb->finished = NULL;
+ acb->qiov = qiov;
+ acb->qiov_offset = 0;
+ acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
+ acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE;
+ acb->request.l2_table = NULL;
+ qemu_iovec_init(&acb->cur_qiov, qiov->niov);
+
+ /* Start request */
+ qed_aio_next_io(acb, 0);
+ return &acb->common;
+}
+
+static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+}
+
+static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb,
+ opaque, QED_AIOCB_WRITE);
+}
+
+typedef struct {
+ Coroutine *co;
+ int ret;
+ bool done;
+} QEDWriteZeroesCB;
+
+static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret)
+{
+ QEDWriteZeroesCB *cb = opaque;
+
+ cb->done = true;
+ cb->ret = ret;
+ if (cb->co) {
+ qemu_coroutine_enter(cb->co, NULL);
+ }
+}
+
+static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors)
+{
+ BlockDriverAIOCB *blockacb;
+ BDRVQEDState *s = bs->opaque;
+ QEDWriteZeroesCB cb = { .done = false };
+ QEMUIOVector qiov;
+ struct iovec iov;
+
+ /* Refuse if there are untouched backing file sectors */
+ if (bs->backing_hd) {
+ if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) {
+ return -ENOTSUP;
+ }
+ if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) {
+ return -ENOTSUP;
+ }
+ }
+
+ /* Zero writes start without an I/O buffer. If a buffer becomes necessary
+ * then it will be allocated during request processing.
+ */
+ iov.iov_base = NULL;
+ iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors,
+ qed_co_write_zeroes_cb, &cb,
+ QED_AIOCB_WRITE | QED_AIOCB_ZERO);
+ if (!blockacb) {
+ return -EIO;
+ }
+ if (!cb.done) {
+ cb.co = qemu_coroutine_self();
+ qemu_coroutine_yield();
+ }
+ assert(cb.done);
+ return cb.ret;
+}
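+
+/* The QEDWriteZeroesCB/yield pattern above is one way to drive the
+ * callback-based qed_aio_setup() from coroutine context: the callback records
+ * the result and re-enters the coroutine only if it has already yielded,
+ * which also covers the case where the request completes synchronously.
+ */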
+
+static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset)
+{
+ BDRVQEDState *s = bs->opaque;
+ uint64_t old_image_size;
+ int ret;
+
+ if (!qed_is_image_size_valid(offset, s->header.cluster_size,
+ s->header.table_size)) {
+ return -EINVAL;
+ }
+
+ /* Shrinking is currently not supported */
+ if ((uint64_t)offset < s->header.image_size) {
+ return -ENOTSUP;
+ }
+
+ old_image_size = s->header.image_size;
+ s->header.image_size = offset;
+ ret = qed_write_header_sync(s);
+ if (ret < 0) {
+ s->header.image_size = old_image_size;
+ }
+ return ret;
+}
+
+static int64_t bdrv_qed_getlength(BlockDriverState *bs)
+{
+ BDRVQEDState *s = bs->opaque;
+ return s->header.image_size;
+}
+
+static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+ BDRVQEDState *s = bs->opaque;
+
+ memset(bdi, 0, sizeof(*bdi));
+ bdi->cluster_size = s->header.cluster_size;
+ bdi->is_dirty = s->header.features & QED_F_NEED_CHECK;
+ return 0;
+}
+
+static int bdrv_qed_change_backing_file(BlockDriverState *bs,
+ const char *backing_file,
+ const char *backing_fmt)
+{
+ BDRVQEDState *s = bs->opaque;
+ QEDHeader new_header, le_header;
+ void *buffer;
+ size_t buffer_len, backing_file_len;
+ int ret;
+
+ /* Refuse to set backing filename if unknown compat feature bits are
+ * active. If the image uses an unknown compat feature then we may not
+ * know the layout of data following the header structure and cannot safely
+ * add a new string.
+ */
+ if (backing_file && (s->header.compat_features &
+ ~QED_COMPAT_FEATURE_MASK)) {
+ return -ENOTSUP;
+ }
+
+ memcpy(&new_header, &s->header, sizeof(new_header));
+
+ new_header.features &= ~(QED_F_BACKING_FILE |
+ QED_F_BACKING_FORMAT_NO_PROBE);
+
+ /* Adjust feature flags */
+ if (backing_file) {
+ new_header.features |= QED_F_BACKING_FILE;
+
+ if (qed_fmt_is_raw(backing_fmt)) {
+ new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
+ }
+ }
+
+ /* Calculate new header size */
+ backing_file_len = 0;
+
+ if (backing_file) {
+ backing_file_len = strlen(backing_file);
+ }
+
+ buffer_len = sizeof(new_header);
+ new_header.backing_filename_offset = buffer_len;
+ new_header.backing_filename_size = backing_file_len;
+ buffer_len += backing_file_len;
+
+ /* Make sure we can rewrite header without failing */
+ if (buffer_len > new_header.header_size * new_header.cluster_size) {
+ return -ENOSPC;
+ }
+
+ /* Prepare new header */
+ buffer = g_malloc(buffer_len);
+
+ qed_header_cpu_to_le(&new_header, &le_header);
+ memcpy(buffer, &le_header, sizeof(le_header));
+ buffer_len = sizeof(le_header);
+
+ if (backing_file) {
+ memcpy(buffer + buffer_len, backing_file, backing_file_len);
+ buffer_len += backing_file_len;
+ }
+
+ /* Write new header */
+ ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len);
+ g_free(buffer);
+ if (ret == 0) {
+ memcpy(&s->header, &new_header, sizeof(new_header));
+ }
+ return ret;
+}
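+
+/* Illustration of the header rewrite above, for a hypothetical backing file
+ * name "base.qed" (8 characters, not NUL-terminated on disk): the buffer is
+ * sizeof(QEDHeader) + 8 bytes, backing_filename_offset == sizeof(QEDHeader),
+ * backing_filename_size == 8, and the whole region must still fit within
+ * header_size clusters.
+ */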
+
+static void bdrv_qed_invalidate_cache(BlockDriverState *bs)
+{
+ BDRVQEDState *s = bs->opaque;
+
+ bdrv_qed_close(bs);
+ memset(s, 0, sizeof(BDRVQEDState));
+ bdrv_qed_open(bs, bs->open_flags);
+}
+
+static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result,
+ BdrvCheckMode fix)
+{
+ BDRVQEDState *s = bs->opaque;
+
+ return qed_check(s, result, !!fix);
+}
+
+static QEMUOptionParameter qed_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size (in bytes)"
+ }, {
+ .name = BLOCK_OPT_BACKING_FILE,
+ .type = OPT_STRING,
+ .help = "File name of a base image"
+ }, {
+ .name = BLOCK_OPT_BACKING_FMT,
+ .type = OPT_STRING,
+ .help = "Image format of the base image"
+ }, {
+ .name = BLOCK_OPT_CLUSTER_SIZE,
+ .type = OPT_SIZE,
+ .help = "Cluster size (in bytes)",
+ .value = { .n = QED_DEFAULT_CLUSTER_SIZE },
+ }, {
+ .name = BLOCK_OPT_TABLE_SIZE,
+ .type = OPT_SIZE,
+ .help = "L1/L2 table size (in clusters)"
+ },
+ { /* end of list */ }
+};
+
+static BlockDriver bdrv_qed = {
+ .format_name = "qed",
+ .instance_size = sizeof(BDRVQEDState),
+ .create_options = qed_create_options,
+
+ .bdrv_probe = bdrv_qed_probe,
+ .bdrv_rebind = bdrv_qed_rebind,
+ .bdrv_open = bdrv_qed_open,
+ .bdrv_close = bdrv_qed_close,
+ .bdrv_create = bdrv_qed_create,
+ .bdrv_co_is_allocated = bdrv_qed_co_is_allocated,
+ .bdrv_make_empty = bdrv_qed_make_empty,
+ .bdrv_aio_readv = bdrv_qed_aio_readv,
+ .bdrv_aio_writev = bdrv_qed_aio_writev,
+ .bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes,
+ .bdrv_truncate = bdrv_qed_truncate,
+ .bdrv_getlength = bdrv_qed_getlength,
+ .bdrv_get_info = bdrv_qed_get_info,
+ .bdrv_change_backing_file = bdrv_qed_change_backing_file,
+ .bdrv_invalidate_cache = bdrv_qed_invalidate_cache,
+ .bdrv_check = bdrv_qed_check,
+};
+
+static void bdrv_qed_init(void)
+{
+ bdrv_register(&bdrv_qed);
+}
+
+block_init(bdrv_qed_init);
diff --git a/block/qed.h b/block/qed.h
new file mode 100644
index 000000000..a063bf70a
--- /dev/null
+++ b/block/qed.h
@@ -0,0 +1,344 @@
+/*
+ * QEMU Enhanced Disk Format
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#ifndef BLOCK_QED_H
+#define BLOCK_QED_H
+
+#include "block_int.h"
+
+/* The layout of a QED file is as follows:
+ *
+ * +--------+----------+----------+----------+-----+
+ * | header | L1 table | cluster0 | cluster1 | ... |
+ * +--------+----------+----------+----------+-----+
+ *
+ * There is a 2-level pagetable for cluster allocation:
+ *
+ * +----------+
+ * | L1 table |
+ * +----------+
+ * ,------' | '------.
+ * +----------+ | +----------+
+ * | L2 table | ... | L2 table |
+ * +----------+ +----------+
+ * ,------' | '------.
+ * +----------+ | +----------+
+ * | Data | ... | Data |
+ * +----------+ +----------+
+ *
+ * The L1 table is fixed size and always present. L2 tables are allocated on
+ * demand. The L1 table size determines the maximum possible image size; it
+ * can be influenced using the cluster_size and table_size values.
+ *
+ * All fields are little-endian on disk.
+ */
+
+enum {
+ QED_MAGIC = 'Q' | 'E' << 8 | 'D' << 16 | '\0' << 24,
+
+ /* The image supports a backing file */
+ QED_F_BACKING_FILE = 0x01,
+
+ /* The image needs a consistency check before use */
+ QED_F_NEED_CHECK = 0x02,
+
+ /* The backing file format must not be probed, treat as raw image */
+ QED_F_BACKING_FORMAT_NO_PROBE = 0x04,
+
+ /* Feature bits must be used when the on-disk format changes */
+ QED_FEATURE_MASK = QED_F_BACKING_FILE | /* supported feature bits */
+ QED_F_NEED_CHECK |
+ QED_F_BACKING_FORMAT_NO_PROBE,
+ QED_COMPAT_FEATURE_MASK = 0, /* supported compat feature bits */
+ QED_AUTOCLEAR_FEATURE_MASK = 0, /* supported autoclear feature bits */
+
+ /* Data is stored in groups of sectors called clusters. Cluster size must
+ * be large to avoid keeping too much metadata. I/O requests that have
+ * sub-cluster size will require read-modify-write.
+ */
+ QED_MIN_CLUSTER_SIZE = 4 * 1024, /* in bytes */
+ QED_MAX_CLUSTER_SIZE = 64 * 1024 * 1024,
+ QED_DEFAULT_CLUSTER_SIZE = 64 * 1024,
+
+ /* Allocated clusters are tracked using a 2-level pagetable. Table size is
+ * a multiple of clusters so large maximum image sizes can be supported
+ * without jacking up the cluster size too much.
+ */
+ QED_MIN_TABLE_SIZE = 1, /* in clusters */
+ QED_MAX_TABLE_SIZE = 16,
+ QED_DEFAULT_TABLE_SIZE = 4,
+
+ /* Delay to flush and clean image after last allocating write completes */
+ QED_NEED_CHECK_TIMEOUT = 5, /* in seconds */
+};
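+
+/* Worked example, assuming the defaults above: table entries are 64-bit
+ * offsets (see QEDTable below), so one table holds
+ * QED_DEFAULT_TABLE_SIZE * QED_DEFAULT_CLUSTER_SIZE / 8 = 32768 entries.
+ * One L1 table of 32768 L2 pointers, each L2 table mapping 32768 data
+ * clusters of 64 KB, gives a maximum image size of roughly
+ * 32768 * 32768 * 64 KB = 2^46 bytes = 64 TB.
+ */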
+
+typedef struct {
+ uint32_t magic; /* QED\0 */
+
+ uint32_t cluster_size; /* in bytes */
+ uint32_t table_size; /* for L1 and L2 tables, in clusters */
+ uint32_t header_size; /* in clusters */
+
+ uint64_t features; /* format feature bits */
+ uint64_t compat_features; /* compatible feature bits */
+ uint64_t autoclear_features; /* self-resetting feature bits */
+
+ uint64_t l1_table_offset; /* in bytes */
+ uint64_t image_size; /* total logical image size, in bytes */
+
+ /* if (features & QED_F_BACKING_FILE) */
+ uint32_t backing_filename_offset; /* in bytes from start of header */
+ uint32_t backing_filename_size; /* in bytes */
+} QEDHeader;
+
+typedef struct {
+ uint64_t offsets[0]; /* in bytes */
+} QEDTable;
+
+/* The L2 cache is a simple write-through cache for L2 structures */
+typedef struct CachedL2Table {
+ QEDTable *table;
+ uint64_t offset; /* offset=0 indicates an invalid entry */
+ QTAILQ_ENTRY(CachedL2Table) node;
+ int ref;
+} CachedL2Table;
+
+typedef struct {
+ QTAILQ_HEAD(, CachedL2Table) entries;
+ unsigned int n_entries;
+} L2TableCache;
+
+typedef struct QEDRequest {
+ CachedL2Table *l2_table;
+} QEDRequest;
+
+enum {
+ QED_AIOCB_WRITE = 0x0001, /* read or write? */
+ QED_AIOCB_ZERO = 0x0002, /* zero write, used with QED_AIOCB_WRITE */
+};
+
+typedef struct QEDAIOCB {
+ BlockDriverAIOCB common;
+ QEMUBH *bh;
+ int bh_ret; /* final return status for completion bh */
+ QSIMPLEQ_ENTRY(QEDAIOCB) next; /* next request */
+ int flags; /* QED_AIOCB_* bits ORed together */
+ bool *finished; /* signal for cancel completion */
+ uint64_t end_pos; /* request end on block device, in bytes */
+
+ /* User scatter-gather list */
+ QEMUIOVector *qiov;
+ size_t qiov_offset; /* byte count already processed */
+
+ /* Current cluster scatter-gather list */
+ QEMUIOVector cur_qiov;
+ uint64_t cur_pos; /* position on block device, in bytes */
+ uint64_t cur_cluster; /* cluster offset in image file */
+ unsigned int cur_nclusters; /* number of clusters being accessed */
+ int find_cluster_ret; /* used for L1/L2 update */
+
+ QEDRequest request;
+} QEDAIOCB;
+
+typedef struct {
+ BlockDriverState *bs; /* device */
+ uint64_t file_size; /* length of image file, in bytes */
+
+ QEDHeader header; /* always cpu-endian */
+ QEDTable *l1_table;
+ L2TableCache l2_cache; /* l2 table cache */
+ uint32_t table_nelems;
+ uint32_t l1_shift;
+ uint32_t l2_shift;
+ uint32_t l2_mask;
+
+ /* Allocating write request queue */
+ QSIMPLEQ_HEAD(, QEDAIOCB) allocating_write_reqs;
+ bool allocating_write_reqs_plugged;
+
+ /* Periodic flush and clearing of the need-check flag */
+ QEMUTimer *need_check_timer;
+} BDRVQEDState;
+
+enum {
+ QED_CLUSTER_FOUND, /* cluster found */
+ QED_CLUSTER_ZERO, /* zero cluster found */
+ QED_CLUSTER_L2, /* cluster missing in L2 */
+ QED_CLUSTER_L1, /* cluster missing in L1 */
+};
+
+/**
+ * qed_find_cluster() completion callback
+ *
+ * @opaque: User data for completion callback
+ * @ret: QED_CLUSTER_FOUND   Success
+ *       QED_CLUSTER_ZERO    Data cluster is a zero cluster
+ *       QED_CLUSTER_L2      Data cluster unallocated in L2
+ *       QED_CLUSTER_L1      L2 unallocated in L1
+ *       -errno              POSIX error occurred
+ * @offset: Data cluster offset
+ * @len: Contiguous bytes starting from cluster offset
+ *
+ * This function is invoked when qed_find_cluster() completes.
+ *
+ * On success ret is QED_CLUSTER_FOUND and offset/len are a contiguous range
+ * in the image file.
+ *
+ * If the cluster is unallocated, ret is QED_CLUSTER_L2 or QED_CLUSTER_L1,
+ * depending on whether the L2 table entry or the L1 table entry is missing,
+ * and len is the number of contiguous unallocated bytes.
+ */
+typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len);
+
+/**
+ * Generic callback for chaining async callbacks
+ */
+typedef struct {
+ BlockDriverCompletionFunc *cb;
+ void *opaque;
+} GenericCB;
+
+void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque);
+void gencb_complete(void *opaque, int ret);
+
+/**
+ * Header functions
+ */
+int qed_write_header_sync(BDRVQEDState *s);
+
+/**
+ * L2 cache functions
+ */
+void qed_init_l2_cache(L2TableCache *l2_cache);
+void qed_free_l2_cache(L2TableCache *l2_cache);
+CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache);
+void qed_unref_l2_cache_entry(CachedL2Table *entry);
+CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset);
+void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table);
+
+/**
+ * Table I/O functions
+ */
+int qed_read_l1_table_sync(BDRVQEDState *s);
+void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n,
+ BlockDriverCompletionFunc *cb, void *opaque);
+int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
+ unsigned int n);
+int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
+ uint64_t offset);
+void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
+ BlockDriverCompletionFunc *cb, void *opaque);
+void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
+ unsigned int index, unsigned int n, bool flush,
+ BlockDriverCompletionFunc *cb, void *opaque);
+int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
+ unsigned int index, unsigned int n, bool flush);
+
+/**
+ * Cluster functions
+ */
+void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
+ size_t len, QEDFindClusterFunc *cb, void *opaque);
+
+/**
+ * Consistency check
+ */
+int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix);
+
+QEDTable *qed_alloc_table(BDRVQEDState *s);
+
+/**
+ * Round down to the start of a cluster
+ */
+static inline uint64_t qed_start_of_cluster(BDRVQEDState *s, uint64_t offset)
+{
+ return offset & ~(uint64_t)(s->header.cluster_size - 1);
+}
+
+static inline uint64_t qed_offset_into_cluster(BDRVQEDState *s, uint64_t offset)
+{
+ return offset & (s->header.cluster_size - 1);
+}
+
+static inline uint64_t qed_bytes_to_clusters(BDRVQEDState *s, uint64_t bytes)
+{
+ return qed_start_of_cluster(s, bytes + (s->header.cluster_size - 1)) /
+ (s->header.cluster_size - 1);
+}
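+
+/* Worked example for the helpers above, assuming the default 64 KB
+ * (0x10000-byte) cluster size:
+ *
+ *   qed_start_of_cluster(s, 0x12345)    == 0x10000
+ *   qed_offset_into_cluster(s, 0x12345) == 0x2345
+ *   qed_bytes_to_clusters(s, 0x12345)   == 2   (74565 bytes touch 2 clusters)
+ */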
+
+static inline unsigned int qed_l1_index(BDRVQEDState *s, uint64_t pos)
+{
+ return pos >> s->l1_shift;
+}
+
+static inline unsigned int qed_l2_index(BDRVQEDState *s, uint64_t pos)
+{
+ return (pos >> s->l2_shift) & s->l2_mask;
+}
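+
+/* Illustration, assuming the shifts derived at open time from the default
+ * geometry (64 KB clusters, 32768-entry tables, so l2_shift == 16,
+ * l2_mask == 0x7fff and l1_shift == 31): for a guest position of 5 GB
+ * (0x140000000), qed_l1_index() yields 2 and qed_l2_index() yields 0x4000.
+ */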
+
+/**
+ * Test if a cluster offset is valid
+ */
+static inline bool qed_check_cluster_offset(BDRVQEDState *s, uint64_t offset)
+{
+ uint64_t header_size = (uint64_t)s->header.header_size *
+ s->header.cluster_size;
+
+ if (offset & (s->header.cluster_size - 1)) {
+ return false;
+ }
+ return offset >= header_size && offset < s->file_size;
+}
+
+/**
+ * Test if a table offset is valid
+ */
+static inline bool qed_check_table_offset(BDRVQEDState *s, uint64_t offset)
+{
+ uint64_t end_offset = offset + (s->header.table_size - 1) *
+ s->header.cluster_size;
+
+ /* Overflow check */
+ if (end_offset <= offset) {
+ return false;
+ }
+
+ return qed_check_cluster_offset(s, offset) &&
+ qed_check_cluster_offset(s, end_offset);
+}
+
+static inline bool qed_offset_is_cluster_aligned(BDRVQEDState *s,
+ uint64_t offset)
+{
+ if (qed_offset_into_cluster(s, offset)) {
+ return false;
+ }
+ return true;
+}
+
+static inline bool qed_offset_is_unalloc_cluster(uint64_t offset)
+{
+ if (offset == 0) {
+ return true;
+ }
+ return false;
+}
+
+static inline bool qed_offset_is_zero_cluster(uint64_t offset)
+{
+ if (offset == 1) {
+ return true;
+ }
+ return false;
+}
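+
+/* The two helpers above encode the special L2 entry values: an entry of 0
+ * means the cluster is unallocated (reported by qed_find_cluster() as
+ * QED_CLUSTER_L2), an entry of 1 means the cluster reads as zeroes
+ * (QED_CLUSTER_ZERO), and any other value is taken as the cluster's byte
+ * offset in the image file.
+ */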
+
+#endif /* BLOCK_QED_H */
diff --git a/block/raw-posix-aio.h b/block/raw-posix-aio.h
new file mode 100644
index 000000000..ba118f616
--- /dev/null
+++ b/block/raw-posix-aio.h
@@ -0,0 +1,45 @@
+/*
+ * QEMU Posix block I/O backend AIO support
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+#ifndef QEMU_RAW_POSIX_AIO_H
+#define QEMU_RAW_POSIX_AIO_H
+
+/* AIO request types */
+#define QEMU_AIO_READ 0x0001
+#define QEMU_AIO_WRITE 0x0002
+#define QEMU_AIO_IOCTL 0x0004
+#define QEMU_AIO_FLUSH 0x0008
+#define QEMU_AIO_TYPE_MASK \
+ (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH)
+
+/* AIO flags */
+#define QEMU_AIO_MISALIGNED 0x1000
+
+
+/* posix-aio-compat.c - thread pool based implementation */
+int paio_init(void);
+BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque, int type);
+BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd,
+ unsigned long int req, void *buf,
+ BlockDriverCompletionFunc *cb, void *opaque);
+
+/* linux-aio.c - Linux native implementation */
+void *laio_init(void);
+BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque, int type);
+
+#endif /* QEMU_RAW_POSIX_AIO_H */
diff --git a/block/raw-posix.c b/block/raw-posix.c
new file mode 100644
index 000000000..6be20b192
--- /dev/null
+++ b/block/raw-posix.c
@@ -0,0 +1,1383 @@
+/*
+ * Block driver for RAW files (posix)
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "qemu-timer.h"
+#include "qemu-char.h"
+#include "qemu-log.h"
+#include "block_int.h"
+#include "module.h"
+#include "block/raw-posix-aio.h"
+
+#if defined(__APPLE__) && (__MACH__)
+#include <paths.h>
+#include <sys/param.h>
+#include <IOKit/IOKitLib.h>
+#include <IOKit/IOBSD.h>
+#include <IOKit/storage/IOMediaBSDClient.h>
+#include <IOKit/storage/IOMedia.h>
+#include <IOKit/storage/IOCDMedia.h>
+//#include <IOKit/storage/IOCDTypes.h>
+#include <CoreFoundation/CoreFoundation.h>
+#endif
+
+#ifdef __sun__
+#define _POSIX_PTHREAD_SEMANTICS 1
+#include <sys/dkio.h>
+#endif
+#ifdef __linux__
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/param.h>
+#include <linux/cdrom.h>
+#include <linux/fd.h>
+#include <linux/fs.h>
+#endif
+#ifdef CONFIG_FIEMAP
+#include <linux/fiemap.h>
+#endif
+#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
+#include <sys/disk.h>
+#include <sys/cdio.h>
+#endif
+
+#ifdef __OpenBSD__
+#include <sys/ioctl.h>
+#include <sys/disklabel.h>
+#include <sys/dkio.h>
+#endif
+
+#ifdef __NetBSD__
+#include <sys/ioctl.h>
+#include <sys/disklabel.h>
+#include <sys/dkio.h>
+#include <sys/disk.h>
+#endif
+
+#ifdef __DragonFly__
+#include <sys/ioctl.h>
+#include <sys/diskslice.h>
+#endif
+
+#ifdef CONFIG_XFS
+#include <xfs/xfs.h>
+#endif
+
+//#define DEBUG_FLOPPY
+
+//#define DEBUG_BLOCK
+#if defined(DEBUG_BLOCK)
+#define DEBUG_BLOCK_PRINT(formatCstr, ...) do { if (qemu_log_enabled()) \
+ { qemu_log(formatCstr, ## __VA_ARGS__); qemu_log_flush(); } } while (0)
+#else
+#define DEBUG_BLOCK_PRINT(formatCstr, ...)
+#endif
+
+/* OS X does not have O_DSYNC */
+#ifndef O_DSYNC
+#ifdef O_SYNC
+#define O_DSYNC O_SYNC
+#elif defined(O_FSYNC)
+#define O_DSYNC O_FSYNC
+#endif
+#endif
+
+/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
+#ifndef O_DIRECT
+#define O_DIRECT O_DSYNC
+#endif
+
+#define FTYPE_FILE 0
+#define FTYPE_CD 1
+#define FTYPE_FD 2
+
+/* if the FD is not accessed during that time (in ns), we try to
+ reopen it to see if the disk has been changed */
+#define FD_OPEN_TIMEOUT (1000000000)
+
+#define MAX_BLOCKSIZE 4096
+
+typedef struct BDRVRawState {
+ int fd;
+ int type;
+ int open_flags;
+#if defined(__linux__)
+ /* linux floppy specific */
+ int64_t fd_open_time;
+ int64_t fd_error_time;
+ int fd_got_error;
+ int fd_media_changed;
+#endif
+#ifdef CONFIG_LINUX_AIO
+ int use_aio;
+ void *aio_ctx;
+#endif
+ uint8_t *aligned_buf;
+ unsigned aligned_buf_size;
+#ifdef CONFIG_XFS
+ bool is_xfs : 1;
+#endif
+} BDRVRawState;
+
+static int fd_open(BlockDriverState *bs);
+static int64_t raw_getlength(BlockDriverState *bs);
+
+#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+static int cdrom_reopen(BlockDriverState *bs);
+#endif
+
+#if defined(__NetBSD__)
+static int raw_normalize_devicepath(const char **filename)
+{
+ static char namebuf[PATH_MAX];
+ const char *dp, *fname;
+ struct stat sb;
+
+ fname = *filename;
+ dp = strrchr(fname, '/');
+ if (lstat(fname, &sb) < 0) {
+ fprintf(stderr, "%s: stat failed: %s\n",
+ fname, strerror(errno));
+ return -errno;
+ }
+
+ if (!S_ISBLK(sb.st_mode)) {
+ return 0;
+ }
+
+ if (dp == NULL) {
+ snprintf(namebuf, PATH_MAX, "r%s", fname);
+ } else {
+ snprintf(namebuf, PATH_MAX, "%.*s/r%s",
+ (int)(dp - fname), fname, dp + 1);
+ }
+ fprintf(stderr, "%s is a block device", fname);
+ *filename = namebuf;
+ fprintf(stderr, ", using %s\n", *filename);
+
+ return 0;
+}
+#else
+static int raw_normalize_devicepath(const char **filename)
+{
+ return 0;
+}
+#endif
+
+static int raw_open_common(BlockDriverState *bs, const char *filename,
+ int bdrv_flags, int open_flags)
+{
+ BDRVRawState *s = bs->opaque;
+ int fd, ret;
+
+ ret = raw_normalize_devicepath(&filename);
+ if (ret != 0) {
+ return ret;
+ }
+
+ s->open_flags = open_flags | O_BINARY;
+ s->open_flags &= ~O_ACCMODE;
+ if (bdrv_flags & BDRV_O_RDWR) {
+ s->open_flags |= O_RDWR;
+ } else {
+ s->open_flags |= O_RDONLY;
+ }
+
+ /* Use O_DSYNC for write-through caching, no flags for write-back caching,
+ * and O_DIRECT for no caching. */
+ if ((bdrv_flags & BDRV_O_NOCACHE))
+ s->open_flags |= O_DIRECT;
+ if (!(bdrv_flags & BDRV_O_CACHE_WB))
+ s->open_flags |= O_DSYNC;
+
+ s->fd = -1;
+ fd = qemu_open(filename, s->open_flags, 0644);
+ if (fd < 0) {
+ ret = -errno;
+ if (ret == -EROFS)
+ ret = -EACCES;
+ return ret;
+ }
+ s->fd = fd;
+ s->aligned_buf = NULL;
+
+ if ((bdrv_flags & BDRV_O_NOCACHE)) {
+ /*
+ * Allocate a buffer for read/modify/write cycles. Choose the size
+ * pessimistically as we don't know the block size yet.
+ */
+ s->aligned_buf_size = 32 * MAX_BLOCKSIZE;
+ s->aligned_buf = qemu_memalign(MAX_BLOCKSIZE, s->aligned_buf_size);
+ if (s->aligned_buf == NULL) {
+ goto out_close;
+ }
+ }
+
+ /* We're falling back to POSIX AIO in some cases so init always */
+ if (paio_init() < 0) {
+ goto out_free_buf;
+ }
+
+#ifdef CONFIG_LINUX_AIO
+ /*
+ * Currently Linux does AIO only for files opened with O_DIRECT
+ * specified, so check the NOCACHE flag too
+ */
+ if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
+ (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
+
+ s->aio_ctx = laio_init();
+ if (!s->aio_ctx) {
+ goto out_free_buf;
+ }
+ s->use_aio = 1;
+ } else
+#endif
+ {
+#ifdef CONFIG_LINUX_AIO
+ s->use_aio = 0;
+#endif
+ }
+
+#ifdef CONFIG_XFS
+ if (platform_test_xfs_fd(s->fd)) {
+ s->is_xfs = 1;
+ }
+#endif
+
+ return 0;
+
+out_free_buf:
+ qemu_vfree(s->aligned_buf);
+out_close:
+ qemu_close(fd);
+ return -errno;
+}
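+
+/* Summary of the cache flag mapping in raw_open_common(): BDRV_O_NOCACHE adds
+ * O_DIRECT (bypass the host page cache) and a missing BDRV_O_CACHE_WB adds
+ * O_DSYNC (write-through), so, for example, cache=none results in O_DIRECT
+ * while cache=writethrough results in O_DSYNC.
+ */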
+
+static int raw_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ BDRVRawState *s = bs->opaque;
+
+ s->type = FTYPE_FILE;
+ return raw_open_common(bs, filename, flags, 0);
+}
+
+/* XXX: use host sector size if necessary with:
+#ifdef DIOCGSECTORSIZE
+ {
+ unsigned int sectorsize = 512;
+ if (!ioctl(fd, DIOCGSECTORSIZE, &sectorsize) &&
+ sectorsize > bufsize)
+ bufsize = sectorsize;
+ }
+#endif
+#ifdef CONFIG_COCOA
+ uint32_t blockSize = 512;
+ if ( !ioctl( fd, DKIOCGETBLOCKSIZE, &blockSize ) && blockSize > bufsize) {
+ bufsize = blockSize;
+ }
+#endif
+*/
+
+/*
+ * Check if all memory in this vector is sector aligned.
+ */
+static int qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
+{
+ int i;
+
+ for (i = 0; i < qiov->niov; i++) {
+ if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque, int type)
+{
+ BDRVRawState *s = bs->opaque;
+
+ if (fd_open(bs) < 0)
+ return NULL;
+
+ /*
+ * If O_DIRECT is used the buffer needs to be aligned on a sector
+ * boundary. Check if this is the case or tell the low-level
+ * driver that it needs to copy the buffer.
+ */
+ if (s->aligned_buf) {
+ if (!qiov_is_aligned(bs, qiov)) {
+ type |= QEMU_AIO_MISALIGNED;
+#ifdef CONFIG_LINUX_AIO
+ } else if (s->use_aio) {
+ return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov,
+ nb_sectors, cb, opaque, type);
+#endif
+ }
+ }
+
+ return paio_submit(bs, s->fd, sector_num, qiov, nb_sectors,
+ cb, opaque, type);
+}
+
+static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
+ cb, opaque, QEMU_AIO_READ);
+}
+
+static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
+ cb, opaque, QEMU_AIO_WRITE);
+}
+
+static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BDRVRawState *s = bs->opaque;
+
+ if (fd_open(bs) < 0)
+ return NULL;
+
+ return paio_submit(bs, s->fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH);
+}
+
+static void raw_close(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+ if (s->fd >= 0) {
+ qemu_close(s->fd);
+ s->fd = -1;
+ if (s->aligned_buf != NULL)
+ qemu_vfree(s->aligned_buf);
+ }
+}
+
+static int raw_truncate(BlockDriverState *bs, int64_t offset)
+{
+ BDRVRawState *s = bs->opaque;
+ struct stat st;
+
+ if (fstat(s->fd, &st)) {
+ return -errno;
+ }
+
+ if (S_ISREG(st.st_mode)) {
+ if (ftruncate(s->fd, offset) < 0) {
+ return -errno;
+ }
+ } else if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
+ if (offset > raw_getlength(bs)) {
+ return -EINVAL;
+ }
+ } else {
+ return -ENOTSUP;
+ }
+
+ return 0;
+}
+
+#ifdef __OpenBSD__
+static int64_t raw_getlength(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+ int fd = s->fd;
+ struct stat st;
+
+ if (fstat(fd, &st))
+ return -1;
+ if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
+ struct disklabel dl;
+
+ if (ioctl(fd, DIOCGDINFO, &dl))
+ return -1;
+ return (uint64_t)dl.d_secsize *
+ dl.d_partitions[DISKPART(st.st_rdev)].p_size;
+ } else
+ return st.st_size;
+}
+#elif defined(__NetBSD__)
+static int64_t raw_getlength(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+ int fd = s->fd;
+ struct stat st;
+
+ if (fstat(fd, &st))
+ return -1;
+ if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
+ struct dkwedge_info dkw;
+
+ if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
+ return dkw.dkw_size * 512;
+ } else {
+ struct disklabel dl;
+
+ if (ioctl(fd, DIOCGDINFO, &dl))
+ return -1;
+ return (uint64_t)dl.d_secsize *
+ dl.d_partitions[DISKPART(st.st_rdev)].p_size;
+ }
+ } else
+ return st.st_size;
+}
+#elif defined(__sun__)
+static int64_t raw_getlength(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+ struct dk_minfo minfo;
+ int ret;
+
+ ret = fd_open(bs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /*
+ * Use the DKIOCGMEDIAINFO ioctl to read the size.
+ */
+ ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
+ if (ret != -1) {
+ return minfo.dki_lbsize * minfo.dki_capacity;
+ }
+
+ /*
+ * There are reports that lseek on some devices fails, but
+ * irc discussion said that contingency on contingency was overkill.
+ */
+ return lseek(s->fd, 0, SEEK_END);
+}
+#elif defined(CONFIG_BSD)
+static int64_t raw_getlength(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+ int fd = s->fd;
+ int64_t size;
+ struct stat sb;
+#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
+ int reopened = 0;
+#endif
+ int ret;
+
+ ret = fd_open(bs);
+ if (ret < 0)
+ return ret;
+
+#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
+again:
+#endif
+ if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
+#ifdef DIOCGMEDIASIZE
+ if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
+#elif defined(DIOCGPART)
+ {
+ struct partinfo pi;
+ if (ioctl(fd, DIOCGPART, &pi) == 0)
+ size = pi.media_size;
+ else
+ size = 0;
+ }
+ if (size == 0)
+#endif
+#if defined(__APPLE__) && defined(__MACH__)
+ size = LONG_LONG_MAX;
+#else
+ size = lseek(fd, 0LL, SEEK_END);
+#endif
+#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+ switch(s->type) {
+ case FTYPE_CD:
+ /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
+ if (size == 2048LL * (unsigned)-1)
+ size = 0;
+ /* XXX no disc? maybe we need to reopen... */
+ if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
+ reopened = 1;
+ goto again;
+ }
+ }
+#endif
+ } else {
+ size = lseek(fd, 0, SEEK_END);
+ }
+ return size;
+}
+#else
+static int64_t raw_getlength(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+ int ret;
+
+ ret = fd_open(bs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return lseek(s->fd, 0, SEEK_END);
+}
+#endif
+
+static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
+{
+ struct stat st;
+ BDRVRawState *s = bs->opaque;
+
+ if (fstat(s->fd, &st) < 0) {
+ return -errno;
+ }
+ return (int64_t)st.st_blocks * 512;
+}
+
+static int raw_create(const char *filename, QEMUOptionParameter *options)
+{
+ int fd;
+ int result = 0;
+ int64_t total_size = 0;
+
+ /* Read out options */
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ total_size = options->value.n / BDRV_SECTOR_SIZE;
+ }
+ options++;
+ }
+
+ fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
+ 0644);
+ if (fd < 0) {
+ result = -errno;
+ } else {
+ if (ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
+ result = -errno;
+ }
+ if (qemu_close(fd) != 0) {
+ result = -errno;
+ }
+ }
+ return result;
+}
+
+/*
+ * Returns true iff the specified sector is present in the disk image. Drivers
+ * not implementing the functionality are assumed to not support backing files,
+ * hence all their sectors are reported as allocated.
+ *
+ * If 'sector_num' is beyond the end of the disk image the return value is 0
+ * and 'pnum' is set to 0.
+ *
+ * 'pnum' is set to the number of sectors (including and immediately following
+ * the specified sector) that are known to be in the same
+ * allocated/unallocated state.
+ *
+ * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
+ * beyond the end of the disk image it will be clamped.
+ */
+static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ off_t start, data, hole;
+ int ret;
+
+ ret = fd_open(bs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ start = sector_num * BDRV_SECTOR_SIZE;
+
+#ifdef CONFIG_FIEMAP
+
+ BDRVRawState *s = bs->opaque;
+ struct {
+ struct fiemap fm;
+ struct fiemap_extent fe;
+ } f;
+
+ f.fm.fm_start = start;
+ f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE;
+ f.fm.fm_flags = 0;
+ f.fm.fm_extent_count = 1;
+ f.fm.fm_reserved = 0;
+ if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) {
+ /* Assume everything is allocated. */
+ *pnum = nb_sectors;
+ return 1;
+ }
+
+ if (f.fm.fm_mapped_extents == 0) {
+ /* No extents found, data is beyond f.fm.fm_start + f.fm.fm_length.
+ * f.fm.fm_start + f.fm.fm_length must be clamped to the file size!
+ */
+ off_t length = lseek(s->fd, 0, SEEK_END);
+ hole = f.fm.fm_start;
+ data = MIN(f.fm.fm_start + f.fm.fm_length, length);
+ } else {
+ data = f.fe.fe_logical;
+ hole = f.fe.fe_logical + f.fe.fe_length;
+ }
+
+#elif defined SEEK_HOLE && defined SEEK_DATA
+
+ BDRVRawState *s = bs->opaque;
+
+ hole = lseek(s->fd, start, SEEK_HOLE);
+ if (hole == -1) {
+ /* ENXIO indicates that sector_num was past the end of the file.
+ * There is a virtual hole there. */
+ assert(errno != ENXIO);
+
+ /* Most likely EINVAL. Assume everything is allocated. */
+ *pnum = nb_sectors;
+ return 1;
+ }
+
+ if (hole > start) {
+ data = start;
+ } else {
+ /* On a hole. We need another syscall to find its end. */
+ data = lseek(s->fd, start, SEEK_DATA);
+ if (data == -1) {
+ data = lseek(s->fd, 0, SEEK_END);
+ }
+ }
+#else
+ *pnum = nb_sectors;
+ return 1;
+#endif
+
+ if (data <= start) {
+ /* On a data extent, compute sectors to the end of the extent. */
+ *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE);
+ return 1;
+ } else {
+ /* On a hole, compute sectors to the beginning of the next extent. */
+ *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
+ return 0;
+ }
+}
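+
+/* Illustration of the contract above: on a file whose first 1 MB is data
+ * followed by a hole, raw_co_is_allocated(bs, 0, 4096, &pnum) would return 1
+ * with *pnum == 2048 (512-byte sectors up to the hole), while the same call
+ * starting at sector 2048 would return 0 with *pnum covering the sectors up
+ * to the next data extent (or end of file).
+ */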
+
+#ifdef CONFIG_XFS
+static int xfs_discard(BDRVRawState *s, int64_t sector_num, int nb_sectors)
+{
+ struct xfs_flock64 fl;
+
+ memset(&fl, 0, sizeof(fl));
+ fl.l_whence = SEEK_SET;
+ fl.l_start = sector_num << 9;
+ fl.l_len = (int64_t)nb_sectors << 9;
+
+ if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) {
+ DEBUG_BLOCK_PRINT("cannot punch hole (%s)\n", strerror(errno));
+ return -errno;
+ }
+
+ return 0;
+}
+#endif
+
+static coroutine_fn int raw_co_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+#ifdef CONFIG_XFS
+ BDRVRawState *s = bs->opaque;
+
+ if (s->is_xfs) {
+ return xfs_discard(s, sector_num, nb_sectors);
+ }
+#endif
+
+ return 0;
+}
+
+static QEMUOptionParameter raw_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ { NULL }
+};
+
+static BlockDriver bdrv_file = {
+ .format_name = "file",
+ .protocol_name = "file",
+ .instance_size = sizeof(BDRVRawState),
+ .bdrv_probe = NULL, /* no probe for protocols */
+ .bdrv_file_open = raw_open,
+ .bdrv_close = raw_close,
+ .bdrv_create = raw_create,
+ .bdrv_co_discard = raw_co_discard,
+ .bdrv_co_is_allocated = raw_co_is_allocated,
+
+ .bdrv_aio_readv = raw_aio_readv,
+ .bdrv_aio_writev = raw_aio_writev,
+ .bdrv_aio_flush = raw_aio_flush,
+
+ .bdrv_truncate = raw_truncate,
+ .bdrv_getlength = raw_getlength,
+ .bdrv_get_allocated_file_size
+ = raw_get_allocated_file_size,
+
+ .create_options = raw_create_options,
+};
+
+/***********************************************/
+/* host device */
+
+#if defined(__APPLE__) && defined(__MACH__)
+static kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator );
+static kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize );
+
+kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator )
+{
+ kern_return_t kernResult;
+ mach_port_t masterPort;
+ CFMutableDictionaryRef classesToMatch;
+
+ kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
+ if ( KERN_SUCCESS != kernResult ) {
+ printf( "IOMasterPort returned %d\n", kernResult );
+ }
+
+ classesToMatch = IOServiceMatching( kIOCDMediaClass );
+ if ( classesToMatch == NULL ) {
+ printf( "IOServiceMatching returned a NULL dictionary.\n" );
+ } else {
+ CFDictionarySetValue( classesToMatch, CFSTR( kIOMediaEjectableKey ), kCFBooleanTrue );
+ }
+ kernResult = IOServiceGetMatchingServices( masterPort, classesToMatch, mediaIterator );
+ if ( KERN_SUCCESS != kernResult )
+ {
+ printf( "IOServiceGetMatchingServices returned %d\n", kernResult );
+ }
+
+ return kernResult;
+}
+
+kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize )
+{
+ io_object_t nextMedia;
+ kern_return_t kernResult = KERN_FAILURE;
+ *bsdPath = '\0';
+ nextMedia = IOIteratorNext( mediaIterator );
+ if ( nextMedia )
+ {
+ CFTypeRef bsdPathAsCFString;
+ bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
+ if ( bsdPathAsCFString ) {
+ size_t devPathLength;
+ strcpy( bsdPath, _PATH_DEV );
+ strcat( bsdPath, "r" );
+ devPathLength = strlen( bsdPath );
+ if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
+ kernResult = KERN_SUCCESS;
+ }
+ CFRelease( bsdPathAsCFString );
+ }
+ IOObjectRelease( nextMedia );
+ }
+
+ return kernResult;
+}
+
+#endif
+
+static int hdev_probe_device(const char *filename)
+{
+ struct stat st;
+
+ /* allow a dedicated CD-ROM driver to match with a higher priority */
+ if (strstart(filename, "/dev/cdrom", NULL))
+ return 50;
+
+ if (stat(filename, &st) >= 0 &&
+ (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
+ return 100;
+ }
+
+ return 0;
+}
+
+static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ BDRVRawState *s = bs->opaque;
+
+#if defined(__APPLE__) && defined(__MACH__)
+ if (strstart(filename, "/dev/cdrom", NULL)) {
+ kern_return_t kernResult;
+ io_iterator_t mediaIterator;
+ char bsdPath[ MAXPATHLEN ];
+ int fd;
+
+ kernResult = FindEjectableCDMedia( &mediaIterator );
+ kernResult = GetBSDPath( mediaIterator, bsdPath, sizeof( bsdPath ) );
+
+ if ( bsdPath[ 0 ] != '\0' ) {
+ strcat(bsdPath,"s0");
+ /* some CDs don't have a partition 0 */
+ fd = qemu_open(bsdPath, O_RDONLY | O_BINARY | O_LARGEFILE);
+ if (fd < 0) {
+ bsdPath[strlen(bsdPath)-1] = '1';
+ } else {
+ qemu_close(fd);
+ }
+ filename = bsdPath;
+ }
+
+ if ( mediaIterator )
+ IOObjectRelease( mediaIterator );
+ }
+#endif
+
+ s->type = FTYPE_FILE;
+#if defined(__linux__)
+ {
+ char resolved_path[ MAXPATHLEN ], *temp;
+
+ temp = realpath(filename, resolved_path);
+ if (temp && strstart(temp, "/dev/sg", NULL)) {
+ bs->sg = 1;
+ }
+ }
+#endif
+
+ return raw_open_common(bs, filename, flags, 0);
+}
+
+#if defined(__linux__)
+/* Note: we do not have a reliable method to detect if the floppy is
+ present. The current method is to try to open the floppy at every
+ I/O and to keep it open for a few hundred milliseconds. */
+static int fd_open(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+ int last_media_present;
+
+ if (s->type != FTYPE_FD)
+ return 0;
+ last_media_present = (s->fd >= 0);
+ if (s->fd >= 0 &&
+ (get_clock() - s->fd_open_time) >= FD_OPEN_TIMEOUT) {
+ qemu_close(s->fd);
+ s->fd = -1;
+#ifdef DEBUG_FLOPPY
+ printf("Floppy closed\n");
+#endif
+ }
+ if (s->fd < 0) {
+ if (s->fd_got_error &&
+ (get_clock() - s->fd_error_time) < FD_OPEN_TIMEOUT) {
+#ifdef DEBUG_FLOPPY
+ printf("No floppy (open delayed)\n");
+#endif
+ return -EIO;
+ }
+ s->fd = qemu_open(bs->filename, s->open_flags & ~O_NONBLOCK);
+ if (s->fd < 0) {
+ s->fd_error_time = get_clock();
+ s->fd_got_error = 1;
+ if (last_media_present)
+ s->fd_media_changed = 1;
+#ifdef DEBUG_FLOPPY
+ printf("No floppy\n");
+#endif
+ return -EIO;
+ }
+#ifdef DEBUG_FLOPPY
+ printf("Floppy opened\n");
+#endif
+ }
+ if (!last_media_present)
+ s->fd_media_changed = 1;
+ s->fd_open_time = get_clock();
+ s->fd_got_error = 0;
+ return 0;
+}
+
+static int hdev_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+ BDRVRawState *s = bs->opaque;
+
+ return ioctl(s->fd, req, buf);
+}
+
+static BlockDriverAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
+ unsigned long int req, void *buf,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BDRVRawState *s = bs->opaque;
+
+ if (fd_open(bs) < 0)
+ return NULL;
+ return paio_ioctl(bs, s->fd, req, buf, cb, opaque);
+}
+
+#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+static int fd_open(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+
+ /* this is just to ensure s->fd is sane (it's called by I/O ops) */
+ if (s->fd >= 0)
+ return 0;
+ return -EIO;
+}
+#else /* !linux && !FreeBSD */
+
+static int fd_open(BlockDriverState *bs)
+{
+ return 0;
+}
+
+#endif /* !linux && !FreeBSD */
+
+static int hdev_create(const char *filename, QEMUOptionParameter *options)
+{
+ int fd;
+ int ret = 0;
+ struct stat stat_buf;
+ int64_t total_size = 0;
+
+ /* Read out options */
+ while (options && options->name) {
+ if (!strcmp(options->name, "size")) {
+ total_size = options->value.n / BDRV_SECTOR_SIZE;
+ }
+ options++;
+ }
+
+ fd = qemu_open(filename, O_WRONLY | O_BINARY);
+ if (fd < 0)
+ return -errno;
+
+ if (fstat(fd, &stat_buf) < 0)
+ ret = -errno;
+ else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode))
+ ret = -ENODEV;
+ else if (lseek(fd, 0, SEEK_END) < total_size * BDRV_SECTOR_SIZE)
+ ret = -ENOSPC;
+
+ qemu_close(fd);
+ return ret;
+}
+
+static int hdev_has_zero_init(BlockDriverState *bs)
+{
+ return 0;
+}
+
+static BlockDriver bdrv_host_device = {
+ .format_name = "host_device",
+ .protocol_name = "host_device",
+ .instance_size = sizeof(BDRVRawState),
+ .bdrv_probe_device = hdev_probe_device,
+ .bdrv_file_open = hdev_open,
+ .bdrv_close = raw_close,
+ .bdrv_create = hdev_create,
+ .create_options = raw_create_options,
+ .bdrv_has_zero_init = hdev_has_zero_init,
+
+ .bdrv_aio_readv = raw_aio_readv,
+ .bdrv_aio_writev = raw_aio_writev,
+ .bdrv_aio_flush = raw_aio_flush,
+
+ .bdrv_truncate = raw_truncate,
+ .bdrv_getlength = raw_getlength,
+ .bdrv_get_allocated_file_size
+ = raw_get_allocated_file_size,
+
+ /* generic scsi device */
+#ifdef __linux__
+ .bdrv_ioctl = hdev_ioctl,
+ .bdrv_aio_ioctl = hdev_aio_ioctl,
+#endif
+};
+
+#ifdef __linux__
+static int floppy_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ BDRVRawState *s = bs->opaque;
+ int ret;
+
+ s->type = FTYPE_FD;
+
+ /* open will not fail even if no floppy is inserted, so add O_NONBLOCK */
+ ret = raw_open_common(bs, filename, flags, O_NONBLOCK);
+ if (ret)
+ return ret;
+
+ /* close fd so that we can reopen it as needed */
+ qemu_close(s->fd);
+ s->fd = -1;
+ s->fd_media_changed = 1;
+
+ return 0;
+}
+
+static int floppy_probe_device(const char *filename)
+{
+ int fd, ret;
+ int prio = 0;
+ struct floppy_struct fdparam;
+ struct stat st;
+
+ if (strstart(filename, "/dev/fd", NULL) &&
+ !strstart(filename, "/dev/fdset/", NULL)) {
+ prio = 50;
+ }
+
+ fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
+ if (fd < 0) {
+ goto out;
+ }
+ ret = fstat(fd, &st);
+ if (ret == -1 || !S_ISBLK(st.st_mode)) {
+ goto outc;
+ }
+
+ /* Attempt to detect via a floppy specific ioctl */
+ ret = ioctl(fd, FDGETPRM, &fdparam);
+ if (ret >= 0)
+ prio = 100;
+
+outc:
+ qemu_close(fd);
+out:
+ return prio;
+}
+
+
+static int floppy_is_inserted(BlockDriverState *bs)
+{
+ return fd_open(bs) >= 0;
+}
+
+static int floppy_media_changed(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+ int ret;
+
+ /*
+ * XXX: we do not have a true media changed indication.
+ * It does not work if the floppy is changed without trying to read it.
+ */
+ fd_open(bs);
+ ret = s->fd_media_changed;
+ s->fd_media_changed = 0;
+#ifdef DEBUG_FLOPPY
+ printf("Floppy changed=%d\n", ret);
+#endif
+ return ret;
+}
+
+static void floppy_eject(BlockDriverState *bs, bool eject_flag)
+{
+ BDRVRawState *s = bs->opaque;
+ int fd;
+
+ if (s->fd >= 0) {
+ qemu_close(s->fd);
+ s->fd = -1;
+ }
+ fd = qemu_open(bs->filename, s->open_flags | O_NONBLOCK);
+ if (fd >= 0) {
+ if (ioctl(fd, FDEJECT, 0) < 0)
+ perror("FDEJECT");
+ qemu_close(fd);
+ }
+}
+
+static BlockDriver bdrv_host_floppy = {
+ .format_name = "host_floppy",
+ .protocol_name = "host_floppy",
+ .instance_size = sizeof(BDRVRawState),
+ .bdrv_probe_device = floppy_probe_device,
+ .bdrv_file_open = floppy_open,
+ .bdrv_close = raw_close,
+ .bdrv_create = hdev_create,
+ .create_options = raw_create_options,
+ .bdrv_has_zero_init = hdev_has_zero_init,
+
+ .bdrv_aio_readv = raw_aio_readv,
+ .bdrv_aio_writev = raw_aio_writev,
+ .bdrv_aio_flush = raw_aio_flush,
+
+ .bdrv_truncate = raw_truncate,
+ .bdrv_getlength = raw_getlength,
+ .bdrv_get_allocated_file_size
+ = raw_get_allocated_file_size,
+
+ /* removable device support */
+ .bdrv_is_inserted = floppy_is_inserted,
+ .bdrv_media_changed = floppy_media_changed,
+ .bdrv_eject = floppy_eject,
+};
+
+static int cdrom_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ BDRVRawState *s = bs->opaque;
+
+ s->type = FTYPE_CD;
+
+ /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
+ return raw_open_common(bs, filename, flags, O_NONBLOCK);
+}
+
+static int cdrom_probe_device(const char *filename)
+{
+ int fd, ret;
+ int prio = 0;
+ struct stat st;
+
+ fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
+ if (fd < 0) {
+ goto out;
+ }
+ ret = fstat(fd, &st);
+ if (ret == -1 || !S_ISBLK(st.st_mode)) {
+ goto outc;
+ }
+
+ /* Attempt to detect via a CDROM specific ioctl */
+ ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
+ if (ret >= 0)
+ prio = 100;
+
+outc:
+ qemu_close(fd);
+out:
+ return prio;
+}
+
+static int cdrom_is_inserted(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+ int ret;
+
+ ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
+ if (ret == CDS_DISC_OK)
+ return 1;
+ return 0;
+}
+
+static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
+{
+ BDRVRawState *s = bs->opaque;
+
+ if (eject_flag) {
+ if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
+ perror("CDROMEJECT");
+ } else {
+ if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
+ perror("CDROMEJECT");
+ }
+}
+
+static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
+{
+ BDRVRawState *s = bs->opaque;
+
+ if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
+ /*
+ * Note: an error can happen if the distribution automatically
+ * mounts the CD-ROM
+ */
+ /* perror("CDROM_LOCKDOOR"); */
+ }
+}
+
+static BlockDriver bdrv_host_cdrom = {
+ .format_name = "host_cdrom",
+ .protocol_name = "host_cdrom",
+ .instance_size = sizeof(BDRVRawState),
+ .bdrv_probe_device = cdrom_probe_device,
+ .bdrv_file_open = cdrom_open,
+ .bdrv_close = raw_close,
+ .bdrv_create = hdev_create,
+ .create_options = raw_create_options,
+ .bdrv_has_zero_init = hdev_has_zero_init,
+
+ .bdrv_aio_readv = raw_aio_readv,
+ .bdrv_aio_writev = raw_aio_writev,
+ .bdrv_aio_flush = raw_aio_flush,
+
+ .bdrv_truncate = raw_truncate,
+ .bdrv_getlength = raw_getlength,
+ .bdrv_get_allocated_file_size
+ = raw_get_allocated_file_size,
+
+ /* removable device support */
+ .bdrv_is_inserted = cdrom_is_inserted,
+ .bdrv_eject = cdrom_eject,
+ .bdrv_lock_medium = cdrom_lock_medium,
+
+ /* generic scsi device */
+ .bdrv_ioctl = hdev_ioctl,
+ .bdrv_aio_ioctl = hdev_aio_ioctl,
+};
+#endif /* __linux__ */
+
+#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
+static int cdrom_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ BDRVRawState *s = bs->opaque;
+ int ret;
+
+ s->type = FTYPE_CD;
+
+ ret = raw_open_common(bs, filename, flags, 0);
+ if (ret)
+ return ret;
+
+ /* make sure the door isn't locked at this time */
+ ioctl(s->fd, CDIOCALLOW);
+ return 0;
+}
+
+static int cdrom_probe_device(const char *filename)
+{
+ if (strstart(filename, "/dev/cd", NULL) ||
+ strstart(filename, "/dev/acd", NULL))
+ return 100;
+ return 0;
+}
+
+static int cdrom_reopen(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+ int fd;
+
+ /*
+ * Force reread of possibly changed/newly loaded disc,
+ * FreeBSD seems to not notice sometimes...
+ */
+ if (s->fd >= 0)
+ qemu_close(s->fd);
+ fd = qemu_open(bs->filename, s->open_flags, 0644);
+ if (fd < 0) {
+ s->fd = -1;
+ return -EIO;
+ }
+ s->fd = fd;
+
+ /* make sure the door isn't locked at this time */
+ ioctl(s->fd, CDIOCALLOW);
+ return 0;
+}
+
+static int cdrom_is_inserted(BlockDriverState *bs)
+{
+ return raw_getlength(bs) > 0;
+}
+
+static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
+{
+ BDRVRawState *s = bs->opaque;
+
+ if (s->fd < 0)
+ return;
+
+ (void) ioctl(s->fd, CDIOCALLOW);
+
+ if (eject_flag) {
+ if (ioctl(s->fd, CDIOCEJECT) < 0)
+ perror("CDIOCEJECT");
+ } else {
+ if (ioctl(s->fd, CDIOCCLOSE) < 0)
+ perror("CDIOCCLOSE");
+ }
+
+ cdrom_reopen(bs);
+}
+
+static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
+{
+ BDRVRawState *s = bs->opaque;
+
+ if (s->fd < 0)
+ return;
+ if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
+ /*
+ * Note: an error can happen if the distribution automatically
+ * mounts the CD-ROM
+ */
+ /* perror("CDROM_LOCKDOOR"); */
+ }
+}
+
+static BlockDriver bdrv_host_cdrom = {
+ .format_name = "host_cdrom",
+ .protocol_name = "host_cdrom",
+ .instance_size = sizeof(BDRVRawState),
+ .bdrv_probe_device = cdrom_probe_device,
+ .bdrv_file_open = cdrom_open,
+ .bdrv_close = raw_close,
+ .bdrv_create = hdev_create,
+ .create_options = raw_create_options,
+ .bdrv_has_zero_init = hdev_has_zero_init,
+
+ .bdrv_aio_readv = raw_aio_readv,
+ .bdrv_aio_writev = raw_aio_writev,
+ .bdrv_aio_flush = raw_aio_flush,
+
+ .bdrv_truncate = raw_truncate,
+ .bdrv_getlength = raw_getlength,
+ .bdrv_get_allocated_file_size
+ = raw_get_allocated_file_size,
+
+ /* removable device support */
+ .bdrv_is_inserted = cdrom_is_inserted,
+ .bdrv_eject = cdrom_eject,
+ .bdrv_lock_medium = cdrom_lock_medium,
+};
+#endif /* __FreeBSD__ */
+
+static void bdrv_file_init(void)
+{
+ /*
+ * Register all the drivers. Note that order is important, the driver
+ * registered last will get probed first.
+ */
+ bdrv_register(&bdrv_file);
+ bdrv_register(&bdrv_host_device);
+#ifdef __linux__
+ bdrv_register(&bdrv_host_floppy);
+ bdrv_register(&bdrv_host_cdrom);
+#endif
+#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+ bdrv_register(&bdrv_host_cdrom);
+#endif
+}
+
+block_init(bdrv_file_init);
diff --git a/block/raw-win32.c b/block/raw-win32.c
new file mode 100644
index 000000000..c56bf8337
--- /dev/null
+++ b/block/raw-win32.c
@@ -0,0 +1,431 @@
+/*
+ * Block driver for RAW files (win32)
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "qemu-timer.h"
+#include "block_int.h"
+#include "module.h"
+#include <windows.h>
+#include <winioctl.h>
+
+#define FTYPE_FILE 0
+#define FTYPE_CD 1
+#define FTYPE_HARDDISK 2
+
+typedef struct BDRVRawState {
+ HANDLE hfile;
+ int type;
+ char drive_path[16]; /* format: "d:\" */
+} BDRVRawState;
+
+int qemu_ftruncate64(int fd, int64_t length)
+{
+ LARGE_INTEGER li;
+ DWORD dw;
+ LONG high;
+ HANDLE h;
+ BOOL res;
+
+ if ((GetVersion() & 0x80000000UL) && (length >> 32) != 0)
+ return -1;
+
+ h = (HANDLE)_get_osfhandle(fd);
+
+ /* get the current position; ftruncate does not change it */
+ li.HighPart = 0;
+ li.LowPart = SetFilePointer (h, 0, &li.HighPart, FILE_CURRENT);
+ if (li.LowPart == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) {
+ return -1;
+ }
+
+ high = length >> 32;
+ dw = SetFilePointer(h, (DWORD) length, &high, FILE_BEGIN);
+ if (dw == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) {
+ return -1;
+ }
+ res = SetEndOfFile(h);
+
+ /* back to old position */
+ SetFilePointer(h, li.LowPart, &li.HighPart, FILE_BEGIN);
+ return res ? 0 : -1;
+}
+
+static int set_sparse(int fd)
+{
+ DWORD returned;
+ return (int) DeviceIoControl((HANDLE)_get_osfhandle(fd), FSCTL_SET_SPARSE,
+ NULL, 0, NULL, 0, &returned, NULL);
+}
+
+static int raw_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ BDRVRawState *s = bs->opaque;
+ int access_flags;
+ DWORD overlapped;
+
+ s->type = FTYPE_FILE;
+
+ if (flags & BDRV_O_RDWR) {
+ access_flags = GENERIC_READ | GENERIC_WRITE;
+ } else {
+ access_flags = GENERIC_READ;
+ }
+
+ overlapped = FILE_ATTRIBUTE_NORMAL;
+ if (flags & BDRV_O_NOCACHE)
+ overlapped |= FILE_FLAG_NO_BUFFERING;
+ if (!(flags & BDRV_O_CACHE_WB))
+ overlapped |= FILE_FLAG_WRITE_THROUGH;
+ s->hfile = CreateFile(filename, access_flags,
+ FILE_SHARE_READ, NULL,
+ OPEN_EXISTING, overlapped, NULL);
+ if (s->hfile == INVALID_HANDLE_VALUE) {
+ int err = GetLastError();
+
+ if (err == ERROR_ACCESS_DENIED)
+ return -EACCES;
+ return -1;
+ }
+ return 0;
+}
+
+static int raw_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ BDRVRawState *s = bs->opaque;
+ OVERLAPPED ov;
+ DWORD ret_count;
+ int ret;
+ int64_t offset = sector_num * 512;
+ int count = nb_sectors * 512;
+
+ memset(&ov, 0, sizeof(ov));
+ ov.Offset = offset;
+ ov.OffsetHigh = offset >> 32;
+ ret = ReadFile(s->hfile, buf, count, &ret_count, &ov);
+ if (!ret)
+ return ret_count;
+ if (ret_count == count)
+ ret_count = 0;
+ return ret_count;
+}
+
+static int raw_write(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ BDRVRawState *s = bs->opaque;
+ OVERLAPPED ov;
+ DWORD ret_count;
+ int ret;
+ int64_t offset = sector_num * 512;
+ int count = nb_sectors * 512;
+
+ memset(&ov, 0, sizeof(ov));
+ ov.Offset = offset;
+ ov.OffsetHigh = offset >> 32;
+ ret = WriteFile(s->hfile, buf, count, &ret_count, &ov);
+ if (!ret)
+ return ret_count;
+ if (ret_count == count)
+ ret_count = 0;
+ return ret_count;
+}
+
+static int raw_flush(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+ int ret;
+
+ ret = FlushFileBuffers(s->hfile);
+ if (ret == 0) {
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static void raw_close(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+ CloseHandle(s->hfile);
+}
+
+static int raw_truncate(BlockDriverState *bs, int64_t offset)
+{
+ BDRVRawState *s = bs->opaque;
+ LONG low, high;
+
+ low = offset;
+ high = offset >> 32;
+ if (!SetFilePointer(s->hfile, low, &high, FILE_BEGIN))
+ return -EIO;
+ if (!SetEndOfFile(s->hfile))
+ return -EIO;
+ return 0;
+}
+
+static int64_t raw_getlength(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+ LARGE_INTEGER l;
+ ULARGE_INTEGER available, total, total_free;
+ DISK_GEOMETRY_EX dg;
+ DWORD count;
+ BOOL status;
+
+ switch(s->type) {
+ case FTYPE_FILE:
+ l.LowPart = GetFileSize(s->hfile, (PDWORD)&l.HighPart);
+ if (l.LowPart == 0xffffffffUL && GetLastError() != NO_ERROR)
+ return -EIO;
+ break;
+ case FTYPE_CD:
+ if (!GetDiskFreeSpaceEx(s->drive_path, &available, &total, &total_free))
+ return -EIO;
+ l.QuadPart = total.QuadPart;
+ break;
+ case FTYPE_HARDDISK:
+ status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX,
+ NULL, 0, &dg, sizeof(dg), &count, NULL);
+ if (status != 0) {
+ l = dg.DiskSize;
+ }
+ break;
+ default:
+ return -EIO;
+ }
+ return l.QuadPart;
+}
+
+static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
+{
+ typedef DWORD (WINAPI * get_compressed_t)(const char *filename,
+ DWORD * high);
+ get_compressed_t get_compressed;
+ struct _stati64 st;
+ const char *filename = bs->filename;
+ /* WinNT supports GetCompressedFileSize to determine the allocated size */
+ get_compressed =
+ (get_compressed_t) GetProcAddress(GetModuleHandle("kernel32"),
+ "GetCompressedFileSizeA");
+ if (get_compressed) {
+ DWORD high, low;
+ low = get_compressed(filename, &high);
+ if (low != 0xFFFFFFFFlu || GetLastError() == NO_ERROR) {
+ return (((int64_t) high) << 32) + low;
+ }
+ }
+
+ if (_stati64(filename, &st) < 0) {
+ return -1;
+ }
+ return st.st_size;
+}
+
+static int raw_create(const char *filename, QEMUOptionParameter *options)
+{
+ int fd;
+ int64_t total_size = 0;
+
+ /* Read out options */
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ total_size = options->value.n / 512;
+ }
+ options++;
+ }
+
+ fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
+ 0644);
+ if (fd < 0)
+ return -EIO;
+ set_sparse(fd);
+ ftruncate(fd, total_size * 512);
+ qemu_close(fd);
+ return 0;
+}
+
+static QEMUOptionParameter raw_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ { NULL }
+};
+
+static BlockDriver bdrv_file = {
+ .format_name = "file",
+ .protocol_name = "file",
+ .instance_size = sizeof(BDRVRawState),
+ .bdrv_file_open = raw_open,
+ .bdrv_close = raw_close,
+ .bdrv_create = raw_create,
+
+ .bdrv_read = raw_read,
+ .bdrv_write = raw_write,
+ .bdrv_co_flush_to_disk = raw_flush,
+
+ .bdrv_truncate = raw_truncate,
+ .bdrv_getlength = raw_getlength,
+ .bdrv_get_allocated_file_size
+ = raw_get_allocated_file_size,
+
+ .create_options = raw_create_options,
+};
+
+/***********************************************/
+/* host device */
+
+static int find_cdrom(char *cdrom_name, int cdrom_name_size)
+{
+ char drives[256], *pdrv = drives;
+ UINT type;
+
+ memset(drives, 0, sizeof(drives));
+ GetLogicalDriveStrings(sizeof(drives), drives);
+ while(pdrv[0] != '\0') {
+ type = GetDriveType(pdrv);
+ switch(type) {
+ case DRIVE_CDROM:
+ snprintf(cdrom_name, cdrom_name_size, "\\\\.\\%c:", pdrv[0]);
+ return 0;
+ break;
+ }
+ pdrv += lstrlen(pdrv) + 1;
+ }
+ return -1;
+}
+
+static int find_device_type(BlockDriverState *bs, const char *filename)
+{
+ BDRVRawState *s = bs->opaque;
+ UINT type;
+ const char *p;
+
+ if (strstart(filename, "\\\\.\\", &p) ||
+ strstart(filename, "//./", &p)) {
+ if (stristart(p, "PhysicalDrive", NULL))
+ return FTYPE_HARDDISK;
+ snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", p[0]);
+ type = GetDriveType(s->drive_path);
+ switch (type) {
+ case DRIVE_REMOVABLE:
+ case DRIVE_FIXED:
+ return FTYPE_HARDDISK;
+ case DRIVE_CDROM:
+ return FTYPE_CD;
+ default:
+ return FTYPE_FILE;
+ }
+ } else {
+ return FTYPE_FILE;
+ }
+}
+
+static int hdev_probe_device(const char *filename)
+{
+ if (strstart(filename, "/dev/cdrom", NULL))
+ return 100;
+ if (is_windows_drive(filename))
+ return 100;
+ return 0;
+}
+
+static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ BDRVRawState *s = bs->opaque;
+ int access_flags, create_flags;
+ DWORD overlapped;
+ char device_name[64];
+
+ if (strstart(filename, "/dev/cdrom", NULL)) {
+ if (find_cdrom(device_name, sizeof(device_name)) < 0)
+ return -ENOENT;
+ filename = device_name;
+ } else {
+ /* transform drive letters into a device name */
+ if (((filename[0] >= 'a' && filename[0] <= 'z') ||
+ (filename[0] >= 'A' && filename[0] <= 'Z')) &&
+ filename[1] == ':' && filename[2] == '\0') {
+ snprintf(device_name, sizeof(device_name), "\\\\.\\%c:", filename[0]);
+ filename = device_name;
+ }
+ }
+ s->type = find_device_type(bs, filename);
+
+ if (flags & BDRV_O_RDWR) {
+ access_flags = GENERIC_READ | GENERIC_WRITE;
+ } else {
+ access_flags = GENERIC_READ;
+ }
+ create_flags = OPEN_EXISTING;
+
+ overlapped = FILE_ATTRIBUTE_NORMAL;
+ if (flags & BDRV_O_NOCACHE)
+ overlapped |= FILE_FLAG_NO_BUFFERING;
+ if (!(flags & BDRV_O_CACHE_WB))
+ overlapped |= FILE_FLAG_WRITE_THROUGH;
+ s->hfile = CreateFile(filename, access_flags,
+ FILE_SHARE_READ, NULL,
+ create_flags, overlapped, NULL);
+ if (s->hfile == INVALID_HANDLE_VALUE) {
+ int err = GetLastError();
+
+ if (err == ERROR_ACCESS_DENIED)
+ return -EACCES;
+ return -1;
+ }
+ return 0;
+}
+
+static int hdev_has_zero_init(BlockDriverState *bs)
+{
+ return 0;
+}
+
+static BlockDriver bdrv_host_device = {
+ .format_name = "host_device",
+ .protocol_name = "host_device",
+ .instance_size = sizeof(BDRVRawState),
+ .bdrv_probe_device = hdev_probe_device,
+ .bdrv_file_open = hdev_open,
+ .bdrv_close = raw_close,
+ .bdrv_has_zero_init = hdev_has_zero_init,
+
+ .bdrv_read = raw_read,
+ .bdrv_write = raw_write,
+ .bdrv_co_flush_to_disk = raw_flush,
+
+ .bdrv_getlength = raw_getlength,
+ .bdrv_get_allocated_file_size
+ = raw_get_allocated_file_size,
+};
+
+static void bdrv_file_init(void)
+{
+ bdrv_register(&bdrv_file);
+ bdrv_register(&bdrv_host_device);
+}
+
+block_init(bdrv_file_init);
diff --git a/block/raw.c b/block/raw.c
new file mode 100644
index 000000000..ff34ea41e
--- /dev/null
+++ b/block/raw.c
@@ -0,0 +1,145 @@
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+static int raw_open(BlockDriverState *bs, int flags)
+{
+ bs->sg = bs->file->sg;
+ return 0;
+}
+
+static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
+ return bdrv_co_readv(bs->file, sector_num, nb_sectors, qiov);
+}
+
+static int coroutine_fn raw_co_writev(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
+ return bdrv_co_writev(bs->file, sector_num, nb_sectors, qiov);
+}
+
+static void raw_close(BlockDriverState *bs)
+{
+}
+
+static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ return bdrv_co_is_allocated(bs->file, sector_num, nb_sectors, pnum);
+}
+
+static int64_t raw_getlength(BlockDriverState *bs)
+{
+ return bdrv_getlength(bs->file);
+}
+
+static int raw_truncate(BlockDriverState *bs, int64_t offset)
+{
+ return bdrv_truncate(bs->file, offset);
+}
+
+static int raw_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+ return 1; /* everything can be opened as a raw image */
+}
+
+static int coroutine_fn raw_co_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+ return bdrv_co_discard(bs->file, sector_num, nb_sectors);
+}
+
+static int raw_is_inserted(BlockDriverState *bs)
+{
+ return bdrv_is_inserted(bs->file);
+}
+
+static int raw_media_changed(BlockDriverState *bs)
+{
+ return bdrv_media_changed(bs->file);
+}
+
+static void raw_eject(BlockDriverState *bs, bool eject_flag)
+{
+ bdrv_eject(bs->file, eject_flag);
+}
+
+static void raw_lock_medium(BlockDriverState *bs, bool locked)
+{
+ bdrv_lock_medium(bs->file, locked);
+}
+
+static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+ return bdrv_ioctl(bs->file, req, buf);
+}
+
+static BlockDriverAIOCB *raw_aio_ioctl(BlockDriverState *bs,
+ unsigned long int req, void *buf,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ return bdrv_aio_ioctl(bs->file, req, buf, cb, opaque);
+}
+
+static int raw_create(const char *filename, QEMUOptionParameter *options)
+{
+ return bdrv_create_file(filename, options);
+}
+
+static QEMUOptionParameter raw_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ { NULL }
+};
+
+static int raw_has_zero_init(BlockDriverState *bs)
+{
+ return bdrv_has_zero_init(bs->file);
+}
+
+static BlockDriver bdrv_raw = {
+ .format_name = "raw",
+
+ /* It's really 0, but we need to make g_malloc() happy */
+ .instance_size = 1,
+
+ .bdrv_open = raw_open,
+ .bdrv_close = raw_close,
+
+ .bdrv_co_readv = raw_co_readv,
+ .bdrv_co_writev = raw_co_writev,
+ .bdrv_co_is_allocated = raw_co_is_allocated,
+ .bdrv_co_discard = raw_co_discard,
+
+ .bdrv_probe = raw_probe,
+ .bdrv_getlength = raw_getlength,
+ .bdrv_truncate = raw_truncate,
+
+ .bdrv_is_inserted = raw_is_inserted,
+ .bdrv_media_changed = raw_media_changed,
+ .bdrv_eject = raw_eject,
+ .bdrv_lock_medium = raw_lock_medium,
+
+ .bdrv_ioctl = raw_ioctl,
+ .bdrv_aio_ioctl = raw_aio_ioctl,
+
+ .bdrv_create = raw_create,
+ .create_options = raw_create_options,
+ .bdrv_has_zero_init = raw_has_zero_init,
+};
+
+static void bdrv_raw_init(void)
+{
+ bdrv_register(&bdrv_raw);
+}
+
+block_init(bdrv_raw_init);
diff --git a/block/rbd.c b/block/rbd.c
new file mode 100644
index 000000000..5a0f79fc8
--- /dev/null
+++ b/block/rbd.c
@@ -0,0 +1,970 @@
+/*
+ * QEMU Block driver for RADOS (Ceph)
+ *
+ * Copyright (C) 2010-2011 Christian Brunner <chb@muc.de>,
+ * Josh Durgin <josh.durgin@dreamhost.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include <inttypes.h>
+
+#include "qemu-common.h"
+#include "qemu-error.h"
+#include "block_int.h"
+
+#include <rbd/librbd.h>
+
+/*
+ * When specifying the image filename use:
+ *
+ * rbd:poolname/devicename[@snapshotname][:option1=value1[:option2=value2...]]
+ *
+ * poolname must be the name of an existing rados pool.
+ *
+ * devicename is the name of the rbd image.
+ *
+ * Each option given is used to configure rados, and may be any valid
+ * Ceph option, "id", or "conf".
+ *
+ * The "id" option indicates what user we should authenticate as to
+ * the Ceph cluster. If it is excluded we will use the Ceph default
+ * (normally 'admin').
+ *
+ * The "conf" option specifies a Ceph configuration file to read. If
+ * it is not specified, we will read from the default Ceph locations
+ * (e.g., /etc/ceph/ceph.conf). To avoid reading _any_ configuration
+ * file, specify conf=/dev/null.
+ *
+ * Configuration values containing :, @, or = can be escaped with a
+ * leading "\".
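+ *
+ * A hypothetical example (pool, image and snapshot names made up for
+ * illustration):
+ *
+ *   rbd:mypool/myimage@snap1:id=admin:conf=/etc/ceph/ceph.conf
+ *
+ * would open the image "myimage" in pool "mypool" at snapshot "snap1",
+ * authenticating as Ceph user "admin" and reading /etc/ceph/ceph.conf.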
+ */
+
+/* rbd_aio_discard added in 0.1.2 */
+#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 2)
+#define LIBRBD_SUPPORTS_DISCARD
+#else
+#undef LIBRBD_SUPPORTS_DISCARD
+#endif
+
+#define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
+
+#define RBD_MAX_CONF_NAME_SIZE 128
+#define RBD_MAX_CONF_VAL_SIZE 512
+#define RBD_MAX_CONF_SIZE 1024
+#define RBD_MAX_POOL_NAME_SIZE 128
+#define RBD_MAX_SNAP_NAME_SIZE 128
+#define RBD_MAX_SNAPS 100
+
+typedef enum {
+ RBD_AIO_READ,
+ RBD_AIO_WRITE,
+ RBD_AIO_DISCARD
+} RBDAIOCmd;
+
+typedef struct RBDAIOCB {
+ BlockDriverAIOCB common;
+ QEMUBH *bh;
+ int ret;
+ QEMUIOVector *qiov;
+ char *bounce;
+ RBDAIOCmd cmd;
+ int64_t sector_num;
+ int error;
+ struct BDRVRBDState *s;
+ int cancelled;
+} RBDAIOCB;
+
+typedef struct RADOSCB {
+ int rcbid;
+ RBDAIOCB *acb;
+ struct BDRVRBDState *s;
+ int done;
+ int64_t size;
+ char *buf;
+ int ret;
+} RADOSCB;
+
+#define RBD_FD_READ 0
+#define RBD_FD_WRITE 1
+
+typedef struct BDRVRBDState {
+ int fds[2];
+ rados_t cluster;
+ rados_ioctx_t io_ctx;
+ rbd_image_t image;
+ char name[RBD_MAX_IMAGE_NAME_SIZE];
+ int qemu_aio_count;
+ char *snap;
+ int event_reader_pos;
+ RADOSCB *event_rcb;
+} BDRVRBDState;
+
+static void rbd_aio_bh_cb(void *opaque);
+
+static int qemu_rbd_next_tok(char *dst, int dst_len,
+ char *src, char delim,
+ const char *name,
+ char **p)
+{
+ int l;
+ char *end;
+
+ *p = NULL;
+
+ if (delim != '\0') {
+ for (end = src; *end; ++end) {
+ if (*end == delim) {
+ break;
+ }
+ if (*end == '\\' && end[1] != '\0') {
+ end++;
+ }
+ }
+ if (*end == delim) {
+ *p = end + 1;
+ *end = '\0';
+ }
+ }
+ l = strlen(src);
+ if (l >= dst_len) {
+ error_report("%s too long", name);
+ return -EINVAL;
+ } else if (l == 0) {
+ error_report("%s too short", name);
+ return -EINVAL;
+ }
+
+ pstrcpy(dst, dst_len, src);
+
+ return 0;
+}
+
+static void qemu_rbd_unescape(char *src)
+{
+ char *p;
+
+ for (p = src; *src; ++src, ++p) {
+ if (*src == '\\' && src[1] != '\0') {
+ src++;
+ }
+ *p = *src;
+ }
+ *p = '\0';
+}
+
+static int qemu_rbd_parsename(const char *filename,
+ char *pool, int pool_len,
+ char *snap, int snap_len,
+ char *name, int name_len,
+ char *conf, int conf_len)
+{
+ const char *start;
+ char *p, *buf;
+ int ret;
+
+ if (!strstart(filename, "rbd:", &start)) {
+ return -EINVAL;
+ }
+
+ buf = g_strdup(start);
+ p = buf;
+ *snap = '\0';
+ *conf = '\0';
+
+ ret = qemu_rbd_next_tok(pool, pool_len, p, '/', "pool name", &p);
+ if (ret < 0 || !p) {
+ ret = -EINVAL;
+ goto done;
+ }
+ qemu_rbd_unescape(pool);
+
+ if (strchr(p, '@')) {
+ ret = qemu_rbd_next_tok(name, name_len, p, '@', "object name", &p);
+ if (ret < 0) {
+ goto done;
+ }
+ ret = qemu_rbd_next_tok(snap, snap_len, p, ':', "snap name", &p);
+ qemu_rbd_unescape(snap);
+ } else {
+ ret = qemu_rbd_next_tok(name, name_len, p, ':', "object name", &p);
+ }
+ qemu_rbd_unescape(name);
+ if (ret < 0 || !p) {
+ goto done;
+ }
+
+ ret = qemu_rbd_next_tok(conf, conf_len, p, '\0', "configuration", &p);
+
+done:
+ g_free(buf);
+ return ret;
+}
+
+static char *qemu_rbd_parse_clientname(const char *conf, char *clientname)
+{
+ const char *p = conf;
+
+ while (*p) {
+ int len;
+ const char *end = strchr(p, ':');
+
+ if (end) {
+ len = end - p;
+ } else {
+ len = strlen(p);
+ }
+
+ if (strncmp(p, "id=", 3) == 0) {
+ len -= 3;
+ strncpy(clientname, p + 3, len);
+ clientname[len] = '\0';
+ return clientname;
+ }
+ if (end == NULL) {
+ break;
+ }
+ p = end + 1;
+ }
+ return NULL;
+}
+
+static int qemu_rbd_set_conf(rados_t cluster, const char *conf)
+{
+ char *p, *buf;
+ char name[RBD_MAX_CONF_NAME_SIZE];
+ char value[RBD_MAX_CONF_VAL_SIZE];
+ int ret = 0;
+
+ buf = g_strdup(conf);
+ p = buf;
+
+ while (p) {
+ ret = qemu_rbd_next_tok(name, sizeof(name), p,
+ '=', "conf option name", &p);
+ if (ret < 0) {
+ break;
+ }
+ qemu_rbd_unescape(name);
+
+ if (!p) {
+ error_report("conf option %s has no value", name);
+ ret = -EINVAL;
+ break;
+ }
+
+ ret = qemu_rbd_next_tok(value, sizeof(value), p,
+ ':', "conf option value", &p);
+ if (ret < 0) {
+ break;
+ }
+ qemu_rbd_unescape(value);
+
+ if (strcmp(name, "conf") == 0) {
+ ret = rados_conf_read_file(cluster, value);
+ if (ret < 0) {
+ error_report("error reading conf file %s", value);
+ break;
+ }
+ } else if (strcmp(name, "id") == 0) {
+ /* ignore, this is parsed by qemu_rbd_parse_clientname() */
+ } else {
+ ret = rados_conf_set(cluster, name, value);
+ if (ret < 0) {
+ error_report("invalid conf option %s", name);
+ ret = -EINVAL;
+ break;
+ }
+ }
+ }
+
+ g_free(buf);
+ return ret;
+}
+
+static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options)
+{
+ int64_t bytes = 0;
+ int64_t objsize;
+ int obj_order = 0;
+ char pool[RBD_MAX_POOL_NAME_SIZE];
+ char name[RBD_MAX_IMAGE_NAME_SIZE];
+ char snap_buf[RBD_MAX_SNAP_NAME_SIZE];
+ char conf[RBD_MAX_CONF_SIZE];
+ char clientname_buf[RBD_MAX_CONF_SIZE];
+ char *clientname;
+ rados_t cluster;
+ rados_ioctx_t io_ctx;
+ int ret;
+
+ if (qemu_rbd_parsename(filename, pool, sizeof(pool),
+ snap_buf, sizeof(snap_buf),
+ name, sizeof(name),
+ conf, sizeof(conf)) < 0) {
+ return -EINVAL;
+ }
+
+ /* Read out options */
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ bytes = options->value.n;
+ } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
+ if (options->value.n) {
+ objsize = options->value.n;
+ if ((objsize - 1) & objsize) { /* not a power of 2? */
+ error_report("obj size needs to be power of 2");
+ return -EINVAL;
+ }
+ if (objsize < 4096) {
+ error_report("obj size too small");
+ return -EINVAL;
+ }
+ obj_order = ffs(objsize) - 1;
+ }
+ }
+ options++;
+ }
+
+ clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
+ if (rados_create(&cluster, clientname) < 0) {
+ error_report("error initializing");
+ return -EIO;
+ }
+
+ if (strstr(conf, "conf=") == NULL) {
+ /* try default location, but ignore failure */
+ rados_conf_read_file(cluster, NULL);
+ }
+
+ if (conf[0] != '\0' &&
+ qemu_rbd_set_conf(cluster, conf) < 0) {
+ error_report("error setting config options");
+ rados_shutdown(cluster);
+ return -EIO;
+ }
+
+ if (rados_connect(cluster) < 0) {
+ error_report("error connecting");
+ rados_shutdown(cluster);
+ return -EIO;
+ }
+
+ if (rados_ioctx_create(cluster, pool, &io_ctx) < 0) {
+ error_report("error opening pool %s", pool);
+ rados_shutdown(cluster);
+ return -EIO;
+ }
+
+ ret = rbd_create(io_ctx, name, bytes, &obj_order);
+ rados_ioctx_destroy(io_ctx);
+ rados_shutdown(cluster);
+
+ return ret;
+}
+
+/*
+ * This aio completion is being called from qemu_rbd_aio_event_reader()
+ * and runs in qemu context. It schedules a bh, but only if the aio
+ * was not cancelled beforehand.
+ */
+static void qemu_rbd_complete_aio(RADOSCB *rcb)
+{
+ RBDAIOCB *acb = rcb->acb;
+ int64_t r;
+
+ if (acb->cancelled) {
+ qemu_vfree(acb->bounce);
+ qemu_aio_release(acb);
+ goto done;
+ }
+
+ r = rcb->ret;
+
+ if (acb->cmd == RBD_AIO_WRITE ||
+ acb->cmd == RBD_AIO_DISCARD) {
+ if (r < 0) {
+ acb->ret = r;
+ acb->error = 1;
+ } else if (!acb->error) {
+ acb->ret = rcb->size;
+ }
+ } else {
+ if (r < 0) {
+ memset(rcb->buf, 0, rcb->size);
+ acb->ret = r;
+ acb->error = 1;
+ } else if (r < rcb->size) {
+ memset(rcb->buf + r, 0, rcb->size - r);
+ if (!acb->error) {
+ acb->ret = rcb->size;
+ }
+ } else if (!acb->error) {
+ acb->ret = r;
+ }
+ }
+ /* Note that acb->bh can be NULL in the case where the aio was cancelled */
+ acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
+ qemu_bh_schedule(acb->bh);
+done:
+ g_free(rcb);
+}
+
+/*
+ * aio fd read handler. It runs in the qemu context and calls the
+ * completion handling of completed rados aio operations.
+ */
+static void qemu_rbd_aio_event_reader(void *opaque)
+{
+ BDRVRBDState *s = opaque;
+
+ ssize_t ret;
+
+ do {
+ char *p = (char *)&s->event_rcb;
+
+ /* now read the rcb pointer that was sent from a non qemu thread */
+ ret = read(s->fds[RBD_FD_READ], p + s->event_reader_pos,
+ sizeof(s->event_rcb) - s->event_reader_pos);
+ if (ret > 0) {
+ s->event_reader_pos += ret;
+ if (s->event_reader_pos == sizeof(s->event_rcb)) {
+ s->event_reader_pos = 0;
+ qemu_rbd_complete_aio(s->event_rcb);
+ s->qemu_aio_count--;
+ }
+ }
+ } while (ret < 0 && errno == EINTR);
+}
+
+static int qemu_rbd_aio_flush_cb(void *opaque)
+{
+ BDRVRBDState *s = opaque;
+
+ return (s->qemu_aio_count > 0);
+}
+
+static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ BDRVRBDState *s = bs->opaque;
+ char pool[RBD_MAX_POOL_NAME_SIZE];
+ char snap_buf[RBD_MAX_SNAP_NAME_SIZE];
+ char conf[RBD_MAX_CONF_SIZE];
+ char clientname_buf[RBD_MAX_CONF_SIZE];
+ char *clientname;
+ int r;
+
+ if (qemu_rbd_parsename(filename, pool, sizeof(pool),
+ snap_buf, sizeof(snap_buf),
+ s->name, sizeof(s->name),
+ conf, sizeof(conf)) < 0) {
+ return -EINVAL;
+ }
+
+ clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
+ r = rados_create(&s->cluster, clientname);
+ if (r < 0) {
+ error_report("error initializing");
+ return r;
+ }
+
+ s->snap = NULL;
+ if (snap_buf[0] != '\0') {
+ s->snap = g_strdup(snap_buf);
+ }
+
+ /*
+ * Fall back to more conservative semantics if setting cache
+ * options fails. Ignore errors from setting rbd_cache because the
+ * only possible error is that the option does not exist, and
+ * librbd defaults to no caching. If write through caching cannot
+ * be set up, fall back to no caching.
+ */
+ if (flags & BDRV_O_NOCACHE) {
+ rados_conf_set(s->cluster, "rbd_cache", "false");
+ } else {
+ rados_conf_set(s->cluster, "rbd_cache", "true");
+ if (!(flags & BDRV_O_CACHE_WB)) {
+ r = rados_conf_set(s->cluster, "rbd_cache_max_dirty", "0");
+ if (r < 0) {
+ rados_conf_set(s->cluster, "rbd_cache", "false");
+ }
+ }
+ }
+
+ if (strstr(conf, "conf=") == NULL) {
+ /* try default location, but ignore failure */
+ rados_conf_read_file(s->cluster, NULL);
+ }
+
+ if (conf[0] != '\0') {
+ r = qemu_rbd_set_conf(s->cluster, conf);
+ if (r < 0) {
+ error_report("error setting config options");
+ goto failed_shutdown;
+ }
+ }
+
+ r = rados_connect(s->cluster);
+ if (r < 0) {
+ error_report("error connecting");
+ goto failed_shutdown;
+ }
+
+ r = rados_ioctx_create(s->cluster, pool, &s->io_ctx);
+ if (r < 0) {
+ error_report("error opening pool %s", pool);
+ goto failed_shutdown;
+ }
+
+ r = rbd_open(s->io_ctx, s->name, &s->image, s->snap);
+ if (r < 0) {
+ error_report("error reading header from %s", s->name);
+ goto failed_open;
+ }
+
+ bs->read_only = (s->snap != NULL);
+
+ s->event_reader_pos = 0;
+ r = qemu_pipe(s->fds);
+ if (r < 0) {
+ error_report("error opening eventfd");
+ goto failed;
+ }
+ fcntl(s->fds[0], F_SETFL, O_NONBLOCK);
+ fcntl(s->fds[1], F_SETFL, O_NONBLOCK);
+ qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], qemu_rbd_aio_event_reader,
+ NULL, qemu_rbd_aio_flush_cb, s);
+
+
+ return 0;
+
+failed:
+ rbd_close(s->image);
+failed_open:
+ rados_ioctx_destroy(s->io_ctx);
+failed_shutdown:
+ rados_shutdown(s->cluster);
+ g_free(s->snap);
+ return r;
+}
+
+static void qemu_rbd_close(BlockDriverState *bs)
+{
+ BDRVRBDState *s = bs->opaque;
+
+ close(s->fds[0]);
+ close(s->fds[1]);
+ qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], NULL, NULL, NULL, NULL);
+
+ rbd_close(s->image);
+ rados_ioctx_destroy(s->io_ctx);
+ g_free(s->snap);
+ rados_shutdown(s->cluster);
+}
+
+/*
+ * Cancel aio. Since we don't reference acb from non-qemu threads,
+ * it is safe to access it here.
+ */
+static void qemu_rbd_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+ RBDAIOCB *acb = (RBDAIOCB *) blockacb;
+ acb->cancelled = 1;
+}
+
+static AIOPool rbd_aio_pool = {
+ .aiocb_size = sizeof(RBDAIOCB),
+ .cancel = qemu_rbd_aio_cancel,
+};
+
+static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB *rcb)
+{
+ int ret = 0;
+ while (1) {
+ fd_set wfd;
+ int fd = s->fds[RBD_FD_WRITE];
+
+ /* send the op pointer to the qemu thread that is responsible
+ for the aio/op completion; the completion must be done in a qemu
+ thread context */
+ ret = write(fd, (void *)&rcb, sizeof(rcb));
+ if (ret >= 0) {
+ break;
+ }
+ if (errno == EINTR) {
+ continue;
+ }
+ if (errno != EAGAIN) {
+ break;
+ }
+
+ FD_ZERO(&wfd);
+ FD_SET(fd, &wfd);
+ do {
+ ret = select(fd + 1, NULL, &wfd, NULL, NULL);
+ } while (ret < 0 && errno == EINTR);
+ }
+
+ return ret;
+}
+
+/*
+ * This is the callback function for rbd_aio_read and _write
+ *
+ * Note: this function is called from a non-qemu thread, so
+ * we need to be careful about what we do here. Generally we only
+ * write to the block notification pipe, and do the rest of the
+ * io completion handling from qemu_rbd_aio_event_reader() which
+ * runs in a qemu context.
+ */
+static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb)
+{
+ int ret;
+ rcb->ret = rbd_aio_get_return_value(c);
+ rbd_aio_release(c);
+ ret = qemu_rbd_send_pipe(rcb->s, rcb);
+ if (ret < 0) {
+ error_report("failed writing to acb->s->fds");
+ g_free(rcb);
+ }
+}
+
+/* Callback when all queued rbd_aio requests are complete */
+
+static void rbd_aio_bh_cb(void *opaque)
+{
+ RBDAIOCB *acb = opaque;
+
+ if (acb->cmd == RBD_AIO_READ) {
+ qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
+ }
+ qemu_vfree(acb->bounce);
+ acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
+ qemu_bh_delete(acb->bh);
+ acb->bh = NULL;
+
+ qemu_aio_release(acb);
+}
+
+static int rbd_aio_discard_wrapper(rbd_image_t image,
+ uint64_t off,
+ uint64_t len,
+ rbd_completion_t comp)
+{
+#ifdef LIBRBD_SUPPORTS_DISCARD
+ return rbd_aio_discard(image, off, len, comp);
+#else
+ return -ENOTSUP;
+#endif
+}
+
+static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque,
+ RBDAIOCmd cmd)
+{
+ RBDAIOCB *acb;
+ RADOSCB *rcb;
+ rbd_completion_t c;
+ int64_t off, size;
+ char *buf;
+ int r;
+
+ BDRVRBDState *s = bs->opaque;
+
+ acb = qemu_aio_get(&rbd_aio_pool, bs, cb, opaque);
+ acb->cmd = cmd;
+ acb->qiov = qiov;
+ if (cmd == RBD_AIO_DISCARD) {
+ acb->bounce = NULL;
+ } else {
+ acb->bounce = qemu_blockalign(bs, qiov->size);
+ }
+ acb->ret = 0;
+ acb->error = 0;
+ acb->s = s;
+ acb->cancelled = 0;
+ acb->bh = NULL;
+
+ if (cmd == RBD_AIO_WRITE) {
+ qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
+ }
+
+ buf = acb->bounce;
+
+ off = sector_num * BDRV_SECTOR_SIZE;
+ size = nb_sectors * BDRV_SECTOR_SIZE;
+
+ s->qemu_aio_count++; /* All the RADOSCB */
+
+ rcb = g_malloc(sizeof(RADOSCB));
+ rcb->done = 0;
+ rcb->acb = acb;
+ rcb->buf = buf;
+ rcb->s = acb->s;
+ rcb->size = size;
+ r = rbd_aio_create_completion(rcb, (rbd_callback_t) rbd_finish_aiocb, &c);
+ if (r < 0) {
+ goto failed;
+ }
+
+ switch (cmd) {
+ case RBD_AIO_WRITE:
+ r = rbd_aio_write(s->image, off, size, buf, c);
+ break;
+ case RBD_AIO_READ:
+ r = rbd_aio_read(s->image, off, size, buf, c);
+ break;
+ case RBD_AIO_DISCARD:
+ r = rbd_aio_discard_wrapper(s->image, off, size, c);
+ break;
+ default:
+ r = -EINVAL;
+ }
+
+ if (r < 0) {
+ goto failed;
+ }
+
+ return &acb->common;
+
+failed:
+ g_free(rcb);
+ s->qemu_aio_count--;
+ qemu_aio_release(acb);
+ return NULL;
+}
+
+static BlockDriverAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
+ RBD_AIO_READ);
+}
+
+static BlockDriverAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
+ RBD_AIO_WRITE);
+}
+
+static int qemu_rbd_co_flush(BlockDriverState *bs)
+{
+#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 1)
+ /* rbd_flush added in 0.1.1 */
+ BDRVRBDState *s = bs->opaque;
+ return rbd_flush(s->image);
+#else
+ return 0;
+#endif
+}
+
+static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+ BDRVRBDState *s = bs->opaque;
+ rbd_image_info_t info;
+ int r;
+
+ r = rbd_stat(s->image, &info, sizeof(info));
+ if (r < 0) {
+ return r;
+ }
+
+ bdi->cluster_size = info.obj_size;
+ return 0;
+}
+
+static int64_t qemu_rbd_getlength(BlockDriverState *bs)
+{
+ BDRVRBDState *s = bs->opaque;
+ rbd_image_info_t info;
+ int r;
+
+ r = rbd_stat(s->image, &info, sizeof(info));
+ if (r < 0) {
+ return r;
+ }
+
+ return info.size;
+}
+
+static int qemu_rbd_truncate(BlockDriverState *bs, int64_t offset)
+{
+ BDRVRBDState *s = bs->opaque;
+ int r;
+
+ r = rbd_resize(s->image, offset);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+static int qemu_rbd_snap_create(BlockDriverState *bs,
+ QEMUSnapshotInfo *sn_info)
+{
+ BDRVRBDState *s = bs->opaque;
+ int r;
+
+ if (sn_info->name[0] == '\0') {
+ return -EINVAL; /* we need a name for rbd snapshots */
+ }
+
+ /*
+ * rbd snapshots use the name as the user-controlled unique identifier;
+ * we can't use the rbd snapid for that purpose, as it can't be set
+ */
+ if (sn_info->id_str[0] != '\0' &&
+ strcmp(sn_info->id_str, sn_info->name) != 0) {
+ return -EINVAL;
+ }
+
+ if (strlen(sn_info->name) >= sizeof(sn_info->id_str)) {
+ return -ERANGE;
+ }
+
+ r = rbd_snap_create(s->image, sn_info->name);
+ if (r < 0) {
+ error_report("failed to create snap: %s", strerror(-r));
+ return r;
+ }
+
+ return 0;
+}
+
+static int qemu_rbd_snap_remove(BlockDriverState *bs,
+ const char *snapshot_name)
+{
+ BDRVRBDState *s = bs->opaque;
+ int r;
+
+ r = rbd_snap_remove(s->image, snapshot_name);
+ return r;
+}
+
+static int qemu_rbd_snap_rollback(BlockDriverState *bs,
+ const char *snapshot_name)
+{
+ BDRVRBDState *s = bs->opaque;
+ int r;
+
+ r = rbd_snap_rollback(s->image, snapshot_name);
+ return r;
+}
+
+static int qemu_rbd_snap_list(BlockDriverState *bs,
+ QEMUSnapshotInfo **psn_tab)
+{
+ BDRVRBDState *s = bs->opaque;
+ QEMUSnapshotInfo *sn_info, *sn_tab = NULL;
+ int i, snap_count;
+ rbd_snap_info_t *snaps;
+ int max_snaps = RBD_MAX_SNAPS;
+
+ do {
+ snaps = g_malloc(sizeof(*snaps) * max_snaps);
+ snap_count = rbd_snap_list(s->image, snaps, &max_snaps);
+ if (snap_count < 0) {
+ g_free(snaps);
+ }
+ } while (snap_count == -ERANGE);
+
+ if (snap_count <= 0) {
+ goto done;
+ }
+
+ sn_tab = g_malloc0(snap_count * sizeof(QEMUSnapshotInfo));
+
+ for (i = 0; i < snap_count; i++) {
+ const char *snap_name = snaps[i].name;
+
+ sn_info = sn_tab + i;
+ pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), snap_name);
+ pstrcpy(sn_info->name, sizeof(sn_info->name), snap_name);
+
+ sn_info->vm_state_size = snaps[i].size;
+ sn_info->date_sec = 0;
+ sn_info->date_nsec = 0;
+ sn_info->vm_clock_nsec = 0;
+ }
+ rbd_snap_list_end(snaps);
+
+ done:
+ *psn_tab = sn_tab;
+ return snap_count;
+}
+
+#ifdef LIBRBD_SUPPORTS_DISCARD
+static BlockDriverAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ return rbd_start_aio(bs, sector_num, NULL, nb_sectors, cb, opaque,
+ RBD_AIO_DISCARD);
+}
+#endif
+
+static QEMUOptionParameter qemu_rbd_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ {
+ .name = BLOCK_OPT_CLUSTER_SIZE,
+ .type = OPT_SIZE,
+ .help = "RBD object size"
+ },
+ {NULL}
+};
+
+static BlockDriver bdrv_rbd = {
+ .format_name = "rbd",
+ .instance_size = sizeof(BDRVRBDState),
+ .bdrv_file_open = qemu_rbd_open,
+ .bdrv_close = qemu_rbd_close,
+ .bdrv_create = qemu_rbd_create,
+ .bdrv_get_info = qemu_rbd_getinfo,
+ .create_options = qemu_rbd_create_options,
+ .bdrv_getlength = qemu_rbd_getlength,
+ .bdrv_truncate = qemu_rbd_truncate,
+ .protocol_name = "rbd",
+
+ .bdrv_aio_readv = qemu_rbd_aio_readv,
+ .bdrv_aio_writev = qemu_rbd_aio_writev,
+ .bdrv_co_flush_to_disk = qemu_rbd_co_flush,
+
+#ifdef LIBRBD_SUPPORTS_DISCARD
+ .bdrv_aio_discard = qemu_rbd_aio_discard,
+#endif
+
+ .bdrv_snapshot_create = qemu_rbd_snap_create,
+ .bdrv_snapshot_delete = qemu_rbd_snap_remove,
+ .bdrv_snapshot_list = qemu_rbd_snap_list,
+ .bdrv_snapshot_goto = qemu_rbd_snap_rollback,
+};
+
+static void bdrv_rbd_init(void)
+{
+ bdrv_register(&bdrv_rbd);
+}
+
+block_init(bdrv_rbd_init);
diff --git a/block/sheepdog.c b/block/sheepdog.c
new file mode 100644
index 000000000..df4f44107
--- /dev/null
+++ b/block/sheepdog.c
@@ -0,0 +1,2083 @@
+/*
+ * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu-common.h"
+#include "qemu-error.h"
+#include "qemu_socket.h"
+#include "block_int.h"
+#include "bitops.h"
+
+#define SD_PROTO_VER 0x01
+
+#define SD_DEFAULT_ADDR "localhost"
+#define SD_DEFAULT_PORT "7000"
+
+#define SD_OP_CREATE_AND_WRITE_OBJ 0x01
+#define SD_OP_READ_OBJ 0x02
+#define SD_OP_WRITE_OBJ 0x03
+
+#define SD_OP_NEW_VDI 0x11
+#define SD_OP_LOCK_VDI 0x12
+#define SD_OP_RELEASE_VDI 0x13
+#define SD_OP_GET_VDI_INFO 0x14
+#define SD_OP_READ_VDIS 0x15
+#define SD_OP_FLUSH_VDI 0x16
+
+#define SD_FLAG_CMD_WRITE 0x01
+#define SD_FLAG_CMD_COW 0x02
+#define SD_FLAG_CMD_CACHE 0x04
+
+#define SD_RES_SUCCESS 0x00 /* Success */
+#define SD_RES_UNKNOWN 0x01 /* Unknown error */
+#define SD_RES_NO_OBJ 0x02 /* No object found */
+#define SD_RES_EIO 0x03 /* I/O error */
+#define SD_RES_VDI_EXIST 0x04 /* Vdi exists already */
+#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */
+#define SD_RES_SYSTEM_ERROR 0x06 /* System error */
+#define SD_RES_VDI_LOCKED 0x07 /* Vdi is locked */
+#define SD_RES_NO_VDI 0x08 /* No vdi found */
+#define SD_RES_NO_BASE_VDI 0x09 /* No base vdi found */
+#define SD_RES_VDI_READ 0x0A /* Cannot read requested vdi */
+#define SD_RES_VDI_WRITE 0x0B /* Cannot write requested vdi */
+#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */
+#define SD_RES_BASE_VDI_WRITE 0x0D /* Cannot write base vdi */
+#define SD_RES_NO_TAG 0x0E /* Requested tag is not found */
+#define SD_RES_STARTUP 0x0F /* Sheepdog is still starting up */
+#define SD_RES_VDI_NOT_LOCKED 0x10 /* Vdi is not locked */
+#define SD_RES_SHUTDOWN 0x11 /* Sheepdog is shutting down */
+#define SD_RES_NO_MEM 0x12 /* Cannot allocate memory */
+#define SD_RES_FULL_VDI 0x13 /* we already have the maximum vdis */
+#define SD_RES_VER_MISMATCH 0x14 /* Protocol version mismatch */
+#define SD_RES_NO_SPACE 0x15 /* Server has no room for new objects */
+#define SD_RES_WAIT_FOR_FORMAT 0x16 /* Waiting for a format operation */
+#define SD_RES_WAIT_FOR_JOIN 0x17 /* Waiting for other nodes joining */
+#define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */
+
+/*
+ * Object ID rules
+ *
+ * 0 - 19 (20 bits): data object space
+ * 20 - 31 (12 bits): reserved data object space
+ * 32 - 55 (24 bits): vdi object space
+ * 56 - 59 ( 4 bits): reserved vdi object space
+ * 60 - 63 ( 4 bits): object type identifier space
+ */
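+
+/*
+ * For illustration (made-up numbers): with this layout, vdi id 0x123456
+ * and data object index 7 give the data object id
+ * ((uint64_t)0x123456 << 32) | 7 == 0x0012345600000007, while the vdi
+ * object itself gets the vdi bit set: 0x8012345600000000 (see
+ * vid_to_data_oid() and vid_to_vdi_oid() below).
+ */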
+
+#define VDI_SPACE_SHIFT 32
+#define VDI_BIT (UINT64_C(1) << 63)
+#define VMSTATE_BIT (UINT64_C(1) << 62)
+#define MAX_DATA_OBJS (UINT64_C(1) << 20)
+#define MAX_CHILDREN 1024
+#define SD_MAX_VDI_LEN 256
+#define SD_MAX_VDI_TAG_LEN 256
+#define SD_NR_VDIS (1U << 24)
+#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
+#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
+#define SECTOR_SIZE 512
+
+#define SD_INODE_SIZE (sizeof(SheepdogInode))
+#define CURRENT_VDI_ID 0
+
+typedef struct SheepdogReq {
+ uint8_t proto_ver;
+ uint8_t opcode;
+ uint16_t flags;
+ uint32_t epoch;
+ uint32_t id;
+ uint32_t data_length;
+ uint32_t opcode_specific[8];
+} SheepdogReq;
+
+typedef struct SheepdogRsp {
+ uint8_t proto_ver;
+ uint8_t opcode;
+ uint16_t flags;
+ uint32_t epoch;
+ uint32_t id;
+ uint32_t data_length;
+ uint32_t result;
+ uint32_t opcode_specific[7];
+} SheepdogRsp;
+
+typedef struct SheepdogObjReq {
+ uint8_t proto_ver;
+ uint8_t opcode;
+ uint16_t flags;
+ uint32_t epoch;
+ uint32_t id;
+ uint32_t data_length;
+ uint64_t oid;
+ uint64_t cow_oid;
+ uint32_t copies;
+ uint32_t rsvd;
+ uint64_t offset;
+} SheepdogObjReq;
+
+typedef struct SheepdogObjRsp {
+ uint8_t proto_ver;
+ uint8_t opcode;
+ uint16_t flags;
+ uint32_t epoch;
+ uint32_t id;
+ uint32_t data_length;
+ uint32_t result;
+ uint32_t copies;
+ uint32_t pad[6];
+} SheepdogObjRsp;
+
+typedef struct SheepdogVdiReq {
+ uint8_t proto_ver;
+ uint8_t opcode;
+ uint16_t flags;
+ uint32_t epoch;
+ uint32_t id;
+ uint32_t data_length;
+ uint64_t vdi_size;
+ uint32_t base_vdi_id;
+ uint32_t copies;
+ uint32_t snapid;
+ uint32_t pad[3];
+} SheepdogVdiReq;
+
+typedef struct SheepdogVdiRsp {
+ uint8_t proto_ver;
+ uint8_t opcode;
+ uint16_t flags;
+ uint32_t epoch;
+ uint32_t id;
+ uint32_t data_length;
+ uint32_t result;
+ uint32_t rsvd;
+ uint32_t vdi_id;
+ uint32_t pad[5];
+} SheepdogVdiRsp;
+
+typedef struct SheepdogInode {
+ char name[SD_MAX_VDI_LEN];
+ char tag[SD_MAX_VDI_TAG_LEN];
+ uint64_t ctime;
+ uint64_t snap_ctime;
+ uint64_t vm_clock_nsec;
+ uint64_t vdi_size;
+ uint64_t vm_state_size;
+ uint16_t copy_policy;
+ uint8_t nr_copies;
+ uint8_t block_size_shift;
+ uint32_t snap_id;
+ uint32_t vdi_id;
+ uint32_t parent_vdi_id;
+ uint32_t child_vdi_id[MAX_CHILDREN];
+ uint32_t data_vdi_id[MAX_DATA_OBJS];
+} SheepdogInode;
+
+/*
+ * 64 bit FNV-1a non-zero initial basis
+ */
+#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)
+
+/*
+ * 64 bit Fowler/Noll/Vo FNV-1a hash code
+ */
+static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
+{
+ unsigned char *bp = buf;
+ unsigned char *be = bp + len;
+ while (bp < be) {
+ hval ^= (uint64_t) *bp++;
+ hval += (hval << 1) + (hval << 4) + (hval << 5) +
+ (hval << 7) + (hval << 8) + (hval << 40);
+ }
+ return hval;
+}
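+
+/*
+ * A minimal usage sketch (hypothetical call, "name" is assumed to be a
+ * NUL-terminated string):
+ *
+ *   uint64_t hval = fnv_64a_buf(name, strlen(name), FNV1A_64_INIT);
+ */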
+
+static inline int is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
+{
+ return inode->vdi_id == inode->data_vdi_id[idx];
+}
+
+static inline int is_data_obj(uint64_t oid)
+{
+ return !(VDI_BIT & oid);
+}
+
+static inline uint64_t data_oid_to_idx(uint64_t oid)
+{
+ return oid & (MAX_DATA_OBJS - 1);
+}
+
+static inline uint64_t vid_to_vdi_oid(uint32_t vid)
+{
+ return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
+}
+
+static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
+{
+ return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
+}
+
+static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
+{
+ return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
+}
+
+static inline int is_snapshot(struct SheepdogInode *inode)
+{
+ return !!inode->snap_ctime;
+}
+
+#undef dprintf
+#ifdef DEBUG_SDOG
+#define dprintf(fmt, args...) \
+ do { \
+ fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
+ } while (0)
+#else
+#define dprintf(fmt, args...)
+#endif
+
+typedef struct SheepdogAIOCB SheepdogAIOCB;
+
+typedef struct AIOReq {
+ SheepdogAIOCB *aiocb;
+ unsigned int iov_offset;
+
+ uint64_t oid;
+ uint64_t base_oid;
+ uint64_t offset;
+ unsigned int data_len;
+ uint8_t flags;
+ uint32_t id;
+
+ QLIST_ENTRY(AIOReq) aio_siblings;
+} AIOReq;
+
+enum AIOCBState {
+ AIOCB_WRITE_UDATA,
+ AIOCB_READ_UDATA,
+};
+
+struct SheepdogAIOCB {
+ BlockDriverAIOCB common;
+
+ QEMUIOVector *qiov;
+
+ int64_t sector_num;
+ int nb_sectors;
+
+ int ret;
+ enum AIOCBState aiocb_type;
+
+ Coroutine *coroutine;
+ void (*aio_done_func)(SheepdogAIOCB *);
+
+ int canceled;
+ int nr_pending;
+};
+
+typedef struct BDRVSheepdogState {
+ SheepdogInode inode;
+
+ uint32_t min_dirty_data_idx;
+ uint32_t max_dirty_data_idx;
+
+ char name[SD_MAX_VDI_LEN];
+ int is_snapshot;
+ uint8_t cache_enabled;
+
+ char *addr;
+ char *port;
+ int fd;
+ int flush_fd;
+
+ CoMutex lock;
+ Coroutine *co_send;
+ Coroutine *co_recv;
+
+ uint32_t aioreq_seq_num;
+ QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head;
+ QLIST_HEAD(pending_aio_head, AIOReq) pending_aio_head;
+} BDRVSheepdogState;
+
+static const char * sd_strerror(int err)
+{
+ int i;
+
+ static const struct {
+ int err;
+ const char *desc;
+ } errors[] = {
+ {SD_RES_SUCCESS, "Success"},
+ {SD_RES_UNKNOWN, "Unknown error"},
+ {SD_RES_NO_OBJ, "No object found"},
+ {SD_RES_EIO, "I/O error"},
+ {SD_RES_VDI_EXIST, "VDI exists already"},
+ {SD_RES_INVALID_PARMS, "Invalid parameters"},
+ {SD_RES_SYSTEM_ERROR, "System error"},
+ {SD_RES_VDI_LOCKED, "VDI is already locked"},
+ {SD_RES_NO_VDI, "No vdi found"},
+ {SD_RES_NO_BASE_VDI, "No base VDI found"},
+ {SD_RES_VDI_READ, "Failed read the requested VDI"},
+ {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
+ {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
+ {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
+ {SD_RES_NO_TAG, "Failed to find the requested tag"},
+ {SD_RES_STARTUP, "The system is still booting"},
+ {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
+ {SD_RES_SHUTDOWN, "The system is shutting down"},
+ {SD_RES_NO_MEM, "Out of memory on the server"},
+ {SD_RES_FULL_VDI, "We already have the maximum vdis"},
+ {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
+ {SD_RES_NO_SPACE, "Server has no space for new objects"},
+ {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
+ {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
+ {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
+ };
+
+ for (i = 0; i < ARRAY_SIZE(errors); ++i) {
+ if (errors[i].err == err) {
+ return errors[i].desc;
+ }
+ }
+
+ return "Invalid error code";
+}
+
+/*
+ * Sheepdog I/O handling:
+ *
+ * 1. In sd_co_rw_vector, we send the I/O requests to the server and
+ * link the requests to the inflight_list in the
+ * BDRVSheepdogState. The function exits without waiting for
+ * the responses to arrive.
+ *
+ * 2. We receive the response in aio_read_response, the fd handler to
+ * the sheepdog connection. If metadata update is needed, we send
+ * the write request to the vdi object in sd_write_done, the write
+ * completion function. We switch back to sd_co_readv/writev after
+ * all the requests belonging to the AIOCB are finished.
+ */
+
+static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
+ uint64_t oid, unsigned int data_len,
+ uint64_t offset, uint8_t flags,
+ uint64_t base_oid, unsigned int iov_offset)
+{
+ AIOReq *aio_req;
+
+ aio_req = g_malloc(sizeof(*aio_req));
+ aio_req->aiocb = acb;
+ aio_req->iov_offset = iov_offset;
+ aio_req->oid = oid;
+ aio_req->base_oid = base_oid;
+ aio_req->offset = offset;
+ aio_req->data_len = data_len;
+ aio_req->flags = flags;
+ aio_req->id = s->aioreq_seq_num++;
+
+ acb->nr_pending++;
+ return aio_req;
+}
+
+static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
+{
+ SheepdogAIOCB *acb = aio_req->aiocb;
+
+ QLIST_REMOVE(aio_req, aio_siblings);
+ g_free(aio_req);
+
+ acb->nr_pending--;
+}
+
+static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb)
+{
+ if (!acb->canceled) {
+ qemu_coroutine_enter(acb->coroutine, NULL);
+ }
+ qemu_aio_release(acb);
+}
+
+static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+ SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;
+
+ /*
+ * Sheepdog cannot cancel the requests which are already sent to
+ * the servers, so we just complete the request with -EIO here.
+ */
+ acb->ret = -EIO;
+ qemu_coroutine_enter(acb->coroutine, NULL);
+ acb->canceled = 1;
+}
+
+static AIOPool sd_aio_pool = {
+ .aiocb_size = sizeof(SheepdogAIOCB),
+ .cancel = sd_aio_cancel,
+};
+
+static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
+ int64_t sector_num, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ SheepdogAIOCB *acb;
+
+ acb = qemu_aio_get(&sd_aio_pool, bs, cb, opaque);
+
+ acb->qiov = qiov;
+
+ acb->sector_num = sector_num;
+ acb->nb_sectors = nb_sectors;
+
+ acb->aio_done_func = NULL;
+ acb->canceled = 0;
+ acb->coroutine = qemu_coroutine_self();
+ acb->ret = 0;
+ acb->nr_pending = 0;
+ return acb;
+}
+
+static int connect_to_sdog(const char *addr, const char *port)
+{
+ char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
+ int fd, ret;
+ struct addrinfo hints, *res, *res0;
+
+ if (!addr) {
+ addr = SD_DEFAULT_ADDR;
+ port = SD_DEFAULT_PORT;
+ }
+
+ memset(&hints, 0, sizeof(hints));
+ hints.ai_socktype = SOCK_STREAM;
+
+ ret = getaddrinfo(addr, port, &hints, &res0);
+ if (ret) {
+ error_report("unable to get address info %s, %s",
+ addr, strerror(errno));
+ return -errno;
+ }
+
+ for (res = res0; res; res = res->ai_next) {
+ ret = getnameinfo(res->ai_addr, res->ai_addrlen, hbuf, sizeof(hbuf),
+ sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV);
+ if (ret) {
+ continue;
+ }
+
+ fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
+ if (fd < 0) {
+ continue;
+ }
+
+ reconnect:
+ ret = connect(fd, res->ai_addr, res->ai_addrlen);
+ if (ret < 0) {
+ if (errno == EINTR) {
+ goto reconnect;
+ }
+ close(fd);
+ break;
+ }
+
+ dprintf("connected to %s:%s\n", addr, port);
+ goto success;
+ }
+ fd = -errno;
+ error_report("failed connect to %s:%s", addr, port);
+success:
+ freeaddrinfo(res0);
+ return fd;
+}
+
+static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
+ unsigned int *wlen)
+{
+ int ret;
+
+ ret = qemu_co_send(sockfd, hdr, sizeof(*hdr));
+ if (ret < sizeof(*hdr)) {
+ error_report("failed to send a req, %s", strerror(errno));
+ return ret;
+ }
+
+ ret = qemu_co_send(sockfd, data, *wlen);
+ if (ret < *wlen) {
+ error_report("failed to send a req, %s", strerror(errno));
+ }
+
+ return ret;
+}
+
+static void restart_co_req(void *opaque)
+{
+ Coroutine *co = opaque;
+
+ qemu_coroutine_enter(co, NULL);
+}
+
+typedef struct SheepdogReqCo {
+ int sockfd;
+ SheepdogReq *hdr;
+ void *data;
+ unsigned int *wlen;
+ unsigned int *rlen;
+ int ret;
+ bool finished;
+} SheepdogReqCo;
+
+static coroutine_fn void do_co_req(void *opaque)
+{
+ int ret;
+ Coroutine *co;
+ SheepdogReqCo *srco = opaque;
+ int sockfd = srco->sockfd;
+ SheepdogReq *hdr = srco->hdr;
+ void *data = srco->data;
+ unsigned int *wlen = srco->wlen;
+ unsigned int *rlen = srco->rlen;
+
+ co = qemu_coroutine_self();
+ qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, NULL, co);
+
+ socket_set_block(sockfd);
+ ret = send_co_req(sockfd, hdr, data, wlen);
+ if (ret < 0) {
+ goto out;
+ }
+
+ qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, NULL, co);
+
+ ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
+ if (ret < sizeof(*hdr)) {
+ error_report("failed to get a rsp, %s", strerror(errno));
+ ret = -errno;
+ goto out;
+ }
+
+ if (*rlen > hdr->data_length) {
+ *rlen = hdr->data_length;
+ }
+
+ if (*rlen) {
+ ret = qemu_co_recv(sockfd, data, *rlen);
+ if (ret < *rlen) {
+ error_report("failed to get the data, %s", strerror(errno));
+ ret = -errno;
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL, NULL);
+ socket_set_nonblock(sockfd);
+
+ srco->ret = ret;
+ srco->finished = true;
+}
+
+static int do_req(int sockfd, SheepdogReq *hdr, void *data,
+ unsigned int *wlen, unsigned int *rlen)
+{
+ Coroutine *co;
+ SheepdogReqCo srco = {
+ .sockfd = sockfd,
+ .hdr = hdr,
+ .data = data,
+ .wlen = wlen,
+ .rlen = rlen,
+ .ret = 0,
+ .finished = false,
+ };
+
+ if (qemu_in_coroutine()) {
+ do_co_req(&srco);
+ } else {
+ co = qemu_coroutine_create(do_co_req);
+ qemu_coroutine_enter(co, &srco);
+ while (!srco.finished) {
+ qemu_aio_wait();
+ }
+ }
+
+ return srco.ret;
+}
+
+static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
+ struct iovec *iov, int niov, int create,
+ enum AIOCBState aiocb_type);
+
+
+static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid)
+{
+ AIOReq *aio_req;
+
+ QLIST_FOREACH(aio_req, &s->pending_aio_head, aio_siblings) {
+ if (aio_req->oid == oid) {
+ return aio_req;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * This function searches the pending requests for the object `oid', and
+ * sends them.
+ */
+static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid)
+{
+ AIOReq *aio_req;
+ SheepdogAIOCB *acb;
+ int ret;
+
+ while ((aio_req = find_pending_req(s, oid)) != NULL) {
+ acb = aio_req->aiocb;
+ /* move aio_req from pending list to inflight one */
+ QLIST_REMOVE(aio_req, aio_siblings);
+ QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
+ ret = add_aio_request(s, aio_req, acb->qiov->iov,
+ acb->qiov->niov, 0, acb->aiocb_type);
+ if (ret < 0) {
+ error_report("add_aio_request is failed");
+ free_aio_req(s, aio_req);
+ if (!acb->nr_pending) {
+ sd_finish_aiocb(acb);
+ }
+ }
+ }
+}
+
+/*
+ * Receive responses of the I/O requests.
+ *
+ * This function is registered as a fd handler, and called from the
+ * main loop when s->fd is ready for reading responses.
+ */
+static void coroutine_fn aio_read_response(void *opaque)
+{
+ SheepdogObjRsp rsp;
+ BDRVSheepdogState *s = opaque;
+ int fd = s->fd;
+ int ret;
+ AIOReq *aio_req = NULL;
+ SheepdogAIOCB *acb;
+ unsigned long idx;
+
+ if (QLIST_EMPTY(&s->inflight_aio_head)) {
+ goto out;
+ }
+
+ /* read a header */
+ ret = qemu_co_recv(fd, &rsp, sizeof(rsp));
+ if (ret < 0) {
+ error_report("failed to get the header, %s", strerror(errno));
+ goto out;
+ }
+
+ /* find the right aio_req from the inflight aio list */
+ QLIST_FOREACH(aio_req, &s->inflight_aio_head, aio_siblings) {
+ if (aio_req->id == rsp.id) {
+ break;
+ }
+ }
+ if (!aio_req) {
+ error_report("cannot find aio_req %x", rsp.id);
+ goto out;
+ }
+
+ acb = aio_req->aiocb;
+
+ switch (acb->aiocb_type) {
+ case AIOCB_WRITE_UDATA:
+ /* this coroutine context is no longer suitable for co_recv
+ * because we may send data to update vdi objects */
+ s->co_recv = NULL;
+ if (!is_data_obj(aio_req->oid)) {
+ break;
+ }
+ idx = data_oid_to_idx(aio_req->oid);
+
+ if (s->inode.data_vdi_id[idx] != s->inode.vdi_id) {
+ /*
+ * If the object is a newly created one, we need to update
+ * the vdi object (metadata object). min_dirty_data_idx
+ * and max_dirty_data_idx are changed to include updated
+ * index between them.
+ */
+ s->inode.data_vdi_id[idx] = s->inode.vdi_id;
+ s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
+ s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
+
+ /*
+ * Some requests may be blocked because simultaneous
+ * create requests are not allowed, so we search the
+ * pending requests here.
+ */
+ send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx));
+ }
+ break;
+ case AIOCB_READ_UDATA:
+ ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov,
+ aio_req->iov_offset, rsp.data_length);
+ if (ret < 0) {
+ error_report("failed to get the data, %s", strerror(errno));
+ goto out;
+ }
+ break;
+ }
+
+ if (rsp.result != SD_RES_SUCCESS) {
+ acb->ret = -EIO;
+ error_report("%s", sd_strerror(rsp.result));
+ }
+
+ free_aio_req(s, aio_req);
+ if (!acb->nr_pending) {
+ /*
+ * We've finished all requests which belong to the AIOCB, so
+ * we can switch back to sd_co_readv/writev now.
+ */
+ acb->aio_done_func(acb);
+ }
+out:
+ s->co_recv = NULL;
+}
+
+static void co_read_response(void *opaque)
+{
+ BDRVSheepdogState *s = opaque;
+
+ if (!s->co_recv) {
+ s->co_recv = qemu_coroutine_create(aio_read_response);
+ }
+
+ qemu_coroutine_enter(s->co_recv, opaque);
+}
+
+static void co_write_request(void *opaque)
+{
+ BDRVSheepdogState *s = opaque;
+
+ qemu_coroutine_enter(s->co_send, NULL);
+}
+
+static int aio_flush_request(void *opaque)
+{
+ BDRVSheepdogState *s = opaque;
+
+ return !QLIST_EMPTY(&s->inflight_aio_head) ||
+ !QLIST_EMPTY(&s->pending_aio_head);
+}
+
+static int set_nodelay(int fd)
+{
+ int ret, opt;
+
+ opt = 1;
+ ret = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)&opt, sizeof(opt));
+ return ret;
+}
+
+/*
+ * Return a socket descriptor to read/write objects.
+ *
+ * We cannot use this descriptor for other operations because
+ * the block driver may be waiting for a response from the server.
+ */
+static int get_sheep_fd(BDRVSheepdogState *s)
+{
+ int ret, fd;
+
+ fd = connect_to_sdog(s->addr, s->port);
+ if (fd < 0) {
+ error_report("%s", strerror(errno));
+ return fd;
+ }
+
+ socket_set_nonblock(fd);
+
+ ret = set_nodelay(fd);
+ if (ret) {
+ error_report("%s", strerror(errno));
+ closesocket(fd);
+ return -errno;
+ }
+
+ qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request, s);
+ return fd;
+}
+
+/*
+ * Parse a filename
+ *
+ * filename must be one of the following formats:
+ * 1. [vdiname]
+ * 2. [vdiname]:[snapid]
+ * 3. [vdiname]:[tag]
+ * 4. [hostname]:[port]:[vdiname]
+ * 5. [hostname]:[port]:[vdiname]:[snapid]
+ * 6. [hostname]:[port]:[vdiname]:[tag]
+ *
+ * You can boot from snapshot images by specifying `snapid' or
+ * `tag'.
+ *
+ * You can run VMs outside the Sheepdog cluster by specifying
+ * `hostname' and `port' (experimental).
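+ *
+ * For example (hypothetical values), "sheep.example.com:7000:myvdi:2"
+ * uses format 5 above: the VDI "myvdi" at snapshot id 2, reached via
+ * the node at sheep.example.com, port 7000. A plain "myvdi" is
+ * format 1 and opens the current VDI through the default address.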
+ */
+static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
+ char *vdi, uint32_t *snapid, char *tag)
+{
+ char *p, *q;
+ int nr_sep;
+
+ p = q = g_strdup(filename);
+
+ /* count the number of separators */
+ nr_sep = 0;
+ while (*p) {
+ if (*p == ':') {
+ nr_sep++;
+ }
+ p++;
+ }
+ p = q;
+
+ /* use the first two tokens as hostname and port number. */
+ if (nr_sep >= 2) {
+ s->addr = p;
+ p = strchr(p, ':');
+ *p++ = '\0';
+
+ s->port = p;
+ p = strchr(p, ':');
+ *p++ = '\0';
+ } else {
+ s->addr = NULL;
+ s->port = 0;
+ }
+
+ strncpy(vdi, p, SD_MAX_VDI_LEN);
+
+ p = strchr(vdi, ':');
+ if (p) {
+ *p++ = '\0';
+ *snapid = strtoul(p, NULL, 10);
+ if (*snapid == 0) {
+ strncpy(tag, p, SD_MAX_VDI_TAG_LEN);
+ }
+ } else {
+ *snapid = CURRENT_VDI_ID; /* search current vdi */
+ }
+
+ if (s->addr == NULL) {
+ g_free(q);
+ }
+
+ return 0;
+}
+
+static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
+ char *tag, uint32_t *vid, int for_snapshot)
+{
+ int ret, fd;
+ SheepdogVdiReq hdr;
+ SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
+ unsigned int wlen, rlen = 0;
+ char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
+
+ fd = connect_to_sdog(s->addr, s->port);
+ if (fd < 0) {
+ return fd;
+ }
+
+ memset(buf, 0, sizeof(buf));
+ strncpy(buf, filename, SD_MAX_VDI_LEN);
+ strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
+
+ memset(&hdr, 0, sizeof(hdr));
+ if (for_snapshot) {
+ hdr.opcode = SD_OP_GET_VDI_INFO;
+ } else {
+ hdr.opcode = SD_OP_LOCK_VDI;
+ }
+ wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
+ hdr.proto_ver = SD_PROTO_VER;
+ hdr.data_length = wlen;
+ hdr.snapid = snapid;
+ hdr.flags = SD_FLAG_CMD_WRITE;
+
+ ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
+ if (ret) {
+ goto out;
+ }
+
+ if (rsp->result != SD_RES_SUCCESS) {
+ error_report("cannot get vdi info, %s, %s %d %s",
+ sd_strerror(rsp->result), filename, snapid, tag);
+ if (rsp->result == SD_RES_NO_VDI) {
+ ret = -ENOENT;
+ } else {
+ ret = -EIO;
+ }
+ goto out;
+ }
+ *vid = rsp->vdi_id;
+
+ ret = 0;
+out:
+ closesocket(fd);
+ return ret;
+}
+
+static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
+ struct iovec *iov, int niov, int create,
+ enum AIOCBState aiocb_type)
+{
+ int nr_copies = s->inode.nr_copies;
+ SheepdogObjReq hdr;
+ unsigned int wlen;
+ int ret;
+ uint64_t oid = aio_req->oid;
+ unsigned int datalen = aio_req->data_len;
+ uint64_t offset = aio_req->offset;
+ uint8_t flags = aio_req->flags;
+ uint64_t old_oid = aio_req->base_oid;
+
+ if (!nr_copies) {
+ error_report("bug");
+ }
+
+ memset(&hdr, 0, sizeof(hdr));
+
+ if (aiocb_type == AIOCB_READ_UDATA) {
+ wlen = 0;
+ hdr.opcode = SD_OP_READ_OBJ;
+ hdr.flags = flags;
+ } else if (create) {
+ wlen = datalen;
+ hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
+ hdr.flags = SD_FLAG_CMD_WRITE | flags;
+ } else {
+ wlen = datalen;
+ hdr.opcode = SD_OP_WRITE_OBJ;
+ hdr.flags = SD_FLAG_CMD_WRITE | flags;
+ }
+
+ if (s->cache_enabled) {
+ hdr.flags |= SD_FLAG_CMD_CACHE;
+ }
+
+ hdr.oid = oid;
+ hdr.cow_oid = old_oid;
+ hdr.copies = s->inode.nr_copies;
+
+ hdr.data_length = datalen;
+ hdr.offset = offset;
+
+ hdr.id = aio_req->id;
+
+ qemu_co_mutex_lock(&s->lock);
+ s->co_send = qemu_coroutine_self();
+ qemu_aio_set_fd_handler(s->fd, co_read_response, co_write_request,
+ aio_flush_request, s);
+ socket_set_cork(s->fd, 1);
+
+ /* send a header */
+ ret = qemu_co_send(s->fd, &hdr, sizeof(hdr));
+ if (ret < 0) {
+ qemu_co_mutex_unlock(&s->lock);
+ error_report("failed to send a req, %s", strerror(errno));
+ return -errno;
+ }
+
+ if (wlen) {
+ ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset, wlen);
+ if (ret < 0) {
+ qemu_co_mutex_unlock(&s->lock);
+ error_report("failed to send a data, %s", strerror(errno));
+ return -errno;
+ }
+ }
+
+ socket_set_cork(s->fd, 0);
+ qemu_aio_set_fd_handler(s->fd, co_read_response, NULL,
+ aio_flush_request, s);
+ qemu_co_mutex_unlock(&s->lock);
+
+ return 0;
+}
+
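+/*
+ * Synchronously read or write (part of) a single object over the given
+ * socket.  Used via read_object()/write_object() for inode, snapshot and
+ * vmstate data, outside the coroutine I/O path.
+ */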
+static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
+ unsigned int datalen, uint64_t offset,
+ int write, int create, uint8_t cache)
+{
+ SheepdogObjReq hdr;
+ SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
+ unsigned int wlen, rlen;
+ int ret;
+
+ memset(&hdr, 0, sizeof(hdr));
+
+ if (write) {
+ wlen = datalen;
+ rlen = 0;
+ hdr.flags = SD_FLAG_CMD_WRITE;
+ if (create) {
+ hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
+ } else {
+ hdr.opcode = SD_OP_WRITE_OBJ;
+ }
+ } else {
+ wlen = 0;
+ rlen = datalen;
+ hdr.opcode = SD_OP_READ_OBJ;
+ }
+
+ if (cache) {
+ hdr.flags |= SD_FLAG_CMD_CACHE;
+ }
+
+ hdr.oid = oid;
+ hdr.data_length = datalen;
+ hdr.offset = offset;
+ hdr.copies = copies;
+
+ ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
+ if (ret) {
+ error_report("failed to send a request to the sheep");
+ return ret;
+ }
+
+ switch (rsp->result) {
+ case SD_RES_SUCCESS:
+ return 0;
+ default:
+ error_report("%s", sd_strerror(rsp->result));
+ return -EIO;
+ }
+}
+
+static int read_object(int fd, char *buf, uint64_t oid, int copies,
+ unsigned int datalen, uint64_t offset, uint8_t cache)
+{
+ return read_write_object(fd, buf, oid, copies, datalen, offset, 0, 0,
+ cache);
+}
+
+static int write_object(int fd, char *buf, uint64_t oid, int copies,
+ unsigned int datalen, uint64_t offset, int create,
+ uint8_t cache)
+{
+ return read_write_object(fd, buf, oid, copies, datalen, offset, 1, create,
+ cache);
+}
+
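+/*
+ * Open a VDI.  After the "sheepdog:" prefix is stripped, parse_vdiname()
+ * accepts either "vdiname[:snapid-or-tag]" or
+ * "hostname:port:vdiname[:snapid-or-tag]".
+ */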
+static int sd_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ int ret, fd;
+ uint32_t vid = 0;
+ BDRVSheepdogState *s = bs->opaque;
+ char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
+ uint32_t snapid;
+ char *buf = NULL;
+
+ strstart(filename, "sheepdog:", (const char **)&filename);
+
+ QLIST_INIT(&s->inflight_aio_head);
+ QLIST_INIT(&s->pending_aio_head);
+ s->fd = -1;
+
+ memset(vdi, 0, sizeof(vdi));
+ memset(tag, 0, sizeof(tag));
+ if (parse_vdiname(s, filename, vdi, &snapid, tag) < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+ s->fd = get_sheep_fd(s);
+ if (s->fd < 0) {
+ ret = s->fd;
+ goto out;
+ }
+
+ ret = find_vdi_name(s, vdi, snapid, tag, &vid, 0);
+ if (ret) {
+ goto out;
+ }
+
+ if (flags & BDRV_O_CACHE_WB) {
+ s->cache_enabled = 1;
+ s->flush_fd = connect_to_sdog(s->addr, s->port);
+ if (s->flush_fd < 0) {
+ error_report("failed to connect");
+ ret = s->flush_fd;
+ goto out;
+ }
+ }
+
+ if (snapid || tag[0] != '\0') {
+ dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
+ s->is_snapshot = 1;
+ }
+
+ fd = connect_to_sdog(s->addr, s->port);
+ if (fd < 0) {
+ error_report("failed to connect");
+ ret = fd;
+ goto out;
+ }
+
+ buf = g_malloc(SD_INODE_SIZE);
+ ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0,
+ s->cache_enabled);
+
+ closesocket(fd);
+
+ if (ret) {
+ goto out;
+ }
+
+ memcpy(&s->inode, buf, sizeof(s->inode));
+ s->min_dirty_data_idx = UINT32_MAX;
+ s->max_dirty_data_idx = 0;
+
+ bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
+ strncpy(s->name, vdi, sizeof(s->name));
+ qemu_co_mutex_init(&s->lock);
+ g_free(buf);
+ return 0;
+out:
+ qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL);
+ if (s->fd >= 0) {
+ closesocket(s->fd);
+ }
+ g_free(buf);
+ return ret;
+}
+
+static int do_sd_create(char *filename, int64_t vdi_size,
+ uint32_t base_vid, uint32_t *vdi_id, int snapshot,
+ const char *addr, const char *port)
+{
+ SheepdogVdiReq hdr;
+ SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
+ int fd, ret;
+ unsigned int wlen, rlen = 0;
+ char buf[SD_MAX_VDI_LEN];
+
+ fd = connect_to_sdog(addr, port);
+ if (fd < 0) {
+ return fd;
+ }
+
+ memset(buf, 0, sizeof(buf));
+ strncpy(buf, filename, SD_MAX_VDI_LEN);
+
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.opcode = SD_OP_NEW_VDI;
+ hdr.base_vdi_id = base_vid;
+
+ wlen = SD_MAX_VDI_LEN;
+
+ hdr.flags = SD_FLAG_CMD_WRITE;
+ hdr.snapid = snapshot;
+
+ hdr.data_length = wlen;
+ hdr.vdi_size = vdi_size;
+
+ ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
+
+ closesocket(fd);
+
+ if (ret) {
+ return ret;
+ }
+
+ if (rsp->result != SD_RES_SUCCESS) {
+ error_report("%s, %s", sd_strerror(rsp->result), filename);
+ return -EIO;
+ }
+
+ if (vdi_id) {
+ *vdi_id = rsp->vdi_id;
+ }
+
+ return 0;
+}
+
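+/*
+ * Preallocate the whole image by reading each SD_DATA_OBJ_SIZE chunk
+ * (which may be served from a cloned base image) and writing it straight
+ * back, forcing every data object to be allocated.
+ */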
+static int sd_prealloc(const char *filename)
+{
+ BlockDriverState *bs = NULL;
+ uint32_t idx, max_idx;
+ int64_t vdi_size;
+ void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
+ int ret;
+
+ ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
+ if (ret < 0) {
+ goto out;
+ }
+
+ vdi_size = bdrv_getlength(bs);
+ if (vdi_size < 0) {
+ ret = vdi_size;
+ goto out;
+ }
+ max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
+
+ for (idx = 0; idx < max_idx; idx++) {
+ /*
+ * The created image can be a cloned image, so we need to read
+ * data from the source image.
+ */
+ ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
+ if (ret < 0) {
+ goto out;
+ }
+ ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
+ if (ret < 0) {
+ goto out;
+ }
+ }
+out:
+ if (bs) {
+ bdrv_delete(bs);
+ }
+ g_free(buf);
+
+ return ret;
+}
+
+static int sd_create(const char *filename, QEMUOptionParameter *options)
+{
+ int ret = 0;
+ uint32_t vid = 0, base_vid = 0;
+ int64_t vdi_size = 0;
+ char *backing_file = NULL;
+ BDRVSheepdogState *s;
+ char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
+ uint32_t snapid;
+ int prealloc = 0;
+ const char *vdiname;
+
+ s = g_malloc0(sizeof(BDRVSheepdogState));
+
+ strstart(filename, "sheepdog:", &vdiname);
+
+ memset(vdi, 0, sizeof(vdi));
+ memset(tag, 0, sizeof(tag));
+ if (parse_vdiname(s, vdiname, vdi, &snapid, tag) < 0) {
+ error_report("invalid filename");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ vdi_size = options->value.n;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+ backing_file = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
+ if (!options->value.s || !strcmp(options->value.s, "off")) {
+ prealloc = 0;
+ } else if (!strcmp(options->value.s, "full")) {
+ prealloc = 1;
+ } else {
+ error_report("Invalid preallocation mode: '%s'",
+ options->value.s);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ options++;
+ }
+
+ if (vdi_size > SD_MAX_VDI_SIZE) {
+ error_report("too big image size");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (backing_file) {
+ BlockDriverState *bs;
+ BDRVSheepdogState *s;
+ BlockDriver *drv;
+
+ /* Currently, only Sheepdog backing image is supported. */
+ drv = bdrv_find_protocol(backing_file);
+ if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
+ error_report("backing_file must be a sheepdog image");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = bdrv_file_open(&bs, backing_file, 0);
+ if (ret < 0) {
+ goto out;
+ }
+
+ s = bs->opaque;
+
+ if (!is_snapshot(&s->inode)) {
+ error_report("cannot clone from a non snapshot vdi");
+ bdrv_delete(bs);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ base_vid = s->inode.vdi_id;
+ bdrv_delete(bs);
+ }
+
+ ret = do_sd_create(vdi, vdi_size, base_vid, &vid, 0, s->addr, s->port);
+ if (!prealloc || ret) {
+ goto out;
+ }
+
+ ret = sd_prealloc(filename);
+out:
+ g_free(s);
+ return ret;
+}
+
+static void sd_close(BlockDriverState *bs)
+{
+ BDRVSheepdogState *s = bs->opaque;
+ SheepdogVdiReq hdr;
+ SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
+ unsigned int wlen, rlen = 0;
+ int fd, ret;
+
+ dprintf("%s\n", s->name);
+
+ fd = connect_to_sdog(s->addr, s->port);
+ if (fd < 0) {
+ return;
+ }
+
+ memset(&hdr, 0, sizeof(hdr));
+
+ hdr.opcode = SD_OP_RELEASE_VDI;
+ wlen = strlen(s->name) + 1;
+ hdr.data_length = wlen;
+ hdr.flags = SD_FLAG_CMD_WRITE;
+
+ ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen);
+
+ closesocket(fd);
+
+ if (!ret && rsp->result != SD_RES_SUCCESS &&
+ rsp->result != SD_RES_VDI_NOT_LOCKED) {
+ error_report("%s, %s", sd_strerror(rsp->result), s->name);
+ }
+
+ qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL);
+ closesocket(s->fd);
+ if (s->cache_enabled) {
+ closesocket(s->flush_fd);
+ }
+ g_free(s->addr);
+}
+
+static int64_t sd_getlength(BlockDriverState *bs)
+{
+ BDRVSheepdogState *s = bs->opaque;
+
+ return s->inode.vdi_size;
+}
+
+static int sd_truncate(BlockDriverState *bs, int64_t offset)
+{
+ BDRVSheepdogState *s = bs->opaque;
+ int ret, fd;
+ unsigned int datalen;
+
+ if (offset < s->inode.vdi_size) {
+ error_report("shrinking is not supported");
+ return -EINVAL;
+ } else if (offset > SD_MAX_VDI_SIZE) {
+ error_report("too big image size");
+ return -EINVAL;
+ }
+
+ fd = connect_to_sdog(s->addr, s->port);
+ if (fd < 0) {
+ return fd;
+ }
+
+ /* we don't need to update the entire object */
+ datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
+ s->inode.vdi_size = offset;
+ ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
+ s->inode.nr_copies, datalen, 0, 0, s->cache_enabled);
+ closesocket(fd);
+
+ if (ret < 0) {
+ error_report("failed to update an inode.");
+ }
+
+ return ret;
+}
+
+/*
+ * This function is called after writing data objects. If we need to
+ * update metadata, this sends a write request to the vdi object.
+ * Otherwise, this switches back to sd_co_readv/writev.
+ */
+static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
+{
+ int ret;
+ BDRVSheepdogState *s = acb->common.bs->opaque;
+ struct iovec iov;
+ AIOReq *aio_req;
+ uint32_t offset, data_len, mn, mx;
+
+ mn = s->min_dirty_data_idx;
+ mx = s->max_dirty_data_idx;
+ if (mn <= mx) {
+ /* we need to update the vdi object. */
+ offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
+ mn * sizeof(s->inode.data_vdi_id[0]);
+ data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
+
+ s->min_dirty_data_idx = UINT32_MAX;
+ s->max_dirty_data_idx = 0;
+
+ iov.iov_base = &s->inode;
+ iov.iov_len = sizeof(s->inode);
+ aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
+ data_len, offset, 0, 0, offset);
+ QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
+ ret = add_aio_request(s, aio_req, &iov, 1, 0, AIOCB_WRITE_UDATA);
+ if (ret) {
+ free_aio_req(s, aio_req);
+ acb->ret = -EIO;
+ goto out;
+ }
+
+ acb->aio_done_func = sd_finish_aiocb;
+ acb->aiocb_type = AIOCB_WRITE_UDATA;
+ return;
+ }
+out:
+ sd_finish_aiocb(acb);
+}
+
+/*
+ * Create a writable VDI from a snapshot
+ */
+static int sd_create_branch(BDRVSheepdogState *s)
+{
+ int ret, fd;
+ uint32_t vid;
+ char *buf;
+
+ dprintf("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);
+
+ buf = g_malloc(SD_INODE_SIZE);
+
+ ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, 1,
+ s->addr, s->port);
+ if (ret) {
+ goto out;
+ }
+
+ dprintf("%" PRIx32 " is created.\n", vid);
+
+ fd = connect_to_sdog(s->addr, s->port);
+ if (fd < 0) {
+ error_report("failed to connect");
+ ret = fd;
+ goto out;
+ }
+
+ ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
+ SD_INODE_SIZE, 0, s->cache_enabled);
+
+ closesocket(fd);
+
+ if (ret < 0) {
+ goto out;
+ }
+
+ memcpy(&s->inode, buf, sizeof(s->inode));
+
+ s->is_snapshot = 0;
+ ret = 0;
+ dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
+
+out:
+ g_free(buf);
+
+ return ret;
+}
+
+/*
+ * Send I/O requests to the server.
+ *
+ * This function sends requests to the server, links them to the
+ * inflight_aio_head list in BDRVSheepdogState, and returns without
+ * waiting for the responses. The responses are received in the
+ * `aio_read_response' function which is called from the main loop as
+ * a fd handler.
+ *
+ * Returns 1 when we need to wait for a response, 0 when no request was
+ * sent, and -errno in error cases.
+ */
+static int coroutine_fn sd_co_rw_vector(void *p)
+{
+ SheepdogAIOCB *acb = p;
+ int ret = 0;
+ unsigned long len, done = 0, total = acb->nb_sectors * SECTOR_SIZE;
+ unsigned long idx = acb->sector_num * SECTOR_SIZE / SD_DATA_OBJ_SIZE;
+ uint64_t oid;
+ uint64_t offset = (acb->sector_num * SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
+ BDRVSheepdogState *s = acb->common.bs->opaque;
+ SheepdogInode *inode = &s->inode;
+ AIOReq *aio_req;
+
+ if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
+ /*
+ * If a snapshot VDI was opened, Sheepdog creates the writable
+ * VDI on the first write operation.
+ */
+ ret = sd_create_branch(s);
+ if (ret) {
+ acb->ret = -EIO;
+ goto out;
+ }
+ }
+
+ /*
+ * Make sure we don't free the aiocb before we are done with all requests.
+ * This additional reference is dropped at the end of this function.
+ */
+ acb->nr_pending++;
+
+ while (done != total) {
+ uint8_t flags = 0;
+ uint64_t old_oid = 0;
+ int create = 0;
+
+ oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
+
+ len = MIN(total - done, SD_DATA_OBJ_SIZE - offset);
+
+ switch (acb->aiocb_type) {
+ case AIOCB_READ_UDATA:
+ if (!inode->data_vdi_id[idx]) {
+ qemu_iovec_memset(acb->qiov, done, 0, len);
+ goto done;
+ }
+ break;
+ case AIOCB_WRITE_UDATA:
+ if (!inode->data_vdi_id[idx]) {
+ create = 1;
+ } else if (!is_data_obj_writable(inode, idx)) {
+ /* Copy-On-Write */
+ create = 1;
+ old_oid = oid;
+ flags = SD_FLAG_CMD_COW;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (create) {
+ dprintf("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n",
+ inode->vdi_id, oid,
+ vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
+ oid = vid_to_data_oid(inode->vdi_id, idx);
+ dprintf("new oid %" PRIx64 "\n", oid);
+ }
+
+ aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done);
+
+ if (create) {
+ AIOReq *areq;
+ QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) {
+ if (areq->oid == oid) {
+ /*
+ * Sheepdog cannot handle simultaneous create
+ * requests to the same object. So we cannot send
+ * the request until the previous request
+ * finishes.
+ */
+ aio_req->flags = 0;
+ aio_req->base_oid = 0;
+ QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req,
+ aio_siblings);
+ goto done;
+ }
+ }
+ }
+
+ QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
+ ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
+ create, acb->aiocb_type);
+ if (ret < 0) {
+ error_report("add_aio_request is failed");
+ free_aio_req(s, aio_req);
+ acb->ret = -EIO;
+ goto out;
+ }
+ done:
+ offset = 0;
+ idx++;
+ done += len;
+ }
+out:
+ if (!--acb->nr_pending) {
+ return acb->ret;
+ }
+ return 1;
+}
+
+static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ SheepdogAIOCB *acb;
+ int ret;
+
+ if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
+ ret = sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE);
+ if (ret < 0) {
+ return ret;
+ }
+ bs->total_sectors = sector_num + nb_sectors;
+ }
+
+ acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL);
+ acb->aio_done_func = sd_write_done;
+ acb->aiocb_type = AIOCB_WRITE_UDATA;
+
+ ret = sd_co_rw_vector(acb);
+ if (ret <= 0) {
+ qemu_aio_release(acb);
+ return ret;
+ }
+
+ qemu_coroutine_yield();
+
+ return acb->ret;
+}
+
+static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ SheepdogAIOCB *acb;
+ int ret;
+
+ acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL);
+ acb->aiocb_type = AIOCB_READ_UDATA;
+ acb->aio_done_func = sd_finish_aiocb;
+
+ ret = sd_co_rw_vector(acb);
+ if (ret <= 0) {
+ qemu_aio_release(acb);
+ return ret;
+ }
+
+ qemu_coroutine_yield();
+
+ return acb->ret;
+}
+
+static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
+{
+ BDRVSheepdogState *s = bs->opaque;
+ SheepdogObjReq hdr = { 0 };
+ SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
+ SheepdogInode *inode = &s->inode;
+ int ret;
+ unsigned int wlen = 0, rlen = 0;
+
+ if (!s->cache_enabled) {
+ return 0;
+ }
+
+ hdr.opcode = SD_OP_FLUSH_VDI;
+ hdr.oid = vid_to_vdi_oid(inode->vdi_id);
+
+ ret = do_req(s->flush_fd, (SheepdogReq *)&hdr, NULL, &wlen, &rlen);
+ if (ret) {
+ error_report("failed to send a request to the sheep");
+ return ret;
+ }
+
+ if (rsp->result == SD_RES_INVALID_PARMS) {
+ dprintf("disable write cache since the server doesn't support it\n");
+
+ s->cache_enabled = 0;
+ closesocket(s->flush_fd);
+ return 0;
+ }
+
+ if (rsp->result != SD_RES_SUCCESS) {
+ error_report("%s", sd_strerror(rsp->result));
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
+{
+ BDRVSheepdogState *s = bs->opaque;
+ int ret, fd;
+ uint32_t new_vid;
+ SheepdogInode *inode = NULL;
+ unsigned int datalen;
+
+ dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " "
+ "is_snapshot %d\n", sn_info->name, sn_info->id_str,
+ s->name, sn_info->vm_state_size, s->is_snapshot);
+
+ if (s->is_snapshot) {
+ error_report("You can't create a snapshot of a snapshot VDI, "
+ "%s (%" PRIu32 ").", s->name, s->inode.vdi_id);
+
+ return -EINVAL;
+ }
+
+ dprintf("%s %s\n", sn_info->name, sn_info->id_str);
+
+ s->inode.vm_state_size = sn_info->vm_state_size;
+ s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
+ strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
+ /* we don't need to update the entire object */
+ datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
+
+ /* refresh inode. */
+ fd = connect_to_sdog(s->addr, s->port);
+ if (fd < 0) {
+ ret = fd;
+ goto cleanup;
+ }
+
+ ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
+ s->inode.nr_copies, datalen, 0, 0, s->cache_enabled);
+ if (ret < 0) {
+ error_report("failed to write snapshot's inode.");
+ goto cleanup;
+ }
+
+ ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, 1,
+ s->addr, s->port);
+ if (ret < 0) {
+ error_report("failed to create inode for snapshot. %s",
+ strerror(errno));
+ goto cleanup;
+ }
+
+ inode = (SheepdogInode *)g_malloc(datalen);
+
+ ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid),
+ s->inode.nr_copies, datalen, 0, s->cache_enabled);
+
+ if (ret < 0) {
+ error_report("failed to read new inode info. %s", strerror(errno));
+ goto cleanup;
+ }
+
+ memcpy(&s->inode, inode, datalen);
+ dprintf("s->inode: name %s snap_id %x oid %x\n",
+ s->inode.name, s->inode.snap_id, s->inode.vdi_id);
+
+cleanup:
+ g_free(inode);
+ if (fd >= 0) {
+ closesocket(fd);
+ }
+ return ret;
+}
+
+static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
+{
+ BDRVSheepdogState *s = bs->opaque;
+ BDRVSheepdogState *old_s;
+ char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
+ char *buf = NULL;
+ uint32_t vid;
+ uint32_t snapid = 0;
+ int ret = 0, fd;
+
+ old_s = g_malloc(sizeof(BDRVSheepdogState));
+
+ memcpy(old_s, s, sizeof(BDRVSheepdogState));
+
+ memset(vdi, 0, sizeof(vdi));
+ strncpy(vdi, s->name, sizeof(vdi));
+
+ memset(tag, 0, sizeof(tag));
+ snapid = strtoul(snapshot_id, NULL, 10);
+ if (!snapid) {
+ strncpy(tag, s->name, sizeof(tag));
+ }
+
+ ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1);
+ if (ret) {
+ error_report("Failed to find_vdi_name");
+ goto out;
+ }
+
+ fd = connect_to_sdog(s->addr, s->port);
+ if (fd < 0) {
+ error_report("failed to connect");
+ ret = fd;
+ goto out;
+ }
+
+ buf = g_malloc(SD_INODE_SIZE);
+ ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
+ SD_INODE_SIZE, 0, s->cache_enabled);
+
+ closesocket(fd);
+
+ if (ret) {
+ goto out;
+ }
+
+ memcpy(&s->inode, buf, sizeof(s->inode));
+
+ if (!s->inode.vm_state_size) {
+ error_report("Invalid snapshot");
+ ret = -ENOENT;
+ goto out;
+ }
+
+ s->is_snapshot = 1;
+
+ g_free(buf);
+ g_free(old_s);
+
+ return 0;
+out:
+ /* recover bdrv_sd_state */
+ memcpy(s, old_s, sizeof(BDRVSheepdogState));
+ g_free(buf);
+ g_free(old_s);
+
+ error_report("failed to open. recover old bdrv_sd_state.");
+
+ return ret;
+}
+
+static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
+{
+ /* FIXME: Delete specified snapshot id. */
+ return 0;
+}
+
+static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
+{
+ BDRVSheepdogState *s = bs->opaque;
+ SheepdogReq req;
+ int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
+ QEMUSnapshotInfo *sn_tab = NULL;
+ unsigned wlen, rlen;
+ int found = 0;
+ static SheepdogInode inode;
+ unsigned long *vdi_inuse;
+ unsigned int start_nr;
+ uint64_t hval;
+ uint32_t vid;
+
+ vdi_inuse = g_malloc(max);
+
+ fd = connect_to_sdog(s->addr, s->port);
+ if (fd < 0) {
+ ret = fd;
+ goto out;
+ }
+
+ rlen = max;
+ wlen = 0;
+
+ memset(&req, 0, sizeof(req));
+
+ req.opcode = SD_OP_READ_VDIS;
+ req.data_length = max;
+
+ ret = do_req(fd, (SheepdogReq *)&req, vdi_inuse, &wlen, &rlen);
+
+ closesocket(fd);
+ if (ret) {
+ goto out;
+ }
+
+ sn_tab = g_malloc0(nr * sizeof(*sn_tab));
+
+ /* calculate a vdi id with hash function */
+ hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
+ start_nr = hval & (SD_NR_VDIS - 1);
+
+ fd = connect_to_sdog(s->addr, s->port);
+ if (fd < 0) {
+ error_report("failed to connect");
+ ret = fd;
+ goto out;
+ }
+
+ for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) {
+ if (!test_bit(vid, vdi_inuse)) {
+ break;
+ }
+
+ /* we don't need to read the entire object */
+ ret = read_object(fd, (char *)&inode, vid_to_vdi_oid(vid),
+ 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0,
+ s->cache_enabled);
+
+ if (ret) {
+ continue;
+ }
+
+ if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) {
+ sn_tab[found].date_sec = inode.snap_ctime >> 32;
+ sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
+ sn_tab[found].vm_state_size = inode.vm_state_size;
+ sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;
+
+ snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u",
+ inode.snap_id);
+ strncpy(sn_tab[found].name, inode.tag,
+ MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)));
+ found++;
+ }
+ }
+
+ closesocket(fd);
+out:
+ *psn_tab = sn_tab;
+
+ g_free(vdi_inuse);
+
+ if (ret < 0) {
+ return ret;
+ }
+
+ return found;
+}
+
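+/*
+ * Load or save the VM state, split into SD_DATA_OBJ_SIZE pieces that are
+ * stored in per-VDI vmstate objects; an object is created when a write
+ * starts at offset 0 within it.
+ */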
+static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
+ int64_t pos, int size, int load)
+{
+ int fd, create;
+ int ret = 0, remaining = size;
+ unsigned int data_len;
+ uint64_t vmstate_oid;
+ uint32_t vdi_index;
+ uint64_t offset;
+
+ fd = connect_to_sdog(s->addr, s->port);
+ if (fd < 0) {
+ return fd;
+ }
+
+ while (remaining) {
+ vdi_index = pos / SD_DATA_OBJ_SIZE;
+ offset = pos % SD_DATA_OBJ_SIZE;
+
+ data_len = MIN(remaining, SD_DATA_OBJ_SIZE);
+
+ vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index);
+
+ create = (offset == 0);
+ if (load) {
+ ret = read_object(fd, (char *)data, vmstate_oid,
+ s->inode.nr_copies, data_len, offset,
+ s->cache_enabled);
+ } else {
+ ret = write_object(fd, (char *)data, vmstate_oid,
+ s->inode.nr_copies, data_len, offset, create,
+ s->cache_enabled);
+ }
+
+ if (ret < 0) {
+ error_report("failed to save vmstate %s", strerror(errno));
+ goto cleanup;
+ }
+
+ pos += data_len;
+ remaining -= data_len;
+ }
+ ret = size;
+cleanup:
+ closesocket(fd);
+ return ret;
+}
+
+static int sd_save_vmstate(BlockDriverState *bs, const uint8_t *data,
+ int64_t pos, int size)
+{
+ BDRVSheepdogState *s = bs->opaque;
+
+ return do_load_save_vmstate(s, (uint8_t *)data, pos, size, 0);
+}
+
+static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
+ int64_t pos, int size)
+{
+ BDRVSheepdogState *s = bs->opaque;
+
+ return do_load_save_vmstate(s, data, pos, size, 1);
+}
+
+
+static QEMUOptionParameter sd_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ {
+ .name = BLOCK_OPT_BACKING_FILE,
+ .type = OPT_STRING,
+ .help = "File name of a base image"
+ },
+ {
+ .name = BLOCK_OPT_PREALLOC,
+ .type = OPT_STRING,
+ .help = "Preallocation mode (allowed values: off, full)"
+ },
+ { NULL }
+};
+
+BlockDriver bdrv_sheepdog = {
+ .format_name = "sheepdog",
+ .protocol_name = "sheepdog",
+ .instance_size = sizeof(BDRVSheepdogState),
+ .bdrv_file_open = sd_open,
+ .bdrv_close = sd_close,
+ .bdrv_create = sd_create,
+ .bdrv_getlength = sd_getlength,
+ .bdrv_truncate = sd_truncate,
+
+ .bdrv_co_readv = sd_co_readv,
+ .bdrv_co_writev = sd_co_writev,
+ .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
+
+ .bdrv_snapshot_create = sd_snapshot_create,
+ .bdrv_snapshot_goto = sd_snapshot_goto,
+ .bdrv_snapshot_delete = sd_snapshot_delete,
+ .bdrv_snapshot_list = sd_snapshot_list,
+
+ .bdrv_save_vmstate = sd_save_vmstate,
+ .bdrv_load_vmstate = sd_load_vmstate,
+
+ .create_options = sd_create_options,
+};
+
+static void bdrv_sheepdog_init(void)
+{
+ bdrv_register(&bdrv_sheepdog);
+}
+block_init(bdrv_sheepdog_init);
diff --git a/block/stream.c b/block/stream.c
new file mode 100644
index 000000000..c4f87dd5b
--- /dev/null
+++ b/block/stream.c
@@ -0,0 +1,209 @@
+/*
+ * Image streaming
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "block_int.h"
+#include "qemu/ratelimit.h"
+
+enum {
+ /*
+ * Size of data buffer for populating the image file. This should be large
+ * enough to process multiple clusters in a single call, so that populating
+ * contiguous regions of the image is efficient.
+ */
+ STREAM_BUFFER_SIZE = 512 * 1024, /* in bytes */
+};
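+/* With a BDRV_SECTOR_SIZE of 512 bytes this allows up to
+ * STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE = 1024 sectors to be copied per
+ * iteration of stream_run(). */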
+
+#define SLICE_TIME 100000000ULL /* ns */
+
+typedef struct StreamBlockJob {
+ BlockJob common;
+ RateLimit limit;
+ BlockDriverState *base;
+ char backing_file_id[1024];
+} StreamBlockJob;
+
+static int coroutine_fn stream_populate(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ void *buf)
+{
+ struct iovec iov = {
+ .iov_base = buf,
+ .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
+ };
+ QEMUIOVector qiov;
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+
+ /* Copy-on-read the unallocated clusters */
+ return bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, &qiov);
+}
+
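+/*
+ * Once streaming has completed, delete every intermediate image between
+ * 'top' and 'base' from the backing chain and make 'base' the direct
+ * backing file of 'top'.
+ */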
+static void close_unused_images(BlockDriverState *top, BlockDriverState *base,
+ const char *base_id)
+{
+ BlockDriverState *intermediate;
+ intermediate = top->backing_hd;
+
+ while (intermediate) {
+ BlockDriverState *unused;
+
+ /* reached base */
+ if (intermediate == base) {
+ break;
+ }
+
+ unused = intermediate;
+ intermediate = intermediate->backing_hd;
+ unused->backing_hd = NULL;
+ bdrv_delete(unused);
+ }
+ top->backing_hd = base;
+}
+
+static void coroutine_fn stream_run(void *opaque)
+{
+ StreamBlockJob *s = opaque;
+ BlockDriverState *bs = s->common.bs;
+ BlockDriverState *base = s->base;
+ int64_t sector_num, end;
+ int ret = 0;
+ int n = 0;
+ void *buf;
+
+ s->common.len = bdrv_getlength(bs);
+ if (s->common.len < 0) {
+ block_job_complete(&s->common, s->common.len);
+ return;
+ }
+
+ end = s->common.len >> BDRV_SECTOR_BITS;
+ buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE);
+
+ /* Turn on copy-on-read for the whole block device so that guest read
+ * requests help us make progress. Only do this when copying the entire
+ * backing chain since the copy-on-read operation does not take base into
+ * account.
+ */
+ if (!base) {
+ bdrv_enable_copy_on_read(bs);
+ }
+
+ for (sector_num = 0; sector_num < end; sector_num += n) {
+ uint64_t delay_ns = 0;
+ bool copy;
+
+wait:
+ /* Note that even when no rate limit is applied we need to yield
+ * with no pending I/O here so that qemu_aio_flush() returns.
+ */
+ block_job_sleep_ns(&s->common, rt_clock, delay_ns);
+ if (block_job_is_cancelled(&s->common)) {
+ break;
+ }
+
+ ret = bdrv_co_is_allocated(bs, sector_num,
+ STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n);
+ if (ret == 1) {
+ /* Allocated in the top, no need to copy. */
+ copy = false;
+ } else {
+ /* Copy if allocated in the intermediate images. Limit to the
+ * known-unallocated area [sector_num, sector_num+n). */
+ ret = bdrv_co_is_allocated_above(bs->backing_hd, base,
+ sector_num, n, &n);
+
+ /* Finish early if end of backing file has been reached */
+ if (ret == 0 && n == 0) {
+ n = end - sector_num;
+ }
+
+ copy = (ret == 1);
+ }
+ trace_stream_one_iteration(s, sector_num, n, ret);
+ if (ret >= 0 && copy) {
+ if (s->common.speed) {
+ delay_ns = ratelimit_calculate_delay(&s->limit, n);
+ if (delay_ns > 0) {
+ goto wait;
+ }
+ }
+ ret = stream_populate(bs, sector_num, n, buf);
+ }
+ if (ret < 0) {
+ break;
+ }
+ ret = 0;
+
+ /* Publish progress */
+ s->common.offset += n * BDRV_SECTOR_SIZE;
+ }
+
+ if (!base) {
+ bdrv_disable_copy_on_read(bs);
+ }
+
+ if (!block_job_is_cancelled(&s->common) && sector_num == end && ret == 0) {
+ const char *base_id = NULL, *base_fmt = NULL;
+ if (base) {
+ base_id = s->backing_file_id;
+ if (base->drv) {
+ base_fmt = base->drv->format_name;
+ }
+ }
+ ret = bdrv_change_backing_file(bs, base_id, base_fmt);
+ close_unused_images(bs, base, base_id);
+ }
+
+ qemu_vfree(buf);
+ block_job_complete(&s->common, ret);
+}
+
+static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp)
+{
+ StreamBlockJob *s = container_of(job, StreamBlockJob, common);
+
+ if (speed < 0) {
+ error_set(errp, QERR_INVALID_PARAMETER, "speed");
+ return;
+ }
+ ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
+}
+
+static BlockJobType stream_job_type = {
+ .instance_size = sizeof(StreamBlockJob),
+ .job_type = "stream",
+ .set_speed = stream_set_speed,
+};
+
+void stream_start(BlockDriverState *bs, BlockDriverState *base,
+ const char *base_id, int64_t speed,
+ BlockDriverCompletionFunc *cb,
+ void *opaque, Error **errp)
+{
+ StreamBlockJob *s;
+
+ s = block_job_create(&stream_job_type, bs, speed, cb, opaque, errp);
+ if (!s) {
+ return;
+ }
+
+ s->base = base;
+ if (base_id) {
+ pstrcpy(s->backing_file_id, sizeof(s->backing_file_id), base_id);
+ }
+
+ s->common.co = qemu_coroutine_create(stream_run);
+ trace_stream_start(bs, base, s, s->common.co, opaque);
+ qemu_coroutine_enter(s->common.co, s);
+}
diff --git a/block/vdi.c b/block/vdi.c
new file mode 100644
index 000000000..c4f1529db
--- /dev/null
+++ b/block/vdi.c
@@ -0,0 +1,786 @@
+/*
+ * Block driver for the Virtual Disk Image (VDI) format
+ *
+ * Copyright (c) 2009, 2012 Stefan Weil
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) version 3 or any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Reference:
+ * http://forums.virtualbox.org/viewtopic.php?t=8046
+ *
+ * This driver supports create / read / write operations on VDI images.
+ *
+ * Todo (see also TODO in code):
+ *
+ * Some features like snapshots are still missing.
+ *
+ * Deallocation of zero-filled blocks and shrinking images are missing, too
+ * (might be added to the common block layer).
+ *
+ * Allocation of blocks could be optimized (fewer writes to the block map
+ * and header).
+ *
+ * Reads and writes of adjacent blocks could be done in one operation
+ * (the current code uses one operation per 1 MiB block).
+ *
+ * The code is not thread safe (missing locks for changes to the header
+ * and block table; not a problem with current QEMU).
+ *
+ * Hints:
+ *
+ * Blocks (VDI documentation) correspond to clusters (QEMU).
+ * QEMU's backing files could be implemented using VDI snapshot files (TODO).
+ * VDI snapshot files may also contain the complete machine state.
+ * Maybe this machine state can be converted to QEMU PC machine snapshot data.
+ *
+ * The driver keeps a block cache (little endian entries) in memory.
+ * For the standard block size (1 MiB), a 1 TiB disk will use 4 MiB RAM,
+ * so this seems to be reasonable.
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include "migration.h"
+
+#if defined(CONFIG_UUID)
+#include <uuid/uuid.h>
+#else
+/* TODO: move uuid emulation to some central place in QEMU. */
+#include "sysemu.h" /* UUID_FMT */
+typedef unsigned char uuid_t[16];
+void uuid_generate(uuid_t out);
+int uuid_is_null(const uuid_t uu);
+void uuid_unparse(const uuid_t uu, char *out);
+#endif
+
+/* Code configuration options. */
+
+/* Enable debug messages. */
+//~ #define CONFIG_VDI_DEBUG
+
+/* Support write operations on VDI images. */
+#define CONFIG_VDI_WRITE
+
+/* Support non-standard block (cluster) size. This is untested.
+ * Maybe it will be needed for very large images.
+ */
+//~ #define CONFIG_VDI_BLOCK_SIZE
+
+/* Support static (fixed, pre-allocated) images. */
+#define CONFIG_VDI_STATIC_IMAGE
+
+/* Command line option for static images. */
+#define BLOCK_OPT_STATIC "static"
+
+#define KiB 1024
+#define MiB (KiB * KiB)
+
+#define SECTOR_SIZE 512
+#define DEFAULT_CLUSTER_SIZE (1 * MiB)
+
+#if defined(CONFIG_VDI_DEBUG)
+#define logout(fmt, ...) \
+ fprintf(stderr, "vdi\t%-24s" fmt, __func__, ##__VA_ARGS__)
+#else
+#define logout(fmt, ...) ((void)0)
+#endif
+
+/* Image signature. */
+#define VDI_SIGNATURE 0xbeda107f
+
+/* Image version. */
+#define VDI_VERSION_1_1 0x00010001
+
+/* Image type. */
+#define VDI_TYPE_DYNAMIC 1
+#define VDI_TYPE_STATIC 2
+
+/* Innotek / SUN images use these strings in header.text:
+ * "<<< innotek VirtualBox Disk Image >>>\n"
+ * "<<< Sun xVM VirtualBox Disk Image >>>\n"
+ * "<<< Sun VirtualBox Disk Image >>>\n"
+ * The value does not matter, so QEMU-created images use a different text.
+ */
+#define VDI_TEXT "<<< QEMU VM Virtual Disk Image >>>\n"
+
+/* A never-allocated block; semantically arbitrary content. */
+#define VDI_UNALLOCATED 0xffffffffU
+
+/* A discarded (no longer allocated) block; semantically zero-filled. */
+#define VDI_DISCARDED 0xfffffffeU
+
+#define VDI_IS_ALLOCATED(X) ((X) < VDI_DISCARDED)
+
+#if !defined(CONFIG_UUID)
+void uuid_generate(uuid_t out)
+{
+ memset(out, 0, sizeof(uuid_t));
+}
+
+int uuid_is_null(const uuid_t uu)
+{
+ uuid_t null_uuid = { 0 };
+ return memcmp(uu, null_uuid, sizeof(uuid_t)) == 0;
+}
+
+void uuid_unparse(const uuid_t uu, char *out)
+{
+ snprintf(out, 37, UUID_FMT,
+ uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], uu[7],
+ uu[8], uu[9], uu[10], uu[11], uu[12], uu[13], uu[14], uu[15]);
+}
+#endif
+
+typedef struct {
+ char text[0x40];
+ uint32_t signature;
+ uint32_t version;
+ uint32_t header_size;
+ uint32_t image_type;
+ uint32_t image_flags;
+ char description[256];
+ uint32_t offset_bmap;
+ uint32_t offset_data;
+ uint32_t cylinders; /* disk geometry, unused here */
+ uint32_t heads; /* disk geometry, unused here */
+ uint32_t sectors; /* disk geometry, unused here */
+ uint32_t sector_size;
+ uint32_t unused1;
+ uint64_t disk_size;
+ uint32_t block_size;
+ uint32_t block_extra; /* unused here */
+ uint32_t blocks_in_image;
+ uint32_t blocks_allocated;
+ uuid_t uuid_image;
+ uuid_t uuid_last_snap;
+ uuid_t uuid_link;
+ uuid_t uuid_parent;
+ uint64_t unused2[7];
+} VdiHeader;
+
+typedef struct {
+ /* The block map entries are little endian (even in memory). */
+ uint32_t *bmap;
+ /* Size of block (bytes). */
+ uint32_t block_size;
+ /* Size of block (sectors). */
+ uint32_t block_sectors;
+ /* First sector of block map. */
+ uint32_t bmap_sector;
+ /* VDI header (converted to host endianness). */
+ VdiHeader header;
+
+ Error *migration_blocker;
+} BDRVVdiState;
+
+/* Change UUID from little endian (IPRT = VirtualBox format) to big endian
+ * format (network byte order, standard, see RFC 4122) and vice versa.
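+ *
+ * For example, the on-disk bytes 67 45 23 01 ab 89 ef cd for the first
+ * three fields become 01 23 45 67 89 ab cd ef after conversion.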
+ */
+static void uuid_convert(uuid_t uuid)
+{
+ bswap32s((uint32_t *)&uuid[0]);
+ bswap16s((uint16_t *)&uuid[4]);
+ bswap16s((uint16_t *)&uuid[6]);
+}
+
+static void vdi_header_to_cpu(VdiHeader *header)
+{
+ le32_to_cpus(&header->signature);
+ le32_to_cpus(&header->version);
+ le32_to_cpus(&header->header_size);
+ le32_to_cpus(&header->image_type);
+ le32_to_cpus(&header->image_flags);
+ le32_to_cpus(&header->offset_bmap);
+ le32_to_cpus(&header->offset_data);
+ le32_to_cpus(&header->cylinders);
+ le32_to_cpus(&header->heads);
+ le32_to_cpus(&header->sectors);
+ le32_to_cpus(&header->sector_size);
+ le64_to_cpus(&header->disk_size);
+ le32_to_cpus(&header->block_size);
+ le32_to_cpus(&header->block_extra);
+ le32_to_cpus(&header->blocks_in_image);
+ le32_to_cpus(&header->blocks_allocated);
+ uuid_convert(header->uuid_image);
+ uuid_convert(header->uuid_last_snap);
+ uuid_convert(header->uuid_link);
+ uuid_convert(header->uuid_parent);
+}
+
+static void vdi_header_to_le(VdiHeader *header)
+{
+ cpu_to_le32s(&header->signature);
+ cpu_to_le32s(&header->version);
+ cpu_to_le32s(&header->header_size);
+ cpu_to_le32s(&header->image_type);
+ cpu_to_le32s(&header->image_flags);
+ cpu_to_le32s(&header->offset_bmap);
+ cpu_to_le32s(&header->offset_data);
+ cpu_to_le32s(&header->cylinders);
+ cpu_to_le32s(&header->heads);
+ cpu_to_le32s(&header->sectors);
+ cpu_to_le32s(&header->sector_size);
+ cpu_to_le64s(&header->disk_size);
+ cpu_to_le32s(&header->block_size);
+ cpu_to_le32s(&header->block_extra);
+ cpu_to_le32s(&header->blocks_in_image);
+ cpu_to_le32s(&header->blocks_allocated);
+ cpu_to_le32s(&header->blocks_allocated);
+ uuid_convert(header->uuid_image);
+ uuid_convert(header->uuid_last_snap);
+ uuid_convert(header->uuid_link);
+ uuid_convert(header->uuid_parent);
+}
+
+#if defined(CONFIG_VDI_DEBUG)
+static void vdi_header_print(VdiHeader *header)
+{
+ char uuid[37];
+ logout("text %s", header->text);
+ logout("signature 0x%04x\n", header->signature);
+ logout("header size 0x%04x\n", header->header_size);
+ logout("image type 0x%04x\n", header->image_type);
+ logout("image flags 0x%04x\n", header->image_flags);
+ logout("description %s\n", header->description);
+ logout("offset bmap 0x%04x\n", header->offset_bmap);
+ logout("offset data 0x%04x\n", header->offset_data);
+ logout("cylinders 0x%04x\n", header->cylinders);
+ logout("heads 0x%04x\n", header->heads);
+ logout("sectors 0x%04x\n", header->sectors);
+ logout("sector size 0x%04x\n", header->sector_size);
+ logout("image size 0x%" PRIx64 " B (%" PRIu64 " MiB)\n",
+ header->disk_size, header->disk_size / MiB);
+ logout("block size 0x%04x\n", header->block_size);
+ logout("block extra 0x%04x\n", header->block_extra);
+ logout("blocks tot. 0x%04x\n", header->blocks_in_image);
+ logout("blocks all. 0x%04x\n", header->blocks_allocated);
+ uuid_unparse(header->uuid_image, uuid);
+ logout("uuid image %s\n", uuid);
+ uuid_unparse(header->uuid_last_snap, uuid);
+ logout("uuid snap %s\n", uuid);
+ uuid_unparse(header->uuid_link, uuid);
+ logout("uuid link %s\n", uuid);
+ uuid_unparse(header->uuid_parent, uuid);
+ logout("uuid parent %s\n", uuid);
+}
+#endif
+
+static int vdi_check(BlockDriverState *bs, BdrvCheckResult *res,
+ BdrvCheckMode fix)
+{
+ /* TODO: additional checks possible. */
+ BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
+ uint32_t blocks_allocated = 0;
+ uint32_t block;
+ uint32_t *bmap;
+ logout("\n");
+
+ if (fix) {
+ return -ENOTSUP;
+ }
+
+ bmap = g_malloc(s->header.blocks_in_image * sizeof(uint32_t));
+ memset(bmap, 0xff, s->header.blocks_in_image * sizeof(uint32_t));
+
+ /* Check block map and value of blocks_allocated. */
+ for (block = 0; block < s->header.blocks_in_image; block++) {
+ uint32_t bmap_entry = le32_to_cpu(s->bmap[block]);
+ if (VDI_IS_ALLOCATED(bmap_entry)) {
+ if (bmap_entry < s->header.blocks_in_image) {
+ blocks_allocated++;
+ if (!VDI_IS_ALLOCATED(bmap[bmap_entry])) {
+ bmap[bmap_entry] = bmap_entry;
+ } else {
+ fprintf(stderr, "ERROR: block index %" PRIu32
+ " also used by %" PRIu32 "\n", bmap[bmap_entry], bmap_entry);
+ res->corruptions++;
+ }
+ } else {
+ fprintf(stderr, "ERROR: block index %" PRIu32
+ " too large, is %" PRIu32 "\n", block, bmap_entry);
+ res->corruptions++;
+ }
+ }
+ }
+ if (blocks_allocated != s->header.blocks_allocated) {
+ fprintf(stderr, "ERROR: allocated blocks mismatch, is %" PRIu32
+ ", should be %" PRIu32 "\n",
+ blocks_allocated, s->header.blocks_allocated);
+ res->corruptions++;
+ }
+
+ g_free(bmap);
+
+ return 0;
+}
+
+static int vdi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+ /* TODO: vdi_get_info would be needed for machine snapshots.
+ vm_state_offset is still missing. */
+ BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
+ logout("\n");
+ bdi->cluster_size = s->block_size;
+ bdi->vm_state_offset = 0;
+ return 0;
+}
+
+static int vdi_make_empty(BlockDriverState *bs)
+{
+ /* TODO: missing code. */
+ logout("\n");
+ /* The return value for missing code must be 0, see block.c. */
+ return 0;
+}
+
+static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+ const VdiHeader *header = (const VdiHeader *)buf;
+ int result = 0;
+
+ logout("\n");
+
+ if (buf_size < sizeof(*header)) {
+ /* Header too small, no VDI. */
+ } else if (le32_to_cpu(header->signature) == VDI_SIGNATURE) {
+ result = 100;
+ }
+
+ if (result == 0) {
+ logout("no vdi image\n");
+ } else {
+ logout("%s", header->text);
+ }
+
+ return result;
+}
+
+static int vdi_open(BlockDriverState *bs, int flags)
+{
+ BDRVVdiState *s = bs->opaque;
+ VdiHeader header;
+ size_t bmap_size;
+
+ logout("\n");
+
+ if (bdrv_read(bs->file, 0, (uint8_t *)&header, 1) < 0) {
+ goto fail;
+ }
+
+ vdi_header_to_cpu(&header);
+#if defined(CONFIG_VDI_DEBUG)
+ vdi_header_print(&header);
+#endif
+
+ if (header.disk_size % SECTOR_SIZE != 0) {
+ /* 'VBoxManage convertfromraw' can create images with odd disk sizes.
+ We accept them but round the disk size to the next multiple of
+ SECTOR_SIZE. */
+ logout("odd disk size %" PRIu64 " B, round up\n", header.disk_size);
+ header.disk_size += SECTOR_SIZE - 1;
+ header.disk_size &= ~(SECTOR_SIZE - 1);
+ }
+
+ if (header.version != VDI_VERSION_1_1) {
+ logout("unsupported version %u.%u\n",
+ header.version >> 16, header.version & 0xffff);
+ goto fail;
+ } else if (header.offset_bmap % SECTOR_SIZE != 0) {
+ /* We only support block maps which start on a sector boundary. */
+ logout("unsupported block map offset 0x%x B\n", header.offset_bmap);
+ goto fail;
+ } else if (header.offset_data % SECTOR_SIZE != 0) {
+ /* We only support data blocks which start on a sector boundary. */
+ logout("unsupported data offset 0x%x B\n", header.offset_data);
+ goto fail;
+ } else if (header.sector_size != SECTOR_SIZE) {
+ logout("unsupported sector size %u B\n", header.sector_size);
+ goto fail;
+ } else if (header.block_size != 1 * MiB) {
+ logout("unsupported block size %u B\n", header.block_size);
+ goto fail;
+ } else if (header.disk_size >
+ (uint64_t)header.blocks_in_image * header.block_size) {
+ logout("unsupported disk size %" PRIu64 " B\n", header.disk_size);
+ goto fail;
+ } else if (!uuid_is_null(header.uuid_link)) {
+ logout("link uuid != 0, unsupported\n");
+ goto fail;
+ } else if (!uuid_is_null(header.uuid_parent)) {
+ logout("parent uuid != 0, unsupported\n");
+ goto fail;
+ }
+
+ bs->total_sectors = header.disk_size / SECTOR_SIZE;
+
+ s->block_size = header.block_size;
+ s->block_sectors = header.block_size / SECTOR_SIZE;
+ s->bmap_sector = header.offset_bmap / SECTOR_SIZE;
+ s->header = header;
+
+ bmap_size = header.blocks_in_image * sizeof(uint32_t);
+ bmap_size = (bmap_size + SECTOR_SIZE - 1) / SECTOR_SIZE;
+ if (bmap_size > 0) {
+ s->bmap = g_malloc(bmap_size * SECTOR_SIZE);
+ }
+ if (bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size) < 0) {
+ goto fail_free_bmap;
+ }
+
+ /* Disable migration when vdi images are used */
+ error_set(&s->migration_blocker,
+ QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
+ "vdi", bs->device_name, "live migration");
+ migrate_add_blocker(s->migration_blocker);
+
+ return 0;
+
+ fail_free_bmap:
+ g_free(s->bmap);
+
+ fail:
+ return -1;
+}
+
+static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, int *pnum)
+{
+ /* TODO: Check for too large sector_num (in bdrv_is_allocated or here). */
+ BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
+ size_t bmap_index = sector_num / s->block_sectors;
+ size_t sector_in_block = sector_num % s->block_sectors;
+ int n_sectors = s->block_sectors - sector_in_block;
+ uint32_t bmap_entry = le32_to_cpu(s->bmap[bmap_index]);
+ logout("%p, %" PRId64 ", %d, %p\n", bs, sector_num, nb_sectors, pnum);
+ if (n_sectors > nb_sectors) {
+ n_sectors = nb_sectors;
+ }
+ *pnum = n_sectors;
+ return VDI_IS_ALLOCATED(bmap_entry);
+}
+
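+/*
+ * Read guest sectors by translating each one through the block map: with
+ * the default 1 MiB block size, sector N belongs to block N / 2048, and
+ * an allocated block's data starts at file sector
+ * offset_data / SECTOR_SIZE + bmap_entry * block_sectors.  Unallocated
+ * blocks read back as zeros.
+ */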
+static int vdi_co_read(BlockDriverState *bs,
+ int64_t sector_num, uint8_t *buf, int nb_sectors)
+{
+ BDRVVdiState *s = bs->opaque;
+ uint32_t bmap_entry;
+ uint32_t block_index;
+ uint32_t sector_in_block;
+ uint32_t n_sectors;
+ int ret = 0;
+
+ logout("\n");
+
+ while (ret >= 0 && nb_sectors > 0) {
+ block_index = sector_num / s->block_sectors;
+ sector_in_block = sector_num % s->block_sectors;
+ n_sectors = s->block_sectors - sector_in_block;
+ if (n_sectors > nb_sectors) {
+ n_sectors = nb_sectors;
+ }
+
+ logout("will read %u sectors starting at sector %" PRIu64 "\n",
+ n_sectors, sector_num);
+
+ /* prepare next AIO request */
+ bmap_entry = le32_to_cpu(s->bmap[block_index]);
+ if (!VDI_IS_ALLOCATED(bmap_entry)) {
+ /* Block not allocated, return zeros, no need to wait. */
+ memset(buf, 0, n_sectors * SECTOR_SIZE);
+ ret = 0;
+ } else {
+ uint64_t offset = s->header.offset_data / SECTOR_SIZE +
+ (uint64_t)bmap_entry * s->block_sectors +
+ sector_in_block;
+ ret = bdrv_read(bs->file, offset, buf, n_sectors);
+ }
+ logout("%u sectors read\n", n_sectors);
+
+ nb_sectors -= n_sectors;
+ sector_num += n_sectors;
+ buf += n_sectors * SECTOR_SIZE;
+ }
+
+ return ret;
+}
+
+static int vdi_co_write(BlockDriverState *bs,
+ int64_t sector_num, const uint8_t *buf, int nb_sectors)
+{
+ BDRVVdiState *s = bs->opaque;
+ uint32_t bmap_entry;
+ uint32_t block_index;
+ uint32_t sector_in_block;
+ uint32_t n_sectors;
+ uint32_t bmap_first = VDI_UNALLOCATED;
+ uint32_t bmap_last = VDI_UNALLOCATED;
+ uint8_t *block = NULL;
+ int ret = 0;
+
+ logout("\n");
+
+ while (ret >= 0 && nb_sectors > 0) {
+ block_index = sector_num / s->block_sectors;
+ sector_in_block = sector_num % s->block_sectors;
+ n_sectors = s->block_sectors - sector_in_block;
+ if (n_sectors > nb_sectors) {
+ n_sectors = nb_sectors;
+ }
+
+ logout("will write %u sectors starting at sector %" PRIu64 "\n",
+ n_sectors, sector_num);
+
+ /* prepare next AIO request */
+ bmap_entry = le32_to_cpu(s->bmap[block_index]);
+ if (!VDI_IS_ALLOCATED(bmap_entry)) {
+ /* Allocate new block and write to it. */
+ uint64_t offset;
+ bmap_entry = s->header.blocks_allocated;
+ s->bmap[block_index] = cpu_to_le32(bmap_entry);
+ s->header.blocks_allocated++;
+ offset = s->header.offset_data / SECTOR_SIZE +
+ (uint64_t)bmap_entry * s->block_sectors;
+ if (block == NULL) {
+ block = g_malloc(s->block_size);
+ bmap_first = block_index;
+ }
+ bmap_last = block_index;
+ /* Copy data to be written to new block and zero unused parts. */
+ memset(block, 0, sector_in_block * SECTOR_SIZE);
+ memcpy(block + sector_in_block * SECTOR_SIZE,
+ buf, n_sectors * SECTOR_SIZE);
+ memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0,
+ (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE);
+ ret = bdrv_write(bs->file, offset, block, s->block_sectors);
+ } else {
+ uint64_t offset = s->header.offset_data / SECTOR_SIZE +
+ (uint64_t)bmap_entry * s->block_sectors +
+ sector_in_block;
+ ret = bdrv_write(bs->file, offset, buf, n_sectors);
+ }
+
+ nb_sectors -= n_sectors;
+ sector_num += n_sectors;
+ buf += n_sectors * SECTOR_SIZE;
+
+ logout("%u sectors written\n", n_sectors);
+ }
+
+ logout("finished data write\n");
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (block) {
+ /* One or more new blocks were allocated. */
+ VdiHeader *header = (VdiHeader *) block;
+ uint8_t *base;
+ uint64_t offset;
+
+ logout("now writing modified header\n");
+ assert(VDI_IS_ALLOCATED(bmap_first));
+ *header = s->header;
+ vdi_header_to_le(header);
+ ret = bdrv_write(bs->file, 0, block, 1);
+ g_free(block);
+ block = NULL;
+
+ if (ret < 0) {
+ return ret;
+ }
+
+ logout("now writing modified block map entry %u...%u\n",
+ bmap_first, bmap_last);
+ /* Write modified sectors from block map. */
+ bmap_first /= (SECTOR_SIZE / sizeof(uint32_t));
+ bmap_last /= (SECTOR_SIZE / sizeof(uint32_t));
+ n_sectors = bmap_last - bmap_first + 1;
+ offset = s->bmap_sector + bmap_first;
+ base = ((uint8_t *)&s->bmap[0]) + bmap_first * SECTOR_SIZE;
+ logout("will write %u block map sectors starting from entry %u\n",
+ n_sectors, bmap_first);
+ ret = bdrv_write(bs->file, offset, base, n_sectors);
+ }
+
+ return ret;
+}
+
+static int vdi_create(const char *filename, QEMUOptionParameter *options)
+{
+ int fd;
+ int result = 0;
+ uint64_t bytes = 0;
+ uint32_t blocks;
+ size_t block_size = DEFAULT_CLUSTER_SIZE;
+ uint32_t image_type = VDI_TYPE_DYNAMIC;
+ VdiHeader header;
+ size_t i;
+ size_t bmap_size;
+ uint32_t *bmap;
+
+ logout("\n");
+
+ /* Read out options. */
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ bytes = options->value.n;
+#if defined(CONFIG_VDI_BLOCK_SIZE)
+ } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
+ if (options->value.n) {
+ /* TODO: Additional checks (SECTOR_SIZE * 2^n, ...). */
+ block_size = options->value.n;
+ }
+#endif
+#if defined(CONFIG_VDI_STATIC_IMAGE)
+ } else if (!strcmp(options->name, BLOCK_OPT_STATIC)) {
+ if (options->value.n) {
+ image_type = VDI_TYPE_STATIC;
+ }
+#endif
+ }
+ options++;
+ }
+
+ fd = qemu_open(filename,
+ O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
+ 0644);
+ if (fd < 0) {
+ return -errno;
+ }
+
+ /* We need enough blocks to store the given disk size,
+ so always round up. */
+ blocks = (bytes + block_size - 1) / block_size;
+
+ bmap_size = blocks * sizeof(uint32_t);
+ bmap_size = ((bmap_size + SECTOR_SIZE - 1) & ~(SECTOR_SIZE -1));
+
+ memset(&header, 0, sizeof(header));
+ pstrcpy(header.text, sizeof(header.text), VDI_TEXT);
+ header.signature = VDI_SIGNATURE;
+ header.version = VDI_VERSION_1_1;
+ header.header_size = 0x180;
+ header.image_type = image_type;
+ header.offset_bmap = 0x200;
+ header.offset_data = 0x200 + bmap_size;
+ header.sector_size = SECTOR_SIZE;
+ header.disk_size = bytes;
+ header.block_size = block_size;
+ header.blocks_in_image = blocks;
+ if (image_type == VDI_TYPE_STATIC) {
+ header.blocks_allocated = blocks;
+ }
+ uuid_generate(header.uuid_image);
+ uuid_generate(header.uuid_last_snap);
+ /* There is no need to set header.uuid_link or header.uuid_parent here. */
+#if defined(CONFIG_VDI_DEBUG)
+ vdi_header_print(&header);
+#endif
+ vdi_header_to_le(&header);
+ if (write(fd, &header, sizeof(header)) < 0) {
+ result = -errno;
+ }
+
+ bmap = NULL;
+ if (bmap_size > 0) {
+ bmap = (uint32_t *)g_malloc0(bmap_size);
+ }
+ for (i = 0; i < blocks; i++) {
+ if (image_type == VDI_TYPE_STATIC) {
+ bmap[i] = i;
+ } else {
+ bmap[i] = VDI_UNALLOCATED;
+ }
+ }
+ if (write(fd, bmap, bmap_size) < 0) {
+ result = -errno;
+ }
+ g_free(bmap);
+ if (image_type == VDI_TYPE_STATIC) {
+ if (ftruncate(fd, sizeof(header) + bmap_size + blocks * block_size)) {
+ result = -errno;
+ }
+ }
+
+ if (close(fd) < 0) {
+ result = -errno;
+ }
+
+ return result;
+}
+
+static void vdi_close(BlockDriverState *bs)
+{
+ BDRVVdiState *s = bs->opaque;
+
+ g_free(s->bmap);
+
+ migrate_del_blocker(s->migration_blocker);
+ error_free(s->migration_blocker);
+}
+
+static QEMUOptionParameter vdi_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+#if defined(CONFIG_VDI_BLOCK_SIZE)
+ {
+ .name = BLOCK_OPT_CLUSTER_SIZE,
+ .type = OPT_SIZE,
+ .help = "VDI cluster (block) size",
+ .value = { .n = DEFAULT_CLUSTER_SIZE },
+ },
+#endif
+#if defined(CONFIG_VDI_STATIC_IMAGE)
+ {
+ .name = BLOCK_OPT_STATIC,
+ .type = OPT_FLAG,
+ .help = "VDI static (pre-allocated) image"
+ },
+#endif
+ /* TODO: An additional option to set UUID values might be useful. */
+ { NULL }
+};
+
+static BlockDriver bdrv_vdi = {
+ .format_name = "vdi",
+ .instance_size = sizeof(BDRVVdiState),
+ .bdrv_probe = vdi_probe,
+ .bdrv_open = vdi_open,
+ .bdrv_close = vdi_close,
+ .bdrv_create = vdi_create,
+ .bdrv_co_is_allocated = vdi_co_is_allocated,
+ .bdrv_make_empty = vdi_make_empty,
+
+ .bdrv_read = vdi_co_read,
+#if defined(CONFIG_VDI_WRITE)
+ .bdrv_write = vdi_co_write,
+#endif
+
+ .bdrv_get_info = vdi_get_info,
+
+ .create_options = vdi_create_options,
+ .bdrv_check = vdi_check,
+};
+
+static void bdrv_vdi_init(void)
+{
+ logout("\n");
+ bdrv_register(&bdrv_vdi);
+}
+
+block_init(bdrv_vdi_init);
diff --git a/block/vmdk.c b/block/vmdk.c
new file mode 100644
index 000000000..bba4c61a7
--- /dev/null
+++ b/block/vmdk.c
@@ -0,0 +1,1665 @@
+/*
+ * Block driver for the VMDK format
+ *
+ * Copyright (c) 2004 Fabrice Bellard
+ * Copyright (c) 2005 Filip Navara
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include "migration.h"
+#include <zlib.h>
+
+#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
+#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
+#define VMDK4_COMPRESSION_DEFLATE 1
+#define VMDK4_FLAG_RGD (1 << 1)
+#define VMDK4_FLAG_COMPRESS (1 << 16)
+#define VMDK4_FLAG_MARKER (1 << 17)
+#define VMDK4_GD_AT_END 0xffffffffffffffffULL
+
+typedef struct {
+ uint32_t version;
+ uint32_t flags;
+ uint32_t disk_sectors;
+ uint32_t granularity;
+ uint32_t l1dir_offset;
+ uint32_t l1dir_size;
+ uint32_t file_sectors;
+ uint32_t cylinders;
+ uint32_t heads;
+ uint32_t sectors_per_track;
+} VMDK3Header;
+
+typedef struct {
+ uint32_t version;
+ uint32_t flags;
+ int64_t capacity;
+ int64_t granularity;
+ int64_t desc_offset;
+ int64_t desc_size;
+ int32_t num_gtes_per_gte;
+ int64_t rgd_offset;
+ int64_t gd_offset;
+ int64_t grain_offset;
+ char filler[1];
+ char check_bytes[4];
+ uint16_t compressAlgorithm;
+} QEMU_PACKED VMDK4Header;
+
+#define L2_CACHE_SIZE 16
+
+typedef struct VmdkExtent {
+ BlockDriverState *file;
+ bool flat;
+ bool compressed;
+ bool has_marker;
+ int64_t sectors;
+ int64_t end_sector;
+ int64_t flat_start_offset;
+ int64_t l1_table_offset;
+ int64_t l1_backup_table_offset;
+ uint32_t *l1_table;
+ uint32_t *l1_backup_table;
+ unsigned int l1_size;
+ uint32_t l1_entry_sectors;
+
+ unsigned int l2_size;
+ uint32_t *l2_cache;
+ uint32_t l2_cache_offsets[L2_CACHE_SIZE];
+ uint32_t l2_cache_counts[L2_CACHE_SIZE];
+
+ unsigned int cluster_sectors;
+} VmdkExtent;
+
+typedef struct BDRVVmdkState {
+ CoMutex lock;
+ int desc_offset;
+ bool cid_updated;
+ uint32_t parent_cid;
+ int num_extents;
+ /* Extent array with num_extents entries, ordered by ascending address */
+ VmdkExtent *extents;
+ Error *migration_blocker;
+} BDRVVmdkState;
+
+typedef struct VmdkMetaData {
+ uint32_t offset;
+ unsigned int l1_index;
+ unsigned int l2_index;
+ unsigned int l2_offset;
+ int valid;
+} VmdkMetaData;
+
+typedef struct VmdkGrainMarker {
+ uint64_t lba;
+ uint32_t size;
+ uint8_t data[0];
+} VmdkGrainMarker;
+
+enum {
+ MARKER_END_OF_STREAM = 0,
+ MARKER_GRAIN_TABLE = 1,
+ MARKER_GRAIN_DIRECTORY = 2,
+ MARKER_FOOTER = 3,
+};
+
+static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+ uint32_t magic;
+
+ if (buf_size < 4) {
+ return 0;
+ }
+ magic = be32_to_cpu(*(uint32_t *)buf);
+ if (magic == VMDK3_MAGIC ||
+ magic == VMDK4_MAGIC) {
+ return 100;
+ } else {
+ const char *p = (const char *)buf;
+ const char *end = p + buf_size;
+ while (p < end) {
+ if (*p == '#') {
+ /* skip comment line */
+ while (p < end && *p != '\n') {
+ p++;
+ }
+ p++;
+ continue;
+ }
+ if (*p == ' ') {
+ while (p < end && *p == ' ') {
+ p++;
+ }
+ /* skip '\r' if Windows line endings are used. */
+ if (p < end && *p == '\r') {
+ p++;
+ }
+ /* only accept blank lines before 'version=' line */
+ if (p == end || *p != '\n') {
+ return 0;
+ }
+ p++;
+ continue;
+ }
+ if (end - p >= strlen("version=X\n")) {
+ if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 ||
+ strncmp("version=2\n", p, strlen("version=2\n")) == 0) {
+ return 100;
+ }
+ }
+ if (end - p >= strlen("version=X\r\n")) {
+ if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 ||
+ strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0) {
+ return 100;
+ }
+ }
+ return 0;
+ }
+ return 0;
+ }
+}
+
+#define CHECK_CID 1
+
+#define SECTOR_SIZE 512
+#define DESC_SIZE (20 * SECTOR_SIZE) /* 20 sectors of 512 bytes each */
+#define BUF_SIZE 4096
+#define HEADER_SIZE 512 /* first sector of 512 bytes */
+
+static void vmdk_free_extents(BlockDriverState *bs)
+{
+ int i;
+ BDRVVmdkState *s = bs->opaque;
+ VmdkExtent *e;
+
+ for (i = 0; i < s->num_extents; i++) {
+ e = &s->extents[i];
+ g_free(e->l1_table);
+ g_free(e->l2_cache);
+ g_free(e->l1_backup_table);
+ if (e->file != bs->file) {
+ bdrv_delete(e->file);
+ }
+ }
+ g_free(s->extents);
+}
+
+static void vmdk_free_last_extent(BlockDriverState *bs)
+{
+ BDRVVmdkState *s = bs->opaque;
+
+ if (s->num_extents == 0) {
+ return;
+ }
+ s->num_extents--;
+ s->extents = g_realloc(s->extents, s->num_extents * sizeof(VmdkExtent));
+}
+
+static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
+{
+ char desc[DESC_SIZE];
+ uint32_t cid = 0xffffffff;
+ const char *p_name, *cid_str;
+ size_t cid_str_size;
+ BDRVVmdkState *s = bs->opaque;
+ int ret;
+
+ ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
+ if (ret < 0) {
+ return 0;
+ }
+
+ if (parent) {
+ cid_str = "parentCID";
+ cid_str_size = sizeof("parentCID");
+ } else {
+ cid_str = "CID";
+ cid_str_size = sizeof("CID");
+ }
+
+ desc[DESC_SIZE - 1] = '\0';
+ p_name = strstr(desc, cid_str);
+ if (p_name != NULL) {
+ p_name += cid_str_size;
+ sscanf(p_name, "%x", &cid);
+ }
+
+ return cid;
+}
+
+static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
+{
+ char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
+ char *p_name, *tmp_str;
+ BDRVVmdkState *s = bs->opaque;
+ int ret;
+
+ ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
+ if (ret < 0) {
+ return ret;
+ }
+
+ desc[DESC_SIZE - 1] = '\0';
+ tmp_str = strstr(desc, "parentCID");
+ if (tmp_str == NULL) {
+ return -EINVAL;
+ }
+
+ pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
+ p_name = strstr(desc, "CID");
+ if (p_name != NULL) {
+ p_name += sizeof("CID");
+ snprintf(p_name, sizeof(desc) - (p_name - desc), "%x\n", cid);
+ pstrcat(desc, sizeof(desc), tmp_desc);
+ }
+
+ ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
+static int vmdk_is_cid_valid(BlockDriverState *bs)
+{
+#ifdef CHECK_CID
+ BDRVVmdkState *s = bs->opaque;
+ BlockDriverState *p_bs = bs->backing_hd;
+ uint32_t cur_pcid;
+
+ if (p_bs) {
+ cur_pcid = vmdk_read_cid(p_bs, 0);
+ if (s->parent_cid != cur_pcid) {
+ /* CID not valid */
+ return 0;
+ }
+ }
+#endif
+ /* CID valid */
+ return 1;
+}
+
+static int vmdk_parent_open(BlockDriverState *bs)
+{
+ char *p_name;
+ char desc[DESC_SIZE + 1];
+ BDRVVmdkState *s = bs->opaque;
+ int ret;
+
+ desc[DESC_SIZE] = '\0';
+ ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
+ if (ret < 0) {
+ return ret;
+ }
+
+ p_name = strstr(desc, "parentFileNameHint");
+ if (p_name != NULL) {
+ char *end_name;
+
+ p_name += sizeof("parentFileNameHint") + 1;
+ end_name = strchr(p_name, '\"');
+ if (end_name == NULL) {
+ return -EINVAL;
+ }
+ if ((end_name - p_name) > sizeof(bs->backing_file) - 1) {
+ return -EINVAL;
+ }
+
+ pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
+ }
+
+ return 0;
+}
+
+/* Create and append an extent to the extent array. Return the address of the
+ * added VmdkExtent, or NULL if allocation failed. */
+static VmdkExtent *vmdk_add_extent(BlockDriverState *bs,
+ BlockDriverState *file, bool flat, int64_t sectors,
+ int64_t l1_offset, int64_t l1_backup_offset,
+ uint32_t l1_size,
+ int l2_size, unsigned int cluster_sectors)
+{
+ VmdkExtent *extent;
+ BDRVVmdkState *s = bs->opaque;
+
+ s->extents = g_realloc(s->extents,
+ (s->num_extents + 1) * sizeof(VmdkExtent));
+ extent = &s->extents[s->num_extents];
+ s->num_extents++;
+
+ memset(extent, 0, sizeof(VmdkExtent));
+ extent->file = file;
+ extent->flat = flat;
+ extent->sectors = sectors;
+ extent->l1_table_offset = l1_offset;
+ extent->l1_backup_table_offset = l1_backup_offset;
+ extent->l1_size = l1_size;
+ extent->l1_entry_sectors = l2_size * cluster_sectors;
+ extent->l2_size = l2_size;
+ extent->cluster_sectors = cluster_sectors;
+
+ if (s->num_extents > 1) {
+ extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
+ } else {
+ extent->end_sector = extent->sectors;
+ }
+ bs->total_sectors = extent->end_sector;
+ return extent;
+}
+
+static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent)
+{
+ int ret;
+ int l1_size, i;
+
+ /* read the L1 table */
+ l1_size = extent->l1_size * sizeof(uint32_t);
+ extent->l1_table = g_malloc(l1_size);
+ ret = bdrv_pread(extent->file,
+ extent->l1_table_offset,
+ extent->l1_table,
+ l1_size);
+ if (ret < 0) {
+ goto fail_l1;
+ }
+ for (i = 0; i < extent->l1_size; i++) {
+ le32_to_cpus(&extent->l1_table[i]);
+ }
+
+ if (extent->l1_backup_table_offset) {
+ extent->l1_backup_table = g_malloc(l1_size);
+ ret = bdrv_pread(extent->file,
+ extent->l1_backup_table_offset,
+ extent->l1_backup_table,
+ l1_size);
+ if (ret < 0) {
+ goto fail_l1b;
+ }
+ for (i = 0; i < extent->l1_size; i++) {
+ le32_to_cpus(&extent->l1_backup_table[i]);
+ }
+ }
+
+ extent->l2_cache =
+ g_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
+ return 0;
+ fail_l1b:
+ g_free(extent->l1_backup_table);
+ fail_l1:
+ g_free(extent->l1_table);
+ return ret;
+}
+
+static int vmdk_open_vmdk3(BlockDriverState *bs,
+ BlockDriverState *file,
+ int flags)
+{
+ int ret;
+ uint32_t magic;
+ VMDK3Header header;
+ VmdkExtent *extent;
+
+ ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
+ if (ret < 0) {
+ return ret;
+ }
+ extent = vmdk_add_extent(bs,
+ bs->file, false,
+ le32_to_cpu(header.disk_sectors),
+ le32_to_cpu(header.l1dir_offset) << 9,
+ 0, 1 << 6, 1 << 9,
+ le32_to_cpu(header.granularity));
+ ret = vmdk_init_tables(bs, extent);
+ if (ret) {
+ /* free extent allocated by vmdk_add_extent */
+ vmdk_free_last_extent(bs);
+ }
+ return ret;
+}
+
+static int vmdk_open_desc_file(BlockDriverState *bs, int flags,
+ int64_t desc_offset);
+
+static int vmdk_open_vmdk4(BlockDriverState *bs,
+ BlockDriverState *file,
+ int flags)
+{
+ int ret;
+ uint32_t magic;
+ uint32_t l1_size, l1_entry_sectors;
+ VMDK4Header header;
+ VmdkExtent *extent;
+ int64_t l1_backup_offset = 0;
+
+ ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
+ if (ret < 0) {
+ return ret;
+ }
+ if (header.capacity == 0 && header.desc_offset) {
+ return vmdk_open_desc_file(bs, flags, header.desc_offset << 9);
+ }
+
+ if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
+ /*
+ * The footer takes precedence over the header, so read it in. The
+ * footer starts at offset -1024 from the end: One sector for the
+ * footer, and another one for the end-of-stream marker.
+ */
+ struct {
+ struct {
+ uint64_t val;
+ uint32_t size;
+ uint32_t type;
+ uint8_t pad[512 - 16];
+ } QEMU_PACKED footer_marker;
+
+ uint32_t magic;
+ VMDK4Header header;
+ uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
+
+ struct {
+ uint64_t val;
+ uint32_t size;
+ uint32_t type;
+ uint8_t pad[512 - 16];
+ } QEMU_PACKED eos_marker;
+ } QEMU_PACKED footer;
+
+ ret = bdrv_pread(file,
+ bs->file->total_sectors * 512 - 1536,
+ &footer, sizeof(footer));
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Some sanity checks for the footer */
+ if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
+ le32_to_cpu(footer.footer_marker.size) != 0 ||
+ le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
+ le64_to_cpu(footer.eos_marker.val) != 0 ||
+ le32_to_cpu(footer.eos_marker.size) != 0 ||
+ le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
+ {
+ return -EINVAL;
+ }
+
+ header = footer.header;
+ }
+
+ l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gte)
+ * le64_to_cpu(header.granularity);
+ if (l1_entry_sectors == 0) {
+ return -EINVAL;
+ }
+ l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
+ / l1_entry_sectors;
+ if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
+ l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
+ }
+ extent = vmdk_add_extent(bs, file, false,
+ le64_to_cpu(header.capacity),
+ le64_to_cpu(header.gd_offset) << 9,
+ l1_backup_offset,
+ l1_size,
+ le32_to_cpu(header.num_gtes_per_gte),
+ le64_to_cpu(header.granularity));
+ extent->compressed =
+ le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
+ extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
+ ret = vmdk_init_tables(bs, extent);
+ if (ret) {
+ /* free extent allocated by vmdk_add_extent */
+ vmdk_free_last_extent(bs);
+ }
+ return ret;
+}
+
+/* Find an option value in the descriptor file */
+static int vmdk_parse_description(const char *desc, const char *opt_name,
+ char *buf, int buf_size)
+{
+ char *opt_pos, *opt_end;
+ const char *end = desc + strlen(desc);
+
+ opt_pos = strstr(desc, opt_name);
+ if (!opt_pos) {
+ return -1;
+ }
+ /* Skip "=\"" following opt_name */
+ opt_pos += strlen(opt_name) + 2;
+ if (opt_pos >= end) {
+ return -1;
+ }
+ opt_end = opt_pos;
+ while (opt_end < end && *opt_end != '"') {
+ opt_end++;
+ }
+ if (opt_end == end || buf_size < opt_end - opt_pos + 1) {
+ return -1;
+ }
+ pstrcpy(buf, opt_end - opt_pos + 1, opt_pos);
+ return 0;
+}
+
+/* Open an extent file and append it to the extent array of bs */
+static int vmdk_open_sparse(BlockDriverState *bs,
+ BlockDriverState *file,
+ int flags)
+{
+ uint32_t magic;
+
+ if (bdrv_pread(file, 0, &magic, sizeof(magic)) != sizeof(magic)) {
+ return -EIO;
+ }
+
+ magic = be32_to_cpu(magic);
+ switch (magic) {
+ case VMDK3_MAGIC:
+ return vmdk_open_vmdk3(bs, file, flags);
+ break;
+ case VMDK4_MAGIC:
+ return vmdk_open_vmdk4(bs, file, flags);
+ break;
+ default:
+ return -EINVAL;
+ break;
+ }
+}
+
+static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
+ const char *desc_file_path)
+{
+ int ret;
+ char access[11];
+ char type[11];
+ char fname[512];
+ const char *p = desc;
+ int64_t sectors = 0;
+ int64_t flat_offset;
+ char extent_path[PATH_MAX];
+ BlockDriverState *extent_file;
+
+ while (*p) {
+ /* parse extent line:
+ * RW [size in sectors] FLAT "file-name.vmdk" OFFSET
+ * or
+ * RW [size in sectors] SPARSE "file-name.vmdk"
+ */
+ flat_offset = -1;
+ ret = sscanf(p, "%10s %" SCNd64 " %10s %511s %" SCNd64,
+ access, &sectors, type, fname, &flat_offset);
+ if (ret < 4 || strcmp(access, "RW")) {
+ goto next_line;
+ } else if (!strcmp(type, "FLAT")) {
+ if (ret != 5 || flat_offset < 0) {
+ return -EINVAL;
+ }
+ } else if (ret != 4) {
+ return -EINVAL;
+ }
+
+        /* trim the surrounding quotation marks */
+ if (fname[0] == '"') {
+ memmove(fname, fname + 1, strlen(fname));
+ if (strlen(fname) <= 1 || fname[strlen(fname) - 1] != '"') {
+ return -EINVAL;
+ }
+ fname[strlen(fname) - 1] = '\0';
+ }
+ if (sectors <= 0 ||
+ (strcmp(type, "FLAT") && strcmp(type, "SPARSE")) ||
+ (strcmp(access, "RW"))) {
+ goto next_line;
+ }
+
+ path_combine(extent_path, sizeof(extent_path),
+ desc_file_path, fname);
+ ret = bdrv_file_open(&extent_file, extent_path, bs->open_flags);
+ if (ret) {
+ return ret;
+ }
+
+ /* save to extents array */
+ if (!strcmp(type, "FLAT")) {
+ /* FLAT extent */
+ VmdkExtent *extent;
+
+ extent = vmdk_add_extent(bs, extent_file, true, sectors,
+ 0, 0, 0, 0, sectors);
+ extent->flat_start_offset = flat_offset << 9;
+ } else if (!strcmp(type, "SPARSE")) {
+ /* SPARSE extent */
+ ret = vmdk_open_sparse(bs, extent_file, bs->open_flags);
+ if (ret) {
+ bdrv_delete(extent_file);
+ return ret;
+ }
+ } else {
+            fprintf(stderr,
+                "VMDK: unsupported extent type \"%s\".\n", type);
+ return -ENOTSUP;
+ }
+next_line:
+ /* move to next line */
+ while (*p && *p != '\n') {
+ p++;
+ }
+ p++;
+ }
+ return 0;
+}
+
+static int vmdk_open_desc_file(BlockDriverState *bs, int flags,
+ int64_t desc_offset)
+{
+ int ret;
+ char buf[2048];
+ char ct[128];
+ BDRVVmdkState *s = bs->opaque;
+
+ ret = bdrv_pread(bs->file, desc_offset, buf, sizeof(buf));
+ if (ret < 0) {
+ return ret;
+ }
+ buf[2047] = '\0';
+ if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) {
+ return -EINVAL;
+ }
+ if (strcmp(ct, "monolithicFlat") &&
+ strcmp(ct, "twoGbMaxExtentSparse") &&
+ strcmp(ct, "twoGbMaxExtentFlat")) {
+        fprintf(stderr,
+                "VMDK: unsupported image type \"%s\".\n", ct);
+ return -ENOTSUP;
+ }
+ s->desc_offset = 0;
+ return vmdk_parse_extents(buf, bs, bs->file->filename);
+}
+
+static int vmdk_open(BlockDriverState *bs, int flags)
+{
+ int ret;
+ BDRVVmdkState *s = bs->opaque;
+
+ if (vmdk_open_sparse(bs, bs->file, flags) == 0) {
+ s->desc_offset = 0x200;
+ } else {
+ ret = vmdk_open_desc_file(bs, flags, 0);
+ if (ret) {
+ goto fail;
+ }
+ }
+    /* try to open the parent image, if one exists */
+ ret = vmdk_parent_open(bs);
+ if (ret) {
+ goto fail;
+ }
+ s->parent_cid = vmdk_read_cid(bs, 1);
+ qemu_co_mutex_init(&s->lock);
+
+ /* Disable migration when VMDK images are used */
+ error_set(&s->migration_blocker,
+ QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
+ "vmdk", bs->device_name, "live migration");
+ migrate_add_blocker(s->migration_blocker);
+
+ return 0;
+
+fail:
+ vmdk_free_extents(bs);
+ return ret;
+}
+
+static int get_whole_cluster(BlockDriverState *bs,
+ VmdkExtent *extent,
+ uint64_t cluster_offset,
+ uint64_t offset,
+ bool allocate)
+{
+ /* 128 sectors * 512 bytes each = grain size 64KB */
+ uint8_t whole_grain[extent->cluster_sectors * 512];
+
+    /* We get here on the first write to a grain (cluster) that does not exist
+     * yet. Try to read its data from the parent image, if one exists. */
+ if (bs->backing_hd) {
+ int ret;
+
+ if (!vmdk_is_cid_valid(bs)) {
+ return -1;
+ }
+
+ /* floor offset to cluster */
+ offset -= offset % (extent->cluster_sectors * 512);
+ ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
+ extent->cluster_sectors);
+ if (ret < 0) {
+ return -1;
+ }
+
+ /* Write grain only into the active image */
+ ret = bdrv_write(extent->file, cluster_offset, whole_grain,
+ extent->cluster_sectors);
+ if (ret < 0) {
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
+{
+ /* update L2 table */
+ if (bdrv_pwrite_sync(
+ extent->file,
+ ((int64_t)m_data->l2_offset * 512)
+ + (m_data->l2_index * sizeof(m_data->offset)),
+ &(m_data->offset),
+ sizeof(m_data->offset)
+ ) < 0) {
+ return -1;
+ }
+ /* update backup L2 table */
+ if (extent->l1_backup_table_offset != 0) {
+ m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
+ if (bdrv_pwrite_sync(
+ extent->file,
+ ((int64_t)m_data->l2_offset * 512)
+ + (m_data->l2_index * sizeof(m_data->offset)),
+ &(m_data->offset), sizeof(m_data->offset)
+ ) < 0) {
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
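+/*
+ * Translate a guest offset into a cluster offset within the extent file.
+ * The L1 entry selects an L2 (grain) table, which is loaded through the small
+ * per-extent cache; the L2 entry then yields the cluster. If the cluster is
+ * unallocated and 'allocate' is set, a new cluster is appended to the extent
+ * file, filled from the backing file (if any) via get_whole_cluster(), and
+ * the pending table update is recorded in *m_data for vmdk_L2update().
+ * Returns 0 on success with *cluster_offset set in bytes, -1 otherwise.
+ */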
+static int get_cluster_offset(BlockDriverState *bs,
+ VmdkExtent *extent,
+ VmdkMetaData *m_data,
+ uint64_t offset,
+ int allocate,
+ uint64_t *cluster_offset)
+{
+ unsigned int l1_index, l2_offset, l2_index;
+ int min_index, i, j;
+ uint32_t min_count, *l2_table, tmp = 0;
+
+ if (m_data) {
+ m_data->valid = 0;
+ }
+ if (extent->flat) {
+ *cluster_offset = extent->flat_start_offset;
+ return 0;
+ }
+
+ offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
+ l1_index = (offset >> 9) / extent->l1_entry_sectors;
+ if (l1_index >= extent->l1_size) {
+ return -1;
+ }
+ l2_offset = extent->l1_table[l1_index];
+ if (!l2_offset) {
+ return -1;
+ }
+ for (i = 0; i < L2_CACHE_SIZE; i++) {
+ if (l2_offset == extent->l2_cache_offsets[i]) {
+ /* increment the hit count */
+ if (++extent->l2_cache_counts[i] == 0xffffffff) {
+ for (j = 0; j < L2_CACHE_SIZE; j++) {
+ extent->l2_cache_counts[j] >>= 1;
+ }
+ }
+ l2_table = extent->l2_cache + (i * extent->l2_size);
+ goto found;
+ }
+ }
+ /* not found: load a new entry in the least used one */
+ min_index = 0;
+ min_count = 0xffffffff;
+ for (i = 0; i < L2_CACHE_SIZE; i++) {
+ if (extent->l2_cache_counts[i] < min_count) {
+ min_count = extent->l2_cache_counts[i];
+ min_index = i;
+ }
+ }
+ l2_table = extent->l2_cache + (min_index * extent->l2_size);
+ if (bdrv_pread(
+ extent->file,
+ (int64_t)l2_offset * 512,
+ l2_table,
+ extent->l2_size * sizeof(uint32_t)
+ ) != extent->l2_size * sizeof(uint32_t)) {
+ return -1;
+ }
+
+ extent->l2_cache_offsets[min_index] = l2_offset;
+ extent->l2_cache_counts[min_index] = 1;
+ found:
+ l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
+ *cluster_offset = le32_to_cpu(l2_table[l2_index]);
+
+ if (!*cluster_offset) {
+ if (!allocate) {
+ return -1;
+ }
+
+        /* Avoid updating the L2 tables for images that have snapshots. */
+ *cluster_offset = bdrv_getlength(extent->file);
+ if (!extent->compressed) {
+ bdrv_truncate(
+ extent->file,
+ *cluster_offset + (extent->cluster_sectors << 9)
+ );
+ }
+
+ *cluster_offset >>= 9;
+ tmp = cpu_to_le32(*cluster_offset);
+ l2_table[l2_index] = tmp;
+
+        /* Write the grain itself first, to avoid a race condition that could
+         * corrupt the image. Such corruption can happen when the host disk
+         * runs out of space or the VM is shut down uncleanly.
+         */
+ if (get_whole_cluster(
+ bs, extent, *cluster_offset, offset, allocate) == -1) {
+ return -1;
+ }
+
+ if (m_data) {
+ m_data->offset = tmp;
+ m_data->l1_index = l1_index;
+ m_data->l2_index = l2_index;
+ m_data->l2_offset = l2_offset;
+ m_data->valid = 1;
+ }
+ }
+ *cluster_offset <<= 9;
+ return 0;
+}
+
+static VmdkExtent *find_extent(BDRVVmdkState *s,
+ int64_t sector_num, VmdkExtent *start_hint)
+{
+ VmdkExtent *extent = start_hint;
+
+ if (!extent) {
+ extent = &s->extents[0];
+ }
+ while (extent < &s->extents[s->num_extents]) {
+ if (sector_num < extent->end_sector) {
+ return extent;
+ }
+ extent++;
+ }
+ return NULL;
+}
+
+static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, int *pnum)
+{
+ BDRVVmdkState *s = bs->opaque;
+ int64_t index_in_cluster, n, ret;
+ uint64_t offset;
+ VmdkExtent *extent;
+
+ extent = find_extent(s, sector_num, NULL);
+ if (!extent) {
+ return 0;
+ }
+ qemu_co_mutex_lock(&s->lock);
+ ret = get_cluster_offset(bs, extent, NULL,
+ sector_num * 512, 0, &offset);
+ qemu_co_mutex_unlock(&s->lock);
+ /* get_cluster_offset returning 0 means success */
+ ret = !ret;
+
+ index_in_cluster = sector_num % extent->cluster_sectors;
+ n = extent->cluster_sectors - index_in_cluster;
+ if (n > nb_sectors) {
+ n = nb_sectors;
+ }
+ *pnum = n;
+ return ret;
+}
+
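+/*
+ * Write one grain (or part of one) to the extent file. For compressed
+ * (streamOptimized) extents the data is deflated and prefixed with a
+ * VmdkGrainMarker carrying the guest LBA and the compressed size; plain
+ * extents are written unmodified at cluster_offset + offset_in_cluster.
+ */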
+static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
+ int64_t offset_in_cluster, const uint8_t *buf,
+ int nb_sectors, int64_t sector_num)
+{
+ int ret;
+ VmdkGrainMarker *data = NULL;
+ uLongf buf_len;
+ const uint8_t *write_buf = buf;
+ int write_len = nb_sectors * 512;
+
+ if (extent->compressed) {
+ if (!extent->has_marker) {
+ ret = -EINVAL;
+ goto out;
+ }
+ buf_len = (extent->cluster_sectors << 9) * 2;
+ data = g_malloc(buf_len + sizeof(VmdkGrainMarker));
+ if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK ||
+ buf_len == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+ data->lba = sector_num;
+ data->size = buf_len;
+ write_buf = (uint8_t *)data;
+ write_len = buf_len + sizeof(VmdkGrainMarker);
+ }
+ ret = bdrv_pwrite(extent->file,
+ cluster_offset + offset_in_cluster,
+ write_buf,
+ write_len);
+ if (ret != write_len) {
+ ret = ret < 0 ? ret : -EIO;
+ goto out;
+ }
+ ret = 0;
+ out:
+ g_free(data);
+ return ret;
+}
+
+static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
+ int64_t offset_in_cluster, uint8_t *buf,
+ int nb_sectors)
+{
+ int ret;
+ int cluster_bytes, buf_bytes;
+ uint8_t *cluster_buf, *compressed_data;
+ uint8_t *uncomp_buf;
+ uint32_t data_len;
+ VmdkGrainMarker *marker;
+ uLongf buf_len;
+
+
+ if (!extent->compressed) {
+ ret = bdrv_pread(extent->file,
+ cluster_offset + offset_in_cluster,
+ buf, nb_sectors * 512);
+ if (ret == nb_sectors * 512) {
+ return 0;
+ } else {
+ return -EIO;
+ }
+ }
+ cluster_bytes = extent->cluster_sectors * 512;
+ /* Read two clusters in case GrainMarker + compressed data > one cluster */
+ buf_bytes = cluster_bytes * 2;
+ cluster_buf = g_malloc(buf_bytes);
+ uncomp_buf = g_malloc(cluster_bytes);
+ ret = bdrv_pread(extent->file,
+ cluster_offset,
+ cluster_buf, buf_bytes);
+ if (ret < 0) {
+ goto out;
+ }
+ compressed_data = cluster_buf;
+ buf_len = cluster_bytes;
+ data_len = cluster_bytes;
+ if (extent->has_marker) {
+ marker = (VmdkGrainMarker *)cluster_buf;
+ compressed_data = marker->data;
+ data_len = le32_to_cpu(marker->size);
+ }
+ if (!data_len || data_len > buf_bytes) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len);
+ if (ret != Z_OK) {
+ ret = -EINVAL;
+ goto out;
+
+ }
+ if (offset_in_cluster < 0 ||
+ offset_in_cluster + nb_sectors * 512 > buf_len) {
+ ret = -EINVAL;
+ goto out;
+ }
+ memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512);
+ ret = 0;
+
+ out:
+ g_free(uncomp_buf);
+ g_free(cluster_buf);
+ return ret;
+}
+
+static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ BDRVVmdkState *s = bs->opaque;
+ int ret;
+ uint64_t n, index_in_cluster;
+ VmdkExtent *extent = NULL;
+ uint64_t cluster_offset;
+
+ while (nb_sectors > 0) {
+ extent = find_extent(s, sector_num, extent);
+ if (!extent) {
+ return -EIO;
+ }
+ ret = get_cluster_offset(
+ bs, extent, NULL,
+ sector_num << 9, 0, &cluster_offset);
+ index_in_cluster = sector_num % extent->cluster_sectors;
+ n = extent->cluster_sectors - index_in_cluster;
+ if (n > nb_sectors) {
+ n = nb_sectors;
+ }
+ if (ret) {
+            /* if not allocated, try to read from the parent image, if one exists */
+ if (bs->backing_hd) {
+ if (!vmdk_is_cid_valid(bs)) {
+ return -EINVAL;
+ }
+ ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
+ if (ret < 0) {
+ return ret;
+ }
+ } else {
+ memset(buf, 0, 512 * n);
+ }
+ } else {
+ ret = vmdk_read_extent(extent,
+ cluster_offset, index_in_cluster * 512,
+ buf, n);
+ if (ret) {
+ return ret;
+ }
+ }
+ nb_sectors -= n;
+ sector_num += n;
+ buf += n * 512;
+ }
+ return 0;
+}
+
+static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ int ret;
+ BDRVVmdkState *s = bs->opaque;
+ qemu_co_mutex_lock(&s->lock);
+ ret = vmdk_read(bs, sector_num, buf, nb_sectors);
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+}
+
+static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ BDRVVmdkState *s = bs->opaque;
+ VmdkExtent *extent = NULL;
+ int n, ret;
+ int64_t index_in_cluster;
+ uint64_t cluster_offset;
+ VmdkMetaData m_data;
+
+ if (sector_num > bs->total_sectors) {
+ fprintf(stderr,
+ "(VMDK) Wrong offset: sector_num=0x%" PRIx64
+ " total_sectors=0x%" PRIx64 "\n",
+ sector_num, bs->total_sectors);
+ return -EIO;
+ }
+
+ while (nb_sectors > 0) {
+ extent = find_extent(s, sector_num, extent);
+ if (!extent) {
+ return -EIO;
+ }
+ ret = get_cluster_offset(
+ bs,
+ extent,
+ &m_data,
+ sector_num << 9, !extent->compressed,
+ &cluster_offset);
+ if (extent->compressed) {
+ if (ret == 0) {
+ /* Refuse write to allocated cluster for streamOptimized */
+ fprintf(stderr,
+ "VMDK: can't write to allocated cluster"
+ " for streamOptimized\n");
+ return -EIO;
+ } else {
+ /* allocate */
+ ret = get_cluster_offset(
+ bs,
+ extent,
+ &m_data,
+ sector_num << 9, 1,
+ &cluster_offset);
+ }
+ }
+ if (ret) {
+ return -EINVAL;
+ }
+ index_in_cluster = sector_num % extent->cluster_sectors;
+ n = extent->cluster_sectors - index_in_cluster;
+ if (n > nb_sectors) {
+ n = nb_sectors;
+ }
+
+ ret = vmdk_write_extent(extent,
+ cluster_offset, index_in_cluster * 512,
+ buf, n, sector_num);
+ if (ret) {
+ return ret;
+ }
+ if (m_data.valid) {
+ /* update L2 tables */
+ if (vmdk_L2update(extent, &m_data) == -1) {
+ return -EIO;
+ }
+ }
+ nb_sectors -= n;
+ sector_num += n;
+ buf += n * 512;
+
+        /* update the CID on the first write after each time the virtual
+         * disk is opened */
+ if (!s->cid_updated) {
+ ret = vmdk_write_cid(bs, time(NULL));
+ if (ret < 0) {
+ return ret;
+ }
+ s->cid_updated = true;
+ }
+ }
+ return 0;
+}
+
+static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ int ret;
+ BDRVVmdkState *s = bs->opaque;
+ qemu_co_mutex_lock(&s->lock);
+ ret = vmdk_write(bs, sector_num, buf, nb_sectors);
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+}
+
+
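+/*
+ * Create a single extent file. Flat extents are simply truncated to the
+ * requested size. Sparse extents get a VMDK4 header (64 KB grains, 512 grain
+ * table entries per table) followed by the redundant and the primary grain
+ * directory, each pre-filled with the offsets of its consecutive grain
+ * tables; data grains start at header.grain_offset.
+ */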
+static int vmdk_create_extent(const char *filename, int64_t filesize,
+ bool flat, bool compress)
+{
+ int ret, i;
+ int fd = 0;
+ VMDK4Header header;
+ uint32_t tmp, magic, grains, gd_size, gt_size, gt_count;
+
+ fd = qemu_open(filename,
+ O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
+ 0644);
+ if (fd < 0) {
+ return -errno;
+ }
+ if (flat) {
+ ret = ftruncate(fd, filesize);
+ if (ret < 0) {
+ ret = -errno;
+ }
+ goto exit;
+ }
+ magic = cpu_to_be32(VMDK4_MAGIC);
+ memset(&header, 0, sizeof(header));
+ header.version = 1;
+ header.flags =
+ 3 | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0);
+ header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
+ header.capacity = filesize / 512;
+ header.granularity = 128;
+ header.num_gtes_per_gte = 512;
+
+ grains = (filesize / 512 + header.granularity - 1) / header.granularity;
+ gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
+ gt_count =
+ (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte;
+ gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
+
+ header.desc_offset = 1;
+ header.desc_size = 20;
+ header.rgd_offset = header.desc_offset + header.desc_size;
+ header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count);
+ header.grain_offset =
+ ((header.gd_offset + gd_size + (gt_size * gt_count) +
+ header.granularity - 1) / header.granularity) *
+ header.granularity;
+ /* swap endianness for all header fields */
+ header.version = cpu_to_le32(header.version);
+ header.flags = cpu_to_le32(header.flags);
+ header.capacity = cpu_to_le64(header.capacity);
+ header.granularity = cpu_to_le64(header.granularity);
+ header.num_gtes_per_gte = cpu_to_le32(header.num_gtes_per_gte);
+ header.desc_offset = cpu_to_le64(header.desc_offset);
+ header.desc_size = cpu_to_le64(header.desc_size);
+ header.rgd_offset = cpu_to_le64(header.rgd_offset);
+ header.gd_offset = cpu_to_le64(header.gd_offset);
+ header.grain_offset = cpu_to_le64(header.grain_offset);
+ header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm);
+
+ header.check_bytes[0] = 0xa;
+ header.check_bytes[1] = 0x20;
+ header.check_bytes[2] = 0xd;
+ header.check_bytes[3] = 0xa;
+
+ /* write all the data */
+ ret = qemu_write_full(fd, &magic, sizeof(magic));
+ if (ret != sizeof(magic)) {
+ ret = -errno;
+ goto exit;
+ }
+ ret = qemu_write_full(fd, &header, sizeof(header));
+ if (ret != sizeof(header)) {
+ ret = -errno;
+ goto exit;
+ }
+
+ ret = ftruncate(fd, le64_to_cpu(header.grain_offset) << 9);
+ if (ret < 0) {
+ ret = -errno;
+ goto exit;
+ }
+
+ /* write grain directory */
+ lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET);
+ for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_size;
+ i < gt_count; i++, tmp += gt_size) {
+ ret = qemu_write_full(fd, &tmp, sizeof(tmp));
+ if (ret != sizeof(tmp)) {
+ ret = -errno;
+ goto exit;
+ }
+ }
+
+ /* write backup grain directory */
+ lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET);
+ for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_size;
+ i < gt_count; i++, tmp += gt_size) {
+ ret = qemu_write_full(fd, &tmp, sizeof(tmp));
+ if (ret != sizeof(tmp)) {
+ ret = -errno;
+ goto exit;
+ }
+ }
+
+ ret = 0;
+ exit:
+ qemu_close(fd);
+ return ret;
+}
+
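+/*
+ * Split a filename into its directory part ('path', including the trailing
+ * separator), base name ('prefix') and extension ('postfix', including the
+ * dot). Used below to derive the names of the extent files of split images.
+ */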
+static int filename_decompose(const char *filename, char *path, char *prefix,
+ char *postfix, size_t buf_len)
+{
+ const char *p, *q;
+
+ if (filename == NULL || !strlen(filename)) {
+ fprintf(stderr, "Vmdk: no filename provided.\n");
+ return -1;
+ }
+ p = strrchr(filename, '/');
+ if (p == NULL) {
+ p = strrchr(filename, '\\');
+ }
+ if (p == NULL) {
+ p = strrchr(filename, ':');
+ }
+ if (p != NULL) {
+ p++;
+ if (p - filename >= buf_len) {
+ return -1;
+ }
+ pstrcpy(path, p - filename + 1, filename);
+ } else {
+ p = filename;
+ path[0] = '\0';
+ }
+ q = strrchr(p, '.');
+ if (q == NULL) {
+ pstrcpy(prefix, buf_len, p);
+ postfix[0] = '\0';
+ } else {
+ if (q - p >= buf_len) {
+ return -1;
+ }
+ pstrcpy(prefix, q - p + 1, p);
+ pstrcpy(postfix, buf_len, q);
+ }
+ return 0;
+}
+
+static int relative_path(char *dest, int dest_size,
+ const char *base, const char *target)
+{
+ int i = 0;
+ int n = 0;
+ const char *p, *q;
+#ifdef _WIN32
+ const char *sep = "\\";
+#else
+ const char *sep = "/";
+#endif
+
+ if (!(dest && base && target)) {
+ return -1;
+ }
+ if (path_is_absolute(target)) {
+ dest[dest_size - 1] = '\0';
+ strncpy(dest, target, dest_size - 1);
+ return 0;
+ }
+ while (base[i] == target[i]) {
+ i++;
+ }
+ p = &base[i];
+ q = &target[i];
+ while (*p) {
+ if (*p == *sep) {
+ n++;
+ }
+ p++;
+ }
+ dest[0] = '\0';
+ for (; n; n--) {
+ pstrcat(dest, dest_size, "..");
+ pstrcat(dest, dest_size, sep);
+ }
+ pstrcat(dest, dest_size, q);
+ return 0;
+}
+
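+/*
+ * Create a VMDK image: parse the creation options, validate the subformat,
+ * create the extent file(s) (split at 2 GB for the twoGbMaxExtent* formats),
+ * and write the text descriptor, either as a standalone file or at offset
+ * 0x200 of the single sparse extent.
+ */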
+static int vmdk_create(const char *filename, QEMUOptionParameter *options)
+{
+ int fd, idx = 0;
+ char desc[BUF_SIZE];
+ int64_t total_size = 0, filesize;
+ const char *backing_file = NULL;
+ const char *fmt = NULL;
+ int flags = 0;
+ int ret = 0;
+ bool flat, split, compress;
+ char ext_desc_lines[BUF_SIZE] = "";
+ char path[PATH_MAX], prefix[PATH_MAX], postfix[PATH_MAX];
+ const int64_t split_size = 0x80000000; /* VMDK has constant split size */
+ const char *desc_extent_line;
+ char parent_desc_line[BUF_SIZE] = "";
+ uint32_t parent_cid = 0xffffffff;
+ const char desc_template[] =
+ "# Disk DescriptorFile\n"
+ "version=1\n"
+ "CID=%x\n"
+ "parentCID=%x\n"
+ "createType=\"%s\"\n"
+ "%s"
+ "\n"
+ "# Extent description\n"
+ "%s"
+ "\n"
+ "# The Disk Data Base\n"
+ "#DDB\n"
+ "\n"
+ "ddb.virtualHWVersion = \"%d\"\n"
+ "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
+ "ddb.geometry.heads = \"16\"\n"
+ "ddb.geometry.sectors = \"63\"\n"
+ "ddb.adapterType = \"ide\"\n";
+
+ if (filename_decompose(filename, path, prefix, postfix, PATH_MAX)) {
+ return -EINVAL;
+ }
+ /* Read out options */
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ total_size = options->value.n;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+ backing_file = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
+ flags |= options->value.n ? BLOCK_FLAG_COMPAT6 : 0;
+ } else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) {
+ fmt = options->value.s;
+ }
+ options++;
+ }
+ if (!fmt) {
+ /* Default format to monolithicSparse */
+ fmt = "monolithicSparse";
+ } else if (strcmp(fmt, "monolithicFlat") &&
+ strcmp(fmt, "monolithicSparse") &&
+ strcmp(fmt, "twoGbMaxExtentSparse") &&
+ strcmp(fmt, "twoGbMaxExtentFlat") &&
+ strcmp(fmt, "streamOptimized")) {
+ fprintf(stderr, "VMDK: Unknown subformat: %s\n", fmt);
+ return -EINVAL;
+ }
+ split = !(strcmp(fmt, "twoGbMaxExtentFlat") &&
+ strcmp(fmt, "twoGbMaxExtentSparse"));
+ flat = !(strcmp(fmt, "monolithicFlat") &&
+ strcmp(fmt, "twoGbMaxExtentFlat"));
+ compress = !strcmp(fmt, "streamOptimized");
+ if (flat) {
+ desc_extent_line = "RW %lld FLAT \"%s\" 0\n";
+ } else {
+ desc_extent_line = "RW %lld SPARSE \"%s\"\n";
+ }
+ if (flat && backing_file) {
+        /* backing files are not supported for flat images */
+ return -ENOTSUP;
+ }
+ if (backing_file) {
+ char parent_filename[PATH_MAX];
+ BlockDriverState *bs = bdrv_new("");
+ ret = bdrv_open(bs, backing_file, 0, NULL);
+ if (ret != 0) {
+ bdrv_delete(bs);
+ return ret;
+ }
+ if (strcmp(bs->drv->format_name, "vmdk")) {
+ bdrv_delete(bs);
+ return -EINVAL;
+ }
+ parent_cid = vmdk_read_cid(bs, 0);
+ bdrv_delete(bs);
+ relative_path(parent_filename, sizeof(parent_filename),
+ filename, backing_file);
+ snprintf(parent_desc_line, sizeof(parent_desc_line),
+ "parentFileNameHint=\"%s\"", parent_filename);
+ }
+
+ /* Create extents */
+ filesize = total_size;
+ while (filesize > 0) {
+ char desc_line[BUF_SIZE];
+ char ext_filename[PATH_MAX];
+ char desc_filename[PATH_MAX];
+ int64_t size = filesize;
+
+ if (split && size > split_size) {
+ size = split_size;
+ }
+ if (split) {
+ snprintf(desc_filename, sizeof(desc_filename), "%s-%c%03d%s",
+ prefix, flat ? 'f' : 's', ++idx, postfix);
+ } else if (flat) {
+ snprintf(desc_filename, sizeof(desc_filename), "%s-flat%s",
+ prefix, postfix);
+ } else {
+ snprintf(desc_filename, sizeof(desc_filename), "%s%s",
+ prefix, postfix);
+ }
+ snprintf(ext_filename, sizeof(ext_filename), "%s%s",
+ path, desc_filename);
+
+ if (vmdk_create_extent(ext_filename, size, flat, compress)) {
+ return -EINVAL;
+ }
+ filesize -= size;
+
+ /* Format description line */
+ snprintf(desc_line, sizeof(desc_line),
+ desc_extent_line, size / 512, desc_filename);
+ pstrcat(ext_desc_lines, sizeof(ext_desc_lines), desc_line);
+ }
+ /* generate descriptor file */
+ snprintf(desc, sizeof(desc), desc_template,
+ (unsigned int)time(NULL),
+ parent_cid,
+ fmt,
+ parent_desc_line,
+ ext_desc_lines,
+ (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
+ total_size / (int64_t)(63 * 16 * 512));
+ if (split || flat) {
+ fd = qemu_open(filename,
+ O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
+ 0644);
+ } else {
+ fd = qemu_open(filename,
+ O_WRONLY | O_BINARY | O_LARGEFILE,
+ 0644);
+ }
+ if (fd < 0) {
+ return -errno;
+ }
+    /* the descriptor is located at offset 0x200 */
+ if (!split && !flat && 0x200 != lseek(fd, 0x200, SEEK_SET)) {
+ ret = -errno;
+ goto exit;
+ }
+ ret = qemu_write_full(fd, desc, strlen(desc));
+ if (ret != strlen(desc)) {
+ ret = -errno;
+ goto exit;
+ }
+ ret = 0;
+exit:
+ qemu_close(fd);
+ return ret;
+}
+
+static void vmdk_close(BlockDriverState *bs)
+{
+ BDRVVmdkState *s = bs->opaque;
+
+ vmdk_free_extents(bs);
+
+ migrate_del_blocker(s->migration_blocker);
+ error_free(s->migration_blocker);
+}
+
+static coroutine_fn int vmdk_co_flush(BlockDriverState *bs)
+{
+ BDRVVmdkState *s = bs->opaque;
+ int i, err;
+ int ret = 0;
+
+ for (i = 0; i < s->num_extents; i++) {
+ err = bdrv_co_flush(s->extents[i].file);
+ if (err < 0) {
+ ret = err;
+ }
+ }
+ return ret;
+}
+
+static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs)
+{
+ int i;
+ int64_t ret = 0;
+ int64_t r;
+ BDRVVmdkState *s = bs->opaque;
+
+ ret = bdrv_get_allocated_file_size(bs->file);
+ if (ret < 0) {
+ return ret;
+ }
+ for (i = 0; i < s->num_extents; i++) {
+ if (s->extents[i].file == bs->file) {
+ continue;
+ }
+ r = bdrv_get_allocated_file_size(s->extents[i].file);
+ if (r < 0) {
+ return r;
+ }
+ ret += r;
+ }
+ return ret;
+}
+
+static QEMUOptionParameter vmdk_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ {
+ .name = BLOCK_OPT_BACKING_FILE,
+ .type = OPT_STRING,
+ .help = "File name of a base image"
+ },
+ {
+ .name = BLOCK_OPT_COMPAT6,
+ .type = OPT_FLAG,
+ .help = "VMDK version 6 image"
+ },
+ {
+ .name = BLOCK_OPT_SUBFMT,
+ .type = OPT_STRING,
+ .help =
+ "VMDK flat extent format, can be one of "
+ "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} "
+ },
+ { NULL }
+};
+
+static BlockDriver bdrv_vmdk = {
+ .format_name = "vmdk",
+ .instance_size = sizeof(BDRVVmdkState),
+ .bdrv_probe = vmdk_probe,
+ .bdrv_open = vmdk_open,
+ .bdrv_read = vmdk_co_read,
+ .bdrv_write = vmdk_co_write,
+ .bdrv_close = vmdk_close,
+ .bdrv_create = vmdk_create,
+ .bdrv_co_flush_to_disk = vmdk_co_flush,
+ .bdrv_co_is_allocated = vmdk_co_is_allocated,
+ .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
+
+ .create_options = vmdk_create_options,
+};
+
+static void bdrv_vmdk_init(void)
+{
+ bdrv_register(&bdrv_vmdk);
+}
+
+block_init(bdrv_vmdk_init);
diff --git a/block/vpc.c b/block/vpc.c
new file mode 100644
index 000000000..c0b82c4f5
--- /dev/null
+++ b/block/vpc.c
@@ -0,0 +1,799 @@
+/*
+ * Block driver for Connectix / Microsoft Virtual PC images
+ *
+ * Copyright (c) 2005 Alex Beregszaszi
+ * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include "migration.h"
+
+/**************************************************************/
+
+#define HEADER_SIZE 512
+
+//#define CACHE
+
+enum vhd_type {
+ VHD_FIXED = 2,
+ VHD_DYNAMIC = 3,
+ VHD_DIFFERENCING = 4,
+};
+
+// Seconds since Jan 1, 2000 0:00:00 (UTC)
+#define VHD_TIMESTAMP_BASE 946684800
+
+// always big-endian
+struct vhd_footer {
+ char creator[8]; // "conectix"
+ uint32_t features;
+ uint32_t version;
+
+ // Offset of next header structure, 0xFFFFFFFF if none
+ uint64_t data_offset;
+
+ // Seconds since Jan 1, 2000 0:00:00 (UTC)
+ uint32_t timestamp;
+
+ char creator_app[4]; // "vpc "
+ uint16_t major;
+ uint16_t minor;
+ char creator_os[4]; // "Wi2k"
+
+ uint64_t orig_size;
+ uint64_t size;
+
+ uint16_t cyls;
+ uint8_t heads;
+ uint8_t secs_per_cyl;
+
+ uint32_t type;
+
+ // Checksum of the Hard Disk Footer ("one's complement of the sum of all
+ // the bytes in the footer without the checksum field")
+ uint32_t checksum;
+
+ // UUID used to identify a parent hard disk (backing file)
+ uint8_t uuid[16];
+
+ uint8_t in_saved_state;
+};
+
+struct vhd_dyndisk_header {
+ char magic[8]; // "cxsparse"
+
+ // Offset of next header structure, 0xFFFFFFFF if none
+ uint64_t data_offset;
+
+ // Offset of the Block Allocation Table (BAT)
+ uint64_t table_offset;
+
+ uint32_t version;
+ uint32_t max_table_entries; // 32bit/entry
+
+ // 2 MB by default, must be a power of two
+ uint32_t block_size;
+
+ uint32_t checksum;
+ uint8_t parent_uuid[16];
+ uint32_t parent_timestamp;
+ uint32_t reserved;
+
+ // Backing file name (in UTF-16)
+ uint8_t parent_name[512];
+
+ struct {
+ uint32_t platform;
+ uint32_t data_space;
+ uint32_t data_length;
+ uint32_t reserved;
+ uint64_t data_offset;
+ } parent_locator[8];
+};
+
+typedef struct BDRVVPCState {
+ CoMutex lock;
+ uint8_t footer_buf[HEADER_SIZE];
+ uint64_t free_data_block_offset;
+ int max_table_entries;
+ uint32_t *pagetable;
+ uint64_t bat_offset;
+ uint64_t last_bitmap_offset;
+
+ uint32_t block_size;
+ uint32_t bitmap_size;
+
+#ifdef CACHE
+ uint8_t *pageentry_u8;
+ uint32_t *pageentry_u32;
+ uint16_t *pageentry_u16;
+
+ uint64_t last_bitmap;
+#endif
+
+ Error *migration_blocker;
+} BDRVVPCState;
+
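+/*
+ * VHD checksum: one's complement of the byte-wise sum over the structure.
+ * Callers zero the checksum field before computing or verifying it.
+ */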
+static uint32_t vpc_checksum(uint8_t* buf, size_t size)
+{
+ uint32_t res = 0;
+ int i;
+
+ for (i = 0; i < size; i++)
+ res += buf[i];
+
+ return ~res;
+}
+
+
+static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+ if (buf_size >= 8 && !strncmp((char *)buf, "conectix", 8))
+ return 100;
+ return 0;
+}
+
+static int vpc_open(BlockDriverState *bs, int flags)
+{
+ BDRVVPCState *s = bs->opaque;
+ int i;
+ struct vhd_footer* footer;
+ struct vhd_dyndisk_header* dyndisk_header;
+ uint8_t buf[HEADER_SIZE];
+ uint32_t checksum;
+ int err = -1;
+ int disk_type = VHD_DYNAMIC;
+
+ if (bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE) != HEADER_SIZE)
+ goto fail;
+
+ footer = (struct vhd_footer*) s->footer_buf;
+ if (strncmp(footer->creator, "conectix", 8)) {
+ int64_t offset = bdrv_getlength(bs->file);
+ if (offset < HEADER_SIZE) {
+ goto fail;
+ }
+ /* If a fixed disk, the footer is found only at the end of the file */
+ if (bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf, HEADER_SIZE)
+ != HEADER_SIZE) {
+ goto fail;
+ }
+ if (strncmp(footer->creator, "conectix", 8)) {
+ goto fail;
+ }
+ disk_type = VHD_FIXED;
+ }
+
+ checksum = be32_to_cpu(footer->checksum);
+ footer->checksum = 0;
+ if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum)
+ fprintf(stderr, "block-vpc: The header checksum of '%s' is "
+ "incorrect.\n", bs->filename);
+
+    /* Write 'checksum' back into the footer, or else it would remain zero. */
+ footer->checksum = be32_to_cpu(checksum);
+
+    // The visible size of an image in Virtual PC depends on the geometry
+    // rather than on the size stored in the footer (the size in the footer
+    // is usually too large)
+ bs->total_sectors = (int64_t)
+ be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
+
+ if (bs->total_sectors >= 65535 * 16 * 255) {
+ err = -EFBIG;
+ goto fail;
+ }
+
+ if (disk_type == VHD_DYNAMIC) {
+ if (bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf,
+ HEADER_SIZE) != HEADER_SIZE) {
+ goto fail;
+ }
+
+ dyndisk_header = (struct vhd_dyndisk_header *) buf;
+
+ if (strncmp(dyndisk_header->magic, "cxsparse", 8)) {
+ goto fail;
+ }
+
+ s->block_size = be32_to_cpu(dyndisk_header->block_size);
+ s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
+
+ s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
+ s->pagetable = g_malloc(s->max_table_entries * 4);
+
+ s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
+ if (bdrv_pread(bs->file, s->bat_offset, s->pagetable,
+ s->max_table_entries * 4) != s->max_table_entries * 4) {
+ goto fail;
+ }
+
+ s->free_data_block_offset =
+ (s->bat_offset + (s->max_table_entries * 4) + 511) & ~511;
+
+ for (i = 0; i < s->max_table_entries; i++) {
+ be32_to_cpus(&s->pagetable[i]);
+ if (s->pagetable[i] != 0xFFFFFFFF) {
+ int64_t next = (512 * (int64_t) s->pagetable[i]) +
+ s->bitmap_size + s->block_size;
+
+ if (next > s->free_data_block_offset) {
+ s->free_data_block_offset = next;
+ }
+ }
+ }
+
+ s->last_bitmap_offset = (int64_t) -1;
+
+#ifdef CACHE
+ s->pageentry_u8 = g_malloc(512);
+ s->pageentry_u32 = s->pageentry_u8;
+ s->pageentry_u16 = s->pageentry_u8;
+ s->last_pagetable = -1;
+#endif
+ }
+
+ qemu_co_mutex_init(&s->lock);
+
+ /* Disable migration when VHD images are used */
+ error_set(&s->migration_blocker,
+ QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
+ "vpc", bs->device_name, "live migration");
+ migrate_add_blocker(s->migration_blocker);
+
+ return 0;
+ fail:
+ return err;
+}
+
+/*
+ * Returns the absolute byte offset of the given sector in the image file.
+ * If the sector is not allocated, -1 is returned instead.
+ *
+ * The parameter write must be 1 if the offset will be used for a write
+ * operation (the block bitmap is then updated), 0 otherwise.
+ */
+static inline int64_t get_sector_offset(BlockDriverState *bs,
+ int64_t sector_num, int write)
+{
+ BDRVVPCState *s = bs->opaque;
+ uint64_t offset = sector_num * 512;
+ uint64_t bitmap_offset, block_offset;
+ uint32_t pagetable_index, pageentry_index;
+
+ pagetable_index = offset / s->block_size;
+ pageentry_index = (offset % s->block_size) / 512;
+
+ if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
+ return -1; // not allocated
+
+ bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
+ block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index);
+
+ // We must ensure that we don't write to any sectors which are marked as
+ // unused in the bitmap. We get away with setting all bits in the block
+ // bitmap each time we write to a new block. This might cause Virtual PC to
+    // miss the sparse read optimization, but it's not a problem in terms of
+ // correctness.
+ if (write && (s->last_bitmap_offset != bitmap_offset)) {
+ uint8_t bitmap[s->bitmap_size];
+
+ s->last_bitmap_offset = bitmap_offset;
+ memset(bitmap, 0xff, s->bitmap_size);
+ bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size);
+ }
+
+// printf("sector: %" PRIx64 ", index: %x, offset: %x, bioff: %" PRIx64 ", bloff: %" PRIx64 "\n",
+// sector_num, pagetable_index, pageentry_index,
+// bitmap_offset, block_offset);
+
+// disabled for a reason; kept for reference only
+#if 0
+#ifdef CACHE
+ if (bitmap_offset != s->last_bitmap)
+ {
+ lseek(s->fd, bitmap_offset, SEEK_SET);
+
+ s->last_bitmap = bitmap_offset;
+
+ // Scary! Bitmap is stored as big endian 32bit entries,
+ // while we used to look it up byte by byte
+ read(s->fd, s->pageentry_u8, 512);
+ for (i = 0; i < 128; i++)
+ be32_to_cpus(&s->pageentry_u32[i]);
+ }
+
+ if ((s->pageentry_u8[pageentry_index / 8] >> (pageentry_index % 8)) & 1)
+ return -1;
+#else
+ lseek(s->fd, bitmap_offset + (pageentry_index / 8), SEEK_SET);
+
+ read(s->fd, &bitmap_entry, 1);
+
+ if ((bitmap_entry >> (pageentry_index % 8)) & 1)
+ return -1; // not allocated
+#endif
+#endif
+
+ return block_offset;
+}
+
+/*
+ * Writes the footer to the end of the image file. This is needed when the
+ * file grows, since growing the file overwrites the old footer.
+ *
+ * Returns 0 on success and < 0 on error
+ */
+static int rewrite_footer(BlockDriverState* bs)
+{
+ int ret;
+ BDRVVPCState *s = bs->opaque;
+ int64_t offset = s->free_data_block_offset;
+
+ ret = bdrv_pwrite_sync(bs->file, offset, s->footer_buf, HEADER_SIZE);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+/*
+ * Allocates a new block. This involves writing a new footer and updating
+ * the Block Allocation Table to use the space at the old end of the image
+ * file (overwriting the old footer)
+ *
+ * Returns the sector's offset in the image file on success and < 0 on error
+ */
+static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num)
+{
+ BDRVVPCState *s = bs->opaque;
+ int64_t bat_offset;
+ uint32_t index, bat_value;
+ int ret;
+ uint8_t bitmap[s->bitmap_size];
+
+ // Check if sector_num is valid
+ if ((sector_num < 0) || (sector_num > bs->total_sectors))
+ return -1;
+
+ // Write entry into in-memory BAT
+ index = (sector_num * 512) / s->block_size;
+ if (s->pagetable[index] != 0xFFFFFFFF)
+ return -1;
+
+ s->pagetable[index] = s->free_data_block_offset / 512;
+
+ // Initialize the block's bitmap
+ memset(bitmap, 0xff, s->bitmap_size);
+ ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap,
+ s->bitmap_size);
+ if (ret < 0) {
+ return ret;
+ }
+
+ // Write new footer (the old one will be overwritten)
+ s->free_data_block_offset += s->block_size + s->bitmap_size;
+ ret = rewrite_footer(bs);
+ if (ret < 0)
+ goto fail;
+
+ // Write BAT entry to disk
+ bat_offset = s->bat_offset + (4 * index);
+ bat_value = be32_to_cpu(s->pagetable[index]);
+ ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4);
+ if (ret < 0)
+ goto fail;
+
+ return get_sector_offset(bs, sector_num, 0);
+
+fail:
+ s->free_data_block_offset -= (s->block_size + s->bitmap_size);
+ return -1;
+}
+
+static int vpc_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ BDRVVPCState *s = bs->opaque;
+ int ret;
+ int64_t offset;
+ int64_t sectors, sectors_per_block;
+ struct vhd_footer *footer = (struct vhd_footer *) s->footer_buf;
+
+ if (cpu_to_be32(footer->type) == VHD_FIXED) {
+ return bdrv_read(bs->file, sector_num, buf, nb_sectors);
+ }
+ while (nb_sectors > 0) {
+ offset = get_sector_offset(bs, sector_num, 0);
+
+ sectors_per_block = s->block_size >> BDRV_SECTOR_BITS;
+ sectors = sectors_per_block - (sector_num % sectors_per_block);
+ if (sectors > nb_sectors) {
+ sectors = nb_sectors;
+ }
+
+ if (offset == -1) {
+ memset(buf, 0, sectors * BDRV_SECTOR_SIZE);
+ } else {
+ ret = bdrv_pread(bs->file, offset, buf,
+ sectors * BDRV_SECTOR_SIZE);
+ if (ret != sectors * BDRV_SECTOR_SIZE) {
+ return -1;
+ }
+ }
+
+ nb_sectors -= sectors;
+ sector_num += sectors;
+ buf += sectors * BDRV_SECTOR_SIZE;
+ }
+ return 0;
+}
+
+static coroutine_fn int vpc_co_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ int ret;
+ BDRVVPCState *s = bs->opaque;
+ qemu_co_mutex_lock(&s->lock);
+ ret = vpc_read(bs, sector_num, buf, nb_sectors);
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+}
+
+static int vpc_write(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ BDRVVPCState *s = bs->opaque;
+ int64_t offset;
+ int64_t sectors, sectors_per_block;
+ int ret;
+ struct vhd_footer *footer = (struct vhd_footer *) s->footer_buf;
+
+ if (cpu_to_be32(footer->type) == VHD_FIXED) {
+ return bdrv_write(bs->file, sector_num, buf, nb_sectors);
+ }
+ while (nb_sectors > 0) {
+ offset = get_sector_offset(bs, sector_num, 1);
+
+ sectors_per_block = s->block_size >> BDRV_SECTOR_BITS;
+ sectors = sectors_per_block - (sector_num % sectors_per_block);
+ if (sectors > nb_sectors) {
+ sectors = nb_sectors;
+ }
+
+ if (offset == -1) {
+ offset = alloc_block(bs, sector_num);
+ if (offset < 0)
+ return -1;
+ }
+
+ ret = bdrv_pwrite(bs->file, offset, buf, sectors * BDRV_SECTOR_SIZE);
+ if (ret != sectors * BDRV_SECTOR_SIZE) {
+ return -1;
+ }
+
+ nb_sectors -= sectors;
+ sector_num += sectors;
+ buf += sectors * BDRV_SECTOR_SIZE;
+ }
+
+ return 0;
+}
+
+static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ int ret;
+ BDRVVPCState *s = bs->opaque;
+ qemu_co_mutex_lock(&s->lock);
+ ret = vpc_write(bs, sector_num, buf, nb_sectors);
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+}
+
+/*
+ * Calculates the number of cylinders, heads and sectors per cylinder
+ * based on a given number of sectors. This is the algorithm described
+ * in the VHD specification.
+ *
+ * Note that the geometry doesn't always exactly match total_sectors but
+ * may round it down.
+ *
+ * Returns 0 on success, -EFBIG if the size is larger than 127 GB
+ */
+static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
+ uint8_t* heads, uint8_t* secs_per_cyl)
+{
+ uint32_t cyls_times_heads;
+
+ if (total_sectors > 65535 * 16 * 255)
+ return -EFBIG;
+
+ if (total_sectors > 65535 * 16 * 63) {
+ *secs_per_cyl = 255;
+ *heads = 16;
+ cyls_times_heads = total_sectors / *secs_per_cyl;
+ } else {
+ *secs_per_cyl = 17;
+ cyls_times_heads = total_sectors / *secs_per_cyl;
+ *heads = (cyls_times_heads + 1023) / 1024;
+
+ if (*heads < 4)
+ *heads = 4;
+
+ if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
+ *secs_per_cyl = 31;
+ *heads = 16;
+ cyls_times_heads = total_sectors / *secs_per_cyl;
+ }
+
+ if (cyls_times_heads >= (*heads * 1024)) {
+ *secs_per_cyl = 63;
+ *heads = 16;
+ cyls_times_heads = total_sectors / *secs_per_cyl;
+ }
+ }
+
+ *cyls = cyls_times_heads / *heads;
+
+ return 0;
+}
+
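+/*
+ * Lay out a dynamic VHD: a copy of the footer at offset 0, the dynamic disk
+ * header at offset 512, the BAT (all entries 0xFFFFFFFF, i.e. unallocated)
+ * at offset 1536, and another copy of the footer right after the BAT at the
+ * end of the file. 'buf' holds the footer prepared by vpc_create().
+ */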
+static int create_dynamic_disk(int fd, uint8_t *buf, int64_t total_sectors)
+{
+ struct vhd_dyndisk_header* dyndisk_header =
+ (struct vhd_dyndisk_header*) buf;
+ size_t block_size, num_bat_entries;
+ int i;
+ int ret = -EIO;
+
+ // Write the footer (twice: at the beginning and at the end)
+ block_size = 0x200000;
+ num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
+
+ if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) {
+ goto fail;
+ }
+
+ if (lseek(fd, 1536 + ((num_bat_entries * 4 + 511) & ~511), SEEK_SET) < 0) {
+ goto fail;
+ }
+ if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) {
+ goto fail;
+ }
+
+ // Write the initial BAT
+ if (lseek(fd, 3 * 512, SEEK_SET) < 0) {
+ goto fail;
+ }
+
+ memset(buf, 0xFF, 512);
+ for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) {
+ if (write(fd, buf, 512) != 512) {
+ goto fail;
+ }
+ }
+
+ // Prepare the Dynamic Disk Header
+ memset(buf, 0, 1024);
+
+ memcpy(dyndisk_header->magic, "cxsparse", 8);
+
+ /*
+ * Note: The spec is actually wrong here for data_offset, it says
+ * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
+ */
+ dyndisk_header->data_offset = be64_to_cpu(0xFFFFFFFFFFFFFFFFULL);
+ dyndisk_header->table_offset = be64_to_cpu(3 * 512);
+ dyndisk_header->version = be32_to_cpu(0x00010000);
+ dyndisk_header->block_size = be32_to_cpu(block_size);
+ dyndisk_header->max_table_entries = be32_to_cpu(num_bat_entries);
+
+ dyndisk_header->checksum = be32_to_cpu(vpc_checksum(buf, 1024));
+
+ // Write the header
+ if (lseek(fd, 512, SEEK_SET) < 0) {
+ goto fail;
+ }
+
+ if (write(fd, buf, 1024) != 1024) {
+ goto fail;
+ }
+ ret = 0;
+
+ fail:
+ return ret;
+}
+
+static int create_fixed_disk(int fd, uint8_t *buf, int64_t total_size)
+{
+ int ret = -EIO;
+
+ /* Add footer to total size */
+ total_size += 512;
+ if (ftruncate(fd, total_size) != 0) {
+ ret = -errno;
+ goto fail;
+ }
+ if (lseek(fd, -512, SEEK_END) < 0) {
+ goto fail;
+ }
+ if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) {
+ goto fail;
+ }
+
+ ret = 0;
+
+ fail:
+ return ret;
+}
+
+static int vpc_create(const char *filename, QEMUOptionParameter *options)
+{
+ uint8_t buf[1024];
+ struct vhd_footer *footer = (struct vhd_footer *) buf;
+ QEMUOptionParameter *disk_type_param;
+ int fd, i;
+ uint16_t cyls = 0;
+ uint8_t heads = 0;
+ uint8_t secs_per_cyl = 0;
+ int64_t total_sectors;
+ int64_t total_size;
+ int disk_type;
+ int ret = -EIO;
+
+ /* Read out options */
+ total_size = get_option_parameter(options, BLOCK_OPT_SIZE)->value.n;
+
+ disk_type_param = get_option_parameter(options, BLOCK_OPT_SUBFMT);
+ if (disk_type_param && disk_type_param->value.s) {
+ if (!strcmp(disk_type_param->value.s, "dynamic")) {
+ disk_type = VHD_DYNAMIC;
+ } else if (!strcmp(disk_type_param->value.s, "fixed")) {
+ disk_type = VHD_FIXED;
+ } else {
+ return -EINVAL;
+ }
+ } else {
+ disk_type = VHD_DYNAMIC;
+ }
+
+ /* Create the file */
+ fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+ if (fd < 0) {
+ return -EIO;
+ }
+
+ /*
+ * Calculate matching total_size and geometry. Increase the number of
+ * sectors requested until we get enough (or fail). This ensures that
+ * qemu-img convert doesn't truncate images, but rather rounds up.
+ */
+ total_sectors = total_size / BDRV_SECTOR_SIZE;
+ for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
+ if (calculate_geometry(total_sectors + i, &cyls, &heads,
+ &secs_per_cyl))
+ {
+ ret = -EFBIG;
+ goto fail;
+ }
+ }
+
+ total_sectors = (int64_t) cyls * heads * secs_per_cyl;
+
+ /* Prepare the Hard Disk Footer */
+ memset(buf, 0, 1024);
+
+ memcpy(footer->creator, "conectix", 8);
+ /* TODO Check if "qemu" creator_app is ok for VPC */
+ memcpy(footer->creator_app, "qemu", 4);
+ memcpy(footer->creator_os, "Wi2k", 4);
+
+ footer->features = be32_to_cpu(0x02);
+ footer->version = be32_to_cpu(0x00010000);
+ if (disk_type == VHD_DYNAMIC) {
+ footer->data_offset = be64_to_cpu(HEADER_SIZE);
+ } else {
+ footer->data_offset = be64_to_cpu(0xFFFFFFFFFFFFFFFFULL);
+ }
+ footer->timestamp = be32_to_cpu(time(NULL) - VHD_TIMESTAMP_BASE);
+
+ /* Version of Virtual PC 2007 */
+ footer->major = be16_to_cpu(0x0005);
+ footer->minor = be16_to_cpu(0x0003);
+ if (disk_type == VHD_DYNAMIC) {
+ footer->orig_size = be64_to_cpu(total_sectors * 512);
+ footer->size = be64_to_cpu(total_sectors * 512);
+ } else {
+ footer->orig_size = be64_to_cpu(total_size);
+ footer->size = be64_to_cpu(total_size);
+ }
+ footer->cyls = be16_to_cpu(cyls);
+ footer->heads = heads;
+ footer->secs_per_cyl = secs_per_cyl;
+
+ footer->type = be32_to_cpu(disk_type);
+
+ /* TODO uuid is missing */
+
+ footer->checksum = be32_to_cpu(vpc_checksum(buf, HEADER_SIZE));
+
+ if (disk_type == VHD_DYNAMIC) {
+ ret = create_dynamic_disk(fd, buf, total_sectors);
+ } else {
+ ret = create_fixed_disk(fd, buf, total_size);
+ }
+
+ fail:
+ qemu_close(fd);
+ return ret;
+}
+
+static void vpc_close(BlockDriverState *bs)
+{
+ BDRVVPCState *s = bs->opaque;
+ g_free(s->pagetable);
+#ifdef CACHE
+ g_free(s->pageentry_u8);
+#endif
+
+ migrate_del_blocker(s->migration_blocker);
+ error_free(s->migration_blocker);
+}
+
+static QEMUOptionParameter vpc_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ {
+ .name = BLOCK_OPT_SUBFMT,
+ .type = OPT_STRING,
+ .help =
+ "Type of virtual hard disk format. Supported formats are "
+ "{dynamic (default) | fixed} "
+ },
+ { NULL }
+};
+
+static BlockDriver bdrv_vpc = {
+ .format_name = "vpc",
+ .instance_size = sizeof(BDRVVPCState),
+
+ .bdrv_probe = vpc_probe,
+ .bdrv_open = vpc_open,
+ .bdrv_close = vpc_close,
+ .bdrv_create = vpc_create,
+
+ .bdrv_read = vpc_co_read,
+ .bdrv_write = vpc_co_write,
+
+ .create_options = vpc_create_options,
+};
+
+static void bdrv_vpc_init(void)
+{
+ bdrv_register(&bdrv_vpc);
+}
+
+block_init(bdrv_vpc_init);
diff --git a/block/vvfat.c b/block/vvfat.c
new file mode 100644
index 000000000..59d3c5b8a
--- /dev/null
+++ b/block/vvfat.c
@@ -0,0 +1,2911 @@
+/* vim:set shiftwidth=4 ts=8: */
+/*
+ * QEMU Block driver for virtual VFAT (shadows a local directory)
+ *
+ * Copyright (c) 2004,2005 Johannes E. Schindelin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <sys/stat.h>
+#include <dirent.h>
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include "migration.h"
+
+#ifndef S_IWGRP
+#define S_IWGRP 0
+#endif
+#ifndef S_IWOTH
+#define S_IWOTH 0
+#endif
+
+/* TODO: add ":bootsector=blabla.img:" */
+/* LATER TODO: add automatic boot sector generation from
+ BOOTEASY.ASM and Ranish Partition Manager.
+ Note that DOS assumes that the system files are the first files in the
+ file system (test if the boot sector still relies on that fact)! */
+/* MAYBE TODO: write block-visofs.c */
+/* TODO: call try_commit() only after a timeout */
+
+/* #define DEBUG */
+
+#ifdef DEBUG
+
+#define DLOG(a) a
+
+#undef stderr
+#define stderr STDERR
+FILE* stderr = NULL;
+
+static void checkpoint(void);
+
+#ifdef __MINGW32__
+void nonono(const char* file, int line, const char* msg) {
+ fprintf(stderr, "Nonono! %s:%d %s\n", file, line, msg);
+ exit(-5);
+}
+#undef assert
+#define assert(a) do {if (!(a)) nonono(__FILE__, __LINE__, #a);}while(0)
+#endif
+
+#else
+
+#define DLOG(a)
+
+#endif
+
+/* dynamic array functions */
+typedef struct array_t {
+ char* pointer;
+ unsigned int size,next,item_size;
+} array_t;
+
+static inline void array_init(array_t* array,unsigned int item_size)
+{
+ array->pointer = NULL;
+ array->size=0;
+ array->next=0;
+ array->item_size=item_size;
+}
+
+static inline void array_free(array_t* array)
+{
+ g_free(array->pointer);
+ array->size=array->next=0;
+}
+
+/* does not automatically grow */
+static inline void* array_get(array_t* array,unsigned int index) {
+ assert(index < array->next);
+ return array->pointer + index * array->item_size;
+}
+
+static inline int array_ensure_allocated(array_t* array, int index)
+{
+ if((index + 1) * array->item_size > array->size) {
+ int new_size = (index + 32) * array->item_size;
+ array->pointer = g_realloc(array->pointer, new_size);
+ if (!array->pointer)
+ return -1;
+ array->size = new_size;
+ array->next = index + 1;
+ }
+
+ return 0;
+}
+
+static inline void* array_get_next(array_t* array) {
+ unsigned int next = array->next;
+ void* result;
+
+ if (array_ensure_allocated(array, next) < 0)
+ return NULL;
+
+ array->next = next + 1;
+ result = array_get(array, next);
+
+ return result;
+}
+
+static inline void* array_insert(array_t* array,unsigned int index,unsigned int count) {
+ if((array->next+count)*array->item_size>array->size) {
+ int increment=count*array->item_size;
+ array->pointer=g_realloc(array->pointer,array->size+increment);
+ if(!array->pointer)
+ return NULL;
+ array->size+=increment;
+ }
+ memmove(array->pointer+(index+count)*array->item_size,
+ array->pointer+index*array->item_size,
+ (array->next-index)*array->item_size);
+ array->next+=count;
+ return array->pointer+index*array->item_size;
+}
+
+/* this performs a "roll", so that the element which was at index_from becomes
+ * index_to, but the order of all other elements is preserved. */
+static inline int array_roll(array_t* array,int index_to,int index_from,int count)
+{
+ char* buf;
+ char* from;
+ char* to;
+ int is;
+
+ if(!array ||
+ index_to<0 || index_to>=array->next ||
+ index_from<0 || index_from>=array->next)
+ return -1;
+
+ if(index_to==index_from)
+ return 0;
+
+ is=array->item_size;
+ from=array->pointer+index_from*is;
+ to=array->pointer+index_to*is;
+ buf=g_malloc(is*count);
+ memcpy(buf,from,is*count);
+
+ if(index_to<index_from)
+ memmove(to+is*count,to,from-to);
+ else
+ memmove(from,from+is*count,to-from);
+
+ memcpy(to,buf,is*count);
+
+ g_free(buf);
+
+ return 0;
+}
+
+static inline int array_remove_slice(array_t* array,int index, int count)
+{
+ assert(index >=0);
+ assert(count > 0);
+ assert(index + count <= array->next);
+ if(array_roll(array,array->next-1,index,count))
+ return -1;
+ array->next -= count;
+ return 0;
+}
+
+static int array_remove(array_t* array,int index)
+{
+ return array_remove_slice(array, index, 1);
+}
+
+/* return the index for a given member */
+static int array_index(array_t* array, void* pointer)
+{
+ size_t offset = (char*)pointer - array->pointer;
+ assert((offset % array->item_size) == 0);
+ assert(offset/array->item_size < array->next);
+ return offset/array->item_size;
+}
+
+/* These structures are used to fake a disk and the VFAT filesystem.
+ * For this reason we need to use QEMU_PACKED. */
+
+typedef struct bootsector_t {
+ uint8_t jump[3];
+ uint8_t name[8];
+ uint16_t sector_size;
+ uint8_t sectors_per_cluster;
+ uint16_t reserved_sectors;
+ uint8_t number_of_fats;
+ uint16_t root_entries;
+ uint16_t total_sectors16;
+ uint8_t media_type;
+ uint16_t sectors_per_fat;
+ uint16_t sectors_per_track;
+ uint16_t number_of_heads;
+ uint32_t hidden_sectors;
+ uint32_t total_sectors;
+ union {
+ struct {
+ uint8_t drive_number;
+ uint8_t current_head;
+ uint8_t signature;
+ uint32_t id;
+ uint8_t volume_label[11];
+ } QEMU_PACKED fat16;
+ struct {
+ uint32_t sectors_per_fat;
+ uint16_t flags;
+ uint8_t major,minor;
+ uint32_t first_cluster_of_root_directory;
+ uint16_t info_sector;
+ uint16_t backup_boot_sector;
+ uint16_t ignored;
+ } QEMU_PACKED fat32;
+ } u;
+ uint8_t fat_type[8];
+ uint8_t ignored[0x1c0];
+ uint8_t magic[2];
+} QEMU_PACKED bootsector_t;
+
+typedef struct {
+ uint8_t head;
+ uint8_t sector;
+ uint8_t cylinder;
+} mbr_chs_t;
+
+typedef struct partition_t {
+ uint8_t attributes; /* 0x80 = bootable */
+ mbr_chs_t start_CHS;
+ uint8_t fs_type; /* 0x1 = FAT12, 0x6 = FAT16, 0xe = FAT16_LBA, 0xb = FAT32, 0xc = FAT32_LBA */
+ mbr_chs_t end_CHS;
+ uint32_t start_sector_long;
+ uint32_t length_sector_long;
+} QEMU_PACKED partition_t;
+
+typedef struct mbr_t {
+ uint8_t ignored[0x1b8];
+ uint32_t nt_id;
+ uint8_t ignored2[2];
+ partition_t partition[4];
+ uint8_t magic[2];
+} QEMU_PACKED mbr_t;
+
+typedef struct direntry_t {
+ uint8_t name[8];
+ uint8_t extension[3];
+ uint8_t attributes;
+ uint8_t reserved[2];
+ uint16_t ctime;
+ uint16_t cdate;
+ uint16_t adate;
+ uint16_t begin_hi;
+ uint16_t mtime;
+ uint16_t mdate;
+ uint16_t begin;
+ uint32_t size;
+} QEMU_PACKED direntry_t;
+
+/* this structure is used to transparently access the files */
+
+typedef struct mapping_t {
+ /* begin is the first cluster, end is the last+1 */
+ uint32_t begin,end;
+ /* as s->directory is growable, no pointer may be used here */
+ unsigned int dir_index;
+ /* the clusters of a file may be in any order; this points to the first */
+ int first_mapping_index;
+ union {
+ /* offset is
+ * - the offset in the file (in clusters) for a file, or
+ * - the next cluster of the directory for a directory, and
+ * - the address of the buffer for a faked entry
+ */
+ struct {
+ uint32_t offset;
+ } file;
+ struct {
+ int parent_mapping_index;
+ int first_dir_index;
+ } dir;
+ } info;
+ /* path contains the full path, i.e. it always starts with s->path */
+ char* path;
+
+ enum { MODE_UNDEFINED = 0, MODE_NORMAL = 1, MODE_MODIFIED = 2,
+ MODE_DIRECTORY = 4, MODE_FAKED = 8,
+ MODE_DELETED = 16, MODE_RENAMED = 32 } mode;
+ int read_only;
+} mapping_t;
+
+#ifdef DEBUG
+static void print_direntry(const struct direntry_t*);
+static void print_mapping(const struct mapping_t* mapping);
+#endif
+
+/* here begins the real VVFAT driver */
+
+typedef struct BDRVVVFATState {
+ CoMutex lock;
+ BlockDriverState* bs; /* pointer to parent */
+ unsigned int first_sectors_number; /* 1 for a single partition, 0x40 for a disk with partition table */
+ unsigned char first_sectors[0x40*0x200];
+
+ int fat_type; /* 16 or 32 */
+ array_t fat,directory,mapping;
+
+ unsigned int cluster_size;
+ unsigned int sectors_per_cluster;
+ unsigned int sectors_per_fat;
+ unsigned int sectors_of_root_directory;
+ uint32_t last_cluster_of_root_directory;
+ unsigned int faked_sectors; /* how many sectors are faked before file data */
+ uint32_t sector_count; /* total number of sectors of the partition */
+ uint32_t cluster_count; /* total number of clusters of this partition */
+ uint32_t max_fat_value;
+
+ int current_fd;
+ mapping_t* current_mapping;
+ unsigned char* cluster; /* points to current cluster */
+ unsigned char* cluster_buffer; /* points to a buffer to hold temp data */
+ unsigned int current_cluster;
+
+ /* write support */
+ BlockDriverState* write_target;
+ char* qcow_filename;
+ BlockDriverState* qcow;
+ void* fat2;
+ char* used_clusters;
+ array_t commits;
+ const char* path;
+ int downcase_short_names;
+
+ Error *migration_blocker;
+} BDRVVVFATState;
+
+/* take the sector position spos and convert it to Cylinder/Head/Sector position
+ * if the position is outside the specified geometry, fill maximum value for CHS
+ * and return 1 to signal overflow.
+ */
+static int sector2CHS(mbr_chs_t *chs, int spos, int cyls, int heads, int secs)
+{
+ int head,sector;
+ sector = spos % secs; spos /= secs;
+ head = spos % heads; spos /= heads;
+ if (spos >= cyls) {
+ /* Overflow: this happens when 32-bit sector positions are used while
+ CHS is only 24-bit. Windows/DOS is said to use 1023/255/63 as the
+ non-representable CHS value. */
+ chs->head = 0xFF;
+ chs->sector = 0xFF;
+ chs->cylinder = 0xFF;
+ return 1;
+ }
+ chs->head = (uint8_t)head;
+ chs->sector = (uint8_t)( (sector+1) | ((spos>>8)<<6) );
+ chs->cylinder = (uint8_t)spos;
+ return 0;
+}
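+
+/* The packed encoding above is easy to misread, so here is a worked example
+ * (illustrative only, based on the 1024/16/63 hard disk geometry used further
+ * down): the last sector 1032191 gives sector = 1032191 % 63 = 62,
+ * head = 16383 % 16 = 15 and spos = 1023; the encoded bytes are therefore
+ * head = 0x0f, sector = (62+1) | ((1023>>8)<<6) = 0xff and cylinder = 0xff,
+ * i.e. CHS 1023/15/63 with the two high cylinder bits folded into the
+ * sector byte. */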
+
+static void init_mbr(BDRVVVFATState *s, int cyls, int heads, int secs)
+{
+ /* TODO: if the files mbr.img and bootsect.img exist, use them */
+ mbr_t* real_mbr=(mbr_t*)s->first_sectors;
+ partition_t* partition = &(real_mbr->partition[0]);
+ int lba;
+
+ memset(s->first_sectors,0,512);
+
+ /* Win NT Disk Signature */
+ real_mbr->nt_id= cpu_to_le32(0xbe1afdfa);
+
+ partition->attributes=0x80; /* bootable */
+
+ /* LBA is used when partition is outside the CHS geometry */
+ lba = sector2CHS(&partition->start_CHS, s->first_sectors_number - 1,
+ cyls, heads, secs);
+ lba |= sector2CHS(&partition->end_CHS, s->bs->total_sectors - 1,
+ cyls, heads, secs);
+
+ /*LBA partitions are identified only by start/length_sector_long not by CHS*/
+ partition->start_sector_long = cpu_to_le32(s->first_sectors_number - 1);
+ partition->length_sector_long = cpu_to_le32(s->bs->total_sectors
+ - s->first_sectors_number + 1);
+
+ /* FAT12/FAT16/FAT32 */
+ /* DOS uses different types when partition is LBA,
+ probably to prevent older versions from using CHS on them */
+ partition->fs_type= s->fat_type==12 ? 0x1:
+ s->fat_type==16 ? (lba?0xe:0x06):
+ /*fat_type==32*/ (lba?0xc:0x0b);
+
+ real_mbr->magic[0]=0x55; real_mbr->magic[1]=0xaa;
+}
+
+/* direntry functions */
+
+/* dest is assumed to hold 258 bytes, and pads with 0xffff up to next multiple of 26 */
+static inline int short2long_name(char* dest,const char* src)
+{
+ int i;
+ int len;
+ for(i=0;i<129 && src[i];i++) {
+ dest[2*i]=src[i];
+ dest[2*i+1]=0;
+ }
+ len=2*i;
+ dest[2*i]=dest[2*i+1]=0;
+ for(i=2*i+2;(i%26);i++)
+ dest[i]=0xff;
+ return len;
+}
+
+static inline direntry_t* create_long_filename(BDRVVVFATState* s,const char* filename)
+{
+ char buffer[258];
+ int length=short2long_name(buffer,filename),
+ number_of_entries=(length+25)/26,i;
+ direntry_t* entry;
+
+ for(i=0;i<number_of_entries;i++) {
+ entry=array_get_next(&(s->directory));
+ entry->attributes=0xf;
+ entry->reserved[0]=0;
+ entry->begin=0;
+ entry->name[0]=(number_of_entries-i)|(i==0?0x40:0);
+ }
+ for(i=0;i<26*number_of_entries;i++) {
+ int offset=(i%26);
+ if(offset<10) offset=1+offset;
+ else if(offset<22) offset=14+offset-10;
+ else offset=28+offset-22;
+ entry=array_get(&(s->directory),s->directory.next-1-(i/26));
+ entry->name[offset]=buffer[i];
+ }
+ return array_get(&(s->directory),s->directory.next-number_of_entries);
+}
+
+static char is_free(const direntry_t* direntry)
+{
+ return direntry->name[0]==0xe5 || direntry->name[0]==0x00;
+}
+
+static char is_volume_label(const direntry_t* direntry)
+{
+ return direntry->attributes == 0x28;
+}
+
+static char is_long_name(const direntry_t* direntry)
+{
+ return direntry->attributes == 0xf;
+}
+
+static char is_short_name(const direntry_t* direntry)
+{
+ return !is_volume_label(direntry) && !is_long_name(direntry)
+ && !is_free(direntry);
+}
+
+static char is_directory(const direntry_t* direntry)
+{
+ return direntry->attributes & 0x10 && direntry->name[0] != 0xe5;
+}
+
+static inline char is_dot(const direntry_t* direntry)
+{
+ return is_short_name(direntry) && direntry->name[0] == '.';
+}
+
+static char is_file(const direntry_t* direntry)
+{
+ return is_short_name(direntry) && !is_directory(direntry);
+}
+
+static inline uint32_t begin_of_direntry(const direntry_t* direntry)
+{
+ return le16_to_cpu(direntry->begin)|(le16_to_cpu(direntry->begin_hi)<<16);
+}
+
+static inline uint32_t filesize_of_direntry(const direntry_t* direntry)
+{
+ return le32_to_cpu(direntry->size);
+}
+
+static void set_begin_of_direntry(direntry_t* direntry, uint32_t begin)
+{
+ direntry->begin = cpu_to_le16(begin & 0xffff);
+ direntry->begin_hi = cpu_to_le16((begin >> 16) & 0xffff);
+}
+
+/* fat functions */
+
+static inline uint8_t fat_chksum(const direntry_t* entry)
+{
+ uint8_t chksum=0;
+ int i;
+
+ for(i=0;i<11;i++) {
+ unsigned char c;
+
+ c = (i < 8) ? entry->name[i] : entry->extension[i-8];
+ chksum=(((chksum&0xfe)>>1)|((chksum&0x01)?0x80:0)) + c;
+ }
+
+ return chksum;
+}
+
+/* if return_time==0, this returns the fat_date, else the fat_time */
+static uint16_t fat_datetime(time_t time,int return_time) {
+ struct tm* t;
+#ifdef _WIN32
+ t=localtime(&time); /* this is not thread safe */
+#else
+ struct tm t1;
+ t = &t1;
+ localtime_r(&time,t);
+#endif
+ if(return_time)
+ return cpu_to_le16((t->tm_sec/2)|(t->tm_min<<5)|(t->tm_hour<<11));
+ return cpu_to_le16((t->tm_mday)|((t->tm_mon+1)<<5)|((t->tm_year-80)<<9));
+}
+
+static inline void fat_set(BDRVVVFATState* s,unsigned int cluster,uint32_t value)
+{
+ if(s->fat_type==32) {
+ uint32_t* entry=array_get(&(s->fat),cluster);
+ *entry=cpu_to_le32(value);
+ } else if(s->fat_type==16) {
+ uint16_t* entry=array_get(&(s->fat),cluster);
+ *entry=cpu_to_le16(value&0xffff);
+ } else {
+ int offset = (cluster*3/2);
+ unsigned char* p = array_get(&(s->fat), offset);
+ switch (cluster&1) {
+ case 0:
+ p[0] = value&0xff;
+ p[1] = (p[1]&0xf0) | ((value>>8)&0xf);
+ break;
+ case 1:
+ p[0] = (p[0]&0xf) | ((value&0xf)<<4);
+ p[1] = (value>>4);
+ break;
+ }
+ }
+}
+
+static inline uint32_t fat_get(BDRVVVFATState* s,unsigned int cluster)
+{
+ if(s->fat_type==32) {
+ uint32_t* entry=array_get(&(s->fat),cluster);
+ return le32_to_cpu(*entry);
+ } else if(s->fat_type==16) {
+ uint16_t* entry=array_get(&(s->fat),cluster);
+ return le16_to_cpu(*entry);
+ } else {
+ const uint8_t* x=(uint8_t*)(s->fat.pointer)+cluster*3/2;
+ return ((x[0]|(x[1]<<8))>>(cluster&1?4:0))&0x0fff;
+ }
+}
+
+static inline int fat_eof(BDRVVVFATState* s,uint32_t fat_entry)
+{
+ if(fat_entry>s->max_fat_value-8)
+ return -1;
+ return 0;
+}
+
+static inline void init_fat(BDRVVVFATState* s)
+{
+ if (s->fat_type == 12) {
+ array_init(&(s->fat),1);
+ array_ensure_allocated(&(s->fat),
+ s->sectors_per_fat * 0x200 * 3 / 2 - 1);
+ } else {
+ array_init(&(s->fat),(s->fat_type==32?4:2));
+ array_ensure_allocated(&(s->fat),
+ s->sectors_per_fat * 0x200 / s->fat.item_size - 1);
+ }
+ memset(s->fat.pointer,0,s->fat.size);
+
+ switch(s->fat_type) {
+ case 12: s->max_fat_value=0xfff; break;
+ case 16: s->max_fat_value=0xffff; break;
+ case 32: s->max_fat_value=0x0fffffff; break;
+ default: s->max_fat_value=0; /* error... */
+ }
+
+}
+
+/* TODO: in create_short_filename, 0xe5->0x05 is not yet handled! */
+/* TODO: in parse_short_filename, 0x05->0xe5 is not yet handled! */
+static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s,
+ unsigned int directory_start, const char* filename, int is_dot)
+{
+ int i,j,long_index=s->directory.next;
+ direntry_t* entry = NULL;
+ direntry_t* entry_long = NULL;
+
+ if(is_dot) {
+ entry=array_get_next(&(s->directory));
+ memset(entry->name,0x20,11);
+ memcpy(entry->name,filename,strlen(filename));
+ return entry;
+ }
+
+ entry_long=create_long_filename(s,filename);
+
+ i = strlen(filename);
+ for(j = i - 1; j>0 && filename[j]!='.';j--);
+ if (j > 0)
+ i = (j > 8 ? 8 : j);
+ else if (i > 8)
+ i = 8;
+
+ entry=array_get_next(&(s->directory));
+ memset(entry->name,0x20,11);
+ memcpy(entry->name, filename, i);
+
+ if(j > 0)
+ for (i = 0; i < 3 && filename[j+1+i]; i++)
+ entry->extension[i] = filename[j+1+i];
+
+ /* upcase & remove unwanted characters */
+ for(i=10;i>=0;i--) {
+ if(i==10 || i==7) for(;i>0 && entry->name[i]==' ';i--);
+ if(entry->name[i]<=' ' || entry->name[i]>0x7f
+ || strchr(".*?<>|\":/\\[];,+='",entry->name[i]))
+ entry->name[i]='_';
+ else if(entry->name[i]>='a' && entry->name[i]<='z')
+ entry->name[i]+='A'-'a';
+ }
+
+ /* mangle duplicates */
+ while(1) {
+ direntry_t* entry1=array_get(&(s->directory),directory_start);
+ int j;
+
+ for(;entry1<entry;entry1++)
+ if(!is_long_name(entry1) && !memcmp(entry1->name,entry->name,11))
+ break; /* found dupe */
+ if(entry1==entry) /* no dupe found */
+ break;
+
+ /* use all 8 characters of name */
+ if(entry->name[7]==' ') {
+ int j;
+ for(j=6;j>0 && entry->name[j]==' ';j--)
+ entry->name[j]='~';
+ }
+
+ /* increment number */
+ for(j=7;j>0 && entry->name[j]=='9';j--)
+ entry->name[j]='0';
+ if(j>0) {
+ if(entry->name[j]<'0' || entry->name[j]>'9')
+ entry->name[j]='0';
+ else
+ entry->name[j]++;
+ }
+ }
+
+ /* calculate checksum; propagate to long name */
+ if(entry_long) {
+ uint8_t chksum=fat_chksum(entry);
+
+ /* calculate anew, because realloc could have taken place */
+ entry_long=array_get(&(s->directory),long_index);
+ while(entry_long<entry && is_long_name(entry_long)) {
+ entry_long->reserved[1]=chksum;
+ entry_long++;
+ }
+ }
+
+ return entry;
+}
+
+/*
+ * Read a directory. (the index of the corresponding mapping must be passed).
+ */
+static int read_directory(BDRVVVFATState* s, int mapping_index)
+{
+ mapping_t* mapping = array_get(&(s->mapping), mapping_index);
+ direntry_t* direntry;
+ const char* dirname = mapping->path;
+ int first_cluster = mapping->begin;
+ int parent_index = mapping->info.dir.parent_mapping_index;
+ mapping_t* parent_mapping = (mapping_t*)
+ (parent_index >= 0 ? array_get(&(s->mapping), parent_index) : NULL);
+ int first_cluster_of_parent = parent_mapping ? parent_mapping->begin : -1;
+
+ DIR* dir=opendir(dirname);
+ struct dirent* entry;
+ int i;
+
+ assert(mapping->mode & MODE_DIRECTORY);
+
+ if(!dir) {
+ mapping->end = mapping->begin;
+ return -1;
+ }
+
+ i = mapping->info.dir.first_dir_index =
+ first_cluster == 0 ? 0 : s->directory.next;
+
+ /* actually read the directory, and allocate the mappings */
+ while((entry=readdir(dir))) {
+ unsigned int length=strlen(dirname)+2+strlen(entry->d_name);
+ char* buffer;
+ direntry_t* direntry;
+ struct stat st;
+ int is_dot=!strcmp(entry->d_name,".");
+ int is_dotdot=!strcmp(entry->d_name,"..");
+
+ if(first_cluster == 0 && (is_dotdot || is_dot))
+ continue;
+
+ buffer=(char*)g_malloc(length);
+ snprintf(buffer,length,"%s/%s",dirname,entry->d_name);
+
+ if(stat(buffer,&st)<0) {
+ g_free(buffer);
+ continue;
+ }
+
+ /* create directory entry for this file */
+ direntry=create_short_and_long_name(s, i, entry->d_name,
+ is_dot || is_dotdot);
+ direntry->attributes=(S_ISDIR(st.st_mode)?0x10:0x20);
+ direntry->reserved[0]=direntry->reserved[1]=0;
+ direntry->ctime=fat_datetime(st.st_ctime,1);
+ direntry->cdate=fat_datetime(st.st_ctime,0);
+ direntry->adate=fat_datetime(st.st_atime,0);
+ direntry->begin_hi=0;
+ direntry->mtime=fat_datetime(st.st_mtime,1);
+ direntry->mdate=fat_datetime(st.st_mtime,0);
+ if(is_dotdot)
+ set_begin_of_direntry(direntry, first_cluster_of_parent);
+ else if(is_dot)
+ set_begin_of_direntry(direntry, first_cluster);
+ else
+ direntry->begin=0; /* do that later */
+ if (st.st_size > 0x7fffffff) {
+ fprintf(stderr, "File %s is larger than 2GB\n", buffer);
+ g_free(buffer);
+ closedir(dir);
+ return -2;
+ }
+ direntry->size=cpu_to_le32(S_ISDIR(st.st_mode)?0:st.st_size);
+
+ /* create mapping for this file */
+ if(!is_dot && !is_dotdot && (S_ISDIR(st.st_mode) || st.st_size)) {
+ s->current_mapping=(mapping_t*)array_get_next(&(s->mapping));
+ s->current_mapping->begin=0;
+ s->current_mapping->end=st.st_size;
+ /*
+ * dir_index points to the most recently added direntry, which
+ * contains the short name and all the relevant information.
+ */
+ s->current_mapping->dir_index=s->directory.next-1;
+ s->current_mapping->first_mapping_index = -1;
+ if (S_ISDIR(st.st_mode)) {
+ s->current_mapping->mode = MODE_DIRECTORY;
+ s->current_mapping->info.dir.parent_mapping_index =
+ mapping_index;
+ } else {
+ s->current_mapping->mode = MODE_UNDEFINED;
+ s->current_mapping->info.file.offset = 0;
+ }
+ s->current_mapping->path=buffer;
+ s->current_mapping->read_only =
+ (st.st_mode & (S_IWUSR | S_IWGRP | S_IWOTH)) == 0;
+ }
+ }
+ closedir(dir);
+
+ /* fill with zeroes up to the end of the cluster */
+ while(s->directory.next%(0x10*s->sectors_per_cluster)) {
+ direntry_t* direntry=array_get_next(&(s->directory));
+ memset(direntry,0,sizeof(direntry_t));
+ }
+
+/* TODO: if there are more entries, bootsector has to be adjusted! */
+#define ROOT_ENTRIES (0x02 * 0x10 * s->sectors_per_cluster)
+ if (mapping_index == 0 && s->directory.next < ROOT_ENTRIES) {
+ /* root directory */
+ int cur = s->directory.next;
+ array_ensure_allocated(&(s->directory), ROOT_ENTRIES - 1);
+ s->directory.next = ROOT_ENTRIES;
+ memset(array_get(&(s->directory), cur), 0,
+ (ROOT_ENTRIES - cur) * sizeof(direntry_t));
+ }
+
+ /* reget the mapping, since s->mapping was possibly realloc()ed */
+ mapping = (mapping_t*)array_get(&(s->mapping), mapping_index);
+ first_cluster += (s->directory.next - mapping->info.dir.first_dir_index)
+ * 0x20 / s->cluster_size;
+ mapping->end = first_cluster;
+
+ direntry = (direntry_t*)array_get(&(s->directory), mapping->dir_index);
+ set_begin_of_direntry(direntry, mapping->begin);
+
+ return 0;
+}
+
+static inline uint32_t sector2cluster(BDRVVVFATState* s,off_t sector_num)
+{
+ return (sector_num-s->faked_sectors)/s->sectors_per_cluster;
+}
+
+static inline off_t cluster2sector(BDRVVVFATState* s, uint32_t cluster_num)
+{
+ return s->faked_sectors + s->sectors_per_cluster * cluster_num;
+}
+
+static int init_directories(BDRVVVFATState* s,
+ const char *dirname, int heads, int secs)
+{
+ bootsector_t* bootsector;
+ mapping_t* mapping;
+ unsigned int i;
+ unsigned int cluster;
+
+ memset(&(s->first_sectors[0]),0,0x40*0x200);
+
+ s->cluster_size=s->sectors_per_cluster*0x200;
+ s->cluster_buffer=g_malloc(s->cluster_size);
+
+ /*
+ * The formula: sc = spf+1+spf*spc*(512*8/fat_type),
+ * where sc is sector_count,
+ * spf is sectors_per_fat,
+ * spc is sectors_per_cluster, and
+ * fat_type = 12, 16 or 32.
+ */
+ i = 1+s->sectors_per_cluster*0x200*8/s->fat_type;
+ s->sectors_per_fat=(s->sector_count+i)/i; /* round up */
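+ /*
+ * Worked example (illustrative only): with the default FAT16 parameters set
+ * up in vvfat_open() (1024/16/63 geometry, 0x10 sectors per cluster),
+ * sector_count is 1032129, i = 1 + 16*512*8/16 = 4097, and sectors_per_fat
+ * comes out as (1032129 + 4097) / 4097 = 252.
+ */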
+
+ array_init(&(s->mapping),sizeof(mapping_t));
+ array_init(&(s->directory),sizeof(direntry_t));
+
+ /* add volume label */
+ {
+ direntry_t* entry=array_get_next(&(s->directory));
+ entry->attributes=0x28; /* archive | volume label */
+ memcpy(entry->name,"QEMU VVF",8);
+ memcpy(entry->extension,"AT ",3);
+ }
+
+ /* Now build FAT, and write back information into directory */
+ init_fat(s);
+
+ s->faked_sectors=s->first_sectors_number+s->sectors_per_fat*2;
+ s->cluster_count=sector2cluster(s, s->sector_count);
+
+ mapping = array_get_next(&(s->mapping));
+ mapping->begin = 0;
+ mapping->dir_index = 0;
+ mapping->info.dir.parent_mapping_index = -1;
+ mapping->first_mapping_index = -1;
+ mapping->path = g_strdup(dirname);
+ i = strlen(mapping->path);
+ if (i > 0 && mapping->path[i - 1] == '/')
+ mapping->path[i - 1] = '\0';
+ mapping->mode = MODE_DIRECTORY;
+ mapping->read_only = 0;
+ s->path = mapping->path;
+
+ for (i = 0, cluster = 0; i < s->mapping.next; i++) {
+ /* MS-DOS expects the FAT to be 0 for the root directory
+ * (except for the media byte). */
+ /* LATER TODO: still true for FAT32? */
+ int fix_fat = (i != 0);
+ mapping = array_get(&(s->mapping), i);
+
+ if (mapping->mode & MODE_DIRECTORY) {
+ mapping->begin = cluster;
+ if(read_directory(s, i)) {
+ fprintf(stderr, "Could not read directory %s\n",
+ mapping->path);
+ return -1;
+ }
+ mapping = array_get(&(s->mapping), i);
+ } else {
+ assert(mapping->mode == MODE_UNDEFINED);
+ mapping->mode=MODE_NORMAL;
+ mapping->begin = cluster;
+ if (mapping->end > 0) {
+ direntry_t* direntry = array_get(&(s->directory),
+ mapping->dir_index);
+
+ mapping->end = cluster + 1 + (mapping->end-1)/s->cluster_size;
+ set_begin_of_direntry(direntry, mapping->begin);
+ } else {
+ mapping->end = cluster + 1;
+ fix_fat = 0;
+ }
+ }
+
+ assert(mapping->begin < mapping->end);
+
+ /* next free cluster */
+ cluster = mapping->end;
+
+ if(cluster > s->cluster_count) {
+ fprintf(stderr,"Directory does not fit in FAT%d (capacity %.2f MB)\n",
+ s->fat_type, s->sector_count / 2000.0);
+ return -EINVAL;
+ }
+
+ /* fix fat for entry */
+ if (fix_fat) {
+ int j;
+ for(j = mapping->begin; j < mapping->end - 1; j++)
+ fat_set(s, j, j+1);
+ fat_set(s, mapping->end - 1, s->max_fat_value);
+ }
+ }
+
+ mapping = array_get(&(s->mapping), 0);
+ s->sectors_of_root_directory = mapping->end * s->sectors_per_cluster;
+ s->last_cluster_of_root_directory = mapping->end;
+
+ /* the FAT signature */
+ fat_set(s,0,s->max_fat_value);
+ fat_set(s,1,s->max_fat_value);
+
+ s->current_mapping = NULL;
+
+ bootsector=(bootsector_t*)(s->first_sectors+(s->first_sectors_number-1)*0x200);
+ bootsector->jump[0]=0xeb;
+ bootsector->jump[1]=0x3e;
+ bootsector->jump[2]=0x90;
+ memcpy(bootsector->name,"QEMU ",8);
+ bootsector->sector_size=cpu_to_le16(0x200);
+ bootsector->sectors_per_cluster=s->sectors_per_cluster;
+ bootsector->reserved_sectors=cpu_to_le16(1);
+ bootsector->number_of_fats=0x2; /* number of FATs */
+ bootsector->root_entries=cpu_to_le16(s->sectors_of_root_directory*0x10);
+ bootsector->total_sectors16=s->sector_count>0xffff?0:cpu_to_le16(s->sector_count);
+ bootsector->media_type=(s->first_sectors_number>1?0xf8:0xf0); /* media descriptor (f8=hd, f0=3.5 fd)*/
+ s->fat.pointer[0] = bootsector->media_type;
+ bootsector->sectors_per_fat=cpu_to_le16(s->sectors_per_fat);
+ bootsector->sectors_per_track = cpu_to_le16(secs);
+ bootsector->number_of_heads = cpu_to_le16(heads);
+ bootsector->hidden_sectors=cpu_to_le32(s->first_sectors_number==1?0:0x3f);
+ bootsector->total_sectors=cpu_to_le32(s->sector_count>0xffff?s->sector_count:0);
+
+ /* LATER TODO: if FAT32, this is wrong */
+ bootsector->u.fat16.drive_number=s->first_sectors_number==1?0:0x80; /* fda=0, hda=0x80 */
+ bootsector->u.fat16.current_head=0;
+ bootsector->u.fat16.signature=0x29;
+ bootsector->u.fat16.id=cpu_to_le32(0xfabe1afd);
+
+ memcpy(bootsector->u.fat16.volume_label,"QEMU VVFAT ",11);
+ memcpy(bootsector->fat_type,(s->fat_type==12?"FAT12 ":s->fat_type==16?"FAT16 ":"FAT32 "),8);
+ bootsector->magic[0]=0x55; bootsector->magic[1]=0xaa;
+
+ return 0;
+}
+
+#ifdef DEBUG
+static BDRVVVFATState *vvv = NULL;
+#endif
+
+static int enable_write_target(BDRVVVFATState *s);
+static int is_consistent(BDRVVVFATState *s);
+
+static void vvfat_rebind(BlockDriverState *bs)
+{
+ BDRVVVFATState *s = bs->opaque;
+ s->bs = bs;
+}
+
+static int vvfat_open(BlockDriverState *bs, const char* dirname, int flags)
+{
+ BDRVVVFATState *s = bs->opaque;
+ int i, cyls, heads, secs;
+
+#ifdef DEBUG
+ vvv = s;
+#endif
+
+DLOG(if (stderr == NULL) {
+ stderr = fopen("vvfat.log", "a");
+ setbuf(stderr, NULL);
+})
+
+ s->bs = bs;
+
+ /* LATER TODO: if FAT32, adjust */
+ s->sectors_per_cluster=0x10;
+
+ s->current_cluster=0xffffffff;
+
+ s->first_sectors_number=0x40;
+ /* read only is the default for safety */
+ bs->read_only = 1;
+ s->qcow = s->write_target = NULL;
+ s->qcow_filename = NULL;
+ s->fat2 = NULL;
+ s->downcase_short_names = 1;
+
+ if (!strstart(dirname, "fat:", NULL))
+ return -1;
+
+ if (strstr(dirname, ":32:")) {
+ fprintf(stderr, "Big fat greek warning: FAT32 has not been tested. You are welcome to do so!\n");
+ s->fat_type = 32;
+ } else if (strstr(dirname, ":16:")) {
+ s->fat_type = 16;
+ } else if (strstr(dirname, ":12:")) {
+ s->fat_type = 12;
+ }
+
+ if (strstr(dirname, ":floppy:")) {
+ /* 1.44MB or 2.88MB floppy. 2.88MB can be FAT12 (default) or FAT16. */
+ if (!s->fat_type) {
+ s->fat_type = 12;
+ secs = 36;
+ s->sectors_per_cluster=2;
+ } else {
+ secs = s->fat_type == 12 ? 18 : 36;
+ s->sectors_per_cluster=1;
+ }
+ s->first_sectors_number = 1;
+ cyls = 80;
+ heads = 2;
+ } else {
+ /* 32MB or 504MB disk*/
+ if (!s->fat_type) {
+ s->fat_type = 16;
+ }
+ cyls = s->fat_type == 12 ? 64 : 1024;
+ heads = 16;
+ secs = 63;
+ }
+ fprintf(stderr, "vvfat %s chs %d,%d,%d\n",
+ dirname, cyls, heads, secs);
+
+ s->sector_count = cyls * heads * secs - (s->first_sectors_number - 1);
+
+ if (strstr(dirname, ":rw:")) {
+ if (enable_write_target(s))
+ return -1;
+ bs->read_only = 0;
+ }
+
+ i = strrchr(dirname, ':') - dirname;
+ assert(i >= 3);
+ if (dirname[i-2] == ':' && qemu_isalpha(dirname[i-1]))
+ /* workaround for DOS drive names */
+ dirname += i-1;
+ else
+ dirname += i+1;
+
+ bs->total_sectors = cyls * heads * secs;
+
+ if (init_directories(s, dirname, heads, secs)) {
+ return -1;
+ }
+
+ s->sector_count = s->faked_sectors + s->sectors_per_cluster*s->cluster_count;
+
+ if (s->first_sectors_number == 0x40) {
+ init_mbr(s, cyls, heads, secs);
+ }
+
+ // assert(is_consistent(s));
+ qemu_co_mutex_init(&s->lock);
+
+ /* Disable migration when vvfat is used rw */
+ if (s->qcow) {
+ error_set(&s->migration_blocker,
+ QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
+ "vvfat (rw)", bs->device_name, "live migration");
+ migrate_add_blocker(s->migration_blocker);
+ }
+
+ return 0;
+}
+
+static inline void vvfat_close_current_file(BDRVVVFATState *s)
+{
+ if(s->current_mapping) {
+ s->current_mapping = NULL;
+ if (s->current_fd) {
+ qemu_close(s->current_fd);
+ s->current_fd = 0;
+ }
+ }
+ s->current_cluster = -1;
+}
+
+/* mappings between index1 and index2-1 are supposed to be ordered;
+ * the return value is the index of the last mapping for which end>cluster_num
+ */
+static inline int find_mapping_for_cluster_aux(BDRVVVFATState* s,int cluster_num,int index1,int index2)
+{
+ while(1) {
+ int index3;
+ mapping_t* mapping;
+ index3=(index1+index2)/2;
+ mapping=array_get(&(s->mapping),index3);
+ assert(mapping->begin < mapping->end);
+ if(mapping->begin>=cluster_num) {
+ assert(index2!=index3 || index2==0);
+ if(index2==index3)
+ return index1;
+ index2=index3;
+ } else {
+ if(index1==index3)
+ return mapping->end<=cluster_num ? index2 : index1;
+ index1=index3;
+ }
+ assert(index1<=index2);
+ DLOG(mapping=array_get(&(s->mapping),index1);
+ assert(mapping->begin<=cluster_num);
+ assert(index2 >= s->mapping.next ||
+ ((mapping = array_get(&(s->mapping),index2)) &&
+ mapping->end>cluster_num)));
+ }
+}
+
+static inline mapping_t* find_mapping_for_cluster(BDRVVVFATState* s,int cluster_num)
+{
+ int index=find_mapping_for_cluster_aux(s,cluster_num,0,s->mapping.next);
+ mapping_t* mapping;
+ if(index>=s->mapping.next)
+ return NULL;
+ mapping=array_get(&(s->mapping),index);
+ if(mapping->begin>cluster_num)
+ return NULL;
+ assert(mapping->begin<=cluster_num && mapping->end>cluster_num);
+ return mapping;
+}
+
+static int open_file(BDRVVVFATState* s,mapping_t* mapping)
+{
+ if(!mapping)
+ return -1;
+ if(!s->current_mapping ||
+ strcmp(s->current_mapping->path,mapping->path)) {
+ /* open file */
+ int fd = qemu_open(mapping->path, O_RDONLY | O_BINARY | O_LARGEFILE);
+ if(fd<0)
+ return -1;
+ vvfat_close_current_file(s);
+ s->current_fd = fd;
+ s->current_mapping = mapping;
+ }
+ return 0;
+}
+
+static inline int read_cluster(BDRVVVFATState *s,int cluster_num)
+{
+ if(s->current_cluster != cluster_num) {
+ int result=0;
+ off_t offset;
+ assert(!s->current_mapping || s->current_fd || (s->current_mapping->mode & MODE_DIRECTORY));
+ if(!s->current_mapping
+ || s->current_mapping->begin>cluster_num
+ || s->current_mapping->end<=cluster_num) {
+ /* binary search of mappings for file */
+ mapping_t* mapping=find_mapping_for_cluster(s,cluster_num);
+
+ assert(!mapping || (cluster_num>=mapping->begin && cluster_num<mapping->end));
+
+ if (mapping && mapping->mode & MODE_DIRECTORY) {
+ vvfat_close_current_file(s);
+ s->current_mapping = mapping;
+read_cluster_directory:
+ offset = s->cluster_size*(cluster_num-s->current_mapping->begin);
+ s->cluster = (unsigned char*)s->directory.pointer+offset
+ + 0x20*s->current_mapping->info.dir.first_dir_index;
+ assert(((s->cluster-(unsigned char*)s->directory.pointer)%s->cluster_size)==0);
+ assert((char*)s->cluster+s->cluster_size <= s->directory.pointer+s->directory.next*s->directory.item_size);
+ s->current_cluster = cluster_num;
+ return 0;
+ }
+
+ if(open_file(s,mapping))
+ return -2;
+ } else if (s->current_mapping->mode & MODE_DIRECTORY)
+ goto read_cluster_directory;
+
+ assert(s->current_fd);
+
+ offset=s->cluster_size*(cluster_num-s->current_mapping->begin)+s->current_mapping->info.file.offset;
+ if(lseek(s->current_fd, offset, SEEK_SET)!=offset)
+ return -3;
+ s->cluster=s->cluster_buffer;
+ result=read(s->current_fd,s->cluster,s->cluster_size);
+ if(result<0) {
+ s->current_cluster = -1;
+ return -1;
+ }
+ s->current_cluster = cluster_num;
+ }
+ return 0;
+}
+
+#ifdef DEBUG
+static void print_direntry(const direntry_t* direntry)
+{
+ int j = 0;
+ char buffer[1024];
+
+ fprintf(stderr, "direntry %p: ", direntry);
+ if(!direntry)
+ return;
+ if(is_long_name(direntry)) {
+ unsigned char* c=(unsigned char*)direntry;
+ int i;
+ for(i=1;i<11 && c[i] && c[i]!=0xff;i+=2)
+#define ADD_CHAR(c) {buffer[j] = (c); if (buffer[j] < ' ') buffer[j] = 0xb0; j++;}
+ ADD_CHAR(c[i]);
+ for(i=14;i<26 && c[i] && c[i]!=0xff;i+=2)
+ ADD_CHAR(c[i]);
+ for(i=28;i<32 && c[i] && c[i]!=0xff;i+=2)
+ ADD_CHAR(c[i]);
+ buffer[j] = 0;
+ fprintf(stderr, "%s\n", buffer);
+ } else {
+ int i;
+ for(i=0;i<11;i++)
+ ADD_CHAR(direntry->name[i]);
+ buffer[j] = 0;
+ fprintf(stderr,"%s attributes=0x%02x begin=%d size=%d\n",
+ buffer,
+ direntry->attributes,
+ begin_of_direntry(direntry),le32_to_cpu(direntry->size));
+ }
+}
+
+static void print_mapping(const mapping_t* mapping)
+{
+ fprintf(stderr, "mapping (%p): begin, end = %d, %d, dir_index = %d, "
+ "first_mapping_index = %d, name = %s, mode = 0x%x, " ,
+ mapping, mapping->begin, mapping->end, mapping->dir_index,
+ mapping->first_mapping_index, mapping->path, mapping->mode);
+
+ if (mapping->mode & MODE_DIRECTORY)
+ fprintf(stderr, "parent_mapping_index = %d, first_dir_index = %d\n", mapping->info.dir.parent_mapping_index, mapping->info.dir.first_dir_index);
+ else
+ fprintf(stderr, "offset = %d\n", mapping->info.file.offset);
+}
+#endif
+
+static int vvfat_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ BDRVVVFATState *s = bs->opaque;
+ int i;
+
+ for(i=0;i<nb_sectors;i++,sector_num++) {
+ if (sector_num >= bs->total_sectors)
+ return -1;
+ if (s->qcow) {
+ int n;
+ if (bdrv_is_allocated(s->qcow, sector_num, nb_sectors-i, &n)) {
+DLOG(fprintf(stderr, "sectors %d+%d allocated\n", (int)sector_num, n));
+ if (bdrv_read(s->qcow, sector_num, buf + i*0x200, n)) {
+ return -1;
+ }
+ i += n - 1;
+ sector_num += n - 1;
+ continue;
+ }
+DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num));
+ }
+ if(sector_num<s->faked_sectors) {
+ if(sector_num<s->first_sectors_number)
+ memcpy(buf+i*0x200,&(s->first_sectors[sector_num*0x200]),0x200);
+ else if(sector_num-s->first_sectors_number<s->sectors_per_fat)
+ memcpy(buf+i*0x200,&(s->fat.pointer[(sector_num-s->first_sectors_number)*0x200]),0x200);
+ else if(sector_num-s->first_sectors_number-s->sectors_per_fat<s->sectors_per_fat)
+ memcpy(buf+i*0x200,&(s->fat.pointer[(sector_num-s->first_sectors_number-s->sectors_per_fat)*0x200]),0x200);
+ } else {
+ uint32_t sector=sector_num-s->faked_sectors,
+ sector_offset_in_cluster=(sector%s->sectors_per_cluster),
+ cluster_num=sector/s->sectors_per_cluster;
+ if(cluster_num > s->cluster_count || read_cluster(s, cluster_num) != 0) {
+ /* LATER TODO: strict: return -1; */
+ memset(buf+i*0x200,0,0x200);
+ continue;
+ }
+ memcpy(buf+i*0x200,s->cluster+sector_offset_in_cluster*0x200,0x200);
+ }
+ }
+ return 0;
+}
+
+static coroutine_fn int vvfat_co_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ int ret;
+ BDRVVVFATState *s = bs->opaque;
+ qemu_co_mutex_lock(&s->lock);
+ ret = vvfat_read(bs, sector_num, buf, nb_sectors);
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+}
+
+/* LATER TODO: statify all functions */
+
+/*
+ * Idea of the write support (use snapshot):
+ *
+ * 1. check if all data is consistent, recording renames, modifications,
+ * new files and directories (in s->commits).
+ *
+ * 2. if the data is not consistent, stop committing
+ *
+ * 3. handle renames, and create new files and directories (do not yet
+ * write their contents)
+ *
+ * 4. walk the directories, fixing the mapping and direntries, and marking
+ * the handled mappings as not deleted
+ *
+ * 5. commit the contents of the files
+ *
+ * 6. handle deleted files and directories
+ *
+ */
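+
+/*
+ * As an illustrative sketch of how the steps above map onto the helpers
+ * defined below (not a normative description): step 1 records detected
+ * changes via schedule_rename(), schedule_writeout(), schedule_new_file()
+ * and schedule_mkdir(), each of which appends a commit_t entry to
+ * s->commits; the later steps then replay those entries against the host
+ * directory tree.
+ */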
+
+typedef struct commit_t {
+ char* path;
+ union {
+ struct { uint32_t cluster; } rename;
+ struct { int dir_index; uint32_t modified_offset; } writeout;
+ struct { uint32_t first_cluster; } new_file;
+ struct { uint32_t cluster; } mkdir;
+ } param;
+ /* DELETEs and RMDIRs are handled differently: see handle_deletes() */
+ enum {
+ ACTION_RENAME, ACTION_WRITEOUT, ACTION_NEW_FILE, ACTION_MKDIR
+ } action;
+} commit_t;
+
+static void clear_commits(BDRVVVFATState* s)
+{
+ int i;
+DLOG(fprintf(stderr, "clear_commits (%d commits)\n", s->commits.next));
+ for (i = 0; i < s->commits.next; i++) {
+ commit_t* commit = array_get(&(s->commits), i);
+ assert(commit->path || commit->action == ACTION_WRITEOUT);
+ if (commit->action != ACTION_WRITEOUT) {
+ assert(commit->path);
+ g_free(commit->path);
+ } else
+ assert(commit->path == NULL);
+ }
+ s->commits.next = 0;
+}
+
+static void schedule_rename(BDRVVVFATState* s,
+ uint32_t cluster, char* new_path)
+{
+ commit_t* commit = array_get_next(&(s->commits));
+ commit->path = new_path;
+ commit->param.rename.cluster = cluster;
+ commit->action = ACTION_RENAME;
+}
+
+static void schedule_writeout(BDRVVVFATState* s,
+ int dir_index, uint32_t modified_offset)
+{
+ commit_t* commit = array_get_next(&(s->commits));
+ commit->path = NULL;
+ commit->param.writeout.dir_index = dir_index;
+ commit->param.writeout.modified_offset = modified_offset;
+ commit->action = ACTION_WRITEOUT;
+}
+
+static void schedule_new_file(BDRVVVFATState* s,
+ char* path, uint32_t first_cluster)
+{
+ commit_t* commit = array_get_next(&(s->commits));
+ commit->path = path;
+ commit->param.new_file.first_cluster = first_cluster;
+ commit->action = ACTION_NEW_FILE;
+}
+
+static void schedule_mkdir(BDRVVVFATState* s, uint32_t cluster, char* path)
+{
+ commit_t* commit = array_get_next(&(s->commits));
+ commit->path = path;
+ commit->param.mkdir.cluster = cluster;
+ commit->action = ACTION_MKDIR;
+}
+
+typedef struct {
+ /*
+ * Since the sequence number is at most 0x3f, and the filename
+ * length is at most 13 times the sequence number, the maximal
+ * filename length is 0x3f * 13 bytes.
+ */
+ unsigned char name[0x3f * 13 + 1];
+ int checksum, len;
+ int sequence_number;
+} long_file_name;
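+
+/* Worked size check (illustrative): 0x3f = 63 possible entries of 13
+ * characters each give at most 63 * 13 = 819 characters, plus one byte in
+ * name[] for the terminating NUL. */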
+
+static void lfn_init(long_file_name* lfn)
+{
+ lfn->sequence_number = lfn->len = 0;
+ lfn->checksum = 0x100;
+}
+
+/* return 0 if parsed successfully, > 0 if no long name, < 0 if error */
+static int parse_long_name(long_file_name* lfn,
+ const direntry_t* direntry)
+{
+ int i, j, offset;
+ const unsigned char* pointer = (const unsigned char*)direntry;
+
+ if (!is_long_name(direntry))
+ return 1;
+
+ if (pointer[0] & 0x40) {
+ lfn->sequence_number = pointer[0] & 0x3f;
+ lfn->checksum = pointer[13];
+ lfn->name[0] = 0;
+ lfn->name[lfn->sequence_number * 13] = 0;
+ } else if ((pointer[0] & 0x3f) != --lfn->sequence_number)
+ return -1;
+ else if (pointer[13] != lfn->checksum)
+ return -2;
+ else if (pointer[12] || pointer[26] || pointer[27])
+ return -3;
+
+ offset = 13 * (lfn->sequence_number - 1);
+ for (i = 0, j = 1; i < 13; i++, j+=2) {
+ if (j == 11)
+ j = 14;
+ else if (j == 26)
+ j = 28;
+
+ if (pointer[j+1] == 0)
+ lfn->name[offset + i] = pointer[j];
+ else if (pointer[j+1] != 0xff || (pointer[0] & 0x40) == 0)
+ return -4;
+ else
+ lfn->name[offset + i] = 0;
+ }
+
+ if (pointer[0] & 0x40)
+ lfn->len = offset + strlen((char*)lfn->name + offset);
+
+ return 0;
+}
+
+/* returns 0 if successful, >0 if no short_name, and <0 on error */
+static int parse_short_name(BDRVVVFATState* s,
+ long_file_name* lfn, direntry_t* direntry)
+{
+ int i, j;
+
+ if (!is_short_name(direntry))
+ return 1;
+
+ for (j = 7; j >= 0 && direntry->name[j] == ' '; j--);
+ for (i = 0; i <= j; i++) {
+ if (direntry->name[i] <= ' ' || direntry->name[i] > 0x7f)
+ return -1;
+ else if (s->downcase_short_names)
+ lfn->name[i] = qemu_tolower(direntry->name[i]);
+ else
+ lfn->name[i] = direntry->name[i];
+ }
+
+ for (j = 2; j >= 0 && direntry->extension[j] == ' '; j--);
+ if (j >= 0) {
+ lfn->name[i++] = '.';
+ lfn->name[i + j + 1] = '\0';
+ for (;j >= 0; j--) {
+ if (direntry->extension[j] <= ' ' || direntry->extension[j] > 0x7f)
+ return -2;
+ else if (s->downcase_short_names)
+ lfn->name[i + j] = qemu_tolower(direntry->extension[j]);
+ else
+ lfn->name[i + j] = direntry->extension[j];
+ }
+ } else
+ lfn->name[i + j + 1] = '\0';
+
+ lfn->len = strlen((char*)lfn->name);
+
+ return 0;
+}
+
+static inline uint32_t modified_fat_get(BDRVVVFATState* s,
+ unsigned int cluster)
+{
+ if (cluster < s->last_cluster_of_root_directory) {
+ if (cluster + 1 == s->last_cluster_of_root_directory)
+ return s->max_fat_value;
+ else
+ return cluster + 1;
+ }
+
+ if (s->fat_type==32) {
+ uint32_t* entry=((uint32_t*)s->fat2)+cluster;
+ return le32_to_cpu(*entry);
+ } else if (s->fat_type==16) {
+ uint16_t* entry=((uint16_t*)s->fat2)+cluster;
+ return le16_to_cpu(*entry);
+ } else {
+ const uint8_t* x=s->fat2+cluster*3/2;
+ return ((x[0]|(x[1]<<8))>>(cluster&1?4:0))&0x0fff;
+ }
+}
+
+static inline int cluster_was_modified(BDRVVVFATState* s, uint32_t cluster_num)
+{
+ int was_modified = 0;
+ int i, dummy;
+
+ if (s->qcow == NULL)
+ return 0;
+
+ for (i = 0; !was_modified && i < s->sectors_per_cluster; i++)
+ was_modified = bdrv_is_allocated(s->qcow,
+ cluster2sector(s, cluster_num) + i, 1, &dummy);
+
+ return was_modified;
+}
+
+static const char* get_basename(const char* path)
+{
+ char* basename = strrchr(path, '/');
+ if (basename == NULL)
+ return path;
+ else
+ return basename + 1; /* strip '/' */
+}
+
+/*
+ * The array s->used_clusters holds the state of each cluster. If a cluster
+ * is part of a file, bit 2 is set; if it is part of a directory, bit 1 is
+ * set. If it was modified, bit 3 is set.
+ * If any cluster is allocated, but not part of a file or directory, this
+ * driver refuses to commit.
+ */
+typedef enum {
+ USED_DIRECTORY = 1, USED_FILE = 2, USED_ANY = 3, USED_ALLOCATED = 4
+} used_t;
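+
+/* Illustration (a sketch of how the flags are combined): used_clusters[i] &
+ * USED_ANY tests whether cluster i belongs to some file or directory, while
+ * a value of exactly USED_ALLOCATED marks a cluster that was modified but is
+ * not referenced by any direntry; is_consistent() below refuses to commit in
+ * that case. */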
+
+/*
+ * get_cluster_count_for_direntry() not only determines how many clusters
+ * are occupied by direntry, but also whether it was renamed or modified.
+ *
+ * A file is thought to be renamed *only* if there already was a file with
+ * exactly the same first cluster, but a different name.
+ *
+ * Further, the files/directories handled by this function are
+ * assumed to be *not* deleted (and *only* those).
+ */
+static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s,
+ direntry_t* direntry, const char* path)
+{
+ /*
+ * This is a little bit tricky:
+ * IF the guest OS just inserts a cluster into the file chain,
+ * and leaves the rest alone, (i.e. the original file had clusters
+ * 15 -> 16, but now has 15 -> 32 -> 16), then the following happens:
+ *
+ * - do_commit will write the cluster into the file at the given
+ * offset, but
+ *
+ * - the cluster which is overwritten should be moved to a later
+ * position in the file.
+ *
+ * I am not aware of any OS that does something this braindead, but this
+ * situation could happen anyway when not committing for a long time.
+ * Just to be sure that this does not bite us, detect it, and copy the
+ * contents of the clusters to-be-overwritten into the qcow.
+ */
+ int copy_it = 0;
+ int was_modified = 0;
+ int32_t ret = 0;
+
+ uint32_t cluster_num = begin_of_direntry(direntry);
+ uint32_t offset = 0;
+ int first_mapping_index = -1;
+ mapping_t* mapping = NULL;
+ const char* basename2 = NULL;
+
+ vvfat_close_current_file(s);
+
+ /* the root directory */
+ if (cluster_num == 0)
+ return 0;
+
+ /* write support */
+ if (s->qcow) {
+ basename2 = get_basename(path);
+
+ mapping = find_mapping_for_cluster(s, cluster_num);
+
+ if (mapping) {
+ const char* basename;
+
+ assert(mapping->mode & MODE_DELETED);
+ mapping->mode &= ~MODE_DELETED;
+
+ basename = get_basename(mapping->path);
+
+ assert(mapping->mode & MODE_NORMAL);
+
+ /* rename */
+ if (strcmp(basename, basename2))
+ schedule_rename(s, cluster_num, g_strdup(path));
+ } else if (is_file(direntry))
+ /* new file */
+ schedule_new_file(s, g_strdup(path), cluster_num);
+ else {
+ abort();
+ return 0;
+ }
+ }
+
+ while(1) {
+ if (s->qcow) {
+ if (!copy_it && cluster_was_modified(s, cluster_num)) {
+ if (mapping == NULL ||
+ mapping->begin > cluster_num ||
+ mapping->end <= cluster_num)
+ mapping = find_mapping_for_cluster(s, cluster_num);
+
+
+ if (mapping &&
+ (mapping->mode & MODE_DIRECTORY) == 0) {
+
+ /* was modified in qcow */
+ if (offset != mapping->info.file.offset + s->cluster_size
+ * (cluster_num - mapping->begin)) {
+ /* offset of this cluster in file chain has changed */
+ abort();
+ copy_it = 1;
+ } else if (offset == 0) {
+ const char* basename = get_basename(mapping->path);
+
+ if (strcmp(basename, basename2))
+ copy_it = 1;
+ first_mapping_index = array_index(&(s->mapping), mapping);
+ }
+
+ if (mapping->first_mapping_index != first_mapping_index
+ && mapping->info.file.offset > 0) {
+ abort();
+ copy_it = 1;
+ }
+
+ /* need to write out? */
+ if (!was_modified && is_file(direntry)) {
+ was_modified = 1;
+ schedule_writeout(s, mapping->dir_index, offset);
+ }
+ }
+ }
+
+ if (copy_it) {
+ int i, dummy;
+ /*
+ * This is horribly inefficient, but that is okay, since
+ * it is rarely executed, if at all.
+ */
+ int64_t offset = cluster2sector(s, cluster_num);
+
+ vvfat_close_current_file(s);
+ for (i = 0; i < s->sectors_per_cluster; i++) {
+ if (!bdrv_is_allocated(s->qcow, offset + i, 1, &dummy)) {
+ if (vvfat_read(s->bs, offset, s->cluster_buffer, 1)) {
+ return -1;
+ }
+ if (bdrv_write(s->qcow, offset, s->cluster_buffer, 1)) {
+ return -2;
+ }
+ }
+ }
+ }
+ }
+
+ ret++;
+ if (s->used_clusters[cluster_num] & USED_ANY)
+ return 0;
+ s->used_clusters[cluster_num] = USED_FILE;
+
+ cluster_num = modified_fat_get(s, cluster_num);
+
+ if (fat_eof(s, cluster_num))
+ return ret;
+ else if (cluster_num < 2 || cluster_num > s->max_fat_value - 16)
+ return -1;
+
+ offset += s->cluster_size;
+ }
+}
+
+/*
+ * This function looks at the modified data (qcow).
+ * It returns 0 upon inconsistency or error, and otherwise the number of
+ * clusters used by the directory, its subdirectories and their files.
+ */
+static int check_directory_consistency(BDRVVVFATState *s,
+ int cluster_num, const char* path)
+{
+ int ret = 0;
+ unsigned char* cluster = g_malloc(s->cluster_size);
+ direntry_t* direntries = (direntry_t*)cluster;
+ mapping_t* mapping = find_mapping_for_cluster(s, cluster_num);
+
+ long_file_name lfn;
+ int path_len = strlen(path);
+ char path2[PATH_MAX + 1];
+
+ assert(path_len < PATH_MAX); /* len was tested before! */
+ pstrcpy(path2, sizeof(path2), path);
+ path2[path_len] = '/';
+ path2[path_len + 1] = '\0';
+
+ if (mapping) {
+ const char* basename = get_basename(mapping->path);
+ const char* basename2 = get_basename(path);
+
+ assert(mapping->mode & MODE_DIRECTORY);
+
+ assert(mapping->mode & MODE_DELETED);
+ mapping->mode &= ~MODE_DELETED;
+
+ if (strcmp(basename, basename2))
+ schedule_rename(s, cluster_num, g_strdup(path));
+ } else
+ /* new directory */
+ schedule_mkdir(s, cluster_num, g_strdup(path));
+
+ lfn_init(&lfn);
+ do {
+ int i;
+ int subret = 0;
+
+ ret++;
+
+ if (s->used_clusters[cluster_num] & USED_ANY) {
+ fprintf(stderr, "cluster %d used more than once\n", (int)cluster_num);
+ return 0;
+ }
+ s->used_clusters[cluster_num] = USED_DIRECTORY;
+
+DLOG(fprintf(stderr, "read cluster %d (sector %d)\n", (int)cluster_num, (int)cluster2sector(s, cluster_num)));
+ subret = vvfat_read(s->bs, cluster2sector(s, cluster_num), cluster,
+ s->sectors_per_cluster);
+ if (subret) {
+ fprintf(stderr, "Error fetching direntries\n");
+ fail:
+ g_free(cluster);
+ return 0;
+ }
+
+ for (i = 0; i < 0x10 * s->sectors_per_cluster; i++) {
+ int cluster_count = 0;
+
+DLOG(fprintf(stderr, "check direntry %d:\n", i); print_direntry(direntries + i));
+ if (is_volume_label(direntries + i) || is_dot(direntries + i) ||
+ is_free(direntries + i))
+ continue;
+
+ subret = parse_long_name(&lfn, direntries + i);
+ if (subret < 0) {
+ fprintf(stderr, "Error in long name\n");
+ goto fail;
+ }
+ if (subret == 0 || is_free(direntries + i))
+ continue;
+
+ if (fat_chksum(direntries+i) != lfn.checksum) {
+ subret = parse_short_name(s, &lfn, direntries + i);
+ if (subret < 0) {
+ fprintf(stderr, "Error in short name (%d)\n", subret);
+ goto fail;
+ }
+ if (subret > 0 || !strcmp((char*)lfn.name, ".")
+ || !strcmp((char*)lfn.name, ".."))
+ continue;
+ }
+ lfn.checksum = 0x100; /* cannot use long name twice */
+
+ if (path_len + 1 + lfn.len >= PATH_MAX) {
+ fprintf(stderr, "Name too long: %s/%s\n", path, lfn.name);
+ goto fail;
+ }
+ pstrcpy(path2 + path_len + 1, sizeof(path2) - path_len - 1,
+ (char*)lfn.name);
+
+ if (is_directory(direntries + i)) {
+ if (begin_of_direntry(direntries + i) == 0) {
+ DLOG(fprintf(stderr, "invalid begin for directory: %s\n", path2); print_direntry(direntries + i));
+ goto fail;
+ }
+ cluster_count = check_directory_consistency(s,
+ begin_of_direntry(direntries + i), path2);
+ if (cluster_count == 0) {
+ DLOG(fprintf(stderr, "problem in directory %s:\n", path2); print_direntry(direntries + i));
+ goto fail;
+ }
+ } else if (is_file(direntries + i)) {
+ /* check file size with FAT */
+ cluster_count = get_cluster_count_for_direntry(s, direntries + i, path2);
+ if (cluster_count !=
+ (le32_to_cpu(direntries[i].size) + s->cluster_size
+ - 1) / s->cluster_size) {
+ DLOG(fprintf(stderr, "Cluster count mismatch\n"));
+ goto fail;
+ }
+ } else
+ abort(); /* cluster_count = 0; */
+
+ ret += cluster_count;
+ }
+
+ cluster_num = modified_fat_get(s, cluster_num);
+ } while(!fat_eof(s, cluster_num));
+
+ g_free(cluster);
+ return ret;
+}
+
+/* returns the number of used clusters on success (non-zero), 0 on failure */
+static int is_consistent(BDRVVVFATState* s)
+{
+ int i, check;
+ int used_clusters_count = 0;
+
+DLOG(checkpoint());
+ /*
+ * - get modified FAT
+ * - compare the two FATs (TODO)
+ * - get buffer for marking used clusters
+ * - recurse direntries from root (using bs->bdrv_read to make
+ * sure to get the new data)
+ * - check that the FAT agrees with the size
+ * - count the number of clusters occupied by this directory and
+ * its files
+ * - check that the cumulative used cluster count agrees with the
+ * FAT
+ * - if all is fine, return number of used clusters
+ */
+ if (s->fat2 == NULL) {
+ int size = 0x200 * s->sectors_per_fat;
+ s->fat2 = g_malloc(size);
+ memcpy(s->fat2, s->fat.pointer, size);
+ }
+ check = vvfat_read(s->bs,
+ s->first_sectors_number, s->fat2, s->sectors_per_fat);
+ if (check) {
+ fprintf(stderr, "Could not copy fat\n");
+ return 0;
+ }
+ assert (s->used_clusters);
+ for (i = 0; i < sector2cluster(s, s->sector_count); i++)
+ s->used_clusters[i] &= ~USED_ANY;
+
+ clear_commits(s);
+
+ /* mark every mapped file/directory as deleted.
+ * (check_directory_consistency() will unmark those still present). */
+ if (s->qcow)
+ for (i = 0; i < s->mapping.next; i++) {
+ mapping_t* mapping = array_get(&(s->mapping), i);
+ if (mapping->first_mapping_index < 0)
+ mapping->mode |= MODE_DELETED;
+ }
+
+ used_clusters_count = check_directory_consistency(s, 0, s->path);
+ if (used_clusters_count <= 0) {
+ DLOG(fprintf(stderr, "problem in directory\n"));
+ return 0;
+ }
+
+ check = s->last_cluster_of_root_directory;
+ for (i = check; i < sector2cluster(s, s->sector_count); i++) {
+ if (modified_fat_get(s, i)) {
+ if(!s->used_clusters[i]) {
+ DLOG(fprintf(stderr, "FAT was modified (%d), but cluster is not used?\n", i));
+ return 0;
+ }
+ check++;
+ }
+
+ if (s->used_clusters[i] == USED_ALLOCATED) {
+ /* allocated, but not used... */
+ DLOG(fprintf(stderr, "unused, modified cluster: %d\n", i));
+ return 0;
+ }
+ }
+
+ if (check != used_clusters_count)
+ return 0;
+
+ return used_clusters_count;
+}
+
+static inline void adjust_mapping_indices(BDRVVVFATState* s,
+ int offset, int adjust)
+{
+ int i;
+
+ for (i = 0; i < s->mapping.next; i++) {
+ mapping_t* mapping = array_get(&(s->mapping), i);
+
+#define ADJUST_MAPPING_INDEX(name) \
+ if (mapping->name >= offset) \
+ mapping->name += adjust
+
+ ADJUST_MAPPING_INDEX(first_mapping_index);
+ if (mapping->mode & MODE_DIRECTORY)
+ ADJUST_MAPPING_INDEX(info.dir.parent_mapping_index);
+ }
+}
+
+/* insert or update mapping */
+static mapping_t* insert_mapping(BDRVVVFATState* s,
+ uint32_t begin, uint32_t end)
+{
+ /*
+ * - find mapping where mapping->begin >= begin,
+ * - if mapping->begin > begin: insert
+ * - adjust all references to mappings!
+ * - else: adjust
+ * - replace name
+ */
+ int index = find_mapping_for_cluster_aux(s, begin, 0, s->mapping.next);
+ mapping_t* mapping = NULL;
+ mapping_t* first_mapping = array_get(&(s->mapping), 0);
+
+ if (index < s->mapping.next && (mapping = array_get(&(s->mapping), index))
+ && mapping->begin < begin) {
+ mapping->end = begin;
+ index++;
+ mapping = array_get(&(s->mapping), index);
+ }
+ if (index >= s->mapping.next || mapping->begin > begin) {
+ mapping = array_insert(&(s->mapping), index, 1);
+ mapping->path = NULL;
+ adjust_mapping_indices(s, index, +1);
+ }
+
+ mapping->begin = begin;
+ mapping->end = end;
+
+DLOG(mapping_t* next_mapping;
+assert(index + 1 >= s->mapping.next ||
+((next_mapping = array_get(&(s->mapping), index + 1)) &&
+ next_mapping->begin >= end)));
+
+ if (s->current_mapping && first_mapping != (mapping_t*)s->mapping.pointer)
+ s->current_mapping = array_get(&(s->mapping),
+ s->current_mapping - first_mapping);
+
+ return mapping;
+}
+
+static int remove_mapping(BDRVVVFATState* s, int mapping_index)
+{
+ mapping_t* mapping = array_get(&(s->mapping), mapping_index);
+ mapping_t* first_mapping = array_get(&(s->mapping), 0);
+
+ /* free mapping */
+ if (mapping->first_mapping_index < 0) {
+ g_free(mapping->path);
+ }
+
+ /* remove from s->mapping */
+ array_remove(&(s->mapping), mapping_index);
+
+ /* adjust all references to mappings */
+ adjust_mapping_indices(s, mapping_index, -1);
+
+ if (s->current_mapping && first_mapping != (mapping_t*)s->mapping.pointer)
+ s->current_mapping = array_get(&(s->mapping),
+ s->current_mapping - first_mapping);
+
+ return 0;
+}
+
+static void adjust_dirindices(BDRVVVFATState* s, int offset, int adjust)
+{
+ int i;
+ for (i = 0; i < s->mapping.next; i++) {
+ mapping_t* mapping = array_get(&(s->mapping), i);
+ if (mapping->dir_index >= offset)
+ mapping->dir_index += adjust;
+ if ((mapping->mode & MODE_DIRECTORY) &&
+ mapping->info.dir.first_dir_index >= offset)
+ mapping->info.dir.first_dir_index += adjust;
+ }
+}
+
+static direntry_t* insert_direntries(BDRVVVFATState* s,
+ int dir_index, int count)
+{
+ /*
+ * make room in s->directory,
+ * adjust_dirindices
+ */
+ direntry_t* result = array_insert(&(s->directory), dir_index, count);
+ if (result == NULL)
+ return NULL;
+ adjust_dirindices(s, dir_index, count);
+ return result;
+}
+
+static int remove_direntries(BDRVVVFATState* s, int dir_index, int count)
+{
+ int ret = array_remove_slice(&(s->directory), dir_index, count);
+ if (ret)
+ return ret;
+ adjust_dirindices(s, dir_index, -count);
+ return 0;
+}
+
+/*
+ * Adapt the mappings of the cluster chain starting at first cluster
+ * (i.e. if a file starts at first_cluster, the chain is followed according
+ * to the modified fat, and the corresponding entries in s->mapping are
+ * adjusted)
+ */
+static int commit_mappings(BDRVVVFATState* s,
+ uint32_t first_cluster, int dir_index)
+{
+ mapping_t* mapping = find_mapping_for_cluster(s, first_cluster);
+ direntry_t* direntry = array_get(&(s->directory), dir_index);
+ uint32_t cluster = first_cluster;
+
+ vvfat_close_current_file(s);
+
+ assert(mapping);
+ assert(mapping->begin == first_cluster);
+ mapping->first_mapping_index = -1;
+ mapping->dir_index = dir_index;
+ mapping->mode = (dir_index <= 0 || is_directory(direntry)) ?
+ MODE_DIRECTORY : MODE_NORMAL;
+
+ while (!fat_eof(s, cluster)) {
+ uint32_t c, c1;
+
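+ /* find the longest run of consecutive clusters starting at 'cluster'
+ * in the modified FAT */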
+ for (c = cluster, c1 = modified_fat_get(s, c); c + 1 == c1;
+ c = c1, c1 = modified_fat_get(s, c1));
+
+ c++;
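+ /* the run now ends at c; drop any following mappings it swallowed */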
+ if (c > mapping->end) {
+ int index = array_index(&(s->mapping), mapping);
+ int i, max_i = s->mapping.next - index;
+ for (i = 1; i < max_i && mapping[i].begin < c; i++);
+ while (--i > 0)
+ remove_mapping(s, index + 1);
+ }
+ assert(mapping == array_get(&(s->mapping), s->mapping.next - 1)
+ || mapping[1].begin >= c);
+ mapping->end = c;
+
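+ /* the chain continues in a non-adjacent cluster: make sure a mapping
+ * exists there and let it inherit path, mode and offsets from the
+ * current mapping */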
+ if (!fat_eof(s, c1)) {
+ int i = find_mapping_for_cluster_aux(s, c1, 0, s->mapping.next);
+ mapping_t* next_mapping = i >= s->mapping.next ? NULL :
+ array_get(&(s->mapping), i);
+
+ if (next_mapping == NULL || next_mapping->begin > c1) {
+ int i1 = array_index(&(s->mapping), mapping);
+
+ next_mapping = insert_mapping(s, c1, c1+1);
+
+ if (c1 < c)
+ i1++;
+ mapping = array_get(&(s->mapping), i1);
+ }
+
+ next_mapping->dir_index = mapping->dir_index;
+ next_mapping->first_mapping_index =
+ mapping->first_mapping_index < 0 ?
+ array_index(&(s->mapping), mapping) :
+ mapping->first_mapping_index;
+ next_mapping->path = mapping->path;
+ next_mapping->mode = mapping->mode;
+ next_mapping->read_only = mapping->read_only;
+ if (mapping->mode & MODE_DIRECTORY) {
+ next_mapping->info.dir.parent_mapping_index =
+ mapping->info.dir.parent_mapping_index;
+ next_mapping->info.dir.first_dir_index =
+ mapping->info.dir.first_dir_index +
+ 0x10 * s->sectors_per_cluster *
+ (mapping->end - mapping->begin);
+ } else
+ next_mapping->info.file.offset = mapping->info.file.offset +
+ mapping->end - mapping->begin;
+
+ mapping = next_mapping;
+ }
+
+ cluster = c1;
+ }
+
+ return 0;
+}
+
+static int commit_direntries(BDRVVVFATState* s,
+ int dir_index, int parent_mapping_index)
+{
+ direntry_t* direntry = array_get(&(s->directory), dir_index);
+ uint32_t first_cluster = dir_index == 0 ? 0 : begin_of_direntry(direntry);
+ mapping_t* mapping = find_mapping_for_cluster(s, first_cluster);
+
+ int factor = 0x10 * s->sectors_per_cluster;
+ int old_cluster_count, new_cluster_count;
+ int current_dir_index = mapping->info.dir.first_dir_index;
+ int first_dir_index = current_dir_index;
+ int ret, i;
+ uint32_t c;
+
+DLOG(fprintf(stderr, "commit_direntries for %s, parent_mapping_index %d\n", mapping->path, parent_mapping_index));
+
+ assert(direntry);
+ assert(mapping);
+ assert(mapping->begin == first_cluster);
+ assert(mapping->info.dir.first_dir_index < s->directory.next);
+ assert(mapping->mode & MODE_DIRECTORY);
+ assert(dir_index == 0 || is_directory(direntry));
+
+ mapping->info.dir.parent_mapping_index = parent_mapping_index;
+
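+ /* count the directory's clusters before and after the modification;
+ * the root directory (first_cluster == 0) has a fixed size */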
+ if (first_cluster == 0) {
+ old_cluster_count = new_cluster_count =
+ s->last_cluster_of_root_directory;
+ } else {
+ for (old_cluster_count = 0, c = first_cluster; !fat_eof(s, c);
+ c = fat_get(s, c))
+ old_cluster_count++;
+
+ for (new_cluster_count = 0, c = first_cluster; !fat_eof(s, c);
+ c = modified_fat_get(s, c))
+ new_cluster_count++;
+ }
+
+ if (new_cluster_count > old_cluster_count) {
+ if (insert_direntries(s,
+ current_dir_index + factor * old_cluster_count,
+ factor * (new_cluster_count - old_cluster_count)) == NULL)
+ return -1;
+ } else if (new_cluster_count < old_cluster_count)
+ remove_direntries(s,
+ current_dir_index + factor * new_cluster_count,
+ factor * (old_cluster_count - new_cluster_count));
+
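+ /* re-read the directory's clusters from the image into s->directory */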
+ for (c = first_cluster; !fat_eof(s, c); c = modified_fat_get(s, c)) {
+ void* direntry = array_get(&(s->directory), current_dir_index);
+ int ret = vvfat_read(s->bs, cluster2sector(s, c), direntry,
+ s->sectors_per_cluster);
+ if (ret)
+ return ret;
+ assert(!strncmp(s->directory.pointer, "QEMU", 4));
+ current_dir_index += factor;
+ }
+
+ ret = commit_mappings(s, first_cluster, dir_index);
+ if (ret)
+ return ret;
+
+ /* recurse */
+ for (i = 0; i < factor * new_cluster_count; i++) {
+ direntry = array_get(&(s->directory), first_dir_index + i);
+ if (is_directory(direntry) && !is_dot(direntry)) {
+ mapping = find_mapping_for_cluster(s, first_cluster);
+ assert(mapping->mode & MODE_DIRECTORY);
+ ret = commit_direntries(s, first_dir_index + i,
+ array_index(&(s->mapping), mapping));
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/* commit one file (adjust contents, adjust mapping);
+ returns 0 on success, a negative value on error */
+static int commit_one_file(BDRVVVFATState* s,
+ int dir_index, uint32_t offset)
+{
+ direntry_t* direntry = array_get(&(s->directory), dir_index);
+ uint32_t c = begin_of_direntry(direntry);
+ uint32_t first_cluster = c;
+ mapping_t* mapping = find_mapping_for_cluster(s, c);
+ uint32_t size = filesize_of_direntry(direntry);
+ char* cluster = g_malloc(s->cluster_size);
+ uint32_t i;
+ int fd = 0;
+
+ assert(offset < size);
+ assert((offset % s->cluster_size) == 0);
+
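+ /* skip along the modified FAT up to 'offset' */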
+ for (i = s->cluster_size; i < offset; i += s->cluster_size)
+ c = modified_fat_get(s, c);
+
+ fd = qemu_open(mapping->path, O_RDWR | O_CREAT | O_BINARY, 0666);
+ if (fd < 0) {
+ fprintf(stderr, "Could not open %s... (%s, %d)\n", mapping->path,
+ strerror(errno), errno);
+ g_free(cluster);
+ return fd;
+ }
+ if (offset > 0) {
+ if (lseek(fd, offset, SEEK_SET) != offset) {
+ qemu_close(fd);
+ g_free(cluster);
+ return -3;
+ }
+ }
+
+ while (offset < size) {
+ uint32_t c1;
+ int rest_size = (size - offset > s->cluster_size ?
+ s->cluster_size : size - offset);
+ int ret;
+
+ c1 = modified_fat_get(s, c);
+
+ assert((size - offset == 0 && fat_eof(s, c)) ||
+ (size > offset && c >= 2 && !fat_eof(s, c)));
+
+ ret = vvfat_read(s->bs, cluster2sector(s, c),
+ (uint8_t*)cluster, (rest_size + 0x1ff) / 0x200);
+
+ if (ret < 0) {
+ qemu_close(fd);
+ g_free(cluster);
+ return ret;
+ }
+
+ if (write(fd, cluster, rest_size) < 0) {
+ qemu_close(fd);
+ g_free(cluster);
+ return -2;
+ }
+
+ offset += rest_size;
+ c = c1;
+ }
+
+ if (ftruncate(fd, size)) {
+ perror("ftruncate()");
+ qemu_close(fd);
+ g_free(cluster);
+ return -4;
+ }
+ qemu_close(fd);
+ g_free(cluster);
+
+ return commit_mappings(s, first_cluster, dir_index);
+}
+
+#ifdef DEBUG
+/* test whether all mappings point to valid direntries */
+static void check1(BDRVVVFATState* s)
+{
+ int i;
+ for (i = 0; i < s->mapping.next; i++) {
+ mapping_t* mapping = array_get(&(s->mapping), i);
+ if (mapping->mode & MODE_DELETED) {
+ fprintf(stderr, "deleted\n");
+ continue;
+ }
+ assert(mapping->dir_index < s->directory.next);
+ direntry_t* direntry = array_get(&(s->directory), mapping->dir_index);
+ assert(mapping->begin == begin_of_direntry(direntry) || mapping->first_mapping_index >= 0);
+ if (mapping->mode & MODE_DIRECTORY) {
+ assert(mapping->info.dir.first_dir_index + 0x10 * s->sectors_per_cluster * (mapping->end - mapping->begin) <= s->directory.next);
+ assert((mapping->info.dir.first_dir_index % (0x10 * s->sectors_per_cluster)) == 0);
+ }
+ }
+}
+
+/* test whether all direntries have mappings */
+static void check2(BDRVVVFATState* s)
+{
+ int i;
+ int first_mapping = -1;
+
+ for (i = 0; i < s->directory.next; i++) {
+ direntry_t* direntry = array_get(&(s->directory), i);
+
+ if (is_short_name(direntry) && begin_of_direntry(direntry)) {
+ mapping_t* mapping = find_mapping_for_cluster(s, begin_of_direntry(direntry));
+ assert(mapping);
+ assert(mapping->dir_index == i || is_dot(direntry));
+ assert(mapping->begin == begin_of_direntry(direntry) || is_dot(direntry));
+ }
+
+ if ((i % (0x10 * s->sectors_per_cluster)) == 0) {
+ /* cluster start */
+ int j, count = 0;
+
+ for (j = 0; j < s->mapping.next; j++) {
+ mapping_t* mapping = array_get(&(s->mapping), j);
+ if (mapping->mode & MODE_DELETED)
+ continue;
+ if (mapping->mode & MODE_DIRECTORY) {
+ if (mapping->info.dir.first_dir_index <= i && mapping->info.dir.first_dir_index + 0x10 * s->sectors_per_cluster > i) {
+ assert(++count == 1);
+ if (mapping->first_mapping_index == -1)
+ first_mapping = array_index(&(s->mapping), mapping);
+ else
+ assert(first_mapping == mapping->first_mapping_index);
+ if (mapping->info.dir.parent_mapping_index < 0)
+ assert(j == 0);
+ else {
+ mapping_t* parent = array_get(&(s->mapping), mapping->info.dir.parent_mapping_index);
+ assert(parent->mode & MODE_DIRECTORY);
+ assert(parent->info.dir.first_dir_index < mapping->info.dir.first_dir_index);
+ }
+ }
+ }
+ }
+ if (count == 0)
+ first_mapping = -1;
+ }
+ }
+}
+#endif
+
+static int handle_renames_and_mkdirs(BDRVVVFATState* s)
+{
+ int i;
+
+#ifdef DEBUG
+ fprintf(stderr, "handle_renames\n");
+ for (i = 0; i < s->commits.next; i++) {
+ commit_t* commit = array_get(&(s->commits), i);
+ fprintf(stderr, "%d, %s (%d, %d)\n", i, commit->path ? commit->path : "(null)", commit->param.rename.cluster, commit->action);
+ }
+#endif
+
+ for (i = 0; i < s->commits.next;) {
+ commit_t* commit = array_get(&(s->commits), i);
+ if (commit->action == ACTION_RENAME) {
+ mapping_t* mapping = find_mapping_for_cluster(s,
+ commit->param.rename.cluster);
+ char* old_path = mapping->path;
+
+ assert(commit->path);
+ mapping->path = commit->path;
+ if (rename(old_path, mapping->path))
+ return -2;
+
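+ /* a renamed directory: schedule renames for everything inside it so
+ * that the host paths of its children get the new prefix */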
+ if (mapping->mode & MODE_DIRECTORY) {
+ int l1 = strlen(mapping->path);
+ int l2 = strlen(old_path);
+ int diff = l1 - l2;
+ direntry_t* direntry = array_get(&(s->directory),
+ mapping->info.dir.first_dir_index);
+ uint32_t c = mapping->begin;
+ int i = 0;
+
+ /* recurse */
+ while (!fat_eof(s, c)) {
+ do {
+ direntry_t* d = direntry + i;
+
+ if (is_file(d) || (is_directory(d) && !is_dot(d))) {
+ mapping_t* m = find_mapping_for_cluster(s,
+ begin_of_direntry(d));
+ int l = strlen(m->path);
+ char* new_path = g_malloc(l + diff + 1);
+
+ assert(!strncmp(m->path, mapping->path, l2));
+
+ pstrcpy(new_path, l + diff + 1, mapping->path);
+ pstrcpy(new_path + l1, l + diff + 1 - l1,
+ m->path + l2);
+
+ schedule_rename(s, m->begin, new_path);
+ }
+ i++;
+ } while ((i % (0x10 * s->sectors_per_cluster)) != 0);
+ c = fat_get(s, c);
+ }
+ }
+
+ g_free(old_path);
+ array_remove(&(s->commits), i);
+ continue;
+ } else if (commit->action == ACTION_MKDIR) {
+ mapping_t* mapping;
+ int j, parent_path_len;
+
+#ifdef __MINGW32__
+ if (mkdir(commit->path))
+ return -5;
+#else
+ if (mkdir(commit->path, 0755))
+ return -5;
+#endif
+
+ mapping = insert_mapping(s, commit->param.mkdir.cluster,
+ commit->param.mkdir.cluster + 1);
+ if (mapping == NULL)
+ return -6;
+
+ mapping->mode = MODE_DIRECTORY;
+ mapping->read_only = 0;
+ mapping->path = commit->path;
+ j = s->directory.next;
+ assert(j);
+ insert_direntries(s, s->directory.next,
+ 0x10 * s->sectors_per_cluster);
+ mapping->info.dir.first_dir_index = j;
+
+ parent_path_len = strlen(commit->path)
+ - strlen(get_basename(commit->path)) - 1;
+ for (j = 0; j < s->mapping.next; j++) {
+ mapping_t* m = array_get(&(s->mapping), j);
+ if (m->first_mapping_index < 0 && m != mapping &&
+ !strncmp(m->path, mapping->path, parent_path_len) &&
+ strlen(m->path) == parent_path_len)
+ break;
+ }
+ assert(j < s->mapping.next);
+ mapping->info.dir.parent_mapping_index = j;
+
+ array_remove(&(s->commits), i);
+ continue;
+ }
+
+ i++;
+ }
+ return 0;
+}
+
+/*
+ * TODO: make sure that the short name does not match *another* file
+ */
+static int handle_commits(BDRVVVFATState* s)
+{
+ int i, fail = 0;
+
+ vvfat_close_current_file(s);
+
+ for (i = 0; !fail && i < s->commits.next; i++) {
+ commit_t* commit = array_get(&(s->commits), i);
+ switch (commit->action) {
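+ /* renames and mkdirs were already handled by
+ * handle_renames_and_mkdirs(); hitting one here is a bug */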
+ case ACTION_RENAME: case ACTION_MKDIR:
+ abort();
+ fail = -2;
+ break;
+ case ACTION_WRITEOUT: {
+#ifndef NDEBUG
+ /* these variables are only used by assert() below */
+ direntry_t* entry = array_get(&(s->directory),
+ commit->param.writeout.dir_index);
+ uint32_t begin = begin_of_direntry(entry);
+ mapping_t* mapping = find_mapping_for_cluster(s, begin);
+#endif
+
+ assert(mapping);
+ assert(mapping->begin == begin);
+ assert(commit->path == NULL);
+
+ if (commit_one_file(s, commit->param.writeout.dir_index,
+ commit->param.writeout.modified_offset))
+ fail = -3;
+
+ break;
+ }
+ case ACTION_NEW_FILE: {
+ int begin = commit->param.new_file.first_cluster;
+ mapping_t* mapping = find_mapping_for_cluster(s, begin);
+ direntry_t* entry;
+ int i;
+
+ /* find direntry */
+ for (i = 0; i < s->directory.next; i++) {
+ entry = array_get(&(s->directory), i);
+ if (is_file(entry) && begin_of_direntry(entry) == begin)
+ break;
+ }
+
+ if (i >= s->directory.next) {
+ fail = -6;
+ continue;
+ }
+
+ /* make sure there exists an initial mapping */
+ if (mapping && mapping->begin != begin) {
+ mapping->end = begin;
+ mapping = NULL;
+ }
+ if (mapping == NULL) {
+ mapping = insert_mapping(s, begin, begin+1);
+ }
+ /* most members will be fixed in commit_mappings() */
+ assert(commit->path);
+ mapping->path = commit->path;
+ mapping->read_only = 0;
+ mapping->mode = MODE_NORMAL;
+ mapping->info.file.offset = 0;
+
+ if (commit_one_file(s, i, 0))
+ fail = -7;
+
+ break;
+ }
+ default:
+ abort();
+ }
+ }
+ if (i > 0 && array_remove_slice(&(s->commits), 0, i))
+ return -1;
+ return fail;
+}
+
+static int handle_deletes(BDRVVVFATState* s)
+{
+ int i, deferred = 1, deleted = 1;
+
+ /* delete the host files/directories corresponding to mappings marked as
+ * deleted, i.e. explicit DELETEs and unused mappings
+ * (modified_fat_get(s, mapping->begin) == 0) */
+ while (deferred && deleted) {
+ deferred = 0;
+ deleted = 0;
+
+ for (i = 1; i < s->mapping.next; i++) {
+ mapping_t* mapping = array_get(&(s->mapping), i);
+ if (mapping->mode & MODE_DELETED) {
+ direntry_t* entry = array_get(&(s->directory),
+ mapping->dir_index);
+
+ if (is_free(entry)) {
+ /* remove file/directory */
+ if (mapping->mode & MODE_DIRECTORY) {
+ int j, next_dir_index = s->directory.next,
+ first_dir_index = mapping->info.dir.first_dir_index;
+
+ if (rmdir(mapping->path) < 0) {
+ if (errno == ENOTEMPTY) {
+ deferred++;
+ continue;
+ } else
+ return -5;
+ }
+
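+ /* this directory's entries end where the next directory's
+ * first_dir_index begins; find that boundary */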
+ for (j = 1; j < s->mapping.next; j++) {
+ mapping_t* m = array_get(&(s->mapping), j);
+ if (m->mode & MODE_DIRECTORY &&
+ m->info.dir.first_dir_index >
+ first_dir_index &&
+ m->info.dir.first_dir_index <
+ next_dir_index)
+ next_dir_index =
+ m->info.dir.first_dir_index;
+ }
+ remove_direntries(s, first_dir_index,
+ next_dir_index - first_dir_index);
+
+ deleted++;
+ }
+ } else {
+ if (unlink(mapping->path))
+ return -4;
+ deleted++;
+ }
+ DLOG(fprintf(stderr, "DELETE (%d)\n", i); print_mapping(mapping); print_direntry(entry));
+ remove_mapping(s, i);
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * synchronize mapping with new state:
+ *
+ * - copy FAT (with bdrv_read)
+ * - mark all filenames corresponding to mappings as deleted
+ * - recurse direntries from root (using bs->bdrv_read)
+ * - delete files corresponding to mappings marked as deleted
+ */
+static int do_commit(BDRVVVFATState* s)
+{
+ int ret = 0;
+
+ /* the real meat is the commits. Nothing to do? Move along! */
+ if (s->commits.next == 0)
+ return 0;
+
+ vvfat_close_current_file(s);
+
+ ret = handle_renames_and_mkdirs(s);
+ if (ret) {
+ fprintf(stderr, "Error handling renames (%d)\n", ret);
+ abort();
+ return ret;
+ }
+
+ /* copy FAT (with bdrv_read) */
+ memcpy(s->fat.pointer, s->fat2, 0x200 * s->sectors_per_fat);
+
+ /* recurse direntries from root (using bs->bdrv_read) */
+ ret = commit_direntries(s, 0, -1);
+ if (ret) {
+ fprintf(stderr, "Fatal: error while committing (%d)\n", ret);
+ abort();
+ return ret;
+ }
+
+ ret = handle_commits(s);
+ if (ret) {
+ fprintf(stderr, "Error handling commits (%d)\n", ret);
+ abort();
+ return ret;
+ }
+
+ ret = handle_deletes(s);
+ if (ret) {
+ fprintf(stderr, "Error deleting\n");
+ abort();
+ return ret;
+ }
+
+ if (s->qcow->drv->bdrv_make_empty) {
+ s->qcow->drv->bdrv_make_empty(s->qcow);
+ }
+
+ memset(s->used_clusters, 0, sector2cluster(s, s->sector_count));
+
+DLOG(checkpoint());
+ return 0;
+}
+
+static int try_commit(BDRVVVFATState* s)
+{
+ vvfat_close_current_file(s);
+DLOG(checkpoint());
+ if (!is_consistent(s))
+ return -1;
+ return do_commit(s);
+}
+
+static int vvfat_write(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ BDRVVVFATState *s = bs->opaque;
+ int i, ret;
+
+DLOG(checkpoint());
+
+ /* Check if we're operating in read-only mode */
+ if (s->qcow == NULL) {
+ return -EACCES;
+ }
+
+ vvfat_close_current_file(s);
+
+ /*
+ * Some sanity checks:
+ * - do not allow writing to the boot sector
+ * - do not allow to write non-ASCII filenames
+ */
+
+ if (sector_num < s->first_sectors_number)
+ return -1;
+
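+ /* check every mapping touched by this write: refuse writes to
+ * read-only files and to direntries of read-only files */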
+ for (i = sector2cluster(s, sector_num);
+ i <= sector2cluster(s, sector_num + nb_sectors - 1);) {
+ mapping_t* mapping = find_mapping_for_cluster(s, i);
+ if (mapping) {
+ if (mapping->read_only) {
+ fprintf(stderr, "Tried to write to write-protected file %s\n",
+ mapping->path);
+ return -1;
+ }
+
+ if (mapping->mode & MODE_DIRECTORY) {
+ int begin = cluster2sector(s, i);
+ int end = begin + s->sectors_per_cluster, k;
+ int dir_index;
+ const direntry_t* direntries;
+ long_file_name lfn;
+
+ lfn_init(&lfn);
+
+ if (begin < sector_num)
+ begin = sector_num;
+ if (end > sector_num + nb_sectors)
+ end = sector_num + nb_sectors;
+ dir_index = mapping->dir_index +
+ 0x10 * (begin - mapping->begin * s->sectors_per_cluster);
+ direntries = (direntry_t*)(buf + 0x200 * (begin - sector_num));
+
+ for (k = 0; k < (end - begin) * 0x10; k++) {
+ /* do not allow non-ASCII filenames */
+ if (parse_long_name(&lfn, direntries + k) < 0) {
+ fprintf(stderr, "Warning: non-ASCII filename\n");
+ return -1;
+ }
+ /* no access to the direntry of a read-only file */
+ else if (is_short_name(direntries+k) &&
+ (direntries[k].attributes & 1)) {
+ if (memcmp(direntries + k,
+ array_get(&(s->directory), dir_index + k),
+ sizeof(direntry_t))) {
+ fprintf(stderr, "Warning: tried to write to write-protected file\n");
+ return -1;
+ }
+ }
+ }
+ }
+ i = mapping->end;
+ } else
+ i++;
+ }
+
+ /*
+ * Use qcow backend. Commit later.
+ */
+DLOG(fprintf(stderr, "Write to qcow backend: %d + %d\n", (int)sector_num, nb_sectors));
+ ret = bdrv_write(s->qcow, sector_num, buf, nb_sectors);
+ if (ret < 0) {
+ fprintf(stderr, "Error writing to qcow backend\n");
+ return ret;
+ }
+
+ for (i = sector2cluster(s, sector_num);
+ i <= sector2cluster(s, sector_num + nb_sectors - 1); i++)
+ if (i >= 0)
+ s->used_clusters[i] |= USED_ALLOCATED;
+
+DLOG(checkpoint());
+ /* TODO: add timeout */
+ try_commit(s);
+
+DLOG(checkpoint());
+ return 0;
+}
+
+static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ int ret;
+ BDRVVVFATState *s = bs->opaque;
+ qemu_co_mutex_lock(&s->lock);
+ ret = vvfat_write(bs, sector_num, buf, nb_sectors);
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+}
+
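+/* report all sectors up to s->sector_count as allocated */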
+static int coroutine_fn vvfat_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, int* n)
+{
+ BDRVVVFATState* s = bs->opaque;
+ *n = s->sector_count - sector_num;
+ if (*n > nb_sectors)
+ *n = nb_sectors;
+ else if (*n < 0)
+ return 0;
+ return 1;
+}
+
+static int write_target_commit(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t* buffer, int nb_sectors) {
+ BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque);
+ return try_commit(s);
+}
+
+static void write_target_close(BlockDriverState *bs) {
+ BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque);
+ bdrv_delete(s->qcow);
+ g_free(s->qcow_filename);
+}
+
+static BlockDriver vvfat_write_target = {
+ .format_name = "vvfat_write_target",
+ .bdrv_write = write_target_commit,
+ .bdrv_close = write_target_close,
+};
+
+static int enable_write_target(BDRVVVFATState *s)
+{
+ BlockDriver *bdrv_qcow;
+ QEMUOptionParameter *options;
+ int ret;
+ int size = sector2cluster(s, s->sector_count);
+ s->used_clusters = calloc(size, 1);
+
+ array_init(&(s->commits), sizeof(commit_t));
+
+ s->qcow_filename = g_malloc(1024);
+ ret = get_tmp_filename(s->qcow_filename, 1024);
+ if (ret < 0) {
+ g_free(s->qcow_filename);
+ s->qcow_filename = NULL;
+ return ret;
+ }
+
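+ /* create a temporary qcow image, backed by "fat:", that receives all
+ * guest writes until they are committed to the host directory */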
+ bdrv_qcow = bdrv_find_format("qcow");
+ options = parse_option_parameters("", bdrv_qcow->create_options, NULL);
+ set_option_parameter_int(options, BLOCK_OPT_SIZE, s->sector_count * 512);
+ set_option_parameter(options, BLOCK_OPT_BACKING_FILE, "fat:");
+
+ if (bdrv_create(bdrv_qcow, s->qcow_filename, options) < 0)
+ return -1;
+
+ s->qcow = bdrv_new("");
+ if (s->qcow == NULL) {
+ return -1;
+ }
+
+ ret = bdrv_open(s->qcow, s->qcow_filename,
+ BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow);
+ if (ret < 0) {
+ return ret;
+ }
+
+#ifndef _WIN32
+ unlink(s->qcow_filename);
+#endif
+
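+ /* install a fake backing driver whose write and close callbacks
+ * commit pending changes and delete the temporary qcow file */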
+ s->bs->backing_hd = calloc(sizeof(BlockDriverState), 1);
+ s->bs->backing_hd->drv = &vvfat_write_target;
+ s->bs->backing_hd->opaque = g_malloc(sizeof(void*));
+ *(void**)s->bs->backing_hd->opaque = s;
+
+ return 0;
+}
+
+static void vvfat_close(BlockDriverState *bs)
+{
+ BDRVVVFATState *s = bs->opaque;
+
+ vvfat_close_current_file(s);
+ array_free(&(s->fat));
+ array_free(&(s->directory));
+ array_free(&(s->mapping));
+ g_free(s->cluster_buffer);
+
+ if (s->qcow) {
+ migrate_del_blocker(s->migration_blocker);
+ error_free(s->migration_blocker);
+ }
+}
+
+static BlockDriver bdrv_vvfat = {
+ .format_name = "vvfat",
+ .instance_size = sizeof(BDRVVVFATState),
+ .bdrv_file_open = vvfat_open,
+ .bdrv_rebind = vvfat_rebind,
+ .bdrv_read = vvfat_co_read,
+ .bdrv_write = vvfat_co_write,
+ .bdrv_close = vvfat_close,
+ .bdrv_co_is_allocated = vvfat_co_is_allocated,
+ .protocol_name = "fat",
+};
+
+static void bdrv_vvfat_init(void)
+{
+ bdrv_register(&bdrv_vvfat);
+}
+
+block_init(bdrv_vvfat_init);
+
+#ifdef DEBUG
+static void checkpoint(void) {
+ assert(((mapping_t*)array_get(&(vvv->mapping), 0))->end == 2);
+ check1(vvv);
+ check2(vvv);
+ assert(!vvv->current_mapping || vvv->current_fd || (vvv->current_mapping->mode & MODE_DIRECTORY));
+#if 0
+ if (((direntry_t*)vvv->directory.pointer)[1].attributes != 0xf)
+ fprintf(stderr, "Nonono!\n");
+ mapping_t* mapping;
+ direntry_t* direntry;
+ assert(vvv->mapping.size >= vvv->mapping.item_size * vvv->mapping.next);
+ assert(vvv->directory.size >= vvv->directory.item_size * vvv->directory.next);
+ if (vvv->mapping.next<47)
+ return;
+ assert((mapping = array_get(&(vvv->mapping), 47)));
+ assert(mapping->dir_index < vvv->directory.next);
+ direntry = array_get(&(vvv->directory), mapping->dir_index);
+ assert(!memcmp(direntry->name, "USB H ", 11) || direntry->name[0]==0);
+#endif
+}
+#endif