diff options
author | Anas Nashif <anas.nashif@intel.com> | 2012-11-06 07:50:24 -0800 |
---|---|---|
committer | Anas Nashif <anas.nashif@intel.com> | 2012-11-06 07:50:24 -0800 |
commit | 060629c6ef0b7e5c267d84c91600113264d33120 (patch) | |
tree | 18fcb144ac71b9c4d08ee5d1dc58e2b16c109a5a /block | |
download | qemu-060629c6ef0b7e5c267d84c91600113264d33120.tar.gz qemu-060629c6ef0b7e5c267d84c91600113264d33120.tar.bz2 qemu-060629c6ef0b7e5c267d84c91600113264d33120.zip |
Imported Upstream version 1.2.0upstream/1.2.0
Diffstat (limited to 'block')
36 files changed, 25019 insertions, 0 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs new file mode 100644 index 000000000..b5754d39b --- /dev/null +++ b/block/Makefile.objs @@ -0,0 +1,11 @@ +block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o +block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o +block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o +block-obj-y += qed-check.o +block-obj-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o +block-obj-y += stream.o +block-obj-$(CONFIG_WIN32) += raw-win32.o +block-obj-$(CONFIG_POSIX) += raw-posix.o +block-obj-$(CONFIG_LIBISCSI) += iscsi.o +block-obj-$(CONFIG_CURL) += curl.o +block-obj-$(CONFIG_RBD) += rbd.o diff --git a/block/blkdebug.c b/block/blkdebug.c new file mode 100644 index 000000000..59dcea065 --- /dev/null +++ b/block/blkdebug.c @@ -0,0 +1,473 @@ +/* + * Block protocol for I/O error injection + * + * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" + +typedef struct BDRVBlkdebugState { + int state; + QLIST_HEAD(, BlkdebugRule) rules[BLKDBG_EVENT_MAX]; + QSIMPLEQ_HEAD(, BlkdebugRule) active_rules; +} BDRVBlkdebugState; + +typedef struct BlkdebugAIOCB { + BlockDriverAIOCB common; + QEMUBH *bh; + int ret; +} BlkdebugAIOCB; + +static void blkdebug_aio_cancel(BlockDriverAIOCB *blockacb); + +static AIOPool blkdebug_aio_pool = { + .aiocb_size = sizeof(BlkdebugAIOCB), + .cancel = blkdebug_aio_cancel, +}; + +enum { + ACTION_INJECT_ERROR, + ACTION_SET_STATE, +}; + +typedef struct BlkdebugRule { + BlkDebugEvent event; + int action; + int state; + union { + struct { + int error; + int immediately; + int once; + int64_t sector; + } inject; + struct { + int new_state; + } set_state; + } options; + QLIST_ENTRY(BlkdebugRule) next; + QSIMPLEQ_ENTRY(BlkdebugRule) active_next; +} BlkdebugRule; + +static QemuOptsList inject_error_opts = { + .name = "inject-error", + .head = QTAILQ_HEAD_INITIALIZER(inject_error_opts.head), + .desc = { + { + .name = "event", + .type = QEMU_OPT_STRING, + }, + { + .name = "state", + .type = QEMU_OPT_NUMBER, + }, + { + .name = "errno", + .type = QEMU_OPT_NUMBER, + }, + { + .name = "sector", + .type = QEMU_OPT_NUMBER, + }, + { + .name = "once", + .type = QEMU_OPT_BOOL, + }, + { + .name = "immediately", + .type = QEMU_OPT_BOOL, + }, + { /* end of list */ } + }, +}; + +static QemuOptsList set_state_opts = { + .name = "set-state", + .head = QTAILQ_HEAD_INITIALIZER(set_state_opts.head), + .desc = { + { + .name = "event", + .type = QEMU_OPT_STRING, + }, + { + .name = "state", + .type = QEMU_OPT_NUMBER, + }, + { + .name = "new_state", + .type = QEMU_OPT_NUMBER, + }, + { /* end of list */ } + }, +}; + +static QemuOptsList *config_groups[] = { + &inject_error_opts, + &set_state_opts, + NULL +}; + +static const char *event_names[BLKDBG_EVENT_MAX] = { + [BLKDBG_L1_UPDATE] = "l1_update", + [BLKDBG_L1_GROW_ALLOC_TABLE] = "l1_grow.alloc_table", + [BLKDBG_L1_GROW_WRITE_TABLE] = "l1_grow.write_table", + [BLKDBG_L1_GROW_ACTIVATE_TABLE] = "l1_grow.activate_table", + + [BLKDBG_L2_LOAD] = "l2_load", + [BLKDBG_L2_UPDATE] = "l2_update", + [BLKDBG_L2_UPDATE_COMPRESSED] = "l2_update_compressed", + [BLKDBG_L2_ALLOC_COW_READ] = "l2_alloc.cow_read", + [BLKDBG_L2_ALLOC_WRITE] = "l2_alloc.write", + + [BLKDBG_READ_AIO] = "read_aio", + [BLKDBG_READ_BACKING_AIO] = "read_backing_aio", + [BLKDBG_READ_COMPRESSED] = "read_compressed", + + [BLKDBG_WRITE_AIO] = "write_aio", + [BLKDBG_WRITE_COMPRESSED] = "write_compressed", + + [BLKDBG_VMSTATE_LOAD] = "vmstate_load", + [BLKDBG_VMSTATE_SAVE] = "vmstate_save", + + [BLKDBG_COW_READ] = "cow_read", + [BLKDBG_COW_WRITE] = "cow_write", + + [BLKDBG_REFTABLE_LOAD] = "reftable_load", + [BLKDBG_REFTABLE_GROW] = "reftable_grow", + + [BLKDBG_REFBLOCK_LOAD] = "refblock_load", + [BLKDBG_REFBLOCK_UPDATE] = "refblock_update", + [BLKDBG_REFBLOCK_UPDATE_PART] = "refblock_update_part", + [BLKDBG_REFBLOCK_ALLOC] = "refblock_alloc", + [BLKDBG_REFBLOCK_ALLOC_HOOKUP] = "refblock_alloc.hookup", + [BLKDBG_REFBLOCK_ALLOC_WRITE] = "refblock_alloc.write", + [BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS] = "refblock_alloc.write_blocks", + [BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE] = "refblock_alloc.write_table", + [BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE] = "refblock_alloc.switch_table", + + [BLKDBG_CLUSTER_ALLOC] = "cluster_alloc", + [BLKDBG_CLUSTER_ALLOC_BYTES] = "cluster_alloc_bytes", + [BLKDBG_CLUSTER_FREE] = "cluster_free", +}; + +static int get_event_by_name(const char *name, BlkDebugEvent *event) +{ + int i; + + for (i = 0; i < BLKDBG_EVENT_MAX; i++) { + if (!strcmp(event_names[i], name)) { + *event = i; + return 0; + } + } + + return -1; +} + +struct add_rule_data { + BDRVBlkdebugState *s; + int action; +}; + +static int add_rule(QemuOpts *opts, void *opaque) +{ + struct add_rule_data *d = opaque; + BDRVBlkdebugState *s = d->s; + const char* event_name; + BlkDebugEvent event; + struct BlkdebugRule *rule; + + /* Find the right event for the rule */ + event_name = qemu_opt_get(opts, "event"); + if (!event_name || get_event_by_name(event_name, &event) < 0) { + return -1; + } + + /* Set attributes common for all actions */ + rule = g_malloc0(sizeof(*rule)); + *rule = (struct BlkdebugRule) { + .event = event, + .action = d->action, + .state = qemu_opt_get_number(opts, "state", 0), + }; + + /* Parse action-specific options */ + switch (d->action) { + case ACTION_INJECT_ERROR: + rule->options.inject.error = qemu_opt_get_number(opts, "errno", EIO); + rule->options.inject.once = qemu_opt_get_bool(opts, "once", 0); + rule->options.inject.immediately = + qemu_opt_get_bool(opts, "immediately", 0); + rule->options.inject.sector = qemu_opt_get_number(opts, "sector", -1); + break; + + case ACTION_SET_STATE: + rule->options.set_state.new_state = + qemu_opt_get_number(opts, "new_state", 0); + break; + }; + + /* Add the rule */ + QLIST_INSERT_HEAD(&s->rules[event], rule, next); + + return 0; +} + +static int read_config(BDRVBlkdebugState *s, const char *filename) +{ + FILE *f; + int ret; + struct add_rule_data d; + + f = fopen(filename, "r"); + if (f == NULL) { + return -errno; + } + + ret = qemu_config_parse(f, config_groups, filename); + if (ret < 0) { + goto fail; + } + + d.s = s; + d.action = ACTION_INJECT_ERROR; + qemu_opts_foreach(&inject_error_opts, add_rule, &d, 0); + + d.action = ACTION_SET_STATE; + qemu_opts_foreach(&set_state_opts, add_rule, &d, 0); + + ret = 0; +fail: + qemu_opts_reset(&inject_error_opts); + qemu_opts_reset(&set_state_opts); + fclose(f); + return ret; +} + +/* Valid blkdebug filenames look like blkdebug:path/to/config:path/to/image */ +static int blkdebug_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVBlkdebugState *s = bs->opaque; + int ret; + char *config, *c; + + /* Parse the blkdebug: prefix */ + if (strncmp(filename, "blkdebug:", strlen("blkdebug:"))) { + return -EINVAL; + } + filename += strlen("blkdebug:"); + + /* Read rules from config file */ + c = strchr(filename, ':'); + if (c == NULL) { + return -EINVAL; + } + + config = g_strdup(filename); + config[c - filename] = '\0'; + ret = read_config(s, config); + g_free(config); + if (ret < 0) { + return ret; + } + filename = c + 1; + + /* Set initial state */ + s->state = 1; + + /* Open the backing file */ + ret = bdrv_file_open(&bs->file, filename, flags); + if (ret < 0) { + return ret; + } + + return 0; +} + +static void error_callback_bh(void *opaque) +{ + struct BlkdebugAIOCB *acb = opaque; + qemu_bh_delete(acb->bh); + acb->common.cb(acb->common.opaque, acb->ret); + qemu_aio_release(acb); +} + +static void blkdebug_aio_cancel(BlockDriverAIOCB *blockacb) +{ + BlkdebugAIOCB *acb = container_of(blockacb, BlkdebugAIOCB, common); + qemu_aio_release(acb); +} + +static BlockDriverAIOCB *inject_error(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque, BlkdebugRule *rule) +{ + BDRVBlkdebugState *s = bs->opaque; + int error = rule->options.inject.error; + struct BlkdebugAIOCB *acb; + QEMUBH *bh; + + if (rule->options.inject.once) { + QSIMPLEQ_INIT(&s->active_rules); + } + + if (rule->options.inject.immediately) { + return NULL; + } + + acb = qemu_aio_get(&blkdebug_aio_pool, bs, cb, opaque); + acb->ret = -error; + + bh = qemu_bh_new(error_callback_bh, acb); + acb->bh = bh; + qemu_bh_schedule(bh); + + return &acb->common; +} + +static BlockDriverAIOCB *blkdebug_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BDRVBlkdebugState *s = bs->opaque; + BlkdebugRule *rule = NULL; + + QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { + if (rule->options.inject.sector == -1 || + (rule->options.inject.sector >= sector_num && + rule->options.inject.sector < sector_num + nb_sectors)) { + break; + } + } + + if (rule && rule->options.inject.error) { + return inject_error(bs, cb, opaque, rule); + } + + return bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors, cb, opaque); +} + +static BlockDriverAIOCB *blkdebug_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BDRVBlkdebugState *s = bs->opaque; + BlkdebugRule *rule = NULL; + + QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { + if (rule->options.inject.sector == -1 || + (rule->options.inject.sector >= sector_num && + rule->options.inject.sector < sector_num + nb_sectors)) { + break; + } + } + + if (rule && rule->options.inject.error) { + return inject_error(bs, cb, opaque, rule); + } + + return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque); +} + +static void blkdebug_close(BlockDriverState *bs) +{ + BDRVBlkdebugState *s = bs->opaque; + BlkdebugRule *rule, *next; + int i; + + for (i = 0; i < BLKDBG_EVENT_MAX; i++) { + QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) { + QLIST_REMOVE(rule, next); + g_free(rule); + } + } +} + +static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule, + int old_state, bool injected) +{ + BDRVBlkdebugState *s = bs->opaque; + + /* Only process rules for the current state */ + if (rule->state && rule->state != old_state) { + return injected; + } + + /* Take the action */ + switch (rule->action) { + case ACTION_INJECT_ERROR: + if (!injected) { + QSIMPLEQ_INIT(&s->active_rules); + injected = true; + } + QSIMPLEQ_INSERT_HEAD(&s->active_rules, rule, active_next); + break; + + case ACTION_SET_STATE: + s->state = rule->options.set_state.new_state; + break; + } + return injected; +} + +static void blkdebug_debug_event(BlockDriverState *bs, BlkDebugEvent event) +{ + BDRVBlkdebugState *s = bs->opaque; + struct BlkdebugRule *rule; + int old_state = s->state; + bool injected; + + assert((int)event >= 0 && event < BLKDBG_EVENT_MAX); + + injected = false; + QLIST_FOREACH(rule, &s->rules[event], next) { + injected = process_rule(bs, rule, old_state, injected); + } +} + +static int64_t blkdebug_getlength(BlockDriverState *bs) +{ + return bdrv_getlength(bs->file); +} + +static BlockDriver bdrv_blkdebug = { + .format_name = "blkdebug", + .protocol_name = "blkdebug", + + .instance_size = sizeof(BDRVBlkdebugState), + + .bdrv_file_open = blkdebug_open, + .bdrv_close = blkdebug_close, + .bdrv_getlength = blkdebug_getlength, + + .bdrv_aio_readv = blkdebug_aio_readv, + .bdrv_aio_writev = blkdebug_aio_writev, + + .bdrv_debug_event = blkdebug_debug_event, +}; + +static void bdrv_blkdebug_init(void) +{ + bdrv_register(&bdrv_blkdebug); +} + +block_init(bdrv_blkdebug_init); diff --git a/block/blkverify.c b/block/blkverify.c new file mode 100644 index 000000000..9d5f1ec5b --- /dev/null +++ b/block/blkverify.c @@ -0,0 +1,366 @@ +/* + * Block protocol for block driver correctness testing + * + * Copyright (C) 2010 IBM, Corp. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include <stdarg.h> +#include "qemu_socket.h" /* for EINPROGRESS on Windows */ +#include "block_int.h" + +typedef struct { + BlockDriverState *test_file; +} BDRVBlkverifyState; + +typedef struct BlkverifyAIOCB BlkverifyAIOCB; +struct BlkverifyAIOCB { + BlockDriverAIOCB common; + QEMUBH *bh; + + /* Request metadata */ + bool is_write; + int64_t sector_num; + int nb_sectors; + + int ret; /* first completed request's result */ + unsigned int done; /* completion counter */ + bool *finished; /* completion signal for cancel */ + + QEMUIOVector *qiov; /* user I/O vector */ + QEMUIOVector raw_qiov; /* cloned I/O vector for raw file */ + void *buf; /* buffer for raw file I/O */ + + void (*verify)(BlkverifyAIOCB *acb); +}; + +static void blkverify_aio_cancel(BlockDriverAIOCB *blockacb) +{ + BlkverifyAIOCB *acb = (BlkverifyAIOCB *)blockacb; + bool finished = false; + + /* Wait until request completes, invokes its callback, and frees itself */ + acb->finished = &finished; + while (!finished) { + qemu_aio_wait(); + } +} + +static AIOPool blkverify_aio_pool = { + .aiocb_size = sizeof(BlkverifyAIOCB), + .cancel = blkverify_aio_cancel, +}; + +static void GCC_FMT_ATTR(2, 3) blkverify_err(BlkverifyAIOCB *acb, + const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + fprintf(stderr, "blkverify: %s sector_num=%" PRId64 " nb_sectors=%d ", + acb->is_write ? "write" : "read", acb->sector_num, + acb->nb_sectors); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + va_end(ap); + exit(1); +} + +/* Valid blkverify filenames look like blkverify:path/to/raw_image:path/to/image */ +static int blkverify_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVBlkverifyState *s = bs->opaque; + int ret; + char *raw, *c; + + /* Parse the blkverify: prefix */ + if (strncmp(filename, "blkverify:", strlen("blkverify:"))) { + return -EINVAL; + } + filename += strlen("blkverify:"); + + /* Parse the raw image filename */ + c = strchr(filename, ':'); + if (c == NULL) { + return -EINVAL; + } + + raw = g_strdup(filename); + raw[c - filename] = '\0'; + ret = bdrv_file_open(&bs->file, raw, flags); + g_free(raw); + if (ret < 0) { + return ret; + } + filename = c + 1; + + /* Open the test file */ + s->test_file = bdrv_new(""); + ret = bdrv_open(s->test_file, filename, flags, NULL); + if (ret < 0) { + bdrv_delete(s->test_file); + s->test_file = NULL; + return ret; + } + + return 0; +} + +static void blkverify_close(BlockDriverState *bs) +{ + BDRVBlkverifyState *s = bs->opaque; + + bdrv_delete(s->test_file); + s->test_file = NULL; +} + +static int64_t blkverify_getlength(BlockDriverState *bs) +{ + BDRVBlkverifyState *s = bs->opaque; + + return bdrv_getlength(s->test_file); +} + +/** + * Check that I/O vector contents are identical + * + * @a: I/O vector + * @b: I/O vector + * @ret: Offset to first mismatching byte or -1 if match + */ +static ssize_t blkverify_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) +{ + int i; + ssize_t offset = 0; + + assert(a->niov == b->niov); + for (i = 0; i < a->niov; i++) { + size_t len = 0; + uint8_t *p = (uint8_t *)a->iov[i].iov_base; + uint8_t *q = (uint8_t *)b->iov[i].iov_base; + + assert(a->iov[i].iov_len == b->iov[i].iov_len); + while (len < a->iov[i].iov_len && *p++ == *q++) { + len++; + } + + offset += len; + + if (len != a->iov[i].iov_len) { + return offset; + } + } + return -1; +} + +typedef struct { + int src_index; + struct iovec *src_iov; + void *dest_base; +} IOVectorSortElem; + +static int sortelem_cmp_src_base(const void *a, const void *b) +{ + const IOVectorSortElem *elem_a = a; + const IOVectorSortElem *elem_b = b; + + /* Don't overflow */ + if (elem_a->src_iov->iov_base < elem_b->src_iov->iov_base) { + return -1; + } else if (elem_a->src_iov->iov_base > elem_b->src_iov->iov_base) { + return 1; + } else { + return 0; + } +} + +static int sortelem_cmp_src_index(const void *a, const void *b) +{ + const IOVectorSortElem *elem_a = a; + const IOVectorSortElem *elem_b = b; + + return elem_a->src_index - elem_b->src_index; +} + +/** + * Copy contents of I/O vector + * + * The relative relationships of overlapping iovecs are preserved. This is + * necessary to ensure identical semantics in the cloned I/O vector. + */ +static void blkverify_iovec_clone(QEMUIOVector *dest, const QEMUIOVector *src, + void *buf) +{ + IOVectorSortElem sortelems[src->niov]; + void *last_end; + int i; + + /* Sort by source iovecs by base address */ + for (i = 0; i < src->niov; i++) { + sortelems[i].src_index = i; + sortelems[i].src_iov = &src->iov[i]; + } + qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_base); + + /* Allocate buffer space taking into account overlapping iovecs */ + last_end = NULL; + for (i = 0; i < src->niov; i++) { + struct iovec *cur = sortelems[i].src_iov; + ptrdiff_t rewind = 0; + + /* Detect overlap */ + if (last_end && last_end > cur->iov_base) { + rewind = last_end - cur->iov_base; + } + + sortelems[i].dest_base = buf - rewind; + buf += cur->iov_len - MIN(rewind, cur->iov_len); + last_end = MAX(cur->iov_base + cur->iov_len, last_end); + } + + /* Sort by source iovec index and build destination iovec */ + qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_index); + for (i = 0; i < src->niov; i++) { + qemu_iovec_add(dest, sortelems[i].dest_base, src->iov[i].iov_len); + } +} + +static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write, + int64_t sector_num, QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aio_pool, bs, cb, opaque); + + acb->bh = NULL; + acb->is_write = is_write; + acb->sector_num = sector_num; + acb->nb_sectors = nb_sectors; + acb->ret = -EINPROGRESS; + acb->done = 0; + acb->qiov = qiov; + acb->buf = NULL; + acb->verify = NULL; + acb->finished = NULL; + return acb; +} + +static void blkverify_aio_bh(void *opaque) +{ + BlkverifyAIOCB *acb = opaque; + + qemu_bh_delete(acb->bh); + if (acb->buf) { + qemu_iovec_destroy(&acb->raw_qiov); + qemu_vfree(acb->buf); + } + acb->common.cb(acb->common.opaque, acb->ret); + if (acb->finished) { + *acb->finished = true; + } + qemu_aio_release(acb); +} + +static void blkverify_aio_cb(void *opaque, int ret) +{ + BlkverifyAIOCB *acb = opaque; + + switch (++acb->done) { + case 1: + acb->ret = ret; + break; + + case 2: + if (acb->ret != ret) { + blkverify_err(acb, "return value mismatch %d != %d", acb->ret, ret); + } + + if (acb->verify) { + acb->verify(acb); + } + + acb->bh = qemu_bh_new(blkverify_aio_bh, acb); + qemu_bh_schedule(acb->bh); + break; + } +} + +static void blkverify_verify_readv(BlkverifyAIOCB *acb) +{ + ssize_t offset = blkverify_iovec_compare(acb->qiov, &acb->raw_qiov); + if (offset != -1) { + blkverify_err(acb, "contents mismatch in sector %" PRId64, + acb->sector_num + (int64_t)(offset / BDRV_SECTOR_SIZE)); + } +} + +static BlockDriverAIOCB *blkverify_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BDRVBlkverifyState *s = bs->opaque; + BlkverifyAIOCB *acb = blkverify_aio_get(bs, false, sector_num, qiov, + nb_sectors, cb, opaque); + + acb->verify = blkverify_verify_readv; + acb->buf = qemu_blockalign(bs->file, qiov->size); + qemu_iovec_init(&acb->raw_qiov, acb->qiov->niov); + blkverify_iovec_clone(&acb->raw_qiov, qiov, acb->buf); + + bdrv_aio_readv(s->test_file, sector_num, qiov, nb_sectors, + blkverify_aio_cb, acb); + bdrv_aio_readv(bs->file, sector_num, &acb->raw_qiov, nb_sectors, + blkverify_aio_cb, acb); + return &acb->common; +} + +static BlockDriverAIOCB *blkverify_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BDRVBlkverifyState *s = bs->opaque; + BlkverifyAIOCB *acb = blkverify_aio_get(bs, true, sector_num, qiov, + nb_sectors, cb, opaque); + + bdrv_aio_writev(s->test_file, sector_num, qiov, nb_sectors, + blkverify_aio_cb, acb); + bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, + blkverify_aio_cb, acb); + return &acb->common; +} + +static BlockDriverAIOCB *blkverify_aio_flush(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + BDRVBlkverifyState *s = bs->opaque; + + /* Only flush test file, the raw file is not important */ + return bdrv_aio_flush(s->test_file, cb, opaque); +} + +static BlockDriver bdrv_blkverify = { + .format_name = "blkverify", + .protocol_name = "blkverify", + + .instance_size = sizeof(BDRVBlkverifyState), + + .bdrv_getlength = blkverify_getlength, + + .bdrv_file_open = blkverify_open, + .bdrv_close = blkverify_close, + + .bdrv_aio_readv = blkverify_aio_readv, + .bdrv_aio_writev = blkverify_aio_writev, + .bdrv_aio_flush = blkverify_aio_flush, +}; + +static void bdrv_blkverify_init(void) +{ + bdrv_register(&bdrv_blkverify); +} + +block_init(bdrv_blkverify_init); diff --git a/block/bochs.c b/block/bochs.c new file mode 100644 index 000000000..ab7944dc4 --- /dev/null +++ b/block/bochs.c @@ -0,0 +1,243 @@ +/* + * Block driver for the various disk image formats used by Bochs + * Currently only for "growing" type in read-only mode + * + * Copyright (c) 2005 Alex Beregszaszi + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" + +/**************************************************************/ + +#define HEADER_MAGIC "Bochs Virtual HD Image" +#define HEADER_VERSION 0x00020000 +#define HEADER_V1 0x00010000 +#define HEADER_SIZE 512 + +#define REDOLOG_TYPE "Redolog" +#define GROWING_TYPE "Growing" + +// not allocated: 0xffffffff + +// always little-endian +struct bochs_header_v1 { + char magic[32]; // "Bochs Virtual HD Image" + char type[16]; // "Redolog" + char subtype[16]; // "Undoable" / "Volatile" / "Growing" + uint32_t version; + uint32_t header; // size of header + + union { + struct { + uint32_t catalog; // num of entries + uint32_t bitmap; // bitmap size + uint32_t extent; // extent size + uint64_t disk; // disk size + char padding[HEADER_SIZE - 64 - 8 - 20]; + } redolog; + char padding[HEADER_SIZE - 64 - 8]; + } extra; +}; + +// always little-endian +struct bochs_header { + char magic[32]; // "Bochs Virtual HD Image" + char type[16]; // "Redolog" + char subtype[16]; // "Undoable" / "Volatile" / "Growing" + uint32_t version; + uint32_t header; // size of header + + union { + struct { + uint32_t catalog; // num of entries + uint32_t bitmap; // bitmap size + uint32_t extent; // extent size + uint32_t reserved; // for ??? + uint64_t disk; // disk size + char padding[HEADER_SIZE - 64 - 8 - 24]; + } redolog; + char padding[HEADER_SIZE - 64 - 8]; + } extra; +}; + +typedef struct BDRVBochsState { + CoMutex lock; + uint32_t *catalog_bitmap; + int catalog_size; + + int data_offset; + + int bitmap_blocks; + int extent_blocks; + int extent_size; +} BDRVBochsState; + +static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const struct bochs_header *bochs = (const void *)buf; + + if (buf_size < HEADER_SIZE) + return 0; + + if (!strcmp(bochs->magic, HEADER_MAGIC) && + !strcmp(bochs->type, REDOLOG_TYPE) && + !strcmp(bochs->subtype, GROWING_TYPE) && + ((le32_to_cpu(bochs->version) == HEADER_VERSION) || + (le32_to_cpu(bochs->version) == HEADER_V1))) + return 100; + + return 0; +} + +static int bochs_open(BlockDriverState *bs, int flags) +{ + BDRVBochsState *s = bs->opaque; + int i; + struct bochs_header bochs; + struct bochs_header_v1 header_v1; + + bs->read_only = 1; // no write support yet + + if (bdrv_pread(bs->file, 0, &bochs, sizeof(bochs)) != sizeof(bochs)) { + goto fail; + } + + if (strcmp(bochs.magic, HEADER_MAGIC) || + strcmp(bochs.type, REDOLOG_TYPE) || + strcmp(bochs.subtype, GROWING_TYPE) || + ((le32_to_cpu(bochs.version) != HEADER_VERSION) && + (le32_to_cpu(bochs.version) != HEADER_V1))) { + goto fail; + } + + if (le32_to_cpu(bochs.version) == HEADER_V1) { + memcpy(&header_v1, &bochs, sizeof(bochs)); + bs->total_sectors = le64_to_cpu(header_v1.extra.redolog.disk) / 512; + } else { + bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512; + } + + s->catalog_size = le32_to_cpu(bochs.extra.redolog.catalog); + s->catalog_bitmap = g_malloc(s->catalog_size * 4); + if (bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap, + s->catalog_size * 4) != s->catalog_size * 4) + goto fail; + for (i = 0; i < s->catalog_size; i++) + le32_to_cpus(&s->catalog_bitmap[i]); + + s->data_offset = le32_to_cpu(bochs.header) + (s->catalog_size * 4); + + s->bitmap_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.bitmap) - 1) / 512; + s->extent_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.extent) - 1) / 512; + + s->extent_size = le32_to_cpu(bochs.extra.redolog.extent); + + qemu_co_mutex_init(&s->lock); + return 0; + fail: + return -1; +} + +static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) +{ + BDRVBochsState *s = bs->opaque; + int64_t offset = sector_num * 512; + int64_t extent_index, extent_offset, bitmap_offset; + char bitmap_entry; + + // seek to sector + extent_index = offset / s->extent_size; + extent_offset = (offset % s->extent_size) / 512; + + if (s->catalog_bitmap[extent_index] == 0xffffffff) { + return -1; /* not allocated */ + } + + bitmap_offset = s->data_offset + (512 * s->catalog_bitmap[extent_index] * + (s->extent_blocks + s->bitmap_blocks)); + + /* read in bitmap for current extent */ + if (bdrv_pread(bs->file, bitmap_offset + (extent_offset / 8), + &bitmap_entry, 1) != 1) { + return -1; + } + + if (!((bitmap_entry >> (extent_offset % 8)) & 1)) { + return -1; /* not allocated */ + } + + return bitmap_offset + (512 * (s->bitmap_blocks + extent_offset)); +} + +static int bochs_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + int ret; + + while (nb_sectors > 0) { + int64_t block_offset = seek_to_sector(bs, sector_num); + if (block_offset >= 0) { + ret = bdrv_pread(bs->file, block_offset, buf, 512); + if (ret != 512) { + return -1; + } + } else + memset(buf, 0, 512); + nb_sectors--; + sector_num++; + buf += 512; + } + return 0; +} + +static coroutine_fn int bochs_co_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVBochsState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = bochs_read(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static void bochs_close(BlockDriverState *bs) +{ + BDRVBochsState *s = bs->opaque; + g_free(s->catalog_bitmap); +} + +static BlockDriver bdrv_bochs = { + .format_name = "bochs", + .instance_size = sizeof(BDRVBochsState), + .bdrv_probe = bochs_probe, + .bdrv_open = bochs_open, + .bdrv_read = bochs_co_read, + .bdrv_close = bochs_close, +}; + +static void bdrv_bochs_init(void) +{ + bdrv_register(&bdrv_bochs); +} + +block_init(bdrv_bochs_init); diff --git a/block/cloop.c b/block/cloop.c new file mode 100644 index 000000000..7570eb8e7 --- /dev/null +++ b/block/cloop.c @@ -0,0 +1,195 @@ +/* + * QEMU Block driver for CLOOP images + * + * Copyright (c) 2004 Johannes E. Schindelin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" +#include <zlib.h> + +typedef struct BDRVCloopState { + CoMutex lock; + uint32_t block_size; + uint32_t n_blocks; + uint64_t *offsets; + uint32_t sectors_per_block; + uint32_t current_block; + uint8_t *compressed_block; + uint8_t *uncompressed_block; + z_stream zstream; +} BDRVCloopState; + +static int cloop_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const char *magic_version_2_0 = "#!/bin/sh\n" + "#V2.0 Format\n" + "modprobe cloop file=$0 && mount -r -t iso9660 /dev/cloop $1\n"; + int length = strlen(magic_version_2_0); + if (length > buf_size) { + length = buf_size; + } + if (!memcmp(magic_version_2_0, buf, length)) { + return 2; + } + return 0; +} + +static int cloop_open(BlockDriverState *bs, int flags) +{ + BDRVCloopState *s = bs->opaque; + uint32_t offsets_size, max_compressed_block_size = 1, i; + + bs->read_only = 1; + + /* read header */ + if (bdrv_pread(bs->file, 128, &s->block_size, 4) < 4) { + goto cloop_close; + } + s->block_size = be32_to_cpu(s->block_size); + + if (bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4) < 4) { + goto cloop_close; + } + s->n_blocks = be32_to_cpu(s->n_blocks); + + /* read offsets */ + offsets_size = s->n_blocks * sizeof(uint64_t); + s->offsets = g_malloc(offsets_size); + if (bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size) < + offsets_size) { + goto cloop_close; + } + for(i=0;i<s->n_blocks;i++) { + s->offsets[i] = be64_to_cpu(s->offsets[i]); + if (i > 0) { + uint32_t size = s->offsets[i] - s->offsets[i - 1]; + if (size > max_compressed_block_size) { + max_compressed_block_size = size; + } + } + } + + /* initialize zlib engine */ + s->compressed_block = g_malloc(max_compressed_block_size + 1); + s->uncompressed_block = g_malloc(s->block_size); + if (inflateInit(&s->zstream) != Z_OK) { + goto cloop_close; + } + s->current_block = s->n_blocks; + + s->sectors_per_block = s->block_size/512; + bs->total_sectors = s->n_blocks * s->sectors_per_block; + qemu_co_mutex_init(&s->lock); + return 0; + +cloop_close: + return -1; +} + +static inline int cloop_read_block(BlockDriverState *bs, int block_num) +{ + BDRVCloopState *s = bs->opaque; + + if (s->current_block != block_num) { + int ret; + uint32_t bytes = s->offsets[block_num + 1] - s->offsets[block_num]; + + ret = bdrv_pread(bs->file, s->offsets[block_num], s->compressed_block, + bytes); + if (ret != bytes) { + return -1; + } + + s->zstream.next_in = s->compressed_block; + s->zstream.avail_in = bytes; + s->zstream.next_out = s->uncompressed_block; + s->zstream.avail_out = s->block_size; + ret = inflateReset(&s->zstream); + if (ret != Z_OK) { + return -1; + } + ret = inflate(&s->zstream, Z_FINISH); + if (ret != Z_STREAM_END || s->zstream.total_out != s->block_size) { + return -1; + } + + s->current_block = block_num; + } + return 0; +} + +static int cloop_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVCloopState *s = bs->opaque; + int i; + + for (i = 0; i < nb_sectors; i++) { + uint32_t sector_offset_in_block = + ((sector_num + i) % s->sectors_per_block), + block_num = (sector_num + i) / s->sectors_per_block; + if (cloop_read_block(bs, block_num) != 0) { + return -1; + } + memcpy(buf + i * 512, + s->uncompressed_block + sector_offset_in_block * 512, 512); + } + return 0; +} + +static coroutine_fn int cloop_co_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVCloopState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = cloop_read(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static void cloop_close(BlockDriverState *bs) +{ + BDRVCloopState *s = bs->opaque; + if (s->n_blocks > 0) { + g_free(s->offsets); + } + g_free(s->compressed_block); + g_free(s->uncompressed_block); + inflateEnd(&s->zstream); +} + +static BlockDriver bdrv_cloop = { + .format_name = "cloop", + .instance_size = sizeof(BDRVCloopState), + .bdrv_probe = cloop_probe, + .bdrv_open = cloop_open, + .bdrv_read = cloop_co_read, + .bdrv_close = cloop_close, +}; + +static void bdrv_cloop_init(void) +{ + bdrv_register(&bdrv_cloop); +} + +block_init(bdrv_cloop_init); diff --git a/block/cow.c b/block/cow.c new file mode 100644 index 000000000..a5a00eb9c --- /dev/null +++ b/block/cow.c @@ -0,0 +1,356 @@ +/* + * Block driver for the COW format + * + * Copyright (c) 2004 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" + +/**************************************************************/ +/* COW block driver using file system holes */ + +/* user mode linux compatible COW file */ +#define COW_MAGIC 0x4f4f4f4d /* MOOO */ +#define COW_VERSION 2 + +struct cow_header_v2 { + uint32_t magic; + uint32_t version; + char backing_file[1024]; + int32_t mtime; + uint64_t size; + uint32_t sectorsize; +}; + +typedef struct BDRVCowState { + CoMutex lock; + int64_t cow_sectors_offset; +} BDRVCowState; + +static int cow_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const struct cow_header_v2 *cow_header = (const void *)buf; + + if (buf_size >= sizeof(struct cow_header_v2) && + be32_to_cpu(cow_header->magic) == COW_MAGIC && + be32_to_cpu(cow_header->version) == COW_VERSION) + return 100; + else + return 0; +} + +static int cow_open(BlockDriverState *bs, int flags) +{ + BDRVCowState *s = bs->opaque; + struct cow_header_v2 cow_header; + int bitmap_size; + int64_t size; + int ret; + + /* see if it is a cow image */ + ret = bdrv_pread(bs->file, 0, &cow_header, sizeof(cow_header)); + if (ret < 0) { + goto fail; + } + + if (be32_to_cpu(cow_header.magic) != COW_MAGIC) { + ret = -EINVAL; + goto fail; + } + + if (be32_to_cpu(cow_header.version) != COW_VERSION) { + char version[64]; + snprintf(version, sizeof(version), + "COW version %d", cow_header.version); + qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bs->device_name, "cow", version); + ret = -ENOTSUP; + goto fail; + } + + /* cow image found */ + size = be64_to_cpu(cow_header.size); + bs->total_sectors = size / 512; + + pstrcpy(bs->backing_file, sizeof(bs->backing_file), + cow_header.backing_file); + + bitmap_size = ((bs->total_sectors + 7) >> 3) + sizeof(cow_header); + s->cow_sectors_offset = (bitmap_size + 511) & ~511; + qemu_co_mutex_init(&s->lock); + return 0; + fail: + return ret; +} + +/* + * XXX(hch): right now these functions are extremely inefficient. + * We should just read the whole bitmap we'll need in one go instead. + */ +static inline int cow_set_bit(BlockDriverState *bs, int64_t bitnum) +{ + uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8; + uint8_t bitmap; + int ret; + + ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap)); + if (ret < 0) { + return ret; + } + + bitmap |= (1 << (bitnum % 8)); + + ret = bdrv_pwrite_sync(bs->file, offset, &bitmap, sizeof(bitmap)); + if (ret < 0) { + return ret; + } + return 0; +} + +static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum) +{ + uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8; + uint8_t bitmap; + int ret; + + ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap)); + if (ret < 0) { + return ret; + } + + return !!(bitmap & (1 << (bitnum % 8))); +} + +/* Return true if first block has been changed (ie. current version is + * in COW file). Set the number of continuous blocks for which that + * is true. */ +static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *num_same) +{ + int changed; + + if (nb_sectors == 0) { + *num_same = nb_sectors; + return 0; + } + + changed = is_bit_set(bs, sector_num); + if (changed < 0) { + return 0; /* XXX: how to return I/O errors? */ + } + + for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) { + if (is_bit_set(bs, sector_num + *num_same) != changed) + break; + } + + return changed; +} + +static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + int error = 0; + int i; + + for (i = 0; i < nb_sectors; i++) { + error = cow_set_bit(bs, sector_num + i); + if (error) { + break; + } + } + + return error; +} + +static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVCowState *s = bs->opaque; + int ret, n; + + while (nb_sectors > 0) { + if (bdrv_co_is_allocated(bs, sector_num, nb_sectors, &n)) { + ret = bdrv_pread(bs->file, + s->cow_sectors_offset + sector_num * 512, + buf, n * 512); + if (ret < 0) { + return ret; + } + } else { + if (bs->backing_hd) { + /* read from the base image */ + ret = bdrv_read(bs->backing_hd, sector_num, buf, n); + if (ret < 0) { + return ret; + } + } else { + memset(buf, 0, n * 512); + } + } + nb_sectors -= n; + sector_num += n; + buf += n * 512; + } + return 0; +} + +static coroutine_fn int cow_co_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVCowState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = cow_read(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static int cow_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVCowState *s = bs->opaque; + int ret; + + ret = bdrv_pwrite(bs->file, s->cow_sectors_offset + sector_num * 512, + buf, nb_sectors * 512); + if (ret < 0) { + return ret; + } + + return cow_update_bitmap(bs, sector_num, nb_sectors); +} + +static coroutine_fn int cow_co_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVCowState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = cow_write(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static void cow_close(BlockDriverState *bs) +{ +} + +static int cow_create(const char *filename, QEMUOptionParameter *options) +{ + struct cow_header_v2 cow_header; + struct stat st; + int64_t image_sectors = 0; + const char *image_filename = NULL; + int ret; + BlockDriverState *cow_bs; + + /* Read out options */ + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + image_sectors = options->value.n / 512; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { + image_filename = options->value.s; + } + options++; + } + + ret = bdrv_create_file(filename, options); + if (ret < 0) { + return ret; + } + + ret = bdrv_file_open(&cow_bs, filename, BDRV_O_RDWR); + if (ret < 0) { + return ret; + } + + memset(&cow_header, 0, sizeof(cow_header)); + cow_header.magic = cpu_to_be32(COW_MAGIC); + cow_header.version = cpu_to_be32(COW_VERSION); + if (image_filename) { + /* Note: if no file, we put a dummy mtime */ + cow_header.mtime = cpu_to_be32(0); + + if (stat(image_filename, &st) != 0) { + goto mtime_fail; + } + cow_header.mtime = cpu_to_be32(st.st_mtime); + mtime_fail: + pstrcpy(cow_header.backing_file, sizeof(cow_header.backing_file), + image_filename); + } + cow_header.sectorsize = cpu_to_be32(512); + cow_header.size = cpu_to_be64(image_sectors * 512); + ret = bdrv_pwrite(cow_bs, 0, &cow_header, sizeof(cow_header)); + if (ret < 0) { + goto exit; + } + + /* resize to include at least all the bitmap */ + ret = bdrv_truncate(cow_bs, + sizeof(cow_header) + ((image_sectors + 7) >> 3)); + if (ret < 0) { + goto exit; + } + +exit: + bdrv_delete(cow_bs); + return ret; +} + +static QEMUOptionParameter cow_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = OPT_STRING, + .help = "File name of a base image" + }, + { NULL } +}; + +static BlockDriver bdrv_cow = { + .format_name = "cow", + .instance_size = sizeof(BDRVCowState), + + .bdrv_probe = cow_probe, + .bdrv_open = cow_open, + .bdrv_close = cow_close, + .bdrv_create = cow_create, + + .bdrv_read = cow_co_read, + .bdrv_write = cow_co_write, + .bdrv_co_is_allocated = cow_co_is_allocated, + + .create_options = cow_create_options, +}; + +static void bdrv_cow_init(void) +{ + bdrv_register(&bdrv_cow); +} + +block_init(bdrv_cow_init); diff --git a/block/curl.c b/block/curl.c new file mode 100644 index 000000000..e7c3634d3 --- /dev/null +++ b/block/curl.c @@ -0,0 +1,624 @@ +/* + * QEMU Block driver for CURL images + * + * Copyright (c) 2009 Alexander Graf <agraf@suse.de> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block_int.h" +#include <curl/curl.h> + +// #define DEBUG +// #define DEBUG_VERBOSE + +#ifdef DEBUG_CURL +#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) do { } while (0) +#endif + +#define CURL_NUM_STATES 8 +#define CURL_NUM_ACB 8 +#define SECTOR_SIZE 512 +#define READ_AHEAD_SIZE (256 * 1024) + +#define FIND_RET_NONE 0 +#define FIND_RET_OK 1 +#define FIND_RET_WAIT 2 + +struct BDRVCURLState; + +typedef struct CURLAIOCB { + BlockDriverAIOCB common; + QEMUBH *bh; + QEMUIOVector *qiov; + + int64_t sector_num; + int nb_sectors; + + size_t start; + size_t end; +} CURLAIOCB; + +typedef struct CURLState +{ + struct BDRVCURLState *s; + CURLAIOCB *acb[CURL_NUM_ACB]; + CURL *curl; + char *orig_buf; + size_t buf_start; + size_t buf_off; + size_t buf_len; + char range[128]; + char errmsg[CURL_ERROR_SIZE]; + char in_use; +} CURLState; + +typedef struct BDRVCURLState { + CURLM *multi; + size_t len; + CURLState states[CURL_NUM_STATES]; + char *url; + size_t readahead_size; +} BDRVCURLState; + +static void curl_clean_state(CURLState *s); +static void curl_multi_do(void *arg); +static int curl_aio_flush(void *opaque); + +static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, + void *s, void *sp) +{ + DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, fd); + switch (action) { + case CURL_POLL_IN: + qemu_aio_set_fd_handler(fd, curl_multi_do, NULL, curl_aio_flush, s); + break; + case CURL_POLL_OUT: + qemu_aio_set_fd_handler(fd, NULL, curl_multi_do, curl_aio_flush, s); + break; + case CURL_POLL_INOUT: + qemu_aio_set_fd_handler(fd, curl_multi_do, curl_multi_do, + curl_aio_flush, s); + break; + case CURL_POLL_REMOVE: + qemu_aio_set_fd_handler(fd, NULL, NULL, NULL, NULL); + break; + } + + return 0; +} + +static size_t curl_size_cb(void *ptr, size_t size, size_t nmemb, void *opaque) +{ + CURLState *s = ((CURLState*)opaque); + size_t realsize = size * nmemb; + size_t fsize; + + if(sscanf(ptr, "Content-Length: %zd", &fsize) == 1) { + s->s->len = fsize; + } + + return realsize; +} + +static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque) +{ + CURLState *s = ((CURLState*)opaque); + size_t realsize = size * nmemb; + int i; + + DPRINTF("CURL: Just reading %zd bytes\n", realsize); + + if (!s || !s->orig_buf) + goto read_end; + + memcpy(s->orig_buf + s->buf_off, ptr, realsize); + s->buf_off += realsize; + + for(i=0; i<CURL_NUM_ACB; i++) { + CURLAIOCB *acb = s->acb[i]; + + if (!acb) + continue; + + if ((s->buf_off >= acb->end)) { + qemu_iovec_from_buf(acb->qiov, 0, s->orig_buf + acb->start, + acb->end - acb->start); + acb->common.cb(acb->common.opaque, 0); + qemu_aio_release(acb); + s->acb[i] = NULL; + } + } + +read_end: + return realsize; +} + +static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len, + CURLAIOCB *acb) +{ + int i; + size_t end = start + len; + + for (i=0; i<CURL_NUM_STATES; i++) { + CURLState *state = &s->states[i]; + size_t buf_end = (state->buf_start + state->buf_off); + size_t buf_fend = (state->buf_start + state->buf_len); + + if (!state->orig_buf) + continue; + if (!state->buf_off) + continue; + + // Does the existing buffer cover our section? + if ((start >= state->buf_start) && + (start <= buf_end) && + (end >= state->buf_start) && + (end <= buf_end)) + { + char *buf = state->orig_buf + (start - state->buf_start); + + qemu_iovec_from_buf(acb->qiov, 0, buf, len); + acb->common.cb(acb->common.opaque, 0); + + return FIND_RET_OK; + } + + // Wait for unfinished chunks + if ((start >= state->buf_start) && + (start <= buf_fend) && + (end >= state->buf_start) && + (end <= buf_fend)) + { + int j; + + acb->start = start - state->buf_start; + acb->end = acb->start + len; + + for (j=0; j<CURL_NUM_ACB; j++) { + if (!state->acb[j]) { + state->acb[j] = acb; + return FIND_RET_WAIT; + } + } + } + } + + return FIND_RET_NONE; +} + +static void curl_multi_do(void *arg) +{ + BDRVCURLState *s = (BDRVCURLState *)arg; + int running; + int r; + int msgs_in_queue; + + if (!s->multi) + return; + + do { + r = curl_multi_socket_all(s->multi, &running); + } while(r == CURLM_CALL_MULTI_PERFORM); + + /* Try to find done transfers, so we can free the easy + * handle again. */ + do { + CURLMsg *msg; + msg = curl_multi_info_read(s->multi, &msgs_in_queue); + + if (!msg) + break; + if (msg->msg == CURLMSG_NONE) + break; + + switch (msg->msg) { + case CURLMSG_DONE: + { + CURLState *state = NULL; + curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, (char**)&state); + + /* ACBs for successful messages get completed in curl_read_cb */ + if (msg->data.result != CURLE_OK) { + int i; + for (i = 0; i < CURL_NUM_ACB; i++) { + CURLAIOCB *acb = state->acb[i]; + + if (acb == NULL) { + continue; + } + + acb->common.cb(acb->common.opaque, -EIO); + qemu_aio_release(acb); + state->acb[i] = NULL; + } + } + + curl_clean_state(state); + break; + } + default: + msgs_in_queue = 0; + break; + } + } while(msgs_in_queue); +} + +static CURLState *curl_init_state(BDRVCURLState *s) +{ + CURLState *state = NULL; + int i, j; + + do { + for (i=0; i<CURL_NUM_STATES; i++) { + for (j=0; j<CURL_NUM_ACB; j++) + if (s->states[i].acb[j]) + continue; + if (s->states[i].in_use) + continue; + + state = &s->states[i]; + state->in_use = 1; + break; + } + if (!state) { + g_usleep(100); + curl_multi_do(s); + } + } while(!state); + + if (state->curl) + goto has_curl; + + state->curl = curl_easy_init(); + if (!state->curl) + return NULL; + curl_easy_setopt(state->curl, CURLOPT_URL, s->url); + curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, 5); + curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, (void *)curl_read_cb); + curl_easy_setopt(state->curl, CURLOPT_WRITEDATA, (void *)state); + curl_easy_setopt(state->curl, CURLOPT_PRIVATE, (void *)state); + curl_easy_setopt(state->curl, CURLOPT_AUTOREFERER, 1); + curl_easy_setopt(state->curl, CURLOPT_FOLLOWLOCATION, 1); + curl_easy_setopt(state->curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg); + curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1); + +#ifdef DEBUG_VERBOSE + curl_easy_setopt(state->curl, CURLOPT_VERBOSE, 1); +#endif + +has_curl: + + state->s = s; + + return state; +} + +static void curl_clean_state(CURLState *s) +{ + if (s->s->multi) + curl_multi_remove_handle(s->s->multi, s->curl); + s->in_use = 0; +} + +static int curl_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVCURLState *s = bs->opaque; + CURLState *state = NULL; + double d; + + #define RA_OPTSTR ":readahead=" + char *file; + char *ra; + const char *ra_val; + int parse_state = 0; + + static int inited = 0; + + file = g_strdup(filename); + s->readahead_size = READ_AHEAD_SIZE; + + /* Parse a trailing ":readahead=#:" param, if present. */ + ra = file + strlen(file) - 1; + while (ra >= file) { + if (parse_state == 0) { + if (*ra == ':') + parse_state++; + else + break; + } else if (parse_state == 1) { + if (*ra > '9' || *ra < '0') { + char *opt_start = ra - strlen(RA_OPTSTR) + 1; + if (opt_start > file && + strncmp(opt_start, RA_OPTSTR, strlen(RA_OPTSTR)) == 0) { + ra_val = ra + 1; + ra -= strlen(RA_OPTSTR) - 1; + *ra = '\0'; + s->readahead_size = atoi(ra_val); + break; + } else { + break; + } + } + } + ra--; + } + + if ((s->readahead_size & 0x1ff) != 0) { + fprintf(stderr, "HTTP_READAHEAD_SIZE %zd is not a multiple of 512\n", + s->readahead_size); + goto out_noclean; + } + + if (!inited) { + curl_global_init(CURL_GLOBAL_ALL); + inited = 1; + } + + DPRINTF("CURL: Opening %s\n", file); + s->url = file; + state = curl_init_state(s); + if (!state) + goto out_noclean; + + // Get file size + + curl_easy_setopt(state->curl, CURLOPT_NOBODY, 1); + curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, (void *)curl_size_cb); + if (curl_easy_perform(state->curl)) + goto out; + curl_easy_getinfo(state->curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &d); + curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, (void *)curl_read_cb); + curl_easy_setopt(state->curl, CURLOPT_NOBODY, 0); + if (d) + s->len = (size_t)d; + else if(!s->len) + goto out; + DPRINTF("CURL: Size = %zd\n", s->len); + + curl_clean_state(state); + curl_easy_cleanup(state->curl); + state->curl = NULL; + + // Now we know the file exists and its size, so let's + // initialize the multi interface! + + s->multi = curl_multi_init(); + curl_multi_setopt( s->multi, CURLMOPT_SOCKETDATA, s); + curl_multi_setopt( s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb ); + curl_multi_do(s); + + return 0; + +out: + fprintf(stderr, "CURL: Error opening file: %s\n", state->errmsg); + curl_easy_cleanup(state->curl); + state->curl = NULL; +out_noclean: + g_free(file); + return -EINVAL; +} + +static int curl_aio_flush(void *opaque) +{ + BDRVCURLState *s = opaque; + int i, j; + + for (i=0; i < CURL_NUM_STATES; i++) { + for(j=0; j < CURL_NUM_ACB; j++) { + if (s->states[i].acb[j]) { + return 1; + } + } + } + return 0; +} + +static void curl_aio_cancel(BlockDriverAIOCB *blockacb) +{ + // Do we have to implement canceling? Seems to work without... +} + +static AIOPool curl_aio_pool = { + .aiocb_size = sizeof(CURLAIOCB), + .cancel = curl_aio_cancel, +}; + + +static void curl_readv_bh_cb(void *p) +{ + CURLState *state; + + CURLAIOCB *acb = p; + BDRVCURLState *s = acb->common.bs->opaque; + + qemu_bh_delete(acb->bh); + acb->bh = NULL; + + size_t start = acb->sector_num * SECTOR_SIZE; + size_t end; + + // In case we have the requested data already (e.g. read-ahead), + // we can just call the callback and be done. + switch (curl_find_buf(s, start, acb->nb_sectors * SECTOR_SIZE, acb)) { + case FIND_RET_OK: + qemu_aio_release(acb); + // fall through + case FIND_RET_WAIT: + return; + default: + break; + } + + // No cache found, so let's start a new request + state = curl_init_state(s); + if (!state) { + acb->common.cb(acb->common.opaque, -EIO); + qemu_aio_release(acb); + return; + } + + acb->start = 0; + acb->end = (acb->nb_sectors * SECTOR_SIZE); + + state->buf_off = 0; + if (state->orig_buf) + g_free(state->orig_buf); + state->buf_start = start; + state->buf_len = acb->end + s->readahead_size; + end = MIN(start + state->buf_len, s->len) - 1; + state->orig_buf = g_malloc(state->buf_len); + state->acb[0] = acb; + + snprintf(state->range, 127, "%zd-%zd", start, end); + DPRINTF("CURL (AIO): Reading %d at %zd (%s)\n", + (acb->nb_sectors * SECTOR_SIZE), start, state->range); + curl_easy_setopt(state->curl, CURLOPT_RANGE, state->range); + + curl_multi_add_handle(s->multi, state->curl); + curl_multi_do(s); + +} + +static BlockDriverAIOCB *curl_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + CURLAIOCB *acb; + + acb = qemu_aio_get(&curl_aio_pool, bs, cb, opaque); + + acb->qiov = qiov; + acb->sector_num = sector_num; + acb->nb_sectors = nb_sectors; + + acb->bh = qemu_bh_new(curl_readv_bh_cb, acb); + + if (!acb->bh) { + DPRINTF("CURL: qemu_bh_new failed\n"); + return NULL; + } + + qemu_bh_schedule(acb->bh); + return &acb->common; +} + +static void curl_close(BlockDriverState *bs) +{ + BDRVCURLState *s = bs->opaque; + int i; + + DPRINTF("CURL: Close\n"); + for (i=0; i<CURL_NUM_STATES; i++) { + if (s->states[i].in_use) + curl_clean_state(&s->states[i]); + if (s->states[i].curl) { + curl_easy_cleanup(s->states[i].curl); + s->states[i].curl = NULL; + } + if (s->states[i].orig_buf) { + g_free(s->states[i].orig_buf); + s->states[i].orig_buf = NULL; + } + } + if (s->multi) + curl_multi_cleanup(s->multi); + if (s->url) + free(s->url); +} + +static int64_t curl_getlength(BlockDriverState *bs) +{ + BDRVCURLState *s = bs->opaque; + return s->len; +} + +static BlockDriver bdrv_http = { + .format_name = "http", + .protocol_name = "http", + + .instance_size = sizeof(BDRVCURLState), + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, + + .bdrv_aio_readv = curl_aio_readv, +}; + +static BlockDriver bdrv_https = { + .format_name = "https", + .protocol_name = "https", + + .instance_size = sizeof(BDRVCURLState), + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, + + .bdrv_aio_readv = curl_aio_readv, +}; + +static BlockDriver bdrv_ftp = { + .format_name = "ftp", + .protocol_name = "ftp", + + .instance_size = sizeof(BDRVCURLState), + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, + + .bdrv_aio_readv = curl_aio_readv, +}; + +static BlockDriver bdrv_ftps = { + .format_name = "ftps", + .protocol_name = "ftps", + + .instance_size = sizeof(BDRVCURLState), + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, + + .bdrv_aio_readv = curl_aio_readv, +}; + +static BlockDriver bdrv_tftp = { + .format_name = "tftp", + .protocol_name = "tftp", + + .instance_size = sizeof(BDRVCURLState), + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, + + .bdrv_aio_readv = curl_aio_readv, +}; + +static void curl_block_init(void) +{ + bdrv_register(&bdrv_http); + bdrv_register(&bdrv_https); + bdrv_register(&bdrv_ftp); + bdrv_register(&bdrv_ftps); + bdrv_register(&bdrv_tftp); +} + +block_init(curl_block_init); diff --git a/block/dmg.c b/block/dmg.c new file mode 100644 index 000000000..37902a434 --- /dev/null +++ b/block/dmg.c @@ -0,0 +1,325 @@ +/* + * QEMU Block driver for DMG images + * + * Copyright (c) 2004 Johannes E. Schindelin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block_int.h" +#include "bswap.h" +#include "module.h" +#include <zlib.h> + +typedef struct BDRVDMGState { + CoMutex lock; + /* each chunk contains a certain number of sectors, + * offsets[i] is the offset in the .dmg file, + * lengths[i] is the length of the compressed chunk, + * sectors[i] is the sector beginning at offsets[i], + * sectorcounts[i] is the number of sectors in that chunk, + * the sectors array is ordered + * 0<=i<n_chunks */ + + uint32_t n_chunks; + uint32_t* types; + uint64_t* offsets; + uint64_t* lengths; + uint64_t* sectors; + uint64_t* sectorcounts; + uint32_t current_chunk; + uint8_t *compressed_chunk; + uint8_t *uncompressed_chunk; + z_stream zstream; +} BDRVDMGState; + +static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + int len=strlen(filename); + if(len>4 && !strcmp(filename+len-4,".dmg")) + return 2; + return 0; +} + +static off_t read_off(BlockDriverState *bs, int64_t offset) +{ + uint64_t buffer; + if (bdrv_pread(bs->file, offset, &buffer, 8) < 8) + return 0; + return be64_to_cpu(buffer); +} + +static off_t read_uint32(BlockDriverState *bs, int64_t offset) +{ + uint32_t buffer; + if (bdrv_pread(bs->file, offset, &buffer, 4) < 4) + return 0; + return be32_to_cpu(buffer); +} + +static int dmg_open(BlockDriverState *bs, int flags) +{ + BDRVDMGState *s = bs->opaque; + off_t info_begin,info_end,last_in_offset,last_out_offset; + uint32_t count; + uint32_t max_compressed_size=1,max_sectors_per_chunk=1,i; + int64_t offset; + + bs->read_only = 1; + s->n_chunks = 0; + s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL; + + /* read offset of info blocks */ + offset = bdrv_getlength(bs->file); + if (offset < 0) { + goto fail; + } + offset -= 0x1d8; + + info_begin = read_off(bs, offset); + if (info_begin == 0) { + goto fail; + } + + if (read_uint32(bs, info_begin) != 0x100) { + goto fail; + } + + count = read_uint32(bs, info_begin + 4); + if (count == 0) { + goto fail; + } + info_end = info_begin + count; + + offset = info_begin + 0x100; + + /* read offsets */ + last_in_offset = last_out_offset = 0; + while (offset < info_end) { + uint32_t type; + + count = read_uint32(bs, offset); + if(count==0) + goto fail; + offset += 4; + + type = read_uint32(bs, offset); + if (type == 0x6d697368 && count >= 244) { + int new_size, chunk_count; + + offset += 4; + offset += 200; + + chunk_count = (count-204)/40; + new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count); + s->types = g_realloc(s->types, new_size/2); + s->offsets = g_realloc(s->offsets, new_size); + s->lengths = g_realloc(s->lengths, new_size); + s->sectors = g_realloc(s->sectors, new_size); + s->sectorcounts = g_realloc(s->sectorcounts, new_size); + + for(i=s->n_chunks;i<s->n_chunks+chunk_count;i++) { + s->types[i] = read_uint32(bs, offset); + offset += 4; + if(s->types[i]!=0x80000005 && s->types[i]!=1 && s->types[i]!=2) { + if(s->types[i]==0xffffffff) { + last_in_offset = s->offsets[i-1]+s->lengths[i-1]; + last_out_offset = s->sectors[i-1]+s->sectorcounts[i-1]; + } + chunk_count--; + i--; + offset += 36; + continue; + } + offset += 4; + + s->sectors[i] = last_out_offset+read_off(bs, offset); + offset += 8; + + s->sectorcounts[i] = read_off(bs, offset); + offset += 8; + + s->offsets[i] = last_in_offset+read_off(bs, offset); + offset += 8; + + s->lengths[i] = read_off(bs, offset); + offset += 8; + + if(s->lengths[i]>max_compressed_size) + max_compressed_size = s->lengths[i]; + if(s->sectorcounts[i]>max_sectors_per_chunk) + max_sectors_per_chunk = s->sectorcounts[i]; + } + s->n_chunks+=chunk_count; + } + } + + /* initialize zlib engine */ + s->compressed_chunk = g_malloc(max_compressed_size+1); + s->uncompressed_chunk = g_malloc(512*max_sectors_per_chunk); + if(inflateInit(&s->zstream) != Z_OK) + goto fail; + + s->current_chunk = s->n_chunks; + + qemu_co_mutex_init(&s->lock); + return 0; +fail: + return -1; +} + +static inline int is_sector_in_chunk(BDRVDMGState* s, + uint32_t chunk_num,int sector_num) +{ + if(chunk_num>=s->n_chunks || s->sectors[chunk_num]>sector_num || + s->sectors[chunk_num]+s->sectorcounts[chunk_num]<=sector_num) + return 0; + else + return -1; +} + +static inline uint32_t search_chunk(BDRVDMGState* s,int sector_num) +{ + /* binary search */ + uint32_t chunk1=0,chunk2=s->n_chunks,chunk3; + while(chunk1!=chunk2) { + chunk3 = (chunk1+chunk2)/2; + if(s->sectors[chunk3]>sector_num) + chunk2 = chunk3; + else if(s->sectors[chunk3]+s->sectorcounts[chunk3]>sector_num) + return chunk3; + else + chunk1 = chunk3; + } + return s->n_chunks; /* error */ +} + +static inline int dmg_read_chunk(BlockDriverState *bs, int sector_num) +{ + BDRVDMGState *s = bs->opaque; + + if(!is_sector_in_chunk(s,s->current_chunk,sector_num)) { + int ret; + uint32_t chunk = search_chunk(s,sector_num); + + if(chunk>=s->n_chunks) + return -1; + + s->current_chunk = s->n_chunks; + switch(s->types[chunk]) { + case 0x80000005: { /* zlib compressed */ + int i; + + /* we need to buffer, because only the chunk as whole can be + * inflated. */ + i=0; + do { + ret = bdrv_pread(bs->file, s->offsets[chunk] + i, + s->compressed_chunk+i, s->lengths[chunk]-i); + if(ret<0 && errno==EINTR) + ret=0; + i+=ret; + } while(ret>=0 && ret+i<s->lengths[chunk]); + + if (ret != s->lengths[chunk]) + return -1; + + s->zstream.next_in = s->compressed_chunk; + s->zstream.avail_in = s->lengths[chunk]; + s->zstream.next_out = s->uncompressed_chunk; + s->zstream.avail_out = 512*s->sectorcounts[chunk]; + ret = inflateReset(&s->zstream); + if(ret != Z_OK) + return -1; + ret = inflate(&s->zstream, Z_FINISH); + if(ret != Z_STREAM_END || s->zstream.total_out != 512*s->sectorcounts[chunk]) + return -1; + break; } + case 1: /* copy */ + ret = bdrv_pread(bs->file, s->offsets[chunk], + s->uncompressed_chunk, s->lengths[chunk]); + if (ret != s->lengths[chunk]) + return -1; + break; + case 2: /* zero */ + memset(s->uncompressed_chunk, 0, 512*s->sectorcounts[chunk]); + break; + } + s->current_chunk = chunk; + } + return 0; +} + +static int dmg_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVDMGState *s = bs->opaque; + int i; + + for(i=0;i<nb_sectors;i++) { + uint32_t sector_offset_in_chunk; + if(dmg_read_chunk(bs, sector_num+i) != 0) + return -1; + sector_offset_in_chunk = sector_num+i-s->sectors[s->current_chunk]; + memcpy(buf+i*512,s->uncompressed_chunk+sector_offset_in_chunk*512,512); + } + return 0; +} + +static coroutine_fn int dmg_co_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVDMGState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = dmg_read(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static void dmg_close(BlockDriverState *bs) +{ + BDRVDMGState *s = bs->opaque; + if(s->n_chunks>0) { + free(s->types); + free(s->offsets); + free(s->lengths); + free(s->sectors); + free(s->sectorcounts); + } + free(s->compressed_chunk); + free(s->uncompressed_chunk); + inflateEnd(&s->zstream); +} + +static BlockDriver bdrv_dmg = { + .format_name = "dmg", + .instance_size = sizeof(BDRVDMGState), + .bdrv_probe = dmg_probe, + .bdrv_open = dmg_open, + .bdrv_read = dmg_co_read, + .bdrv_close = dmg_close, +}; + +static void bdrv_dmg_init(void) +{ + bdrv_register(&bdrv_dmg); +} + +block_init(bdrv_dmg_init); diff --git a/block/iscsi.c b/block/iscsi.c new file mode 100644 index 000000000..0b96165ec --- /dev/null +++ b/block/iscsi.c @@ -0,0 +1,1086 @@ +/* + * QEMU Block driver for iSCSI images + * + * Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "config-host.h" + +#include <poll.h> +#include <arpa/inet.h> +#include "qemu-common.h" +#include "qemu-error.h" +#include "block_int.h" +#include "trace.h" +#include "hw/scsi-defs.h" + +#include <iscsi/iscsi.h> +#include <iscsi/scsi-lowlevel.h> + +#ifdef __linux__ +#include <scsi/sg.h> +#include <hw/scsi-defs.h> +#endif + +typedef struct IscsiLun { + struct iscsi_context *iscsi; + int lun; + enum scsi_inquiry_peripheral_device_type type; + int block_size; + uint64_t num_blocks; + int events; +} IscsiLun; + +typedef struct IscsiAIOCB { + BlockDriverAIOCB common; + QEMUIOVector *qiov; + QEMUBH *bh; + IscsiLun *iscsilun; + struct scsi_task *task; + uint8_t *buf; + int status; + int canceled; + size_t read_size; + size_t read_offset; +#ifdef __linux__ + sg_io_hdr_t *ioh; +#endif +} IscsiAIOCB; + +struct IscsiTask { + IscsiLun *iscsilun; + BlockDriverState *bs; + int status; + int complete; +}; + +static void +iscsi_bh_cb(void *p) +{ + IscsiAIOCB *acb = p; + + qemu_bh_delete(acb->bh); + + if (acb->canceled == 0) { + acb->common.cb(acb->common.opaque, acb->status); + } + + if (acb->task != NULL) { + scsi_free_scsi_task(acb->task); + acb->task = NULL; + } + + qemu_aio_release(acb); +} + +static void +iscsi_schedule_bh(IscsiAIOCB *acb) +{ + if (acb->bh) { + return; + } + acb->bh = qemu_bh_new(iscsi_bh_cb, acb); + qemu_bh_schedule(acb->bh); +} + + +static void +iscsi_abort_task_cb(struct iscsi_context *iscsi, int status, void *command_data, + void *private_data) +{ + IscsiAIOCB *acb = private_data; + + acb->status = -ECANCELED; + iscsi_schedule_bh(acb); +} + +static void +iscsi_aio_cancel(BlockDriverAIOCB *blockacb) +{ + IscsiAIOCB *acb = (IscsiAIOCB *)blockacb; + IscsiLun *iscsilun = acb->iscsilun; + + if (acb->status != -EINPROGRESS) { + return; + } + + acb->canceled = 1; + + /* send a task mgmt call to the target to cancel the task on the target */ + iscsi_task_mgmt_abort_task_async(iscsilun->iscsi, acb->task, + iscsi_abort_task_cb, acb); + + while (acb->status == -EINPROGRESS) { + qemu_aio_wait(); + } +} + +static AIOPool iscsi_aio_pool = { + .aiocb_size = sizeof(IscsiAIOCB), + .cancel = iscsi_aio_cancel, +}; + + +static void iscsi_process_read(void *arg); +static void iscsi_process_write(void *arg); + +static int iscsi_process_flush(void *arg) +{ + IscsiLun *iscsilun = arg; + + return iscsi_queue_length(iscsilun->iscsi) > 0; +} + +static void +iscsi_set_events(IscsiLun *iscsilun) +{ + struct iscsi_context *iscsi = iscsilun->iscsi; + int ev; + + /* We always register a read handler. */ + ev = POLLIN; + ev |= iscsi_which_events(iscsi); + if (ev != iscsilun->events) { + qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), + iscsi_process_read, + (ev & POLLOUT) ? iscsi_process_write : NULL, + iscsi_process_flush, + iscsilun); + + } + + /* If we just added an event, the callback might be delayed + * unless we call qemu_notify_event(). + */ + if (ev & ~iscsilun->events) { + qemu_notify_event(); + } + iscsilun->events = ev; +} + +static void +iscsi_process_read(void *arg) +{ + IscsiLun *iscsilun = arg; + struct iscsi_context *iscsi = iscsilun->iscsi; + + iscsi_service(iscsi, POLLIN); + iscsi_set_events(iscsilun); +} + +static void +iscsi_process_write(void *arg) +{ + IscsiLun *iscsilun = arg; + struct iscsi_context *iscsi = iscsilun->iscsi; + + iscsi_service(iscsi, POLLOUT); + iscsi_set_events(iscsilun); +} + + +static void +iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *opaque) +{ + IscsiAIOCB *acb = opaque; + + trace_iscsi_aio_write16_cb(iscsi, status, acb, acb->canceled); + + g_free(acb->buf); + + if (acb->canceled != 0) { + return; + } + + acb->status = 0; + if (status < 0) { + error_report("Failed to write16 data to iSCSI lun. %s", + iscsi_get_error(iscsi)); + acb->status = -EIO; + } + + iscsi_schedule_bh(acb); +} + +static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun) +{ + return sector * BDRV_SECTOR_SIZE / iscsilun->block_size; +} + +static BlockDriverAIOCB * +iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + IscsiLun *iscsilun = bs->opaque; + struct iscsi_context *iscsi = iscsilun->iscsi; + IscsiAIOCB *acb; + size_t size; + uint32_t num_sectors; + uint64_t lba; + struct iscsi_data data; + + acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque); + trace_iscsi_aio_writev(iscsi, sector_num, nb_sectors, opaque, acb); + + acb->iscsilun = iscsilun; + acb->qiov = qiov; + + acb->canceled = 0; + acb->bh = NULL; + acb->status = -EINPROGRESS; + + /* XXX we should pass the iovec to write16 to avoid the extra copy */ + /* this will allow us to get rid of 'buf' completely */ + size = nb_sectors * BDRV_SECTOR_SIZE; + acb->buf = g_malloc(size); + qemu_iovec_to_buf(acb->qiov, 0, acb->buf, size); + + acb->task = malloc(sizeof(struct scsi_task)); + if (acb->task == NULL) { + error_report("iSCSI: Failed to allocate task for scsi WRITE16 " + "command. %s", iscsi_get_error(iscsi)); + qemu_aio_release(acb); + return NULL; + } + memset(acb->task, 0, sizeof(struct scsi_task)); + + acb->task->xfer_dir = SCSI_XFER_WRITE; + acb->task->cdb_size = 16; + acb->task->cdb[0] = 0x8a; + if (!(bs->open_flags & BDRV_O_CACHE_WB)) { + /* set FUA on writes when cache mode is write through */ + acb->task->cdb[1] |= 0x04; + } + lba = sector_qemu2lun(sector_num, iscsilun); + *(uint32_t *)&acb->task->cdb[2] = htonl(lba >> 32); + *(uint32_t *)&acb->task->cdb[6] = htonl(lba & 0xffffffff); + num_sectors = size / iscsilun->block_size; + *(uint32_t *)&acb->task->cdb[10] = htonl(num_sectors); + acb->task->expxferlen = size; + + data.data = acb->buf; + data.size = size; + + if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task, + iscsi_aio_write16_cb, + &data, + acb) != 0) { + scsi_free_scsi_task(acb->task); + g_free(acb->buf); + qemu_aio_release(acb); + return NULL; + } + + iscsi_set_events(iscsilun); + + return &acb->common; +} + +static void +iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *opaque) +{ + IscsiAIOCB *acb = opaque; + + trace_iscsi_aio_read16_cb(iscsi, status, acb, acb->canceled); + + if (acb->canceled != 0) { + return; + } + + acb->status = 0; + if (status != 0) { + error_report("Failed to read16 data from iSCSI lun. %s", + iscsi_get_error(iscsi)); + acb->status = -EIO; + } + + iscsi_schedule_bh(acb); +} + +static BlockDriverAIOCB * +iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + IscsiLun *iscsilun = bs->opaque; + struct iscsi_context *iscsi = iscsilun->iscsi; + IscsiAIOCB *acb; + size_t qemu_read_size; + int i; + uint64_t lba; + uint32_t num_sectors; + + qemu_read_size = BDRV_SECTOR_SIZE * (size_t)nb_sectors; + + acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque); + trace_iscsi_aio_readv(iscsi, sector_num, nb_sectors, opaque, acb); + + acb->iscsilun = iscsilun; + acb->qiov = qiov; + + acb->canceled = 0; + acb->bh = NULL; + acb->status = -EINPROGRESS; + acb->read_size = qemu_read_size; + acb->buf = NULL; + + /* If LUN blocksize is bigger than BDRV_BLOCK_SIZE a read from QEMU + * may be misaligned to the LUN, so we may need to read some extra + * data. + */ + acb->read_offset = 0; + if (iscsilun->block_size > BDRV_SECTOR_SIZE) { + uint64_t bdrv_offset = BDRV_SECTOR_SIZE * sector_num; + + acb->read_offset = bdrv_offset % iscsilun->block_size; + } + + num_sectors = (qemu_read_size + iscsilun->block_size + + acb->read_offset - 1) + / iscsilun->block_size; + + acb->task = malloc(sizeof(struct scsi_task)); + if (acb->task == NULL) { + error_report("iSCSI: Failed to allocate task for scsi READ16 " + "command. %s", iscsi_get_error(iscsi)); + qemu_aio_release(acb); + return NULL; + } + memset(acb->task, 0, sizeof(struct scsi_task)); + + acb->task->xfer_dir = SCSI_XFER_READ; + lba = sector_qemu2lun(sector_num, iscsilun); + acb->task->expxferlen = qemu_read_size; + + switch (iscsilun->type) { + case TYPE_DISK: + acb->task->cdb_size = 16; + acb->task->cdb[0] = 0x88; + *(uint32_t *)&acb->task->cdb[2] = htonl(lba >> 32); + *(uint32_t *)&acb->task->cdb[6] = htonl(lba & 0xffffffff); + *(uint32_t *)&acb->task->cdb[10] = htonl(num_sectors); + break; + default: + acb->task->cdb_size = 10; + acb->task->cdb[0] = 0x28; + *(uint32_t *)&acb->task->cdb[2] = htonl(lba); + *(uint16_t *)&acb->task->cdb[7] = htons(num_sectors); + break; + } + + if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task, + iscsi_aio_read16_cb, + NULL, + acb) != 0) { + scsi_free_scsi_task(acb->task); + qemu_aio_release(acb); + return NULL; + } + + for (i = 0; i < acb->qiov->niov; i++) { + scsi_task_add_data_in_buffer(acb->task, + acb->qiov->iov[i].iov_len, + acb->qiov->iov[i].iov_base); + } + + iscsi_set_events(iscsilun); + + return &acb->common; +} + + +static void +iscsi_synccache10_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *opaque) +{ + IscsiAIOCB *acb = opaque; + + if (acb->canceled != 0) { + return; + } + + acb->status = 0; + if (status < 0) { + error_report("Failed to sync10 data on iSCSI lun. %s", + iscsi_get_error(iscsi)); + acb->status = -EIO; + } + + iscsi_schedule_bh(acb); +} + +static BlockDriverAIOCB * +iscsi_aio_flush(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque) +{ + IscsiLun *iscsilun = bs->opaque; + struct iscsi_context *iscsi = iscsilun->iscsi; + IscsiAIOCB *acb; + + acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque); + + acb->iscsilun = iscsilun; + acb->canceled = 0; + acb->bh = NULL; + acb->status = -EINPROGRESS; + + acb->task = iscsi_synchronizecache10_task(iscsi, iscsilun->lun, + 0, 0, 0, 0, + iscsi_synccache10_cb, + acb); + if (acb->task == NULL) { + error_report("iSCSI: Failed to send synchronizecache10 command. %s", + iscsi_get_error(iscsi)); + qemu_aio_release(acb); + return NULL; + } + + iscsi_set_events(iscsilun); + + return &acb->common; +} + +static void +iscsi_unmap_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *opaque) +{ + IscsiAIOCB *acb = opaque; + + if (acb->canceled != 0) { + return; + } + + acb->status = 0; + if (status < 0) { + error_report("Failed to unmap data on iSCSI lun. %s", + iscsi_get_error(iscsi)); + acb->status = -EIO; + } + + iscsi_schedule_bh(acb); +} + +static BlockDriverAIOCB * +iscsi_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + IscsiLun *iscsilun = bs->opaque; + struct iscsi_context *iscsi = iscsilun->iscsi; + IscsiAIOCB *acb; + struct unmap_list list[1]; + + acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque); + + acb->iscsilun = iscsilun; + acb->canceled = 0; + acb->bh = NULL; + acb->status = -EINPROGRESS; + + list[0].lba = sector_qemu2lun(sector_num, iscsilun); + list[0].num = nb_sectors * BDRV_SECTOR_SIZE / iscsilun->block_size; + + acb->task = iscsi_unmap_task(iscsi, iscsilun->lun, + 0, 0, &list[0], 1, + iscsi_unmap_cb, + acb); + if (acb->task == NULL) { + error_report("iSCSI: Failed to send unmap command. %s", + iscsi_get_error(iscsi)); + qemu_aio_release(acb); + return NULL; + } + + iscsi_set_events(iscsilun); + + return &acb->common; +} + +#ifdef __linux__ +static void +iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *opaque) +{ + IscsiAIOCB *acb = opaque; + + if (acb->canceled != 0) { + return; + } + + acb->status = 0; + if (status < 0) { + error_report("Failed to ioctl(SG_IO) to iSCSI lun. %s", + iscsi_get_error(iscsi)); + acb->status = -EIO; + } + + acb->ioh->driver_status = 0; + acb->ioh->host_status = 0; + acb->ioh->resid = 0; + +#define SG_ERR_DRIVER_SENSE 0x08 + + if (status == SCSI_STATUS_CHECK_CONDITION && acb->task->datain.size >= 2) { + int ss; + + acb->ioh->driver_status |= SG_ERR_DRIVER_SENSE; + + acb->ioh->sb_len_wr = acb->task->datain.size - 2; + ss = (acb->ioh->mx_sb_len >= acb->ioh->sb_len_wr) ? + acb->ioh->mx_sb_len : acb->ioh->sb_len_wr; + memcpy(acb->ioh->sbp, &acb->task->datain.data[2], ss); + } + + iscsi_schedule_bh(acb); +} + +static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockDriverCompletionFunc *cb, void *opaque) +{ + IscsiLun *iscsilun = bs->opaque; + struct iscsi_context *iscsi = iscsilun->iscsi; + struct iscsi_data data; + IscsiAIOCB *acb; + + assert(req == SG_IO); + + acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque); + + acb->iscsilun = iscsilun; + acb->canceled = 0; + acb->bh = NULL; + acb->status = -EINPROGRESS; + acb->buf = NULL; + acb->ioh = buf; + + acb->task = malloc(sizeof(struct scsi_task)); + if (acb->task == NULL) { + error_report("iSCSI: Failed to allocate task for scsi command. %s", + iscsi_get_error(iscsi)); + qemu_aio_release(acb); + return NULL; + } + memset(acb->task, 0, sizeof(struct scsi_task)); + + switch (acb->ioh->dxfer_direction) { + case SG_DXFER_TO_DEV: + acb->task->xfer_dir = SCSI_XFER_WRITE; + break; + case SG_DXFER_FROM_DEV: + acb->task->xfer_dir = SCSI_XFER_READ; + break; + default: + acb->task->xfer_dir = SCSI_XFER_NONE; + break; + } + + acb->task->cdb_size = acb->ioh->cmd_len; + memcpy(&acb->task->cdb[0], acb->ioh->cmdp, acb->ioh->cmd_len); + acb->task->expxferlen = acb->ioh->dxfer_len; + + if (acb->task->xfer_dir == SCSI_XFER_WRITE) { + data.data = acb->ioh->dxferp; + data.size = acb->ioh->dxfer_len; + } + if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task, + iscsi_aio_ioctl_cb, + (acb->task->xfer_dir == SCSI_XFER_WRITE) ? + &data : NULL, + acb) != 0) { + scsi_free_scsi_task(acb->task); + qemu_aio_release(acb); + return NULL; + } + + /* tell libiscsi to read straight into the buffer we got from ioctl */ + if (acb->task->xfer_dir == SCSI_XFER_READ) { + scsi_task_add_data_in_buffer(acb->task, + acb->ioh->dxfer_len, + acb->ioh->dxferp); + } + + iscsi_set_events(iscsilun); + + return &acb->common; +} + +static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ + IscsiLun *iscsilun = bs->opaque; + + switch (req) { + case SG_GET_VERSION_NUM: + *(int *)buf = 30000; + break; + case SG_GET_SCSI_ID: + ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type; + break; + default: + return -1; + } + return 0; +} +#endif + +static int64_t +iscsi_getlength(BlockDriverState *bs) +{ + IscsiLun *iscsilun = bs->opaque; + int64_t len; + + len = iscsilun->num_blocks; + len *= iscsilun->block_size; + + return len; +} + +static void +iscsi_readcapacity16_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *opaque) +{ + struct IscsiTask *itask = opaque; + struct scsi_readcapacity16 *rc16; + struct scsi_task *task = command_data; + + if (status != 0) { + error_report("iSCSI: Failed to read capacity of iSCSI lun. %s", + iscsi_get_error(iscsi)); + itask->status = 1; + itask->complete = 1; + scsi_free_scsi_task(task); + return; + } + + rc16 = scsi_datain_unmarshall(task); + if (rc16 == NULL) { + error_report("iSCSI: Failed to unmarshall readcapacity16 data."); + itask->status = 1; + itask->complete = 1; + scsi_free_scsi_task(task); + return; + } + + itask->iscsilun->block_size = rc16->block_length; + itask->iscsilun->num_blocks = rc16->returned_lba + 1; + itask->bs->total_sectors = itask->iscsilun->num_blocks * + itask->iscsilun->block_size / BDRV_SECTOR_SIZE ; + + itask->status = 0; + itask->complete = 1; + scsi_free_scsi_task(task); +} + +static void +iscsi_readcapacity10_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *opaque) +{ + struct IscsiTask *itask = opaque; + struct scsi_readcapacity10 *rc10; + struct scsi_task *task = command_data; + + if (status != 0) { + error_report("iSCSI: Failed to read capacity of iSCSI lun. %s", + iscsi_get_error(iscsi)); + itask->status = 1; + itask->complete = 1; + scsi_free_scsi_task(task); + return; + } + + rc10 = scsi_datain_unmarshall(task); + if (rc10 == NULL) { + error_report("iSCSI: Failed to unmarshall readcapacity10 data."); + itask->status = 1; + itask->complete = 1; + scsi_free_scsi_task(task); + return; + } + + itask->iscsilun->block_size = rc10->block_size; + if (rc10->lba == 0) { + /* blank disk loaded */ + itask->iscsilun->num_blocks = 0; + } else { + itask->iscsilun->num_blocks = rc10->lba + 1; + } + itask->bs->total_sectors = itask->iscsilun->num_blocks * + itask->iscsilun->block_size / BDRV_SECTOR_SIZE ; + + itask->status = 0; + itask->complete = 1; + scsi_free_scsi_task(task); +} + +static void +iscsi_inquiry_cb(struct iscsi_context *iscsi, int status, void *command_data, + void *opaque) +{ + struct IscsiTask *itask = opaque; + struct scsi_task *task = command_data; + struct scsi_inquiry_standard *inq; + + if (status != 0) { + itask->status = 1; + itask->complete = 1; + scsi_free_scsi_task(task); + return; + } + + inq = scsi_datain_unmarshall(task); + if (inq == NULL) { + error_report("iSCSI: Failed to unmarshall inquiry data."); + itask->status = 1; + itask->complete = 1; + scsi_free_scsi_task(task); + return; + } + + itask->iscsilun->type = inq->periperal_device_type; + + scsi_free_scsi_task(task); + + switch (itask->iscsilun->type) { + case TYPE_DISK: + task = iscsi_readcapacity16_task(iscsi, itask->iscsilun->lun, + iscsi_readcapacity16_cb, opaque); + if (task == NULL) { + error_report("iSCSI: failed to send readcapacity16 command."); + itask->status = 1; + itask->complete = 1; + return; + } + break; + case TYPE_ROM: + task = iscsi_readcapacity10_task(iscsi, itask->iscsilun->lun, + 0, 0, + iscsi_readcapacity10_cb, opaque); + if (task == NULL) { + error_report("iSCSI: failed to send readcapacity16 command."); + itask->status = 1; + itask->complete = 1; + return; + } + break; + default: + itask->status = 0; + itask->complete = 1; + } +} + +static void +iscsi_connect_cb(struct iscsi_context *iscsi, int status, void *command_data, + void *opaque) +{ + struct IscsiTask *itask = opaque; + struct scsi_task *task; + + if (status != 0) { + itask->status = 1; + itask->complete = 1; + return; + } + + task = iscsi_inquiry_task(iscsi, itask->iscsilun->lun, + 0, 0, 36, + iscsi_inquiry_cb, opaque); + if (task == NULL) { + error_report("iSCSI: failed to send inquiry command."); + itask->status = 1; + itask->complete = 1; + return; + } +} + +static int parse_chap(struct iscsi_context *iscsi, const char *target) +{ + QemuOptsList *list; + QemuOpts *opts; + const char *user = NULL; + const char *password = NULL; + + list = qemu_find_opts("iscsi"); + if (!list) { + return 0; + } + + opts = qemu_opts_find(list, target); + if (opts == NULL) { + opts = QTAILQ_FIRST(&list->head); + if (!opts) { + return 0; + } + } + + user = qemu_opt_get(opts, "user"); + if (!user) { + return 0; + } + + password = qemu_opt_get(opts, "password"); + if (!password) { + error_report("CHAP username specified but no password was given"); + return -1; + } + + if (iscsi_set_initiator_username_pwd(iscsi, user, password)) { + error_report("Failed to set initiator username and password"); + return -1; + } + + return 0; +} + +static void parse_header_digest(struct iscsi_context *iscsi, const char *target) +{ + QemuOptsList *list; + QemuOpts *opts; + const char *digest = NULL; + + list = qemu_find_opts("iscsi"); + if (!list) { + return; + } + + opts = qemu_opts_find(list, target); + if (opts == NULL) { + opts = QTAILQ_FIRST(&list->head); + if (!opts) { + return; + } + } + + digest = qemu_opt_get(opts, "header-digest"); + if (!digest) { + return; + } + + if (!strcmp(digest, "CRC32C")) { + iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_CRC32C); + } else if (!strcmp(digest, "NONE")) { + iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE); + } else if (!strcmp(digest, "CRC32C-NONE")) { + iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_CRC32C_NONE); + } else if (!strcmp(digest, "NONE-CRC32C")) { + iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C); + } else { + error_report("Invalid header-digest setting : %s", digest); + } +} + +static char *parse_initiator_name(const char *target) +{ + QemuOptsList *list; + QemuOpts *opts; + const char *name = NULL; + const char *iscsi_name = qemu_get_vm_name(); + + list = qemu_find_opts("iscsi"); + if (list) { + opts = qemu_opts_find(list, target); + if (!opts) { + opts = QTAILQ_FIRST(&list->head); + } + if (opts) { + name = qemu_opt_get(opts, "initiator-name"); + } + } + + if (name) { + return g_strdup(name); + } else { + return g_strdup_printf("iqn.2008-11.org.linux-kvm%s%s", + iscsi_name ? ":" : "", + iscsi_name ? iscsi_name : ""); + } +} + +/* + * We support iscsi url's on the form + * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun> + */ +static int iscsi_open(BlockDriverState *bs, const char *filename, int flags) +{ + IscsiLun *iscsilun = bs->opaque; + struct iscsi_context *iscsi = NULL; + struct iscsi_url *iscsi_url = NULL; + struct IscsiTask task; + char *initiator_name = NULL; + int ret; + + if ((BDRV_SECTOR_SIZE % 512) != 0) { + error_report("iSCSI: Invalid BDRV_SECTOR_SIZE. " + "BDRV_SECTOR_SIZE(%lld) is not a multiple " + "of 512", BDRV_SECTOR_SIZE); + return -EINVAL; + } + + iscsi_url = iscsi_parse_full_url(iscsi, filename); + if (iscsi_url == NULL) { + error_report("Failed to parse URL : %s %s", filename, + iscsi_get_error(iscsi)); + ret = -EINVAL; + goto out; + } + + memset(iscsilun, 0, sizeof(IscsiLun)); + + initiator_name = parse_initiator_name(iscsi_url->target); + + iscsi = iscsi_create_context(initiator_name); + if (iscsi == NULL) { + error_report("iSCSI: Failed to create iSCSI context."); + ret = -ENOMEM; + goto out; + } + + if (iscsi_set_targetname(iscsi, iscsi_url->target)) { + error_report("iSCSI: Failed to set target name."); + ret = -EINVAL; + goto out; + } + + if (iscsi_url->user != NULL) { + ret = iscsi_set_initiator_username_pwd(iscsi, iscsi_url->user, + iscsi_url->passwd); + if (ret != 0) { + error_report("Failed to set initiator username and password"); + ret = -EINVAL; + goto out; + } + } + + /* check if we got CHAP username/password via the options */ + if (parse_chap(iscsi, iscsi_url->target) != 0) { + error_report("iSCSI: Failed to set CHAP user/password"); + ret = -EINVAL; + goto out; + } + + if (iscsi_set_session_type(iscsi, ISCSI_SESSION_NORMAL) != 0) { + error_report("iSCSI: Failed to set session type to normal."); + ret = -EINVAL; + goto out; + } + + iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C); + + /* check if we got HEADER_DIGEST via the options */ + parse_header_digest(iscsi, iscsi_url->target); + + task.iscsilun = iscsilun; + task.status = 0; + task.complete = 0; + task.bs = bs; + + iscsilun->iscsi = iscsi; + iscsilun->lun = iscsi_url->lun; + + if (iscsi_full_connect_async(iscsi, iscsi_url->portal, iscsi_url->lun, + iscsi_connect_cb, &task) + != 0) { + error_report("iSCSI: Failed to start async connect."); + ret = -EINVAL; + goto out; + } + + while (!task.complete) { + iscsi_set_events(iscsilun); + qemu_aio_wait(); + } + if (task.status != 0) { + error_report("iSCSI: Failed to connect to LUN : %s", + iscsi_get_error(iscsi)); + ret = -EINVAL; + goto out; + } + + /* Medium changer or tape. We dont have any emulation for this so this must + * be sg ioctl compatible. We force it to be sg, otherwise qemu will try + * to read from the device to guess the image format. + */ + if (iscsilun->type == TYPE_MEDIUM_CHANGER || + iscsilun->type == TYPE_TAPE) { + bs->sg = 1; + } + + ret = 0; + +out: + if (initiator_name != NULL) { + g_free(initiator_name); + } + if (iscsi_url != NULL) { + iscsi_destroy_url(iscsi_url); + } + + if (ret) { + if (iscsi != NULL) { + iscsi_destroy_context(iscsi); + } + memset(iscsilun, 0, sizeof(IscsiLun)); + } + return ret; +} + +static void iscsi_close(BlockDriverState *bs) +{ + IscsiLun *iscsilun = bs->opaque; + struct iscsi_context *iscsi = iscsilun->iscsi; + + qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL, NULL); + iscsi_destroy_context(iscsi); + memset(iscsilun, 0, sizeof(IscsiLun)); +} + +static BlockDriver bdrv_iscsi = { + .format_name = "iscsi", + .protocol_name = "iscsi", + + .instance_size = sizeof(IscsiLun), + .bdrv_file_open = iscsi_open, + .bdrv_close = iscsi_close, + + .bdrv_getlength = iscsi_getlength, + + .bdrv_aio_readv = iscsi_aio_readv, + .bdrv_aio_writev = iscsi_aio_writev, + .bdrv_aio_flush = iscsi_aio_flush, + + .bdrv_aio_discard = iscsi_aio_discard, + +#ifdef __linux__ + .bdrv_ioctl = iscsi_ioctl, + .bdrv_aio_ioctl = iscsi_aio_ioctl, +#endif +}; + +static void iscsi_block_init(void) +{ + bdrv_register(&bdrv_iscsi); +} + +block_init(iscsi_block_init); diff --git a/block/nbd.c b/block/nbd.c new file mode 100644 index 000000000..2bce47bf7 --- /dev/null +++ b/block/nbd.c @@ -0,0 +1,517 @@ +/* + * QEMU Block driver for NBD + * + * Copyright (C) 2008 Bull S.A.S. + * Author: Laurent Vivier <Laurent.Vivier@bull.net> + * + * Some parts: + * Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu-common.h" +#include "nbd.h" +#include "block_int.h" +#include "module.h" +#include "qemu_socket.h" + +#include <sys/types.h> +#include <unistd.h> + +#define EN_OPTSTR ":exportname=" + +/* #define DEBUG_NBD */ + +#if defined(DEBUG_NBD) +#define logout(fmt, ...) \ + fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__) +#else +#define logout(fmt, ...) ((void)0) +#endif + +#define MAX_NBD_REQUESTS 16 +#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs)) +#define INDEX_TO_HANDLE(bs, index) ((index) ^ ((uint64_t)(intptr_t)bs)) + +typedef struct BDRVNBDState { + int sock; + uint32_t nbdflags; + off_t size; + size_t blocksize; + char *export_name; /* An NBD server may export several devices */ + + CoMutex send_mutex; + CoMutex free_sema; + Coroutine *send_coroutine; + int in_flight; + + Coroutine *recv_coroutine[MAX_NBD_REQUESTS]; + struct nbd_reply reply; + + /* If it begins with '/', this is a UNIX domain socket. Otherwise, + * it's a string of the form <hostname|ip4|\[ip6\]>:port + */ + char *host_spec; +} BDRVNBDState; + +static int nbd_config(BDRVNBDState *s, const char *filename, int flags) +{ + char *file; + char *export_name; + const char *host_spec; + const char *unixpath; + int err = -EINVAL; + + file = g_strdup(filename); + + export_name = strstr(file, EN_OPTSTR); + if (export_name) { + if (export_name[strlen(EN_OPTSTR)] == 0) { + goto out; + } + export_name[0] = 0; /* truncate 'file' */ + export_name += strlen(EN_OPTSTR); + s->export_name = g_strdup(export_name); + } + + /* extract the host_spec - fail if it's not nbd:... */ + if (!strstart(file, "nbd:", &host_spec)) { + goto out; + } + + /* are we a UNIX or TCP socket? */ + if (strstart(host_spec, "unix:", &unixpath)) { + if (unixpath[0] != '/') { /* We demand an absolute path*/ + goto out; + } + s->host_spec = g_strdup(unixpath); + } else { + s->host_spec = g_strdup(host_spec); + } + + err = 0; + +out: + g_free(file); + if (err != 0) { + g_free(s->export_name); + g_free(s->host_spec); + } + return err; +} + +static void nbd_coroutine_start(BDRVNBDState *s, struct nbd_request *request) +{ + int i; + + /* Poor man semaphore. The free_sema is locked when no other request + * can be accepted, and unlocked after receiving one reply. */ + if (s->in_flight >= MAX_NBD_REQUESTS - 1) { + qemu_co_mutex_lock(&s->free_sema); + assert(s->in_flight < MAX_NBD_REQUESTS); + } + s->in_flight++; + + for (i = 0; i < MAX_NBD_REQUESTS; i++) { + if (s->recv_coroutine[i] == NULL) { + s->recv_coroutine[i] = qemu_coroutine_self(); + break; + } + } + + assert(i < MAX_NBD_REQUESTS); + request->handle = INDEX_TO_HANDLE(s, i); +} + +static int nbd_have_request(void *opaque) +{ + BDRVNBDState *s = opaque; + + return s->in_flight > 0; +} + +static void nbd_reply_ready(void *opaque) +{ + BDRVNBDState *s = opaque; + uint64_t i; + int ret; + + if (s->reply.handle == 0) { + /* No reply already in flight. Fetch a header. It is possible + * that another thread has done the same thing in parallel, so + * the socket is not readable anymore. + */ + ret = nbd_receive_reply(s->sock, &s->reply); + if (ret == -EAGAIN) { + return; + } + if (ret < 0) { + s->reply.handle = 0; + goto fail; + } + } + + /* There's no need for a mutex on the receive side, because the + * handler acts as a synchronization point and ensures that only + * one coroutine is called until the reply finishes. */ + i = HANDLE_TO_INDEX(s, s->reply.handle); + if (i >= MAX_NBD_REQUESTS) { + goto fail; + } + + if (s->recv_coroutine[i]) { + qemu_coroutine_enter(s->recv_coroutine[i], NULL); + return; + } + +fail: + for (i = 0; i < MAX_NBD_REQUESTS; i++) { + if (s->recv_coroutine[i]) { + qemu_coroutine_enter(s->recv_coroutine[i], NULL); + } + } +} + +static void nbd_restart_write(void *opaque) +{ + BDRVNBDState *s = opaque; + qemu_coroutine_enter(s->send_coroutine, NULL); +} + +static int nbd_co_send_request(BDRVNBDState *s, struct nbd_request *request, + QEMUIOVector *qiov, int offset) +{ + int rc, ret; + + qemu_co_mutex_lock(&s->send_mutex); + s->send_coroutine = qemu_coroutine_self(); + qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write, + nbd_have_request, s); + rc = nbd_send_request(s->sock, request); + if (rc >= 0 && qiov) { + ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov, + offset, request->len); + if (ret != request->len) { + return -EIO; + } + } + qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL, + nbd_have_request, s); + s->send_coroutine = NULL; + qemu_co_mutex_unlock(&s->send_mutex); + return rc; +} + +static void nbd_co_receive_reply(BDRVNBDState *s, struct nbd_request *request, + struct nbd_reply *reply, + QEMUIOVector *qiov, int offset) +{ + int ret; + + /* Wait until we're woken up by the read handler. TODO: perhaps + * peek at the next reply and avoid yielding if it's ours? */ + qemu_coroutine_yield(); + *reply = s->reply; + if (reply->handle != request->handle) { + reply->error = EIO; + } else { + if (qiov && reply->error == 0) { + ret = qemu_co_recvv(s->sock, qiov->iov, qiov->niov, + offset, request->len); + if (ret != request->len) { + reply->error = EIO; + } + } + + /* Tell the read handler to read another header. */ + s->reply.handle = 0; + } +} + +static void nbd_coroutine_end(BDRVNBDState *s, struct nbd_request *request) +{ + int i = HANDLE_TO_INDEX(s, request->handle); + s->recv_coroutine[i] = NULL; + if (s->in_flight-- == MAX_NBD_REQUESTS) { + qemu_co_mutex_unlock(&s->free_sema); + } +} + +static int nbd_establish_connection(BlockDriverState *bs) +{ + BDRVNBDState *s = bs->opaque; + int sock; + int ret; + off_t size; + size_t blocksize; + + if (s->host_spec[0] == '/') { + sock = unix_socket_outgoing(s->host_spec); + } else { + sock = tcp_socket_outgoing_spec(s->host_spec); + } + + /* Failed to establish connection */ + if (sock < 0) { + logout("Failed to establish connection to NBD server\n"); + return -errno; + } + + /* NBD handshake */ + ret = nbd_receive_negotiate(sock, s->export_name, &s->nbdflags, &size, + &blocksize); + if (ret < 0) { + logout("Failed to negotiate with the NBD server\n"); + closesocket(sock); + return ret; + } + + /* Now that we're connected, set the socket to be non-blocking and + * kick the reply mechanism. */ + socket_set_nonblock(sock); + qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL, + nbd_have_request, s); + + s->sock = sock; + s->size = size; + s->blocksize = blocksize; + + logout("Established connection with NBD server\n"); + return 0; +} + +static void nbd_teardown_connection(BlockDriverState *bs) +{ + BDRVNBDState *s = bs->opaque; + struct nbd_request request; + + request.type = NBD_CMD_DISC; + request.from = 0; + request.len = 0; + nbd_send_request(s->sock, &request); + + qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL, NULL); + closesocket(s->sock); +} + +static int nbd_open(BlockDriverState *bs, const char* filename, int flags) +{ + BDRVNBDState *s = bs->opaque; + int result; + + qemu_co_mutex_init(&s->send_mutex); + qemu_co_mutex_init(&s->free_sema); + + /* Pop the config into our state object. Exit if invalid. */ + result = nbd_config(s, filename, flags); + if (result != 0) { + return result; + } + + /* establish TCP connection, return error if it fails + * TODO: Configurable retry-until-timeout behaviour. + */ + result = nbd_establish_connection(bs); + + return result; +} + +static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov, + int offset) +{ + BDRVNBDState *s = bs->opaque; + struct nbd_request request; + struct nbd_reply reply; + ssize_t ret; + + request.type = NBD_CMD_READ; + request.from = sector_num * 512; + request.len = nb_sectors * 512; + + nbd_coroutine_start(s, &request); + ret = nbd_co_send_request(s, &request, NULL, 0); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(s, &request, &reply, qiov, offset); + } + nbd_coroutine_end(s, &request); + return -reply.error; + +} + +static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov, + int offset) +{ + BDRVNBDState *s = bs->opaque; + struct nbd_request request; + struct nbd_reply reply; + ssize_t ret; + + request.type = NBD_CMD_WRITE; + if (!bdrv_enable_write_cache(bs) && (s->nbdflags & NBD_FLAG_SEND_FUA)) { + request.type |= NBD_CMD_FLAG_FUA; + } + + request.from = sector_num * 512; + request.len = nb_sectors * 512; + + nbd_coroutine_start(s, &request); + ret = nbd_co_send_request(s, &request, qiov, offset); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(s, &request, &reply, NULL, 0); + } + nbd_coroutine_end(s, &request); + return -reply.error; +} + +/* qemu-nbd has a limit of slightly less than 1M per request. Try to + * remain aligned to 4K. */ +#define NBD_MAX_SECTORS 2040 + +static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + int offset = 0; + int ret; + while (nb_sectors > NBD_MAX_SECTORS) { + ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); + if (ret < 0) { + return ret; + } + offset += NBD_MAX_SECTORS * 512; + sector_num += NBD_MAX_SECTORS; + nb_sectors -= NBD_MAX_SECTORS; + } + return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset); +} + +static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + int offset = 0; + int ret; + while (nb_sectors > NBD_MAX_SECTORS) { + ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); + if (ret < 0) { + return ret; + } + offset += NBD_MAX_SECTORS * 512; + sector_num += NBD_MAX_SECTORS; + nb_sectors -= NBD_MAX_SECTORS; + } + return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset); +} + +static int nbd_co_flush(BlockDriverState *bs) +{ + BDRVNBDState *s = bs->opaque; + struct nbd_request request; + struct nbd_reply reply; + ssize_t ret; + + if (!(s->nbdflags & NBD_FLAG_SEND_FLUSH)) { + return 0; + } + + request.type = NBD_CMD_FLUSH; + if (s->nbdflags & NBD_FLAG_SEND_FUA) { + request.type |= NBD_CMD_FLAG_FUA; + } + + request.from = 0; + request.len = 0; + + nbd_coroutine_start(s, &request); + ret = nbd_co_send_request(s, &request, NULL, 0); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(s, &request, &reply, NULL, 0); + } + nbd_coroutine_end(s, &request); + return -reply.error; +} + +static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + BDRVNBDState *s = bs->opaque; + struct nbd_request request; + struct nbd_reply reply; + ssize_t ret; + + if (!(s->nbdflags & NBD_FLAG_SEND_TRIM)) { + return 0; + } + request.type = NBD_CMD_TRIM; + request.from = sector_num * 512;; + request.len = nb_sectors * 512; + + nbd_coroutine_start(s, &request); + ret = nbd_co_send_request(s, &request, NULL, 0); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(s, &request, &reply, NULL, 0); + } + nbd_coroutine_end(s, &request); + return -reply.error; +} + +static void nbd_close(BlockDriverState *bs) +{ + BDRVNBDState *s = bs->opaque; + g_free(s->export_name); + g_free(s->host_spec); + + nbd_teardown_connection(bs); +} + +static int64_t nbd_getlength(BlockDriverState *bs) +{ + BDRVNBDState *s = bs->opaque; + + return s->size; +} + +static BlockDriver bdrv_nbd = { + .format_name = "nbd", + .instance_size = sizeof(BDRVNBDState), + .bdrv_file_open = nbd_open, + .bdrv_co_readv = nbd_co_readv, + .bdrv_co_writev = nbd_co_writev, + .bdrv_close = nbd_close, + .bdrv_co_flush_to_os = nbd_co_flush, + .bdrv_co_discard = nbd_co_discard, + .bdrv_getlength = nbd_getlength, + .protocol_name = "nbd", +}; + +static void bdrv_nbd_init(void) +{ + bdrv_register(&bdrv_nbd); +} + +block_init(bdrv_nbd_init); diff --git a/block/parallels.c b/block/parallels.c new file mode 100644 index 000000000..d30f0ecf7 --- /dev/null +++ b/block/parallels.c @@ -0,0 +1,170 @@ +/* + * Block driver for Parallels disk image format + * + * Copyright (c) 2007 Alex Beregszaszi + * + * This code is based on comparing different disk images created by Parallels. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" + +/**************************************************************/ + +#define HEADER_MAGIC "WithoutFreeSpace" +#define HEADER_VERSION 2 +#define HEADER_SIZE 64 + +// always little-endian +struct parallels_header { + char magic[16]; // "WithoutFreeSpace" + uint32_t version; + uint32_t heads; + uint32_t cylinders; + uint32_t tracks; + uint32_t catalog_entries; + uint32_t nb_sectors; + char padding[24]; +} QEMU_PACKED; + +typedef struct BDRVParallelsState { + CoMutex lock; + + uint32_t *catalog_bitmap; + int catalog_size; + + int tracks; +} BDRVParallelsState; + +static int parallels_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const struct parallels_header *ph = (const void *)buf; + + if (buf_size < HEADER_SIZE) + return 0; + + if (!memcmp(ph->magic, HEADER_MAGIC, 16) && + (le32_to_cpu(ph->version) == HEADER_VERSION)) + return 100; + + return 0; +} + +static int parallels_open(BlockDriverState *bs, int flags) +{ + BDRVParallelsState *s = bs->opaque; + int i; + struct parallels_header ph; + + bs->read_only = 1; // no write support yet + + if (bdrv_pread(bs->file, 0, &ph, sizeof(ph)) != sizeof(ph)) + goto fail; + + if (memcmp(ph.magic, HEADER_MAGIC, 16) || + (le32_to_cpu(ph.version) != HEADER_VERSION)) { + goto fail; + } + + bs->total_sectors = le32_to_cpu(ph.nb_sectors); + + s->tracks = le32_to_cpu(ph.tracks); + + s->catalog_size = le32_to_cpu(ph.catalog_entries); + s->catalog_bitmap = g_malloc(s->catalog_size * 4); + if (bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4) != + s->catalog_size * 4) + goto fail; + for (i = 0; i < s->catalog_size; i++) + le32_to_cpus(&s->catalog_bitmap[i]); + + qemu_co_mutex_init(&s->lock); + return 0; +fail: + if (s->catalog_bitmap) + g_free(s->catalog_bitmap); + return -1; +} + +static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) +{ + BDRVParallelsState *s = bs->opaque; + uint32_t index, offset; + + index = sector_num / s->tracks; + offset = sector_num % s->tracks; + + /* not allocated */ + if ((index > s->catalog_size) || (s->catalog_bitmap[index] == 0)) + return -1; + return (uint64_t)(s->catalog_bitmap[index] + offset) * 512; +} + +static int parallels_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + while (nb_sectors > 0) { + int64_t position = seek_to_sector(bs, sector_num); + if (position >= 0) { + if (bdrv_pread(bs->file, position, buf, 512) != 512) + return -1; + } else { + memset(buf, 0, 512); + } + nb_sectors--; + sector_num++; + buf += 512; + } + return 0; +} + +static coroutine_fn int parallels_co_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVParallelsState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = parallels_read(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static void parallels_close(BlockDriverState *bs) +{ + BDRVParallelsState *s = bs->opaque; + g_free(s->catalog_bitmap); +} + +static BlockDriver bdrv_parallels = { + .format_name = "parallels", + .instance_size = sizeof(BDRVParallelsState), + .bdrv_probe = parallels_probe, + .bdrv_open = parallels_open, + .bdrv_read = parallels_co_read, + .bdrv_close = parallels_close, +}; + +static void bdrv_parallels_init(void) +{ + bdrv_register(&bdrv_parallels); +} + +block_init(bdrv_parallels_init); diff --git a/block/qcow.c b/block/qcow.c new file mode 100644 index 000000000..7b5ab87d2 --- /dev/null +++ b/block/qcow.c @@ -0,0 +1,890 @@ +/* + * Block driver for the QCOW format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" +#include <zlib.h> +#include "aes.h" +#include "migration.h" + +/**************************************************************/ +/* QEMU COW block driver with compression and encryption support */ + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) +#define QCOW_VERSION 1 + +#define QCOW_CRYPT_NONE 0 +#define QCOW_CRYPT_AES 1 + +#define QCOW_OFLAG_COMPRESSED (1LL << 63) + +typedef struct QCowHeader { + uint32_t magic; + uint32_t version; + uint64_t backing_file_offset; + uint32_t backing_file_size; + uint32_t mtime; + uint64_t size; /* in bytes */ + uint8_t cluster_bits; + uint8_t l2_bits; + uint32_t crypt_method; + uint64_t l1_table_offset; +} QCowHeader; + +#define L2_CACHE_SIZE 16 + +typedef struct BDRVQcowState { + int cluster_bits; + int cluster_size; + int cluster_sectors; + int l2_bits; + int l2_size; + int l1_size; + uint64_t cluster_offset_mask; + uint64_t l1_table_offset; + uint64_t *l1_table; + uint64_t *l2_cache; + uint64_t l2_cache_offsets[L2_CACHE_SIZE]; + uint32_t l2_cache_counts[L2_CACHE_SIZE]; + uint8_t *cluster_cache; + uint8_t *cluster_data; + uint64_t cluster_cache_offset; + uint32_t crypt_method; /* current crypt method, 0 if no key yet */ + uint32_t crypt_method_header; + AES_KEY aes_encrypt_key; + AES_KEY aes_decrypt_key; + CoMutex lock; + Error *migration_blocker; +} BDRVQcowState; + +static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); + +static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const QCowHeader *cow_header = (const void *)buf; + + if (buf_size >= sizeof(QCowHeader) && + be32_to_cpu(cow_header->magic) == QCOW_MAGIC && + be32_to_cpu(cow_header->version) == QCOW_VERSION) + return 100; + else + return 0; +} + +static int qcow_open(BlockDriverState *bs, int flags) +{ + BDRVQcowState *s = bs->opaque; + int len, i, shift, ret; + QCowHeader header; + + ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); + if (ret < 0) { + goto fail; + } + be32_to_cpus(&header.magic); + be32_to_cpus(&header.version); + be64_to_cpus(&header.backing_file_offset); + be32_to_cpus(&header.backing_file_size); + be32_to_cpus(&header.mtime); + be64_to_cpus(&header.size); + be32_to_cpus(&header.crypt_method); + be64_to_cpus(&header.l1_table_offset); + + if (header.magic != QCOW_MAGIC) { + ret = -EINVAL; + goto fail; + } + if (header.version != QCOW_VERSION) { + char version[64]; + snprintf(version, sizeof(version), "QCOW version %d", header.version); + qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bs->device_name, "qcow", version); + ret = -ENOTSUP; + goto fail; + } + + if (header.size <= 1 || header.cluster_bits < 9) { + ret = -EINVAL; + goto fail; + } + if (header.crypt_method > QCOW_CRYPT_AES) { + ret = -EINVAL; + goto fail; + } + s->crypt_method_header = header.crypt_method; + if (s->crypt_method_header) { + bs->encrypted = 1; + } + s->cluster_bits = header.cluster_bits; + s->cluster_size = 1 << s->cluster_bits; + s->cluster_sectors = 1 << (s->cluster_bits - 9); + s->l2_bits = header.l2_bits; + s->l2_size = 1 << s->l2_bits; + bs->total_sectors = header.size / 512; + s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1; + + /* read the level 1 table */ + shift = s->cluster_bits + s->l2_bits; + s->l1_size = (header.size + (1LL << shift) - 1) >> shift; + + s->l1_table_offset = header.l1_table_offset; + s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); + + ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + + for(i = 0;i < s->l1_size; i++) { + be64_to_cpus(&s->l1_table[i]); + } + /* alloc L2 cache */ + s->l2_cache = g_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); + s->cluster_cache = g_malloc(s->cluster_size); + s->cluster_data = g_malloc(s->cluster_size); + s->cluster_cache_offset = -1; + + /* read the backing file name */ + if (header.backing_file_offset != 0) { + len = header.backing_file_size; + if (len > 1023) { + len = 1023; + } + ret = bdrv_pread(bs->file, header.backing_file_offset, + bs->backing_file, len); + if (ret < 0) { + goto fail; + } + bs->backing_file[len] = '\0'; + } + + /* Disable migration when qcow images are used */ + error_set(&s->migration_blocker, + QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + "qcow", bs->device_name, "live migration"); + migrate_add_blocker(s->migration_blocker); + + qemu_co_mutex_init(&s->lock); + return 0; + + fail: + g_free(s->l1_table); + g_free(s->l2_cache); + g_free(s->cluster_cache); + g_free(s->cluster_data); + return ret; +} + +static int qcow_set_key(BlockDriverState *bs, const char *key) +{ + BDRVQcowState *s = bs->opaque; + uint8_t keybuf[16]; + int len, i; + + memset(keybuf, 0, 16); + len = strlen(key); + if (len > 16) + len = 16; + /* XXX: we could compress the chars to 7 bits to increase + entropy */ + for(i = 0;i < len;i++) { + keybuf[i] = key[i]; + } + s->crypt_method = s->crypt_method_header; + + if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) + return -1; + if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) + return -1; + return 0; +} + +/* The crypt function is compatible with the linux cryptoloop + algorithm for < 4 GB images. NOTE: out_buf == in_buf is + supported */ +static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, int enc, + const AES_KEY *key) +{ + union { + uint64_t ll[2]; + uint8_t b[16]; + } ivec; + int i; + + for(i = 0; i < nb_sectors; i++) { + ivec.ll[0] = cpu_to_le64(sector_num); + ivec.ll[1] = 0; + AES_cbc_encrypt(in_buf, out_buf, 512, key, + ivec.b, enc); + sector_num++; + in_buf += 512; + out_buf += 512; + } +} + +/* 'allocate' is: + * + * 0 to not allocate. + * + * 1 to allocate a normal cluster (for sector indexes 'n_start' to + * 'n_end') + * + * 2 to allocate a compressed cluster of size + * 'compressed_size'. 'compressed_size' must be > 0 and < + * cluster_size + * + * return 0 if not allocated. + */ +static uint64_t get_cluster_offset(BlockDriverState *bs, + uint64_t offset, int allocate, + int compressed_size, + int n_start, int n_end) +{ + BDRVQcowState *s = bs->opaque; + int min_index, i, j, l1_index, l2_index; + uint64_t l2_offset, *l2_table, cluster_offset, tmp; + uint32_t min_count; + int new_l2_table; + + l1_index = offset >> (s->l2_bits + s->cluster_bits); + l2_offset = s->l1_table[l1_index]; + new_l2_table = 0; + if (!l2_offset) { + if (!allocate) + return 0; + /* allocate a new l2 entry */ + l2_offset = bdrv_getlength(bs->file); + /* round to cluster size */ + l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1); + /* update the L1 entry */ + s->l1_table[l1_index] = l2_offset; + tmp = cpu_to_be64(l2_offset); + if (bdrv_pwrite_sync(bs->file, + s->l1_table_offset + l1_index * sizeof(tmp), + &tmp, sizeof(tmp)) < 0) + return 0; + new_l2_table = 1; + } + for(i = 0; i < L2_CACHE_SIZE; i++) { + if (l2_offset == s->l2_cache_offsets[i]) { + /* increment the hit count */ + if (++s->l2_cache_counts[i] == 0xffffffff) { + for(j = 0; j < L2_CACHE_SIZE; j++) { + s->l2_cache_counts[j] >>= 1; + } + } + l2_table = s->l2_cache + (i << s->l2_bits); + goto found; + } + } + /* not found: load a new entry in the least used one */ + min_index = 0; + min_count = 0xffffffff; + for(i = 0; i < L2_CACHE_SIZE; i++) { + if (s->l2_cache_counts[i] < min_count) { + min_count = s->l2_cache_counts[i]; + min_index = i; + } + } + l2_table = s->l2_cache + (min_index << s->l2_bits); + if (new_l2_table) { + memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); + if (bdrv_pwrite_sync(bs->file, l2_offset, l2_table, + s->l2_size * sizeof(uint64_t)) < 0) + return 0; + } else { + if (bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) != + s->l2_size * sizeof(uint64_t)) + return 0; + } + s->l2_cache_offsets[min_index] = l2_offset; + s->l2_cache_counts[min_index] = 1; + found: + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + cluster_offset = be64_to_cpu(l2_table[l2_index]); + if (!cluster_offset || + ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) { + if (!allocate) + return 0; + /* allocate a new cluster */ + if ((cluster_offset & QCOW_OFLAG_COMPRESSED) && + (n_end - n_start) < s->cluster_sectors) { + /* if the cluster is already compressed, we must + decompress it in the case it is not completely + overwritten */ + if (decompress_cluster(bs, cluster_offset) < 0) + return 0; + cluster_offset = bdrv_getlength(bs->file); + cluster_offset = (cluster_offset + s->cluster_size - 1) & + ~(s->cluster_size - 1); + /* write the cluster content */ + if (bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache, s->cluster_size) != + s->cluster_size) + return -1; + } else { + cluster_offset = bdrv_getlength(bs->file); + if (allocate == 1) { + /* round to cluster size */ + cluster_offset = (cluster_offset + s->cluster_size - 1) & + ~(s->cluster_size - 1); + bdrv_truncate(bs->file, cluster_offset + s->cluster_size); + /* if encrypted, we must initialize the cluster + content which won't be written */ + if (s->crypt_method && + (n_end - n_start) < s->cluster_sectors) { + uint64_t start_sect; + start_sect = (offset & ~(s->cluster_size - 1)) >> 9; + memset(s->cluster_data + 512, 0x00, 512); + for(i = 0; i < s->cluster_sectors; i++) { + if (i < n_start || i >= n_end) { + encrypt_sectors(s, start_sect + i, + s->cluster_data, + s->cluster_data + 512, 1, 1, + &s->aes_encrypt_key); + if (bdrv_pwrite(bs->file, cluster_offset + i * 512, + s->cluster_data, 512) != 512) + return -1; + } + } + } + } else if (allocate == 2) { + cluster_offset |= QCOW_OFLAG_COMPRESSED | + (uint64_t)compressed_size << (63 - s->cluster_bits); + } + } + /* update L2 table */ + tmp = cpu_to_be64(cluster_offset); + l2_table[l2_index] = tmp; + if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp), + &tmp, sizeof(tmp)) < 0) + return 0; + } + return cluster_offset; +} + +static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster, n; + uint64_t cluster_offset; + + qemu_co_mutex_lock(&s->lock); + cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0); + qemu_co_mutex_unlock(&s->lock); + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) + n = nb_sectors; + *pnum = n; + return (cluster_offset != 0); +} + +static int decompress_buffer(uint8_t *out_buf, int out_buf_size, + const uint8_t *buf, int buf_size) +{ + z_stream strm1, *strm = &strm1; + int ret, out_len; + + memset(strm, 0, sizeof(*strm)); + + strm->next_in = (uint8_t *)buf; + strm->avail_in = buf_size; + strm->next_out = out_buf; + strm->avail_out = out_buf_size; + + ret = inflateInit2(strm, -12); + if (ret != Z_OK) + return -1; + ret = inflate(strm, Z_FINISH); + out_len = strm->next_out - out_buf; + if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || + out_len != out_buf_size) { + inflateEnd(strm); + return -1; + } + inflateEnd(strm); + return 0; +} + +static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) +{ + BDRVQcowState *s = bs->opaque; + int ret, csize; + uint64_t coffset; + + coffset = cluster_offset & s->cluster_offset_mask; + if (s->cluster_cache_offset != coffset) { + csize = cluster_offset >> (63 - s->cluster_bits); + csize &= (s->cluster_size - 1); + ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize); + if (ret != csize) + return -1; + if (decompress_buffer(s->cluster_cache, s->cluster_size, + s->cluster_data, csize) < 0) { + return -1; + } + s->cluster_cache_offset = coffset; + } + return 0; +} + +static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster; + int ret = 0, n; + uint64_t cluster_offset; + struct iovec hd_iov; + QEMUIOVector hd_qiov; + uint8_t *buf; + void *orig_buf; + + if (qiov->niov > 1) { + buf = orig_buf = qemu_blockalign(bs, qiov->size); + } else { + orig_buf = NULL; + buf = (uint8_t *)qiov->iov->iov_base; + } + + qemu_co_mutex_lock(&s->lock); + + while (nb_sectors != 0) { + /* prepare next request */ + cluster_offset = get_cluster_offset(bs, sector_num << 9, + 0, 0, 0, 0); + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; + } + + if (!cluster_offset) { + if (bs->backing_hd) { + /* read from the base image */ + hd_iov.iov_base = (void *)buf; + hd_iov.iov_len = n * 512; + qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->backing_hd, sector_num, + n, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } + } else { + /* Note: in this case, no need to wait */ + memset(buf, 0, 512 * n); + } + } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { + /* add AIO support for compressed blocks ? */ + if (decompress_cluster(bs, cluster_offset) < 0) { + goto fail; + } + memcpy(buf, + s->cluster_cache + index_in_cluster * 512, 512 * n); + } else { + if ((cluster_offset & 511) != 0) { + goto fail; + } + hd_iov.iov_base = (void *)buf; + hd_iov.iov_len = n * 512; + qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->file, + (cluster_offset >> 9) + index_in_cluster, + n, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + break; + } + if (s->crypt_method) { + encrypt_sectors(s, sector_num, buf, buf, + n, 0, + &s->aes_decrypt_key); + } + } + ret = 0; + + nb_sectors -= n; + sector_num += n; + buf += n * 512; + } + +done: + qemu_co_mutex_unlock(&s->lock); + + if (qiov->niov > 1) { + qemu_iovec_from_buf(qiov, 0, orig_buf, qiov->size); + qemu_vfree(orig_buf); + } + + return ret; + +fail: + ret = -EIO; + goto done; +} + +static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster; + uint64_t cluster_offset; + const uint8_t *src_buf; + int ret = 0, n; + uint8_t *cluster_data = NULL; + struct iovec hd_iov; + QEMUIOVector hd_qiov; + uint8_t *buf; + void *orig_buf; + + s->cluster_cache_offset = -1; /* disable compressed cache */ + + if (qiov->niov > 1) { + buf = orig_buf = qemu_blockalign(bs, qiov->size); + qemu_iovec_to_buf(qiov, 0, buf, qiov->size); + } else { + orig_buf = NULL; + buf = (uint8_t *)qiov->iov->iov_base; + } + + qemu_co_mutex_lock(&s->lock); + + while (nb_sectors != 0) { + + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; + } + cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0, + index_in_cluster, + index_in_cluster + n); + if (!cluster_offset || (cluster_offset & 511) != 0) { + ret = -EIO; + break; + } + if (s->crypt_method) { + if (!cluster_data) { + cluster_data = g_malloc0(s->cluster_size); + } + encrypt_sectors(s, sector_num, cluster_data, buf, + n, 1, &s->aes_encrypt_key); + src_buf = cluster_data; + } else { + src_buf = buf; + } + + hd_iov.iov_base = (void *)src_buf; + hd_iov.iov_len = n * 512; + qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_writev(bs->file, + (cluster_offset >> 9) + index_in_cluster, + n, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + break; + } + ret = 0; + + nb_sectors -= n; + sector_num += n; + buf += n * 512; + } + qemu_co_mutex_unlock(&s->lock); + + if (qiov->niov > 1) { + qemu_vfree(orig_buf); + } + g_free(cluster_data); + + return ret; +} + +static void qcow_close(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + + g_free(s->l1_table); + g_free(s->l2_cache); + g_free(s->cluster_cache); + g_free(s->cluster_data); + + migrate_del_blocker(s->migration_blocker); + error_free(s->migration_blocker); +} + +static int qcow_create(const char *filename, QEMUOptionParameter *options) +{ + int header_size, backing_filename_len, l1_size, shift, i; + QCowHeader header; + uint8_t *tmp; + int64_t total_size = 0; + const char *backing_file = NULL; + int flags = 0; + int ret; + BlockDriverState *qcow_bs; + + /* Read out options */ + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + total_size = options->value.n / 512; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { + backing_file = options->value.s; + } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) { + flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0; + } + options++; + } + + ret = bdrv_create_file(filename, options); + if (ret < 0) { + return ret; + } + + ret = bdrv_file_open(&qcow_bs, filename, BDRV_O_RDWR); + if (ret < 0) { + return ret; + } + + ret = bdrv_truncate(qcow_bs, 0); + if (ret < 0) { + goto exit; + } + + memset(&header, 0, sizeof(header)); + header.magic = cpu_to_be32(QCOW_MAGIC); + header.version = cpu_to_be32(QCOW_VERSION); + header.size = cpu_to_be64(total_size * 512); + header_size = sizeof(header); + backing_filename_len = 0; + if (backing_file) { + if (strcmp(backing_file, "fat:")) { + header.backing_file_offset = cpu_to_be64(header_size); + backing_filename_len = strlen(backing_file); + header.backing_file_size = cpu_to_be32(backing_filename_len); + header_size += backing_filename_len; + } else { + /* special backing file for vvfat */ + backing_file = NULL; + } + header.cluster_bits = 9; /* 512 byte cluster to avoid copying + unmodifyed sectors */ + header.l2_bits = 12; /* 32 KB L2 tables */ + } else { + header.cluster_bits = 12; /* 4 KB clusters */ + header.l2_bits = 9; /* 4 KB L2 tables */ + } + header_size = (header_size + 7) & ~7; + shift = header.cluster_bits + header.l2_bits; + l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift; + + header.l1_table_offset = cpu_to_be64(header_size); + if (flags & BLOCK_FLAG_ENCRYPT) { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); + } else { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); + } + + /* write all the data */ + ret = bdrv_pwrite(qcow_bs, 0, &header, sizeof(header)); + if (ret != sizeof(header)) { + goto exit; + } + + if (backing_file) { + ret = bdrv_pwrite(qcow_bs, sizeof(header), + backing_file, backing_filename_len); + if (ret != backing_filename_len) { + goto exit; + } + } + + tmp = g_malloc0(BDRV_SECTOR_SIZE); + for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/ + BDRV_SECTOR_SIZE); i++) { + ret = bdrv_pwrite(qcow_bs, header_size + + BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE); + if (ret != BDRV_SECTOR_SIZE) { + g_free(tmp); + goto exit; + } + } + + g_free(tmp); + ret = 0; +exit: + bdrv_delete(qcow_bs); + return ret; +} + +static int qcow_make_empty(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + uint32_t l1_length = s->l1_size * sizeof(uint64_t); + int ret; + + memset(s->l1_table, 0, l1_length); + if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table, + l1_length) < 0) + return -1; + ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length); + if (ret < 0) + return ret; + + memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); + memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t)); + memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t)); + + return 0; +} + +/* XXX: put compressed sectors first, then all the cluster aligned + tables to avoid losing bytes in alignment */ +static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + z_stream strm; + int ret, out_len; + uint8_t *out_buf; + uint64_t cluster_offset; + + if (nb_sectors != s->cluster_sectors) + return -EINVAL; + + out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + + /* best compression, small window, no zlib header */ + memset(&strm, 0, sizeof(strm)); + ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, -12, + 9, Z_DEFAULT_STRATEGY); + if (ret != 0) { + ret = -EINVAL; + goto fail; + } + + strm.avail_in = s->cluster_size; + strm.next_in = (uint8_t *)buf; + strm.avail_out = s->cluster_size; + strm.next_out = out_buf; + + ret = deflate(&strm, Z_FINISH); + if (ret != Z_STREAM_END && ret != Z_OK) { + deflateEnd(&strm); + ret = -EINVAL; + goto fail; + } + out_len = strm.next_out - out_buf; + + deflateEnd(&strm); + + if (ret != Z_STREAM_END || out_len >= s->cluster_size) { + /* could not compress: write normal cluster */ + ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); + if (ret < 0) { + goto fail; + } + } else { + cluster_offset = get_cluster_offset(bs, sector_num << 9, 2, + out_len, 0, 0); + if (cluster_offset == 0) { + ret = -EIO; + goto fail; + } + + cluster_offset &= s->cluster_offset_mask; + ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); + if (ret < 0) { + goto fail; + } + } + + ret = 0; +fail: + g_free(out_buf); + return ret; +} + +static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVQcowState *s = bs->opaque; + bdi->cluster_size = s->cluster_size; + return 0; +} + + +static QEMUOptionParameter qcow_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = OPT_STRING, + .help = "File name of a base image" + }, + { + .name = BLOCK_OPT_ENCRYPT, + .type = OPT_FLAG, + .help = "Encrypt the image" + }, + { NULL } +}; + +static BlockDriver bdrv_qcow = { + .format_name = "qcow", + .instance_size = sizeof(BDRVQcowState), + .bdrv_probe = qcow_probe, + .bdrv_open = qcow_open, + .bdrv_close = qcow_close, + .bdrv_create = qcow_create, + + .bdrv_co_readv = qcow_co_readv, + .bdrv_co_writev = qcow_co_writev, + .bdrv_co_is_allocated = qcow_co_is_allocated, + + .bdrv_set_key = qcow_set_key, + .bdrv_make_empty = qcow_make_empty, + .bdrv_write_compressed = qcow_write_compressed, + .bdrv_get_info = qcow_get_info, + + .create_options = qcow_create_options, +}; + +static void bdrv_qcow_init(void) +{ + bdrv_register(&bdrv_qcow); +} + +block_init(bdrv_qcow_init); diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c new file mode 100644 index 000000000..2d4322a8d --- /dev/null +++ b/block/qcow2-cache.c @@ -0,0 +1,323 @@ +/* + * L2/refcount table cache for the QCOW2 format + * + * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "block_int.h" +#include "qemu-common.h" +#include "qcow2.h" +#include "trace.h" + +typedef struct Qcow2CachedTable { + void* table; + int64_t offset; + bool dirty; + int cache_hits; + int ref; +} Qcow2CachedTable; + +struct Qcow2Cache { + Qcow2CachedTable* entries; + struct Qcow2Cache* depends; + int size; + bool depends_on_flush; +}; + +Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables) +{ + BDRVQcowState *s = bs->opaque; + Qcow2Cache *c; + int i; + + c = g_malloc0(sizeof(*c)); + c->size = num_tables; + c->entries = g_malloc0(sizeof(*c->entries) * num_tables); + + for (i = 0; i < c->size; i++) { + c->entries[i].table = qemu_blockalign(bs, s->cluster_size); + } + + return c; +} + +int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c) +{ + int i; + + for (i = 0; i < c->size; i++) { + assert(c->entries[i].ref == 0); + qemu_vfree(c->entries[i].table); + } + + g_free(c->entries); + g_free(c); + + return 0; +} + +static int qcow2_cache_flush_dependency(BlockDriverState *bs, Qcow2Cache *c) +{ + int ret; + + ret = qcow2_cache_flush(bs, c->depends); + if (ret < 0) { + return ret; + } + + c->depends = NULL; + c->depends_on_flush = false; + + return 0; +} + +static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) +{ + BDRVQcowState *s = bs->opaque; + int ret = 0; + + if (!c->entries[i].dirty || !c->entries[i].offset) { + return 0; + } + + trace_qcow2_cache_entry_flush(qemu_coroutine_self(), + c == s->l2_table_cache, i); + + if (c->depends) { + ret = qcow2_cache_flush_dependency(bs, c); + } else if (c->depends_on_flush) { + ret = bdrv_flush(bs->file); + if (ret >= 0) { + c->depends_on_flush = false; + } + } + + if (ret < 0) { + return ret; + } + + if (c == s->refcount_block_cache) { + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART); + } else if (c == s->l2_table_cache) { + BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); + } + + ret = bdrv_pwrite(bs->file, c->entries[i].offset, c->entries[i].table, + s->cluster_size); + if (ret < 0) { + return ret; + } + + c->entries[i].dirty = false; + + return 0; +} + +int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) +{ + BDRVQcowState *s = bs->opaque; + int result = 0; + int ret; + int i; + + trace_qcow2_cache_flush(qemu_coroutine_self(), c == s->l2_table_cache); + + for (i = 0; i < c->size; i++) { + ret = qcow2_cache_entry_flush(bs, c, i); + if (ret < 0 && result != -ENOSPC) { + result = ret; + } + } + + if (result == 0) { + ret = bdrv_flush(bs->file); + if (ret < 0) { + result = ret; + } + } + + return result; +} + +int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, + Qcow2Cache *dependency) +{ + int ret; + + if (dependency->depends) { + ret = qcow2_cache_flush_dependency(bs, dependency); + if (ret < 0) { + return ret; + } + } + + if (c->depends && (c->depends != dependency)) { + ret = qcow2_cache_flush_dependency(bs, c); + if (ret < 0) { + return ret; + } + } + + c->depends = dependency; + return 0; +} + +void qcow2_cache_depends_on_flush(Qcow2Cache *c) +{ + c->depends_on_flush = true; +} + +static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c) +{ + int i; + int min_count = INT_MAX; + int min_index = -1; + + + for (i = 0; i < c->size; i++) { + if (c->entries[i].ref) { + continue; + } + + if (c->entries[i].cache_hits < min_count) { + min_index = i; + min_count = c->entries[i].cache_hits; + } + + /* Give newer hits priority */ + /* TODO Check how to optimize the replacement strategy */ + c->entries[i].cache_hits /= 2; + } + + if (min_index == -1) { + /* This can't happen in current synchronous code, but leave the check + * here as a reminder for whoever starts using AIO with the cache */ + abort(); + } + return min_index; +} + +static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, + uint64_t offset, void **table, bool read_from_disk) +{ + BDRVQcowState *s = bs->opaque; + int i; + int ret; + + trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache, + offset, read_from_disk); + + /* Check if the table is already cached */ + for (i = 0; i < c->size; i++) { + if (c->entries[i].offset == offset) { + goto found; + } + } + + /* If not, write a table back and replace it */ + i = qcow2_cache_find_entry_to_replace(c); + trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(), + c == s->l2_table_cache, i); + if (i < 0) { + return i; + } + + ret = qcow2_cache_entry_flush(bs, c, i); + if (ret < 0) { + return ret; + } + + trace_qcow2_cache_get_read(qemu_coroutine_self(), + c == s->l2_table_cache, i); + c->entries[i].offset = 0; + if (read_from_disk) { + if (c == s->l2_table_cache) { + BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); + } + + ret = bdrv_pread(bs->file, offset, c->entries[i].table, s->cluster_size); + if (ret < 0) { + return ret; + } + } + + /* Give the table some hits for the start so that it won't be replaced + * immediately. The number 32 is completely arbitrary. */ + c->entries[i].cache_hits = 32; + c->entries[i].offset = offset; + + /* And return the right table */ +found: + c->entries[i].cache_hits++; + c->entries[i].ref++; + *table = c->entries[i].table; + + trace_qcow2_cache_get_done(qemu_coroutine_self(), + c == s->l2_table_cache, i); + + return 0; +} + +int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, + void **table) +{ + return qcow2_cache_do_get(bs, c, offset, table, true); +} + +int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, + void **table) +{ + return qcow2_cache_do_get(bs, c, offset, table, false); +} + +int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table) +{ + int i; + + for (i = 0; i < c->size; i++) { + if (c->entries[i].table == *table) { + goto found; + } + } + return -ENOENT; + +found: + c->entries[i].ref--; + *table = NULL; + + assert(c->entries[i].ref >= 0); + return 0; +} + +void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table) +{ + int i; + + for (i = 0; i < c->size; i++) { + if (c->entries[i].table == table) { + goto found; + } + } + abort(); + +found: + c->entries[i].dirty = true; +} diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c new file mode 100644 index 000000000..e179211c5 --- /dev/null +++ b/block/qcow2-cluster.c @@ -0,0 +1,1199 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <zlib.h> + +#include "qemu-common.h" +#include "block_int.h" +#include "block/qcow2.h" +#include "trace.h" + +int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size) +{ + BDRVQcowState *s = bs->opaque; + int new_l1_size, new_l1_size2, ret, i; + uint64_t *new_l1_table; + int64_t new_l1_table_offset; + uint8_t data[12]; + + if (min_size <= s->l1_size) + return 0; + + if (exact_size) { + new_l1_size = min_size; + } else { + /* Bump size up to reduce the number of times we have to grow */ + new_l1_size = s->l1_size; + if (new_l1_size == 0) { + new_l1_size = 1; + } + while (min_size > new_l1_size) { + new_l1_size = (new_l1_size * 3 + 1) / 2; + } + } + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "grow l1_table from %d to %d\n", s->l1_size, new_l1_size); +#endif + + new_l1_size2 = sizeof(uint64_t) * new_l1_size; + new_l1_table = g_malloc0(align_offset(new_l1_size2, 512)); + memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t)); + + /* write new table (align to cluster) */ + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE); + new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2); + if (new_l1_table_offset < 0) { + g_free(new_l1_table); + return new_l1_table_offset; + } + + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail; + } + + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); + for(i = 0; i < s->l1_size; i++) + new_l1_table[i] = cpu_to_be64(new_l1_table[i]); + ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_table, new_l1_size2); + if (ret < 0) + goto fail; + for(i = 0; i < s->l1_size; i++) + new_l1_table[i] = be64_to_cpu(new_l1_table[i]); + + /* set new table */ + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE); + cpu_to_be32w((uint32_t*)data, new_l1_size); + cpu_to_be64wu((uint64_t*)(data + 4), new_l1_table_offset); + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data,sizeof(data)); + if (ret < 0) { + goto fail; + } + g_free(s->l1_table); + qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t)); + s->l1_table_offset = new_l1_table_offset; + s->l1_table = new_l1_table; + s->l1_size = new_l1_size; + return 0; + fail: + g_free(new_l1_table); + qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2); + return ret; +} + +/* + * l2_load + * + * Loads a L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns a pointer to the L2 table on success, or NULL if the read from + * the image file failed. + */ + +static int l2_load(BlockDriverState *bs, uint64_t l2_offset, + uint64_t **l2_table) +{ + BDRVQcowState *s = bs->opaque; + int ret; + + ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table); + + return ret; +} + +/* + * Writes one sector of the L1 table to the disk (can't update single entries + * and we really don't want bdrv_pread to perform a read-modify-write) + */ +#define L1_ENTRIES_PER_SECTOR (512 / 8) +static int write_l1_entry(BlockDriverState *bs, int l1_index) +{ + BDRVQcowState *s = bs->opaque; + uint64_t buf[L1_ENTRIES_PER_SECTOR]; + int l1_start_index; + int i, ret; + + l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1); + for (i = 0; i < L1_ENTRIES_PER_SECTOR; i++) { + buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]); + } + + BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); + ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index, + buf, sizeof(buf)); + if (ret < 0) { + return ret; + } + + return 0; +} + +/* + * l2_allocate + * + * Allocate a new l2 entry in the file. If l1_index points to an already + * used entry in the L2 table (i.e. we are doing a copy on write for the L2 + * table) copy the contents of the old L2 table into the newly allocated one. + * Otherwise the new table is initialized with zeros. + * + */ + +static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) +{ + BDRVQcowState *s = bs->opaque; + uint64_t old_l2_offset; + uint64_t *l2_table; + int64_t l2_offset; + int ret; + + old_l2_offset = s->l1_table[l1_index]; + + trace_qcow2_l2_allocate(bs, l1_index); + + /* allocate a new l2 entry */ + + l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t)); + if (l2_offset < 0) { + return l2_offset; + } + + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail; + } + + /* allocate a new entry in the l2 cache */ + + trace_qcow2_l2_allocate_get_empty(bs, l1_index); + ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table); + if (ret < 0) { + return ret; + } + + l2_table = *table; + + if ((old_l2_offset & L1E_OFFSET_MASK) == 0) { + /* if there was no old l2 table, clear the new table */ + memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); + } else { + uint64_t* old_table; + + /* if there was an old l2 table, read it from the disk */ + BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ); + ret = qcow2_cache_get(bs, s->l2_table_cache, + old_l2_offset & L1E_OFFSET_MASK, + (void**) &old_table); + if (ret < 0) { + goto fail; + } + + memcpy(l2_table, old_table, s->cluster_size); + + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &old_table); + if (ret < 0) { + goto fail; + } + } + + /* write the l2 table to the file */ + BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE); + + trace_qcow2_l2_allocate_write_l2(bs, l1_index); + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + ret = qcow2_cache_flush(bs, s->l2_table_cache); + if (ret < 0) { + goto fail; + } + + /* update the L1 entry */ + trace_qcow2_l2_allocate_write_l1(bs, l1_index); + s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED; + ret = write_l1_entry(bs, l1_index); + if (ret < 0) { + goto fail; + } + + *table = l2_table; + trace_qcow2_l2_allocate_done(bs, l1_index, 0); + return 0; + +fail: + trace_qcow2_l2_allocate_done(bs, l1_index, ret); + qcow2_cache_put(bs, s->l2_table_cache, (void**) table); + s->l1_table[l1_index] = old_l2_offset; + return ret; +} + +/* + * Checks how many clusters in a given L2 table are contiguous in the image + * file. As soon as one of the flags in the bitmask stop_flags changes compared + * to the first cluster, the search is stopped and the cluster is not counted + * as contiguous. (This allows it, for example, to stop at the first compressed + * cluster which may require a different handling) + */ +static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size, + uint64_t *l2_table, uint64_t start, uint64_t stop_flags) +{ + int i; + uint64_t mask = stop_flags | L2E_OFFSET_MASK; + uint64_t offset = be64_to_cpu(l2_table[0]) & mask; + + if (!offset) + return 0; + + for (i = start; i < start + nb_clusters; i++) { + uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask; + if (offset + (uint64_t) i * cluster_size != l2_entry) { + break; + } + } + + return (i - start); +} + +static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table) +{ + int i; + + for (i = 0; i < nb_clusters; i++) { + int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i])); + + if (type != QCOW2_CLUSTER_UNALLOCATED) { + break; + } + } + + return i; +} + +/* The crypt function is compatible with the linux cryptoloop + algorithm for < 4 GB images. NOTE: out_buf == in_buf is + supported */ +void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, int enc, + const AES_KEY *key) +{ + union { + uint64_t ll[2]; + uint8_t b[16]; + } ivec; + int i; + + for(i = 0; i < nb_sectors; i++) { + ivec.ll[0] = cpu_to_le64(sector_num); + ivec.ll[1] = 0; + AES_cbc_encrypt(in_buf, out_buf, 512, key, + ivec.b, enc); + sector_num++; + in_buf += 512; + out_buf += 512; + } +} + +static int coroutine_fn copy_sectors(BlockDriverState *bs, + uint64_t start_sect, + uint64_t cluster_offset, + int n_start, int n_end) +{ + BDRVQcowState *s = bs->opaque; + QEMUIOVector qiov; + struct iovec iov; + int n, ret; + + /* + * If this is the last cluster and it is only partially used, we must only + * copy until the end of the image, or bdrv_check_request will fail for the + * bdrv_read/write calls below. + */ + if (start_sect + n_end > bs->total_sectors) { + n_end = bs->total_sectors - start_sect; + } + + n = n_end - n_start; + if (n <= 0) { + return 0; + } + + iov.iov_len = n * BDRV_SECTOR_SIZE; + iov.iov_base = qemu_blockalign(bs, iov.iov_len); + + qemu_iovec_init_external(&qiov, &iov, 1); + + BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); + + /* Call .bdrv_co_readv() directly instead of using the public block-layer + * interface. This avoids double I/O throttling and request tracking, + * which can lead to deadlock when block layer copy-on-read is enabled. + */ + ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov); + if (ret < 0) { + goto out; + } + + if (s->crypt_method) { + qcow2_encrypt_sectors(s, start_sect + n_start, + iov.iov_base, iov.iov_base, n, 1, + &s->aes_encrypt_key); + } + + BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); + ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov); + if (ret < 0) { + goto out; + } + + ret = 0; +out: + qemu_vfree(iov.iov_base); + return ret; +} + + +/* + * get_cluster_offset + * + * For a given offset of the disk image, find the cluster offset in + * qcow2 file. The offset is stored in *cluster_offset. + * + * on entry, *num is the number of contiguous sectors we'd like to + * access following offset. + * + * on exit, *num is the number of contiguous sectors we can read. + * + * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error + * cases. + */ +int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, + int *num, uint64_t *cluster_offset) +{ + BDRVQcowState *s = bs->opaque; + unsigned int l1_index, l2_index; + uint64_t l2_offset, *l2_table; + int l1_bits, c; + unsigned int index_in_cluster, nb_clusters; + uint64_t nb_available, nb_needed; + int ret; + + index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1); + nb_needed = *num + index_in_cluster; + + l1_bits = s->l2_bits + s->cluster_bits; + + /* compute how many bytes there are between the offset and + * the end of the l1 entry + */ + + nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1)); + + /* compute the number of available sectors */ + + nb_available = (nb_available >> 9) + index_in_cluster; + + if (nb_needed > nb_available) { + nb_needed = nb_available; + } + + *cluster_offset = 0; + + /* seek the the l2 offset in the l1 table */ + + l1_index = offset >> l1_bits; + if (l1_index >= s->l1_size) { + ret = QCOW2_CLUSTER_UNALLOCATED; + goto out; + } + + l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; + if (!l2_offset) { + ret = QCOW2_CLUSTER_UNALLOCATED; + goto out; + } + + /* load the l2 table in memory */ + + ret = l2_load(bs, l2_offset, &l2_table); + if (ret < 0) { + return ret; + } + + /* find the cluster offset for the given disk offset */ + + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + *cluster_offset = be64_to_cpu(l2_table[l2_index]); + nb_clusters = size_to_clusters(s, nb_needed << 9); + + ret = qcow2_get_cluster_type(*cluster_offset); + switch (ret) { + case QCOW2_CLUSTER_COMPRESSED: + /* Compressed clusters can only be processed one by one */ + c = 1; + *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK; + break; + case QCOW2_CLUSTER_ZERO: + c = count_contiguous_clusters(nb_clusters, s->cluster_size, + &l2_table[l2_index], 0, + QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); + *cluster_offset = 0; + break; + case QCOW2_CLUSTER_UNALLOCATED: + /* how many empty clusters ? */ + c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]); + *cluster_offset = 0; + break; + case QCOW2_CLUSTER_NORMAL: + /* how many allocated clusters ? */ + c = count_contiguous_clusters(nb_clusters, s->cluster_size, + &l2_table[l2_index], 0, + QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); + *cluster_offset &= L2E_OFFSET_MASK; + break; + default: + abort(); + } + + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + + nb_available = (c * s->cluster_sectors); + +out: + if (nb_available > nb_needed) + nb_available = nb_needed; + + *num = nb_available - index_in_cluster; + + return ret; +} + +/* + * get_cluster_table + * + * for a given disk offset, load (and allocate if needed) + * the l2 table. + * + * the l2 table offset in the qcow2 file and the cluster index + * in the l2 table are given to the caller. + * + * Returns 0 on success, -errno in failure case + */ +static int get_cluster_table(BlockDriverState *bs, uint64_t offset, + uint64_t **new_l2_table, + int *new_l2_index) +{ + BDRVQcowState *s = bs->opaque; + unsigned int l1_index, l2_index; + uint64_t l2_offset; + uint64_t *l2_table = NULL; + int ret; + + /* seek the the l2 offset in the l1 table */ + + l1_index = offset >> (s->l2_bits + s->cluster_bits); + if (l1_index >= s->l1_size) { + ret = qcow2_grow_l1_table(bs, l1_index + 1, false); + if (ret < 0) { + return ret; + } + } + + l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; + + /* seek the l2 table of the given l2 offset */ + + if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) { + /* load the l2 table in memory */ + ret = l2_load(bs, l2_offset, &l2_table); + if (ret < 0) { + return ret; + } + } else { + /* First allocate a new L2 table (and do COW if needed) */ + ret = l2_allocate(bs, l1_index, &l2_table); + if (ret < 0) { + return ret; + } + + /* Then decrease the refcount of the old table */ + if (l2_offset) { + qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t)); + } + } + + /* find the cluster offset for the given disk offset */ + + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + + *new_l2_table = l2_table; + *new_l2_index = l2_index; + + return 0; +} + +/* + * alloc_compressed_cluster_offset + * + * For a given offset of the disk image, return cluster offset in + * qcow2 file. + * + * If the offset is not found, allocate a new compressed cluster. + * + * Return the cluster offset if successful, + * Return 0, otherwise. + * + */ + +uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, + uint64_t offset, + int compressed_size) +{ + BDRVQcowState *s = bs->opaque; + int l2_index, ret; + uint64_t *l2_table; + int64_t cluster_offset; + int nb_csectors; + + ret = get_cluster_table(bs, offset, &l2_table, &l2_index); + if (ret < 0) { + return 0; + } + + /* Compression can't overwrite anything. Fail if the cluster was already + * allocated. */ + cluster_offset = be64_to_cpu(l2_table[l2_index]); + if (cluster_offset & L2E_OFFSET_MASK) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + return 0; + } + + cluster_offset = qcow2_alloc_bytes(bs, compressed_size); + if (cluster_offset < 0) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + return 0; + } + + nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) - + (cluster_offset >> 9); + + cluster_offset |= QCOW_OFLAG_COMPRESSED | + ((uint64_t)nb_csectors << s->csize_shift); + + /* update L2 table */ + + /* compressed clusters never have the copied flag */ + + BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED); + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + l2_table[l2_index] = cpu_to_be64(cluster_offset); + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + return 0; + } + + return cluster_offset; +} + +int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) +{ + BDRVQcowState *s = bs->opaque; + int i, j = 0, l2_index, ret; + uint64_t *old_cluster, start_sect, *l2_table; + uint64_t cluster_offset = m->alloc_offset; + bool cow = false; + + trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters); + + if (m->nb_clusters == 0) + return 0; + + old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t)); + + /* copy content of unmodified sectors */ + start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9; + if (m->n_start) { + cow = true; + qemu_co_mutex_unlock(&s->lock); + ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) + goto err; + } + + if (m->nb_available & (s->cluster_sectors - 1)) { + cow = true; + qemu_co_mutex_unlock(&s->lock); + ret = copy_sectors(bs, start_sect, cluster_offset, m->nb_available, + align_offset(m->nb_available, s->cluster_sectors)); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) + goto err; + } + + /* + * Update L2 table. + * + * Before we update the L2 table to actually point to the new cluster, we + * need to be sure that the refcounts have been increased and COW was + * handled. + */ + if (cow) { + qcow2_cache_depends_on_flush(s->l2_table_cache); + } + + if (qcow2_need_accurate_refcounts(s)) { + qcow2_cache_set_dependency(bs, s->l2_table_cache, + s->refcount_block_cache); + } + ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index); + if (ret < 0) { + goto err; + } + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + + for (i = 0; i < m->nb_clusters; i++) { + /* if two concurrent writes happen to the same unallocated cluster + * each write allocates separate cluster and writes data concurrently. + * The first one to complete updates l2 table with pointer to its + * cluster the second one has to do RMW (which is done above by + * copy_sectors()), update l2 table with its cluster pointer and free + * old cluster. This is what this loop does */ + if(l2_table[l2_index + i] != 0) + old_cluster[j++] = l2_table[l2_index + i]; + + l2_table[l2_index + i] = cpu_to_be64((cluster_offset + + (i << s->cluster_bits)) | QCOW_OFLAG_COPIED); + } + + + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + goto err; + } + + /* + * If this was a COW, we need to decrease the refcount of the old cluster. + * Also flush bs->file to get the right order for L2 and refcount update. + */ + if (j != 0) { + for (i = 0; i < j; i++) { + qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1); + } + } + + ret = 0; +err: + g_free(old_cluster); + return ret; + } + +/* + * Returns the number of contiguous clusters that can be used for an allocating + * write, but require COW to be performed (this includes yet unallocated space, + * which must copy from the backing file) + */ +static int count_cow_clusters(BDRVQcowState *s, int nb_clusters, + uint64_t *l2_table, int l2_index) +{ + int i; + + for (i = 0; i < nb_clusters; i++) { + uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]); + int cluster_type = qcow2_get_cluster_type(l2_entry); + + switch(cluster_type) { + case QCOW2_CLUSTER_NORMAL: + if (l2_entry & QCOW_OFLAG_COPIED) { + goto out; + } + break; + case QCOW2_CLUSTER_UNALLOCATED: + case QCOW2_CLUSTER_COMPRESSED: + case QCOW2_CLUSTER_ZERO: + break; + default: + abort(); + } + } + +out: + assert(i <= nb_clusters); + return i; +} + +/* + * Allocates new clusters for the given guest_offset. + * + * At most *nb_clusters are allocated, and on return *nb_clusters is updated to + * contain the number of clusters that have been allocated and are contiguous + * in the image file. + * + * If *host_offset is non-zero, it specifies the offset in the image file at + * which the new clusters must start. *nb_clusters can be 0 on return in this + * case if the cluster at host_offset is already in use. If *host_offset is + * zero, the clusters can be allocated anywhere in the image file. + * + * *host_offset is updated to contain the offset into the image file at which + * the first allocated cluster starts. + * + * Return 0 on success and -errno in error cases. -EAGAIN means that the + * function has been waiting for another request and the allocation must be + * restarted, but the whole request should not be failed. + */ +static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, + uint64_t *host_offset, unsigned int *nb_clusters) +{ + BDRVQcowState *s = bs->opaque; + QCowL2Meta *old_alloc; + + trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset, + *host_offset, *nb_clusters); + + /* + * Check if there already is an AIO write request in flight which allocates + * the same cluster. In this case we need to wait until the previous + * request has completed and updated the L2 table accordingly. + */ + QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) { + + uint64_t start = guest_offset >> s->cluster_bits; + uint64_t end = start + *nb_clusters; + uint64_t old_start = old_alloc->offset >> s->cluster_bits; + uint64_t old_end = old_start + old_alloc->nb_clusters; + + if (end < old_start || start > old_end) { + /* No intersection */ + } else { + if (start < old_start) { + /* Stop at the start of a running allocation */ + *nb_clusters = old_start - start; + } else { + *nb_clusters = 0; + } + + if (*nb_clusters == 0) { + /* Wait for the dependency to complete. We need to recheck + * the free/allocated clusters when we continue. */ + qemu_co_mutex_unlock(&s->lock); + qemu_co_queue_wait(&old_alloc->dependent_requests); + qemu_co_mutex_lock(&s->lock); + return -EAGAIN; + } + } + } + + if (!*nb_clusters) { + abort(); + } + + /* Allocate new clusters */ + trace_qcow2_cluster_alloc_phys(qemu_coroutine_self()); + if (*host_offset == 0) { + int64_t cluster_offset = + qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size); + if (cluster_offset < 0) { + return cluster_offset; + } + *host_offset = cluster_offset; + return 0; + } else { + int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters); + if (ret < 0) { + return ret; + } + *nb_clusters = ret; + return 0; + } +} + +/* + * alloc_cluster_offset + * + * For a given offset on the virtual disk, find the cluster offset in qcow2 + * file. If the offset is not found, allocate a new cluster. + * + * If the cluster was already allocated, m->nb_clusters is set to 0 and + * other fields in m are meaningless. + * + * If the cluster is newly allocated, m->nb_clusters is set to the number of + * contiguous clusters that have been allocated. In this case, the other + * fields of m are valid and contain information about the first allocated + * cluster. + * + * If the request conflicts with another write request in flight, the coroutine + * is queued and will be reentered when the dependency has completed. + * + * Return 0 on success and -errno in error cases + */ +int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, + int n_start, int n_end, int *num, QCowL2Meta *m) +{ + BDRVQcowState *s = bs->opaque; + int l2_index, ret, sectors; + uint64_t *l2_table; + unsigned int nb_clusters, keep_clusters; + uint64_t cluster_offset; + + trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, + n_start, n_end); + + /* Find L2 entry for the first involved cluster */ +again: + ret = get_cluster_table(bs, offset, &l2_table, &l2_index); + if (ret < 0) { + return ret; + } + + /* + * Calculate the number of clusters to look for. We stop at L2 table + * boundaries to keep things simple. + */ + nb_clusters = MIN(size_to_clusters(s, n_end << BDRV_SECTOR_BITS), + s->l2_size - l2_index); + + cluster_offset = be64_to_cpu(l2_table[l2_index]); + + /* + * Check how many clusters are already allocated and don't need COW, and how + * many need a new allocation. + */ + if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL + && (cluster_offset & QCOW_OFLAG_COPIED)) + { + /* We keep all QCOW_OFLAG_COPIED clusters */ + keep_clusters = + count_contiguous_clusters(nb_clusters, s->cluster_size, + &l2_table[l2_index], 0, + QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO); + assert(keep_clusters <= nb_clusters); + nb_clusters -= keep_clusters; + } else { + keep_clusters = 0; + cluster_offset = 0; + } + + if (nb_clusters > 0) { + /* For the moment, overwrite compressed clusters one by one */ + uint64_t entry = be64_to_cpu(l2_table[l2_index + keep_clusters]); + if (entry & QCOW_OFLAG_COMPRESSED) { + nb_clusters = 1; + } else { + nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, + l2_index + keep_clusters); + } + } + + cluster_offset &= L2E_OFFSET_MASK; + + /* + * The L2 table isn't used any more after this. As long as the cache works + * synchronously, it's important to release it before calling + * do_alloc_cluster_offset, which may yield if we need to wait for another + * request to complete. If we still had the reference, we could use up the + * whole cache with sleeping requests. + */ + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + return ret; + } + + /* If there is something left to allocate, do that now */ + *m = (QCowL2Meta) { + .cluster_offset = cluster_offset, + .nb_clusters = 0, + }; + qemu_co_queue_init(&m->dependent_requests); + + if (nb_clusters > 0) { + uint64_t alloc_offset; + uint64_t alloc_cluster_offset; + uint64_t keep_bytes = keep_clusters * s->cluster_size; + + /* Calculate start and size of allocation */ + alloc_offset = offset + keep_bytes; + + if (keep_clusters == 0) { + alloc_cluster_offset = 0; + } else { + alloc_cluster_offset = cluster_offset + keep_bytes; + } + + /* Allocate, if necessary at a given offset in the image file */ + ret = do_alloc_cluster_offset(bs, alloc_offset, &alloc_cluster_offset, + &nb_clusters); + if (ret == -EAGAIN) { + goto again; + } else if (ret < 0) { + goto fail; + } + + /* save info needed for meta data update */ + if (nb_clusters > 0) { + /* + * requested_sectors: Number of sectors from the start of the first + * newly allocated cluster to the end of the (possibly shortened + * before) write request. + * + * avail_sectors: Number of sectors from the start of the first + * newly allocated to the end of the last newly allocated cluster. + */ + int requested_sectors = n_end - keep_clusters * s->cluster_sectors; + int avail_sectors = nb_clusters + << (s->cluster_bits - BDRV_SECTOR_BITS); + + *m = (QCowL2Meta) { + .cluster_offset = keep_clusters == 0 ? + alloc_cluster_offset : cluster_offset, + .alloc_offset = alloc_cluster_offset, + .offset = alloc_offset, + .n_start = keep_clusters == 0 ? n_start : 0, + .nb_clusters = nb_clusters, + .nb_available = MIN(requested_sectors, avail_sectors), + }; + qemu_co_queue_init(&m->dependent_requests); + QLIST_INSERT_HEAD(&s->cluster_allocs, m, next_in_flight); + } + } + + /* Some cleanup work */ + sectors = (keep_clusters + nb_clusters) << (s->cluster_bits - 9); + if (sectors > n_end) { + sectors = n_end; + } + + assert(sectors > n_start); + *num = sectors - n_start; + + return 0; + +fail: + if (m->nb_clusters > 0) { + QLIST_REMOVE(m, next_in_flight); + } + return ret; +} + +static int decompress_buffer(uint8_t *out_buf, int out_buf_size, + const uint8_t *buf, int buf_size) +{ + z_stream strm1, *strm = &strm1; + int ret, out_len; + + memset(strm, 0, sizeof(*strm)); + + strm->next_in = (uint8_t *)buf; + strm->avail_in = buf_size; + strm->next_out = out_buf; + strm->avail_out = out_buf_size; + + ret = inflateInit2(strm, -12); + if (ret != Z_OK) + return -1; + ret = inflate(strm, Z_FINISH); + out_len = strm->next_out - out_buf; + if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || + out_len != out_buf_size) { + inflateEnd(strm); + return -1; + } + inflateEnd(strm); + return 0; +} + +int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) +{ + BDRVQcowState *s = bs->opaque; + int ret, csize, nb_csectors, sector_offset; + uint64_t coffset; + + coffset = cluster_offset & s->cluster_offset_mask; + if (s->cluster_cache_offset != coffset) { + nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1; + sector_offset = coffset & 511; + csize = nb_csectors * 512 - sector_offset; + BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); + ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, nb_csectors); + if (ret < 0) { + return ret; + } + if (decompress_buffer(s->cluster_cache, s->cluster_size, + s->cluster_data + sector_offset, csize) < 0) { + return -EIO; + } + s->cluster_cache_offset = coffset; + } + return 0; +} + +/* + * This discards as many clusters of nb_clusters as possible at once (i.e. + * all clusters in the same L2 table) and returns the number of discarded + * clusters. + */ +static int discard_single_l2(BlockDriverState *bs, uint64_t offset, + unsigned int nb_clusters) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l2_table; + int l2_index; + int ret; + int i; + + ret = get_cluster_table(bs, offset, &l2_table, &l2_index); + if (ret < 0) { + return ret; + } + + /* Limit nb_clusters to one L2 table */ + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + + for (i = 0; i < nb_clusters; i++) { + uint64_t old_offset; + + old_offset = be64_to_cpu(l2_table[l2_index + i]); + if ((old_offset & L2E_OFFSET_MASK) == 0) { + continue; + } + + /* First remove L2 entries */ + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + l2_table[l2_index + i] = cpu_to_be64(0); + + /* Then decrease the refcount */ + qcow2_free_any_clusters(bs, old_offset, 1); + } + + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + return ret; + } + + return nb_clusters; +} + +int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, + int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + uint64_t end_offset; + unsigned int nb_clusters; + int ret; + + end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS); + + /* Round start up and end down */ + offset = align_offset(offset, s->cluster_size); + end_offset &= ~(s->cluster_size - 1); + + if (offset > end_offset) { + return 0; + } + + nb_clusters = size_to_clusters(s, end_offset - offset); + + /* Each L2 table is handled by its own loop iteration */ + while (nb_clusters > 0) { + ret = discard_single_l2(bs, offset, nb_clusters); + if (ret < 0) { + return ret; + } + + nb_clusters -= ret; + offset += (ret * s->cluster_size); + } + + return 0; +} + +/* + * This zeroes as many clusters of nb_clusters as possible at once (i.e. + * all clusters in the same L2 table) and returns the number of zeroed + * clusters. + */ +static int zero_single_l2(BlockDriverState *bs, uint64_t offset, + unsigned int nb_clusters) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l2_table; + int l2_index; + int ret; + int i; + + ret = get_cluster_table(bs, offset, &l2_table, &l2_index); + if (ret < 0) { + return ret; + } + + /* Limit nb_clusters to one L2 table */ + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + + for (i = 0; i < nb_clusters; i++) { + uint64_t old_offset; + + old_offset = be64_to_cpu(l2_table[l2_index + i]); + + /* Update L2 entries */ + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + if (old_offset & QCOW_OFLAG_COMPRESSED) { + l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); + qcow2_free_any_clusters(bs, old_offset, 1); + } else { + l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO); + } + } + + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + return ret; + } + + return nb_clusters; +} + +int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + unsigned int nb_clusters; + int ret; + + /* The zero flag is only supported by version 3 and newer */ + if (s->qcow_version < 3) { + return -ENOTSUP; + } + + /* Each L2 table is handled by its own loop iteration */ + nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS); + + while (nb_clusters > 0) { + ret = zero_single_l2(bs, offset, nb_clusters); + if (ret < 0) { + return ret; + } + + nb_clusters -= ret; + offset += (ret * s->cluster_size); + } + + return 0; +} diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c new file mode 100644 index 000000000..5e3f9153f --- /dev/null +++ b/block/qcow2-refcount.c @@ -0,0 +1,1240 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu-common.h" +#include "block_int.h" +#include "block/qcow2.h" + +static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size); +static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, + int64_t offset, int64_t length, + int addend); + + +/*********************************************************/ +/* refcount handling */ + +int qcow2_refcount_init(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int ret, refcount_table_size2, i; + + refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t); + s->refcount_table = g_malloc(refcount_table_size2); + if (s->refcount_table_size > 0) { + BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD); + ret = bdrv_pread(bs->file, s->refcount_table_offset, + s->refcount_table, refcount_table_size2); + if (ret != refcount_table_size2) + goto fail; + for(i = 0; i < s->refcount_table_size; i++) + be64_to_cpus(&s->refcount_table[i]); + } + return 0; + fail: + return -ENOMEM; +} + +void qcow2_refcount_close(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + g_free(s->refcount_table); +} + + +static int load_refcount_block(BlockDriverState *bs, + int64_t refcount_block_offset, + void **refcount_block) +{ + BDRVQcowState *s = bs->opaque; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD); + ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, + refcount_block); + + return ret; +} + +/* + * Returns the refcount of the cluster given by its index. Any non-negative + * return value is the refcount of the cluster, negative values are -errno + * and indicate an error. + */ +static int get_refcount(BlockDriverState *bs, int64_t cluster_index) +{ + BDRVQcowState *s = bs->opaque; + int refcount_table_index, block_index; + int64_t refcount_block_offset; + int ret; + uint16_t *refcount_block; + uint16_t refcount; + + refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); + if (refcount_table_index >= s->refcount_table_size) + return 0; + refcount_block_offset = s->refcount_table[refcount_table_index]; + if (!refcount_block_offset) + return 0; + + ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, + (void**) &refcount_block); + if (ret < 0) { + return ret; + } + + block_index = cluster_index & + ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); + refcount = be16_to_cpu(refcount_block[block_index]); + + ret = qcow2_cache_put(bs, s->refcount_block_cache, + (void**) &refcount_block); + if (ret < 0) { + return ret; + } + + return refcount; +} + +/* + * Rounds the refcount table size up to avoid growing the table for each single + * refcount block that is allocated. + */ +static unsigned int next_refcount_table_size(BDRVQcowState *s, + unsigned int min_size) +{ + unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1; + unsigned int refcount_table_clusters = + MAX(1, s->refcount_table_size >> (s->cluster_bits - 3)); + + while (min_clusters > refcount_table_clusters) { + refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2; + } + + return refcount_table_clusters << (s->cluster_bits - 3); +} + + +/* Checks if two offsets are described by the same refcount block */ +static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a, + uint64_t offset_b) +{ + uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT); + uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT); + + return (block_a == block_b); +} + +/* + * Loads a refcount block. If it doesn't exist yet, it is allocated first + * (including growing the refcount table if needed). + * + * Returns 0 on success or -errno in error case + */ +static int alloc_refcount_block(BlockDriverState *bs, + int64_t cluster_index, uint16_t **refcount_block) +{ + BDRVQcowState *s = bs->opaque; + unsigned int refcount_table_index; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC); + + /* Find the refcount block for the given cluster */ + refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); + + if (refcount_table_index < s->refcount_table_size) { + + uint64_t refcount_block_offset = + s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK; + + /* If it's already there, we're done */ + if (refcount_block_offset) { + return load_refcount_block(bs, refcount_block_offset, + (void**) refcount_block); + } + } + + /* + * If we came here, we need to allocate something. Something is at least + * a cluster for the new refcount block. It may also include a new refcount + * table if the old refcount table is too small. + * + * Note that allocating clusters here needs some special care: + * + * - We can't use the normal qcow2_alloc_clusters(), it would try to + * increase the refcount and very likely we would end up with an endless + * recursion. Instead we must place the refcount blocks in a way that + * they can describe them themselves. + * + * - We need to consider that at this point we are inside update_refcounts + * and doing the initial refcount increase. This means that some clusters + * have already been allocated by the caller, but their refcount isn't + * accurate yet. free_cluster_index tells us where this allocation ends + * as long as we don't overwrite it by freeing clusters. + * + * - alloc_clusters_noref and qcow2_free_clusters may load a different + * refcount block into the cache + */ + + *refcount_block = NULL; + + /* We write to the refcount table, so we might depend on L2 tables */ + qcow2_cache_flush(bs, s->l2_table_cache); + + /* Allocate the refcount block itself and mark it as used */ + int64_t new_block = alloc_clusters_noref(bs, s->cluster_size); + if (new_block < 0) { + return new_block; + } + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64 + " at %" PRIx64 "\n", + refcount_table_index, cluster_index << s->cluster_bits, new_block); +#endif + + if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) { + /* Zero the new refcount block before updating it */ + ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, + (void**) refcount_block); + if (ret < 0) { + goto fail_block; + } + + memset(*refcount_block, 0, s->cluster_size); + + /* The block describes itself, need to update the cache */ + int block_index = (new_block >> s->cluster_bits) & + ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); + (*refcount_block)[block_index] = cpu_to_be16(1); + } else { + /* Described somewhere else. This can recurse at most twice before we + * arrive at a block that describes itself. */ + ret = update_refcount(bs, new_block, s->cluster_size, 1); + if (ret < 0) { + goto fail_block; + } + + bdrv_flush(bs->file); + + /* Initialize the new refcount block only after updating its refcount, + * update_refcount uses the refcount cache itself */ + ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, + (void**) refcount_block); + if (ret < 0) { + goto fail_block; + } + + memset(*refcount_block, 0, s->cluster_size); + } + + /* Now the new refcount block needs to be written to disk */ + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE); + qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block); + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail_block; + } + + /* If the refcount table is big enough, just hook the block up there */ + if (refcount_table_index < s->refcount_table_size) { + uint64_t data64 = cpu_to_be64(new_block); + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP); + ret = bdrv_pwrite_sync(bs->file, + s->refcount_table_offset + refcount_table_index * sizeof(uint64_t), + &data64, sizeof(data64)); + if (ret < 0) { + goto fail_block; + } + + s->refcount_table[refcount_table_index] = new_block; + return 0; + } + + ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block); + if (ret < 0) { + goto fail_block; + } + + /* + * If we come here, we need to grow the refcount table. Again, a new + * refcount table needs some space and we can't simply allocate to avoid + * endless recursion. + * + * Therefore let's grab new refcount blocks at the end of the image, which + * will describe themselves and the new refcount table. This way we can + * reference them only in the new table and do the switch to the new + * refcount table at once without producing an inconsistent state in + * between. + */ + BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW); + + /* Calculate the number of refcount blocks needed so far */ + uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT); + uint64_t blocks_used = (s->free_cluster_index + + refcount_block_clusters - 1) / refcount_block_clusters; + + /* And now we need at least one block more for the new metadata */ + uint64_t table_size = next_refcount_table_size(s, blocks_used + 1); + uint64_t last_table_size; + uint64_t blocks_clusters; + do { + uint64_t table_clusters = size_to_clusters(s, table_size); + blocks_clusters = 1 + + ((table_clusters + refcount_block_clusters - 1) + / refcount_block_clusters); + uint64_t meta_clusters = table_clusters + blocks_clusters; + + last_table_size = table_size; + table_size = next_refcount_table_size(s, blocks_used + + ((meta_clusters + refcount_block_clusters - 1) + / refcount_block_clusters)); + + } while (last_table_size != table_size); + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n", + s->refcount_table_size, table_size); +#endif + + /* Create the new refcount table and blocks */ + uint64_t meta_offset = (blocks_used * refcount_block_clusters) * + s->cluster_size; + uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size; + uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size); + uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t)); + + assert(meta_offset >= (s->free_cluster_index * s->cluster_size)); + + /* Fill the new refcount table */ + memcpy(new_table, s->refcount_table, + s->refcount_table_size * sizeof(uint64_t)); + new_table[refcount_table_index] = new_block; + + int i; + for (i = 0; i < blocks_clusters; i++) { + new_table[blocks_used + i] = meta_offset + (i * s->cluster_size); + } + + /* Fill the refcount blocks */ + uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t)); + int block = 0; + for (i = 0; i < table_clusters + blocks_clusters; i++) { + new_blocks[block++] = cpu_to_be16(1); + } + + /* Write refcount blocks to disk */ + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS); + ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks, + blocks_clusters * s->cluster_size); + g_free(new_blocks); + if (ret < 0) { + goto fail_table; + } + + /* Write refcount table to disk */ + for(i = 0; i < table_size; i++) { + cpu_to_be64s(&new_table[i]); + } + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE); + ret = bdrv_pwrite_sync(bs->file, table_offset, new_table, + table_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail_table; + } + + for(i = 0; i < table_size; i++) { + be64_to_cpus(&new_table[i]); + } + + /* Hook up the new refcount table in the qcow2 header */ + uint8_t data[12]; + cpu_to_be64w((uint64_t*)data, table_offset); + cpu_to_be32w((uint32_t*)(data + 8), table_clusters); + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE); + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset), + data, sizeof(data)); + if (ret < 0) { + goto fail_table; + } + + /* And switch it in memory */ + uint64_t old_table_offset = s->refcount_table_offset; + uint64_t old_table_size = s->refcount_table_size; + + g_free(s->refcount_table); + s->refcount_table = new_table; + s->refcount_table_size = table_size; + s->refcount_table_offset = table_offset; + + /* Free old table. Remember, we must not change free_cluster_index */ + uint64_t old_free_cluster_index = s->free_cluster_index; + qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t)); + s->free_cluster_index = old_free_cluster_index; + + ret = load_refcount_block(bs, new_block, (void**) refcount_block); + if (ret < 0) { + return ret; + } + + return 0; + +fail_table: + g_free(new_table); +fail_block: + if (*refcount_block != NULL) { + qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block); + } + return ret; +} + +/* XXX: cache several refcount block clusters ? */ +static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, + int64_t offset, int64_t length, int addend) +{ + BDRVQcowState *s = bs->opaque; + int64_t start, last, cluster_offset; + uint16_t *refcount_block = NULL; + int64_t old_table_index = -1; + int ret; + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n", + offset, length, addend); +#endif + if (length < 0) { + return -EINVAL; + } else if (length == 0) { + return 0; + } + + if (addend < 0) { + qcow2_cache_set_dependency(bs, s->refcount_block_cache, + s->l2_table_cache); + } + + start = offset & ~(s->cluster_size - 1); + last = (offset + length - 1) & ~(s->cluster_size - 1); + for(cluster_offset = start; cluster_offset <= last; + cluster_offset += s->cluster_size) + { + int block_index, refcount; + int64_t cluster_index = cluster_offset >> s->cluster_bits; + int64_t table_index = + cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); + + /* Load the refcount block and allocate it if needed */ + if (table_index != old_table_index) { + if (refcount_block) { + ret = qcow2_cache_put(bs, s->refcount_block_cache, + (void**) &refcount_block); + if (ret < 0) { + goto fail; + } + } + + ret = alloc_refcount_block(bs, cluster_index, &refcount_block); + if (ret < 0) { + goto fail; + } + } + old_table_index = table_index; + + qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block); + + /* we can update the count and save it */ + block_index = cluster_index & + ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); + + refcount = be16_to_cpu(refcount_block[block_index]); + refcount += addend; + if (refcount < 0 || refcount > 0xffff) { + ret = -EINVAL; + goto fail; + } + if (refcount == 0 && cluster_index < s->free_cluster_index) { + s->free_cluster_index = cluster_index; + } + refcount_block[block_index] = cpu_to_be16(refcount); + } + + ret = 0; +fail: + /* Write last changed block to disk */ + if (refcount_block) { + int wret; + wret = qcow2_cache_put(bs, s->refcount_block_cache, + (void**) &refcount_block); + if (wret < 0) { + return ret < 0 ? ret : wret; + } + } + + /* + * Try do undo any updates if an error is returned (This may succeed in + * some cases like ENOSPC for allocating a new refcount block) + */ + if (ret < 0) { + int dummy; + dummy = update_refcount(bs, offset, cluster_offset - offset, -addend); + (void)dummy; + } + + return ret; +} + +/* + * Increases or decreases the refcount of a given cluster by one. + * addend must be 1 or -1. + * + * If the return value is non-negative, it is the new refcount of the cluster. + * If it is negative, it is -errno and indicates an error. + */ +static int update_cluster_refcount(BlockDriverState *bs, + int64_t cluster_index, + int addend) +{ + BDRVQcowState *s = bs->opaque; + int ret; + + ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend); + if (ret < 0) { + return ret; + } + + bdrv_flush(bs->file); + + return get_refcount(bs, cluster_index); +} + + + +/*********************************************************/ +/* cluster allocation functions */ + + + +/* return < 0 if error */ +static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size) +{ + BDRVQcowState *s = bs->opaque; + int i, nb_clusters, refcount; + + nb_clusters = size_to_clusters(s, size); +retry: + for(i = 0; i < nb_clusters; i++) { + int64_t next_cluster_index = s->free_cluster_index++; + refcount = get_refcount(bs, next_cluster_index); + + if (refcount < 0) { + return refcount; + } else if (refcount != 0) { + goto retry; + } + } +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n", + size, + (s->free_cluster_index - nb_clusters) << s->cluster_bits); +#endif + return (s->free_cluster_index - nb_clusters) << s->cluster_bits; +} + +int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size) +{ + int64_t offset; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC); + offset = alloc_clusters_noref(bs, size); + if (offset < 0) { + return offset; + } + + ret = update_refcount(bs, offset, size, 1); + if (ret < 0) { + return ret; + } + + return offset; +} + +int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, + int nb_clusters) +{ + BDRVQcowState *s = bs->opaque; + uint64_t cluster_index; + uint64_t old_free_cluster_index; + int i, refcount, ret; + + /* Check how many clusters there are free */ + cluster_index = offset >> s->cluster_bits; + for(i = 0; i < nb_clusters; i++) { + refcount = get_refcount(bs, cluster_index++); + + if (refcount < 0) { + return refcount; + } else if (refcount != 0) { + break; + } + } + + /* And then allocate them */ + old_free_cluster_index = s->free_cluster_index; + s->free_cluster_index = cluster_index + i; + + ret = update_refcount(bs, offset, i << s->cluster_bits, 1); + if (ret < 0) { + return ret; + } + + s->free_cluster_index = old_free_cluster_index; + + return i; +} + +/* only used to allocate compressed sectors. We try to allocate + contiguous sectors. size must be <= cluster_size */ +int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) +{ + BDRVQcowState *s = bs->opaque; + int64_t offset, cluster_offset; + int free_in_cluster; + + BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES); + assert(size > 0 && size <= s->cluster_size); + if (s->free_byte_offset == 0) { + offset = qcow2_alloc_clusters(bs, s->cluster_size); + if (offset < 0) { + return offset; + } + s->free_byte_offset = offset; + } + redo: + free_in_cluster = s->cluster_size - + (s->free_byte_offset & (s->cluster_size - 1)); + if (size <= free_in_cluster) { + /* enough space in current cluster */ + offset = s->free_byte_offset; + s->free_byte_offset += size; + free_in_cluster -= size; + if (free_in_cluster == 0) + s->free_byte_offset = 0; + if ((offset & (s->cluster_size - 1)) != 0) + update_cluster_refcount(bs, offset >> s->cluster_bits, 1); + } else { + offset = qcow2_alloc_clusters(bs, s->cluster_size); + if (offset < 0) { + return offset; + } + cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1); + if ((cluster_offset + s->cluster_size) == offset) { + /* we are lucky: contiguous data */ + offset = s->free_byte_offset; + update_cluster_refcount(bs, offset >> s->cluster_bits, 1); + s->free_byte_offset += size; + } else { + s->free_byte_offset = offset; + goto redo; + } + } + + bdrv_flush(bs->file); + return offset; +} + +void qcow2_free_clusters(BlockDriverState *bs, + int64_t offset, int64_t size) +{ + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE); + ret = update_refcount(bs, offset, size, -1); + if (ret < 0) { + fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret)); + /* TODO Remember the clusters to free them later and avoid leaking */ + } +} + +/* + * Free a cluster using its L2 entry (handles clusters of all types, e.g. + * normal cluster, compressed cluster, etc.) + */ +void qcow2_free_any_clusters(BlockDriverState *bs, + uint64_t l2_entry, int nb_clusters) +{ + BDRVQcowState *s = bs->opaque; + + switch (qcow2_get_cluster_type(l2_entry)) { + case QCOW2_CLUSTER_COMPRESSED: + { + int nb_csectors; + nb_csectors = ((l2_entry >> s->csize_shift) & + s->csize_mask) + 1; + qcow2_free_clusters(bs, + (l2_entry & s->cluster_offset_mask) & ~511, + nb_csectors * 512); + } + break; + case QCOW2_CLUSTER_NORMAL: + qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, + nb_clusters << s->cluster_bits); + break; + case QCOW2_CLUSTER_UNALLOCATED: + case QCOW2_CLUSTER_ZERO: + break; + default: + abort(); + } +} + + + +/*********************************************************/ +/* snapshots and image creation */ + + + +/* update the refcounts of snapshots and the copied flag */ +int qcow2_update_snapshot_refcount(BlockDriverState *bs, + int64_t l1_table_offset, int l1_size, int addend) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated; + int64_t old_offset, old_l2_offset; + int i, j, l1_modified = 0, nb_csectors, refcount; + int ret; + + l2_table = NULL; + l1_table = NULL; + l1_size2 = l1_size * sizeof(uint64_t); + + /* WARNING: qcow2_snapshot_goto relies on this function not using the + * l1_table_offset when it is the current s->l1_table_offset! Be careful + * when changing this! */ + if (l1_table_offset != s->l1_table_offset) { + if (l1_size2 != 0) { + l1_table = g_malloc0(align_offset(l1_size2, 512)); + } else { + l1_table = NULL; + } + l1_allocated = 1; + if (bdrv_pread(bs->file, l1_table_offset, + l1_table, l1_size2) != l1_size2) + { + ret = -EIO; + goto fail; + } + + for(i = 0;i < l1_size; i++) + be64_to_cpus(&l1_table[i]); + } else { + assert(l1_size == s->l1_size); + l1_table = s->l1_table; + l1_allocated = 0; + } + + for(i = 0; i < l1_size; i++) { + l2_offset = l1_table[i]; + if (l2_offset) { + old_l2_offset = l2_offset; + l2_offset &= L1E_OFFSET_MASK; + + ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, + (void**) &l2_table); + if (ret < 0) { + goto fail; + } + + for(j = 0; j < s->l2_size; j++) { + offset = be64_to_cpu(l2_table[j]); + if (offset != 0) { + old_offset = offset; + offset &= ~QCOW_OFLAG_COPIED; + if (offset & QCOW_OFLAG_COMPRESSED) { + nb_csectors = ((offset >> s->csize_shift) & + s->csize_mask) + 1; + if (addend != 0) { + int ret; + ret = update_refcount(bs, + (offset & s->cluster_offset_mask) & ~511, + nb_csectors * 512, addend); + if (ret < 0) { + goto fail; + } + + /* TODO Flushing once for the whole function should + * be enough */ + bdrv_flush(bs->file); + } + /* compressed clusters are never modified */ + refcount = 2; + } else { + uint64_t cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits; + if (addend != 0) { + refcount = update_cluster_refcount(bs, cluster_index, addend); + } else { + refcount = get_refcount(bs, cluster_index); + } + + if (refcount < 0) { + ret = -EIO; + goto fail; + } + } + + if (refcount == 1) { + offset |= QCOW_OFLAG_COPIED; + } + if (offset != old_offset) { + if (addend > 0) { + qcow2_cache_set_dependency(bs, s->l2_table_cache, + s->refcount_block_cache); + } + l2_table[j] = cpu_to_be64(offset); + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + } + } + } + + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + goto fail; + } + + + if (addend != 0) { + refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend); + } else { + refcount = get_refcount(bs, l2_offset >> s->cluster_bits); + } + if (refcount < 0) { + ret = -EIO; + goto fail; + } else if (refcount == 1) { + l2_offset |= QCOW_OFLAG_COPIED; + } + if (l2_offset != old_l2_offset) { + l1_table[i] = l2_offset; + l1_modified = 1; + } + } + } + + ret = 0; +fail: + if (l2_table) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + } + + /* Update L1 only if it isn't deleted anyway (addend = -1) */ + if (addend >= 0 && l1_modified) { + for(i = 0; i < l1_size; i++) + cpu_to_be64s(&l1_table[i]); + if (bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, + l1_size2) < 0) + goto fail; + for(i = 0; i < l1_size; i++) + be64_to_cpus(&l1_table[i]); + } + if (l1_allocated) + g_free(l1_table); + return ret; +} + + + + +/*********************************************************/ +/* refcount checking functions */ + + + +/* + * Increases the refcount for a range of clusters in a given refcount table. + * This is used to construct a temporary refcount table out of L1 and L2 tables + * which can be compared the the refcount table saved in the image. + * + * Modifies the number of errors in res. + */ +static void inc_refcounts(BlockDriverState *bs, + BdrvCheckResult *res, + uint16_t *refcount_table, + int refcount_table_size, + int64_t offset, int64_t size) +{ + BDRVQcowState *s = bs->opaque; + int64_t start, last, cluster_offset; + int k; + + if (size <= 0) + return; + + start = offset & ~(s->cluster_size - 1); + last = (offset + size - 1) & ~(s->cluster_size - 1); + for(cluster_offset = start; cluster_offset <= last; + cluster_offset += s->cluster_size) { + k = cluster_offset >> s->cluster_bits; + if (k < 0) { + fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n", + cluster_offset); + res->corruptions++; + } else if (k >= refcount_table_size) { + fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after " + "the end of the image file, can't properly check refcounts.\n", + cluster_offset); + res->check_errors++; + } else { + if (++refcount_table[k] == 0) { + fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64 + "\n", cluster_offset); + res->corruptions++; + } + } + } +} + +/* + * Increases the refcount in the given refcount table for the all clusters + * referenced in the L2 table. While doing so, performs some checks on L2 + * entries. + * + * Returns the number of errors found by the checks or -errno if an internal + * error occurred. + */ +static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, + uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset, + int check_copied) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l2_table, l2_entry; + int i, l2_size, nb_csectors, refcount; + + /* Read L2 table from disk */ + l2_size = s->l2_size * sizeof(uint64_t); + l2_table = g_malloc(l2_size); + + if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size) + goto fail; + + /* Do the actual checks */ + for(i = 0; i < s->l2_size; i++) { + l2_entry = be64_to_cpu(l2_table[i]); + + switch (qcow2_get_cluster_type(l2_entry)) { + case QCOW2_CLUSTER_COMPRESSED: + /* Compressed clusters don't have QCOW_OFLAG_COPIED */ + if (l2_entry & QCOW_OFLAG_COPIED) { + fprintf(stderr, "ERROR: cluster %" PRId64 ": " + "copied flag must never be set for compressed " + "clusters\n", l2_entry >> s->cluster_bits); + l2_entry &= ~QCOW_OFLAG_COPIED; + res->corruptions++; + } + + /* Mark cluster as used */ + nb_csectors = ((l2_entry >> s->csize_shift) & + s->csize_mask) + 1; + l2_entry &= s->cluster_offset_mask; + inc_refcounts(bs, res, refcount_table, refcount_table_size, + l2_entry & ~511, nb_csectors * 512); + break; + + case QCOW2_CLUSTER_ZERO: + if ((l2_entry & L2E_OFFSET_MASK) == 0) { + break; + } + /* fall through */ + + case QCOW2_CLUSTER_NORMAL: + { + /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ + uint64_t offset = l2_entry & L2E_OFFSET_MASK; + + if (check_copied) { + refcount = get_refcount(bs, offset >> s->cluster_bits); + if (refcount < 0) { + fprintf(stderr, "Can't get refcount for offset %" + PRIx64 ": %s\n", l2_entry, strerror(-refcount)); + goto fail; + } + if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) { + fprintf(stderr, "ERROR OFLAG_COPIED: offset=%" + PRIx64 " refcount=%d\n", l2_entry, refcount); + res->corruptions++; + } + } + + /* Mark cluster as used */ + inc_refcounts(bs, res, refcount_table,refcount_table_size, + offset, s->cluster_size); + + /* Correct offsets are cluster aligned */ + if (offset & (s->cluster_size - 1)) { + fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not " + "properly aligned; L2 entry corrupted.\n", offset); + res->corruptions++; + } + break; + } + + case QCOW2_CLUSTER_UNALLOCATED: + break; + + default: + abort(); + } + } + + g_free(l2_table); + return 0; + +fail: + fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n"); + g_free(l2_table); + return -EIO; +} + +/* + * Increases the refcount for the L1 table, its L2 tables and all referenced + * clusters in the given refcount table. While doing so, performs some checks + * on L1 and L2 entries. + * + * Returns the number of errors found by the checks or -errno if an internal + * error occurred. + */ +static int check_refcounts_l1(BlockDriverState *bs, + BdrvCheckResult *res, + uint16_t *refcount_table, + int refcount_table_size, + int64_t l1_table_offset, int l1_size, + int check_copied) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l1_table, l2_offset, l1_size2; + int i, refcount, ret; + + l1_size2 = l1_size * sizeof(uint64_t); + + /* Mark L1 table as used */ + inc_refcounts(bs, res, refcount_table, refcount_table_size, + l1_table_offset, l1_size2); + + /* Read L1 table entries from disk */ + if (l1_size2 == 0) { + l1_table = NULL; + } else { + l1_table = g_malloc(l1_size2); + if (bdrv_pread(bs->file, l1_table_offset, + l1_table, l1_size2) != l1_size2) + goto fail; + for(i = 0;i < l1_size; i++) + be64_to_cpus(&l1_table[i]); + } + + /* Do the actual checks */ + for(i = 0; i < l1_size; i++) { + l2_offset = l1_table[i]; + if (l2_offset) { + /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ + if (check_copied) { + refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) + >> s->cluster_bits); + if (refcount < 0) { + fprintf(stderr, "Can't get refcount for l2_offset %" + PRIx64 ": %s\n", l2_offset, strerror(-refcount)); + goto fail; + } + if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) { + fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64 + " refcount=%d\n", l2_offset, refcount); + res->corruptions++; + } + } + + /* Mark L2 table as used */ + l2_offset &= L1E_OFFSET_MASK; + inc_refcounts(bs, res, refcount_table, refcount_table_size, + l2_offset, s->cluster_size); + + /* L2 tables are cluster aligned */ + if (l2_offset & (s->cluster_size - 1)) { + fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " + "cluster aligned; L1 entry corrupted\n", l2_offset); + res->corruptions++; + } + + /* Process and check L2 entries */ + ret = check_refcounts_l2(bs, res, refcount_table, + refcount_table_size, l2_offset, check_copied); + if (ret < 0) { + goto fail; + } + } + } + g_free(l1_table); + return 0; + +fail: + fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); + res->check_errors++; + g_free(l1_table); + return -EIO; +} + +/* + * Checks an image for refcount consistency. + * + * Returns 0 if no errors are found, the number of errors in case the image is + * detected as corrupted, and -errno when an internal error occurred. + */ +int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix) +{ + BDRVQcowState *s = bs->opaque; + int64_t size, i; + int nb_clusters, refcount1, refcount2; + QCowSnapshot *sn; + uint16_t *refcount_table; + int ret; + + size = bdrv_getlength(bs->file); + nb_clusters = size_to_clusters(s, size); + refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t)); + + /* header */ + inc_refcounts(bs, res, refcount_table, nb_clusters, + 0, s->cluster_size); + + /* current L1 table */ + ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, + s->l1_table_offset, s->l1_size, 1); + if (ret < 0) { + goto fail; + } + + /* snapshots */ + for(i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, + sn->l1_table_offset, sn->l1_size, 0); + if (ret < 0) { + goto fail; + } + } + inc_refcounts(bs, res, refcount_table, nb_clusters, + s->snapshots_offset, s->snapshots_size); + + /* refcount data */ + inc_refcounts(bs, res, refcount_table, nb_clusters, + s->refcount_table_offset, + s->refcount_table_size * sizeof(uint64_t)); + + for(i = 0; i < s->refcount_table_size; i++) { + uint64_t offset, cluster; + offset = s->refcount_table[i]; + cluster = offset >> s->cluster_bits; + + /* Refcount blocks are cluster aligned */ + if (offset & (s->cluster_size - 1)) { + fprintf(stderr, "ERROR refcount block %" PRId64 " is not " + "cluster aligned; refcount table entry corrupted\n", i); + res->corruptions++; + continue; + } + + if (cluster >= nb_clusters) { + fprintf(stderr, "ERROR refcount block %" PRId64 + " is outside image\n", i); + res->corruptions++; + continue; + } + + if (offset != 0) { + inc_refcounts(bs, res, refcount_table, nb_clusters, + offset, s->cluster_size); + if (refcount_table[cluster] != 1) { + fprintf(stderr, "ERROR refcount block %" PRId64 + " refcount=%d\n", + i, refcount_table[cluster]); + res->corruptions++; + } + } + } + + /* compare ref counts */ + for(i = 0; i < nb_clusters; i++) { + refcount1 = get_refcount(bs, i); + if (refcount1 < 0) { + fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n", + i, strerror(-refcount1)); + res->check_errors++; + continue; + } + + refcount2 = refcount_table[i]; + if (refcount1 != refcount2) { + + /* Check if we're allowed to fix the mismatch */ + int *num_fixed = NULL; + if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) { + num_fixed = &res->leaks_fixed; + } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) { + num_fixed = &res->corruptions_fixed; + } + + fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n", + num_fixed != NULL ? "Repairing" : + refcount1 < refcount2 ? "ERROR" : + "Leaked", + i, refcount1, refcount2); + + if (num_fixed) { + ret = update_refcount(bs, i << s->cluster_bits, 1, + refcount2 - refcount1); + if (ret >= 0) { + (*num_fixed)++; + continue; + } + } + + /* And if we couldn't, print an error */ + if (refcount1 < refcount2) { + res->corruptions++; + } else { + res->leaks++; + } + } + } + + ret = 0; + +fail: + g_free(refcount_table); + + return ret; +} + diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c new file mode 100644 index 000000000..4e7c93b8b --- /dev/null +++ b/block/qcow2-snapshot.c @@ -0,0 +1,660 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu-common.h" +#include "block_int.h" +#include "block/qcow2.h" + +typedef struct QEMU_PACKED QCowSnapshotHeader { + /* header is 8 byte aligned */ + uint64_t l1_table_offset; + + uint32_t l1_size; + uint16_t id_str_size; + uint16_t name_size; + + uint32_t date_sec; + uint32_t date_nsec; + + uint64_t vm_clock_nsec; + + uint32_t vm_state_size; + uint32_t extra_data_size; /* for extension */ + /* extra data follows */ + /* id_str follows */ + /* name follows */ +} QCowSnapshotHeader; + +typedef struct QEMU_PACKED QCowSnapshotExtraData { + uint64_t vm_state_size_large; + uint64_t disk_size; +} QCowSnapshotExtraData; + +void qcow2_free_snapshots(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int i; + + for(i = 0; i < s->nb_snapshots; i++) { + g_free(s->snapshots[i].name); + g_free(s->snapshots[i].id_str); + } + g_free(s->snapshots); + s->snapshots = NULL; + s->nb_snapshots = 0; +} + +int qcow2_read_snapshots(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshotHeader h; + QCowSnapshotExtraData extra; + QCowSnapshot *sn; + int i, id_str_size, name_size; + int64_t offset; + uint32_t extra_data_size; + int ret; + + if (!s->nb_snapshots) { + s->snapshots = NULL; + s->snapshots_size = 0; + return 0; + } + + offset = s->snapshots_offset; + s->snapshots = g_malloc0(s->nb_snapshots * sizeof(QCowSnapshot)); + + for(i = 0; i < s->nb_snapshots; i++) { + /* Read statically sized part of the snapshot header */ + offset = align_offset(offset, 8); + ret = bdrv_pread(bs->file, offset, &h, sizeof(h)); + if (ret < 0) { + goto fail; + } + + offset += sizeof(h); + sn = s->snapshots + i; + sn->l1_table_offset = be64_to_cpu(h.l1_table_offset); + sn->l1_size = be32_to_cpu(h.l1_size); + sn->vm_state_size = be32_to_cpu(h.vm_state_size); + sn->date_sec = be32_to_cpu(h.date_sec); + sn->date_nsec = be32_to_cpu(h.date_nsec); + sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec); + extra_data_size = be32_to_cpu(h.extra_data_size); + + id_str_size = be16_to_cpu(h.id_str_size); + name_size = be16_to_cpu(h.name_size); + + /* Read extra data */ + ret = bdrv_pread(bs->file, offset, &extra, + MIN(sizeof(extra), extra_data_size)); + if (ret < 0) { + goto fail; + } + offset += extra_data_size; + + if (extra_data_size >= 8) { + sn->vm_state_size = be64_to_cpu(extra.vm_state_size_large); + } + + if (extra_data_size >= 16) { + sn->disk_size = be64_to_cpu(extra.disk_size); + } else { + sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; + } + + /* Read snapshot ID */ + sn->id_str = g_malloc(id_str_size + 1); + ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size); + if (ret < 0) { + goto fail; + } + offset += id_str_size; + sn->id_str[id_str_size] = '\0'; + + /* Read snapshot name */ + sn->name = g_malloc(name_size + 1); + ret = bdrv_pread(bs->file, offset, sn->name, name_size); + if (ret < 0) { + goto fail; + } + offset += name_size; + sn->name[name_size] = '\0'; + } + + s->snapshots_size = offset - s->snapshots_offset; + return 0; + +fail: + qcow2_free_snapshots(bs); + return ret; +} + +/* add at the end of the file a new list of snapshots */ +static int qcow2_write_snapshots(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot *sn; + QCowSnapshotHeader h; + QCowSnapshotExtraData extra; + int i, name_size, id_str_size, snapshots_size; + struct { + uint32_t nb_snapshots; + uint64_t snapshots_offset; + } QEMU_PACKED header_data; + int64_t offset, snapshots_offset; + int ret; + + /* compute the size of the snapshots */ + offset = 0; + for(i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + offset = align_offset(offset, 8); + offset += sizeof(h); + offset += sizeof(extra); + offset += strlen(sn->id_str); + offset += strlen(sn->name); + } + snapshots_size = offset; + + /* Allocate space for the new snapshot list */ + snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size); + bdrv_flush(bs->file); + offset = snapshots_offset; + if (offset < 0) { + return offset; + } + + /* Write all snapshots to the new list */ + for(i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + memset(&h, 0, sizeof(h)); + h.l1_table_offset = cpu_to_be64(sn->l1_table_offset); + h.l1_size = cpu_to_be32(sn->l1_size); + /* If it doesn't fit in 32 bit, older implementations should treat it + * as a disk-only snapshot rather than truncate the VM state */ + if (sn->vm_state_size <= 0xffffffff) { + h.vm_state_size = cpu_to_be32(sn->vm_state_size); + } + h.date_sec = cpu_to_be32(sn->date_sec); + h.date_nsec = cpu_to_be32(sn->date_nsec); + h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec); + h.extra_data_size = cpu_to_be32(sizeof(extra)); + + memset(&extra, 0, sizeof(extra)); + extra.vm_state_size_large = cpu_to_be64(sn->vm_state_size); + extra.disk_size = cpu_to_be64(sn->disk_size); + + id_str_size = strlen(sn->id_str); + name_size = strlen(sn->name); + h.id_str_size = cpu_to_be16(id_str_size); + h.name_size = cpu_to_be16(name_size); + offset = align_offset(offset, 8); + + ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h)); + if (ret < 0) { + goto fail; + } + offset += sizeof(h); + + ret = bdrv_pwrite(bs->file, offset, &extra, sizeof(extra)); + if (ret < 0) { + goto fail; + } + offset += sizeof(extra); + + ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size); + if (ret < 0) { + goto fail; + } + offset += id_str_size; + + ret = bdrv_pwrite(bs->file, offset, sn->name, name_size); + if (ret < 0) { + goto fail; + } + offset += name_size; + } + + /* + * Update the header to point to the new snapshot table. This requires the + * new table and its refcounts to be stable on disk. + */ + ret = bdrv_flush(bs); + if (ret < 0) { + goto fail; + } + + QEMU_BUILD_BUG_ON(offsetof(QCowHeader, snapshots_offset) != + offsetof(QCowHeader, nb_snapshots) + sizeof(header_data.nb_snapshots)); + + header_data.nb_snapshots = cpu_to_be32(s->nb_snapshots); + header_data.snapshots_offset = cpu_to_be64(snapshots_offset); + + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots), + &header_data, sizeof(header_data)); + if (ret < 0) { + goto fail; + } + + /* free the old snapshot table */ + qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size); + s->snapshots_offset = snapshots_offset; + s->snapshots_size = snapshots_size; + return 0; + +fail: + return ret; +} + +static void find_new_snapshot_id(BlockDriverState *bs, + char *id_str, int id_str_size) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot *sn; + int i, id, id_max = 0; + + for(i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + id = strtoul(sn->id_str, NULL, 10); + if (id > id_max) + id_max = id; + } + snprintf(id_str, id_str_size, "%d", id_max + 1); +} + +static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str) +{ + BDRVQcowState *s = bs->opaque; + int i; + + for(i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].id_str, id_str)) + return i; + } + return -1; +} + +static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name) +{ + BDRVQcowState *s = bs->opaque; + int i, ret; + + ret = find_snapshot_by_id(bs, name); + if (ret >= 0) + return ret; + for(i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].name, name)) + return i; + } + return -1; +} + +/* if no id is provided, a new one is constructed */ +int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot *new_snapshot_list = NULL; + QCowSnapshot *old_snapshot_list = NULL; + QCowSnapshot sn1, *sn = &sn1; + int i, ret; + uint64_t *l1_table = NULL; + int64_t l1_table_offset; + + memset(sn, 0, sizeof(*sn)); + + /* Generate an ID if it wasn't passed */ + if (sn_info->id_str[0] == '\0') { + find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); + } + + /* Check that the ID is unique */ + if (find_snapshot_by_id(bs, sn_info->id_str) >= 0) { + return -EEXIST; + } + + /* Populate sn with passed data */ + sn->id_str = g_strdup(sn_info->id_str); + sn->name = g_strdup(sn_info->name); + + sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; + sn->vm_state_size = sn_info->vm_state_size; + sn->date_sec = sn_info->date_sec; + sn->date_nsec = sn_info->date_nsec; + sn->vm_clock_nsec = sn_info->vm_clock_nsec; + + /* Allocate the L1 table of the snapshot and copy the current one there. */ + l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t)); + if (l1_table_offset < 0) { + ret = l1_table_offset; + goto fail; + } + + sn->l1_table_offset = l1_table_offset; + sn->l1_size = s->l1_size; + + l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); + for(i = 0; i < s->l1_size; i++) { + l1_table[i] = cpu_to_be64(s->l1_table[i]); + } + + ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + + g_free(l1_table); + l1_table = NULL; + + /* + * Increase the refcounts of all clusters and make sure everything is + * stable on disk before updating the snapshot table to contain a pointer + * to the new L1 table. + */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1); + if (ret < 0) { + goto fail; + } + + ret = bdrv_flush(bs); + if (ret < 0) { + goto fail; + } + + /* Append the new snapshot to the snapshot list */ + new_snapshot_list = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot)); + if (s->snapshots) { + memcpy(new_snapshot_list, s->snapshots, + s->nb_snapshots * sizeof(QCowSnapshot)); + old_snapshot_list = s->snapshots; + } + s->snapshots = new_snapshot_list; + s->snapshots[s->nb_snapshots++] = *sn; + + ret = qcow2_write_snapshots(bs); + if (ret < 0) { + g_free(s->snapshots); + s->snapshots = old_snapshot_list; + goto fail; + } + + g_free(old_snapshot_list); + +#ifdef DEBUG_ALLOC + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result, 0); + } +#endif + return 0; + +fail: + g_free(sn->id_str); + g_free(sn->name); + g_free(l1_table); + + return ret; +} + +/* copy the snapshot 'snapshot_name' into the current disk image */ +int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot *sn; + int i, snapshot_index; + int cur_l1_bytes, sn_l1_bytes; + int ret; + uint64_t *sn_l1_table = NULL; + + /* Search the snapshot */ + snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); + if (snapshot_index < 0) { + return -ENOENT; + } + sn = &s->snapshots[snapshot_index]; + + if (sn->disk_size != bs->total_sectors * BDRV_SECTOR_SIZE) { + error_report("qcow2: Loading snapshots with different disk " + "size is not implemented"); + ret = -ENOTSUP; + goto fail; + } + + /* + * Make sure that the current L1 table is big enough to contain the whole + * L1 table of the snapshot. If the snapshot L1 table is smaller, the + * current one must be padded with zeros. + */ + ret = qcow2_grow_l1_table(bs, sn->l1_size, true); + if (ret < 0) { + goto fail; + } + + cur_l1_bytes = s->l1_size * sizeof(uint64_t); + sn_l1_bytes = sn->l1_size * sizeof(uint64_t); + + /* + * Copy the snapshot L1 table to the current L1 table. + * + * Before overwriting the old current L1 table on disk, make sure to + * increase all refcounts for the clusters referenced by the new one. + * Decrease the refcount referenced by the old one only when the L1 + * table is overwritten. + */ + sn_l1_table = g_malloc0(cur_l1_bytes); + + ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes); + if (ret < 0) { + goto fail; + } + + ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset, + sn->l1_size, 1); + if (ret < 0) { + goto fail; + } + + ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table, + cur_l1_bytes); + if (ret < 0) { + goto fail; + } + + /* + * Decrease refcount of clusters of current L1 table. + * + * At this point, the in-memory s->l1_table points to the old L1 table, + * whereas on disk we already have the new one. + * + * qcow2_update_snapshot_refcount special cases the current L1 table to use + * the in-memory data instead of really using the offset to load a new one, + * which is why this works. + */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, + s->l1_size, -1); + + /* + * Now update the in-memory L1 table to be in sync with the on-disk one. We + * need to do this even if updating refcounts failed. + */ + for(i = 0;i < s->l1_size; i++) { + s->l1_table[i] = be64_to_cpu(sn_l1_table[i]); + } + + if (ret < 0) { + goto fail; + } + + g_free(sn_l1_table); + sn_l1_table = NULL; + + /* + * Update QCOW_OFLAG_COPIED in the active L1 table (it may have changed + * when we decreased the refcount of the old snapshot. + */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); + if (ret < 0) { + goto fail; + } + +#ifdef DEBUG_ALLOC + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result, 0); + } +#endif + return 0; + +fail: + g_free(sn_l1_table); + return ret; +} + +int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot sn; + int snapshot_index, ret; + + /* Search the snapshot */ + snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); + if (snapshot_index < 0) { + return -ENOENT; + } + sn = s->snapshots[snapshot_index]; + + /* Remove it from the snapshot list */ + memmove(s->snapshots + snapshot_index, + s->snapshots + snapshot_index + 1, + (s->nb_snapshots - snapshot_index - 1) * sizeof(sn)); + s->nb_snapshots--; + ret = qcow2_write_snapshots(bs); + if (ret < 0) { + return ret; + } + + /* + * The snapshot is now unused, clean up. If we fail after this point, we + * won't recover but just leak clusters. + */ + g_free(sn.id_str); + g_free(sn.name); + + /* + * Now decrease the refcounts of clusters referenced by the snapshot and + * free the L1 table. + */ + ret = qcow2_update_snapshot_refcount(bs, sn.l1_table_offset, + sn.l1_size, -1); + if (ret < 0) { + return ret; + } + qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t)); + + /* must update the copied flag on the current cluster offsets */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); + if (ret < 0) { + return ret; + } + +#ifdef DEBUG_ALLOC + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result, 0); + } +#endif + return 0; +} + +int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) +{ + BDRVQcowState *s = bs->opaque; + QEMUSnapshotInfo *sn_tab, *sn_info; + QCowSnapshot *sn; + int i; + + if (!s->nb_snapshots) { + *psn_tab = NULL; + return s->nb_snapshots; + } + + sn_tab = g_malloc0(s->nb_snapshots * sizeof(QEMUSnapshotInfo)); + for(i = 0; i < s->nb_snapshots; i++) { + sn_info = sn_tab + i; + sn = s->snapshots + i; + pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), + sn->id_str); + pstrcpy(sn_info->name, sizeof(sn_info->name), + sn->name); + sn_info->vm_state_size = sn->vm_state_size; + sn_info->date_sec = sn->date_sec; + sn_info->date_nsec = sn->date_nsec; + sn_info->vm_clock_nsec = sn->vm_clock_nsec; + } + *psn_tab = sn_tab; + return s->nb_snapshots; +} + +int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name) +{ + int i, snapshot_index; + BDRVQcowState *s = bs->opaque; + QCowSnapshot *sn; + uint64_t *new_l1_table; + int new_l1_bytes; + int ret; + + assert(bs->read_only); + + /* Search the snapshot */ + snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_name); + if (snapshot_index < 0) { + return -ENOENT; + } + sn = &s->snapshots[snapshot_index]; + + /* Allocate and read in the snapshot's L1 table */ + new_l1_bytes = s->l1_size * sizeof(uint64_t); + new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512)); + + ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes); + if (ret < 0) { + g_free(new_l1_table); + return ret; + } + + /* Switch the L1 table */ + g_free(s->l1_table); + + s->l1_size = sn->l1_size; + s->l1_table_offset = sn->l1_table_offset; + s->l1_table = new_l1_table; + + for(i = 0;i < s->l1_size; i++) { + be64_to_cpus(&s->l1_table[i]); + } + + return 0; +} diff --git a/block/qcow2.c b/block/qcow2.c new file mode 100644 index 000000000..8f183f146 --- /dev/null +++ b/block/qcow2.c @@ -0,0 +1,1719 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" +#include <zlib.h> +#include "aes.h" +#include "block/qcow2.h" +#include "qemu-error.h" +#include "qerror.h" +#include "trace.h" + +/* + Differences with QCOW: + + - Support for multiple incremental snapshots. + - Memory management by reference counts. + - Clusters which have a reference count of one have the bit + QCOW_OFLAG_COPIED to optimize write performance. + - Size of compressed clusters is stored in sectors to reduce bit usage + in the cluster offsets. + - Support for storing additional data (such as the VM state) in the + snapshots. + - If a backing store is used, the cluster size is not constrained + (could be backported to QCOW). + - L2 tables have always a size of one cluster. +*/ + + +typedef struct { + uint32_t magic; + uint32_t len; +} QCowExtension; +#define QCOW2_EXT_MAGIC_END 0 +#define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA +#define QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857 + +static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const QCowHeader *cow_header = (const void *)buf; + + if (buf_size >= sizeof(QCowHeader) && + be32_to_cpu(cow_header->magic) == QCOW_MAGIC && + be32_to_cpu(cow_header->version) >= 2) + return 100; + else + return 0; +} + + +/* + * read qcow2 extension and fill bs + * start reading from start_offset + * finish reading upon magic of value 0 or when end_offset reached + * unknown magic is skipped (future extension this version knows nothing about) + * return 0 upon success, non-0 otherwise + */ +static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, + uint64_t end_offset, void **p_feature_table) +{ + BDRVQcowState *s = bs->opaque; + QCowExtension ext; + uint64_t offset; + int ret; + +#ifdef DEBUG_EXT + printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset); +#endif + offset = start_offset; + while (offset < end_offset) { + +#ifdef DEBUG_EXT + /* Sanity check */ + if (offset > s->cluster_size) + printf("qcow2_read_extension: suspicious offset %lu\n", offset); + + printf("attempting to read extended header in offset %lu\n", offset); +#endif + + if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) { + fprintf(stderr, "qcow2_read_extension: ERROR: " + "pread fail from offset %" PRIu64 "\n", + offset); + return 1; + } + be32_to_cpus(&ext.magic); + be32_to_cpus(&ext.len); + offset += sizeof(ext); +#ifdef DEBUG_EXT + printf("ext.magic = 0x%x\n", ext.magic); +#endif + if (ext.len > end_offset - offset) { + error_report("Header extension too large"); + return -EINVAL; + } + + switch (ext.magic) { + case QCOW2_EXT_MAGIC_END: + return 0; + + case QCOW2_EXT_MAGIC_BACKING_FORMAT: + if (ext.len >= sizeof(bs->backing_format)) { + fprintf(stderr, "ERROR: ext_backing_format: len=%u too large" + " (>=%zu)\n", + ext.len, sizeof(bs->backing_format)); + return 2; + } + if (bdrv_pread(bs->file, offset , bs->backing_format, + ext.len) != ext.len) + return 3; + bs->backing_format[ext.len] = '\0'; +#ifdef DEBUG_EXT + printf("Qcow2: Got format extension %s\n", bs->backing_format); +#endif + break; + + case QCOW2_EXT_MAGIC_FEATURE_TABLE: + if (p_feature_table != NULL) { + void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature)); + ret = bdrv_pread(bs->file, offset , feature_table, ext.len); + if (ret < 0) { + return ret; + } + + *p_feature_table = feature_table; + } + break; + + default: + /* unknown magic - save it in case we need to rewrite the header */ + { + Qcow2UnknownHeaderExtension *uext; + + uext = g_malloc0(sizeof(*uext) + ext.len); + uext->magic = ext.magic; + uext->len = ext.len; + QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next); + + ret = bdrv_pread(bs->file, offset , uext->data, uext->len); + if (ret < 0) { + return ret; + } + } + break; + } + + offset += ((ext.len + 7) & ~7); + } + + return 0; +} + +static void cleanup_unknown_header_ext(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + Qcow2UnknownHeaderExtension *uext, *next; + + QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) { + QLIST_REMOVE(uext, next); + g_free(uext); + } +} + +static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs, + const char *fmt, ...) +{ + char msg[64]; + va_list ap; + + va_start(ap, fmt); + vsnprintf(msg, sizeof(msg), fmt, ap); + va_end(ap); + + qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bs->device_name, "qcow2", msg); +} + +static void report_unsupported_feature(BlockDriverState *bs, + Qcow2Feature *table, uint64_t mask) +{ + while (table && table->name[0] != '\0') { + if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) { + if (mask & (1 << table->bit)) { + report_unsupported(bs, "%.46s",table->name); + mask &= ~(1 << table->bit); + } + } + table++; + } + + if (mask) { + report_unsupported(bs, "Unknown incompatible feature: %" PRIx64, mask); + } +} + +/* + * Sets the dirty bit and flushes afterwards if necessary. + * + * The incompatible_features bit is only set if the image file header was + * updated successfully. Therefore it is not required to check the return + * value of this function. + */ +static int qcow2_mark_dirty(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + uint64_t val; + int ret; + + assert(s->qcow_version >= 3); + + if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { + return 0; /* already dirty */ + } + + val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY); + ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features), + &val, sizeof(val)); + if (ret < 0) { + return ret; + } + ret = bdrv_flush(bs->file); + if (ret < 0) { + return ret; + } + + /* Only treat image as dirty if the header was updated successfully */ + s->incompatible_features |= QCOW2_INCOMPAT_DIRTY; + return 0; +} + +/* + * Clears the dirty bit and flushes before if necessary. Only call this + * function when there are no pending requests, it does not guard against + * concurrent requests dirtying the image. + */ +static int qcow2_mark_clean(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + + if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { + int ret = bdrv_flush(bs); + if (ret < 0) { + return ret; + } + + s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY; + return qcow2_update_header(bs); + } + return 0; +} + +static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) +{ + int ret = qcow2_check_refcounts(bs, result, fix); + if (ret < 0) { + return ret; + } + + if (fix && result->check_errors == 0 && result->corruptions == 0) { + return qcow2_mark_clean(bs); + } + return ret; +} + +static int qcow2_open(BlockDriverState *bs, int flags) +{ + BDRVQcowState *s = bs->opaque; + int len, i, ret = 0; + QCowHeader header; + uint64_t ext_end; + + ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); + if (ret < 0) { + goto fail; + } + be32_to_cpus(&header.magic); + be32_to_cpus(&header.version); + be64_to_cpus(&header.backing_file_offset); + be32_to_cpus(&header.backing_file_size); + be64_to_cpus(&header.size); + be32_to_cpus(&header.cluster_bits); + be32_to_cpus(&header.crypt_method); + be64_to_cpus(&header.l1_table_offset); + be32_to_cpus(&header.l1_size); + be64_to_cpus(&header.refcount_table_offset); + be32_to_cpus(&header.refcount_table_clusters); + be64_to_cpus(&header.snapshots_offset); + be32_to_cpus(&header.nb_snapshots); + + if (header.magic != QCOW_MAGIC) { + ret = -EINVAL; + goto fail; + } + if (header.version < 2 || header.version > 3) { + report_unsupported(bs, "QCOW version %d", header.version); + ret = -ENOTSUP; + goto fail; + } + + s->qcow_version = header.version; + + /* Initialise version 3 header fields */ + if (header.version == 2) { + header.incompatible_features = 0; + header.compatible_features = 0; + header.autoclear_features = 0; + header.refcount_order = 4; + header.header_length = 72; + } else { + be64_to_cpus(&header.incompatible_features); + be64_to_cpus(&header.compatible_features); + be64_to_cpus(&header.autoclear_features); + be32_to_cpus(&header.refcount_order); + be32_to_cpus(&header.header_length); + } + + if (header.header_length > sizeof(header)) { + s->unknown_header_fields_size = header.header_length - sizeof(header); + s->unknown_header_fields = g_malloc(s->unknown_header_fields_size); + ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields, + s->unknown_header_fields_size); + if (ret < 0) { + goto fail; + } + } + + if (header.backing_file_offset) { + ext_end = header.backing_file_offset; + } else { + ext_end = 1 << header.cluster_bits; + } + + /* Handle feature bits */ + s->incompatible_features = header.incompatible_features; + s->compatible_features = header.compatible_features; + s->autoclear_features = header.autoclear_features; + + if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) { + void *feature_table = NULL; + qcow2_read_extensions(bs, header.header_length, ext_end, + &feature_table); + report_unsupported_feature(bs, feature_table, + s->incompatible_features & + ~QCOW2_INCOMPAT_MASK); + ret = -ENOTSUP; + goto fail; + } + + /* Check support for various header values */ + if (header.refcount_order != 4) { + report_unsupported(bs, "%d bit reference counts", + 1 << header.refcount_order); + ret = -ENOTSUP; + goto fail; + } + + if (header.cluster_bits < MIN_CLUSTER_BITS || + header.cluster_bits > MAX_CLUSTER_BITS) { + ret = -EINVAL; + goto fail; + } + if (header.crypt_method > QCOW_CRYPT_AES) { + ret = -EINVAL; + goto fail; + } + s->crypt_method_header = header.crypt_method; + if (s->crypt_method_header) { + bs->encrypted = 1; + } + s->cluster_bits = header.cluster_bits; + s->cluster_size = 1 << s->cluster_bits; + s->cluster_sectors = 1 << (s->cluster_bits - 9); + s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */ + s->l2_size = 1 << s->l2_bits; + bs->total_sectors = header.size / 512; + s->csize_shift = (62 - (s->cluster_bits - 8)); + s->csize_mask = (1 << (s->cluster_bits - 8)) - 1; + s->cluster_offset_mask = (1LL << s->csize_shift) - 1; + s->refcount_table_offset = header.refcount_table_offset; + s->refcount_table_size = + header.refcount_table_clusters << (s->cluster_bits - 3); + + s->snapshots_offset = header.snapshots_offset; + s->nb_snapshots = header.nb_snapshots; + + /* read the level 1 table */ + s->l1_size = header.l1_size; + s->l1_vm_state_index = size_to_l1(s, header.size); + /* the L1 table must contain at least enough entries to put + header.size bytes */ + if (s->l1_size < s->l1_vm_state_index) { + ret = -EINVAL; + goto fail; + } + s->l1_table_offset = header.l1_table_offset; + if (s->l1_size > 0) { + s->l1_table = g_malloc0( + align_offset(s->l1_size * sizeof(uint64_t), 512)); + ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + for(i = 0;i < s->l1_size; i++) { + be64_to_cpus(&s->l1_table[i]); + } + } + + /* alloc L2 table/refcount block cache */ + s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE); + s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE); + + s->cluster_cache = g_malloc(s->cluster_size); + /* one more sector for decompressed data alignment */ + s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size + + 512); + s->cluster_cache_offset = -1; + s->flags = flags; + + ret = qcow2_refcount_init(bs); + if (ret != 0) { + goto fail; + } + + QLIST_INIT(&s->cluster_allocs); + + /* read qcow2 extensions */ + if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) { + ret = -EINVAL; + goto fail; + } + + /* read the backing file name */ + if (header.backing_file_offset != 0) { + len = header.backing_file_size; + if (len > 1023) { + len = 1023; + } + ret = bdrv_pread(bs->file, header.backing_file_offset, + bs->backing_file, len); + if (ret < 0) { + goto fail; + } + bs->backing_file[len] = '\0'; + } + + ret = qcow2_read_snapshots(bs); + if (ret < 0) { + goto fail; + } + + /* Clear unknown autoclear feature bits */ + if (!bs->read_only && s->autoclear_features != 0) { + s->autoclear_features = 0; + ret = qcow2_update_header(bs); + if (ret < 0) { + goto fail; + } + } + + /* Initialise locks */ + qemu_co_mutex_init(&s->lock); + + /* Repair image if dirty */ + if (!(flags & BDRV_O_CHECK) && !bs->read_only && + (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) { + BdrvCheckResult result = {0}; + + ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS); + if (ret < 0) { + goto fail; + } + } + +#ifdef DEBUG_ALLOC + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result, 0); + } +#endif + return ret; + + fail: + g_free(s->unknown_header_fields); + cleanup_unknown_header_ext(bs); + qcow2_free_snapshots(bs); + qcow2_refcount_close(bs); + g_free(s->l1_table); + if (s->l2_table_cache) { + qcow2_cache_destroy(bs, s->l2_table_cache); + } + g_free(s->cluster_cache); + qemu_vfree(s->cluster_data); + return ret; +} + +static int qcow2_set_key(BlockDriverState *bs, const char *key) +{ + BDRVQcowState *s = bs->opaque; + uint8_t keybuf[16]; + int len, i; + + memset(keybuf, 0, 16); + len = strlen(key); + if (len > 16) + len = 16; + /* XXX: we could compress the chars to 7 bits to increase + entropy */ + for(i = 0;i < len;i++) { + keybuf[i] = key[i]; + } + s->crypt_method = s->crypt_method_header; + + if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) + return -1; + if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) + return -1; +#if 0 + /* test */ + { + uint8_t in[16]; + uint8_t out[16]; + uint8_t tmp[16]; + for(i=0;i<16;i++) + in[i] = i; + AES_encrypt(in, tmp, &s->aes_encrypt_key); + AES_decrypt(tmp, out, &s->aes_decrypt_key); + for(i = 0; i < 16; i++) + printf(" %02x", tmp[i]); + printf("\n"); + for(i = 0; i < 16; i++) + printf(" %02x", out[i]); + printf("\n"); + } +#endif + return 0; +} + +static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) +{ + BDRVQcowState *s = bs->opaque; + uint64_t cluster_offset; + int ret; + + *pnum = nb_sectors; + /* FIXME We can get errors here, but the bdrv_co_is_allocated interface + * can't pass them on today */ + qemu_co_mutex_lock(&s->lock); + ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset); + qemu_co_mutex_unlock(&s->lock); + if (ret < 0) { + *pnum = 0; + } + + return (cluster_offset != 0); +} + +/* handle reading after the end of the backing file */ +int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t sector_num, int nb_sectors) +{ + int n1; + if ((sector_num + nb_sectors) <= bs->total_sectors) + return nb_sectors; + if (sector_num >= bs->total_sectors) + n1 = 0; + else + n1 = bs->total_sectors - sector_num; + + qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1)); + + return n1; +} + +static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, + int remaining_sectors, QEMUIOVector *qiov) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster, n1; + int ret; + int cur_nr_sectors; /* number of sectors in current iteration */ + uint64_t cluster_offset = 0; + uint64_t bytes_done = 0; + QEMUIOVector hd_qiov; + uint8_t *cluster_data = NULL; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + qemu_co_mutex_lock(&s->lock); + + while (remaining_sectors != 0) { + + /* prepare next request */ + cur_nr_sectors = remaining_sectors; + if (s->crypt_method) { + cur_nr_sectors = MIN(cur_nr_sectors, + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); + } + + ret = qcow2_get_cluster_offset(bs, sector_num << 9, + &cur_nr_sectors, &cluster_offset); + if (ret < 0) { + goto fail; + } + + index_in_cluster = sector_num & (s->cluster_sectors - 1); + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, + cur_nr_sectors * 512); + + switch (ret) { + case QCOW2_CLUSTER_UNALLOCATED: + + if (bs->backing_hd) { + /* read from the base image */ + n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov, + sector_num, cur_nr_sectors); + if (n1 > 0) { + BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->backing_hd, sector_num, + n1, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } + } + } else { + /* Note: in this case, no need to wait */ + qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); + } + break; + + case QCOW2_CLUSTER_ZERO: + if (s->qcow_version < 3) { + ret = -EIO; + goto fail; + } + qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); + break; + + case QCOW2_CLUSTER_COMPRESSED: + /* add AIO support for compressed blocks ? */ + ret = qcow2_decompress_cluster(bs, cluster_offset); + if (ret < 0) { + goto fail; + } + + qemu_iovec_from_buf(&hd_qiov, 0, + s->cluster_cache + index_in_cluster * 512, + 512 * cur_nr_sectors); + break; + + case QCOW2_CLUSTER_NORMAL: + if ((cluster_offset & 511) != 0) { + ret = -EIO; + goto fail; + } + + if (s->crypt_method) { + /* + * For encrypted images, read everything into a temporary + * contiguous buffer on which the AES functions can work. + */ + if (!cluster_data) { + cluster_data = + qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); + } + + assert(cur_nr_sectors <= + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); + qemu_iovec_reset(&hd_qiov); + qemu_iovec_add(&hd_qiov, cluster_data, + 512 * cur_nr_sectors); + } + + BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->file, + (cluster_offset >> 9) + index_in_cluster, + cur_nr_sectors, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } + if (s->crypt_method) { + qcow2_encrypt_sectors(s, sector_num, cluster_data, + cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key); + qemu_iovec_from_buf(qiov, bytes_done, + cluster_data, 512 * cur_nr_sectors); + } + break; + + default: + g_assert_not_reached(); + ret = -EIO; + goto fail; + } + + remaining_sectors -= cur_nr_sectors; + sector_num += cur_nr_sectors; + bytes_done += cur_nr_sectors * 512; + } + ret = 0; + +fail: + qemu_co_mutex_unlock(&s->lock); + + qemu_iovec_destroy(&hd_qiov); + qemu_vfree(cluster_data); + + return ret; +} + +static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m) +{ + /* Take the request off the list of running requests */ + if (m->nb_clusters != 0) { + QLIST_REMOVE(m, next_in_flight); + } + + /* Restart all dependent requests */ + if (!qemu_co_queue_empty(&m->dependent_requests)) { + qemu_co_mutex_unlock(&s->lock); + qemu_co_queue_restart_all(&m->dependent_requests); + qemu_co_mutex_lock(&s->lock); + } +} + +static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, + int64_t sector_num, + int remaining_sectors, + QEMUIOVector *qiov) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster; + int n_end; + int ret; + int cur_nr_sectors; /* number of sectors in current iteration */ + uint64_t cluster_offset; + QEMUIOVector hd_qiov; + uint64_t bytes_done = 0; + uint8_t *cluster_data = NULL; + QCowL2Meta l2meta = { + .nb_clusters = 0, + }; + + trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num, + remaining_sectors); + + qemu_co_queue_init(&l2meta.dependent_requests); + + qemu_iovec_init(&hd_qiov, qiov->niov); + + s->cluster_cache_offset = -1; /* disable compressed cache */ + + qemu_co_mutex_lock(&s->lock); + + while (remaining_sectors != 0) { + + trace_qcow2_writev_start_part(qemu_coroutine_self()); + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n_end = index_in_cluster + remaining_sectors; + if (s->crypt_method && + n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) { + n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; + } + + ret = qcow2_alloc_cluster_offset(bs, sector_num << 9, + index_in_cluster, n_end, &cur_nr_sectors, &l2meta); + if (ret < 0) { + goto fail; + } + + if (l2meta.nb_clusters > 0 && + (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)) { + qcow2_mark_dirty(bs); + } + + cluster_offset = l2meta.cluster_offset; + assert((cluster_offset & 511) == 0); + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, + cur_nr_sectors * 512); + + if (s->crypt_method) { + if (!cluster_data) { + cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * + s->cluster_size); + } + + assert(hd_qiov.size <= + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); + qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size); + + qcow2_encrypt_sectors(s, sector_num, cluster_data, + cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key); + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_add(&hd_qiov, cluster_data, + cur_nr_sectors * 512); + } + + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); + qemu_co_mutex_unlock(&s->lock); + trace_qcow2_writev_data(qemu_coroutine_self(), + (cluster_offset >> 9) + index_in_cluster); + ret = bdrv_co_writev(bs->file, + (cluster_offset >> 9) + index_in_cluster, + cur_nr_sectors, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } + + ret = qcow2_alloc_cluster_link_l2(bs, &l2meta); + if (ret < 0) { + goto fail; + } + + run_dependent_requests(s, &l2meta); + + remaining_sectors -= cur_nr_sectors; + sector_num += cur_nr_sectors; + bytes_done += cur_nr_sectors * 512; + trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors); + } + ret = 0; + +fail: + run_dependent_requests(s, &l2meta); + + qemu_co_mutex_unlock(&s->lock); + + qemu_iovec_destroy(&hd_qiov); + qemu_vfree(cluster_data); + trace_qcow2_writev_done_req(qemu_coroutine_self(), ret); + + return ret; +} + +static void qcow2_close(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + g_free(s->l1_table); + + qcow2_cache_flush(bs, s->l2_table_cache); + qcow2_cache_flush(bs, s->refcount_block_cache); + + qcow2_mark_clean(bs); + + qcow2_cache_destroy(bs, s->l2_table_cache); + qcow2_cache_destroy(bs, s->refcount_block_cache); + + g_free(s->unknown_header_fields); + cleanup_unknown_header_ext(bs); + + g_free(s->cluster_cache); + qemu_vfree(s->cluster_data); + qcow2_refcount_close(bs); + qcow2_free_snapshots(bs); +} + +static void qcow2_invalidate_cache(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int flags = s->flags; + AES_KEY aes_encrypt_key; + AES_KEY aes_decrypt_key; + uint32_t crypt_method = 0; + + /* + * Backing files are read-only which makes all of their metadata immutable, + * that means we don't have to worry about reopening them here. + */ + + if (s->crypt_method) { + crypt_method = s->crypt_method; + memcpy(&aes_encrypt_key, &s->aes_encrypt_key, sizeof(aes_encrypt_key)); + memcpy(&aes_decrypt_key, &s->aes_decrypt_key, sizeof(aes_decrypt_key)); + } + + qcow2_close(bs); + + memset(s, 0, sizeof(BDRVQcowState)); + qcow2_open(bs, flags); + + if (crypt_method) { + s->crypt_method = crypt_method; + memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key)); + memcpy(&s->aes_decrypt_key, &aes_decrypt_key, sizeof(aes_decrypt_key)); + } +} + +static size_t header_ext_add(char *buf, uint32_t magic, const void *s, + size_t len, size_t buflen) +{ + QCowExtension *ext_backing_fmt = (QCowExtension*) buf; + size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7); + + if (buflen < ext_len) { + return -ENOSPC; + } + + *ext_backing_fmt = (QCowExtension) { + .magic = cpu_to_be32(magic), + .len = cpu_to_be32(len), + }; + memcpy(buf + sizeof(QCowExtension), s, len); + + return ext_len; +} + +/* + * Updates the qcow2 header, including the variable length parts of it, i.e. + * the backing file name and all extensions. qcow2 was not designed to allow + * such changes, so if we run out of space (we can only use the first cluster) + * this function may fail. + * + * Returns 0 on success, -errno in error cases. + */ +int qcow2_update_header(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + QCowHeader *header; + char *buf; + size_t buflen = s->cluster_size; + int ret; + uint64_t total_size; + uint32_t refcount_table_clusters; + size_t header_length; + Qcow2UnknownHeaderExtension *uext; + + buf = qemu_blockalign(bs, buflen); + + /* Header structure */ + header = (QCowHeader*) buf; + + if (buflen < sizeof(*header)) { + ret = -ENOSPC; + goto fail; + } + + header_length = sizeof(*header) + s->unknown_header_fields_size; + total_size = bs->total_sectors * BDRV_SECTOR_SIZE; + refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3); + + *header = (QCowHeader) { + /* Version 2 fields */ + .magic = cpu_to_be32(QCOW_MAGIC), + .version = cpu_to_be32(s->qcow_version), + .backing_file_offset = 0, + .backing_file_size = 0, + .cluster_bits = cpu_to_be32(s->cluster_bits), + .size = cpu_to_be64(total_size), + .crypt_method = cpu_to_be32(s->crypt_method_header), + .l1_size = cpu_to_be32(s->l1_size), + .l1_table_offset = cpu_to_be64(s->l1_table_offset), + .refcount_table_offset = cpu_to_be64(s->refcount_table_offset), + .refcount_table_clusters = cpu_to_be32(refcount_table_clusters), + .nb_snapshots = cpu_to_be32(s->nb_snapshots), + .snapshots_offset = cpu_to_be64(s->snapshots_offset), + + /* Version 3 fields */ + .incompatible_features = cpu_to_be64(s->incompatible_features), + .compatible_features = cpu_to_be64(s->compatible_features), + .autoclear_features = cpu_to_be64(s->autoclear_features), + .refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT), + .header_length = cpu_to_be32(header_length), + }; + + /* For older versions, write a shorter header */ + switch (s->qcow_version) { + case 2: + ret = offsetof(QCowHeader, incompatible_features); + break; + case 3: + ret = sizeof(*header); + break; + default: + ret = -EINVAL; + goto fail; + } + + buf += ret; + buflen -= ret; + memset(buf, 0, buflen); + + /* Preserve any unknown field in the header */ + if (s->unknown_header_fields_size) { + if (buflen < s->unknown_header_fields_size) { + ret = -ENOSPC; + goto fail; + } + + memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size); + buf += s->unknown_header_fields_size; + buflen -= s->unknown_header_fields_size; + } + + /* Backing file format header extension */ + if (*bs->backing_format) { + ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT, + bs->backing_format, strlen(bs->backing_format), + buflen); + if (ret < 0) { + goto fail; + } + + buf += ret; + buflen -= ret; + } + + /* Feature table */ + Qcow2Feature features[] = { + { + .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, + .bit = QCOW2_INCOMPAT_DIRTY_BITNR, + .name = "dirty bit", + }, + { + .type = QCOW2_FEAT_TYPE_COMPATIBLE, + .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, + .name = "lazy refcounts", + }, + }; + + ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE, + features, sizeof(features), buflen); + if (ret < 0) { + goto fail; + } + buf += ret; + buflen -= ret; + + /* Keep unknown header extensions */ + QLIST_FOREACH(uext, &s->unknown_header_ext, next) { + ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen); + if (ret < 0) { + goto fail; + } + + buf += ret; + buflen -= ret; + } + + /* End of header extensions */ + ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen); + if (ret < 0) { + goto fail; + } + + buf += ret; + buflen -= ret; + + /* Backing file name */ + if (*bs->backing_file) { + size_t backing_file_len = strlen(bs->backing_file); + + if (buflen < backing_file_len) { + ret = -ENOSPC; + goto fail; + } + + strncpy(buf, bs->backing_file, buflen); + + header->backing_file_offset = cpu_to_be64(buf - ((char*) header)); + header->backing_file_size = cpu_to_be32(backing_file_len); + } + + /* Write the new header */ + ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size); + if (ret < 0) { + goto fail; + } + + ret = 0; +fail: + qemu_vfree(header); + return ret; +} + +static int qcow2_change_backing_file(BlockDriverState *bs, + const char *backing_file, const char *backing_fmt) +{ + pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: ""); + pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: ""); + + return qcow2_update_header(bs); +} + +static int preallocate(BlockDriverState *bs) +{ + uint64_t nb_sectors; + uint64_t offset; + int num; + int ret; + QCowL2Meta meta; + + nb_sectors = bdrv_getlength(bs) >> 9; + offset = 0; + qemu_co_queue_init(&meta.dependent_requests); + meta.cluster_offset = 0; + + while (nb_sectors) { + num = MIN(nb_sectors, INT_MAX >> 9); + ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, &meta); + if (ret < 0) { + return ret; + } + + ret = qcow2_alloc_cluster_link_l2(bs, &meta); + if (ret < 0) { + qcow2_free_any_clusters(bs, meta.cluster_offset, meta.nb_clusters); + return ret; + } + + /* There are no dependent requests, but we need to remove our request + * from the list of in-flight requests */ + run_dependent_requests(bs->opaque, &meta); + + /* TODO Preallocate data if requested */ + + nb_sectors -= num; + offset += num << 9; + } + + /* + * It is expected that the image file is large enough to actually contain + * all of the allocated clusters (otherwise we get failing reads after + * EOF). Extend the image to the last allocated sector. + */ + if (meta.cluster_offset != 0) { + uint8_t buf[512]; + memset(buf, 0, 512); + ret = bdrv_write(bs->file, (meta.cluster_offset >> 9) + num - 1, buf, 1); + if (ret < 0) { + return ret; + } + } + + return 0; +} + +static int qcow2_create2(const char *filename, int64_t total_size, + const char *backing_file, const char *backing_format, + int flags, size_t cluster_size, int prealloc, + QEMUOptionParameter *options, int version) +{ + /* Calculate cluster_bits */ + int cluster_bits; + cluster_bits = ffs(cluster_size) - 1; + if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || + (1 << cluster_bits) != cluster_size) + { + error_report( + "Cluster size must be a power of two between %d and %dk", + 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10)); + return -EINVAL; + } + + /* + * Open the image file and write a minimal qcow2 header. + * + * We keep things simple and start with a zero-sized image. We also + * do without refcount blocks or a L1 table for now. We'll fix the + * inconsistency later. + * + * We do need a refcount table because growing the refcount table means + * allocating two new refcount blocks - the seconds of which would be at + * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file + * size for any qcow2 image. + */ + BlockDriverState* bs; + QCowHeader header; + uint8_t* refcount_table; + int ret; + + ret = bdrv_create_file(filename, options); + if (ret < 0) { + return ret; + } + + ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR); + if (ret < 0) { + return ret; + } + + /* Write the header */ + memset(&header, 0, sizeof(header)); + header.magic = cpu_to_be32(QCOW_MAGIC); + header.version = cpu_to_be32(version); + header.cluster_bits = cpu_to_be32(cluster_bits); + header.size = cpu_to_be64(0); + header.l1_table_offset = cpu_to_be64(0); + header.l1_size = cpu_to_be32(0); + header.refcount_table_offset = cpu_to_be64(cluster_size); + header.refcount_table_clusters = cpu_to_be32(1); + header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT); + header.header_length = cpu_to_be32(sizeof(header)); + + if (flags & BLOCK_FLAG_ENCRYPT) { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); + } else { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); + } + + if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) { + header.compatible_features |= + cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); + } + + ret = bdrv_pwrite(bs, 0, &header, sizeof(header)); + if (ret < 0) { + goto out; + } + + /* Write an empty refcount table */ + refcount_table = g_malloc0(cluster_size); + ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size); + g_free(refcount_table); + + if (ret < 0) { + goto out; + } + + bdrv_close(bs); + + /* + * And now open the image and make it consistent first (i.e. increase the + * refcount of the cluster that is occupied by the header and the refcount + * table) + */ + BlockDriver* drv = bdrv_find_format("qcow2"); + assert(drv != NULL); + ret = bdrv_open(bs, filename, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv); + if (ret < 0) { + goto out; + } + + ret = qcow2_alloc_clusters(bs, 2 * cluster_size); + if (ret < 0) { + goto out; + + } else if (ret != 0) { + error_report("Huh, first cluster in empty image is already in use?"); + abort(); + } + + /* Okay, now that we have a valid image, let's give it the right size */ + ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE); + if (ret < 0) { + goto out; + } + + /* Want a backing file? There you go.*/ + if (backing_file) { + ret = bdrv_change_backing_file(bs, backing_file, backing_format); + if (ret < 0) { + goto out; + } + } + + /* And if we're supposed to preallocate metadata, do that now */ + if (prealloc) { + BDRVQcowState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = preallocate(bs); + qemu_co_mutex_unlock(&s->lock); + if (ret < 0) { + goto out; + } + } + + ret = 0; +out: + bdrv_delete(bs); + return ret; +} + +static int qcow2_create(const char *filename, QEMUOptionParameter *options) +{ + const char *backing_file = NULL; + const char *backing_fmt = NULL; + uint64_t sectors = 0; + int flags = 0; + size_t cluster_size = DEFAULT_CLUSTER_SIZE; + int prealloc = 0; + int version = 2; + + /* Read out options */ + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + sectors = options->value.n / 512; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { + backing_file = options->value.s; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) { + backing_fmt = options->value.s; + } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) { + flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0; + } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { + if (options->value.n) { + cluster_size = options->value.n; + } + } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) { + if (!options->value.s || !strcmp(options->value.s, "off")) { + prealloc = 0; + } else if (!strcmp(options->value.s, "metadata")) { + prealloc = 1; + } else { + fprintf(stderr, "Invalid preallocation mode: '%s'\n", + options->value.s); + return -EINVAL; + } + } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) { + if (!options->value.s || !strcmp(options->value.s, "0.10")) { + version = 2; + } else if (!strcmp(options->value.s, "1.1")) { + version = 3; + } else { + fprintf(stderr, "Invalid compatibility level: '%s'\n", + options->value.s); + return -EINVAL; + } + } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) { + flags |= options->value.n ? BLOCK_FLAG_LAZY_REFCOUNTS : 0; + } + options++; + } + + if (backing_file && prealloc) { + fprintf(stderr, "Backing file and preallocation cannot be used at " + "the same time\n"); + return -EINVAL; + } + + if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) { + fprintf(stderr, "Lazy refcounts only supported with compatibility " + "level 1.1 and above (use compat=1.1 or greater)\n"); + return -EINVAL; + } + + return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags, + cluster_size, prealloc, options, version); +} + +static int qcow2_make_empty(BlockDriverState *bs) +{ +#if 0 + /* XXX: not correct */ + BDRVQcowState *s = bs->opaque; + uint32_t l1_length = s->l1_size * sizeof(uint64_t); + int ret; + + memset(s->l1_table, 0, l1_length); + if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0) + return -1; + ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length); + if (ret < 0) + return ret; + + l2_cache_reset(bs); +#endif + return 0; +} + +static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + int ret; + BDRVQcowState *s = bs->opaque; + + /* Emulate misaligned zero writes */ + if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) { + return -ENOTSUP; + } + + /* Whatever is left can use real zero clusters */ + qemu_co_mutex_lock(&s->lock); + ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors); + qemu_co_mutex_unlock(&s->lock); + + return ret; +} + +static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + int ret; + BDRVQcowState *s = bs->opaque; + + qemu_co_mutex_lock(&s->lock); + ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static int qcow2_truncate(BlockDriverState *bs, int64_t offset) +{ + BDRVQcowState *s = bs->opaque; + int ret, new_l1_size; + + if (offset & 511) { + error_report("The new size must be a multiple of 512"); + return -EINVAL; + } + + /* cannot proceed if image has snapshots */ + if (s->nb_snapshots) { + error_report("Can't resize an image which has snapshots"); + return -ENOTSUP; + } + + /* shrinking is currently not supported */ + if (offset < bs->total_sectors * 512) { + error_report("qcow2 doesn't support shrinking images yet"); + return -ENOTSUP; + } + + new_l1_size = size_to_l1(s, offset); + ret = qcow2_grow_l1_table(bs, new_l1_size, true); + if (ret < 0) { + return ret; + } + + /* write updated header.size */ + offset = cpu_to_be64(offset); + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size), + &offset, sizeof(uint64_t)); + if (ret < 0) { + return ret; + } + + s->l1_vm_state_index = new_l1_size; + return 0; +} + +/* XXX: put compressed sectors first, then all the cluster aligned + tables to avoid losing bytes in alignment */ +static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + z_stream strm; + int ret, out_len; + uint8_t *out_buf; + uint64_t cluster_offset; + + if (nb_sectors == 0) { + /* align end of file to a sector boundary to ease reading with + sector based I/Os */ + cluster_offset = bdrv_getlength(bs->file); + cluster_offset = (cluster_offset + 511) & ~511; + bdrv_truncate(bs->file, cluster_offset); + return 0; + } + + if (nb_sectors != s->cluster_sectors) + return -EINVAL; + + out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + + /* best compression, small window, no zlib header */ + memset(&strm, 0, sizeof(strm)); + ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, -12, + 9, Z_DEFAULT_STRATEGY); + if (ret != 0) { + ret = -EINVAL; + goto fail; + } + + strm.avail_in = s->cluster_size; + strm.next_in = (uint8_t *)buf; + strm.avail_out = s->cluster_size; + strm.next_out = out_buf; + + ret = deflate(&strm, Z_FINISH); + if (ret != Z_STREAM_END && ret != Z_OK) { + deflateEnd(&strm); + ret = -EINVAL; + goto fail; + } + out_len = strm.next_out - out_buf; + + deflateEnd(&strm); + + if (ret != Z_STREAM_END || out_len >= s->cluster_size) { + /* could not compress: write normal cluster */ + ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); + if (ret < 0) { + goto fail; + } + } else { + cluster_offset = qcow2_alloc_compressed_cluster_offset(bs, + sector_num << 9, out_len); + if (!cluster_offset) { + ret = -EIO; + goto fail; + } + cluster_offset &= s->cluster_offset_mask; + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); + ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); + if (ret < 0) { + goto fail; + } + } + + ret = 0; +fail: + g_free(out_buf); + return ret; +} + +static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int ret; + + qemu_co_mutex_lock(&s->lock); + ret = qcow2_cache_flush(bs, s->l2_table_cache); + if (ret < 0) { + qemu_co_mutex_unlock(&s->lock); + return ret; + } + + if (qcow2_need_accurate_refcounts(s)) { + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + qemu_co_mutex_unlock(&s->lock); + return ret; + } + } + qemu_co_mutex_unlock(&s->lock); + + return 0; +} + +static int64_t qcow2_vm_state_offset(BDRVQcowState *s) +{ + return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); +} + +static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVQcowState *s = bs->opaque; + bdi->cluster_size = s->cluster_size; + bdi->vm_state_offset = qcow2_vm_state_offset(s); + return 0; +} + +#if 0 +static void dump_refcounts(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int64_t nb_clusters, k, k1, size; + int refcount; + + size = bdrv_getlength(bs->file); + nb_clusters = size_to_clusters(s, size); + for(k = 0; k < nb_clusters;) { + k1 = k; + refcount = get_refcount(bs, k); + k++; + while (k < nb_clusters && get_refcount(bs, k) == refcount) + k++; + printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount, + k - k1); + } +} +#endif + +static int qcow2_save_vmstate(BlockDriverState *bs, const uint8_t *buf, + int64_t pos, int size) +{ + BDRVQcowState *s = bs->opaque; + int growable = bs->growable; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); + bs->growable = 1; + ret = bdrv_pwrite(bs, qcow2_vm_state_offset(s) + pos, buf, size); + bs->growable = growable; + + return ret; +} + +static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size) +{ + BDRVQcowState *s = bs->opaque; + int growable = bs->growable; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); + bs->growable = 1; + ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size); + bs->growable = growable; + + return ret; +} + +static QEMUOptionParameter qcow2_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_COMPAT_LEVEL, + .type = OPT_STRING, + .help = "Compatibility level (0.10 or 1.1)" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = OPT_STRING, + .help = "File name of a base image" + }, + { + .name = BLOCK_OPT_BACKING_FMT, + .type = OPT_STRING, + .help = "Image format of the base image" + }, + { + .name = BLOCK_OPT_ENCRYPT, + .type = OPT_FLAG, + .help = "Encrypt the image" + }, + { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = OPT_SIZE, + .help = "qcow2 cluster size", + .value = { .n = DEFAULT_CLUSTER_SIZE }, + }, + { + .name = BLOCK_OPT_PREALLOC, + .type = OPT_STRING, + .help = "Preallocation mode (allowed values: off, metadata)" + }, + { + .name = BLOCK_OPT_LAZY_REFCOUNTS, + .type = OPT_FLAG, + .help = "Postpone refcount updates", + }, + { NULL } +}; + +static BlockDriver bdrv_qcow2 = { + .format_name = "qcow2", + .instance_size = sizeof(BDRVQcowState), + .bdrv_probe = qcow2_probe, + .bdrv_open = qcow2_open, + .bdrv_close = qcow2_close, + .bdrv_create = qcow2_create, + .bdrv_co_is_allocated = qcow2_co_is_allocated, + .bdrv_set_key = qcow2_set_key, + .bdrv_make_empty = qcow2_make_empty, + + .bdrv_co_readv = qcow2_co_readv, + .bdrv_co_writev = qcow2_co_writev, + .bdrv_co_flush_to_os = qcow2_co_flush_to_os, + + .bdrv_co_write_zeroes = qcow2_co_write_zeroes, + .bdrv_co_discard = qcow2_co_discard, + .bdrv_truncate = qcow2_truncate, + .bdrv_write_compressed = qcow2_write_compressed, + + .bdrv_snapshot_create = qcow2_snapshot_create, + .bdrv_snapshot_goto = qcow2_snapshot_goto, + .bdrv_snapshot_delete = qcow2_snapshot_delete, + .bdrv_snapshot_list = qcow2_snapshot_list, + .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp, + .bdrv_get_info = qcow2_get_info, + + .bdrv_save_vmstate = qcow2_save_vmstate, + .bdrv_load_vmstate = qcow2_load_vmstate, + + .bdrv_change_backing_file = qcow2_change_backing_file, + + .bdrv_invalidate_cache = qcow2_invalidate_cache, + + .create_options = qcow2_create_options, + .bdrv_check = qcow2_check, +}; + +static void bdrv_qcow2_init(void) +{ + bdrv_register(&bdrv_qcow2); +} + +block_init(bdrv_qcow2_init); diff --git a/block/qcow2.h b/block/qcow2.h new file mode 100644 index 000000000..b4eb65470 --- /dev/null +++ b/block/qcow2.h @@ -0,0 +1,336 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef BLOCK_QCOW2_H +#define BLOCK_QCOW2_H + +#include "aes.h" +#include "qemu-coroutine.h" + +//#define DEBUG_ALLOC +//#define DEBUG_ALLOC2 +//#define DEBUG_EXT + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) + +#define QCOW_CRYPT_NONE 0 +#define QCOW_CRYPT_AES 1 + +#define QCOW_MAX_CRYPT_CLUSTERS 32 + +/* indicate that the refcount of the referenced cluster is exactly one. */ +#define QCOW_OFLAG_COPIED (1LL << 63) +/* indicate that the cluster is compressed (they never have the copied flag) */ +#define QCOW_OFLAG_COMPRESSED (1LL << 62) +/* The cluster reads as all zeros */ +#define QCOW_OFLAG_ZERO (1LL << 0) + +#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */ + +#define MIN_CLUSTER_BITS 9 +#define MAX_CLUSTER_BITS 21 + +#define L2_CACHE_SIZE 16 + +/* Must be at least 4 to cover all cases of refcount table growth */ +#define REFCOUNT_CACHE_SIZE 4 + +#define DEFAULT_CLUSTER_SIZE 65536 + +typedef struct QCowHeader { + uint32_t magic; + uint32_t version; + uint64_t backing_file_offset; + uint32_t backing_file_size; + uint32_t cluster_bits; + uint64_t size; /* in bytes */ + uint32_t crypt_method; + uint32_t l1_size; /* XXX: save number of clusters instead ? */ + uint64_t l1_table_offset; + uint64_t refcount_table_offset; + uint32_t refcount_table_clusters; + uint32_t nb_snapshots; + uint64_t snapshots_offset; + + /* The following fields are only valid for version >= 3 */ + uint64_t incompatible_features; + uint64_t compatible_features; + uint64_t autoclear_features; + + uint32_t refcount_order; + uint32_t header_length; +} QCowHeader; + +typedef struct QCowSnapshot { + uint64_t l1_table_offset; + uint32_t l1_size; + char *id_str; + char *name; + uint64_t disk_size; + uint64_t vm_state_size; + uint32_t date_sec; + uint32_t date_nsec; + uint64_t vm_clock_nsec; +} QCowSnapshot; + +struct Qcow2Cache; +typedef struct Qcow2Cache Qcow2Cache; + +typedef struct Qcow2UnknownHeaderExtension { + uint32_t magic; + uint32_t len; + QLIST_ENTRY(Qcow2UnknownHeaderExtension) next; + uint8_t data[]; +} Qcow2UnknownHeaderExtension; + +enum { + QCOW2_FEAT_TYPE_INCOMPATIBLE = 0, + QCOW2_FEAT_TYPE_COMPATIBLE = 1, + QCOW2_FEAT_TYPE_AUTOCLEAR = 2, +}; + +/* Incompatible feature bits */ +enum { + QCOW2_INCOMPAT_DIRTY_BITNR = 0, + QCOW2_INCOMPAT_DIRTY = 1 << QCOW2_INCOMPAT_DIRTY_BITNR, + + QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY, +}; + +/* Compatible feature bits */ +enum { + QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR = 0, + QCOW2_COMPAT_LAZY_REFCOUNTS = 1 << QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, + + QCOW2_COMPAT_FEAT_MASK = QCOW2_COMPAT_LAZY_REFCOUNTS, +}; + +typedef struct Qcow2Feature { + uint8_t type; + uint8_t bit; + char name[46]; +} QEMU_PACKED Qcow2Feature; + +typedef struct BDRVQcowState { + int cluster_bits; + int cluster_size; + int cluster_sectors; + int l2_bits; + int l2_size; + int l1_size; + int l1_vm_state_index; + int csize_shift; + int csize_mask; + uint64_t cluster_offset_mask; + uint64_t l1_table_offset; + uint64_t *l1_table; + + Qcow2Cache* l2_table_cache; + Qcow2Cache* refcount_block_cache; + + uint8_t *cluster_cache; + uint8_t *cluster_data; + uint64_t cluster_cache_offset; + QLIST_HEAD(QCowClusterAlloc, QCowL2Meta) cluster_allocs; + + uint64_t *refcount_table; + uint64_t refcount_table_offset; + uint32_t refcount_table_size; + int64_t free_cluster_index; + int64_t free_byte_offset; + + CoMutex lock; + + uint32_t crypt_method; /* current crypt method, 0 if no key yet */ + uint32_t crypt_method_header; + AES_KEY aes_encrypt_key; + AES_KEY aes_decrypt_key; + uint64_t snapshots_offset; + int snapshots_size; + int nb_snapshots; + QCowSnapshot *snapshots; + + int flags; + int qcow_version; + + uint64_t incompatible_features; + uint64_t compatible_features; + uint64_t autoclear_features; + + size_t unknown_header_fields_size; + void* unknown_header_fields; + QLIST_HEAD(, Qcow2UnknownHeaderExtension) unknown_header_ext; +} BDRVQcowState; + +/* XXX: use std qcow open function ? */ +typedef struct QCowCreateState { + int cluster_size; + int cluster_bits; + uint16_t *refcount_block; + uint64_t *refcount_table; + int64_t l1_table_offset; + int64_t refcount_table_offset; + int64_t refcount_block_offset; +} QCowCreateState; + +struct QCowAIOCB; + +/* XXX This could be private for qcow2-cluster.c */ +typedef struct QCowL2Meta +{ + uint64_t offset; + uint64_t cluster_offset; + uint64_t alloc_offset; + int n_start; + int nb_available; + int nb_clusters; + CoQueue dependent_requests; + + QLIST_ENTRY(QCowL2Meta) next_in_flight; +} QCowL2Meta; + +enum { + QCOW2_CLUSTER_UNALLOCATED, + QCOW2_CLUSTER_NORMAL, + QCOW2_CLUSTER_COMPRESSED, + QCOW2_CLUSTER_ZERO +}; + +#define L1E_OFFSET_MASK 0x00ffffffffffff00ULL +#define L2E_OFFSET_MASK 0x00ffffffffffff00ULL +#define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL + +#define REFT_OFFSET_MASK 0xffffffffffffff00ULL + +static inline int size_to_clusters(BDRVQcowState *s, int64_t size) +{ + return (size + (s->cluster_size - 1)) >> s->cluster_bits; +} + +static inline int size_to_l1(BDRVQcowState *s, int64_t size) +{ + int shift = s->cluster_bits + s->l2_bits; + return (size + (1ULL << shift) - 1) >> shift; +} + +static inline int64_t align_offset(int64_t offset, int n) +{ + offset = (offset + n - 1) & ~(n - 1); + return offset; +} + +static inline int qcow2_get_cluster_type(uint64_t l2_entry) +{ + if (l2_entry & QCOW_OFLAG_COMPRESSED) { + return QCOW2_CLUSTER_COMPRESSED; + } else if (l2_entry & QCOW_OFLAG_ZERO) { + return QCOW2_CLUSTER_ZERO; + } else if (!(l2_entry & L2E_OFFSET_MASK)) { + return QCOW2_CLUSTER_UNALLOCATED; + } else { + return QCOW2_CLUSTER_NORMAL; + } +} + +/* Check whether refcounts are eager or lazy */ +static inline bool qcow2_need_accurate_refcounts(BDRVQcowState *s) +{ + return !(s->incompatible_features & QCOW2_INCOMPAT_DIRTY); +} + +// FIXME Need qcow2_ prefix to global functions + +/* qcow2.c functions */ +int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t sector_num, int nb_sectors); +int qcow2_update_header(BlockDriverState *bs); + +/* qcow2-refcount.c functions */ +int qcow2_refcount_init(BlockDriverState *bs); +void qcow2_refcount_close(BlockDriverState *bs); + +int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size); +int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, + int nb_clusters); +int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size); +void qcow2_free_clusters(BlockDriverState *bs, + int64_t offset, int64_t size); +void qcow2_free_any_clusters(BlockDriverState *bs, + uint64_t cluster_offset, int nb_clusters); + +int qcow2_update_snapshot_refcount(BlockDriverState *bs, + int64_t l1_table_offset, int l1_size, int addend); + +int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix); + +/* qcow2-cluster.c functions */ +int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size); +void qcow2_l2_cache_reset(BlockDriverState *bs); +int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); +void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, int enc, + const AES_KEY *key); + +int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, + int *num, uint64_t *cluster_offset); +int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, + int n_start, int n_end, int *num, QCowL2Meta *m); +uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, + uint64_t offset, + int compressed_size); + +int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m); +int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, + int nb_sectors); +int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors); + +/* qcow2-snapshot.c functions */ +int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info); +int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id); +int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id); +int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab); +int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name); + +void qcow2_free_snapshots(BlockDriverState *bs); +int qcow2_read_snapshots(BlockDriverState *bs); + +/* qcow2-cache.c functions */ +Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables); +int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c); + +void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table); +int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c); +int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, + Qcow2Cache *dependency); +void qcow2_cache_depends_on_flush(Qcow2Cache *c); + +int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, + void **table); +int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, + void **table); +int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table); + +#endif diff --git a/block/qed-check.c b/block/qed-check.c new file mode 100644 index 000000000..b473dcd61 --- /dev/null +++ b/block/qed-check.c @@ -0,0 +1,248 @@ +/* + * QEMU Enhanced Disk Format Consistency Check + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qed.h" + +typedef struct { + BDRVQEDState *s; + BdrvCheckResult *result; + bool fix; /* whether to fix invalid offsets */ + + uint64_t nclusters; + uint32_t *used_clusters; /* referenced cluster bitmap */ + + QEDRequest request; +} QEDCheck; + +static bool qed_test_bit(uint32_t *bitmap, uint64_t n) { + return !!(bitmap[n / 32] & (1 << (n % 32))); +} + +static void qed_set_bit(uint32_t *bitmap, uint64_t n) { + bitmap[n / 32] |= 1 << (n % 32); +} + +/** + * Set bitmap bits for clusters + * + * @check: Check structure + * @offset: Starting offset in bytes + * @n: Number of clusters + */ +static bool qed_set_used_clusters(QEDCheck *check, uint64_t offset, + unsigned int n) +{ + uint64_t cluster = qed_bytes_to_clusters(check->s, offset); + unsigned int corruptions = 0; + + while (n-- != 0) { + /* Clusters should only be referenced once */ + if (qed_test_bit(check->used_clusters, cluster)) { + corruptions++; + } + + qed_set_bit(check->used_clusters, cluster); + cluster++; + } + + check->result->corruptions += corruptions; + return corruptions == 0; +} + +/** + * Check an L2 table + * + * @ret: Number of invalid cluster offsets + */ +static unsigned int qed_check_l2_table(QEDCheck *check, QEDTable *table) +{ + BDRVQEDState *s = check->s; + unsigned int i, num_invalid = 0; + uint64_t last_offset = 0; + + for (i = 0; i < s->table_nelems; i++) { + uint64_t offset = table->offsets[i]; + + if (qed_offset_is_unalloc_cluster(offset) || + qed_offset_is_zero_cluster(offset)) { + continue; + } + check->result->bfi.allocated_clusters++; + if (last_offset && (last_offset + s->header.cluster_size != offset)) { + check->result->bfi.fragmented_clusters++; + } + last_offset = offset; + + /* Detect invalid cluster offset */ + if (!qed_check_cluster_offset(s, offset)) { + if (check->fix) { + table->offsets[i] = 0; + check->result->corruptions_fixed++; + } else { + check->result->corruptions++; + } + + num_invalid++; + continue; + } + + qed_set_used_clusters(check, offset, 1); + } + + return num_invalid; +} + +/** + * Descend tables and check each cluster is referenced once only + */ +static int qed_check_l1_table(QEDCheck *check, QEDTable *table) +{ + BDRVQEDState *s = check->s; + unsigned int i, num_invalid_l1 = 0; + int ret, last_error = 0; + + /* Mark L1 table clusters used */ + qed_set_used_clusters(check, s->header.l1_table_offset, + s->header.table_size); + + for (i = 0; i < s->table_nelems; i++) { + unsigned int num_invalid_l2; + uint64_t offset = table->offsets[i]; + + if (qed_offset_is_unalloc_cluster(offset)) { + continue; + } + + /* Detect invalid L2 offset */ + if (!qed_check_table_offset(s, offset)) { + /* Clear invalid offset */ + if (check->fix) { + table->offsets[i] = 0; + check->result->corruptions_fixed++; + } else { + check->result->corruptions++; + } + + num_invalid_l1++; + continue; + } + + if (!qed_set_used_clusters(check, offset, s->header.table_size)) { + continue; /* skip an invalid table */ + } + + ret = qed_read_l2_table_sync(s, &check->request, offset); + if (ret) { + check->result->check_errors++; + last_error = ret; + continue; + } + + num_invalid_l2 = qed_check_l2_table(check, + check->request.l2_table->table); + + /* Write out fixed L2 table */ + if (num_invalid_l2 > 0 && check->fix) { + ret = qed_write_l2_table_sync(s, &check->request, 0, + s->table_nelems, false); + if (ret) { + check->result->check_errors++; + last_error = ret; + continue; + } + } + } + + /* Drop reference to final table */ + qed_unref_l2_cache_entry(check->request.l2_table); + check->request.l2_table = NULL; + + /* Write out fixed L1 table */ + if (num_invalid_l1 > 0 && check->fix) { + ret = qed_write_l1_table_sync(s, 0, s->table_nelems); + if (ret) { + check->result->check_errors++; + last_error = ret; + } + } + + return last_error; +} + +/** + * Check for unreferenced (leaked) clusters + */ +static void qed_check_for_leaks(QEDCheck *check) +{ + BDRVQEDState *s = check->s; + uint64_t i; + + for (i = s->header.header_size; i < check->nclusters; i++) { + if (!qed_test_bit(check->used_clusters, i)) { + check->result->leaks++; + } + } +} + +/** + * Mark an image clean once it passes check or has been repaired + */ +static void qed_check_mark_clean(BDRVQEDState *s, BdrvCheckResult *result) +{ + /* Skip if there were unfixable corruptions or I/O errors */ + if (result->corruptions > 0 || result->check_errors > 0) { + return; + } + + /* Skip if image is already marked clean */ + if (!(s->header.features & QED_F_NEED_CHECK)) { + return; + } + + /* Ensure fixes reach storage before clearing check bit */ + bdrv_flush(s->bs); + + s->header.features &= ~QED_F_NEED_CHECK; + qed_write_header_sync(s); +} + +int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix) +{ + QEDCheck check = { + .s = s, + .result = result, + .nclusters = qed_bytes_to_clusters(s, s->file_size), + .request = { .l2_table = NULL }, + .fix = fix, + }; + int ret; + + check.used_clusters = g_malloc0(((check.nclusters + 31) / 32) * + sizeof(check.used_clusters[0])); + + check.result->bfi.total_clusters = + (s->header.image_size + s->header.cluster_size - 1) / + s->header.cluster_size; + ret = qed_check_l1_table(&check, s->l1_table); + if (ret == 0) { + /* Only check for leaks if entire image was scanned successfully */ + qed_check_for_leaks(&check); + + if (fix) { + qed_check_mark_clean(s, result); + } + } + + g_free(check.used_clusters); + return ret; +} diff --git a/block/qed-cluster.c b/block/qed-cluster.c new file mode 100644 index 000000000..f64b2af8f --- /dev/null +++ b/block/qed-cluster.c @@ -0,0 +1,165 @@ +/* + * QEMU Enhanced Disk Format Cluster functions + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qed.h" + +/** + * Count the number of contiguous data clusters + * + * @s: QED state + * @table: L2 table + * @index: First cluster index + * @n: Maximum number of clusters + * @offset: Set to first cluster offset + * + * This function scans tables for contiguous clusters. A contiguous run of + * clusters may be allocated, unallocated, or zero. + */ +static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s, + QEDTable *table, + unsigned int index, + unsigned int n, + uint64_t *offset) +{ + unsigned int end = MIN(index + n, s->table_nelems); + uint64_t last = table->offsets[index]; + unsigned int i; + + *offset = last; + + for (i = index + 1; i < end; i++) { + if (qed_offset_is_unalloc_cluster(last)) { + /* Counting unallocated clusters */ + if (!qed_offset_is_unalloc_cluster(table->offsets[i])) { + break; + } + } else if (qed_offset_is_zero_cluster(last)) { + /* Counting zero clusters */ + if (!qed_offset_is_zero_cluster(table->offsets[i])) { + break; + } + } else { + /* Counting allocated clusters */ + if (table->offsets[i] != last + s->header.cluster_size) { + break; + } + last = table->offsets[i]; + } + } + return i - index; +} + +typedef struct { + BDRVQEDState *s; + uint64_t pos; + size_t len; + + QEDRequest *request; + + /* User callback */ + QEDFindClusterFunc *cb; + void *opaque; +} QEDFindClusterCB; + +static void qed_find_cluster_cb(void *opaque, int ret) +{ + QEDFindClusterCB *find_cluster_cb = opaque; + BDRVQEDState *s = find_cluster_cb->s; + QEDRequest *request = find_cluster_cb->request; + uint64_t offset = 0; + size_t len = 0; + unsigned int index; + unsigned int n; + + if (ret) { + goto out; + } + + index = qed_l2_index(s, find_cluster_cb->pos); + n = qed_bytes_to_clusters(s, + qed_offset_into_cluster(s, find_cluster_cb->pos) + + find_cluster_cb->len); + n = qed_count_contiguous_clusters(s, request->l2_table->table, + index, n, &offset); + + if (qed_offset_is_unalloc_cluster(offset)) { + ret = QED_CLUSTER_L2; + } else if (qed_offset_is_zero_cluster(offset)) { + ret = QED_CLUSTER_ZERO; + } else if (qed_check_cluster_offset(s, offset)) { + ret = QED_CLUSTER_FOUND; + } else { + ret = -EINVAL; + } + + len = MIN(find_cluster_cb->len, n * s->header.cluster_size - + qed_offset_into_cluster(s, find_cluster_cb->pos)); + +out: + find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len); + g_free(find_cluster_cb); +} + +/** + * Find the offset of a data cluster + * + * @s: QED state + * @request: L2 cache entry + * @pos: Byte position in device + * @len: Number of bytes + * @cb: Completion function + * @opaque: User data for completion function + * + * This function translates a position in the block device to an offset in the + * image file. It invokes the cb completion callback to report back the + * translated offset or unallocated range in the image file. + * + * If the L2 table exists, request->l2_table points to the L2 table cache entry + * and the caller must free the reference when they are finished. The cache + * entry is exposed in this way to avoid callers having to read the L2 table + * again later during request processing. If request->l2_table is non-NULL it + * will be unreferenced before taking on the new cache entry. + */ +void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, + size_t len, QEDFindClusterFunc *cb, void *opaque) +{ + QEDFindClusterCB *find_cluster_cb; + uint64_t l2_offset; + + /* Limit length to L2 boundary. Requests are broken up at the L2 boundary + * so that a request acts on one L2 table at a time. + */ + len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos); + + l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)]; + if (qed_offset_is_unalloc_cluster(l2_offset)) { + cb(opaque, QED_CLUSTER_L1, 0, len); + return; + } + if (!qed_check_table_offset(s, l2_offset)) { + cb(opaque, -EINVAL, 0, 0); + return; + } + + find_cluster_cb = g_malloc(sizeof(*find_cluster_cb)); + find_cluster_cb->s = s; + find_cluster_cb->pos = pos; + find_cluster_cb->len = len; + find_cluster_cb->cb = cb; + find_cluster_cb->opaque = opaque; + find_cluster_cb->request = request; + + qed_read_l2_table(s, request, l2_offset, + qed_find_cluster_cb, find_cluster_cb); +} diff --git a/block/qed-gencb.c b/block/qed-gencb.c new file mode 100644 index 000000000..7d7ac1ffc --- /dev/null +++ b/block/qed-gencb.c @@ -0,0 +1,32 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qed.h" + +void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque) +{ + GenericCB *gencb = g_malloc(len); + gencb->cb = cb; + gencb->opaque = opaque; + return gencb; +} + +void gencb_complete(void *opaque, int ret) +{ + GenericCB *gencb = opaque; + BlockDriverCompletionFunc *cb = gencb->cb; + void *user_opaque = gencb->opaque; + + g_free(gencb); + cb(user_opaque, ret); +} diff --git a/block/qed-l2-cache.c b/block/qed-l2-cache.c new file mode 100644 index 000000000..e9b2aae44 --- /dev/null +++ b/block/qed-l2-cache.c @@ -0,0 +1,187 @@ +/* + * QEMU Enhanced Disk Format L2 Cache + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +/* + * L2 table cache usage is as follows: + * + * An open image has one L2 table cache that is used to avoid accessing the + * image file for recently referenced L2 tables. + * + * Cluster offset lookup translates the logical offset within the block device + * to a cluster offset within the image file. This is done by indexing into + * the L1 and L2 tables which store cluster offsets. It is here where the L2 + * table cache serves up recently referenced L2 tables. + * + * If there is a cache miss, that L2 table is read from the image file and + * committed to the cache. Subsequent accesses to that L2 table will be served + * from the cache until the table is evicted from the cache. + * + * L2 tables are also committed to the cache when new L2 tables are allocated + * in the image file. Since the L2 table cache is write-through, the new L2 + * table is first written out to the image file and then committed to the + * cache. + * + * Multiple I/O requests may be using an L2 table cache entry at any given + * time. That means an entry may be in use across several requests and + * reference counting is needed to free the entry at the correct time. In + * particular, an entry evicted from the cache will only be freed once all + * references are dropped. + * + * An in-flight I/O request will hold a reference to a L2 table cache entry for + * the period during which it needs to access the L2 table. This includes + * cluster offset lookup, L2 table allocation, and L2 table update when a new + * data cluster has been allocated. + * + * An interesting case occurs when two requests need to access an L2 table that + * is not in the cache. Since the operation to read the table from the image + * file takes some time to complete, both requests may see a cache miss and + * start reading the L2 table from the image file. The first to finish will + * commit its L2 table into the cache. When the second tries to commit its + * table will be deleted in favor of the existing cache entry. + */ + +#include "trace.h" +#include "qed.h" + +/* Each L2 holds 2GB so this let's us fully cache a 100GB disk */ +#define MAX_L2_CACHE_SIZE 50 + +/** + * Initialize the L2 cache + */ +void qed_init_l2_cache(L2TableCache *l2_cache) +{ + QTAILQ_INIT(&l2_cache->entries); + l2_cache->n_entries = 0; +} + +/** + * Free the L2 cache + */ +void qed_free_l2_cache(L2TableCache *l2_cache) +{ + CachedL2Table *entry, *next_entry; + + QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next_entry) { + qemu_vfree(entry->table); + g_free(entry); + } +} + +/** + * Allocate an uninitialized entry from the cache + * + * The returned entry has a reference count of 1 and is owned by the caller. + * The caller must allocate the actual table field for this entry and it must + * be freeable using qemu_vfree(). + */ +CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache) +{ + CachedL2Table *entry; + + entry = g_malloc0(sizeof(*entry)); + entry->ref++; + + trace_qed_alloc_l2_cache_entry(l2_cache, entry); + + return entry; +} + +/** + * Decrease an entry's reference count and free if necessary when the reference + * count drops to zero. + */ +void qed_unref_l2_cache_entry(CachedL2Table *entry) +{ + if (!entry) { + return; + } + + entry->ref--; + trace_qed_unref_l2_cache_entry(entry, entry->ref); + if (entry->ref == 0) { + qemu_vfree(entry->table); + g_free(entry); + } +} + +/** + * Find an entry in the L2 cache. This may return NULL and it's up to the + * caller to satisfy the cache miss. + * + * For a cached entry, this function increases the reference count and returns + * the entry. + */ +CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset) +{ + CachedL2Table *entry; + + QTAILQ_FOREACH(entry, &l2_cache->entries, node) { + if (entry->offset == offset) { + trace_qed_find_l2_cache_entry(l2_cache, entry, offset, entry->ref); + entry->ref++; + return entry; + } + } + return NULL; +} + +/** + * Commit an L2 cache entry into the cache. This is meant to be used as part of + * the process to satisfy a cache miss. A caller would allocate an entry which + * is not actually in the L2 cache and then once the entry was valid and + * present on disk, the entry can be committed into the cache. + * + * Since the cache is write-through, it's important that this function is not + * called until the entry is present on disk and the L1 has been updated to + * point to the entry. + * + * N.B. This function steals a reference to the l2_table from the caller so the + * caller must obtain a new reference by issuing a call to + * qed_find_l2_cache_entry(). + */ +void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table) +{ + CachedL2Table *entry; + + entry = qed_find_l2_cache_entry(l2_cache, l2_table->offset); + if (entry) { + qed_unref_l2_cache_entry(entry); + qed_unref_l2_cache_entry(l2_table); + return; + } + + /* Evict an unused cache entry so we have space. If all entries are in use + * we can grow the cache temporarily and we try to shrink back down later. + */ + if (l2_cache->n_entries >= MAX_L2_CACHE_SIZE) { + CachedL2Table *next; + QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next) { + if (entry->ref > 1) { + continue; + } + + QTAILQ_REMOVE(&l2_cache->entries, entry, node); + l2_cache->n_entries--; + qed_unref_l2_cache_entry(entry); + + /* Stop evicting when we've shrunk back to max size */ + if (l2_cache->n_entries < MAX_L2_CACHE_SIZE) { + break; + } + } + } + + l2_cache->n_entries++; + QTAILQ_INSERT_TAIL(&l2_cache->entries, l2_table, node); +} diff --git a/block/qed-table.c b/block/qed-table.c new file mode 100644 index 000000000..ce07b0554 --- /dev/null +++ b/block/qed-table.c @@ -0,0 +1,297 @@ +/* + * QEMU Enhanced Disk Format Table I/O + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "trace.h" +#include "qemu_socket.h" /* for EINPROGRESS on Windows */ +#include "qed.h" + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + QEDTable *table; + + struct iovec iov; + QEMUIOVector qiov; +} QEDReadTableCB; + +static void qed_read_table_cb(void *opaque, int ret) +{ + QEDReadTableCB *read_table_cb = opaque; + QEDTable *table = read_table_cb->table; + int noffsets = read_table_cb->qiov.size / sizeof(uint64_t); + int i; + + /* Handle I/O error */ + if (ret) { + goto out; + } + + /* Byteswap offsets */ + for (i = 0; i < noffsets; i++) { + table->offsets[i] = le64_to_cpu(table->offsets[i]); + } + +out: + /* Completion */ + trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret); + gencb_complete(&read_table_cb->gencb, ret); +} + +static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, + BlockDriverCompletionFunc *cb, void *opaque) +{ + QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb), + cb, opaque); + QEMUIOVector *qiov = &read_table_cb->qiov; + + trace_qed_read_table(s, offset, table); + + read_table_cb->s = s; + read_table_cb->table = table; + read_table_cb->iov.iov_base = table->offsets, + read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size, + + qemu_iovec_init_external(qiov, &read_table_cb->iov, 1); + bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov, + qiov->size / BDRV_SECTOR_SIZE, + qed_read_table_cb, read_table_cb); +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + QEDTable *orig_table; + QEDTable *table; + bool flush; /* flush after write? */ + + struct iovec iov; + QEMUIOVector qiov; +} QEDWriteTableCB; + +static void qed_write_table_cb(void *opaque, int ret) +{ + QEDWriteTableCB *write_table_cb = opaque; + + trace_qed_write_table_cb(write_table_cb->s, + write_table_cb->orig_table, + write_table_cb->flush, + ret); + + if (ret) { + goto out; + } + + if (write_table_cb->flush) { + /* We still need to flush first */ + write_table_cb->flush = false; + bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb, + write_table_cb); + return; + } + +out: + qemu_vfree(write_table_cb->table); + gencb_complete(&write_table_cb->gencb, ret); + return; +} + +/** + * Write out an updated part or all of a table + * + * @s: QED state + * @offset: Offset of table in image file, in bytes + * @table: Table + * @index: Index of first element + * @n: Number of elements + * @flush: Whether or not to sync to disk + * @cb: Completion function + * @opaque: Argument for completion function + */ +static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, + unsigned int index, unsigned int n, bool flush, + BlockDriverCompletionFunc *cb, void *opaque) +{ + QEDWriteTableCB *write_table_cb; + unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1; + unsigned int start, end, i; + size_t len_bytes; + + trace_qed_write_table(s, offset, table, index, n); + + /* Calculate indices of the first and one after last elements */ + start = index & ~sector_mask; + end = (index + n + sector_mask) & ~sector_mask; + + len_bytes = (end - start) * sizeof(uint64_t); + + write_table_cb = gencb_alloc(sizeof(*write_table_cb), cb, opaque); + write_table_cb->s = s; + write_table_cb->orig_table = table; + write_table_cb->flush = flush; + write_table_cb->table = qemu_blockalign(s->bs, len_bytes); + write_table_cb->iov.iov_base = write_table_cb->table->offsets; + write_table_cb->iov.iov_len = len_bytes; + qemu_iovec_init_external(&write_table_cb->qiov, &write_table_cb->iov, 1); + + /* Byteswap table */ + for (i = start; i < end; i++) { + uint64_t le_offset = cpu_to_le64(table->offsets[i]); + write_table_cb->table->offsets[i - start] = le_offset; + } + + /* Adjust for offset into table */ + offset += start * sizeof(uint64_t); + + bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, + &write_table_cb->qiov, + write_table_cb->qiov.size / BDRV_SECTOR_SIZE, + qed_write_table_cb, write_table_cb); +} + +/** + * Propagate return value from async callback + */ +static void qed_sync_cb(void *opaque, int ret) +{ + *(int *)opaque = ret; +} + +int qed_read_l1_table_sync(BDRVQEDState *s) +{ + int ret = -EINPROGRESS; + + qed_read_table(s, s->header.l1_table_offset, + s->l1_table, qed_sync_cb, &ret); + while (ret == -EINPROGRESS) { + qemu_aio_wait(); + } + + return ret; +} + +void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE); + qed_write_table(s, s->header.l1_table_offset, + s->l1_table, index, n, false, cb, opaque); +} + +int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, + unsigned int n) +{ + int ret = -EINPROGRESS; + + qed_write_l1_table(s, index, n, qed_sync_cb, &ret); + while (ret == -EINPROGRESS) { + qemu_aio_wait(); + } + + return ret; +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + uint64_t l2_offset; + QEDRequest *request; +} QEDReadL2TableCB; + +static void qed_read_l2_table_cb(void *opaque, int ret) +{ + QEDReadL2TableCB *read_l2_table_cb = opaque; + QEDRequest *request = read_l2_table_cb->request; + BDRVQEDState *s = read_l2_table_cb->s; + CachedL2Table *l2_table = request->l2_table; + uint64_t l2_offset = read_l2_table_cb->l2_offset; + + if (ret) { + /* can't trust loaded L2 table anymore */ + qed_unref_l2_cache_entry(l2_table); + request->l2_table = NULL; + } else { + l2_table->offset = l2_offset; + + qed_commit_l2_cache_entry(&s->l2_cache, l2_table); + + /* This is guaranteed to succeed because we just committed the entry + * to the cache. + */ + request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); + assert(request->l2_table != NULL); + } + + gencb_complete(&read_l2_table_cb->gencb, ret); +} + +void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, + BlockDriverCompletionFunc *cb, void *opaque) +{ + QEDReadL2TableCB *read_l2_table_cb; + + qed_unref_l2_cache_entry(request->l2_table); + + /* Check for cached L2 entry */ + request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset); + if (request->l2_table) { + cb(opaque, 0); + return; + } + + request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); + request->l2_table->table = qed_alloc_table(s); + + read_l2_table_cb = gencb_alloc(sizeof(*read_l2_table_cb), cb, opaque); + read_l2_table_cb->s = s; + read_l2_table_cb->l2_offset = offset; + read_l2_table_cb->request = request; + + BLKDBG_EVENT(s->bs->file, BLKDBG_L2_LOAD); + qed_read_table(s, offset, request->l2_table->table, + qed_read_l2_table_cb, read_l2_table_cb); +} + +int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset) +{ + int ret = -EINPROGRESS; + + qed_read_l2_table(s, request, offset, qed_sync_cb, &ret); + while (ret == -EINPROGRESS) { + qemu_aio_wait(); + } + + return ret; +} + +void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE); + qed_write_table(s, request->l2_table->offset, + request->l2_table->table, index, n, flush, cb, opaque); +} + +int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush) +{ + int ret = -EINPROGRESS; + + qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret); + while (ret == -EINPROGRESS) { + qemu_aio_wait(); + } + + return ret; +} diff --git a/block/qed.c b/block/qed.c new file mode 100644 index 000000000..21cb23987 --- /dev/null +++ b/block/qed.c @@ -0,0 +1,1586 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu-timer.h" +#include "trace.h" +#include "qed.h" +#include "qerror.h" +#include "migration.h" + +static void qed_aio_cancel(BlockDriverAIOCB *blockacb) +{ + QEDAIOCB *acb = (QEDAIOCB *)blockacb; + bool finished = false; + + /* Wait for the request to finish */ + acb->finished = &finished; + while (!finished) { + qemu_aio_wait(); + } +} + +static AIOPool qed_aio_pool = { + .aiocb_size = sizeof(QEDAIOCB), + .cancel = qed_aio_cancel, +}; + +static int bdrv_qed_probe(const uint8_t *buf, int buf_size, + const char *filename) +{ + const QEDHeader *header = (const QEDHeader *)buf; + + if (buf_size < sizeof(*header)) { + return 0; + } + if (le32_to_cpu(header->magic) != QED_MAGIC) { + return 0; + } + return 100; +} + +/** + * Check whether an image format is raw + * + * @fmt: Backing file format, may be NULL + */ +static bool qed_fmt_is_raw(const char *fmt) +{ + return fmt && strcmp(fmt, "raw") == 0; +} + +static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu) +{ + cpu->magic = le32_to_cpu(le->magic); + cpu->cluster_size = le32_to_cpu(le->cluster_size); + cpu->table_size = le32_to_cpu(le->table_size); + cpu->header_size = le32_to_cpu(le->header_size); + cpu->features = le64_to_cpu(le->features); + cpu->compat_features = le64_to_cpu(le->compat_features); + cpu->autoclear_features = le64_to_cpu(le->autoclear_features); + cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset); + cpu->image_size = le64_to_cpu(le->image_size); + cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset); + cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size); +} + +static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le) +{ + le->magic = cpu_to_le32(cpu->magic); + le->cluster_size = cpu_to_le32(cpu->cluster_size); + le->table_size = cpu_to_le32(cpu->table_size); + le->header_size = cpu_to_le32(cpu->header_size); + le->features = cpu_to_le64(cpu->features); + le->compat_features = cpu_to_le64(cpu->compat_features); + le->autoclear_features = cpu_to_le64(cpu->autoclear_features); + le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset); + le->image_size = cpu_to_le64(cpu->image_size); + le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset); + le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size); +} + +int qed_write_header_sync(BDRVQEDState *s) +{ + QEDHeader le; + int ret; + + qed_header_cpu_to_le(&s->header, &le); + ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le)); + if (ret != sizeof(le)) { + return ret; + } + return 0; +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + struct iovec iov; + QEMUIOVector qiov; + int nsectors; + uint8_t *buf; +} QEDWriteHeaderCB; + +static void qed_write_header_cb(void *opaque, int ret) +{ + QEDWriteHeaderCB *write_header_cb = opaque; + + qemu_vfree(write_header_cb->buf); + gencb_complete(write_header_cb, ret); +} + +static void qed_write_header_read_cb(void *opaque, int ret) +{ + QEDWriteHeaderCB *write_header_cb = opaque; + BDRVQEDState *s = write_header_cb->s; + + if (ret) { + qed_write_header_cb(write_header_cb, ret); + return; + } + + /* Update header */ + qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf); + + bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov, + write_header_cb->nsectors, qed_write_header_cb, + write_header_cb); +} + +/** + * Update header in-place (does not rewrite backing filename or other strings) + * + * This function only updates known header fields in-place and does not affect + * extra data after the QED header. + */ +static void qed_write_header(BDRVQEDState *s, BlockDriverCompletionFunc cb, + void *opaque) +{ + /* We must write full sectors for O_DIRECT but cannot necessarily generate + * the data following the header if an unrecognized compat feature is + * active. Therefore, first read the sectors containing the header, update + * them, and write back. + */ + + int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) / + BDRV_SECTOR_SIZE; + size_t len = nsectors * BDRV_SECTOR_SIZE; + QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb), + cb, opaque); + + write_header_cb->s = s; + write_header_cb->nsectors = nsectors; + write_header_cb->buf = qemu_blockalign(s->bs, len); + write_header_cb->iov.iov_base = write_header_cb->buf; + write_header_cb->iov.iov_len = len; + qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1); + + bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors, + qed_write_header_read_cb, write_header_cb); +} + +static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size) +{ + uint64_t table_entries; + uint64_t l2_size; + + table_entries = (table_size * cluster_size) / sizeof(uint64_t); + l2_size = table_entries * cluster_size; + + return l2_size * table_entries; +} + +static bool qed_is_cluster_size_valid(uint32_t cluster_size) +{ + if (cluster_size < QED_MIN_CLUSTER_SIZE || + cluster_size > QED_MAX_CLUSTER_SIZE) { + return false; + } + if (cluster_size & (cluster_size - 1)) { + return false; /* not power of 2 */ + } + return true; +} + +static bool qed_is_table_size_valid(uint32_t table_size) +{ + if (table_size < QED_MIN_TABLE_SIZE || + table_size > QED_MAX_TABLE_SIZE) { + return false; + } + if (table_size & (table_size - 1)) { + return false; /* not power of 2 */ + } + return true; +} + +static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size, + uint32_t table_size) +{ + if (image_size % BDRV_SECTOR_SIZE != 0) { + return false; /* not multiple of sector size */ + } + if (image_size > qed_max_image_size(cluster_size, table_size)) { + return false; /* image is too large */ + } + return true; +} + +/** + * Read a string of known length from the image file + * + * @file: Image file + * @offset: File offset to start of string, in bytes + * @n: String length in bytes + * @buf: Destination buffer + * @buflen: Destination buffer length in bytes + * @ret: 0 on success, -errno on failure + * + * The string is NUL-terminated. + */ +static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n, + char *buf, size_t buflen) +{ + int ret; + if (n >= buflen) { + return -EINVAL; + } + ret = bdrv_pread(file, offset, buf, n); + if (ret < 0) { + return ret; + } + buf[n] = '\0'; + return 0; +} + +/** + * Allocate new clusters + * + * @s: QED state + * @n: Number of contiguous clusters to allocate + * @ret: Offset of first allocated cluster + * + * This function only produces the offset where the new clusters should be + * written. It updates BDRVQEDState but does not make any changes to the image + * file. + */ +static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n) +{ + uint64_t offset = s->file_size; + s->file_size += n * s->header.cluster_size; + return offset; +} + +QEDTable *qed_alloc_table(BDRVQEDState *s) +{ + /* Honor O_DIRECT memory alignment requirements */ + return qemu_blockalign(s->bs, + s->header.cluster_size * s->header.table_size); +} + +/** + * Allocate a new zeroed L2 table + */ +static CachedL2Table *qed_new_l2_table(BDRVQEDState *s) +{ + CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); + + l2_table->table = qed_alloc_table(s); + l2_table->offset = qed_alloc_clusters(s, s->header.table_size); + + memset(l2_table->table->offsets, 0, + s->header.cluster_size * s->header.table_size); + return l2_table; +} + +static void qed_aio_next_io(void *opaque, int ret); + +static void qed_plug_allocating_write_reqs(BDRVQEDState *s) +{ + assert(!s->allocating_write_reqs_plugged); + + s->allocating_write_reqs_plugged = true; +} + +static void qed_unplug_allocating_write_reqs(BDRVQEDState *s) +{ + QEDAIOCB *acb; + + assert(s->allocating_write_reqs_plugged); + + s->allocating_write_reqs_plugged = false; + + acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); + if (acb) { + qed_aio_next_io(acb, 0); + } +} + +static void qed_finish_clear_need_check(void *opaque, int ret) +{ + /* Do nothing */ +} + +static void qed_flush_after_clear_need_check(void *opaque, int ret) +{ + BDRVQEDState *s = opaque; + + bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s); + + /* No need to wait until flush completes */ + qed_unplug_allocating_write_reqs(s); +} + +static void qed_clear_need_check(void *opaque, int ret) +{ + BDRVQEDState *s = opaque; + + if (ret) { + qed_unplug_allocating_write_reqs(s); + return; + } + + s->header.features &= ~QED_F_NEED_CHECK; + qed_write_header(s, qed_flush_after_clear_need_check, s); +} + +static void qed_need_check_timer_cb(void *opaque) +{ + BDRVQEDState *s = opaque; + + /* The timer should only fire when allocating writes have drained */ + assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs)); + + trace_qed_need_check_timer_cb(s); + + qed_plug_allocating_write_reqs(s); + + /* Ensure writes are on disk before clearing flag */ + bdrv_aio_flush(s->bs, qed_clear_need_check, s); +} + +static void qed_start_need_check_timer(BDRVQEDState *s) +{ + trace_qed_start_need_check_timer(s); + + /* Use vm_clock so we don't alter the image file while suspended for + * migration. + */ + qemu_mod_timer(s->need_check_timer, qemu_get_clock_ns(vm_clock) + + get_ticks_per_sec() * QED_NEED_CHECK_TIMEOUT); +} + +/* It's okay to call this multiple times or when no timer is started */ +static void qed_cancel_need_check_timer(BDRVQEDState *s) +{ + trace_qed_cancel_need_check_timer(s); + qemu_del_timer(s->need_check_timer); +} + +static void bdrv_qed_rebind(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + s->bs = bs; +} + +static int bdrv_qed_open(BlockDriverState *bs, int flags) +{ + BDRVQEDState *s = bs->opaque; + QEDHeader le_header; + int64_t file_size; + int ret; + + s->bs = bs; + QSIMPLEQ_INIT(&s->allocating_write_reqs); + + ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header)); + if (ret < 0) { + return ret; + } + qed_header_le_to_cpu(&le_header, &s->header); + + if (s->header.magic != QED_MAGIC) { + return -EINVAL; + } + if (s->header.features & ~QED_FEATURE_MASK) { + /* image uses unsupported feature bits */ + char buf[64]; + snprintf(buf, sizeof(buf), "%" PRIx64, + s->header.features & ~QED_FEATURE_MASK); + qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bs->device_name, "QED", buf); + return -ENOTSUP; + } + if (!qed_is_cluster_size_valid(s->header.cluster_size)) { + return -EINVAL; + } + + /* Round down file size to the last cluster */ + file_size = bdrv_getlength(bs->file); + if (file_size < 0) { + return file_size; + } + s->file_size = qed_start_of_cluster(s, file_size); + + if (!qed_is_table_size_valid(s->header.table_size)) { + return -EINVAL; + } + if (!qed_is_image_size_valid(s->header.image_size, + s->header.cluster_size, + s->header.table_size)) { + return -EINVAL; + } + if (!qed_check_table_offset(s, s->header.l1_table_offset)) { + return -EINVAL; + } + + s->table_nelems = (s->header.cluster_size * s->header.table_size) / + sizeof(uint64_t); + s->l2_shift = ffs(s->header.cluster_size) - 1; + s->l2_mask = s->table_nelems - 1; + s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1; + + if ((s->header.features & QED_F_BACKING_FILE)) { + if ((uint64_t)s->header.backing_filename_offset + + s->header.backing_filename_size > + s->header.cluster_size * s->header.header_size) { + return -EINVAL; + } + + ret = qed_read_string(bs->file, s->header.backing_filename_offset, + s->header.backing_filename_size, bs->backing_file, + sizeof(bs->backing_file)); + if (ret < 0) { + return ret; + } + + if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) { + pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw"); + } + } + + /* Reset unknown autoclear feature bits. This is a backwards + * compatibility mechanism that allows images to be opened by older + * programs, which "knock out" unknown feature bits. When an image is + * opened by a newer program again it can detect that the autoclear + * feature is no longer valid. + */ + if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 && + !bdrv_is_read_only(bs->file) && !(flags & BDRV_O_INCOMING)) { + s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK; + + ret = qed_write_header_sync(s); + if (ret) { + return ret; + } + + /* From here on only known autoclear feature bits are valid */ + bdrv_flush(bs->file); + } + + s->l1_table = qed_alloc_table(s); + qed_init_l2_cache(&s->l2_cache); + + ret = qed_read_l1_table_sync(s); + if (ret) { + goto out; + } + + /* If image was not closed cleanly, check consistency */ + if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) { + /* Read-only images cannot be fixed. There is no risk of corruption + * since write operations are not possible. Therefore, allow + * potentially inconsistent images to be opened read-only. This can + * aid data recovery from an otherwise inconsistent image. + */ + if (!bdrv_is_read_only(bs->file) && + !(flags & BDRV_O_INCOMING)) { + BdrvCheckResult result = {0}; + + ret = qed_check(s, &result, true); + if (ret) { + goto out; + } + } + } + + s->need_check_timer = qemu_new_timer_ns(vm_clock, + qed_need_check_timer_cb, s); + +out: + if (ret) { + qed_free_l2_cache(&s->l2_cache); + qemu_vfree(s->l1_table); + } + return ret; +} + +static void bdrv_qed_close(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + + qed_cancel_need_check_timer(s); + qemu_free_timer(s->need_check_timer); + + /* Ensure writes reach stable storage */ + bdrv_flush(bs->file); + + /* Clean shutdown, no check required on next open */ + if (s->header.features & QED_F_NEED_CHECK) { + s->header.features &= ~QED_F_NEED_CHECK; + qed_write_header_sync(s); + } + + qed_free_l2_cache(&s->l2_cache); + qemu_vfree(s->l1_table); +} + +static int qed_create(const char *filename, uint32_t cluster_size, + uint64_t image_size, uint32_t table_size, + const char *backing_file, const char *backing_fmt) +{ + QEDHeader header = { + .magic = QED_MAGIC, + .cluster_size = cluster_size, + .table_size = table_size, + .header_size = 1, + .features = 0, + .compat_features = 0, + .l1_table_offset = cluster_size, + .image_size = image_size, + }; + QEDHeader le_header; + uint8_t *l1_table = NULL; + size_t l1_size = header.cluster_size * header.table_size; + int ret = 0; + BlockDriverState *bs = NULL; + + ret = bdrv_create_file(filename, NULL); + if (ret < 0) { + return ret; + } + + ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR | BDRV_O_CACHE_WB); + if (ret < 0) { + return ret; + } + + /* File must start empty and grow, check truncate is supported */ + ret = bdrv_truncate(bs, 0); + if (ret < 0) { + goto out; + } + + if (backing_file) { + header.features |= QED_F_BACKING_FILE; + header.backing_filename_offset = sizeof(le_header); + header.backing_filename_size = strlen(backing_file); + + if (qed_fmt_is_raw(backing_fmt)) { + header.features |= QED_F_BACKING_FORMAT_NO_PROBE; + } + } + + qed_header_cpu_to_le(&header, &le_header); + ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header)); + if (ret < 0) { + goto out; + } + ret = bdrv_pwrite(bs, sizeof(le_header), backing_file, + header.backing_filename_size); + if (ret < 0) { + goto out; + } + + l1_table = g_malloc0(l1_size); + ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size); + if (ret < 0) { + goto out; + } + + ret = 0; /* success */ +out: + g_free(l1_table); + bdrv_delete(bs); + return ret; +} + +static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options) +{ + uint64_t image_size = 0; + uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE; + uint32_t table_size = QED_DEFAULT_TABLE_SIZE; + const char *backing_file = NULL; + const char *backing_fmt = NULL; + + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + image_size = options->value.n; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { + backing_file = options->value.s; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) { + backing_fmt = options->value.s; + } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { + if (options->value.n) { + cluster_size = options->value.n; + } + } else if (!strcmp(options->name, BLOCK_OPT_TABLE_SIZE)) { + if (options->value.n) { + table_size = options->value.n; + } + } + options++; + } + + if (!qed_is_cluster_size_valid(cluster_size)) { + fprintf(stderr, "QED cluster size must be within range [%u, %u] and power of 2\n", + QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE); + return -EINVAL; + } + if (!qed_is_table_size_valid(table_size)) { + fprintf(stderr, "QED table size must be within range [%u, %u] and power of 2\n", + QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE); + return -EINVAL; + } + if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) { + fprintf(stderr, "QED image size must be a non-zero multiple of " + "cluster size and less than %" PRIu64 " bytes\n", + qed_max_image_size(cluster_size, table_size)); + return -EINVAL; + } + + return qed_create(filename, cluster_size, image_size, table_size, + backing_file, backing_fmt); +} + +typedef struct { + Coroutine *co; + int is_allocated; + int *pnum; +} QEDIsAllocatedCB; + +static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len) +{ + QEDIsAllocatedCB *cb = opaque; + *cb->pnum = len / BDRV_SECTOR_SIZE; + cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO); + if (cb->co) { + qemu_coroutine_enter(cb->co, NULL); + } +} + +static int coroutine_fn bdrv_qed_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + BDRVQEDState *s = bs->opaque; + uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; + size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE; + QEDIsAllocatedCB cb = { + .is_allocated = -1, + .pnum = pnum, + }; + QEDRequest request = { .l2_table = NULL }; + + qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb); + + /* Now sleep if the callback wasn't invoked immediately */ + while (cb.is_allocated == -1) { + cb.co = qemu_coroutine_self(); + qemu_coroutine_yield(); + } + + qed_unref_l2_cache_entry(request.l2_table); + + return cb.is_allocated; +} + +static int bdrv_qed_make_empty(BlockDriverState *bs) +{ + return -ENOTSUP; +} + +static BDRVQEDState *acb_to_s(QEDAIOCB *acb) +{ + return acb->common.bs->opaque; +} + +/** + * Read from the backing file or zero-fill if no backing file + * + * @s: QED state + * @pos: Byte position in device + * @qiov: Destination I/O vector + * @cb: Completion function + * @opaque: User data for completion function + * + * This function reads qiov->size bytes starting at pos from the backing file. + * If there is no backing file then zeroes are read. + */ +static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, + QEMUIOVector *qiov, + BlockDriverCompletionFunc *cb, void *opaque) +{ + uint64_t backing_length = 0; + size_t size; + + /* If there is a backing file, get its length. Treat the absence of a + * backing file like a zero length backing file. + */ + if (s->bs->backing_hd) { + int64_t l = bdrv_getlength(s->bs->backing_hd); + if (l < 0) { + cb(opaque, l); + return; + } + backing_length = l; + } + + /* Zero all sectors if reading beyond the end of the backing file */ + if (pos >= backing_length || + pos + qiov->size > backing_length) { + qemu_iovec_memset(qiov, 0, 0, qiov->size); + } + + /* Complete now if there are no backing file sectors to read */ + if (pos >= backing_length) { + cb(opaque, 0); + return; + } + + /* If the read straddles the end of the backing file, shorten it */ + size = MIN((uint64_t)backing_length - pos, qiov->size); + + BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO); + bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE, + qiov, size / BDRV_SECTOR_SIZE, cb, opaque); +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + QEMUIOVector qiov; + struct iovec iov; + uint64_t offset; +} CopyFromBackingFileCB; + +static void qed_copy_from_backing_file_cb(void *opaque, int ret) +{ + CopyFromBackingFileCB *copy_cb = opaque; + qemu_vfree(copy_cb->iov.iov_base); + gencb_complete(©_cb->gencb, ret); +} + +static void qed_copy_from_backing_file_write(void *opaque, int ret) +{ + CopyFromBackingFileCB *copy_cb = opaque; + BDRVQEDState *s = copy_cb->s; + + if (ret) { + qed_copy_from_backing_file_cb(copy_cb, ret); + return; + } + + BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE); + bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE, + ©_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE, + qed_copy_from_backing_file_cb, copy_cb); +} + +/** + * Copy data from backing file into the image + * + * @s: QED state + * @pos: Byte position in device + * @len: Number of bytes + * @offset: Byte offset in image file + * @cb: Completion function + * @opaque: User data for completion function + */ +static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, + uint64_t len, uint64_t offset, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + CopyFromBackingFileCB *copy_cb; + + /* Skip copy entirely if there is no work to do */ + if (len == 0) { + cb(opaque, 0); + return; + } + + copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque); + copy_cb->s = s; + copy_cb->offset = offset; + copy_cb->iov.iov_base = qemu_blockalign(s->bs, len); + copy_cb->iov.iov_len = len; + qemu_iovec_init_external(©_cb->qiov, ©_cb->iov, 1); + + qed_read_backing_file(s, pos, ©_cb->qiov, + qed_copy_from_backing_file_write, copy_cb); +} + +/** + * Link one or more contiguous clusters into a table + * + * @s: QED state + * @table: L2 table + * @index: First cluster index + * @n: Number of contiguous clusters + * @cluster: First cluster offset + * + * The cluster offset may be an allocated byte offset in the image file, the + * zero cluster marker, or the unallocated cluster marker. + */ +static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index, + unsigned int n, uint64_t cluster) +{ + int i; + for (i = index; i < index + n; i++) { + table->offsets[i] = cluster; + if (!qed_offset_is_unalloc_cluster(cluster) && + !qed_offset_is_zero_cluster(cluster)) { + cluster += s->header.cluster_size; + } + } +} + +static void qed_aio_complete_bh(void *opaque) +{ + QEDAIOCB *acb = opaque; + BlockDriverCompletionFunc *cb = acb->common.cb; + void *user_opaque = acb->common.opaque; + int ret = acb->bh_ret; + bool *finished = acb->finished; + + qemu_bh_delete(acb->bh); + qemu_aio_release(acb); + + /* Invoke callback */ + cb(user_opaque, ret); + + /* Signal cancel completion */ + if (finished) { + *finished = true; + } +} + +static void qed_aio_complete(QEDAIOCB *acb, int ret) +{ + BDRVQEDState *s = acb_to_s(acb); + + trace_qed_aio_complete(s, acb, ret); + + /* Free resources */ + qemu_iovec_destroy(&acb->cur_qiov); + qed_unref_l2_cache_entry(acb->request.l2_table); + + /* Free the buffer we may have allocated for zero writes */ + if (acb->flags & QED_AIOCB_ZERO) { + qemu_vfree(acb->qiov->iov[0].iov_base); + acb->qiov->iov[0].iov_base = NULL; + } + + /* Arrange for a bh to invoke the completion function */ + acb->bh_ret = ret; + acb->bh = qemu_bh_new(qed_aio_complete_bh, acb); + qemu_bh_schedule(acb->bh); + + /* Start next allocating write request waiting behind this one. Note that + * requests enqueue themselves when they first hit an unallocated cluster + * but they wait until the entire request is finished before waking up the + * next request in the queue. This ensures that we don't cycle through + * requests multiple times but rather finish one at a time completely. + */ + if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { + QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next); + acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); + if (acb) { + qed_aio_next_io(acb, 0); + } else if (s->header.features & QED_F_NEED_CHECK) { + qed_start_need_check_timer(s); + } + } +} + +/** + * Commit the current L2 table to the cache + */ +static void qed_commit_l2_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + CachedL2Table *l2_table = acb->request.l2_table; + uint64_t l2_offset = l2_table->offset; + + qed_commit_l2_cache_entry(&s->l2_cache, l2_table); + + /* This is guaranteed to succeed because we just committed the entry to the + * cache. + */ + acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); + assert(acb->request.l2_table != NULL); + + qed_aio_next_io(opaque, ret); +} + +/** + * Update L1 table with new L2 table offset and write it out + */ +static void qed_aio_write_l1_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + int index; + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + index = qed_l1_index(s, acb->cur_pos); + s->l1_table->offsets[index] = acb->request.l2_table->offset; + + qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb); +} + +/** + * Update L2 table with new cluster offsets and write them out + */ +static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset) +{ + BDRVQEDState *s = acb_to_s(acb); + bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1; + int index; + + if (ret) { + goto err; + } + + if (need_alloc) { + qed_unref_l2_cache_entry(acb->request.l2_table); + acb->request.l2_table = qed_new_l2_table(s); + } + + index = qed_l2_index(s, acb->cur_pos); + qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters, + offset); + + if (need_alloc) { + /* Write out the whole new L2 table */ + qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true, + qed_aio_write_l1_update, acb); + } else { + /* Write out only the updated part of the L2 table */ + qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false, + qed_aio_next_io, acb); + } + return; + +err: + qed_aio_complete(acb, ret); +} + +static void qed_aio_write_l2_update_cb(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + qed_aio_write_l2_update(acb, ret, acb->cur_cluster); +} + +/** + * Flush new data clusters before updating the L2 table + * + * This flush is necessary when a backing file is in use. A crash during an + * allocating write could result in empty clusters in the image. If the write + * only touched a subregion of the cluster, then backing image sectors have + * been lost in the untouched region. The solution is to flush after writing a + * new data cluster and before updating the L2 table. + */ +static void qed_aio_write_flush_before_l2_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + + if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update_cb, opaque)) { + qed_aio_complete(acb, -EIO); + } +} + +/** + * Write data to the image file + */ +static void qed_aio_write_main(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + uint64_t offset = acb->cur_cluster + + qed_offset_into_cluster(s, acb->cur_pos); + BlockDriverCompletionFunc *next_fn; + + trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size); + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + if (acb->find_cluster_ret == QED_CLUSTER_FOUND) { + next_fn = qed_aio_next_io; + } else { + if (s->bs->backing_hd) { + next_fn = qed_aio_write_flush_before_l2_update; + } else { + next_fn = qed_aio_write_l2_update_cb; + } + } + + BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO); + bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, + &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, + next_fn, acb); +} + +/** + * Populate back untouched region of new data cluster + */ +static void qed_aio_write_postfill(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + uint64_t start = acb->cur_pos + acb->cur_qiov.size; + uint64_t len = + qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start; + uint64_t offset = acb->cur_cluster + + qed_offset_into_cluster(s, acb->cur_pos) + + acb->cur_qiov.size; + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + trace_qed_aio_write_postfill(s, acb, start, len, offset); + qed_copy_from_backing_file(s, start, len, offset, + qed_aio_write_main, acb); +} + +/** + * Populate front untouched region of new data cluster + */ +static void qed_aio_write_prefill(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + uint64_t start = qed_start_of_cluster(s, acb->cur_pos); + uint64_t len = qed_offset_into_cluster(s, acb->cur_pos); + + trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster); + qed_copy_from_backing_file(s, start, len, acb->cur_cluster, + qed_aio_write_postfill, acb); +} + +/** + * Check if the QED_F_NEED_CHECK bit should be set during allocating write + */ +static bool qed_should_set_need_check(BDRVQEDState *s) +{ + /* The flush before L2 update path ensures consistency */ + if (s->bs->backing_hd) { + return false; + } + + return !(s->header.features & QED_F_NEED_CHECK); +} + +static void qed_aio_write_zero_cluster(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + qed_aio_write_l2_update(acb, 0, 1); +} + +/** + * Write new data cluster + * + * @acb: Write request + * @len: Length in bytes + * + * This path is taken when writing to previously unallocated clusters. + */ +static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) +{ + BDRVQEDState *s = acb_to_s(acb); + BlockDriverCompletionFunc *cb; + + /* Cancel timer when the first allocating request comes in */ + if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) { + qed_cancel_need_check_timer(s); + } + + /* Freeze this request if another allocating write is in progress */ + if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { + QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next); + } + if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) || + s->allocating_write_reqs_plugged) { + return; /* wait for existing request to finish */ + } + + acb->cur_nclusters = qed_bytes_to_clusters(s, + qed_offset_into_cluster(s, acb->cur_pos) + len); + qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + + if (acb->flags & QED_AIOCB_ZERO) { + /* Skip ahead if the clusters are already zero */ + if (acb->find_cluster_ret == QED_CLUSTER_ZERO) { + qed_aio_next_io(acb, 0); + return; + } + + cb = qed_aio_write_zero_cluster; + } else { + cb = qed_aio_write_prefill; + acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters); + } + + if (qed_should_set_need_check(s)) { + s->header.features |= QED_F_NEED_CHECK; + qed_write_header(s, cb, acb); + } else { + cb(acb, 0); + } +} + +/** + * Write data cluster in place + * + * @acb: Write request + * @offset: Cluster offset in bytes + * @len: Length in bytes + * + * This path is taken when writing to already allocated clusters. + */ +static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) +{ + /* Allocate buffer for zero writes */ + if (acb->flags & QED_AIOCB_ZERO) { + struct iovec *iov = acb->qiov->iov; + + if (!iov->iov_base) { + iov->iov_base = qemu_blockalign(acb->common.bs, iov->iov_len); + memset(iov->iov_base, 0, iov->iov_len); + } + } + + /* Calculate the I/O vector */ + acb->cur_cluster = offset; + qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + + /* Do the actual write */ + qed_aio_write_main(acb, 0); +} + +/** + * Write data cluster + * + * @opaque: Write request + * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, + * or -errno + * @offset: Cluster offset in bytes + * @len: Length in bytes + * + * Callback from qed_find_cluster(). + */ +static void qed_aio_write_data(void *opaque, int ret, + uint64_t offset, size_t len) +{ + QEDAIOCB *acb = opaque; + + trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len); + + acb->find_cluster_ret = ret; + + switch (ret) { + case QED_CLUSTER_FOUND: + qed_aio_write_inplace(acb, offset, len); + break; + + case QED_CLUSTER_L2: + case QED_CLUSTER_L1: + case QED_CLUSTER_ZERO: + qed_aio_write_alloc(acb, len); + break; + + default: + qed_aio_complete(acb, ret); + break; + } +} + +/** + * Read data cluster + * + * @opaque: Read request + * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, + * or -errno + * @offset: Cluster offset in bytes + * @len: Length in bytes + * + * Callback from qed_find_cluster(). + */ +static void qed_aio_read_data(void *opaque, int ret, + uint64_t offset, size_t len) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + BlockDriverState *bs = acb->common.bs; + + /* Adjust offset into cluster */ + offset += qed_offset_into_cluster(s, acb->cur_pos); + + trace_qed_aio_read_data(s, acb, ret, offset, len); + + if (ret < 0) { + goto err; + } + + qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + + /* Handle zero cluster and backing file reads */ + if (ret == QED_CLUSTER_ZERO) { + qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size); + qed_aio_next_io(acb, 0); + return; + } else if (ret != QED_CLUSTER_FOUND) { + qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov, + qed_aio_next_io, acb); + return; + } + + BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); + bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE, + &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, + qed_aio_next_io, acb); + return; + +err: + qed_aio_complete(acb, ret); +} + +/** + * Begin next I/O or complete the request + */ +static void qed_aio_next_io(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ? + qed_aio_write_data : qed_aio_read_data; + + trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size); + + /* Handle I/O error */ + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + acb->qiov_offset += acb->cur_qiov.size; + acb->cur_pos += acb->cur_qiov.size; + qemu_iovec_reset(&acb->cur_qiov); + + /* Complete request */ + if (acb->cur_pos >= acb->end_pos) { + qed_aio_complete(acb, 0); + return; + } + + /* Find next cluster and start I/O */ + qed_find_cluster(s, &acb->request, + acb->cur_pos, acb->end_pos - acb->cur_pos, + io_fn, acb); +} + +static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque, int flags) +{ + QEDAIOCB *acb = qemu_aio_get(&qed_aio_pool, bs, cb, opaque); + + trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors, + opaque, flags); + + acb->flags = flags; + acb->finished = NULL; + acb->qiov = qiov; + acb->qiov_offset = 0; + acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; + acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE; + acb->request.l2_table = NULL; + qemu_iovec_init(&acb->cur_qiov, qiov->niov); + + /* Start request */ + qed_aio_next_io(acb, 0); + return &acb->common; +} + +static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); +} + +static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, + opaque, QED_AIOCB_WRITE); +} + +typedef struct { + Coroutine *co; + int ret; + bool done; +} QEDWriteZeroesCB; + +static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret) +{ + QEDWriteZeroesCB *cb = opaque; + + cb->done = true; + cb->ret = ret; + if (cb->co) { + qemu_coroutine_enter(cb->co, NULL); + } +} + +static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors) +{ + BlockDriverAIOCB *blockacb; + BDRVQEDState *s = bs->opaque; + QEDWriteZeroesCB cb = { .done = false }; + QEMUIOVector qiov; + struct iovec iov; + + /* Refuse if there are untouched backing file sectors */ + if (bs->backing_hd) { + if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) { + return -ENOTSUP; + } + if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) { + return -ENOTSUP; + } + } + + /* Zero writes start without an I/O buffer. If a buffer becomes necessary + * then it will be allocated during request processing. + */ + iov.iov_base = NULL, + iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE, + + qemu_iovec_init_external(&qiov, &iov, 1); + blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors, + qed_co_write_zeroes_cb, &cb, + QED_AIOCB_WRITE | QED_AIOCB_ZERO); + if (!blockacb) { + return -EIO; + } + if (!cb.done) { + cb.co = qemu_coroutine_self(); + qemu_coroutine_yield(); + } + assert(cb.done); + return cb.ret; +} + +static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset) +{ + BDRVQEDState *s = bs->opaque; + uint64_t old_image_size; + int ret; + + if (!qed_is_image_size_valid(offset, s->header.cluster_size, + s->header.table_size)) { + return -EINVAL; + } + + /* Shrinking is currently not supported */ + if ((uint64_t)offset < s->header.image_size) { + return -ENOTSUP; + } + + old_image_size = s->header.image_size; + s->header.image_size = offset; + ret = qed_write_header_sync(s); + if (ret < 0) { + s->header.image_size = old_image_size; + } + return ret; +} + +static int64_t bdrv_qed_getlength(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + return s->header.image_size; +} + +static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVQEDState *s = bs->opaque; + + memset(bdi, 0, sizeof(*bdi)); + bdi->cluster_size = s->header.cluster_size; + bdi->is_dirty = s->header.features & QED_F_NEED_CHECK; + return 0; +} + +static int bdrv_qed_change_backing_file(BlockDriverState *bs, + const char *backing_file, + const char *backing_fmt) +{ + BDRVQEDState *s = bs->opaque; + QEDHeader new_header, le_header; + void *buffer; + size_t buffer_len, backing_file_len; + int ret; + + /* Refuse to set backing filename if unknown compat feature bits are + * active. If the image uses an unknown compat feature then we may not + * know the layout of data following the header structure and cannot safely + * add a new string. + */ + if (backing_file && (s->header.compat_features & + ~QED_COMPAT_FEATURE_MASK)) { + return -ENOTSUP; + } + + memcpy(&new_header, &s->header, sizeof(new_header)); + + new_header.features &= ~(QED_F_BACKING_FILE | + QED_F_BACKING_FORMAT_NO_PROBE); + + /* Adjust feature flags */ + if (backing_file) { + new_header.features |= QED_F_BACKING_FILE; + + if (qed_fmt_is_raw(backing_fmt)) { + new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE; + } + } + + /* Calculate new header size */ + backing_file_len = 0; + + if (backing_file) { + backing_file_len = strlen(backing_file); + } + + buffer_len = sizeof(new_header); + new_header.backing_filename_offset = buffer_len; + new_header.backing_filename_size = backing_file_len; + buffer_len += backing_file_len; + + /* Make sure we can rewrite header without failing */ + if (buffer_len > new_header.header_size * new_header.cluster_size) { + return -ENOSPC; + } + + /* Prepare new header */ + buffer = g_malloc(buffer_len); + + qed_header_cpu_to_le(&new_header, &le_header); + memcpy(buffer, &le_header, sizeof(le_header)); + buffer_len = sizeof(le_header); + + if (backing_file) { + memcpy(buffer + buffer_len, backing_file, backing_file_len); + buffer_len += backing_file_len; + } + + /* Write new header */ + ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len); + g_free(buffer); + if (ret == 0) { + memcpy(&s->header, &new_header, sizeof(new_header)); + } + return ret; +} + +static void bdrv_qed_invalidate_cache(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + + bdrv_qed_close(bs); + memset(s, 0, sizeof(BDRVQEDState)); + bdrv_qed_open(bs, bs->open_flags); +} + +static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) +{ + BDRVQEDState *s = bs->opaque; + + return qed_check(s, result, !!fix); +} + +static QEMUOptionParameter qed_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size (in bytes)" + }, { + .name = BLOCK_OPT_BACKING_FILE, + .type = OPT_STRING, + .help = "File name of a base image" + }, { + .name = BLOCK_OPT_BACKING_FMT, + .type = OPT_STRING, + .help = "Image format of the base image" + }, { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = OPT_SIZE, + .help = "Cluster size (in bytes)", + .value = { .n = QED_DEFAULT_CLUSTER_SIZE }, + }, { + .name = BLOCK_OPT_TABLE_SIZE, + .type = OPT_SIZE, + .help = "L1/L2 table size (in clusters)" + }, + { /* end of list */ } +}; + +static BlockDriver bdrv_qed = { + .format_name = "qed", + .instance_size = sizeof(BDRVQEDState), + .create_options = qed_create_options, + + .bdrv_probe = bdrv_qed_probe, + .bdrv_rebind = bdrv_qed_rebind, + .bdrv_open = bdrv_qed_open, + .bdrv_close = bdrv_qed_close, + .bdrv_create = bdrv_qed_create, + .bdrv_co_is_allocated = bdrv_qed_co_is_allocated, + .bdrv_make_empty = bdrv_qed_make_empty, + .bdrv_aio_readv = bdrv_qed_aio_readv, + .bdrv_aio_writev = bdrv_qed_aio_writev, + .bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes, + .bdrv_truncate = bdrv_qed_truncate, + .bdrv_getlength = bdrv_qed_getlength, + .bdrv_get_info = bdrv_qed_get_info, + .bdrv_change_backing_file = bdrv_qed_change_backing_file, + .bdrv_invalidate_cache = bdrv_qed_invalidate_cache, + .bdrv_check = bdrv_qed_check, +}; + +static void bdrv_qed_init(void) +{ + bdrv_register(&bdrv_qed); +} + +block_init(bdrv_qed_init); diff --git a/block/qed.h b/block/qed.h new file mode 100644 index 000000000..a063bf70a --- /dev/null +++ b/block/qed.h @@ -0,0 +1,344 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#ifndef BLOCK_QED_H +#define BLOCK_QED_H + +#include "block_int.h" + +/* The layout of a QED file is as follows: + * + * +--------+----------+----------+----------+-----+ + * | header | L1 table | cluster0 | cluster1 | ... | + * +--------+----------+----------+----------+-----+ + * + * There is a 2-level pagetable for cluster allocation: + * + * +----------+ + * | L1 table | + * +----------+ + * ,------' | '------. + * +----------+ | +----------+ + * | L2 table | ... | L2 table | + * +----------+ +----------+ + * ,------' | '------. + * +----------+ | +----------+ + * | Data | ... | Data | + * +----------+ +----------+ + * + * The L1 table is fixed size and always present. L2 tables are allocated on + * demand. The L1 table size determines the maximum possible image size; it + * can be influenced using the cluster_size and table_size values. + * + * All fields are little-endian on disk. + */ + +enum { + QED_MAGIC = 'Q' | 'E' << 8 | 'D' << 16 | '\0' << 24, + + /* The image supports a backing file */ + QED_F_BACKING_FILE = 0x01, + + /* The image needs a consistency check before use */ + QED_F_NEED_CHECK = 0x02, + + /* The backing file format must not be probed, treat as raw image */ + QED_F_BACKING_FORMAT_NO_PROBE = 0x04, + + /* Feature bits must be used when the on-disk format changes */ + QED_FEATURE_MASK = QED_F_BACKING_FILE | /* supported feature bits */ + QED_F_NEED_CHECK | + QED_F_BACKING_FORMAT_NO_PROBE, + QED_COMPAT_FEATURE_MASK = 0, /* supported compat feature bits */ + QED_AUTOCLEAR_FEATURE_MASK = 0, /* supported autoclear feature bits */ + + /* Data is stored in groups of sectors called clusters. Cluster size must + * be large to avoid keeping too much metadata. I/O requests that have + * sub-cluster size will require read-modify-write. + */ + QED_MIN_CLUSTER_SIZE = 4 * 1024, /* in bytes */ + QED_MAX_CLUSTER_SIZE = 64 * 1024 * 1024, + QED_DEFAULT_CLUSTER_SIZE = 64 * 1024, + + /* Allocated clusters are tracked using a 2-level pagetable. Table size is + * a multiple of clusters so large maximum image sizes can be supported + * without jacking up the cluster size too much. + */ + QED_MIN_TABLE_SIZE = 1, /* in clusters */ + QED_MAX_TABLE_SIZE = 16, + QED_DEFAULT_TABLE_SIZE = 4, + + /* Delay to flush and clean image after last allocating write completes */ + QED_NEED_CHECK_TIMEOUT = 5, /* in seconds */ +}; + +typedef struct { + uint32_t magic; /* QED\0 */ + + uint32_t cluster_size; /* in bytes */ + uint32_t table_size; /* for L1 and L2 tables, in clusters */ + uint32_t header_size; /* in clusters */ + + uint64_t features; /* format feature bits */ + uint64_t compat_features; /* compatible feature bits */ + uint64_t autoclear_features; /* self-resetting feature bits */ + + uint64_t l1_table_offset; /* in bytes */ + uint64_t image_size; /* total logical image size, in bytes */ + + /* if (features & QED_F_BACKING_FILE) */ + uint32_t backing_filename_offset; /* in bytes from start of header */ + uint32_t backing_filename_size; /* in bytes */ +} QEDHeader; + +typedef struct { + uint64_t offsets[0]; /* in bytes */ +} QEDTable; + +/* The L2 cache is a simple write-through cache for L2 structures */ +typedef struct CachedL2Table { + QEDTable *table; + uint64_t offset; /* offset=0 indicates an invalidate entry */ + QTAILQ_ENTRY(CachedL2Table) node; + int ref; +} CachedL2Table; + +typedef struct { + QTAILQ_HEAD(, CachedL2Table) entries; + unsigned int n_entries; +} L2TableCache; + +typedef struct QEDRequest { + CachedL2Table *l2_table; +} QEDRequest; + +enum { + QED_AIOCB_WRITE = 0x0001, /* read or write? */ + QED_AIOCB_ZERO = 0x0002, /* zero write, used with QED_AIOCB_WRITE */ +}; + +typedef struct QEDAIOCB { + BlockDriverAIOCB common; + QEMUBH *bh; + int bh_ret; /* final return status for completion bh */ + QSIMPLEQ_ENTRY(QEDAIOCB) next; /* next request */ + int flags; /* QED_AIOCB_* bits ORed together */ + bool *finished; /* signal for cancel completion */ + uint64_t end_pos; /* request end on block device, in bytes */ + + /* User scatter-gather list */ + QEMUIOVector *qiov; + size_t qiov_offset; /* byte count already processed */ + + /* Current cluster scatter-gather list */ + QEMUIOVector cur_qiov; + uint64_t cur_pos; /* position on block device, in bytes */ + uint64_t cur_cluster; /* cluster offset in image file */ + unsigned int cur_nclusters; /* number of clusters being accessed */ + int find_cluster_ret; /* used for L1/L2 update */ + + QEDRequest request; +} QEDAIOCB; + +typedef struct { + BlockDriverState *bs; /* device */ + uint64_t file_size; /* length of image file, in bytes */ + + QEDHeader header; /* always cpu-endian */ + QEDTable *l1_table; + L2TableCache l2_cache; /* l2 table cache */ + uint32_t table_nelems; + uint32_t l1_shift; + uint32_t l2_shift; + uint32_t l2_mask; + + /* Allocating write request queue */ + QSIMPLEQ_HEAD(, QEDAIOCB) allocating_write_reqs; + bool allocating_write_reqs_plugged; + + /* Periodic flush and clear need check flag */ + QEMUTimer *need_check_timer; +} BDRVQEDState; + +enum { + QED_CLUSTER_FOUND, /* cluster found */ + QED_CLUSTER_ZERO, /* zero cluster found */ + QED_CLUSTER_L2, /* cluster missing in L2 */ + QED_CLUSTER_L1, /* cluster missing in L1 */ +}; + +/** + * qed_find_cluster() completion callback + * + * @opaque: User data for completion callback + * @ret: QED_CLUSTER_FOUND Success + * QED_CLUSTER_L2 Data cluster unallocated in L2 + * QED_CLUSTER_L1 L2 unallocated in L1 + * -errno POSIX error occurred + * @offset: Data cluster offset + * @len: Contiguous bytes starting from cluster offset + * + * This function is invoked when qed_find_cluster() completes. + * + * On success ret is QED_CLUSTER_FOUND and offset/len are a contiguous range + * in the image file. + * + * On failure ret is QED_CLUSTER_L2 or QED_CLUSTER_L1 for missing L2 or L1 + * table offset, respectively. len is number of contiguous unallocated bytes. + */ +typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len); + +/** + * Generic callback for chaining async callbacks + */ +typedef struct { + BlockDriverCompletionFunc *cb; + void *opaque; +} GenericCB; + +void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque); +void gencb_complete(void *opaque, int ret); + +/** + * Header functions + */ +int qed_write_header_sync(BDRVQEDState *s); + +/** + * L2 cache functions + */ +void qed_init_l2_cache(L2TableCache *l2_cache); +void qed_free_l2_cache(L2TableCache *l2_cache); +CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache); +void qed_unref_l2_cache_entry(CachedL2Table *entry); +CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset); +void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table); + +/** + * Table I/O functions + */ +int qed_read_l1_table_sync(BDRVQEDState *s); +void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, + BlockDriverCompletionFunc *cb, void *opaque); +int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, + unsigned int n); +int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, + uint64_t offset); +void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, + BlockDriverCompletionFunc *cb, void *opaque); +void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush, + BlockDriverCompletionFunc *cb, void *opaque); +int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush); + +/** + * Cluster functions + */ +void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, + size_t len, QEDFindClusterFunc *cb, void *opaque); + +/** + * Consistency check + */ +int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix); + +QEDTable *qed_alloc_table(BDRVQEDState *s); + +/** + * Round down to the start of a cluster + */ +static inline uint64_t qed_start_of_cluster(BDRVQEDState *s, uint64_t offset) +{ + return offset & ~(uint64_t)(s->header.cluster_size - 1); +} + +static inline uint64_t qed_offset_into_cluster(BDRVQEDState *s, uint64_t offset) +{ + return offset & (s->header.cluster_size - 1); +} + +static inline uint64_t qed_bytes_to_clusters(BDRVQEDState *s, uint64_t bytes) +{ + return qed_start_of_cluster(s, bytes + (s->header.cluster_size - 1)) / + (s->header.cluster_size - 1); +} + +static inline unsigned int qed_l1_index(BDRVQEDState *s, uint64_t pos) +{ + return pos >> s->l1_shift; +} + +static inline unsigned int qed_l2_index(BDRVQEDState *s, uint64_t pos) +{ + return (pos >> s->l2_shift) & s->l2_mask; +} + +/** + * Test if a cluster offset is valid + */ +static inline bool qed_check_cluster_offset(BDRVQEDState *s, uint64_t offset) +{ + uint64_t header_size = (uint64_t)s->header.header_size * + s->header.cluster_size; + + if (offset & (s->header.cluster_size - 1)) { + return false; + } + return offset >= header_size && offset < s->file_size; +} + +/** + * Test if a table offset is valid + */ +static inline bool qed_check_table_offset(BDRVQEDState *s, uint64_t offset) +{ + uint64_t end_offset = offset + (s->header.table_size - 1) * + s->header.cluster_size; + + /* Overflow check */ + if (end_offset <= offset) { + return false; + } + + return qed_check_cluster_offset(s, offset) && + qed_check_cluster_offset(s, end_offset); +} + +static inline bool qed_offset_is_cluster_aligned(BDRVQEDState *s, + uint64_t offset) +{ + if (qed_offset_into_cluster(s, offset)) { + return false; + } + return true; +} + +static inline bool qed_offset_is_unalloc_cluster(uint64_t offset) +{ + if (offset == 0) { + return true; + } + return false; +} + +static inline bool qed_offset_is_zero_cluster(uint64_t offset) +{ + if (offset == 1) { + return true; + } + return false; +} + +#endif /* BLOCK_QED_H */ diff --git a/block/raw-posix-aio.h b/block/raw-posix-aio.h new file mode 100644 index 000000000..ba118f616 --- /dev/null +++ b/block/raw-posix-aio.h @@ -0,0 +1,45 @@ +/* + * QEMU Posix block I/O backend AIO support + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ +#ifndef QEMU_RAW_POSIX_AIO_H +#define QEMU_RAW_POSIX_AIO_H + +/* AIO request types */ +#define QEMU_AIO_READ 0x0001 +#define QEMU_AIO_WRITE 0x0002 +#define QEMU_AIO_IOCTL 0x0004 +#define QEMU_AIO_FLUSH 0x0008 +#define QEMU_AIO_TYPE_MASK \ + (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH) + +/* AIO flags */ +#define QEMU_AIO_MISALIGNED 0x1000 + + +/* posix-aio-compat.c - thread pool based implementation */ +int paio_init(void); +BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque, int type); +BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd, + unsigned long int req, void *buf, + BlockDriverCompletionFunc *cb, void *opaque); + +/* linux-aio.c - Linux native implementation */ +void *laio_init(void); +BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque, int type); + +#endif /* QEMU_RAW_POSIX_AIO_H */ diff --git a/block/raw-posix.c b/block/raw-posix.c new file mode 100644 index 000000000..6be20b192 --- /dev/null +++ b/block/raw-posix.c @@ -0,0 +1,1383 @@ +/* + * Block driver for RAW files (posix) + * + * Copyright (c) 2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "qemu-timer.h" +#include "qemu-char.h" +#include "qemu-log.h" +#include "block_int.h" +#include "module.h" +#include "block/raw-posix-aio.h" + +#if defined(__APPLE__) && (__MACH__) +#include <paths.h> +#include <sys/param.h> +#include <IOKit/IOKitLib.h> +#include <IOKit/IOBSD.h> +#include <IOKit/storage/IOMediaBSDClient.h> +#include <IOKit/storage/IOMedia.h> +#include <IOKit/storage/IOCDMedia.h> +//#include <IOKit/storage/IOCDTypes.h> +#include <CoreFoundation/CoreFoundation.h> +#endif + +#ifdef __sun__ +#define _POSIX_PTHREAD_SEMANTICS 1 +#include <sys/dkio.h> +#endif +#ifdef __linux__ +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <sys/param.h> +#include <linux/cdrom.h> +#include <linux/fd.h> +#include <linux/fs.h> +#endif +#ifdef CONFIG_FIEMAP +#include <linux/fiemap.h> +#endif +#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) +#include <sys/disk.h> +#include <sys/cdio.h> +#endif + +#ifdef __OpenBSD__ +#include <sys/ioctl.h> +#include <sys/disklabel.h> +#include <sys/dkio.h> +#endif + +#ifdef __NetBSD__ +#include <sys/ioctl.h> +#include <sys/disklabel.h> +#include <sys/dkio.h> +#include <sys/disk.h> +#endif + +#ifdef __DragonFly__ +#include <sys/ioctl.h> +#include <sys/diskslice.h> +#endif + +#ifdef CONFIG_XFS +#include <xfs/xfs.h> +#endif + +//#define DEBUG_FLOPPY + +//#define DEBUG_BLOCK +#if defined(DEBUG_BLOCK) +#define DEBUG_BLOCK_PRINT(formatCstr, ...) do { if (qemu_log_enabled()) \ + { qemu_log(formatCstr, ## __VA_ARGS__); qemu_log_flush(); } } while (0) +#else +#define DEBUG_BLOCK_PRINT(formatCstr, ...) +#endif + +/* OS X does not have O_DSYNC */ +#ifndef O_DSYNC +#ifdef O_SYNC +#define O_DSYNC O_SYNC +#elif defined(O_FSYNC) +#define O_DSYNC O_FSYNC +#endif +#endif + +/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ +#ifndef O_DIRECT +#define O_DIRECT O_DSYNC +#endif + +#define FTYPE_FILE 0 +#define FTYPE_CD 1 +#define FTYPE_FD 2 + +/* if the FD is not accessed during that time (in ns), we try to + reopen it to see if the disk has been changed */ +#define FD_OPEN_TIMEOUT (1000000000) + +#define MAX_BLOCKSIZE 4096 + +typedef struct BDRVRawState { + int fd; + int type; + int open_flags; +#if defined(__linux__) + /* linux floppy specific */ + int64_t fd_open_time; + int64_t fd_error_time; + int fd_got_error; + int fd_media_changed; +#endif +#ifdef CONFIG_LINUX_AIO + int use_aio; + void *aio_ctx; +#endif + uint8_t *aligned_buf; + unsigned aligned_buf_size; +#ifdef CONFIG_XFS + bool is_xfs : 1; +#endif +} BDRVRawState; + +static int fd_open(BlockDriverState *bs); +static int64_t raw_getlength(BlockDriverState *bs); + +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) +static int cdrom_reopen(BlockDriverState *bs); +#endif + +#if defined(__NetBSD__) +static int raw_normalize_devicepath(const char **filename) +{ + static char namebuf[PATH_MAX]; + const char *dp, *fname; + struct stat sb; + + fname = *filename; + dp = strrchr(fname, '/'); + if (lstat(fname, &sb) < 0) { + fprintf(stderr, "%s: stat failed: %s\n", + fname, strerror(errno)); + return -errno; + } + + if (!S_ISBLK(sb.st_mode)) { + return 0; + } + + if (dp == NULL) { + snprintf(namebuf, PATH_MAX, "r%s", fname); + } else { + snprintf(namebuf, PATH_MAX, "%.*s/r%s", + (int)(dp - fname), fname, dp + 1); + } + fprintf(stderr, "%s is a block device", fname); + *filename = namebuf; + fprintf(stderr, ", using %s\n", *filename); + + return 0; +} +#else +static int raw_normalize_devicepath(const char **filename) +{ + return 0; +} +#endif + +static int raw_open_common(BlockDriverState *bs, const char *filename, + int bdrv_flags, int open_flags) +{ + BDRVRawState *s = bs->opaque; + int fd, ret; + + ret = raw_normalize_devicepath(&filename); + if (ret != 0) { + return ret; + } + + s->open_flags = open_flags | O_BINARY; + s->open_flags &= ~O_ACCMODE; + if (bdrv_flags & BDRV_O_RDWR) { + s->open_flags |= O_RDWR; + } else { + s->open_flags |= O_RDONLY; + } + + /* Use O_DSYNC for write-through caching, no flags for write-back caching, + * and O_DIRECT for no caching. */ + if ((bdrv_flags & BDRV_O_NOCACHE)) + s->open_flags |= O_DIRECT; + if (!(bdrv_flags & BDRV_O_CACHE_WB)) + s->open_flags |= O_DSYNC; + + s->fd = -1; + fd = qemu_open(filename, s->open_flags, 0644); + if (fd < 0) { + ret = -errno; + if (ret == -EROFS) + ret = -EACCES; + return ret; + } + s->fd = fd; + s->aligned_buf = NULL; + + if ((bdrv_flags & BDRV_O_NOCACHE)) { + /* + * Allocate a buffer for read/modify/write cycles. Chose the size + * pessimistically as we don't know the block size yet. + */ + s->aligned_buf_size = 32 * MAX_BLOCKSIZE; + s->aligned_buf = qemu_memalign(MAX_BLOCKSIZE, s->aligned_buf_size); + if (s->aligned_buf == NULL) { + goto out_close; + } + } + + /* We're falling back to POSIX AIO in some cases so init always */ + if (paio_init() < 0) { + goto out_free_buf; + } + +#ifdef CONFIG_LINUX_AIO + /* + * Currently Linux do AIO only for files opened with O_DIRECT + * specified so check NOCACHE flag too + */ + if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) == + (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) { + + s->aio_ctx = laio_init(); + if (!s->aio_ctx) { + goto out_free_buf; + } + s->use_aio = 1; + } else +#endif + { +#ifdef CONFIG_LINUX_AIO + s->use_aio = 0; +#endif + } + +#ifdef CONFIG_XFS + if (platform_test_xfs_fd(s->fd)) { + s->is_xfs = 1; + } +#endif + + return 0; + +out_free_buf: + qemu_vfree(s->aligned_buf); +out_close: + qemu_close(fd); + return -errno; +} + +static int raw_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVRawState *s = bs->opaque; + + s->type = FTYPE_FILE; + return raw_open_common(bs, filename, flags, 0); +} + +/* XXX: use host sector size if necessary with: +#ifdef DIOCGSECTORSIZE + { + unsigned int sectorsize = 512; + if (!ioctl(fd, DIOCGSECTORSIZE, §orsize) && + sectorsize > bufsize) + bufsize = sectorsize; + } +#endif +#ifdef CONFIG_COCOA + uint32_t blockSize = 512; + if ( !ioctl( fd, DKIOCGETBLOCKSIZE, &blockSize ) && blockSize > bufsize) { + bufsize = blockSize; + } +#endif +*/ + +/* + * Check if all memory in this vector is sector aligned. + */ +static int qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) +{ + int i; + + for (i = 0; i < qiov->niov; i++) { + if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) { + return 0; + } + } + + return 1; +} + +static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque, int type) +{ + BDRVRawState *s = bs->opaque; + + if (fd_open(bs) < 0) + return NULL; + + /* + * If O_DIRECT is used the buffer needs to be aligned on a sector + * boundary. Check if this is the case or tell the low-level + * driver that it needs to copy the buffer. + */ + if (s->aligned_buf) { + if (!qiov_is_aligned(bs, qiov)) { + type |= QEMU_AIO_MISALIGNED; +#ifdef CONFIG_LINUX_AIO + } else if (s->use_aio) { + return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov, + nb_sectors, cb, opaque, type); +#endif + } + } + + return paio_submit(bs, s->fd, sector_num, qiov, nb_sectors, + cb, opaque, type); +} + +static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return raw_aio_submit(bs, sector_num, qiov, nb_sectors, + cb, opaque, QEMU_AIO_READ); +} + +static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return raw_aio_submit(bs, sector_num, qiov, nb_sectors, + cb, opaque, QEMU_AIO_WRITE); +} + +static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BDRVRawState *s = bs->opaque; + + if (fd_open(bs) < 0) + return NULL; + + return paio_submit(bs, s->fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH); +} + +static void raw_close(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + if (s->fd >= 0) { + qemu_close(s->fd); + s->fd = -1; + if (s->aligned_buf != NULL) + qemu_vfree(s->aligned_buf); + } +} + +static int raw_truncate(BlockDriverState *bs, int64_t offset) +{ + BDRVRawState *s = bs->opaque; + struct stat st; + + if (fstat(s->fd, &st)) { + return -errno; + } + + if (S_ISREG(st.st_mode)) { + if (ftruncate(s->fd, offset) < 0) { + return -errno; + } + } else if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { + if (offset > raw_getlength(bs)) { + return -EINVAL; + } + } else { + return -ENOTSUP; + } + + return 0; +} + +#ifdef __OpenBSD__ +static int64_t raw_getlength(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int fd = s->fd; + struct stat st; + + if (fstat(fd, &st)) + return -1; + if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { + struct disklabel dl; + + if (ioctl(fd, DIOCGDINFO, &dl)) + return -1; + return (uint64_t)dl.d_secsize * + dl.d_partitions[DISKPART(st.st_rdev)].p_size; + } else + return st.st_size; +} +#elif defined(__NetBSD__) +static int64_t raw_getlength(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int fd = s->fd; + struct stat st; + + if (fstat(fd, &st)) + return -1; + if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { + struct dkwedge_info dkw; + + if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) { + return dkw.dkw_size * 512; + } else { + struct disklabel dl; + + if (ioctl(fd, DIOCGDINFO, &dl)) + return -1; + return (uint64_t)dl.d_secsize * + dl.d_partitions[DISKPART(st.st_rdev)].p_size; + } + } else + return st.st_size; +} +#elif defined(__sun__) +static int64_t raw_getlength(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + struct dk_minfo minfo; + int ret; + + ret = fd_open(bs); + if (ret < 0) { + return ret; + } + + /* + * Use the DKIOCGMEDIAINFO ioctl to read the size. + */ + ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo); + if (ret != -1) { + return minfo.dki_lbsize * minfo.dki_capacity; + } + + /* + * There are reports that lseek on some devices fails, but + * irc discussion said that contingency on contingency was overkill. + */ + return lseek(s->fd, 0, SEEK_END); +} +#elif defined(CONFIG_BSD) +static int64_t raw_getlength(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int fd = s->fd; + int64_t size; + struct stat sb; +#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) + int reopened = 0; +#endif + int ret; + + ret = fd_open(bs); + if (ret < 0) + return ret; + +#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) +again: +#endif + if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) { +#ifdef DIOCGMEDIASIZE + if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) +#elif defined(DIOCGPART) + { + struct partinfo pi; + if (ioctl(fd, DIOCGPART, &pi) == 0) + size = pi.media_size; + else + size = 0; + } + if (size == 0) +#endif +#if defined(__APPLE__) && defined(__MACH__) + size = LONG_LONG_MAX; +#else + size = lseek(fd, 0LL, SEEK_END); +#endif +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) + switch(s->type) { + case FTYPE_CD: + /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */ + if (size == 2048LL * (unsigned)-1) + size = 0; + /* XXX no disc? maybe we need to reopen... */ + if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) { + reopened = 1; + goto again; + } + } +#endif + } else { + size = lseek(fd, 0, SEEK_END); + } + return size; +} +#else +static int64_t raw_getlength(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int ret; + + ret = fd_open(bs); + if (ret < 0) { + return ret; + } + + return lseek(s->fd, 0, SEEK_END); +} +#endif + +static int64_t raw_get_allocated_file_size(BlockDriverState *bs) +{ + struct stat st; + BDRVRawState *s = bs->opaque; + + if (fstat(s->fd, &st) < 0) { + return -errno; + } + return (int64_t)st.st_blocks * 512; +} + +static int raw_create(const char *filename, QEMUOptionParameter *options) +{ + int fd; + int result = 0; + int64_t total_size = 0; + + /* Read out options */ + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + total_size = options->value.n / BDRV_SECTOR_SIZE; + } + options++; + } + + fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, + 0644); + if (fd < 0) { + result = -errno; + } else { + if (ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) { + result = -errno; + } + if (qemu_close(fd) != 0) { + result = -errno; + } + } + return result; +} + +/* + * Returns true iff the specified sector is present in the disk image. Drivers + * not implementing the functionality are assumed to not support backing files, + * hence all their sectors are reported as allocated. + * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. + */ +static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + off_t start, data, hole; + int ret; + + ret = fd_open(bs); + if (ret < 0) { + return ret; + } + + start = sector_num * BDRV_SECTOR_SIZE; + +#ifdef CONFIG_FIEMAP + + BDRVRawState *s = bs->opaque; + struct { + struct fiemap fm; + struct fiemap_extent fe; + } f; + + f.fm.fm_start = start; + f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE; + f.fm.fm_flags = 0; + f.fm.fm_extent_count = 1; + f.fm.fm_reserved = 0; + if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) { + /* Assume everything is allocated. */ + *pnum = nb_sectors; + return 1; + } + + if (f.fm.fm_mapped_extents == 0) { + /* No extents found, data is beyond f.fm.fm_start + f.fm.fm_length. + * f.fm.fm_start + f.fm.fm_length must be clamped to the file size! + */ + off_t length = lseek(s->fd, 0, SEEK_END); + hole = f.fm.fm_start; + data = MIN(f.fm.fm_start + f.fm.fm_length, length); + } else { + data = f.fe.fe_logical; + hole = f.fe.fe_logical + f.fe.fe_length; + } + +#elif defined SEEK_HOLE && defined SEEK_DATA + + BDRVRawState *s = bs->opaque; + + hole = lseek(s->fd, start, SEEK_HOLE); + if (hole == -1) { + /* -ENXIO indicates that sector_num was past the end of the file. + * There is a virtual hole there. */ + assert(errno != -ENXIO); + + /* Most likely EINVAL. Assume everything is allocated. */ + *pnum = nb_sectors; + return 1; + } + + if (hole > start) { + data = start; + } else { + /* On a hole. We need another syscall to find its end. */ + data = lseek(s->fd, start, SEEK_DATA); + if (data == -1) { + data = lseek(s->fd, 0, SEEK_END); + } + } +#else + *pnum = nb_sectors; + return 1; +#endif + + if (data <= start) { + /* On a data extent, compute sectors to the end of the extent. */ + *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE); + return 1; + } else { + /* On a hole, compute sectors to the beginning of the next extent. */ + *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); + return 0; + } +} + +#ifdef CONFIG_XFS +static int xfs_discard(BDRVRawState *s, int64_t sector_num, int nb_sectors) +{ + struct xfs_flock64 fl; + + memset(&fl, 0, sizeof(fl)); + fl.l_whence = SEEK_SET; + fl.l_start = sector_num << 9; + fl.l_len = (int64_t)nb_sectors << 9; + + if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) { + DEBUG_BLOCK_PRINT("cannot punch hole (%s)\n", strerror(errno)); + return -errno; + } + + return 0; +} +#endif + +static coroutine_fn int raw_co_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ +#ifdef CONFIG_XFS + BDRVRawState *s = bs->opaque; + + if (s->is_xfs) { + return xfs_discard(s, sector_num, nb_sectors); + } +#endif + + return 0; +} + +static QEMUOptionParameter raw_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { NULL } +}; + +static BlockDriver bdrv_file = { + .format_name = "file", + .protocol_name = "file", + .instance_size = sizeof(BDRVRawState), + .bdrv_probe = NULL, /* no probe for protocols */ + .bdrv_file_open = raw_open, + .bdrv_close = raw_close, + .bdrv_create = raw_create, + .bdrv_co_discard = raw_co_discard, + .bdrv_co_is_allocated = raw_co_is_allocated, + + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, + .bdrv_aio_flush = raw_aio_flush, + + .bdrv_truncate = raw_truncate, + .bdrv_getlength = raw_getlength, + .bdrv_get_allocated_file_size + = raw_get_allocated_file_size, + + .create_options = raw_create_options, +}; + +/***********************************************/ +/* host device */ + +#if defined(__APPLE__) && defined(__MACH__) +static kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator ); +static kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize ); + +kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator ) +{ + kern_return_t kernResult; + mach_port_t masterPort; + CFMutableDictionaryRef classesToMatch; + + kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort ); + if ( KERN_SUCCESS != kernResult ) { + printf( "IOMasterPort returned %d\n", kernResult ); + } + + classesToMatch = IOServiceMatching( kIOCDMediaClass ); + if ( classesToMatch == NULL ) { + printf( "IOServiceMatching returned a NULL dictionary.\n" ); + } else { + CFDictionarySetValue( classesToMatch, CFSTR( kIOMediaEjectableKey ), kCFBooleanTrue ); + } + kernResult = IOServiceGetMatchingServices( masterPort, classesToMatch, mediaIterator ); + if ( KERN_SUCCESS != kernResult ) + { + printf( "IOServiceGetMatchingServices returned %d\n", kernResult ); + } + + return kernResult; +} + +kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize ) +{ + io_object_t nextMedia; + kern_return_t kernResult = KERN_FAILURE; + *bsdPath = '\0'; + nextMedia = IOIteratorNext( mediaIterator ); + if ( nextMedia ) + { + CFTypeRef bsdPathAsCFString; + bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 ); + if ( bsdPathAsCFString ) { + size_t devPathLength; + strcpy( bsdPath, _PATH_DEV ); + strcat( bsdPath, "r" ); + devPathLength = strlen( bsdPath ); + if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) { + kernResult = KERN_SUCCESS; + } + CFRelease( bsdPathAsCFString ); + } + IOObjectRelease( nextMedia ); + } + + return kernResult; +} + +#endif + +static int hdev_probe_device(const char *filename) +{ + struct stat st; + + /* allow a dedicated CD-ROM driver to match with a higher priority */ + if (strstart(filename, "/dev/cdrom", NULL)) + return 50; + + if (stat(filename, &st) >= 0 && + (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) { + return 100; + } + + return 0; +} + +static int hdev_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVRawState *s = bs->opaque; + +#if defined(__APPLE__) && defined(__MACH__) + if (strstart(filename, "/dev/cdrom", NULL)) { + kern_return_t kernResult; + io_iterator_t mediaIterator; + char bsdPath[ MAXPATHLEN ]; + int fd; + + kernResult = FindEjectableCDMedia( &mediaIterator ); + kernResult = GetBSDPath( mediaIterator, bsdPath, sizeof( bsdPath ) ); + + if ( bsdPath[ 0 ] != '\0' ) { + strcat(bsdPath,"s0"); + /* some CDs don't have a partition 0 */ + fd = qemu_open(bsdPath, O_RDONLY | O_BINARY | O_LARGEFILE); + if (fd < 0) { + bsdPath[strlen(bsdPath)-1] = '1'; + } else { + qemu_close(fd); + } + filename = bsdPath; + } + + if ( mediaIterator ) + IOObjectRelease( mediaIterator ); + } +#endif + + s->type = FTYPE_FILE; +#if defined(__linux__) + { + char resolved_path[ MAXPATHLEN ], *temp; + + temp = realpath(filename, resolved_path); + if (temp && strstart(temp, "/dev/sg", NULL)) { + bs->sg = 1; + } + } +#endif + + return raw_open_common(bs, filename, flags, 0); +} + +#if defined(__linux__) +/* Note: we do not have a reliable method to detect if the floppy is + present. The current method is to try to open the floppy at every + I/O and to keep it opened during a few hundreds of ms. */ +static int fd_open(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int last_media_present; + + if (s->type != FTYPE_FD) + return 0; + last_media_present = (s->fd >= 0); + if (s->fd >= 0 && + (get_clock() - s->fd_open_time) >= FD_OPEN_TIMEOUT) { + qemu_close(s->fd); + s->fd = -1; +#ifdef DEBUG_FLOPPY + printf("Floppy closed\n"); +#endif + } + if (s->fd < 0) { + if (s->fd_got_error && + (get_clock() - s->fd_error_time) < FD_OPEN_TIMEOUT) { +#ifdef DEBUG_FLOPPY + printf("No floppy (open delayed)\n"); +#endif + return -EIO; + } + s->fd = qemu_open(bs->filename, s->open_flags & ~O_NONBLOCK); + if (s->fd < 0) { + s->fd_error_time = get_clock(); + s->fd_got_error = 1; + if (last_media_present) + s->fd_media_changed = 1; +#ifdef DEBUG_FLOPPY + printf("No floppy\n"); +#endif + return -EIO; + } +#ifdef DEBUG_FLOPPY + printf("Floppy opened\n"); +#endif + } + if (!last_media_present) + s->fd_media_changed = 1; + s->fd_open_time = get_clock(); + s->fd_got_error = 0; + return 0; +} + +static int hdev_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ + BDRVRawState *s = bs->opaque; + + return ioctl(s->fd, req, buf); +} + +static BlockDriverAIOCB *hdev_aio_ioctl(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BDRVRawState *s = bs->opaque; + + if (fd_open(bs) < 0) + return NULL; + return paio_ioctl(bs, s->fd, req, buf, cb, opaque); +} + +#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) +static int fd_open(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + + /* this is just to ensure s->fd is sane (its called by io ops) */ + if (s->fd >= 0) + return 0; + return -EIO; +} +#else /* !linux && !FreeBSD */ + +static int fd_open(BlockDriverState *bs) +{ + return 0; +} + +#endif /* !linux && !FreeBSD */ + +static int hdev_create(const char *filename, QEMUOptionParameter *options) +{ + int fd; + int ret = 0; + struct stat stat_buf; + int64_t total_size = 0; + + /* Read out options */ + while (options && options->name) { + if (!strcmp(options->name, "size")) { + total_size = options->value.n / BDRV_SECTOR_SIZE; + } + options++; + } + + fd = qemu_open(filename, O_WRONLY | O_BINARY); + if (fd < 0) + return -errno; + + if (fstat(fd, &stat_buf) < 0) + ret = -errno; + else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) + ret = -ENODEV; + else if (lseek(fd, 0, SEEK_END) < total_size * BDRV_SECTOR_SIZE) + ret = -ENOSPC; + + qemu_close(fd); + return ret; +} + +static int hdev_has_zero_init(BlockDriverState *bs) +{ + return 0; +} + +static BlockDriver bdrv_host_device = { + .format_name = "host_device", + .protocol_name = "host_device", + .instance_size = sizeof(BDRVRawState), + .bdrv_probe_device = hdev_probe_device, + .bdrv_file_open = hdev_open, + .bdrv_close = raw_close, + .bdrv_create = hdev_create, + .create_options = raw_create_options, + .bdrv_has_zero_init = hdev_has_zero_init, + + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, + .bdrv_aio_flush = raw_aio_flush, + + .bdrv_truncate = raw_truncate, + .bdrv_getlength = raw_getlength, + .bdrv_get_allocated_file_size + = raw_get_allocated_file_size, + + /* generic scsi device */ +#ifdef __linux__ + .bdrv_ioctl = hdev_ioctl, + .bdrv_aio_ioctl = hdev_aio_ioctl, +#endif +}; + +#ifdef __linux__ +static int floppy_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVRawState *s = bs->opaque; + int ret; + + s->type = FTYPE_FD; + + /* open will not fail even if no floppy is inserted, so add O_NONBLOCK */ + ret = raw_open_common(bs, filename, flags, O_NONBLOCK); + if (ret) + return ret; + + /* close fd so that we can reopen it as needed */ + qemu_close(s->fd); + s->fd = -1; + s->fd_media_changed = 1; + + return 0; +} + +static int floppy_probe_device(const char *filename) +{ + int fd, ret; + int prio = 0; + struct floppy_struct fdparam; + struct stat st; + + if (strstart(filename, "/dev/fd", NULL) && + !strstart(filename, "/dev/fdset/", NULL)) { + prio = 50; + } + + fd = qemu_open(filename, O_RDONLY | O_NONBLOCK); + if (fd < 0) { + goto out; + } + ret = fstat(fd, &st); + if (ret == -1 || !S_ISBLK(st.st_mode)) { + goto outc; + } + + /* Attempt to detect via a floppy specific ioctl */ + ret = ioctl(fd, FDGETPRM, &fdparam); + if (ret >= 0) + prio = 100; + +outc: + qemu_close(fd); +out: + return prio; +} + + +static int floppy_is_inserted(BlockDriverState *bs) +{ + return fd_open(bs) >= 0; +} + +static int floppy_media_changed(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int ret; + + /* + * XXX: we do not have a true media changed indication. + * It does not work if the floppy is changed without trying to read it. + */ + fd_open(bs); + ret = s->fd_media_changed; + s->fd_media_changed = 0; +#ifdef DEBUG_FLOPPY + printf("Floppy changed=%d\n", ret); +#endif + return ret; +} + +static void floppy_eject(BlockDriverState *bs, bool eject_flag) +{ + BDRVRawState *s = bs->opaque; + int fd; + + if (s->fd >= 0) { + qemu_close(s->fd); + s->fd = -1; + } + fd = qemu_open(bs->filename, s->open_flags | O_NONBLOCK); + if (fd >= 0) { + if (ioctl(fd, FDEJECT, 0) < 0) + perror("FDEJECT"); + qemu_close(fd); + } +} + +static BlockDriver bdrv_host_floppy = { + .format_name = "host_floppy", + .protocol_name = "host_floppy", + .instance_size = sizeof(BDRVRawState), + .bdrv_probe_device = floppy_probe_device, + .bdrv_file_open = floppy_open, + .bdrv_close = raw_close, + .bdrv_create = hdev_create, + .create_options = raw_create_options, + .bdrv_has_zero_init = hdev_has_zero_init, + + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, + .bdrv_aio_flush = raw_aio_flush, + + .bdrv_truncate = raw_truncate, + .bdrv_getlength = raw_getlength, + .bdrv_get_allocated_file_size + = raw_get_allocated_file_size, + + /* removable device support */ + .bdrv_is_inserted = floppy_is_inserted, + .bdrv_media_changed = floppy_media_changed, + .bdrv_eject = floppy_eject, +}; + +static int cdrom_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVRawState *s = bs->opaque; + + s->type = FTYPE_CD; + + /* open will not fail even if no CD is inserted, so add O_NONBLOCK */ + return raw_open_common(bs, filename, flags, O_NONBLOCK); +} + +static int cdrom_probe_device(const char *filename) +{ + int fd, ret; + int prio = 0; + struct stat st; + + fd = qemu_open(filename, O_RDONLY | O_NONBLOCK); + if (fd < 0) { + goto out; + } + ret = fstat(fd, &st); + if (ret == -1 || !S_ISBLK(st.st_mode)) { + goto outc; + } + + /* Attempt to detect via a CDROM specific ioctl */ + ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); + if (ret >= 0) + prio = 100; + +outc: + qemu_close(fd); +out: + return prio; +} + +static int cdrom_is_inserted(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int ret; + + ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); + if (ret == CDS_DISC_OK) + return 1; + return 0; +} + +static void cdrom_eject(BlockDriverState *bs, bool eject_flag) +{ + BDRVRawState *s = bs->opaque; + + if (eject_flag) { + if (ioctl(s->fd, CDROMEJECT, NULL) < 0) + perror("CDROMEJECT"); + } else { + if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0) + perror("CDROMEJECT"); + } +} + +static void cdrom_lock_medium(BlockDriverState *bs, bool locked) +{ + BDRVRawState *s = bs->opaque; + + if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) { + /* + * Note: an error can happen if the distribution automatically + * mounts the CD-ROM + */ + /* perror("CDROM_LOCKDOOR"); */ + } +} + +static BlockDriver bdrv_host_cdrom = { + .format_name = "host_cdrom", + .protocol_name = "host_cdrom", + .instance_size = sizeof(BDRVRawState), + .bdrv_probe_device = cdrom_probe_device, + .bdrv_file_open = cdrom_open, + .bdrv_close = raw_close, + .bdrv_create = hdev_create, + .create_options = raw_create_options, + .bdrv_has_zero_init = hdev_has_zero_init, + + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, + .bdrv_aio_flush = raw_aio_flush, + + .bdrv_truncate = raw_truncate, + .bdrv_getlength = raw_getlength, + .bdrv_get_allocated_file_size + = raw_get_allocated_file_size, + + /* removable device support */ + .bdrv_is_inserted = cdrom_is_inserted, + .bdrv_eject = cdrom_eject, + .bdrv_lock_medium = cdrom_lock_medium, + + /* generic scsi device */ + .bdrv_ioctl = hdev_ioctl, + .bdrv_aio_ioctl = hdev_aio_ioctl, +}; +#endif /* __linux__ */ + +#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) +static int cdrom_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVRawState *s = bs->opaque; + int ret; + + s->type = FTYPE_CD; + + ret = raw_open_common(bs, filename, flags, 0); + if (ret) + return ret; + + /* make sure the door isn't locked at this time */ + ioctl(s->fd, CDIOCALLOW); + return 0; +} + +static int cdrom_probe_device(const char *filename) +{ + if (strstart(filename, "/dev/cd", NULL) || + strstart(filename, "/dev/acd", NULL)) + return 100; + return 0; +} + +static int cdrom_reopen(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int fd; + + /* + * Force reread of possibly changed/newly loaded disc, + * FreeBSD seems to not notice sometimes... + */ + if (s->fd >= 0) + qemu_close(s->fd); + fd = qemu_open(bs->filename, s->open_flags, 0644); + if (fd < 0) { + s->fd = -1; + return -EIO; + } + s->fd = fd; + + /* make sure the door isn't locked at this time */ + ioctl(s->fd, CDIOCALLOW); + return 0; +} + +static int cdrom_is_inserted(BlockDriverState *bs) +{ + return raw_getlength(bs) > 0; +} + +static void cdrom_eject(BlockDriverState *bs, bool eject_flag) +{ + BDRVRawState *s = bs->opaque; + + if (s->fd < 0) + return; + + (void) ioctl(s->fd, CDIOCALLOW); + + if (eject_flag) { + if (ioctl(s->fd, CDIOCEJECT) < 0) + perror("CDIOCEJECT"); + } else { + if (ioctl(s->fd, CDIOCCLOSE) < 0) + perror("CDIOCCLOSE"); + } + + cdrom_reopen(bs); +} + +static void cdrom_lock_medium(BlockDriverState *bs, bool locked) +{ + BDRVRawState *s = bs->opaque; + + if (s->fd < 0) + return; + if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) { + /* + * Note: an error can happen if the distribution automatically + * mounts the CD-ROM + */ + /* perror("CDROM_LOCKDOOR"); */ + } +} + +static BlockDriver bdrv_host_cdrom = { + .format_name = "host_cdrom", + .protocol_name = "host_cdrom", + .instance_size = sizeof(BDRVRawState), + .bdrv_probe_device = cdrom_probe_device, + .bdrv_file_open = cdrom_open, + .bdrv_close = raw_close, + .bdrv_create = hdev_create, + .create_options = raw_create_options, + .bdrv_has_zero_init = hdev_has_zero_init, + + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, + .bdrv_aio_flush = raw_aio_flush, + + .bdrv_truncate = raw_truncate, + .bdrv_getlength = raw_getlength, + .bdrv_get_allocated_file_size + = raw_get_allocated_file_size, + + /* removable device support */ + .bdrv_is_inserted = cdrom_is_inserted, + .bdrv_eject = cdrom_eject, + .bdrv_lock_medium = cdrom_lock_medium, +}; +#endif /* __FreeBSD__ */ + +static void bdrv_file_init(void) +{ + /* + * Register all the drivers. Note that order is important, the driver + * registered last will get probed first. + */ + bdrv_register(&bdrv_file); + bdrv_register(&bdrv_host_device); +#ifdef __linux__ + bdrv_register(&bdrv_host_floppy); + bdrv_register(&bdrv_host_cdrom); +#endif +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) + bdrv_register(&bdrv_host_cdrom); +#endif +} + +block_init(bdrv_file_init); diff --git a/block/raw-win32.c b/block/raw-win32.c new file mode 100644 index 000000000..c56bf8337 --- /dev/null +++ b/block/raw-win32.c @@ -0,0 +1,431 @@ +/* + * Block driver for RAW files (win32) + * + * Copyright (c) 2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "qemu-timer.h" +#include "block_int.h" +#include "module.h" +#include <windows.h> +#include <winioctl.h> + +#define FTYPE_FILE 0 +#define FTYPE_CD 1 +#define FTYPE_HARDDISK 2 + +typedef struct BDRVRawState { + HANDLE hfile; + int type; + char drive_path[16]; /* format: "d:\" */ +} BDRVRawState; + +int qemu_ftruncate64(int fd, int64_t length) +{ + LARGE_INTEGER li; + DWORD dw; + LONG high; + HANDLE h; + BOOL res; + + if ((GetVersion() & 0x80000000UL) && (length >> 32) != 0) + return -1; + + h = (HANDLE)_get_osfhandle(fd); + + /* get current position, ftruncate do not change position */ + li.HighPart = 0; + li.LowPart = SetFilePointer (h, 0, &li.HighPart, FILE_CURRENT); + if (li.LowPart == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) { + return -1; + } + + high = length >> 32; + dw = SetFilePointer(h, (DWORD) length, &high, FILE_BEGIN); + if (dw == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) { + return -1; + } + res = SetEndOfFile(h); + + /* back to old position */ + SetFilePointer(h, li.LowPart, &li.HighPart, FILE_BEGIN); + return res ? 0 : -1; +} + +static int set_sparse(int fd) +{ + DWORD returned; + return (int) DeviceIoControl((HANDLE)_get_osfhandle(fd), FSCTL_SET_SPARSE, + NULL, 0, NULL, 0, &returned, NULL); +} + +static int raw_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVRawState *s = bs->opaque; + int access_flags; + DWORD overlapped; + + s->type = FTYPE_FILE; + + if (flags & BDRV_O_RDWR) { + access_flags = GENERIC_READ | GENERIC_WRITE; + } else { + access_flags = GENERIC_READ; + } + + overlapped = FILE_ATTRIBUTE_NORMAL; + if (flags & BDRV_O_NOCACHE) + overlapped |= FILE_FLAG_NO_BUFFERING; + if (!(flags & BDRV_O_CACHE_WB)) + overlapped |= FILE_FLAG_WRITE_THROUGH; + s->hfile = CreateFile(filename, access_flags, + FILE_SHARE_READ, NULL, + OPEN_EXISTING, overlapped, NULL); + if (s->hfile == INVALID_HANDLE_VALUE) { + int err = GetLastError(); + + if (err == ERROR_ACCESS_DENIED) + return -EACCES; + return -1; + } + return 0; +} + +static int raw_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVRawState *s = bs->opaque; + OVERLAPPED ov; + DWORD ret_count; + int ret; + int64_t offset = sector_num * 512; + int count = nb_sectors * 512; + + memset(&ov, 0, sizeof(ov)); + ov.Offset = offset; + ov.OffsetHigh = offset >> 32; + ret = ReadFile(s->hfile, buf, count, &ret_count, &ov); + if (!ret) + return ret_count; + if (ret_count == count) + ret_count = 0; + return ret_count; +} + +static int raw_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVRawState *s = bs->opaque; + OVERLAPPED ov; + DWORD ret_count; + int ret; + int64_t offset = sector_num * 512; + int count = nb_sectors * 512; + + memset(&ov, 0, sizeof(ov)); + ov.Offset = offset; + ov.OffsetHigh = offset >> 32; + ret = WriteFile(s->hfile, buf, count, &ret_count, &ov); + if (!ret) + return ret_count; + if (ret_count == count) + ret_count = 0; + return ret_count; +} + +static int raw_flush(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int ret; + + ret = FlushFileBuffers(s->hfile); + if (ret == 0) { + return -EIO; + } + + return 0; +} + +static void raw_close(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + CloseHandle(s->hfile); +} + +static int raw_truncate(BlockDriverState *bs, int64_t offset) +{ + BDRVRawState *s = bs->opaque; + LONG low, high; + + low = offset; + high = offset >> 32; + if (!SetFilePointer(s->hfile, low, &high, FILE_BEGIN)) + return -EIO; + if (!SetEndOfFile(s->hfile)) + return -EIO; + return 0; +} + +static int64_t raw_getlength(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + LARGE_INTEGER l; + ULARGE_INTEGER available, total, total_free; + DISK_GEOMETRY_EX dg; + DWORD count; + BOOL status; + + switch(s->type) { + case FTYPE_FILE: + l.LowPart = GetFileSize(s->hfile, (PDWORD)&l.HighPart); + if (l.LowPart == 0xffffffffUL && GetLastError() != NO_ERROR) + return -EIO; + break; + case FTYPE_CD: + if (!GetDiskFreeSpaceEx(s->drive_path, &available, &total, &total_free)) + return -EIO; + l.QuadPart = total.QuadPart; + break; + case FTYPE_HARDDISK: + status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX, + NULL, 0, &dg, sizeof(dg), &count, NULL); + if (status != 0) { + l = dg.DiskSize; + } + break; + default: + return -EIO; + } + return l.QuadPart; +} + +static int64_t raw_get_allocated_file_size(BlockDriverState *bs) +{ + typedef DWORD (WINAPI * get_compressed_t)(const char *filename, + DWORD * high); + get_compressed_t get_compressed; + struct _stati64 st; + const char *filename = bs->filename; + /* WinNT support GetCompressedFileSize to determine allocate size */ + get_compressed = + (get_compressed_t) GetProcAddress(GetModuleHandle("kernel32"), + "GetCompressedFileSizeA"); + if (get_compressed) { + DWORD high, low; + low = get_compressed(filename, &high); + if (low != 0xFFFFFFFFlu || GetLastError() == NO_ERROR) { + return (((int64_t) high) << 32) + low; + } + } + + if (_stati64(filename, &st) < 0) { + return -1; + } + return st.st_size; +} + +static int raw_create(const char *filename, QEMUOptionParameter *options) +{ + int fd; + int64_t total_size = 0; + + /* Read out options */ + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + total_size = options->value.n / 512; + } + options++; + } + + fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, + 0644); + if (fd < 0) + return -EIO; + set_sparse(fd); + ftruncate(fd, total_size * 512); + qemu_close(fd); + return 0; +} + +static QEMUOptionParameter raw_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { NULL } +}; + +static BlockDriver bdrv_file = { + .format_name = "file", + .protocol_name = "file", + .instance_size = sizeof(BDRVRawState), + .bdrv_file_open = raw_open, + .bdrv_close = raw_close, + .bdrv_create = raw_create, + + .bdrv_read = raw_read, + .bdrv_write = raw_write, + .bdrv_co_flush_to_disk = raw_flush, + + .bdrv_truncate = raw_truncate, + .bdrv_getlength = raw_getlength, + .bdrv_get_allocated_file_size + = raw_get_allocated_file_size, + + .create_options = raw_create_options, +}; + +/***********************************************/ +/* host device */ + +static int find_cdrom(char *cdrom_name, int cdrom_name_size) +{ + char drives[256], *pdrv = drives; + UINT type; + + memset(drives, 0, sizeof(drives)); + GetLogicalDriveStrings(sizeof(drives), drives); + while(pdrv[0] != '\0') { + type = GetDriveType(pdrv); + switch(type) { + case DRIVE_CDROM: + snprintf(cdrom_name, cdrom_name_size, "\\\\.\\%c:", pdrv[0]); + return 0; + break; + } + pdrv += lstrlen(pdrv) + 1; + } + return -1; +} + +static int find_device_type(BlockDriverState *bs, const char *filename) +{ + BDRVRawState *s = bs->opaque; + UINT type; + const char *p; + + if (strstart(filename, "\\\\.\\", &p) || + strstart(filename, "//./", &p)) { + if (stristart(p, "PhysicalDrive", NULL)) + return FTYPE_HARDDISK; + snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", p[0]); + type = GetDriveType(s->drive_path); + switch (type) { + case DRIVE_REMOVABLE: + case DRIVE_FIXED: + return FTYPE_HARDDISK; + case DRIVE_CDROM: + return FTYPE_CD; + default: + return FTYPE_FILE; + } + } else { + return FTYPE_FILE; + } +} + +static int hdev_probe_device(const char *filename) +{ + if (strstart(filename, "/dev/cdrom", NULL)) + return 100; + if (is_windows_drive(filename)) + return 100; + return 0; +} + +static int hdev_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVRawState *s = bs->opaque; + int access_flags, create_flags; + DWORD overlapped; + char device_name[64]; + + if (strstart(filename, "/dev/cdrom", NULL)) { + if (find_cdrom(device_name, sizeof(device_name)) < 0) + return -ENOENT; + filename = device_name; + } else { + /* transform drive letters into device name */ + if (((filename[0] >= 'a' && filename[0] <= 'z') || + (filename[0] >= 'A' && filename[0] <= 'Z')) && + filename[1] == ':' && filename[2] == '\0') { + snprintf(device_name, sizeof(device_name), "\\\\.\\%c:", filename[0]); + filename = device_name; + } + } + s->type = find_device_type(bs, filename); + + if (flags & BDRV_O_RDWR) { + access_flags = GENERIC_READ | GENERIC_WRITE; + } else { + access_flags = GENERIC_READ; + } + create_flags = OPEN_EXISTING; + + overlapped = FILE_ATTRIBUTE_NORMAL; + if (flags & BDRV_O_NOCACHE) + overlapped |= FILE_FLAG_NO_BUFFERING; + if (!(flags & BDRV_O_CACHE_WB)) + overlapped |= FILE_FLAG_WRITE_THROUGH; + s->hfile = CreateFile(filename, access_flags, + FILE_SHARE_READ, NULL, + create_flags, overlapped, NULL); + if (s->hfile == INVALID_HANDLE_VALUE) { + int err = GetLastError(); + + if (err == ERROR_ACCESS_DENIED) + return -EACCES; + return -1; + } + return 0; +} + +static int hdev_has_zero_init(BlockDriverState *bs) +{ + return 0; +} + +static BlockDriver bdrv_host_device = { + .format_name = "host_device", + .protocol_name = "host_device", + .instance_size = sizeof(BDRVRawState), + .bdrv_probe_device = hdev_probe_device, + .bdrv_file_open = hdev_open, + .bdrv_close = raw_close, + .bdrv_has_zero_init = hdev_has_zero_init, + + .bdrv_read = raw_read, + .bdrv_write = raw_write, + .bdrv_co_flush_to_disk = raw_flush, + + .bdrv_getlength = raw_getlength, + .bdrv_get_allocated_file_size + = raw_get_allocated_file_size, +}; + +static void bdrv_file_init(void) +{ + bdrv_register(&bdrv_file); + bdrv_register(&bdrv_host_device); +} + +block_init(bdrv_file_init); diff --git a/block/raw.c b/block/raw.c new file mode 100644 index 000000000..ff34ea41e --- /dev/null +++ b/block/raw.c @@ -0,0 +1,145 @@ + +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" + +static int raw_open(BlockDriverState *bs, int flags) +{ + bs->sg = bs->file->sg; + return 0; +} + +static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); + return bdrv_co_readv(bs->file, sector_num, nb_sectors, qiov); +} + +static int coroutine_fn raw_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); + return bdrv_co_writev(bs->file, sector_num, nb_sectors, qiov); +} + +static void raw_close(BlockDriverState *bs) +{ +} + +static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + return bdrv_co_is_allocated(bs->file, sector_num, nb_sectors, pnum); +} + +static int64_t raw_getlength(BlockDriverState *bs) +{ + return bdrv_getlength(bs->file); +} + +static int raw_truncate(BlockDriverState *bs, int64_t offset) +{ + return bdrv_truncate(bs->file, offset); +} + +static int raw_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + return 1; /* everything can be opened as raw image */ +} + +static int coroutine_fn raw_co_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + return bdrv_co_discard(bs->file, sector_num, nb_sectors); +} + +static int raw_is_inserted(BlockDriverState *bs) +{ + return bdrv_is_inserted(bs->file); +} + +static int raw_media_changed(BlockDriverState *bs) +{ + return bdrv_media_changed(bs->file); +} + +static void raw_eject(BlockDriverState *bs, bool eject_flag) +{ + bdrv_eject(bs->file, eject_flag); +} + +static void raw_lock_medium(BlockDriverState *bs, bool locked) +{ + bdrv_lock_medium(bs->file, locked); +} + +static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ + return bdrv_ioctl(bs->file, req, buf); +} + +static BlockDriverAIOCB *raw_aio_ioctl(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_ioctl(bs->file, req, buf, cb, opaque); +} + +static int raw_create(const char *filename, QEMUOptionParameter *options) +{ + return bdrv_create_file(filename, options); +} + +static QEMUOptionParameter raw_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { NULL } +}; + +static int raw_has_zero_init(BlockDriverState *bs) +{ + return bdrv_has_zero_init(bs->file); +} + +static BlockDriver bdrv_raw = { + .format_name = "raw", + + /* It's really 0, but we need to make g_malloc() happy */ + .instance_size = 1, + + .bdrv_open = raw_open, + .bdrv_close = raw_close, + + .bdrv_co_readv = raw_co_readv, + .bdrv_co_writev = raw_co_writev, + .bdrv_co_is_allocated = raw_co_is_allocated, + .bdrv_co_discard = raw_co_discard, + + .bdrv_probe = raw_probe, + .bdrv_getlength = raw_getlength, + .bdrv_truncate = raw_truncate, + + .bdrv_is_inserted = raw_is_inserted, + .bdrv_media_changed = raw_media_changed, + .bdrv_eject = raw_eject, + .bdrv_lock_medium = raw_lock_medium, + + .bdrv_ioctl = raw_ioctl, + .bdrv_aio_ioctl = raw_aio_ioctl, + + .bdrv_create = raw_create, + .create_options = raw_create_options, + .bdrv_has_zero_init = raw_has_zero_init, +}; + +static void bdrv_raw_init(void) +{ + bdrv_register(&bdrv_raw); +} + +block_init(bdrv_raw_init); diff --git a/block/rbd.c b/block/rbd.c new file mode 100644 index 000000000..5a0f79fc8 --- /dev/null +++ b/block/rbd.c @@ -0,0 +1,970 @@ +/* + * QEMU Block driver for RADOS (Ceph) + * + * Copyright (C) 2010-2011 Christian Brunner <chb@muc.de>, + * Josh Durgin <josh.durgin@dreamhost.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include <inttypes.h> + +#include "qemu-common.h" +#include "qemu-error.h" +#include "block_int.h" + +#include <rbd/librbd.h> + +/* + * When specifying the image filename use: + * + * rbd:poolname/devicename[@snapshotname][:option1=value1[:option2=value2...]] + * + * poolname must be the name of an existing rados pool. + * + * devicename is the name of the rbd image. + * + * Each option given is used to configure rados, and may be any valid + * Ceph option, "id", or "conf". + * + * The "id" option indicates what user we should authenticate as to + * the Ceph cluster. If it is excluded we will use the Ceph default + * (normally 'admin'). + * + * The "conf" option specifies a Ceph configuration file to read. If + * it is not specified, we will read from the default Ceph locations + * (e.g., /etc/ceph/ceph.conf). To avoid reading _any_ configuration + * file, specify conf=/dev/null. + * + * Configuration values containing :, @, or = can be escaped with a + * leading "\". + */ + +/* rbd_aio_discard added in 0.1.2 */ +#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 2) +#define LIBRBD_SUPPORTS_DISCARD +#else +#undef LIBRBD_SUPPORTS_DISCARD +#endif + +#define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER) + +#define RBD_MAX_CONF_NAME_SIZE 128 +#define RBD_MAX_CONF_VAL_SIZE 512 +#define RBD_MAX_CONF_SIZE 1024 +#define RBD_MAX_POOL_NAME_SIZE 128 +#define RBD_MAX_SNAP_NAME_SIZE 128 +#define RBD_MAX_SNAPS 100 + +typedef enum { + RBD_AIO_READ, + RBD_AIO_WRITE, + RBD_AIO_DISCARD +} RBDAIOCmd; + +typedef struct RBDAIOCB { + BlockDriverAIOCB common; + QEMUBH *bh; + int ret; + QEMUIOVector *qiov; + char *bounce; + RBDAIOCmd cmd; + int64_t sector_num; + int error; + struct BDRVRBDState *s; + int cancelled; +} RBDAIOCB; + +typedef struct RADOSCB { + int rcbid; + RBDAIOCB *acb; + struct BDRVRBDState *s; + int done; + int64_t size; + char *buf; + int ret; +} RADOSCB; + +#define RBD_FD_READ 0 +#define RBD_FD_WRITE 1 + +typedef struct BDRVRBDState { + int fds[2]; + rados_t cluster; + rados_ioctx_t io_ctx; + rbd_image_t image; + char name[RBD_MAX_IMAGE_NAME_SIZE]; + int qemu_aio_count; + char *snap; + int event_reader_pos; + RADOSCB *event_rcb; +} BDRVRBDState; + +static void rbd_aio_bh_cb(void *opaque); + +static int qemu_rbd_next_tok(char *dst, int dst_len, + char *src, char delim, + const char *name, + char **p) +{ + int l; + char *end; + + *p = NULL; + + if (delim != '\0') { + for (end = src; *end; ++end) { + if (*end == delim) { + break; + } + if (*end == '\\' && end[1] != '\0') { + end++; + } + } + if (*end == delim) { + *p = end + 1; + *end = '\0'; + } + } + l = strlen(src); + if (l >= dst_len) { + error_report("%s too long", name); + return -EINVAL; + } else if (l == 0) { + error_report("%s too short", name); + return -EINVAL; + } + + pstrcpy(dst, dst_len, src); + + return 0; +} + +static void qemu_rbd_unescape(char *src) +{ + char *p; + + for (p = src; *src; ++src, ++p) { + if (*src == '\\' && src[1] != '\0') { + src++; + } + *p = *src; + } + *p = '\0'; +} + +static int qemu_rbd_parsename(const char *filename, + char *pool, int pool_len, + char *snap, int snap_len, + char *name, int name_len, + char *conf, int conf_len) +{ + const char *start; + char *p, *buf; + int ret; + + if (!strstart(filename, "rbd:", &start)) { + return -EINVAL; + } + + buf = g_strdup(start); + p = buf; + *snap = '\0'; + *conf = '\0'; + + ret = qemu_rbd_next_tok(pool, pool_len, p, '/', "pool name", &p); + if (ret < 0 || !p) { + ret = -EINVAL; + goto done; + } + qemu_rbd_unescape(pool); + + if (strchr(p, '@')) { + ret = qemu_rbd_next_tok(name, name_len, p, '@', "object name", &p); + if (ret < 0) { + goto done; + } + ret = qemu_rbd_next_tok(snap, snap_len, p, ':', "snap name", &p); + qemu_rbd_unescape(snap); + } else { + ret = qemu_rbd_next_tok(name, name_len, p, ':', "object name", &p); + } + qemu_rbd_unescape(name); + if (ret < 0 || !p) { + goto done; + } + + ret = qemu_rbd_next_tok(conf, conf_len, p, '\0', "configuration", &p); + +done: + g_free(buf); + return ret; +} + +static char *qemu_rbd_parse_clientname(const char *conf, char *clientname) +{ + const char *p = conf; + + while (*p) { + int len; + const char *end = strchr(p, ':'); + + if (end) { + len = end - p; + } else { + len = strlen(p); + } + + if (strncmp(p, "id=", 3) == 0) { + len -= 3; + strncpy(clientname, p + 3, len); + clientname[len] = '\0'; + return clientname; + } + if (end == NULL) { + break; + } + p = end + 1; + } + return NULL; +} + +static int qemu_rbd_set_conf(rados_t cluster, const char *conf) +{ + char *p, *buf; + char name[RBD_MAX_CONF_NAME_SIZE]; + char value[RBD_MAX_CONF_VAL_SIZE]; + int ret = 0; + + buf = g_strdup(conf); + p = buf; + + while (p) { + ret = qemu_rbd_next_tok(name, sizeof(name), p, + '=', "conf option name", &p); + if (ret < 0) { + break; + } + qemu_rbd_unescape(name); + + if (!p) { + error_report("conf option %s has no value", name); + ret = -EINVAL; + break; + } + + ret = qemu_rbd_next_tok(value, sizeof(value), p, + ':', "conf option value", &p); + if (ret < 0) { + break; + } + qemu_rbd_unescape(value); + + if (strcmp(name, "conf") == 0) { + ret = rados_conf_read_file(cluster, value); + if (ret < 0) { + error_report("error reading conf file %s", value); + break; + } + } else if (strcmp(name, "id") == 0) { + /* ignore, this is parsed by qemu_rbd_parse_clientname() */ + } else { + ret = rados_conf_set(cluster, name, value); + if (ret < 0) { + error_report("invalid conf option %s", name); + ret = -EINVAL; + break; + } + } + } + + g_free(buf); + return ret; +} + +static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options) +{ + int64_t bytes = 0; + int64_t objsize; + int obj_order = 0; + char pool[RBD_MAX_POOL_NAME_SIZE]; + char name[RBD_MAX_IMAGE_NAME_SIZE]; + char snap_buf[RBD_MAX_SNAP_NAME_SIZE]; + char conf[RBD_MAX_CONF_SIZE]; + char clientname_buf[RBD_MAX_CONF_SIZE]; + char *clientname; + rados_t cluster; + rados_ioctx_t io_ctx; + int ret; + + if (qemu_rbd_parsename(filename, pool, sizeof(pool), + snap_buf, sizeof(snap_buf), + name, sizeof(name), + conf, sizeof(conf)) < 0) { + return -EINVAL; + } + + /* Read out options */ + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + bytes = options->value.n; + } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { + if (options->value.n) { + objsize = options->value.n; + if ((objsize - 1) & objsize) { /* not a power of 2? */ + error_report("obj size needs to be power of 2"); + return -EINVAL; + } + if (objsize < 4096) { + error_report("obj size too small"); + return -EINVAL; + } + obj_order = ffs(objsize) - 1; + } + } + options++; + } + + clientname = qemu_rbd_parse_clientname(conf, clientname_buf); + if (rados_create(&cluster, clientname) < 0) { + error_report("error initializing"); + return -EIO; + } + + if (strstr(conf, "conf=") == NULL) { + /* try default location, but ignore failure */ + rados_conf_read_file(cluster, NULL); + } + + if (conf[0] != '\0' && + qemu_rbd_set_conf(cluster, conf) < 0) { + error_report("error setting config options"); + rados_shutdown(cluster); + return -EIO; + } + + if (rados_connect(cluster) < 0) { + error_report("error connecting"); + rados_shutdown(cluster); + return -EIO; + } + + if (rados_ioctx_create(cluster, pool, &io_ctx) < 0) { + error_report("error opening pool %s", pool); + rados_shutdown(cluster); + return -EIO; + } + + ret = rbd_create(io_ctx, name, bytes, &obj_order); + rados_ioctx_destroy(io_ctx); + rados_shutdown(cluster); + + return ret; +} + +/* + * This aio completion is being called from qemu_rbd_aio_event_reader() + * and runs in qemu context. It schedules a bh, but just in case the aio + * was not cancelled before. + */ +static void qemu_rbd_complete_aio(RADOSCB *rcb) +{ + RBDAIOCB *acb = rcb->acb; + int64_t r; + + if (acb->cancelled) { + qemu_vfree(acb->bounce); + qemu_aio_release(acb); + goto done; + } + + r = rcb->ret; + + if (acb->cmd == RBD_AIO_WRITE || + acb->cmd == RBD_AIO_DISCARD) { + if (r < 0) { + acb->ret = r; + acb->error = 1; + } else if (!acb->error) { + acb->ret = rcb->size; + } + } else { + if (r < 0) { + memset(rcb->buf, 0, rcb->size); + acb->ret = r; + acb->error = 1; + } else if (r < rcb->size) { + memset(rcb->buf + r, 0, rcb->size - r); + if (!acb->error) { + acb->ret = rcb->size; + } + } else if (!acb->error) { + acb->ret = r; + } + } + /* Note that acb->bh can be NULL in case where the aio was cancelled */ + acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb); + qemu_bh_schedule(acb->bh); +done: + g_free(rcb); +} + +/* + * aio fd read handler. It runs in the qemu context and calls the + * completion handling of completed rados aio operations. + */ +static void qemu_rbd_aio_event_reader(void *opaque) +{ + BDRVRBDState *s = opaque; + + ssize_t ret; + + do { + char *p = (char *)&s->event_rcb; + + /* now read the rcb pointer that was sent from a non qemu thread */ + ret = read(s->fds[RBD_FD_READ], p + s->event_reader_pos, + sizeof(s->event_rcb) - s->event_reader_pos); + if (ret > 0) { + s->event_reader_pos += ret; + if (s->event_reader_pos == sizeof(s->event_rcb)) { + s->event_reader_pos = 0; + qemu_rbd_complete_aio(s->event_rcb); + s->qemu_aio_count--; + } + } + } while (ret < 0 && errno == EINTR); +} + +static int qemu_rbd_aio_flush_cb(void *opaque) +{ + BDRVRBDState *s = opaque; + + return (s->qemu_aio_count > 0); +} + +static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVRBDState *s = bs->opaque; + char pool[RBD_MAX_POOL_NAME_SIZE]; + char snap_buf[RBD_MAX_SNAP_NAME_SIZE]; + char conf[RBD_MAX_CONF_SIZE]; + char clientname_buf[RBD_MAX_CONF_SIZE]; + char *clientname; + int r; + + if (qemu_rbd_parsename(filename, pool, sizeof(pool), + snap_buf, sizeof(snap_buf), + s->name, sizeof(s->name), + conf, sizeof(conf)) < 0) { + return -EINVAL; + } + + clientname = qemu_rbd_parse_clientname(conf, clientname_buf); + r = rados_create(&s->cluster, clientname); + if (r < 0) { + error_report("error initializing"); + return r; + } + + s->snap = NULL; + if (snap_buf[0] != '\0') { + s->snap = g_strdup(snap_buf); + } + + /* + * Fallback to more conservative semantics if setting cache + * options fails. Ignore errors from setting rbd_cache because the + * only possible error is that the option does not exist, and + * librbd defaults to no caching. If write through caching cannot + * be set up, fall back to no caching. + */ + if (flags & BDRV_O_NOCACHE) { + rados_conf_set(s->cluster, "rbd_cache", "false"); + } else { + rados_conf_set(s->cluster, "rbd_cache", "true"); + if (!(flags & BDRV_O_CACHE_WB)) { + r = rados_conf_set(s->cluster, "rbd_cache_max_dirty", "0"); + if (r < 0) { + rados_conf_set(s->cluster, "rbd_cache", "false"); + } + } + } + + if (strstr(conf, "conf=") == NULL) { + /* try default location, but ignore failure */ + rados_conf_read_file(s->cluster, NULL); + } + + if (conf[0] != '\0') { + r = qemu_rbd_set_conf(s->cluster, conf); + if (r < 0) { + error_report("error setting config options"); + goto failed_shutdown; + } + } + + r = rados_connect(s->cluster); + if (r < 0) { + error_report("error connecting"); + goto failed_shutdown; + } + + r = rados_ioctx_create(s->cluster, pool, &s->io_ctx); + if (r < 0) { + error_report("error opening pool %s", pool); + goto failed_shutdown; + } + + r = rbd_open(s->io_ctx, s->name, &s->image, s->snap); + if (r < 0) { + error_report("error reading header from %s", s->name); + goto failed_open; + } + + bs->read_only = (s->snap != NULL); + + s->event_reader_pos = 0; + r = qemu_pipe(s->fds); + if (r < 0) { + error_report("error opening eventfd"); + goto failed; + } + fcntl(s->fds[0], F_SETFL, O_NONBLOCK); + fcntl(s->fds[1], F_SETFL, O_NONBLOCK); + qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], qemu_rbd_aio_event_reader, + NULL, qemu_rbd_aio_flush_cb, s); + + + return 0; + +failed: + rbd_close(s->image); +failed_open: + rados_ioctx_destroy(s->io_ctx); +failed_shutdown: + rados_shutdown(s->cluster); + g_free(s->snap); + return r; +} + +static void qemu_rbd_close(BlockDriverState *bs) +{ + BDRVRBDState *s = bs->opaque; + + close(s->fds[0]); + close(s->fds[1]); + qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], NULL, NULL, NULL, NULL); + + rbd_close(s->image); + rados_ioctx_destroy(s->io_ctx); + g_free(s->snap); + rados_shutdown(s->cluster); +} + +/* + * Cancel aio. Since we don't reference acb in a non qemu threads, + * it is safe to access it here. + */ +static void qemu_rbd_aio_cancel(BlockDriverAIOCB *blockacb) +{ + RBDAIOCB *acb = (RBDAIOCB *) blockacb; + acb->cancelled = 1; +} + +static AIOPool rbd_aio_pool = { + .aiocb_size = sizeof(RBDAIOCB), + .cancel = qemu_rbd_aio_cancel, +}; + +static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB *rcb) +{ + int ret = 0; + while (1) { + fd_set wfd; + int fd = s->fds[RBD_FD_WRITE]; + + /* send the op pointer to the qemu thread that is responsible + for the aio/op completion. Must do it in a qemu thread context */ + ret = write(fd, (void *)&rcb, sizeof(rcb)); + if (ret >= 0) { + break; + } + if (errno == EINTR) { + continue; + } + if (errno != EAGAIN) { + break; + } + + FD_ZERO(&wfd); + FD_SET(fd, &wfd); + do { + ret = select(fd + 1, NULL, &wfd, NULL, NULL); + } while (ret < 0 && errno == EINTR); + } + + return ret; +} + +/* + * This is the callback function for rbd_aio_read and _write + * + * Note: this function is being called from a non qemu thread so + * we need to be careful about what we do here. Generally we only + * write to the block notification pipe, and do the rest of the + * io completion handling from qemu_rbd_aio_event_reader() which + * runs in a qemu context. + */ +static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb) +{ + int ret; + rcb->ret = rbd_aio_get_return_value(c); + rbd_aio_release(c); + ret = qemu_rbd_send_pipe(rcb->s, rcb); + if (ret < 0) { + error_report("failed writing to acb->s->fds"); + g_free(rcb); + } +} + +/* Callback when all queued rbd_aio requests are complete */ + +static void rbd_aio_bh_cb(void *opaque) +{ + RBDAIOCB *acb = opaque; + + if (acb->cmd == RBD_AIO_READ) { + qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + } + qemu_vfree(acb->bounce); + acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); + qemu_bh_delete(acb->bh); + acb->bh = NULL; + + qemu_aio_release(acb); +} + +static int rbd_aio_discard_wrapper(rbd_image_t image, + uint64_t off, + uint64_t len, + rbd_completion_t comp) +{ +#ifdef LIBRBD_SUPPORTS_DISCARD + return rbd_aio_discard(image, off, len, comp); +#else + return -ENOTSUP; +#endif +} + +static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque, + RBDAIOCmd cmd) +{ + RBDAIOCB *acb; + RADOSCB *rcb; + rbd_completion_t c; + int64_t off, size; + char *buf; + int r; + + BDRVRBDState *s = bs->opaque; + + acb = qemu_aio_get(&rbd_aio_pool, bs, cb, opaque); + acb->cmd = cmd; + acb->qiov = qiov; + if (cmd == RBD_AIO_DISCARD) { + acb->bounce = NULL; + } else { + acb->bounce = qemu_blockalign(bs, qiov->size); + } + acb->ret = 0; + acb->error = 0; + acb->s = s; + acb->cancelled = 0; + acb->bh = NULL; + + if (cmd == RBD_AIO_WRITE) { + qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); + } + + buf = acb->bounce; + + off = sector_num * BDRV_SECTOR_SIZE; + size = nb_sectors * BDRV_SECTOR_SIZE; + + s->qemu_aio_count++; /* All the RADOSCB */ + + rcb = g_malloc(sizeof(RADOSCB)); + rcb->done = 0; + rcb->acb = acb; + rcb->buf = buf; + rcb->s = acb->s; + rcb->size = size; + r = rbd_aio_create_completion(rcb, (rbd_callback_t) rbd_finish_aiocb, &c); + if (r < 0) { + goto failed; + } + + switch (cmd) { + case RBD_AIO_WRITE: + r = rbd_aio_write(s->image, off, size, buf, c); + break; + case RBD_AIO_READ: + r = rbd_aio_read(s->image, off, size, buf, c); + break; + case RBD_AIO_DISCARD: + r = rbd_aio_discard_wrapper(s->image, off, size, c); + break; + default: + r = -EINVAL; + } + + if (r < 0) { + goto failed; + } + + return &acb->common; + +failed: + g_free(rcb); + s->qemu_aio_count--; + qemu_aio_release(acb); + return NULL; +} + +static BlockDriverAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque, + RBD_AIO_READ); +} + +static BlockDriverAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque, + RBD_AIO_WRITE); +} + +static int qemu_rbd_co_flush(BlockDriverState *bs) +{ +#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 1) + /* rbd_flush added in 0.1.1 */ + BDRVRBDState *s = bs->opaque; + return rbd_flush(s->image); +#else + return 0; +#endif +} + +static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVRBDState *s = bs->opaque; + rbd_image_info_t info; + int r; + + r = rbd_stat(s->image, &info, sizeof(info)); + if (r < 0) { + return r; + } + + bdi->cluster_size = info.obj_size; + return 0; +} + +static int64_t qemu_rbd_getlength(BlockDriverState *bs) +{ + BDRVRBDState *s = bs->opaque; + rbd_image_info_t info; + int r; + + r = rbd_stat(s->image, &info, sizeof(info)); + if (r < 0) { + return r; + } + + return info.size; +} + +static int qemu_rbd_truncate(BlockDriverState *bs, int64_t offset) +{ + BDRVRBDState *s = bs->opaque; + int r; + + r = rbd_resize(s->image, offset); + if (r < 0) { + return r; + } + + return 0; +} + +static int qemu_rbd_snap_create(BlockDriverState *bs, + QEMUSnapshotInfo *sn_info) +{ + BDRVRBDState *s = bs->opaque; + int r; + + if (sn_info->name[0] == '\0') { + return -EINVAL; /* we need a name for rbd snapshots */ + } + + /* + * rbd snapshots are using the name as the user controlled unique identifier + * we can't use the rbd snapid for that purpose, as it can't be set + */ + if (sn_info->id_str[0] != '\0' && + strcmp(sn_info->id_str, sn_info->name) != 0) { + return -EINVAL; + } + + if (strlen(sn_info->name) >= sizeof(sn_info->id_str)) { + return -ERANGE; + } + + r = rbd_snap_create(s->image, sn_info->name); + if (r < 0) { + error_report("failed to create snap: %s", strerror(-r)); + return r; + } + + return 0; +} + +static int qemu_rbd_snap_remove(BlockDriverState *bs, + const char *snapshot_name) +{ + BDRVRBDState *s = bs->opaque; + int r; + + r = rbd_snap_remove(s->image, snapshot_name); + return r; +} + +static int qemu_rbd_snap_rollback(BlockDriverState *bs, + const char *snapshot_name) +{ + BDRVRBDState *s = bs->opaque; + int r; + + r = rbd_snap_rollback(s->image, snapshot_name); + return r; +} + +static int qemu_rbd_snap_list(BlockDriverState *bs, + QEMUSnapshotInfo **psn_tab) +{ + BDRVRBDState *s = bs->opaque; + QEMUSnapshotInfo *sn_info, *sn_tab = NULL; + int i, snap_count; + rbd_snap_info_t *snaps; + int max_snaps = RBD_MAX_SNAPS; + + do { + snaps = g_malloc(sizeof(*snaps) * max_snaps); + snap_count = rbd_snap_list(s->image, snaps, &max_snaps); + if (snap_count < 0) { + g_free(snaps); + } + } while (snap_count == -ERANGE); + + if (snap_count <= 0) { + goto done; + } + + sn_tab = g_malloc0(snap_count * sizeof(QEMUSnapshotInfo)); + + for (i = 0; i < snap_count; i++) { + const char *snap_name = snaps[i].name; + + sn_info = sn_tab + i; + pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), snap_name); + pstrcpy(sn_info->name, sizeof(sn_info->name), snap_name); + + sn_info->vm_state_size = snaps[i].size; + sn_info->date_sec = 0; + sn_info->date_nsec = 0; + sn_info->vm_clock_nsec = 0; + } + rbd_snap_list_end(snaps); + + done: + *psn_tab = sn_tab; + return snap_count; +} + +#ifdef LIBRBD_SUPPORTS_DISCARD +static BlockDriverAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + return rbd_start_aio(bs, sector_num, NULL, nb_sectors, cb, opaque, + RBD_AIO_DISCARD); +} +#endif + +static QEMUOptionParameter qemu_rbd_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = OPT_SIZE, + .help = "RBD object size" + }, + {NULL} +}; + +static BlockDriver bdrv_rbd = { + .format_name = "rbd", + .instance_size = sizeof(BDRVRBDState), + .bdrv_file_open = qemu_rbd_open, + .bdrv_close = qemu_rbd_close, + .bdrv_create = qemu_rbd_create, + .bdrv_get_info = qemu_rbd_getinfo, + .create_options = qemu_rbd_create_options, + .bdrv_getlength = qemu_rbd_getlength, + .bdrv_truncate = qemu_rbd_truncate, + .protocol_name = "rbd", + + .bdrv_aio_readv = qemu_rbd_aio_readv, + .bdrv_aio_writev = qemu_rbd_aio_writev, + .bdrv_co_flush_to_disk = qemu_rbd_co_flush, + +#ifdef LIBRBD_SUPPORTS_DISCARD + .bdrv_aio_discard = qemu_rbd_aio_discard, +#endif + + .bdrv_snapshot_create = qemu_rbd_snap_create, + .bdrv_snapshot_delete = qemu_rbd_snap_remove, + .bdrv_snapshot_list = qemu_rbd_snap_list, + .bdrv_snapshot_goto = qemu_rbd_snap_rollback, +}; + +static void bdrv_rbd_init(void) +{ + bdrv_register(&bdrv_rbd); +} + +block_init(bdrv_rbd_init); diff --git a/block/sheepdog.c b/block/sheepdog.c new file mode 100644 index 000000000..df4f44107 --- /dev/null +++ b/block/sheepdog.c @@ -0,0 +1,2083 @@ +/* + * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu-common.h" +#include "qemu-error.h" +#include "qemu_socket.h" +#include "block_int.h" +#include "bitops.h" + +#define SD_PROTO_VER 0x01 + +#define SD_DEFAULT_ADDR "localhost" +#define SD_DEFAULT_PORT "7000" + +#define SD_OP_CREATE_AND_WRITE_OBJ 0x01 +#define SD_OP_READ_OBJ 0x02 +#define SD_OP_WRITE_OBJ 0x03 + +#define SD_OP_NEW_VDI 0x11 +#define SD_OP_LOCK_VDI 0x12 +#define SD_OP_RELEASE_VDI 0x13 +#define SD_OP_GET_VDI_INFO 0x14 +#define SD_OP_READ_VDIS 0x15 +#define SD_OP_FLUSH_VDI 0x16 + +#define SD_FLAG_CMD_WRITE 0x01 +#define SD_FLAG_CMD_COW 0x02 +#define SD_FLAG_CMD_CACHE 0x04 + +#define SD_RES_SUCCESS 0x00 /* Success */ +#define SD_RES_UNKNOWN 0x01 /* Unknown error */ +#define SD_RES_NO_OBJ 0x02 /* No object found */ +#define SD_RES_EIO 0x03 /* I/O error */ +#define SD_RES_VDI_EXIST 0x04 /* Vdi exists already */ +#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */ +#define SD_RES_SYSTEM_ERROR 0x06 /* System error */ +#define SD_RES_VDI_LOCKED 0x07 /* Vdi is locked */ +#define SD_RES_NO_VDI 0x08 /* No vdi found */ +#define SD_RES_NO_BASE_VDI 0x09 /* No base vdi found */ +#define SD_RES_VDI_READ 0x0A /* Cannot read requested vdi */ +#define SD_RES_VDI_WRITE 0x0B /* Cannot write requested vdi */ +#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */ +#define SD_RES_BASE_VDI_WRITE 0x0D /* Cannot write base vdi */ +#define SD_RES_NO_TAG 0x0E /* Requested tag is not found */ +#define SD_RES_STARTUP 0x0F /* Sheepdog is on starting up */ +#define SD_RES_VDI_NOT_LOCKED 0x10 /* Vdi is not locked */ +#define SD_RES_SHUTDOWN 0x11 /* Sheepdog is shutting down */ +#define SD_RES_NO_MEM 0x12 /* Cannot allocate memory */ +#define SD_RES_FULL_VDI 0x13 /* we already have the maximum vdis */ +#define SD_RES_VER_MISMATCH 0x14 /* Protocol version mismatch */ +#define SD_RES_NO_SPACE 0x15 /* Server has no room for new objects */ +#define SD_RES_WAIT_FOR_FORMAT 0x16 /* Waiting for a format operation */ +#define SD_RES_WAIT_FOR_JOIN 0x17 /* Waiting for other nodes joining */ +#define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */ + +/* + * Object ID rules + * + * 0 - 19 (20 bits): data object space + * 20 - 31 (12 bits): reserved data object space + * 32 - 55 (24 bits): vdi object space + * 56 - 59 ( 4 bits): reserved vdi object space + * 60 - 63 ( 4 bits): object type identifier space + */ + +#define VDI_SPACE_SHIFT 32 +#define VDI_BIT (UINT64_C(1) << 63) +#define VMSTATE_BIT (UINT64_C(1) << 62) +#define MAX_DATA_OBJS (UINT64_C(1) << 20) +#define MAX_CHILDREN 1024 +#define SD_MAX_VDI_LEN 256 +#define SD_MAX_VDI_TAG_LEN 256 +#define SD_NR_VDIS (1U << 24) +#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22) +#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS) +#define SECTOR_SIZE 512 + +#define SD_INODE_SIZE (sizeof(SheepdogInode)) +#define CURRENT_VDI_ID 0 + +typedef struct SheepdogReq { + uint8_t proto_ver; + uint8_t opcode; + uint16_t flags; + uint32_t epoch; + uint32_t id; + uint32_t data_length; + uint32_t opcode_specific[8]; +} SheepdogReq; + +typedef struct SheepdogRsp { + uint8_t proto_ver; + uint8_t opcode; + uint16_t flags; + uint32_t epoch; + uint32_t id; + uint32_t data_length; + uint32_t result; + uint32_t opcode_specific[7]; +} SheepdogRsp; + +typedef struct SheepdogObjReq { + uint8_t proto_ver; + uint8_t opcode; + uint16_t flags; + uint32_t epoch; + uint32_t id; + uint32_t data_length; + uint64_t oid; + uint64_t cow_oid; + uint32_t copies; + uint32_t rsvd; + uint64_t offset; +} SheepdogObjReq; + +typedef struct SheepdogObjRsp { + uint8_t proto_ver; + uint8_t opcode; + uint16_t flags; + uint32_t epoch; + uint32_t id; + uint32_t data_length; + uint32_t result; + uint32_t copies; + uint32_t pad[6]; +} SheepdogObjRsp; + +typedef struct SheepdogVdiReq { + uint8_t proto_ver; + uint8_t opcode; + uint16_t flags; + uint32_t epoch; + uint32_t id; + uint32_t data_length; + uint64_t vdi_size; + uint32_t base_vdi_id; + uint32_t copies; + uint32_t snapid; + uint32_t pad[3]; +} SheepdogVdiReq; + +typedef struct SheepdogVdiRsp { + uint8_t proto_ver; + uint8_t opcode; + uint16_t flags; + uint32_t epoch; + uint32_t id; + uint32_t data_length; + uint32_t result; + uint32_t rsvd; + uint32_t vdi_id; + uint32_t pad[5]; +} SheepdogVdiRsp; + +typedef struct SheepdogInode { + char name[SD_MAX_VDI_LEN]; + char tag[SD_MAX_VDI_TAG_LEN]; + uint64_t ctime; + uint64_t snap_ctime; + uint64_t vm_clock_nsec; + uint64_t vdi_size; + uint64_t vm_state_size; + uint16_t copy_policy; + uint8_t nr_copies; + uint8_t block_size_shift; + uint32_t snap_id; + uint32_t vdi_id; + uint32_t parent_vdi_id; + uint32_t child_vdi_id[MAX_CHILDREN]; + uint32_t data_vdi_id[MAX_DATA_OBJS]; +} SheepdogInode; + +/* + * 64 bit FNV-1a non-zero initial basis + */ +#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL) + +/* + * 64 bit Fowler/Noll/Vo FNV-1a hash code + */ +static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval) +{ + unsigned char *bp = buf; + unsigned char *be = bp + len; + while (bp < be) { + hval ^= (uint64_t) *bp++; + hval += (hval << 1) + (hval << 4) + (hval << 5) + + (hval << 7) + (hval << 8) + (hval << 40); + } + return hval; +} + +static inline int is_data_obj_writable(SheepdogInode *inode, unsigned int idx) +{ + return inode->vdi_id == inode->data_vdi_id[idx]; +} + +static inline int is_data_obj(uint64_t oid) +{ + return !(VDI_BIT & oid); +} + +static inline uint64_t data_oid_to_idx(uint64_t oid) +{ + return oid & (MAX_DATA_OBJS - 1); +} + +static inline uint64_t vid_to_vdi_oid(uint32_t vid) +{ + return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT); +} + +static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx) +{ + return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; +} + +static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx) +{ + return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; +} + +static inline int is_snapshot(struct SheepdogInode *inode) +{ + return !!inode->snap_ctime; +} + +#undef dprintf +#ifdef DEBUG_SDOG +#define dprintf(fmt, args...) \ + do { \ + fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \ + } while (0) +#else +#define dprintf(fmt, args...) +#endif + +typedef struct SheepdogAIOCB SheepdogAIOCB; + +typedef struct AIOReq { + SheepdogAIOCB *aiocb; + unsigned int iov_offset; + + uint64_t oid; + uint64_t base_oid; + uint64_t offset; + unsigned int data_len; + uint8_t flags; + uint32_t id; + + QLIST_ENTRY(AIOReq) aio_siblings; +} AIOReq; + +enum AIOCBState { + AIOCB_WRITE_UDATA, + AIOCB_READ_UDATA, +}; + +struct SheepdogAIOCB { + BlockDriverAIOCB common; + + QEMUIOVector *qiov; + + int64_t sector_num; + int nb_sectors; + + int ret; + enum AIOCBState aiocb_type; + + Coroutine *coroutine; + void (*aio_done_func)(SheepdogAIOCB *); + + int canceled; + int nr_pending; +}; + +typedef struct BDRVSheepdogState { + SheepdogInode inode; + + uint32_t min_dirty_data_idx; + uint32_t max_dirty_data_idx; + + char name[SD_MAX_VDI_LEN]; + int is_snapshot; + uint8_t cache_enabled; + + char *addr; + char *port; + int fd; + int flush_fd; + + CoMutex lock; + Coroutine *co_send; + Coroutine *co_recv; + + uint32_t aioreq_seq_num; + QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head; + QLIST_HEAD(pending_aio_head, AIOReq) pending_aio_head; +} BDRVSheepdogState; + +static const char * sd_strerror(int err) +{ + int i; + + static const struct { + int err; + const char *desc; + } errors[] = { + {SD_RES_SUCCESS, "Success"}, + {SD_RES_UNKNOWN, "Unknown error"}, + {SD_RES_NO_OBJ, "No object found"}, + {SD_RES_EIO, "I/O error"}, + {SD_RES_VDI_EXIST, "VDI exists already"}, + {SD_RES_INVALID_PARMS, "Invalid parameters"}, + {SD_RES_SYSTEM_ERROR, "System error"}, + {SD_RES_VDI_LOCKED, "VDI is already locked"}, + {SD_RES_NO_VDI, "No vdi found"}, + {SD_RES_NO_BASE_VDI, "No base VDI found"}, + {SD_RES_VDI_READ, "Failed read the requested VDI"}, + {SD_RES_VDI_WRITE, "Failed to write the requested VDI"}, + {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"}, + {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"}, + {SD_RES_NO_TAG, "Failed to find the requested tag"}, + {SD_RES_STARTUP, "The system is still booting"}, + {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"}, + {SD_RES_SHUTDOWN, "The system is shutting down"}, + {SD_RES_NO_MEM, "Out of memory on the server"}, + {SD_RES_FULL_VDI, "We already have the maximum vdis"}, + {SD_RES_VER_MISMATCH, "Protocol version mismatch"}, + {SD_RES_NO_SPACE, "Server has no space for new objects"}, + {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"}, + {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"}, + {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"}, + }; + + for (i = 0; i < ARRAY_SIZE(errors); ++i) { + if (errors[i].err == err) { + return errors[i].desc; + } + } + + return "Invalid error code"; +} + +/* + * Sheepdog I/O handling: + * + * 1. In sd_co_rw_vector, we send the I/O requests to the server and + * link the requests to the inflight_list in the + * BDRVSheepdogState. The function exits without waiting for + * receiving the response. + * + * 2. We receive the response in aio_read_response, the fd handler to + * the sheepdog connection. If metadata update is needed, we send + * the write request to the vdi object in sd_write_done, the write + * completion function. We switch back to sd_co_readv/writev after + * all the requests belonging to the AIOCB are finished. + */ + +static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb, + uint64_t oid, unsigned int data_len, + uint64_t offset, uint8_t flags, + uint64_t base_oid, unsigned int iov_offset) +{ + AIOReq *aio_req; + + aio_req = g_malloc(sizeof(*aio_req)); + aio_req->aiocb = acb; + aio_req->iov_offset = iov_offset; + aio_req->oid = oid; + aio_req->base_oid = base_oid; + aio_req->offset = offset; + aio_req->data_len = data_len; + aio_req->flags = flags; + aio_req->id = s->aioreq_seq_num++; + + acb->nr_pending++; + return aio_req; +} + +static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req) +{ + SheepdogAIOCB *acb = aio_req->aiocb; + + QLIST_REMOVE(aio_req, aio_siblings); + g_free(aio_req); + + acb->nr_pending--; +} + +static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb) +{ + if (!acb->canceled) { + qemu_coroutine_enter(acb->coroutine, NULL); + } + qemu_aio_release(acb); +} + +static void sd_aio_cancel(BlockDriverAIOCB *blockacb) +{ + SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb; + + /* + * Sheepdog cannot cancel the requests which are already sent to + * the servers, so we just complete the request with -EIO here. + */ + acb->ret = -EIO; + qemu_coroutine_enter(acb->coroutine, NULL); + acb->canceled = 1; +} + +static AIOPool sd_aio_pool = { + .aiocb_size = sizeof(SheepdogAIOCB), + .cancel = sd_aio_cancel, +}; + +static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t sector_num, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + SheepdogAIOCB *acb; + + acb = qemu_aio_get(&sd_aio_pool, bs, cb, opaque); + + acb->qiov = qiov; + + acb->sector_num = sector_num; + acb->nb_sectors = nb_sectors; + + acb->aio_done_func = NULL; + acb->canceled = 0; + acb->coroutine = qemu_coroutine_self(); + acb->ret = 0; + acb->nr_pending = 0; + return acb; +} + +static int connect_to_sdog(const char *addr, const char *port) +{ + char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV]; + int fd, ret; + struct addrinfo hints, *res, *res0; + + if (!addr) { + addr = SD_DEFAULT_ADDR; + port = SD_DEFAULT_PORT; + } + + memset(&hints, 0, sizeof(hints)); + hints.ai_socktype = SOCK_STREAM; + + ret = getaddrinfo(addr, port, &hints, &res0); + if (ret) { + error_report("unable to get address info %s, %s", + addr, strerror(errno)); + return -errno; + } + + for (res = res0; res; res = res->ai_next) { + ret = getnameinfo(res->ai_addr, res->ai_addrlen, hbuf, sizeof(hbuf), + sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV); + if (ret) { + continue; + } + + fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (fd < 0) { + continue; + } + + reconnect: + ret = connect(fd, res->ai_addr, res->ai_addrlen); + if (ret < 0) { + if (errno == EINTR) { + goto reconnect; + } + close(fd); + break; + } + + dprintf("connected to %s:%s\n", addr, port); + goto success; + } + fd = -errno; + error_report("failed connect to %s:%s", addr, port); +success: + freeaddrinfo(res0); + return fd; +} + +static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data, + unsigned int *wlen) +{ + int ret; + + ret = qemu_co_send(sockfd, hdr, sizeof(*hdr)); + if (ret < sizeof(*hdr)) { + error_report("failed to send a req, %s", strerror(errno)); + return ret; + } + + ret = qemu_co_send(sockfd, data, *wlen); + if (ret < *wlen) { + error_report("failed to send a req, %s", strerror(errno)); + } + + return ret; +} + +static void restart_co_req(void *opaque) +{ + Coroutine *co = opaque; + + qemu_coroutine_enter(co, NULL); +} + +typedef struct SheepdogReqCo { + int sockfd; + SheepdogReq *hdr; + void *data; + unsigned int *wlen; + unsigned int *rlen; + int ret; + bool finished; +} SheepdogReqCo; + +static coroutine_fn void do_co_req(void *opaque) +{ + int ret; + Coroutine *co; + SheepdogReqCo *srco = opaque; + int sockfd = srco->sockfd; + SheepdogReq *hdr = srco->hdr; + void *data = srco->data; + unsigned int *wlen = srco->wlen; + unsigned int *rlen = srco->rlen; + + co = qemu_coroutine_self(); + qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, NULL, co); + + socket_set_block(sockfd); + ret = send_co_req(sockfd, hdr, data, wlen); + if (ret < 0) { + goto out; + } + + qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, NULL, co); + + ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr)); + if (ret < sizeof(*hdr)) { + error_report("failed to get a rsp, %s", strerror(errno)); + ret = -errno; + goto out; + } + + if (*rlen > hdr->data_length) { + *rlen = hdr->data_length; + } + + if (*rlen) { + ret = qemu_co_recv(sockfd, data, *rlen); + if (ret < *rlen) { + error_report("failed to get the data, %s", strerror(errno)); + ret = -errno; + goto out; + } + } + ret = 0; +out: + qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL, NULL); + socket_set_nonblock(sockfd); + + srco->ret = ret; + srco->finished = true; +} + +static int do_req(int sockfd, SheepdogReq *hdr, void *data, + unsigned int *wlen, unsigned int *rlen) +{ + Coroutine *co; + SheepdogReqCo srco = { + .sockfd = sockfd, + .hdr = hdr, + .data = data, + .wlen = wlen, + .rlen = rlen, + .ret = 0, + .finished = false, + }; + + if (qemu_in_coroutine()) { + do_co_req(&srco); + } else { + co = qemu_coroutine_create(do_co_req); + qemu_coroutine_enter(co, &srco); + while (!srco.finished) { + qemu_aio_wait(); + } + } + + return srco.ret; +} + +static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, + struct iovec *iov, int niov, int create, + enum AIOCBState aiocb_type); + + +static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid) +{ + AIOReq *aio_req; + + QLIST_FOREACH(aio_req, &s->pending_aio_head, aio_siblings) { + if (aio_req->oid == oid) { + return aio_req; + } + } + + return NULL; +} + +/* + * This function searchs pending requests to the object `oid', and + * sends them. + */ +static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid) +{ + AIOReq *aio_req; + SheepdogAIOCB *acb; + int ret; + + while ((aio_req = find_pending_req(s, oid)) != NULL) { + acb = aio_req->aiocb; + /* move aio_req from pending list to inflight one */ + QLIST_REMOVE(aio_req, aio_siblings); + QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); + ret = add_aio_request(s, aio_req, acb->qiov->iov, + acb->qiov->niov, 0, acb->aiocb_type); + if (ret < 0) { + error_report("add_aio_request is failed"); + free_aio_req(s, aio_req); + if (!acb->nr_pending) { + sd_finish_aiocb(acb); + } + } + } +} + +/* + * Receive responses of the I/O requests. + * + * This function is registered as a fd handler, and called from the + * main loop when s->fd is ready for reading responses. + */ +static void coroutine_fn aio_read_response(void *opaque) +{ + SheepdogObjRsp rsp; + BDRVSheepdogState *s = opaque; + int fd = s->fd; + int ret; + AIOReq *aio_req = NULL; + SheepdogAIOCB *acb; + unsigned long idx; + + if (QLIST_EMPTY(&s->inflight_aio_head)) { + goto out; + } + + /* read a header */ + ret = qemu_co_recv(fd, &rsp, sizeof(rsp)); + if (ret < 0) { + error_report("failed to get the header, %s", strerror(errno)); + goto out; + } + + /* find the right aio_req from the inflight aio list */ + QLIST_FOREACH(aio_req, &s->inflight_aio_head, aio_siblings) { + if (aio_req->id == rsp.id) { + break; + } + } + if (!aio_req) { + error_report("cannot find aio_req %x", rsp.id); + goto out; + } + + acb = aio_req->aiocb; + + switch (acb->aiocb_type) { + case AIOCB_WRITE_UDATA: + /* this coroutine context is no longer suitable for co_recv + * because we may send data to update vdi objects */ + s->co_recv = NULL; + if (!is_data_obj(aio_req->oid)) { + break; + } + idx = data_oid_to_idx(aio_req->oid); + + if (s->inode.data_vdi_id[idx] != s->inode.vdi_id) { + /* + * If the object is newly created one, we need to update + * the vdi object (metadata object). min_dirty_data_idx + * and max_dirty_data_idx are changed to include updated + * index between them. + */ + s->inode.data_vdi_id[idx] = s->inode.vdi_id; + s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx); + s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx); + + /* + * Some requests may be blocked because simultaneous + * create requests are not allowed, so we search the + * pending requests here. + */ + send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx)); + } + break; + case AIOCB_READ_UDATA: + ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov, + aio_req->iov_offset, rsp.data_length); + if (ret < 0) { + error_report("failed to get the data, %s", strerror(errno)); + goto out; + } + break; + } + + if (rsp.result != SD_RES_SUCCESS) { + acb->ret = -EIO; + error_report("%s", sd_strerror(rsp.result)); + } + + free_aio_req(s, aio_req); + if (!acb->nr_pending) { + /* + * We've finished all requests which belong to the AIOCB, so + * we can switch back to sd_co_readv/writev now. + */ + acb->aio_done_func(acb); + } +out: + s->co_recv = NULL; +} + +static void co_read_response(void *opaque) +{ + BDRVSheepdogState *s = opaque; + + if (!s->co_recv) { + s->co_recv = qemu_coroutine_create(aio_read_response); + } + + qemu_coroutine_enter(s->co_recv, opaque); +} + +static void co_write_request(void *opaque) +{ + BDRVSheepdogState *s = opaque; + + qemu_coroutine_enter(s->co_send, NULL); +} + +static int aio_flush_request(void *opaque) +{ + BDRVSheepdogState *s = opaque; + + return !QLIST_EMPTY(&s->inflight_aio_head) || + !QLIST_EMPTY(&s->pending_aio_head); +} + +static int set_nodelay(int fd) +{ + int ret, opt; + + opt = 1; + ret = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)&opt, sizeof(opt)); + return ret; +} + +/* + * Return a socket discriptor to read/write objects. + * + * We cannot use this discriptor for other operations because + * the block driver may be on waiting response from the server. + */ +static int get_sheep_fd(BDRVSheepdogState *s) +{ + int ret, fd; + + fd = connect_to_sdog(s->addr, s->port); + if (fd < 0) { + error_report("%s", strerror(errno)); + return fd; + } + + socket_set_nonblock(fd); + + ret = set_nodelay(fd); + if (ret) { + error_report("%s", strerror(errno)); + closesocket(fd); + return -errno; + } + + qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request, s); + return fd; +} + +/* + * Parse a filename + * + * filename must be one of the following formats: + * 1. [vdiname] + * 2. [vdiname]:[snapid] + * 3. [vdiname]:[tag] + * 4. [hostname]:[port]:[vdiname] + * 5. [hostname]:[port]:[vdiname]:[snapid] + * 6. [hostname]:[port]:[vdiname]:[tag] + * + * You can boot from the snapshot images by specifying `snapid` or + * `tag'. + * + * You can run VMs outside the Sheepdog cluster by specifying + * `hostname' and `port' (experimental). + */ +static int parse_vdiname(BDRVSheepdogState *s, const char *filename, + char *vdi, uint32_t *snapid, char *tag) +{ + char *p, *q; + int nr_sep; + + p = q = g_strdup(filename); + + /* count the number of separators */ + nr_sep = 0; + while (*p) { + if (*p == ':') { + nr_sep++; + } + p++; + } + p = q; + + /* use the first two tokens as hostname and port number. */ + if (nr_sep >= 2) { + s->addr = p; + p = strchr(p, ':'); + *p++ = '\0'; + + s->port = p; + p = strchr(p, ':'); + *p++ = '\0'; + } else { + s->addr = NULL; + s->port = 0; + } + + strncpy(vdi, p, SD_MAX_VDI_LEN); + + p = strchr(vdi, ':'); + if (p) { + *p++ = '\0'; + *snapid = strtoul(p, NULL, 10); + if (*snapid == 0) { + strncpy(tag, p, SD_MAX_VDI_TAG_LEN); + } + } else { + *snapid = CURRENT_VDI_ID; /* search current vdi */ + } + + if (s->addr == NULL) { + g_free(q); + } + + return 0; +} + +static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid, + char *tag, uint32_t *vid, int for_snapshot) +{ + int ret, fd; + SheepdogVdiReq hdr; + SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; + unsigned int wlen, rlen = 0; + char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN]; + + fd = connect_to_sdog(s->addr, s->port); + if (fd < 0) { + return fd; + } + + memset(buf, 0, sizeof(buf)); + strncpy(buf, filename, SD_MAX_VDI_LEN); + strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN); + + memset(&hdr, 0, sizeof(hdr)); + if (for_snapshot) { + hdr.opcode = SD_OP_GET_VDI_INFO; + } else { + hdr.opcode = SD_OP_LOCK_VDI; + } + wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN; + hdr.proto_ver = SD_PROTO_VER; + hdr.data_length = wlen; + hdr.snapid = snapid; + hdr.flags = SD_FLAG_CMD_WRITE; + + ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen); + if (ret) { + goto out; + } + + if (rsp->result != SD_RES_SUCCESS) { + error_report("cannot get vdi info, %s, %s %d %s", + sd_strerror(rsp->result), filename, snapid, tag); + if (rsp->result == SD_RES_NO_VDI) { + ret = -ENOENT; + } else { + ret = -EIO; + } + goto out; + } + *vid = rsp->vdi_id; + + ret = 0; +out: + closesocket(fd); + return ret; +} + +static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, + struct iovec *iov, int niov, int create, + enum AIOCBState aiocb_type) +{ + int nr_copies = s->inode.nr_copies; + SheepdogObjReq hdr; + unsigned int wlen; + int ret; + uint64_t oid = aio_req->oid; + unsigned int datalen = aio_req->data_len; + uint64_t offset = aio_req->offset; + uint8_t flags = aio_req->flags; + uint64_t old_oid = aio_req->base_oid; + + if (!nr_copies) { + error_report("bug"); + } + + memset(&hdr, 0, sizeof(hdr)); + + if (aiocb_type == AIOCB_READ_UDATA) { + wlen = 0; + hdr.opcode = SD_OP_READ_OBJ; + hdr.flags = flags; + } else if (create) { + wlen = datalen; + hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ; + hdr.flags = SD_FLAG_CMD_WRITE | flags; + } else { + wlen = datalen; + hdr.opcode = SD_OP_WRITE_OBJ; + hdr.flags = SD_FLAG_CMD_WRITE | flags; + } + + if (s->cache_enabled) { + hdr.flags |= SD_FLAG_CMD_CACHE; + } + + hdr.oid = oid; + hdr.cow_oid = old_oid; + hdr.copies = s->inode.nr_copies; + + hdr.data_length = datalen; + hdr.offset = offset; + + hdr.id = aio_req->id; + + qemu_co_mutex_lock(&s->lock); + s->co_send = qemu_coroutine_self(); + qemu_aio_set_fd_handler(s->fd, co_read_response, co_write_request, + aio_flush_request, s); + socket_set_cork(s->fd, 1); + + /* send a header */ + ret = qemu_co_send(s->fd, &hdr, sizeof(hdr)); + if (ret < 0) { + qemu_co_mutex_unlock(&s->lock); + error_report("failed to send a req, %s", strerror(errno)); + return -errno; + } + + if (wlen) { + ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset, wlen); + if (ret < 0) { + qemu_co_mutex_unlock(&s->lock); + error_report("failed to send a data, %s", strerror(errno)); + return -errno; + } + } + + socket_set_cork(s->fd, 0); + qemu_aio_set_fd_handler(s->fd, co_read_response, NULL, + aio_flush_request, s); + qemu_co_mutex_unlock(&s->lock); + + return 0; +} + +static int read_write_object(int fd, char *buf, uint64_t oid, int copies, + unsigned int datalen, uint64_t offset, + int write, int create, uint8_t cache) +{ + SheepdogObjReq hdr; + SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr; + unsigned int wlen, rlen; + int ret; + + memset(&hdr, 0, sizeof(hdr)); + + if (write) { + wlen = datalen; + rlen = 0; + hdr.flags = SD_FLAG_CMD_WRITE; + if (create) { + hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ; + } else { + hdr.opcode = SD_OP_WRITE_OBJ; + } + } else { + wlen = 0; + rlen = datalen; + hdr.opcode = SD_OP_READ_OBJ; + } + + if (cache) { + hdr.flags |= SD_FLAG_CMD_CACHE; + } + + hdr.oid = oid; + hdr.data_length = datalen; + hdr.offset = offset; + hdr.copies = copies; + + ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen); + if (ret) { + error_report("failed to send a request to the sheep"); + return ret; + } + + switch (rsp->result) { + case SD_RES_SUCCESS: + return 0; + default: + error_report("%s", sd_strerror(rsp->result)); + return -EIO; + } +} + +static int read_object(int fd, char *buf, uint64_t oid, int copies, + unsigned int datalen, uint64_t offset, uint8_t cache) +{ + return read_write_object(fd, buf, oid, copies, datalen, offset, 0, 0, + cache); +} + +static int write_object(int fd, char *buf, uint64_t oid, int copies, + unsigned int datalen, uint64_t offset, int create, + uint8_t cache) +{ + return read_write_object(fd, buf, oid, copies, datalen, offset, 1, create, + cache); +} + +static int sd_open(BlockDriverState *bs, const char *filename, int flags) +{ + int ret, fd; + uint32_t vid = 0; + BDRVSheepdogState *s = bs->opaque; + char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN]; + uint32_t snapid; + char *buf = NULL; + + strstart(filename, "sheepdog:", (const char **)&filename); + + QLIST_INIT(&s->inflight_aio_head); + QLIST_INIT(&s->pending_aio_head); + s->fd = -1; + + memset(vdi, 0, sizeof(vdi)); + memset(tag, 0, sizeof(tag)); + if (parse_vdiname(s, filename, vdi, &snapid, tag) < 0) { + ret = -EINVAL; + goto out; + } + s->fd = get_sheep_fd(s); + if (s->fd < 0) { + ret = s->fd; + goto out; + } + + ret = find_vdi_name(s, vdi, snapid, tag, &vid, 0); + if (ret) { + goto out; + } + + if (flags & BDRV_O_CACHE_WB) { + s->cache_enabled = 1; + s->flush_fd = connect_to_sdog(s->addr, s->port); + if (s->flush_fd < 0) { + error_report("failed to connect"); + ret = s->flush_fd; + goto out; + } + } + + if (snapid || tag[0] != '\0') { + dprintf("%" PRIx32 " snapshot inode was open.\n", vid); + s->is_snapshot = 1; + } + + fd = connect_to_sdog(s->addr, s->port); + if (fd < 0) { + error_report("failed to connect"); + ret = fd; + goto out; + } + + buf = g_malloc(SD_INODE_SIZE); + ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0, + s->cache_enabled); + + closesocket(fd); + + if (ret) { + goto out; + } + + memcpy(&s->inode, buf, sizeof(s->inode)); + s->min_dirty_data_idx = UINT32_MAX; + s->max_dirty_data_idx = 0; + + bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE; + strncpy(s->name, vdi, sizeof(s->name)); + qemu_co_mutex_init(&s->lock); + g_free(buf); + return 0; +out: + qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL); + if (s->fd >= 0) { + closesocket(s->fd); + } + g_free(buf); + return ret; +} + +static int do_sd_create(char *filename, int64_t vdi_size, + uint32_t base_vid, uint32_t *vdi_id, int snapshot, + const char *addr, const char *port) +{ + SheepdogVdiReq hdr; + SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; + int fd, ret; + unsigned int wlen, rlen = 0; + char buf[SD_MAX_VDI_LEN]; + + fd = connect_to_sdog(addr, port); + if (fd < 0) { + return fd; + } + + memset(buf, 0, sizeof(buf)); + strncpy(buf, filename, SD_MAX_VDI_LEN); + + memset(&hdr, 0, sizeof(hdr)); + hdr.opcode = SD_OP_NEW_VDI; + hdr.base_vdi_id = base_vid; + + wlen = SD_MAX_VDI_LEN; + + hdr.flags = SD_FLAG_CMD_WRITE; + hdr.snapid = snapshot; + + hdr.data_length = wlen; + hdr.vdi_size = vdi_size; + + ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen); + + closesocket(fd); + + if (ret) { + return ret; + } + + if (rsp->result != SD_RES_SUCCESS) { + error_report("%s, %s", sd_strerror(rsp->result), filename); + return -EIO; + } + + if (vdi_id) { + *vdi_id = rsp->vdi_id; + } + + return 0; +} + +static int sd_prealloc(const char *filename) +{ + BlockDriverState *bs = NULL; + uint32_t idx, max_idx; + int64_t vdi_size; + void *buf = g_malloc0(SD_DATA_OBJ_SIZE); + int ret; + + ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR); + if (ret < 0) { + goto out; + } + + vdi_size = bdrv_getlength(bs); + if (vdi_size < 0) { + ret = vdi_size; + goto out; + } + max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE); + + for (idx = 0; idx < max_idx; idx++) { + /* + * The created image can be a cloned image, so we need to read + * a data from the source image. + */ + ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE); + if (ret < 0) { + goto out; + } + ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE); + if (ret < 0) { + goto out; + } + } +out: + if (bs) { + bdrv_delete(bs); + } + g_free(buf); + + return ret; +} + +static int sd_create(const char *filename, QEMUOptionParameter *options) +{ + int ret = 0; + uint32_t vid = 0, base_vid = 0; + int64_t vdi_size = 0; + char *backing_file = NULL; + BDRVSheepdogState *s; + char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN]; + uint32_t snapid; + int prealloc = 0; + const char *vdiname; + + s = g_malloc0(sizeof(BDRVSheepdogState)); + + strstart(filename, "sheepdog:", &vdiname); + + memset(vdi, 0, sizeof(vdi)); + memset(tag, 0, sizeof(tag)); + if (parse_vdiname(s, vdiname, vdi, &snapid, tag) < 0) { + error_report("invalid filename"); + ret = -EINVAL; + goto out; + } + + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + vdi_size = options->value.n; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { + backing_file = options->value.s; + } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) { + if (!options->value.s || !strcmp(options->value.s, "off")) { + prealloc = 0; + } else if (!strcmp(options->value.s, "full")) { + prealloc = 1; + } else { + error_report("Invalid preallocation mode: '%s'", + options->value.s); + ret = -EINVAL; + goto out; + } + } + options++; + } + + if (vdi_size > SD_MAX_VDI_SIZE) { + error_report("too big image size"); + ret = -EINVAL; + goto out; + } + + if (backing_file) { + BlockDriverState *bs; + BDRVSheepdogState *s; + BlockDriver *drv; + + /* Currently, only Sheepdog backing image is supported. */ + drv = bdrv_find_protocol(backing_file); + if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) { + error_report("backing_file must be a sheepdog image"); + ret = -EINVAL; + goto out; + } + + ret = bdrv_file_open(&bs, backing_file, 0); + if (ret < 0) { + goto out; + } + + s = bs->opaque; + + if (!is_snapshot(&s->inode)) { + error_report("cannot clone from a non snapshot vdi"); + bdrv_delete(bs); + ret = -EINVAL; + goto out; + } + + base_vid = s->inode.vdi_id; + bdrv_delete(bs); + } + + ret = do_sd_create(vdi, vdi_size, base_vid, &vid, 0, s->addr, s->port); + if (!prealloc || ret) { + goto out; + } + + ret = sd_prealloc(filename); +out: + g_free(s); + return ret; +} + +static void sd_close(BlockDriverState *bs) +{ + BDRVSheepdogState *s = bs->opaque; + SheepdogVdiReq hdr; + SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; + unsigned int wlen, rlen = 0; + int fd, ret; + + dprintf("%s\n", s->name); + + fd = connect_to_sdog(s->addr, s->port); + if (fd < 0) { + return; + } + + memset(&hdr, 0, sizeof(hdr)); + + hdr.opcode = SD_OP_RELEASE_VDI; + wlen = strlen(s->name) + 1; + hdr.data_length = wlen; + hdr.flags = SD_FLAG_CMD_WRITE; + + ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen); + + closesocket(fd); + + if (!ret && rsp->result != SD_RES_SUCCESS && + rsp->result != SD_RES_VDI_NOT_LOCKED) { + error_report("%s, %s", sd_strerror(rsp->result), s->name); + } + + qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL); + closesocket(s->fd); + if (s->cache_enabled) { + closesocket(s->flush_fd); + } + g_free(s->addr); +} + +static int64_t sd_getlength(BlockDriverState *bs) +{ + BDRVSheepdogState *s = bs->opaque; + + return s->inode.vdi_size; +} + +static int sd_truncate(BlockDriverState *bs, int64_t offset) +{ + BDRVSheepdogState *s = bs->opaque; + int ret, fd; + unsigned int datalen; + + if (offset < s->inode.vdi_size) { + error_report("shrinking is not supported"); + return -EINVAL; + } else if (offset > SD_MAX_VDI_SIZE) { + error_report("too big image size"); + return -EINVAL; + } + + fd = connect_to_sdog(s->addr, s->port); + if (fd < 0) { + return fd; + } + + /* we don't need to update entire object */ + datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id); + s->inode.vdi_size = offset; + ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id), + s->inode.nr_copies, datalen, 0, 0, s->cache_enabled); + close(fd); + + if (ret < 0) { + error_report("failed to update an inode."); + } + + return ret; +} + +/* + * This function is called after writing data objects. If we need to + * update metadata, this sends a write request to the vdi object. + * Otherwise, this switches back to sd_co_readv/writev. + */ +static void coroutine_fn sd_write_done(SheepdogAIOCB *acb) +{ + int ret; + BDRVSheepdogState *s = acb->common.bs->opaque; + struct iovec iov; + AIOReq *aio_req; + uint32_t offset, data_len, mn, mx; + + mn = s->min_dirty_data_idx; + mx = s->max_dirty_data_idx; + if (mn <= mx) { + /* we need to update the vdi object. */ + offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) + + mn * sizeof(s->inode.data_vdi_id[0]); + data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]); + + s->min_dirty_data_idx = UINT32_MAX; + s->max_dirty_data_idx = 0; + + iov.iov_base = &s->inode; + iov.iov_len = sizeof(s->inode); + aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), + data_len, offset, 0, 0, offset); + QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); + ret = add_aio_request(s, aio_req, &iov, 1, 0, AIOCB_WRITE_UDATA); + if (ret) { + free_aio_req(s, aio_req); + acb->ret = -EIO; + goto out; + } + + acb->aio_done_func = sd_finish_aiocb; + acb->aiocb_type = AIOCB_WRITE_UDATA; + return; + } +out: + sd_finish_aiocb(acb); +} + +/* + * Create a writable VDI from a snapshot + */ +static int sd_create_branch(BDRVSheepdogState *s) +{ + int ret, fd; + uint32_t vid; + char *buf; + + dprintf("%" PRIx32 " is snapshot.\n", s->inode.vdi_id); + + buf = g_malloc(SD_INODE_SIZE); + + ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, 1, + s->addr, s->port); + if (ret) { + goto out; + } + + dprintf("%" PRIx32 " is created.\n", vid); + + fd = connect_to_sdog(s->addr, s->port); + if (fd < 0) { + error_report("failed to connect"); + ret = fd; + goto out; + } + + ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies, + SD_INODE_SIZE, 0, s->cache_enabled); + + closesocket(fd); + + if (ret < 0) { + goto out; + } + + memcpy(&s->inode, buf, sizeof(s->inode)); + + s->is_snapshot = 0; + ret = 0; + dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id); + +out: + g_free(buf); + + return ret; +} + +/* + * Send I/O requests to the server. + * + * This function sends requests to the server, links the requests to + * the inflight_list in BDRVSheepdogState, and exits without + * waiting the response. The responses are received in the + * `aio_read_response' function which is called from the main loop as + * a fd handler. + * + * Returns 1 when we need to wait a response, 0 when there is no sent + * request and -errno in error cases. + */ +static int coroutine_fn sd_co_rw_vector(void *p) +{ + SheepdogAIOCB *acb = p; + int ret = 0; + unsigned long len, done = 0, total = acb->nb_sectors * SECTOR_SIZE; + unsigned long idx = acb->sector_num * SECTOR_SIZE / SD_DATA_OBJ_SIZE; + uint64_t oid; + uint64_t offset = (acb->sector_num * SECTOR_SIZE) % SD_DATA_OBJ_SIZE; + BDRVSheepdogState *s = acb->common.bs->opaque; + SheepdogInode *inode = &s->inode; + AIOReq *aio_req; + + if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) { + /* + * In the case we open the snapshot VDI, Sheepdog creates the + * writable VDI when we do a write operation first. + */ + ret = sd_create_branch(s); + if (ret) { + acb->ret = -EIO; + goto out; + } + } + + /* + * Make sure we don't free the aiocb before we are done with all requests. + * This additional reference is dropped at the end of this function. + */ + acb->nr_pending++; + + while (done != total) { + uint8_t flags = 0; + uint64_t old_oid = 0; + int create = 0; + + oid = vid_to_data_oid(inode->data_vdi_id[idx], idx); + + len = MIN(total - done, SD_DATA_OBJ_SIZE - offset); + + switch (acb->aiocb_type) { + case AIOCB_READ_UDATA: + if (!inode->data_vdi_id[idx]) { + qemu_iovec_memset(acb->qiov, done, 0, len); + goto done; + } + break; + case AIOCB_WRITE_UDATA: + if (!inode->data_vdi_id[idx]) { + create = 1; + } else if (!is_data_obj_writable(inode, idx)) { + /* Copy-On-Write */ + create = 1; + old_oid = oid; + flags = SD_FLAG_CMD_COW; + } + break; + default: + break; + } + + if (create) { + dprintf("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n", + inode->vdi_id, oid, + vid_to_data_oid(inode->data_vdi_id[idx], idx), idx); + oid = vid_to_data_oid(inode->vdi_id, idx); + dprintf("new oid %" PRIx64 "\n", oid); + } + + aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done); + + if (create) { + AIOReq *areq; + QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) { + if (areq->oid == oid) { + /* + * Sheepdog cannot handle simultaneous create + * requests to the same object. So we cannot send + * the request until the previous request + * finishes. + */ + aio_req->flags = 0; + aio_req->base_oid = 0; + QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, + aio_siblings); + goto done; + } + } + } + + QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); + ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, + create, acb->aiocb_type); + if (ret < 0) { + error_report("add_aio_request is failed"); + free_aio_req(s, aio_req); + acb->ret = -EIO; + goto out; + } + done: + offset = 0; + idx++; + done += len; + } +out: + if (!--acb->nr_pending) { + return acb->ret; + } + return 1; +} + +static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + SheepdogAIOCB *acb; + int ret; + + if (bs->growable && sector_num + nb_sectors > bs->total_sectors) { + ret = sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE); + if (ret < 0) { + return ret; + } + bs->total_sectors = sector_num + nb_sectors; + } + + acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL); + acb->aio_done_func = sd_write_done; + acb->aiocb_type = AIOCB_WRITE_UDATA; + + ret = sd_co_rw_vector(acb); + if (ret <= 0) { + qemu_aio_release(acb); + return ret; + } + + qemu_coroutine_yield(); + + return acb->ret; +} + +static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + SheepdogAIOCB *acb; + int ret; + + acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL); + acb->aiocb_type = AIOCB_READ_UDATA; + acb->aio_done_func = sd_finish_aiocb; + + ret = sd_co_rw_vector(acb); + if (ret <= 0) { + qemu_aio_release(acb); + return ret; + } + + qemu_coroutine_yield(); + + return acb->ret; +} + +static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs) +{ + BDRVSheepdogState *s = bs->opaque; + SheepdogObjReq hdr = { 0 }; + SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr; + SheepdogInode *inode = &s->inode; + int ret; + unsigned int wlen = 0, rlen = 0; + + if (!s->cache_enabled) { + return 0; + } + + hdr.opcode = SD_OP_FLUSH_VDI; + hdr.oid = vid_to_vdi_oid(inode->vdi_id); + + ret = do_req(s->flush_fd, (SheepdogReq *)&hdr, NULL, &wlen, &rlen); + if (ret) { + error_report("failed to send a request to the sheep"); + return ret; + } + + if (rsp->result == SD_RES_INVALID_PARMS) { + dprintf("disable write cache since the server doesn't support it\n"); + + s->cache_enabled = 0; + closesocket(s->flush_fd); + return 0; + } + + if (rsp->result != SD_RES_SUCCESS) { + error_report("%s", sd_strerror(rsp->result)); + return -EIO; + } + + return 0; +} + +static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) +{ + BDRVSheepdogState *s = bs->opaque; + int ret, fd; + uint32_t new_vid; + SheepdogInode *inode; + unsigned int datalen; + + dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " " + "is_snapshot %d\n", sn_info->name, sn_info->id_str, + s->name, sn_info->vm_state_size, s->is_snapshot); + + if (s->is_snapshot) { + error_report("You can't create a snapshot of a snapshot VDI, " + "%s (%" PRIu32 ").", s->name, s->inode.vdi_id); + + return -EINVAL; + } + + dprintf("%s %s\n", sn_info->name, sn_info->id_str); + + s->inode.vm_state_size = sn_info->vm_state_size; + s->inode.vm_clock_nsec = sn_info->vm_clock_nsec; + strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag)); + /* we don't need to update entire object */ + datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id); + + /* refresh inode. */ + fd = connect_to_sdog(s->addr, s->port); + if (fd < 0) { + ret = fd; + goto cleanup; + } + + ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id), + s->inode.nr_copies, datalen, 0, 0, s->cache_enabled); + if (ret < 0) { + error_report("failed to write snapshot's inode."); + goto cleanup; + } + + ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, 1, + s->addr, s->port); + if (ret < 0) { + error_report("failed to create inode for snapshot. %s", + strerror(errno)); + goto cleanup; + } + + inode = (SheepdogInode *)g_malloc(datalen); + + ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid), + s->inode.nr_copies, datalen, 0, s->cache_enabled); + + if (ret < 0) { + error_report("failed to read new inode info. %s", strerror(errno)); + goto cleanup; + } + + memcpy(&s->inode, inode, datalen); + dprintf("s->inode: name %s snap_id %x oid %x\n", + s->inode.name, s->inode.snap_id, s->inode.vdi_id); + +cleanup: + closesocket(fd); + return ret; +} + +static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) +{ + BDRVSheepdogState *s = bs->opaque; + BDRVSheepdogState *old_s; + char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN]; + char *buf = NULL; + uint32_t vid; + uint32_t snapid = 0; + int ret = 0, fd; + + old_s = g_malloc(sizeof(BDRVSheepdogState)); + + memcpy(old_s, s, sizeof(BDRVSheepdogState)); + + memset(vdi, 0, sizeof(vdi)); + strncpy(vdi, s->name, sizeof(vdi)); + + memset(tag, 0, sizeof(tag)); + snapid = strtoul(snapshot_id, NULL, 10); + if (!snapid) { + strncpy(tag, s->name, sizeof(tag)); + } + + ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1); + if (ret) { + error_report("Failed to find_vdi_name"); + goto out; + } + + fd = connect_to_sdog(s->addr, s->port); + if (fd < 0) { + error_report("failed to connect"); + ret = fd; + goto out; + } + + buf = g_malloc(SD_INODE_SIZE); + ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies, + SD_INODE_SIZE, 0, s->cache_enabled); + + closesocket(fd); + + if (ret) { + goto out; + } + + memcpy(&s->inode, buf, sizeof(s->inode)); + + if (!s->inode.vm_state_size) { + error_report("Invalid snapshot"); + ret = -ENOENT; + goto out; + } + + s->is_snapshot = 1; + + g_free(buf); + g_free(old_s); + + return 0; +out: + /* recover bdrv_sd_state */ + memcpy(s, old_s, sizeof(BDRVSheepdogState)); + g_free(buf); + g_free(old_s); + + error_report("failed to open. recover old bdrv_sd_state."); + + return ret; +} + +static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +{ + /* FIXME: Delete specified snapshot id. */ + return 0; +} + +static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) +{ + BDRVSheepdogState *s = bs->opaque; + SheepdogReq req; + int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long); + QEMUSnapshotInfo *sn_tab = NULL; + unsigned wlen, rlen; + int found = 0; + static SheepdogInode inode; + unsigned long *vdi_inuse; + unsigned int start_nr; + uint64_t hval; + uint32_t vid; + + vdi_inuse = g_malloc(max); + + fd = connect_to_sdog(s->addr, s->port); + if (fd < 0) { + ret = fd; + goto out; + } + + rlen = max; + wlen = 0; + + memset(&req, 0, sizeof(req)); + + req.opcode = SD_OP_READ_VDIS; + req.data_length = max; + + ret = do_req(fd, (SheepdogReq *)&req, vdi_inuse, &wlen, &rlen); + + closesocket(fd); + if (ret) { + goto out; + } + + sn_tab = g_malloc0(nr * sizeof(*sn_tab)); + + /* calculate a vdi id with hash function */ + hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT); + start_nr = hval & (SD_NR_VDIS - 1); + + fd = connect_to_sdog(s->addr, s->port); + if (fd < 0) { + error_report("failed to connect"); + ret = fd; + goto out; + } + + for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) { + if (!test_bit(vid, vdi_inuse)) { + break; + } + + /* we don't need to read entire object */ + ret = read_object(fd, (char *)&inode, vid_to_vdi_oid(vid), + 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0, + s->cache_enabled); + + if (ret) { + continue; + } + + if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) { + sn_tab[found].date_sec = inode.snap_ctime >> 32; + sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff; + sn_tab[found].vm_state_size = inode.vm_state_size; + sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec; + + snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u", + inode.snap_id); + strncpy(sn_tab[found].name, inode.tag, + MIN(sizeof(sn_tab[found].name), sizeof(inode.tag))); + found++; + } + } + + closesocket(fd); +out: + *psn_tab = sn_tab; + + g_free(vdi_inuse); + + if (ret < 0) { + return ret; + } + + return found; +} + +static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data, + int64_t pos, int size, int load) +{ + int fd, create; + int ret = 0, remaining = size; + unsigned int data_len; + uint64_t vmstate_oid; + uint32_t vdi_index; + uint64_t offset; + + fd = connect_to_sdog(s->addr, s->port); + if (fd < 0) { + return fd; + } + + while (remaining) { + vdi_index = pos / SD_DATA_OBJ_SIZE; + offset = pos % SD_DATA_OBJ_SIZE; + + data_len = MIN(remaining, SD_DATA_OBJ_SIZE); + + vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index); + + create = (offset == 0); + if (load) { + ret = read_object(fd, (char *)data, vmstate_oid, + s->inode.nr_copies, data_len, offset, + s->cache_enabled); + } else { + ret = write_object(fd, (char *)data, vmstate_oid, + s->inode.nr_copies, data_len, offset, create, + s->cache_enabled); + } + + if (ret < 0) { + error_report("failed to save vmstate %s", strerror(errno)); + goto cleanup; + } + + pos += data_len; + remaining -= data_len; + } + ret = size; +cleanup: + closesocket(fd); + return ret; +} + +static int sd_save_vmstate(BlockDriverState *bs, const uint8_t *data, + int64_t pos, int size) +{ + BDRVSheepdogState *s = bs->opaque; + + return do_load_save_vmstate(s, (uint8_t *)data, pos, size, 0); +} + +static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data, + int64_t pos, int size) +{ + BDRVSheepdogState *s = bs->opaque; + + return do_load_save_vmstate(s, data, pos, size, 1); +} + + +static QEMUOptionParameter sd_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = OPT_STRING, + .help = "File name of a base image" + }, + { + .name = BLOCK_OPT_PREALLOC, + .type = OPT_STRING, + .help = "Preallocation mode (allowed values: off, full)" + }, + { NULL } +}; + +BlockDriver bdrv_sheepdog = { + .format_name = "sheepdog", + .protocol_name = "sheepdog", + .instance_size = sizeof(BDRVSheepdogState), + .bdrv_file_open = sd_open, + .bdrv_close = sd_close, + .bdrv_create = sd_create, + .bdrv_getlength = sd_getlength, + .bdrv_truncate = sd_truncate, + + .bdrv_co_readv = sd_co_readv, + .bdrv_co_writev = sd_co_writev, + .bdrv_co_flush_to_disk = sd_co_flush_to_disk, + + .bdrv_snapshot_create = sd_snapshot_create, + .bdrv_snapshot_goto = sd_snapshot_goto, + .bdrv_snapshot_delete = sd_snapshot_delete, + .bdrv_snapshot_list = sd_snapshot_list, + + .bdrv_save_vmstate = sd_save_vmstate, + .bdrv_load_vmstate = sd_load_vmstate, + + .create_options = sd_create_options, +}; + +static void bdrv_sheepdog_init(void) +{ + bdrv_register(&bdrv_sheepdog); +} +block_init(bdrv_sheepdog_init); diff --git a/block/stream.c b/block/stream.c new file mode 100644 index 000000000..c4f87dd5b --- /dev/null +++ b/block/stream.c @@ -0,0 +1,209 @@ +/* + * Image streaming + * + * Copyright IBM, Corp. 2011 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "trace.h" +#include "block_int.h" +#include "qemu/ratelimit.h" + +enum { + /* + * Size of data buffer for populating the image file. This should be large + * enough to process multiple clusters in a single call, so that populating + * contiguous regions of the image is efficient. + */ + STREAM_BUFFER_SIZE = 512 * 1024, /* in bytes */ +}; + +#define SLICE_TIME 100000000ULL /* ns */ + +typedef struct StreamBlockJob { + BlockJob common; + RateLimit limit; + BlockDriverState *base; + char backing_file_id[1024]; +} StreamBlockJob; + +static int coroutine_fn stream_populate(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + void *buf) +{ + struct iovec iov = { + .iov_base = buf, + .iov_len = nb_sectors * BDRV_SECTOR_SIZE, + }; + QEMUIOVector qiov; + + qemu_iovec_init_external(&qiov, &iov, 1); + + /* Copy-on-read the unallocated clusters */ + return bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, &qiov); +} + +static void close_unused_images(BlockDriverState *top, BlockDriverState *base, + const char *base_id) +{ + BlockDriverState *intermediate; + intermediate = top->backing_hd; + + while (intermediate) { + BlockDriverState *unused; + + /* reached base */ + if (intermediate == base) { + break; + } + + unused = intermediate; + intermediate = intermediate->backing_hd; + unused->backing_hd = NULL; + bdrv_delete(unused); + } + top->backing_hd = base; +} + +static void coroutine_fn stream_run(void *opaque) +{ + StreamBlockJob *s = opaque; + BlockDriverState *bs = s->common.bs; + BlockDriverState *base = s->base; + int64_t sector_num, end; + int ret = 0; + int n = 0; + void *buf; + + s->common.len = bdrv_getlength(bs); + if (s->common.len < 0) { + block_job_complete(&s->common, s->common.len); + return; + } + + end = s->common.len >> BDRV_SECTOR_BITS; + buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE); + + /* Turn on copy-on-read for the whole block device so that guest read + * requests help us make progress. Only do this when copying the entire + * backing chain since the copy-on-read operation does not take base into + * account. + */ + if (!base) { + bdrv_enable_copy_on_read(bs); + } + + for (sector_num = 0; sector_num < end; sector_num += n) { + uint64_t delay_ns = 0; + bool copy; + +wait: + /* Note that even when no rate limit is applied we need to yield + * with no pending I/O here so that qemu_aio_flush() returns. + */ + block_job_sleep_ns(&s->common, rt_clock, delay_ns); + if (block_job_is_cancelled(&s->common)) { + break; + } + + ret = bdrv_co_is_allocated(bs, sector_num, + STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n); + if (ret == 1) { + /* Allocated in the top, no need to copy. */ + copy = false; + } else { + /* Copy if allocated in the intermediate images. Limit to the + * known-unallocated area [sector_num, sector_num+n). */ + ret = bdrv_co_is_allocated_above(bs->backing_hd, base, + sector_num, n, &n); + + /* Finish early if end of backing file has been reached */ + if (ret == 0 && n == 0) { + n = end - sector_num; + } + + copy = (ret == 1); + } + trace_stream_one_iteration(s, sector_num, n, ret); + if (ret >= 0 && copy) { + if (s->common.speed) { + delay_ns = ratelimit_calculate_delay(&s->limit, n); + if (delay_ns > 0) { + goto wait; + } + } + ret = stream_populate(bs, sector_num, n, buf); + } + if (ret < 0) { + break; + } + ret = 0; + + /* Publish progress */ + s->common.offset += n * BDRV_SECTOR_SIZE; + } + + if (!base) { + bdrv_disable_copy_on_read(bs); + } + + if (!block_job_is_cancelled(&s->common) && sector_num == end && ret == 0) { + const char *base_id = NULL, *base_fmt = NULL; + if (base) { + base_id = s->backing_file_id; + if (base->drv) { + base_fmt = base->drv->format_name; + } + } + ret = bdrv_change_backing_file(bs, base_id, base_fmt); + close_unused_images(bs, base, base_id); + } + + qemu_vfree(buf); + block_job_complete(&s->common, ret); +} + +static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp) +{ + StreamBlockJob *s = container_of(job, StreamBlockJob, common); + + if (speed < 0) { + error_set(errp, QERR_INVALID_PARAMETER, "speed"); + return; + } + ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); +} + +static BlockJobType stream_job_type = { + .instance_size = sizeof(StreamBlockJob), + .job_type = "stream", + .set_speed = stream_set_speed, +}; + +void stream_start(BlockDriverState *bs, BlockDriverState *base, + const char *base_id, int64_t speed, + BlockDriverCompletionFunc *cb, + void *opaque, Error **errp) +{ + StreamBlockJob *s; + + s = block_job_create(&stream_job_type, bs, speed, cb, opaque, errp); + if (!s) { + return; + } + + s->base = base; + if (base_id) { + pstrcpy(s->backing_file_id, sizeof(s->backing_file_id), base_id); + } + + s->common.co = qemu_coroutine_create(stream_run); + trace_stream_start(bs, base, s, s->common.co, opaque); + qemu_coroutine_enter(s->common.co, s); +} diff --git a/block/vdi.c b/block/vdi.c new file mode 100644 index 000000000..c4f1529db --- /dev/null +++ b/block/vdi.c @@ -0,0 +1,786 @@ +/* + * Block driver for the Virtual Disk Image (VDI) format + * + * Copyright (c) 2009, 2012 Stefan Weil + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) version 3 or any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Reference: + * http://forums.virtualbox.org/viewtopic.php?t=8046 + * + * This driver supports create / read / write operations on VDI images. + * + * Todo (see also TODO in code): + * + * Some features like snapshots are still missing. + * + * Deallocation of zero-filled blocks and shrinking images are missing, too + * (might be added to common block layer). + * + * Allocation of blocks could be optimized (less writes to block map and + * header). + * + * Read and write of adjacents blocks could be done in one operation + * (current code uses one operation per block (1 MiB). + * + * The code is not thread safe (missing locks for changes in header and + * block table, no problem with current QEMU). + * + * Hints: + * + * Blocks (VDI documentation) correspond to clusters (QEMU). + * QEMU's backing files could be implemented using VDI snapshot files (TODO). + * VDI snapshot files may also contain the complete machine state. + * Maybe this machine state can be converted to QEMU PC machine snapshot data. + * + * The driver keeps a block cache (little endian entries) in memory. + * For the standard block size (1 MiB), a 1 TiB disk will use 4 MiB RAM, + * so this seems to be reasonable. + */ + +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" +#include "migration.h" + +#if defined(CONFIG_UUID) +#include <uuid/uuid.h> +#else +/* TODO: move uuid emulation to some central place in QEMU. */ +#include "sysemu.h" /* UUID_FMT */ +typedef unsigned char uuid_t[16]; +void uuid_generate(uuid_t out); +int uuid_is_null(const uuid_t uu); +void uuid_unparse(const uuid_t uu, char *out); +#endif + +/* Code configuration options. */ + +/* Enable debug messages. */ +//~ #define CONFIG_VDI_DEBUG + +/* Support write operations on VDI images. */ +#define CONFIG_VDI_WRITE + +/* Support non-standard block (cluster) size. This is untested. + * Maybe it will be needed for very large images. + */ +//~ #define CONFIG_VDI_BLOCK_SIZE + +/* Support static (fixed, pre-allocated) images. */ +#define CONFIG_VDI_STATIC_IMAGE + +/* Command line option for static images. */ +#define BLOCK_OPT_STATIC "static" + +#define KiB 1024 +#define MiB (KiB * KiB) + +#define SECTOR_SIZE 512 +#define DEFAULT_CLUSTER_SIZE (1 * MiB) + +#if defined(CONFIG_VDI_DEBUG) +#define logout(fmt, ...) \ + fprintf(stderr, "vdi\t%-24s" fmt, __func__, ##__VA_ARGS__) +#else +#define logout(fmt, ...) ((void)0) +#endif + +/* Image signature. */ +#define VDI_SIGNATURE 0xbeda107f + +/* Image version. */ +#define VDI_VERSION_1_1 0x00010001 + +/* Image type. */ +#define VDI_TYPE_DYNAMIC 1 +#define VDI_TYPE_STATIC 2 + +/* Innotek / SUN images use these strings in header.text: + * "<<< innotek VirtualBox Disk Image >>>\n" + * "<<< Sun xVM VirtualBox Disk Image >>>\n" + * "<<< Sun VirtualBox Disk Image >>>\n" + * The value does not matter, so QEMU created images use a different text. + */ +#define VDI_TEXT "<<< QEMU VM Virtual Disk Image >>>\n" + +/* A never-allocated block; semantically arbitrary content. */ +#define VDI_UNALLOCATED 0xffffffffU + +/* A discarded (no longer allocated) block; semantically zero-filled. */ +#define VDI_DISCARDED 0xfffffffeU + +#define VDI_IS_ALLOCATED(X) ((X) < VDI_DISCARDED) + +#if !defined(CONFIG_UUID) +void uuid_generate(uuid_t out) +{ + memset(out, 0, sizeof(uuid_t)); +} + +int uuid_is_null(const uuid_t uu) +{ + uuid_t null_uuid = { 0 }; + return memcmp(uu, null_uuid, sizeof(uuid_t)) == 0; +} + +void uuid_unparse(const uuid_t uu, char *out) +{ + snprintf(out, 37, UUID_FMT, + uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], uu[7], + uu[8], uu[9], uu[10], uu[11], uu[12], uu[13], uu[14], uu[15]); +} +#endif + +typedef struct { + char text[0x40]; + uint32_t signature; + uint32_t version; + uint32_t header_size; + uint32_t image_type; + uint32_t image_flags; + char description[256]; + uint32_t offset_bmap; + uint32_t offset_data; + uint32_t cylinders; /* disk geometry, unused here */ + uint32_t heads; /* disk geometry, unused here */ + uint32_t sectors; /* disk geometry, unused here */ + uint32_t sector_size; + uint32_t unused1; + uint64_t disk_size; + uint32_t block_size; + uint32_t block_extra; /* unused here */ + uint32_t blocks_in_image; + uint32_t blocks_allocated; + uuid_t uuid_image; + uuid_t uuid_last_snap; + uuid_t uuid_link; + uuid_t uuid_parent; + uint64_t unused2[7]; +} VdiHeader; + +typedef struct { + /* The block map entries are little endian (even in memory). */ + uint32_t *bmap; + /* Size of block (bytes). */ + uint32_t block_size; + /* Size of block (sectors). */ + uint32_t block_sectors; + /* First sector of block map. */ + uint32_t bmap_sector; + /* VDI header (converted to host endianness). */ + VdiHeader header; + + Error *migration_blocker; +} BDRVVdiState; + +/* Change UUID from little endian (IPRT = VirtualBox format) to big endian + * format (network byte order, standard, see RFC 4122) and vice versa. + */ +static void uuid_convert(uuid_t uuid) +{ + bswap32s((uint32_t *)&uuid[0]); + bswap16s((uint16_t *)&uuid[4]); + bswap16s((uint16_t *)&uuid[6]); +} + +static void vdi_header_to_cpu(VdiHeader *header) +{ + le32_to_cpus(&header->signature); + le32_to_cpus(&header->version); + le32_to_cpus(&header->header_size); + le32_to_cpus(&header->image_type); + le32_to_cpus(&header->image_flags); + le32_to_cpus(&header->offset_bmap); + le32_to_cpus(&header->offset_data); + le32_to_cpus(&header->cylinders); + le32_to_cpus(&header->heads); + le32_to_cpus(&header->sectors); + le32_to_cpus(&header->sector_size); + le64_to_cpus(&header->disk_size); + le32_to_cpus(&header->block_size); + le32_to_cpus(&header->block_extra); + le32_to_cpus(&header->blocks_in_image); + le32_to_cpus(&header->blocks_allocated); + uuid_convert(header->uuid_image); + uuid_convert(header->uuid_last_snap); + uuid_convert(header->uuid_link); + uuid_convert(header->uuid_parent); +} + +static void vdi_header_to_le(VdiHeader *header) +{ + cpu_to_le32s(&header->signature); + cpu_to_le32s(&header->version); + cpu_to_le32s(&header->header_size); + cpu_to_le32s(&header->image_type); + cpu_to_le32s(&header->image_flags); + cpu_to_le32s(&header->offset_bmap); + cpu_to_le32s(&header->offset_data); + cpu_to_le32s(&header->cylinders); + cpu_to_le32s(&header->heads); + cpu_to_le32s(&header->sectors); + cpu_to_le32s(&header->sector_size); + cpu_to_le64s(&header->disk_size); + cpu_to_le32s(&header->block_size); + cpu_to_le32s(&header->block_extra); + cpu_to_le32s(&header->blocks_in_image); + cpu_to_le32s(&header->blocks_allocated); + cpu_to_le32s(&header->blocks_allocated); + uuid_convert(header->uuid_image); + uuid_convert(header->uuid_last_snap); + uuid_convert(header->uuid_link); + uuid_convert(header->uuid_parent); +} + +#if defined(CONFIG_VDI_DEBUG) +static void vdi_header_print(VdiHeader *header) +{ + char uuid[37]; + logout("text %s", header->text); + logout("signature 0x%04x\n", header->signature); + logout("header size 0x%04x\n", header->header_size); + logout("image type 0x%04x\n", header->image_type); + logout("image flags 0x%04x\n", header->image_flags); + logout("description %s\n", header->description); + logout("offset bmap 0x%04x\n", header->offset_bmap); + logout("offset data 0x%04x\n", header->offset_data); + logout("cylinders 0x%04x\n", header->cylinders); + logout("heads 0x%04x\n", header->heads); + logout("sectors 0x%04x\n", header->sectors); + logout("sector size 0x%04x\n", header->sector_size); + logout("image size 0x%" PRIx64 " B (%" PRIu64 " MiB)\n", + header->disk_size, header->disk_size / MiB); + logout("block size 0x%04x\n", header->block_size); + logout("block extra 0x%04x\n", header->block_extra); + logout("blocks tot. 0x%04x\n", header->blocks_in_image); + logout("blocks all. 0x%04x\n", header->blocks_allocated); + uuid_unparse(header->uuid_image, uuid); + logout("uuid image %s\n", uuid); + uuid_unparse(header->uuid_last_snap, uuid); + logout("uuid snap %s\n", uuid); + uuid_unparse(header->uuid_link, uuid); + logout("uuid link %s\n", uuid); + uuid_unparse(header->uuid_parent, uuid); + logout("uuid parent %s\n", uuid); +} +#endif + +static int vdi_check(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix) +{ + /* TODO: additional checks possible. */ + BDRVVdiState *s = (BDRVVdiState *)bs->opaque; + uint32_t blocks_allocated = 0; + uint32_t block; + uint32_t *bmap; + logout("\n"); + + if (fix) { + return -ENOTSUP; + } + + bmap = g_malloc(s->header.blocks_in_image * sizeof(uint32_t)); + memset(bmap, 0xff, s->header.blocks_in_image * sizeof(uint32_t)); + + /* Check block map and value of blocks_allocated. */ + for (block = 0; block < s->header.blocks_in_image; block++) { + uint32_t bmap_entry = le32_to_cpu(s->bmap[block]); + if (VDI_IS_ALLOCATED(bmap_entry)) { + if (bmap_entry < s->header.blocks_in_image) { + blocks_allocated++; + if (!VDI_IS_ALLOCATED(bmap[bmap_entry])) { + bmap[bmap_entry] = bmap_entry; + } else { + fprintf(stderr, "ERROR: block index %" PRIu32 + " also used by %" PRIu32 "\n", bmap[bmap_entry], bmap_entry); + res->corruptions++; + } + } else { + fprintf(stderr, "ERROR: block index %" PRIu32 + " too large, is %" PRIu32 "\n", block, bmap_entry); + res->corruptions++; + } + } + } + if (blocks_allocated != s->header.blocks_allocated) { + fprintf(stderr, "ERROR: allocated blocks mismatch, is %" PRIu32 + ", should be %" PRIu32 "\n", + blocks_allocated, s->header.blocks_allocated); + res->corruptions++; + } + + g_free(bmap); + + return 0; +} + +static int vdi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + /* TODO: vdi_get_info would be needed for machine snapshots. + vm_state_offset is still missing. */ + BDRVVdiState *s = (BDRVVdiState *)bs->opaque; + logout("\n"); + bdi->cluster_size = s->block_size; + bdi->vm_state_offset = 0; + return 0; +} + +static int vdi_make_empty(BlockDriverState *bs) +{ + /* TODO: missing code. */ + logout("\n"); + /* The return value for missing code must be 0, see block.c. */ + return 0; +} + +static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const VdiHeader *header = (const VdiHeader *)buf; + int result = 0; + + logout("\n"); + + if (buf_size < sizeof(*header)) { + /* Header too small, no VDI. */ + } else if (le32_to_cpu(header->signature) == VDI_SIGNATURE) { + result = 100; + } + + if (result == 0) { + logout("no vdi image\n"); + } else { + logout("%s", header->text); + } + + return result; +} + +static int vdi_open(BlockDriverState *bs, int flags) +{ + BDRVVdiState *s = bs->opaque; + VdiHeader header; + size_t bmap_size; + + logout("\n"); + + if (bdrv_read(bs->file, 0, (uint8_t *)&header, 1) < 0) { + goto fail; + } + + vdi_header_to_cpu(&header); +#if defined(CONFIG_VDI_DEBUG) + vdi_header_print(&header); +#endif + + if (header.disk_size % SECTOR_SIZE != 0) { + /* 'VBoxManage convertfromraw' can create images with odd disk sizes. + We accept them but round the disk size to the next multiple of + SECTOR_SIZE. */ + logout("odd disk size %" PRIu64 " B, round up\n", header.disk_size); + header.disk_size += SECTOR_SIZE - 1; + header.disk_size &= ~(SECTOR_SIZE - 1); + } + + if (header.version != VDI_VERSION_1_1) { + logout("unsupported version %u.%u\n", + header.version >> 16, header.version & 0xffff); + goto fail; + } else if (header.offset_bmap % SECTOR_SIZE != 0) { + /* We only support block maps which start on a sector boundary. */ + logout("unsupported block map offset 0x%x B\n", header.offset_bmap); + goto fail; + } else if (header.offset_data % SECTOR_SIZE != 0) { + /* We only support data blocks which start on a sector boundary. */ + logout("unsupported data offset 0x%x B\n", header.offset_data); + goto fail; + } else if (header.sector_size != SECTOR_SIZE) { + logout("unsupported sector size %u B\n", header.sector_size); + goto fail; + } else if (header.block_size != 1 * MiB) { + logout("unsupported block size %u B\n", header.block_size); + goto fail; + } else if (header.disk_size > + (uint64_t)header.blocks_in_image * header.block_size) { + logout("unsupported disk size %" PRIu64 " B\n", header.disk_size); + goto fail; + } else if (!uuid_is_null(header.uuid_link)) { + logout("link uuid != 0, unsupported\n"); + goto fail; + } else if (!uuid_is_null(header.uuid_parent)) { + logout("parent uuid != 0, unsupported\n"); + goto fail; + } + + bs->total_sectors = header.disk_size / SECTOR_SIZE; + + s->block_size = header.block_size; + s->block_sectors = header.block_size / SECTOR_SIZE; + s->bmap_sector = header.offset_bmap / SECTOR_SIZE; + s->header = header; + + bmap_size = header.blocks_in_image * sizeof(uint32_t); + bmap_size = (bmap_size + SECTOR_SIZE - 1) / SECTOR_SIZE; + if (bmap_size > 0) { + s->bmap = g_malloc(bmap_size * SECTOR_SIZE); + } + if (bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size) < 0) { + goto fail_free_bmap; + } + + /* Disable migration when vdi images are used */ + error_set(&s->migration_blocker, + QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + "vdi", bs->device_name, "live migration"); + migrate_add_blocker(s->migration_blocker); + + return 0; + + fail_free_bmap: + g_free(s->bmap); + + fail: + return -1; +} + +static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) +{ + /* TODO: Check for too large sector_num (in bdrv_is_allocated or here). */ + BDRVVdiState *s = (BDRVVdiState *)bs->opaque; + size_t bmap_index = sector_num / s->block_sectors; + size_t sector_in_block = sector_num % s->block_sectors; + int n_sectors = s->block_sectors - sector_in_block; + uint32_t bmap_entry = le32_to_cpu(s->bmap[bmap_index]); + logout("%p, %" PRId64 ", %d, %p\n", bs, sector_num, nb_sectors, pnum); + if (n_sectors > nb_sectors) { + n_sectors = nb_sectors; + } + *pnum = n_sectors; + return VDI_IS_ALLOCATED(bmap_entry); +} + +static int vdi_co_read(BlockDriverState *bs, + int64_t sector_num, uint8_t *buf, int nb_sectors) +{ + BDRVVdiState *s = bs->opaque; + uint32_t bmap_entry; + uint32_t block_index; + uint32_t sector_in_block; + uint32_t n_sectors; + int ret = 0; + + logout("\n"); + + while (ret >= 0 && nb_sectors > 0) { + block_index = sector_num / s->block_sectors; + sector_in_block = sector_num % s->block_sectors; + n_sectors = s->block_sectors - sector_in_block; + if (n_sectors > nb_sectors) { + n_sectors = nb_sectors; + } + + logout("will read %u sectors starting at sector %" PRIu64 "\n", + n_sectors, sector_num); + + /* prepare next AIO request */ + bmap_entry = le32_to_cpu(s->bmap[block_index]); + if (!VDI_IS_ALLOCATED(bmap_entry)) { + /* Block not allocated, return zeros, no need to wait. */ + memset(buf, 0, n_sectors * SECTOR_SIZE); + ret = 0; + } else { + uint64_t offset = s->header.offset_data / SECTOR_SIZE + + (uint64_t)bmap_entry * s->block_sectors + + sector_in_block; + ret = bdrv_read(bs->file, offset, buf, n_sectors); + } + logout("%u sectors read\n", n_sectors); + + nb_sectors -= n_sectors; + sector_num += n_sectors; + buf += n_sectors * SECTOR_SIZE; + } + + return ret; +} + +static int vdi_co_write(BlockDriverState *bs, + int64_t sector_num, const uint8_t *buf, int nb_sectors) +{ + BDRVVdiState *s = bs->opaque; + uint32_t bmap_entry; + uint32_t block_index; + uint32_t sector_in_block; + uint32_t n_sectors; + uint32_t bmap_first = VDI_UNALLOCATED; + uint32_t bmap_last = VDI_UNALLOCATED; + uint8_t *block = NULL; + int ret = 0; + + logout("\n"); + + while (ret >= 0 && nb_sectors > 0) { + block_index = sector_num / s->block_sectors; + sector_in_block = sector_num % s->block_sectors; + n_sectors = s->block_sectors - sector_in_block; + if (n_sectors > nb_sectors) { + n_sectors = nb_sectors; + } + + logout("will write %u sectors starting at sector %" PRIu64 "\n", + n_sectors, sector_num); + + /* prepare next AIO request */ + bmap_entry = le32_to_cpu(s->bmap[block_index]); + if (!VDI_IS_ALLOCATED(bmap_entry)) { + /* Allocate new block and write to it. */ + uint64_t offset; + bmap_entry = s->header.blocks_allocated; + s->bmap[block_index] = cpu_to_le32(bmap_entry); + s->header.blocks_allocated++; + offset = s->header.offset_data / SECTOR_SIZE + + (uint64_t)bmap_entry * s->block_sectors; + if (block == NULL) { + block = g_malloc(s->block_size); + bmap_first = block_index; + } + bmap_last = block_index; + /* Copy data to be written to new block and zero unused parts. */ + memset(block, 0, sector_in_block * SECTOR_SIZE); + memcpy(block + sector_in_block * SECTOR_SIZE, + buf, n_sectors * SECTOR_SIZE); + memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0, + (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE); + ret = bdrv_write(bs->file, offset, block, s->block_sectors); + } else { + uint64_t offset = s->header.offset_data / SECTOR_SIZE + + (uint64_t)bmap_entry * s->block_sectors + + sector_in_block; + ret = bdrv_write(bs->file, offset, buf, n_sectors); + } + + nb_sectors -= n_sectors; + sector_num += n_sectors; + buf += n_sectors * SECTOR_SIZE; + + logout("%u sectors written\n", n_sectors); + } + + logout("finished data write\n"); + if (ret < 0) { + return ret; + } + + if (block) { + /* One or more new blocks were allocated. */ + VdiHeader *header = (VdiHeader *) block; + uint8_t *base; + uint64_t offset; + + logout("now writing modified header\n"); + assert(VDI_IS_ALLOCATED(bmap_first)); + *header = s->header; + vdi_header_to_le(header); + ret = bdrv_write(bs->file, 0, block, 1); + g_free(block); + block = NULL; + + if (ret < 0) { + return ret; + } + + logout("now writing modified block map entry %u...%u\n", + bmap_first, bmap_last); + /* Write modified sectors from block map. */ + bmap_first /= (SECTOR_SIZE / sizeof(uint32_t)); + bmap_last /= (SECTOR_SIZE / sizeof(uint32_t)); + n_sectors = bmap_last - bmap_first + 1; + offset = s->bmap_sector + bmap_first; + base = ((uint8_t *)&s->bmap[0]) + bmap_first * SECTOR_SIZE; + logout("will write %u block map sectors starting from entry %u\n", + n_sectors, bmap_first); + ret = bdrv_write(bs->file, offset, base, n_sectors); + } + + return ret; +} + +static int vdi_create(const char *filename, QEMUOptionParameter *options) +{ + int fd; + int result = 0; + uint64_t bytes = 0; + uint32_t blocks; + size_t block_size = DEFAULT_CLUSTER_SIZE; + uint32_t image_type = VDI_TYPE_DYNAMIC; + VdiHeader header; + size_t i; + size_t bmap_size; + uint32_t *bmap; + + logout("\n"); + + /* Read out options. */ + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + bytes = options->value.n; +#if defined(CONFIG_VDI_BLOCK_SIZE) + } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { + if (options->value.n) { + /* TODO: Additional checks (SECTOR_SIZE * 2^n, ...). */ + block_size = options->value.n; + } +#endif +#if defined(CONFIG_VDI_STATIC_IMAGE) + } else if (!strcmp(options->name, BLOCK_OPT_STATIC)) { + if (options->value.n) { + image_type = VDI_TYPE_STATIC; + } +#endif + } + options++; + } + + fd = qemu_open(filename, + O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, + 0644); + if (fd < 0) { + return -errno; + } + + /* We need enough blocks to store the given disk size, + so always round up. */ + blocks = (bytes + block_size - 1) / block_size; + + bmap_size = blocks * sizeof(uint32_t); + bmap_size = ((bmap_size + SECTOR_SIZE - 1) & ~(SECTOR_SIZE -1)); + + memset(&header, 0, sizeof(header)); + pstrcpy(header.text, sizeof(header.text), VDI_TEXT); + header.signature = VDI_SIGNATURE; + header.version = VDI_VERSION_1_1; + header.header_size = 0x180; + header.image_type = image_type; + header.offset_bmap = 0x200; + header.offset_data = 0x200 + bmap_size; + header.sector_size = SECTOR_SIZE; + header.disk_size = bytes; + header.block_size = block_size; + header.blocks_in_image = blocks; + if (image_type == VDI_TYPE_STATIC) { + header.blocks_allocated = blocks; + } + uuid_generate(header.uuid_image); + uuid_generate(header.uuid_last_snap); + /* There is no need to set header.uuid_link or header.uuid_parent here. */ +#if defined(CONFIG_VDI_DEBUG) + vdi_header_print(&header); +#endif + vdi_header_to_le(&header); + if (write(fd, &header, sizeof(header)) < 0) { + result = -errno; + } + + bmap = NULL; + if (bmap_size > 0) { + bmap = (uint32_t *)g_malloc0(bmap_size); + } + for (i = 0; i < blocks; i++) { + if (image_type == VDI_TYPE_STATIC) { + bmap[i] = i; + } else { + bmap[i] = VDI_UNALLOCATED; + } + } + if (write(fd, bmap, bmap_size) < 0) { + result = -errno; + } + g_free(bmap); + if (image_type == VDI_TYPE_STATIC) { + if (ftruncate(fd, sizeof(header) + bmap_size + blocks * block_size)) { + result = -errno; + } + } + + if (close(fd) < 0) { + result = -errno; + } + + return result; +} + +static void vdi_close(BlockDriverState *bs) +{ + BDRVVdiState *s = bs->opaque; + + g_free(s->bmap); + + migrate_del_blocker(s->migration_blocker); + error_free(s->migration_blocker); +} + +static QEMUOptionParameter vdi_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, +#if defined(CONFIG_VDI_BLOCK_SIZE) + { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = OPT_SIZE, + .help = "VDI cluster (block) size", + .value = { .n = DEFAULT_CLUSTER_SIZE }, + }, +#endif +#if defined(CONFIG_VDI_STATIC_IMAGE) + { + .name = BLOCK_OPT_STATIC, + .type = OPT_FLAG, + .help = "VDI static (pre-allocated) image" + }, +#endif + /* TODO: An additional option to set UUID values might be useful. */ + { NULL } +}; + +static BlockDriver bdrv_vdi = { + .format_name = "vdi", + .instance_size = sizeof(BDRVVdiState), + .bdrv_probe = vdi_probe, + .bdrv_open = vdi_open, + .bdrv_close = vdi_close, + .bdrv_create = vdi_create, + .bdrv_co_is_allocated = vdi_co_is_allocated, + .bdrv_make_empty = vdi_make_empty, + + .bdrv_read = vdi_co_read, +#if defined(CONFIG_VDI_WRITE) + .bdrv_write = vdi_co_write, +#endif + + .bdrv_get_info = vdi_get_info, + + .create_options = vdi_create_options, + .bdrv_check = vdi_check, +}; + +static void bdrv_vdi_init(void) +{ + logout("\n"); + bdrv_register(&bdrv_vdi); +} + +block_init(bdrv_vdi_init); diff --git a/block/vmdk.c b/block/vmdk.c new file mode 100644 index 000000000..bba4c61a7 --- /dev/null +++ b/block/vmdk.c @@ -0,0 +1,1665 @@ +/* + * Block driver for the VMDK format + * + * Copyright (c) 2004 Fabrice Bellard + * Copyright (c) 2005 Filip Navara + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" +#include "migration.h" +#include <zlib.h> + +#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D') +#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V') +#define VMDK4_COMPRESSION_DEFLATE 1 +#define VMDK4_FLAG_RGD (1 << 1) +#define VMDK4_FLAG_COMPRESS (1 << 16) +#define VMDK4_FLAG_MARKER (1 << 17) +#define VMDK4_GD_AT_END 0xffffffffffffffffULL + +typedef struct { + uint32_t version; + uint32_t flags; + uint32_t disk_sectors; + uint32_t granularity; + uint32_t l1dir_offset; + uint32_t l1dir_size; + uint32_t file_sectors; + uint32_t cylinders; + uint32_t heads; + uint32_t sectors_per_track; +} VMDK3Header; + +typedef struct { + uint32_t version; + uint32_t flags; + int64_t capacity; + int64_t granularity; + int64_t desc_offset; + int64_t desc_size; + int32_t num_gtes_per_gte; + int64_t rgd_offset; + int64_t gd_offset; + int64_t grain_offset; + char filler[1]; + char check_bytes[4]; + uint16_t compressAlgorithm; +} QEMU_PACKED VMDK4Header; + +#define L2_CACHE_SIZE 16 + +typedef struct VmdkExtent { + BlockDriverState *file; + bool flat; + bool compressed; + bool has_marker; + int64_t sectors; + int64_t end_sector; + int64_t flat_start_offset; + int64_t l1_table_offset; + int64_t l1_backup_table_offset; + uint32_t *l1_table; + uint32_t *l1_backup_table; + unsigned int l1_size; + uint32_t l1_entry_sectors; + + unsigned int l2_size; + uint32_t *l2_cache; + uint32_t l2_cache_offsets[L2_CACHE_SIZE]; + uint32_t l2_cache_counts[L2_CACHE_SIZE]; + + unsigned int cluster_sectors; +} VmdkExtent; + +typedef struct BDRVVmdkState { + CoMutex lock; + int desc_offset; + bool cid_updated; + uint32_t parent_cid; + int num_extents; + /* Extent array with num_extents entries, ascend ordered by address */ + VmdkExtent *extents; + Error *migration_blocker; +} BDRVVmdkState; + +typedef struct VmdkMetaData { + uint32_t offset; + unsigned int l1_index; + unsigned int l2_index; + unsigned int l2_offset; + int valid; +} VmdkMetaData; + +typedef struct VmdkGrainMarker { + uint64_t lba; + uint32_t size; + uint8_t data[0]; +} VmdkGrainMarker; + +enum { + MARKER_END_OF_STREAM = 0, + MARKER_GRAIN_TABLE = 1, + MARKER_GRAIN_DIRECTORY = 2, + MARKER_FOOTER = 3, +}; + +static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + uint32_t magic; + + if (buf_size < 4) { + return 0; + } + magic = be32_to_cpu(*(uint32_t *)buf); + if (magic == VMDK3_MAGIC || + magic == VMDK4_MAGIC) { + return 100; + } else { + const char *p = (const char *)buf; + const char *end = p + buf_size; + while (p < end) { + if (*p == '#') { + /* skip comment line */ + while (p < end && *p != '\n') { + p++; + } + p++; + continue; + } + if (*p == ' ') { + while (p < end && *p == ' ') { + p++; + } + /* skip '\r' if windows line endings used. */ + if (p < end && *p == '\r') { + p++; + } + /* only accept blank lines before 'version=' line */ + if (p == end || *p != '\n') { + return 0; + } + p++; + continue; + } + if (end - p >= strlen("version=X\n")) { + if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 || + strncmp("version=2\n", p, strlen("version=2\n")) == 0) { + return 100; + } + } + if (end - p >= strlen("version=X\r\n")) { + if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 || + strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0) { + return 100; + } + } + return 0; + } + return 0; + } +} + +#define CHECK_CID 1 + +#define SECTOR_SIZE 512 +#define DESC_SIZE (20 * SECTOR_SIZE) /* 20 sectors of 512 bytes each */ +#define BUF_SIZE 4096 +#define HEADER_SIZE 512 /* first sector of 512 bytes */ + +static void vmdk_free_extents(BlockDriverState *bs) +{ + int i; + BDRVVmdkState *s = bs->opaque; + VmdkExtent *e; + + for (i = 0; i < s->num_extents; i++) { + e = &s->extents[i]; + g_free(e->l1_table); + g_free(e->l2_cache); + g_free(e->l1_backup_table); + if (e->file != bs->file) { + bdrv_delete(e->file); + } + } + g_free(s->extents); +} + +static void vmdk_free_last_extent(BlockDriverState *bs) +{ + BDRVVmdkState *s = bs->opaque; + + if (s->num_extents == 0) { + return; + } + s->num_extents--; + s->extents = g_realloc(s->extents, s->num_extents * sizeof(VmdkExtent)); +} + +static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) +{ + char desc[DESC_SIZE]; + uint32_t cid = 0xffffffff; + const char *p_name, *cid_str; + size_t cid_str_size; + BDRVVmdkState *s = bs->opaque; + int ret; + + ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE); + if (ret < 0) { + return 0; + } + + if (parent) { + cid_str = "parentCID"; + cid_str_size = sizeof("parentCID"); + } else { + cid_str = "CID"; + cid_str_size = sizeof("CID"); + } + + desc[DESC_SIZE - 1] = '\0'; + p_name = strstr(desc, cid_str); + if (p_name != NULL) { + p_name += cid_str_size; + sscanf(p_name, "%x", &cid); + } + + return cid; +} + +static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) +{ + char desc[DESC_SIZE], tmp_desc[DESC_SIZE]; + char *p_name, *tmp_str; + BDRVVmdkState *s = bs->opaque; + int ret; + + ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE); + if (ret < 0) { + return ret; + } + + desc[DESC_SIZE - 1] = '\0'; + tmp_str = strstr(desc, "parentCID"); + if (tmp_str == NULL) { + return -EINVAL; + } + + pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str); + p_name = strstr(desc, "CID"); + if (p_name != NULL) { + p_name += sizeof("CID"); + snprintf(p_name, sizeof(desc) - (p_name - desc), "%x\n", cid); + pstrcat(desc, sizeof(desc), tmp_desc); + } + + ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE); + if (ret < 0) { + return ret; + } + + return 0; +} + +static int vmdk_is_cid_valid(BlockDriverState *bs) +{ +#ifdef CHECK_CID + BDRVVmdkState *s = bs->opaque; + BlockDriverState *p_bs = bs->backing_hd; + uint32_t cur_pcid; + + if (p_bs) { + cur_pcid = vmdk_read_cid(p_bs, 0); + if (s->parent_cid != cur_pcid) { + /* CID not valid */ + return 0; + } + } +#endif + /* CID valid */ + return 1; +} + +static int vmdk_parent_open(BlockDriverState *bs) +{ + char *p_name; + char desc[DESC_SIZE + 1]; + BDRVVmdkState *s = bs->opaque; + int ret; + + desc[DESC_SIZE] = '\0'; + ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE); + if (ret < 0) { + return ret; + } + + p_name = strstr(desc, "parentFileNameHint"); + if (p_name != NULL) { + char *end_name; + + p_name += sizeof("parentFileNameHint") + 1; + end_name = strchr(p_name, '\"'); + if (end_name == NULL) { + return -EINVAL; + } + if ((end_name - p_name) > sizeof(bs->backing_file) - 1) { + return -EINVAL; + } + + pstrcpy(bs->backing_file, end_name - p_name + 1, p_name); + } + + return 0; +} + +/* Create and append extent to the extent array. Return the added VmdkExtent + * address. return NULL if allocation failed. */ +static VmdkExtent *vmdk_add_extent(BlockDriverState *bs, + BlockDriverState *file, bool flat, int64_t sectors, + int64_t l1_offset, int64_t l1_backup_offset, + uint32_t l1_size, + int l2_size, unsigned int cluster_sectors) +{ + VmdkExtent *extent; + BDRVVmdkState *s = bs->opaque; + + s->extents = g_realloc(s->extents, + (s->num_extents + 1) * sizeof(VmdkExtent)); + extent = &s->extents[s->num_extents]; + s->num_extents++; + + memset(extent, 0, sizeof(VmdkExtent)); + extent->file = file; + extent->flat = flat; + extent->sectors = sectors; + extent->l1_table_offset = l1_offset; + extent->l1_backup_table_offset = l1_backup_offset; + extent->l1_size = l1_size; + extent->l1_entry_sectors = l2_size * cluster_sectors; + extent->l2_size = l2_size; + extent->cluster_sectors = cluster_sectors; + + if (s->num_extents > 1) { + extent->end_sector = (*(extent - 1)).end_sector + extent->sectors; + } else { + extent->end_sector = extent->sectors; + } + bs->total_sectors = extent->end_sector; + return extent; +} + +static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent) +{ + int ret; + int l1_size, i; + + /* read the L1 table */ + l1_size = extent->l1_size * sizeof(uint32_t); + extent->l1_table = g_malloc(l1_size); + ret = bdrv_pread(extent->file, + extent->l1_table_offset, + extent->l1_table, + l1_size); + if (ret < 0) { + goto fail_l1; + } + for (i = 0; i < extent->l1_size; i++) { + le32_to_cpus(&extent->l1_table[i]); + } + + if (extent->l1_backup_table_offset) { + extent->l1_backup_table = g_malloc(l1_size); + ret = bdrv_pread(extent->file, + extent->l1_backup_table_offset, + extent->l1_backup_table, + l1_size); + if (ret < 0) { + goto fail_l1b; + } + for (i = 0; i < extent->l1_size; i++) { + le32_to_cpus(&extent->l1_backup_table[i]); + } + } + + extent->l2_cache = + g_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t)); + return 0; + fail_l1b: + g_free(extent->l1_backup_table); + fail_l1: + g_free(extent->l1_table); + return ret; +} + +static int vmdk_open_vmdk3(BlockDriverState *bs, + BlockDriverState *file, + int flags) +{ + int ret; + uint32_t magic; + VMDK3Header header; + VmdkExtent *extent; + + ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header)); + if (ret < 0) { + return ret; + } + extent = vmdk_add_extent(bs, + bs->file, false, + le32_to_cpu(header.disk_sectors), + le32_to_cpu(header.l1dir_offset) << 9, + 0, 1 << 6, 1 << 9, + le32_to_cpu(header.granularity)); + ret = vmdk_init_tables(bs, extent); + if (ret) { + /* free extent allocated by vmdk_add_extent */ + vmdk_free_last_extent(bs); + } + return ret; +} + +static int vmdk_open_desc_file(BlockDriverState *bs, int flags, + int64_t desc_offset); + +static int vmdk_open_vmdk4(BlockDriverState *bs, + BlockDriverState *file, + int flags) +{ + int ret; + uint32_t magic; + uint32_t l1_size, l1_entry_sectors; + VMDK4Header header; + VmdkExtent *extent; + int64_t l1_backup_offset = 0; + + ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header)); + if (ret < 0) { + return ret; + } + if (header.capacity == 0 && header.desc_offset) { + return vmdk_open_desc_file(bs, flags, header.desc_offset << 9); + } + + if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) { + /* + * The footer takes precedence over the header, so read it in. The + * footer starts at offset -1024 from the end: One sector for the + * footer, and another one for the end-of-stream marker. + */ + struct { + struct { + uint64_t val; + uint32_t size; + uint32_t type; + uint8_t pad[512 - 16]; + } QEMU_PACKED footer_marker; + + uint32_t magic; + VMDK4Header header; + uint8_t pad[512 - 4 - sizeof(VMDK4Header)]; + + struct { + uint64_t val; + uint32_t size; + uint32_t type; + uint8_t pad[512 - 16]; + } QEMU_PACKED eos_marker; + } QEMU_PACKED footer; + + ret = bdrv_pread(file, + bs->file->total_sectors * 512 - 1536, + &footer, sizeof(footer)); + if (ret < 0) { + return ret; + } + + /* Some sanity checks for the footer */ + if (be32_to_cpu(footer.magic) != VMDK4_MAGIC || + le32_to_cpu(footer.footer_marker.size) != 0 || + le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER || + le64_to_cpu(footer.eos_marker.val) != 0 || + le32_to_cpu(footer.eos_marker.size) != 0 || + le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM) + { + return -EINVAL; + } + + header = footer.header; + } + + l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gte) + * le64_to_cpu(header.granularity); + if (l1_entry_sectors == 0) { + return -EINVAL; + } + l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1) + / l1_entry_sectors; + if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) { + l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9; + } + extent = vmdk_add_extent(bs, file, false, + le64_to_cpu(header.capacity), + le64_to_cpu(header.gd_offset) << 9, + l1_backup_offset, + l1_size, + le32_to_cpu(header.num_gtes_per_gte), + le64_to_cpu(header.granularity)); + extent->compressed = + le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE; + extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER; + ret = vmdk_init_tables(bs, extent); + if (ret) { + /* free extent allocated by vmdk_add_extent */ + vmdk_free_last_extent(bs); + } + return ret; +} + +/* find an option value out of descriptor file */ +static int vmdk_parse_description(const char *desc, const char *opt_name, + char *buf, int buf_size) +{ + char *opt_pos, *opt_end; + const char *end = desc + strlen(desc); + + opt_pos = strstr(desc, opt_name); + if (!opt_pos) { + return -1; + } + /* Skip "=\"" following opt_name */ + opt_pos += strlen(opt_name) + 2; + if (opt_pos >= end) { + return -1; + } + opt_end = opt_pos; + while (opt_end < end && *opt_end != '"') { + opt_end++; + } + if (opt_end == end || buf_size < opt_end - opt_pos + 1) { + return -1; + } + pstrcpy(buf, opt_end - opt_pos + 1, opt_pos); + return 0; +} + +/* Open an extent file and append to bs array */ +static int vmdk_open_sparse(BlockDriverState *bs, + BlockDriverState *file, + int flags) +{ + uint32_t magic; + + if (bdrv_pread(file, 0, &magic, sizeof(magic)) != sizeof(magic)) { + return -EIO; + } + + magic = be32_to_cpu(magic); + switch (magic) { + case VMDK3_MAGIC: + return vmdk_open_vmdk3(bs, file, flags); + break; + case VMDK4_MAGIC: + return vmdk_open_vmdk4(bs, file, flags); + break; + default: + return -EINVAL; + break; + } +} + +static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, + const char *desc_file_path) +{ + int ret; + char access[11]; + char type[11]; + char fname[512]; + const char *p = desc; + int64_t sectors = 0; + int64_t flat_offset; + char extent_path[PATH_MAX]; + BlockDriverState *extent_file; + + while (*p) { + /* parse extent line: + * RW [size in sectors] FLAT "file-name.vmdk" OFFSET + * or + * RW [size in sectors] SPARSE "file-name.vmdk" + */ + flat_offset = -1; + ret = sscanf(p, "%10s %" SCNd64 " %10s %511s %" SCNd64, + access, §ors, type, fname, &flat_offset); + if (ret < 4 || strcmp(access, "RW")) { + goto next_line; + } else if (!strcmp(type, "FLAT")) { + if (ret != 5 || flat_offset < 0) { + return -EINVAL; + } + } else if (ret != 4) { + return -EINVAL; + } + + /* trim the quotation marks around */ + if (fname[0] == '"') { + memmove(fname, fname + 1, strlen(fname)); + if (strlen(fname) <= 1 || fname[strlen(fname) - 1] != '"') { + return -EINVAL; + } + fname[strlen(fname) - 1] = '\0'; + } + if (sectors <= 0 || + (strcmp(type, "FLAT") && strcmp(type, "SPARSE")) || + (strcmp(access, "RW"))) { + goto next_line; + } + + path_combine(extent_path, sizeof(extent_path), + desc_file_path, fname); + ret = bdrv_file_open(&extent_file, extent_path, bs->open_flags); + if (ret) { + return ret; + } + + /* save to extents array */ + if (!strcmp(type, "FLAT")) { + /* FLAT extent */ + VmdkExtent *extent; + + extent = vmdk_add_extent(bs, extent_file, true, sectors, + 0, 0, 0, 0, sectors); + extent->flat_start_offset = flat_offset << 9; + } else if (!strcmp(type, "SPARSE")) { + /* SPARSE extent */ + ret = vmdk_open_sparse(bs, extent_file, bs->open_flags); + if (ret) { + bdrv_delete(extent_file); + return ret; + } + } else { + fprintf(stderr, + "VMDK: Not supported extent type \"%s\""".\n", type); + return -ENOTSUP; + } +next_line: + /* move to next line */ + while (*p && *p != '\n') { + p++; + } + p++; + } + return 0; +} + +static int vmdk_open_desc_file(BlockDriverState *bs, int flags, + int64_t desc_offset) +{ + int ret; + char buf[2048]; + char ct[128]; + BDRVVmdkState *s = bs->opaque; + + ret = bdrv_pread(bs->file, desc_offset, buf, sizeof(buf)); + if (ret < 0) { + return ret; + } + buf[2047] = '\0'; + if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) { + return -EINVAL; + } + if (strcmp(ct, "monolithicFlat") && + strcmp(ct, "twoGbMaxExtentSparse") && + strcmp(ct, "twoGbMaxExtentFlat")) { + fprintf(stderr, + "VMDK: Not supported image type \"%s\""".\n", ct); + return -ENOTSUP; + } + s->desc_offset = 0; + return vmdk_parse_extents(buf, bs, bs->file->filename); +} + +static int vmdk_open(BlockDriverState *bs, int flags) +{ + int ret; + BDRVVmdkState *s = bs->opaque; + + if (vmdk_open_sparse(bs, bs->file, flags) == 0) { + s->desc_offset = 0x200; + } else { + ret = vmdk_open_desc_file(bs, flags, 0); + if (ret) { + goto fail; + } + } + /* try to open parent images, if exist */ + ret = vmdk_parent_open(bs); + if (ret) { + goto fail; + } + s->parent_cid = vmdk_read_cid(bs, 1); + qemu_co_mutex_init(&s->lock); + + /* Disable migration when VMDK images are used */ + error_set(&s->migration_blocker, + QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + "vmdk", bs->device_name, "live migration"); + migrate_add_blocker(s->migration_blocker); + + return 0; + +fail: + vmdk_free_extents(bs); + return ret; +} + +static int get_whole_cluster(BlockDriverState *bs, + VmdkExtent *extent, + uint64_t cluster_offset, + uint64_t offset, + bool allocate) +{ + /* 128 sectors * 512 bytes each = grain size 64KB */ + uint8_t whole_grain[extent->cluster_sectors * 512]; + + /* we will be here if it's first write on non-exist grain(cluster). + * try to read from parent image, if exist */ + if (bs->backing_hd) { + int ret; + + if (!vmdk_is_cid_valid(bs)) { + return -1; + } + + /* floor offset to cluster */ + offset -= offset % (extent->cluster_sectors * 512); + ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain, + extent->cluster_sectors); + if (ret < 0) { + return -1; + } + + /* Write grain only into the active image */ + ret = bdrv_write(extent->file, cluster_offset, whole_grain, + extent->cluster_sectors); + if (ret < 0) { + return -1; + } + } + return 0; +} + +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data) +{ + /* update L2 table */ + if (bdrv_pwrite_sync( + extent->file, + ((int64_t)m_data->l2_offset * 512) + + (m_data->l2_index * sizeof(m_data->offset)), + &(m_data->offset), + sizeof(m_data->offset) + ) < 0) { + return -1; + } + /* update backup L2 table */ + if (extent->l1_backup_table_offset != 0) { + m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; + if (bdrv_pwrite_sync( + extent->file, + ((int64_t)m_data->l2_offset * 512) + + (m_data->l2_index * sizeof(m_data->offset)), + &(m_data->offset), sizeof(m_data->offset) + ) < 0) { + return -1; + } + } + + return 0; +} + +static int get_cluster_offset(BlockDriverState *bs, + VmdkExtent *extent, + VmdkMetaData *m_data, + uint64_t offset, + int allocate, + uint64_t *cluster_offset) +{ + unsigned int l1_index, l2_offset, l2_index; + int min_index, i, j; + uint32_t min_count, *l2_table, tmp = 0; + + if (m_data) { + m_data->valid = 0; + } + if (extent->flat) { + *cluster_offset = extent->flat_start_offset; + return 0; + } + + offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; + l1_index = (offset >> 9) / extent->l1_entry_sectors; + if (l1_index >= extent->l1_size) { + return -1; + } + l2_offset = extent->l1_table[l1_index]; + if (!l2_offset) { + return -1; + } + for (i = 0; i < L2_CACHE_SIZE; i++) { + if (l2_offset == extent->l2_cache_offsets[i]) { + /* increment the hit count */ + if (++extent->l2_cache_counts[i] == 0xffffffff) { + for (j = 0; j < L2_CACHE_SIZE; j++) { + extent->l2_cache_counts[j] >>= 1; + } + } + l2_table = extent->l2_cache + (i * extent->l2_size); + goto found; + } + } + /* not found: load a new entry in the least used one */ + min_index = 0; + min_count = 0xffffffff; + for (i = 0; i < L2_CACHE_SIZE; i++) { + if (extent->l2_cache_counts[i] < min_count) { + min_count = extent->l2_cache_counts[i]; + min_index = i; + } + } + l2_table = extent->l2_cache + (min_index * extent->l2_size); + if (bdrv_pread( + extent->file, + (int64_t)l2_offset * 512, + l2_table, + extent->l2_size * sizeof(uint32_t) + ) != extent->l2_size * sizeof(uint32_t)) { + return -1; + } + + extent->l2_cache_offsets[min_index] = l2_offset; + extent->l2_cache_counts[min_index] = 1; + found: + l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; + *cluster_offset = le32_to_cpu(l2_table[l2_index]); + + if (!*cluster_offset) { + if (!allocate) { + return -1; + } + + /* Avoid the L2 tables update for the images that have snapshots. */ + *cluster_offset = bdrv_getlength(extent->file); + if (!extent->compressed) { + bdrv_truncate( + extent->file, + *cluster_offset + (extent->cluster_sectors << 9) + ); + } + + *cluster_offset >>= 9; + tmp = cpu_to_le32(*cluster_offset); + l2_table[l2_index] = tmp; + + /* First of all we write grain itself, to avoid race condition + * that may to corrupt the image. + * This problem may occur because of insufficient space on host disk + * or inappropriate VM shutdown. + */ + if (get_whole_cluster( + bs, extent, *cluster_offset, offset, allocate) == -1) { + return -1; + } + + if (m_data) { + m_data->offset = tmp; + m_data->l1_index = l1_index; + m_data->l2_index = l2_index; + m_data->l2_offset = l2_offset; + m_data->valid = 1; + } + } + *cluster_offset <<= 9; + return 0; +} + +static VmdkExtent *find_extent(BDRVVmdkState *s, + int64_t sector_num, VmdkExtent *start_hint) +{ + VmdkExtent *extent = start_hint; + + if (!extent) { + extent = &s->extents[0]; + } + while (extent < &s->extents[s->num_extents]) { + if (sector_num < extent->end_sector) { + return extent; + } + extent++; + } + return NULL; +} + +static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) +{ + BDRVVmdkState *s = bs->opaque; + int64_t index_in_cluster, n, ret; + uint64_t offset; + VmdkExtent *extent; + + extent = find_extent(s, sector_num, NULL); + if (!extent) { + return 0; + } + qemu_co_mutex_lock(&s->lock); + ret = get_cluster_offset(bs, extent, NULL, + sector_num * 512, 0, &offset); + qemu_co_mutex_unlock(&s->lock); + /* get_cluster_offset returning 0 means success */ + ret = !ret; + + index_in_cluster = sector_num % extent->cluster_sectors; + n = extent->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; + } + *pnum = n; + return ret; +} + +static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset, + int64_t offset_in_cluster, const uint8_t *buf, + int nb_sectors, int64_t sector_num) +{ + int ret; + VmdkGrainMarker *data = NULL; + uLongf buf_len; + const uint8_t *write_buf = buf; + int write_len = nb_sectors * 512; + + if (extent->compressed) { + if (!extent->has_marker) { + ret = -EINVAL; + goto out; + } + buf_len = (extent->cluster_sectors << 9) * 2; + data = g_malloc(buf_len + sizeof(VmdkGrainMarker)); + if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK || + buf_len == 0) { + ret = -EINVAL; + goto out; + } + data->lba = sector_num; + data->size = buf_len; + write_buf = (uint8_t *)data; + write_len = buf_len + sizeof(VmdkGrainMarker); + } + ret = bdrv_pwrite(extent->file, + cluster_offset + offset_in_cluster, + write_buf, + write_len); + if (ret != write_len) { + ret = ret < 0 ? ret : -EIO; + goto out; + } + ret = 0; + out: + g_free(data); + return ret; +} + +static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, + int64_t offset_in_cluster, uint8_t *buf, + int nb_sectors) +{ + int ret; + int cluster_bytes, buf_bytes; + uint8_t *cluster_buf, *compressed_data; + uint8_t *uncomp_buf; + uint32_t data_len; + VmdkGrainMarker *marker; + uLongf buf_len; + + + if (!extent->compressed) { + ret = bdrv_pread(extent->file, + cluster_offset + offset_in_cluster, + buf, nb_sectors * 512); + if (ret == nb_sectors * 512) { + return 0; + } else { + return -EIO; + } + } + cluster_bytes = extent->cluster_sectors * 512; + /* Read two clusters in case GrainMarker + compressed data > one cluster */ + buf_bytes = cluster_bytes * 2; + cluster_buf = g_malloc(buf_bytes); + uncomp_buf = g_malloc(cluster_bytes); + ret = bdrv_pread(extent->file, + cluster_offset, + cluster_buf, buf_bytes); + if (ret < 0) { + goto out; + } + compressed_data = cluster_buf; + buf_len = cluster_bytes; + data_len = cluster_bytes; + if (extent->has_marker) { + marker = (VmdkGrainMarker *)cluster_buf; + compressed_data = marker->data; + data_len = le32_to_cpu(marker->size); + } + if (!data_len || data_len > buf_bytes) { + ret = -EINVAL; + goto out; + } + ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len); + if (ret != Z_OK) { + ret = -EINVAL; + goto out; + + } + if (offset_in_cluster < 0 || + offset_in_cluster + nb_sectors * 512 > buf_len) { + ret = -EINVAL; + goto out; + } + memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512); + ret = 0; + + out: + g_free(uncomp_buf); + g_free(cluster_buf); + return ret; +} + +static int vmdk_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVVmdkState *s = bs->opaque; + int ret; + uint64_t n, index_in_cluster; + VmdkExtent *extent = NULL; + uint64_t cluster_offset; + + while (nb_sectors > 0) { + extent = find_extent(s, sector_num, extent); + if (!extent) { + return -EIO; + } + ret = get_cluster_offset( + bs, extent, NULL, + sector_num << 9, 0, &cluster_offset); + index_in_cluster = sector_num % extent->cluster_sectors; + n = extent->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; + } + if (ret) { + /* if not allocated, try to read from parent image, if exist */ + if (bs->backing_hd) { + if (!vmdk_is_cid_valid(bs)) { + return -EINVAL; + } + ret = bdrv_read(bs->backing_hd, sector_num, buf, n); + if (ret < 0) { + return ret; + } + } else { + memset(buf, 0, 512 * n); + } + } else { + ret = vmdk_read_extent(extent, + cluster_offset, index_in_cluster * 512, + buf, n); + if (ret) { + return ret; + } + } + nb_sectors -= n; + sector_num += n; + buf += n * 512; + } + return 0; +} + +static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVVmdkState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = vmdk_read(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static int vmdk_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVVmdkState *s = bs->opaque; + VmdkExtent *extent = NULL; + int n, ret; + int64_t index_in_cluster; + uint64_t cluster_offset; + VmdkMetaData m_data; + + if (sector_num > bs->total_sectors) { + fprintf(stderr, + "(VMDK) Wrong offset: sector_num=0x%" PRIx64 + " total_sectors=0x%" PRIx64 "\n", + sector_num, bs->total_sectors); + return -EIO; + } + + while (nb_sectors > 0) { + extent = find_extent(s, sector_num, extent); + if (!extent) { + return -EIO; + } + ret = get_cluster_offset( + bs, + extent, + &m_data, + sector_num << 9, !extent->compressed, + &cluster_offset); + if (extent->compressed) { + if (ret == 0) { + /* Refuse write to allocated cluster for streamOptimized */ + fprintf(stderr, + "VMDK: can't write to allocated cluster" + " for streamOptimized\n"); + return -EIO; + } else { + /* allocate */ + ret = get_cluster_offset( + bs, + extent, + &m_data, + sector_num << 9, 1, + &cluster_offset); + } + } + if (ret) { + return -EINVAL; + } + index_in_cluster = sector_num % extent->cluster_sectors; + n = extent->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; + } + + ret = vmdk_write_extent(extent, + cluster_offset, index_in_cluster * 512, + buf, n, sector_num); + if (ret) { + return ret; + } + if (m_data.valid) { + /* update L2 tables */ + if (vmdk_L2update(extent, &m_data) == -1) { + return -EIO; + } + } + nb_sectors -= n; + sector_num += n; + buf += n * 512; + + /* update CID on the first write every time the virtual disk is + * opened */ + if (!s->cid_updated) { + ret = vmdk_write_cid(bs, time(NULL)); + if (ret < 0) { + return ret; + } + s->cid_updated = true; + } + } + return 0; +} + +static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVVmdkState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = vmdk_write(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + + +static int vmdk_create_extent(const char *filename, int64_t filesize, + bool flat, bool compress) +{ + int ret, i; + int fd = 0; + VMDK4Header header; + uint32_t tmp, magic, grains, gd_size, gt_size, gt_count; + + fd = qemu_open(filename, + O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, + 0644); + if (fd < 0) { + return -errno; + } + if (flat) { + ret = ftruncate(fd, filesize); + if (ret < 0) { + ret = -errno; + } + goto exit; + } + magic = cpu_to_be32(VMDK4_MAGIC); + memset(&header, 0, sizeof(header)); + header.version = 1; + header.flags = + 3 | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0); + header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0; + header.capacity = filesize / 512; + header.granularity = 128; + header.num_gtes_per_gte = 512; + + grains = (filesize / 512 + header.granularity - 1) / header.granularity; + gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9; + gt_count = + (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte; + gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9; + + header.desc_offset = 1; + header.desc_size = 20; + header.rgd_offset = header.desc_offset + header.desc_size; + header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count); + header.grain_offset = + ((header.gd_offset + gd_size + (gt_size * gt_count) + + header.granularity - 1) / header.granularity) * + header.granularity; + /* swap endianness for all header fields */ + header.version = cpu_to_le32(header.version); + header.flags = cpu_to_le32(header.flags); + header.capacity = cpu_to_le64(header.capacity); + header.granularity = cpu_to_le64(header.granularity); + header.num_gtes_per_gte = cpu_to_le32(header.num_gtes_per_gte); + header.desc_offset = cpu_to_le64(header.desc_offset); + header.desc_size = cpu_to_le64(header.desc_size); + header.rgd_offset = cpu_to_le64(header.rgd_offset); + header.gd_offset = cpu_to_le64(header.gd_offset); + header.grain_offset = cpu_to_le64(header.grain_offset); + header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm); + + header.check_bytes[0] = 0xa; + header.check_bytes[1] = 0x20; + header.check_bytes[2] = 0xd; + header.check_bytes[3] = 0xa; + + /* write all the data */ + ret = qemu_write_full(fd, &magic, sizeof(magic)); + if (ret != sizeof(magic)) { + ret = -errno; + goto exit; + } + ret = qemu_write_full(fd, &header, sizeof(header)); + if (ret != sizeof(header)) { + ret = -errno; + goto exit; + } + + ret = ftruncate(fd, le64_to_cpu(header.grain_offset) << 9); + if (ret < 0) { + ret = -errno; + goto exit; + } + + /* write grain directory */ + lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET); + for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_size; + i < gt_count; i++, tmp += gt_size) { + ret = qemu_write_full(fd, &tmp, sizeof(tmp)); + if (ret != sizeof(tmp)) { + ret = -errno; + goto exit; + } + } + + /* write backup grain directory */ + lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET); + for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_size; + i < gt_count; i++, tmp += gt_size) { + ret = qemu_write_full(fd, &tmp, sizeof(tmp)); + if (ret != sizeof(tmp)) { + ret = -errno; + goto exit; + } + } + + ret = 0; + exit: + qemu_close(fd); + return ret; +} + +static int filename_decompose(const char *filename, char *path, char *prefix, + char *postfix, size_t buf_len) +{ + const char *p, *q; + + if (filename == NULL || !strlen(filename)) { + fprintf(stderr, "Vmdk: no filename provided.\n"); + return -1; + } + p = strrchr(filename, '/'); + if (p == NULL) { + p = strrchr(filename, '\\'); + } + if (p == NULL) { + p = strrchr(filename, ':'); + } + if (p != NULL) { + p++; + if (p - filename >= buf_len) { + return -1; + } + pstrcpy(path, p - filename + 1, filename); + } else { + p = filename; + path[0] = '\0'; + } + q = strrchr(p, '.'); + if (q == NULL) { + pstrcpy(prefix, buf_len, p); + postfix[0] = '\0'; + } else { + if (q - p >= buf_len) { + return -1; + } + pstrcpy(prefix, q - p + 1, p); + pstrcpy(postfix, buf_len, q); + } + return 0; +} + +static int relative_path(char *dest, int dest_size, + const char *base, const char *target) +{ + int i = 0; + int n = 0; + const char *p, *q; +#ifdef _WIN32 + const char *sep = "\\"; +#else + const char *sep = "/"; +#endif + + if (!(dest && base && target)) { + return -1; + } + if (path_is_absolute(target)) { + dest[dest_size - 1] = '\0'; + strncpy(dest, target, dest_size - 1); + return 0; + } + while (base[i] == target[i]) { + i++; + } + p = &base[i]; + q = &target[i]; + while (*p) { + if (*p == *sep) { + n++; + } + p++; + } + dest[0] = '\0'; + for (; n; n--) { + pstrcat(dest, dest_size, ".."); + pstrcat(dest, dest_size, sep); + } + pstrcat(dest, dest_size, q); + return 0; +} + +static int vmdk_create(const char *filename, QEMUOptionParameter *options) +{ + int fd, idx = 0; + char desc[BUF_SIZE]; + int64_t total_size = 0, filesize; + const char *backing_file = NULL; + const char *fmt = NULL; + int flags = 0; + int ret = 0; + bool flat, split, compress; + char ext_desc_lines[BUF_SIZE] = ""; + char path[PATH_MAX], prefix[PATH_MAX], postfix[PATH_MAX]; + const int64_t split_size = 0x80000000; /* VMDK has constant split size */ + const char *desc_extent_line; + char parent_desc_line[BUF_SIZE] = ""; + uint32_t parent_cid = 0xffffffff; + const char desc_template[] = + "# Disk DescriptorFile\n" + "version=1\n" + "CID=%x\n" + "parentCID=%x\n" + "createType=\"%s\"\n" + "%s" + "\n" + "# Extent description\n" + "%s" + "\n" + "# The Disk Data Base\n" + "#DDB\n" + "\n" + "ddb.virtualHWVersion = \"%d\"\n" + "ddb.geometry.cylinders = \"%" PRId64 "\"\n" + "ddb.geometry.heads = \"16\"\n" + "ddb.geometry.sectors = \"63\"\n" + "ddb.adapterType = \"ide\"\n"; + + if (filename_decompose(filename, path, prefix, postfix, PATH_MAX)) { + return -EINVAL; + } + /* Read out options */ + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + total_size = options->value.n; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { + backing_file = options->value.s; + } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) { + flags |= options->value.n ? BLOCK_FLAG_COMPAT6 : 0; + } else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) { + fmt = options->value.s; + } + options++; + } + if (!fmt) { + /* Default format to monolithicSparse */ + fmt = "monolithicSparse"; + } else if (strcmp(fmt, "monolithicFlat") && + strcmp(fmt, "monolithicSparse") && + strcmp(fmt, "twoGbMaxExtentSparse") && + strcmp(fmt, "twoGbMaxExtentFlat") && + strcmp(fmt, "streamOptimized")) { + fprintf(stderr, "VMDK: Unknown subformat: %s\n", fmt); + return -EINVAL; + } + split = !(strcmp(fmt, "twoGbMaxExtentFlat") && + strcmp(fmt, "twoGbMaxExtentSparse")); + flat = !(strcmp(fmt, "monolithicFlat") && + strcmp(fmt, "twoGbMaxExtentFlat")); + compress = !strcmp(fmt, "streamOptimized"); + if (flat) { + desc_extent_line = "RW %lld FLAT \"%s\" 0\n"; + } else { + desc_extent_line = "RW %lld SPARSE \"%s\"\n"; + } + if (flat && backing_file) { + /* not supporting backing file for flat image */ + return -ENOTSUP; + } + if (backing_file) { + char parent_filename[PATH_MAX]; + BlockDriverState *bs = bdrv_new(""); + ret = bdrv_open(bs, backing_file, 0, NULL); + if (ret != 0) { + bdrv_delete(bs); + return ret; + } + if (strcmp(bs->drv->format_name, "vmdk")) { + bdrv_delete(bs); + return -EINVAL; + } + parent_cid = vmdk_read_cid(bs, 0); + bdrv_delete(bs); + relative_path(parent_filename, sizeof(parent_filename), + filename, backing_file); + snprintf(parent_desc_line, sizeof(parent_desc_line), + "parentFileNameHint=\"%s\"", parent_filename); + } + + /* Create extents */ + filesize = total_size; + while (filesize > 0) { + char desc_line[BUF_SIZE]; + char ext_filename[PATH_MAX]; + char desc_filename[PATH_MAX]; + int64_t size = filesize; + + if (split && size > split_size) { + size = split_size; + } + if (split) { + snprintf(desc_filename, sizeof(desc_filename), "%s-%c%03d%s", + prefix, flat ? 'f' : 's', ++idx, postfix); + } else if (flat) { + snprintf(desc_filename, sizeof(desc_filename), "%s-flat%s", + prefix, postfix); + } else { + snprintf(desc_filename, sizeof(desc_filename), "%s%s", + prefix, postfix); + } + snprintf(ext_filename, sizeof(ext_filename), "%s%s", + path, desc_filename); + + if (vmdk_create_extent(ext_filename, size, flat, compress)) { + return -EINVAL; + } + filesize -= size; + + /* Format description line */ + snprintf(desc_line, sizeof(desc_line), + desc_extent_line, size / 512, desc_filename); + pstrcat(ext_desc_lines, sizeof(ext_desc_lines), desc_line); + } + /* generate descriptor file */ + snprintf(desc, sizeof(desc), desc_template, + (unsigned int)time(NULL), + parent_cid, + fmt, + parent_desc_line, + ext_desc_lines, + (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4), + total_size / (int64_t)(63 * 16 * 512)); + if (split || flat) { + fd = qemu_open(filename, + O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, + 0644); + } else { + fd = qemu_open(filename, + O_WRONLY | O_BINARY | O_LARGEFILE, + 0644); + } + if (fd < 0) { + return -errno; + } + /* the descriptor offset = 0x200 */ + if (!split && !flat && 0x200 != lseek(fd, 0x200, SEEK_SET)) { + ret = -errno; + goto exit; + } + ret = qemu_write_full(fd, desc, strlen(desc)); + if (ret != strlen(desc)) { + ret = -errno; + goto exit; + } + ret = 0; +exit: + qemu_close(fd); + return ret; +} + +static void vmdk_close(BlockDriverState *bs) +{ + BDRVVmdkState *s = bs->opaque; + + vmdk_free_extents(bs); + + migrate_del_blocker(s->migration_blocker); + error_free(s->migration_blocker); +} + +static coroutine_fn int vmdk_co_flush(BlockDriverState *bs) +{ + BDRVVmdkState *s = bs->opaque; + int i, err; + int ret = 0; + + for (i = 0; i < s->num_extents; i++) { + err = bdrv_co_flush(s->extents[i].file); + if (err < 0) { + ret = err; + } + } + return ret; +} + +static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs) +{ + int i; + int64_t ret = 0; + int64_t r; + BDRVVmdkState *s = bs->opaque; + + ret = bdrv_get_allocated_file_size(bs->file); + if (ret < 0) { + return ret; + } + for (i = 0; i < s->num_extents; i++) { + if (s->extents[i].file == bs->file) { + continue; + } + r = bdrv_get_allocated_file_size(s->extents[i].file); + if (r < 0) { + return r; + } + ret += r; + } + return ret; +} + +static QEMUOptionParameter vmdk_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = OPT_STRING, + .help = "File name of a base image" + }, + { + .name = BLOCK_OPT_COMPAT6, + .type = OPT_FLAG, + .help = "VMDK version 6 image" + }, + { + .name = BLOCK_OPT_SUBFMT, + .type = OPT_STRING, + .help = + "VMDK flat extent format, can be one of " + "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} " + }, + { NULL } +}; + +static BlockDriver bdrv_vmdk = { + .format_name = "vmdk", + .instance_size = sizeof(BDRVVmdkState), + .bdrv_probe = vmdk_probe, + .bdrv_open = vmdk_open, + .bdrv_read = vmdk_co_read, + .bdrv_write = vmdk_co_write, + .bdrv_close = vmdk_close, + .bdrv_create = vmdk_create, + .bdrv_co_flush_to_disk = vmdk_co_flush, + .bdrv_co_is_allocated = vmdk_co_is_allocated, + .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size, + + .create_options = vmdk_create_options, +}; + +static void bdrv_vmdk_init(void) +{ + bdrv_register(&bdrv_vmdk); +} + +block_init(bdrv_vmdk_init); diff --git a/block/vpc.c b/block/vpc.c new file mode 100644 index 000000000..c0b82c4f5 --- /dev/null +++ b/block/vpc.c @@ -0,0 +1,799 @@ +/* + * Block driver for Connectix / Microsoft Virtual PC images + * + * Copyright (c) 2005 Alex Beregszaszi + * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" +#include "migration.h" + +/**************************************************************/ + +#define HEADER_SIZE 512 + +//#define CACHE + +enum vhd_type { + VHD_FIXED = 2, + VHD_DYNAMIC = 3, + VHD_DIFFERENCING = 4, +}; + +// Seconds since Jan 1, 2000 0:00:00 (UTC) +#define VHD_TIMESTAMP_BASE 946684800 + +// always big-endian +struct vhd_footer { + char creator[8]; // "conectix" + uint32_t features; + uint32_t version; + + // Offset of next header structure, 0xFFFFFFFF if none + uint64_t data_offset; + + // Seconds since Jan 1, 2000 0:00:00 (UTC) + uint32_t timestamp; + + char creator_app[4]; // "vpc " + uint16_t major; + uint16_t minor; + char creator_os[4]; // "Wi2k" + + uint64_t orig_size; + uint64_t size; + + uint16_t cyls; + uint8_t heads; + uint8_t secs_per_cyl; + + uint32_t type; + + // Checksum of the Hard Disk Footer ("one's complement of the sum of all + // the bytes in the footer without the checksum field") + uint32_t checksum; + + // UUID used to identify a parent hard disk (backing file) + uint8_t uuid[16]; + + uint8_t in_saved_state; +}; + +struct vhd_dyndisk_header { + char magic[8]; // "cxsparse" + + // Offset of next header structure, 0xFFFFFFFF if none + uint64_t data_offset; + + // Offset of the Block Allocation Table (BAT) + uint64_t table_offset; + + uint32_t version; + uint32_t max_table_entries; // 32bit/entry + + // 2 MB by default, must be a power of two + uint32_t block_size; + + uint32_t checksum; + uint8_t parent_uuid[16]; + uint32_t parent_timestamp; + uint32_t reserved; + + // Backing file name (in UTF-16) + uint8_t parent_name[512]; + + struct { + uint32_t platform; + uint32_t data_space; + uint32_t data_length; + uint32_t reserved; + uint64_t data_offset; + } parent_locator[8]; +}; + +typedef struct BDRVVPCState { + CoMutex lock; + uint8_t footer_buf[HEADER_SIZE]; + uint64_t free_data_block_offset; + int max_table_entries; + uint32_t *pagetable; + uint64_t bat_offset; + uint64_t last_bitmap_offset; + + uint32_t block_size; + uint32_t bitmap_size; + +#ifdef CACHE + uint8_t *pageentry_u8; + uint32_t *pageentry_u32; + uint16_t *pageentry_u16; + + uint64_t last_bitmap; +#endif + + Error *migration_blocker; +} BDRVVPCState; + +static uint32_t vpc_checksum(uint8_t* buf, size_t size) +{ + uint32_t res = 0; + int i; + + for (i = 0; i < size; i++) + res += buf[i]; + + return ~res; +} + + +static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + if (buf_size >= 8 && !strncmp((char *)buf, "conectix", 8)) + return 100; + return 0; +} + +static int vpc_open(BlockDriverState *bs, int flags) +{ + BDRVVPCState *s = bs->opaque; + int i; + struct vhd_footer* footer; + struct vhd_dyndisk_header* dyndisk_header; + uint8_t buf[HEADER_SIZE]; + uint32_t checksum; + int err = -1; + int disk_type = VHD_DYNAMIC; + + if (bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE) != HEADER_SIZE) + goto fail; + + footer = (struct vhd_footer*) s->footer_buf; + if (strncmp(footer->creator, "conectix", 8)) { + int64_t offset = bdrv_getlength(bs->file); + if (offset < HEADER_SIZE) { + goto fail; + } + /* If a fixed disk, the footer is found only at the end of the file */ + if (bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf, HEADER_SIZE) + != HEADER_SIZE) { + goto fail; + } + if (strncmp(footer->creator, "conectix", 8)) { + goto fail; + } + disk_type = VHD_FIXED; + } + + checksum = be32_to_cpu(footer->checksum); + footer->checksum = 0; + if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum) + fprintf(stderr, "block-vpc: The header checksum of '%s' is " + "incorrect.\n", bs->filename); + + /* Write 'checksum' back to footer, or else will leave it with zero. */ + footer->checksum = be32_to_cpu(checksum); + + // The visible size of a image in Virtual PC depends on the geometry + // rather than on the size stored in the footer (the size in the footer + // is too large usually) + bs->total_sectors = (int64_t) + be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl; + + if (bs->total_sectors >= 65535 * 16 * 255) { + err = -EFBIG; + goto fail; + } + + if (disk_type == VHD_DYNAMIC) { + if (bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf, + HEADER_SIZE) != HEADER_SIZE) { + goto fail; + } + + dyndisk_header = (struct vhd_dyndisk_header *) buf; + + if (strncmp(dyndisk_header->magic, "cxsparse", 8)) { + goto fail; + } + + s->block_size = be32_to_cpu(dyndisk_header->block_size); + s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511; + + s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries); + s->pagetable = g_malloc(s->max_table_entries * 4); + + s->bat_offset = be64_to_cpu(dyndisk_header->table_offset); + if (bdrv_pread(bs->file, s->bat_offset, s->pagetable, + s->max_table_entries * 4) != s->max_table_entries * 4) { + goto fail; + } + + s->free_data_block_offset = + (s->bat_offset + (s->max_table_entries * 4) + 511) & ~511; + + for (i = 0; i < s->max_table_entries; i++) { + be32_to_cpus(&s->pagetable[i]); + if (s->pagetable[i] != 0xFFFFFFFF) { + int64_t next = (512 * (int64_t) s->pagetable[i]) + + s->bitmap_size + s->block_size; + + if (next > s->free_data_block_offset) { + s->free_data_block_offset = next; + } + } + } + + s->last_bitmap_offset = (int64_t) -1; + +#ifdef CACHE + s->pageentry_u8 = g_malloc(512); + s->pageentry_u32 = s->pageentry_u8; + s->pageentry_u16 = s->pageentry_u8; + s->last_pagetable = -1; +#endif + } + + qemu_co_mutex_init(&s->lock); + + /* Disable migration when VHD images are used */ + error_set(&s->migration_blocker, + QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + "vpc", bs->device_name, "live migration"); + migrate_add_blocker(s->migration_blocker); + + return 0; + fail: + return err; +} + +/* + * Returns the absolute byte offset of the given sector in the image file. + * If the sector is not allocated, -1 is returned instead. + * + * The parameter write must be 1 if the offset will be used for a write + * operation (the block bitmaps is updated then), 0 otherwise. + */ +static inline int64_t get_sector_offset(BlockDriverState *bs, + int64_t sector_num, int write) +{ + BDRVVPCState *s = bs->opaque; + uint64_t offset = sector_num * 512; + uint64_t bitmap_offset, block_offset; + uint32_t pagetable_index, pageentry_index; + + pagetable_index = offset / s->block_size; + pageentry_index = (offset % s->block_size) / 512; + + if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff) + return -1; // not allocated + + bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index]; + block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index); + + // We must ensure that we don't write to any sectors which are marked as + // unused in the bitmap. We get away with setting all bits in the block + // bitmap each time we write to a new block. This might cause Virtual PC to + // miss sparse read optimization, but it's not a problem in terms of + // correctness. + if (write && (s->last_bitmap_offset != bitmap_offset)) { + uint8_t bitmap[s->bitmap_size]; + + s->last_bitmap_offset = bitmap_offset; + memset(bitmap, 0xff, s->bitmap_size); + bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size); + } + +// printf("sector: %" PRIx64 ", index: %x, offset: %x, bioff: %" PRIx64 ", bloff: %" PRIx64 "\n", +// sector_num, pagetable_index, pageentry_index, +// bitmap_offset, block_offset); + +// disabled by reason +#if 0 +#ifdef CACHE + if (bitmap_offset != s->last_bitmap) + { + lseek(s->fd, bitmap_offset, SEEK_SET); + + s->last_bitmap = bitmap_offset; + + // Scary! Bitmap is stored as big endian 32bit entries, + // while we used to look it up byte by byte + read(s->fd, s->pageentry_u8, 512); + for (i = 0; i < 128; i++) + be32_to_cpus(&s->pageentry_u32[i]); + } + + if ((s->pageentry_u8[pageentry_index / 8] >> (pageentry_index % 8)) & 1) + return -1; +#else + lseek(s->fd, bitmap_offset + (pageentry_index / 8), SEEK_SET); + + read(s->fd, &bitmap_entry, 1); + + if ((bitmap_entry >> (pageentry_index % 8)) & 1) + return -1; // not allocated +#endif +#endif + + return block_offset; +} + +/* + * Writes the footer to the end of the image file. This is needed when the + * file grows as it overwrites the old footer + * + * Returns 0 on success and < 0 on error + */ +static int rewrite_footer(BlockDriverState* bs) +{ + int ret; + BDRVVPCState *s = bs->opaque; + int64_t offset = s->free_data_block_offset; + + ret = bdrv_pwrite_sync(bs->file, offset, s->footer_buf, HEADER_SIZE); + if (ret < 0) + return ret; + + return 0; +} + +/* + * Allocates a new block. This involves writing a new footer and updating + * the Block Allocation Table to use the space at the old end of the image + * file (overwriting the old footer) + * + * Returns the sectors' offset in the image file on success and < 0 on error + */ +static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num) +{ + BDRVVPCState *s = bs->opaque; + int64_t bat_offset; + uint32_t index, bat_value; + int ret; + uint8_t bitmap[s->bitmap_size]; + + // Check if sector_num is valid + if ((sector_num < 0) || (sector_num > bs->total_sectors)) + return -1; + + // Write entry into in-memory BAT + index = (sector_num * 512) / s->block_size; + if (s->pagetable[index] != 0xFFFFFFFF) + return -1; + + s->pagetable[index] = s->free_data_block_offset / 512; + + // Initialize the block's bitmap + memset(bitmap, 0xff, s->bitmap_size); + ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap, + s->bitmap_size); + if (ret < 0) { + return ret; + } + + // Write new footer (the old one will be overwritten) + s->free_data_block_offset += s->block_size + s->bitmap_size; + ret = rewrite_footer(bs); + if (ret < 0) + goto fail; + + // Write BAT entry to disk + bat_offset = s->bat_offset + (4 * index); + bat_value = be32_to_cpu(s->pagetable[index]); + ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4); + if (ret < 0) + goto fail; + + return get_sector_offset(bs, sector_num, 0); + +fail: + s->free_data_block_offset -= (s->block_size + s->bitmap_size); + return -1; +} + +static int vpc_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVVPCState *s = bs->opaque; + int ret; + int64_t offset; + int64_t sectors, sectors_per_block; + struct vhd_footer *footer = (struct vhd_footer *) s->footer_buf; + + if (cpu_to_be32(footer->type) == VHD_FIXED) { + return bdrv_read(bs->file, sector_num, buf, nb_sectors); + } + while (nb_sectors > 0) { + offset = get_sector_offset(bs, sector_num, 0); + + sectors_per_block = s->block_size >> BDRV_SECTOR_BITS; + sectors = sectors_per_block - (sector_num % sectors_per_block); + if (sectors > nb_sectors) { + sectors = nb_sectors; + } + + if (offset == -1) { + memset(buf, 0, sectors * BDRV_SECTOR_SIZE); + } else { + ret = bdrv_pread(bs->file, offset, buf, + sectors * BDRV_SECTOR_SIZE); + if (ret != sectors * BDRV_SECTOR_SIZE) { + return -1; + } + } + + nb_sectors -= sectors; + sector_num += sectors; + buf += sectors * BDRV_SECTOR_SIZE; + } + return 0; +} + +static coroutine_fn int vpc_co_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVVPCState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = vpc_read(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static int vpc_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVVPCState *s = bs->opaque; + int64_t offset; + int64_t sectors, sectors_per_block; + int ret; + struct vhd_footer *footer = (struct vhd_footer *) s->footer_buf; + + if (cpu_to_be32(footer->type) == VHD_FIXED) { + return bdrv_write(bs->file, sector_num, buf, nb_sectors); + } + while (nb_sectors > 0) { + offset = get_sector_offset(bs, sector_num, 1); + + sectors_per_block = s->block_size >> BDRV_SECTOR_BITS; + sectors = sectors_per_block - (sector_num % sectors_per_block); + if (sectors > nb_sectors) { + sectors = nb_sectors; + } + + if (offset == -1) { + offset = alloc_block(bs, sector_num); + if (offset < 0) + return -1; + } + + ret = bdrv_pwrite(bs->file, offset, buf, sectors * BDRV_SECTOR_SIZE); + if (ret != sectors * BDRV_SECTOR_SIZE) { + return -1; + } + + nb_sectors -= sectors; + sector_num += sectors; + buf += sectors * BDRV_SECTOR_SIZE; + } + + return 0; +} + +static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVVPCState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = vpc_write(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +/* + * Calculates the number of cylinders, heads and sectors per cylinder + * based on a given number of sectors. This is the algorithm described + * in the VHD specification. + * + * Note that the geometry doesn't always exactly match total_sectors but + * may round it down. + * + * Returns 0 on success, -EFBIG if the size is larger than 127 GB + */ +static int calculate_geometry(int64_t total_sectors, uint16_t* cyls, + uint8_t* heads, uint8_t* secs_per_cyl) +{ + uint32_t cyls_times_heads; + + if (total_sectors > 65535 * 16 * 255) + return -EFBIG; + + if (total_sectors > 65535 * 16 * 63) { + *secs_per_cyl = 255; + *heads = 16; + cyls_times_heads = total_sectors / *secs_per_cyl; + } else { + *secs_per_cyl = 17; + cyls_times_heads = total_sectors / *secs_per_cyl; + *heads = (cyls_times_heads + 1023) / 1024; + + if (*heads < 4) + *heads = 4; + + if (cyls_times_heads >= (*heads * 1024) || *heads > 16) { + *secs_per_cyl = 31; + *heads = 16; + cyls_times_heads = total_sectors / *secs_per_cyl; + } + + if (cyls_times_heads >= (*heads * 1024)) { + *secs_per_cyl = 63; + *heads = 16; + cyls_times_heads = total_sectors / *secs_per_cyl; + } + } + + *cyls = cyls_times_heads / *heads; + + return 0; +} + +static int create_dynamic_disk(int fd, uint8_t *buf, int64_t total_sectors) +{ + struct vhd_dyndisk_header* dyndisk_header = + (struct vhd_dyndisk_header*) buf; + size_t block_size, num_bat_entries; + int i; + int ret = -EIO; + + // Write the footer (twice: at the beginning and at the end) + block_size = 0x200000; + num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512); + + if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) { + goto fail; + } + + if (lseek(fd, 1536 + ((num_bat_entries * 4 + 511) & ~511), SEEK_SET) < 0) { + goto fail; + } + if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) { + goto fail; + } + + // Write the initial BAT + if (lseek(fd, 3 * 512, SEEK_SET) < 0) { + goto fail; + } + + memset(buf, 0xFF, 512); + for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) { + if (write(fd, buf, 512) != 512) { + goto fail; + } + } + + // Prepare the Dynamic Disk Header + memset(buf, 0, 1024); + + memcpy(dyndisk_header->magic, "cxsparse", 8); + + /* + * Note: The spec is actually wrong here for data_offset, it says + * 0xFFFFFFFF, but MS tools expect all 64 bits to be set. + */ + dyndisk_header->data_offset = be64_to_cpu(0xFFFFFFFFFFFFFFFFULL); + dyndisk_header->table_offset = be64_to_cpu(3 * 512); + dyndisk_header->version = be32_to_cpu(0x00010000); + dyndisk_header->block_size = be32_to_cpu(block_size); + dyndisk_header->max_table_entries = be32_to_cpu(num_bat_entries); + + dyndisk_header->checksum = be32_to_cpu(vpc_checksum(buf, 1024)); + + // Write the header + if (lseek(fd, 512, SEEK_SET) < 0) { + goto fail; + } + + if (write(fd, buf, 1024) != 1024) { + goto fail; + } + ret = 0; + + fail: + return ret; +} + +static int create_fixed_disk(int fd, uint8_t *buf, int64_t total_size) +{ + int ret = -EIO; + + /* Add footer to total size */ + total_size += 512; + if (ftruncate(fd, total_size) != 0) { + ret = -errno; + goto fail; + } + if (lseek(fd, -512, SEEK_END) < 0) { + goto fail; + } + if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) { + goto fail; + } + + ret = 0; + + fail: + return ret; +} + +static int vpc_create(const char *filename, QEMUOptionParameter *options) +{ + uint8_t buf[1024]; + struct vhd_footer *footer = (struct vhd_footer *) buf; + QEMUOptionParameter *disk_type_param; + int fd, i; + uint16_t cyls = 0; + uint8_t heads = 0; + uint8_t secs_per_cyl = 0; + int64_t total_sectors; + int64_t total_size; + int disk_type; + int ret = -EIO; + + /* Read out options */ + total_size = get_option_parameter(options, BLOCK_OPT_SIZE)->value.n; + + disk_type_param = get_option_parameter(options, BLOCK_OPT_SUBFMT); + if (disk_type_param && disk_type_param->value.s) { + if (!strcmp(disk_type_param->value.s, "dynamic")) { + disk_type = VHD_DYNAMIC; + } else if (!strcmp(disk_type_param->value.s, "fixed")) { + disk_type = VHD_FIXED; + } else { + return -EINVAL; + } + } else { + disk_type = VHD_DYNAMIC; + } + + /* Create the file */ + fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); + if (fd < 0) { + return -EIO; + } + + /* + * Calculate matching total_size and geometry. Increase the number of + * sectors requested until we get enough (or fail). This ensures that + * qemu-img convert doesn't truncate images, but rather rounds up. + */ + total_sectors = total_size / BDRV_SECTOR_SIZE; + for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) { + if (calculate_geometry(total_sectors + i, &cyls, &heads, + &secs_per_cyl)) + { + ret = -EFBIG; + goto fail; + } + } + + total_sectors = (int64_t) cyls * heads * secs_per_cyl; + + /* Prepare the Hard Disk Footer */ + memset(buf, 0, 1024); + + memcpy(footer->creator, "conectix", 8); + /* TODO Check if "qemu" creator_app is ok for VPC */ + memcpy(footer->creator_app, "qemu", 4); + memcpy(footer->creator_os, "Wi2k", 4); + + footer->features = be32_to_cpu(0x02); + footer->version = be32_to_cpu(0x00010000); + if (disk_type == VHD_DYNAMIC) { + footer->data_offset = be64_to_cpu(HEADER_SIZE); + } else { + footer->data_offset = be64_to_cpu(0xFFFFFFFFFFFFFFFFULL); + } + footer->timestamp = be32_to_cpu(time(NULL) - VHD_TIMESTAMP_BASE); + + /* Version of Virtual PC 2007 */ + footer->major = be16_to_cpu(0x0005); + footer->minor = be16_to_cpu(0x0003); + if (disk_type == VHD_DYNAMIC) { + footer->orig_size = be64_to_cpu(total_sectors * 512); + footer->size = be64_to_cpu(total_sectors * 512); + } else { + footer->orig_size = be64_to_cpu(total_size); + footer->size = be64_to_cpu(total_size); + } + footer->cyls = be16_to_cpu(cyls); + footer->heads = heads; + footer->secs_per_cyl = secs_per_cyl; + + footer->type = be32_to_cpu(disk_type); + + /* TODO uuid is missing */ + + footer->checksum = be32_to_cpu(vpc_checksum(buf, HEADER_SIZE)); + + if (disk_type == VHD_DYNAMIC) { + ret = create_dynamic_disk(fd, buf, total_sectors); + } else { + ret = create_fixed_disk(fd, buf, total_size); + } + + fail: + qemu_close(fd); + return ret; +} + +static void vpc_close(BlockDriverState *bs) +{ + BDRVVPCState *s = bs->opaque; + g_free(s->pagetable); +#ifdef CACHE + g_free(s->pageentry_u8); +#endif + + migrate_del_blocker(s->migration_blocker); + error_free(s->migration_blocker); +} + +static QEMUOptionParameter vpc_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_SUBFMT, + .type = OPT_STRING, + .help = + "Type of virtual hard disk format. Supported formats are " + "{dynamic (default) | fixed} " + }, + { NULL } +}; + +static BlockDriver bdrv_vpc = { + .format_name = "vpc", + .instance_size = sizeof(BDRVVPCState), + + .bdrv_probe = vpc_probe, + .bdrv_open = vpc_open, + .bdrv_close = vpc_close, + .bdrv_create = vpc_create, + + .bdrv_read = vpc_co_read, + .bdrv_write = vpc_co_write, + + .create_options = vpc_create_options, +}; + +static void bdrv_vpc_init(void) +{ + bdrv_register(&bdrv_vpc); +} + +block_init(bdrv_vpc_init); diff --git a/block/vvfat.c b/block/vvfat.c new file mode 100644 index 000000000..59d3c5b8a --- /dev/null +++ b/block/vvfat.c @@ -0,0 +1,2911 @@ +/* vim:set shiftwidth=4 ts=8: */ +/* + * QEMU Block driver for virtual VFAT (shadows a local directory) + * + * Copyright (c) 2004,2005 Johannes E. Schindelin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include <sys/stat.h> +#include <dirent.h> +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" +#include "migration.h" + +#ifndef S_IWGRP +#define S_IWGRP 0 +#endif +#ifndef S_IWOTH +#define S_IWOTH 0 +#endif + +/* TODO: add ":bootsector=blabla.img:" */ +/* LATER TODO: add automatic boot sector generation from + BOOTEASY.ASM and Ranish Partition Manager + Note that DOS assumes the system files to be the first files in the + file system (test if the boot sector still relies on that fact)! */ +/* MAYBE TODO: write block-visofs.c */ +/* TODO: call try_commit() only after a timeout */ + +/* #define DEBUG */ + +#ifdef DEBUG + +#define DLOG(a) a + +#undef stderr +#define stderr STDERR +FILE* stderr = NULL; + +static void checkpoint(void); + +#ifdef __MINGW32__ +void nonono(const char* file, int line, const char* msg) { + fprintf(stderr, "Nonono! %s:%d %s\n", file, line, msg); + exit(-5); +} +#undef assert +#define assert(a) do {if (!(a)) nonono(__FILE__, __LINE__, #a);}while(0) +#endif + +#else + +#define DLOG(a) + +#endif + +/* dynamic array functions */ +typedef struct array_t { + char* pointer; + unsigned int size,next,item_size; +} array_t; + +static inline void array_init(array_t* array,unsigned int item_size) +{ + array->pointer = NULL; + array->size=0; + array->next=0; + array->item_size=item_size; +} + +static inline void array_free(array_t* array) +{ + g_free(array->pointer); + array->size=array->next=0; +} + +/* does not automatically grow */ +static inline void* array_get(array_t* array,unsigned int index) { + assert(index < array->next); + return array->pointer + index * array->item_size; +} + +static inline int array_ensure_allocated(array_t* array, int index) +{ + if((index + 1) * array->item_size > array->size) { + int new_size = (index + 32) * array->item_size; + array->pointer = g_realloc(array->pointer, new_size); + if (!array->pointer) + return -1; + array->size = new_size; + array->next = index + 1; + } + + return 0; +} + +static inline void* array_get_next(array_t* array) { + unsigned int next = array->next; + void* result; + + if (array_ensure_allocated(array, next) < 0) + return NULL; + + array->next = next + 1; + result = array_get(array, next); + + return result; +} + +static inline void* array_insert(array_t* array,unsigned int index,unsigned int count) { + if((array->next+count)*array->item_size>array->size) { + int increment=count*array->item_size; + array->pointer=g_realloc(array->pointer,array->size+increment); + if(!array->pointer) + return NULL; + array->size+=increment; + } + memmove(array->pointer+(index+count)*array->item_size, + array->pointer+index*array->item_size, + (array->next-index)*array->item_size); + array->next+=count; + return array->pointer+index*array->item_size; +} + +/* this performs a "roll", so that the element which was at index_from becomes + * index_to, but the order of all other elements is preserved. */ +static inline int array_roll(array_t* array,int index_to,int index_from,int count) +{ + char* buf; + char* from; + char* to; + int is; + + if(!array || + index_to<0 || index_to>=array->next || + index_from<0 || index_from>=array->next) + return -1; + + if(index_to==index_from) + return 0; + + is=array->item_size; + from=array->pointer+index_from*is; + to=array->pointer+index_to*is; + buf=g_malloc(is*count); + memcpy(buf,from,is*count); + + if(index_to<index_from) + memmove(to+is*count,to,from-to); + else + memmove(from,from+is*count,to-from); + + memcpy(to,buf,is*count); + + g_free(buf); + + return 0; +} + +static inline int array_remove_slice(array_t* array,int index, int count) +{ + assert(index >=0); + assert(count > 0); + assert(index + count <= array->next); + if(array_roll(array,array->next-1,index,count)) + return -1; + array->next -= count; + return 0; +} + +static int array_remove(array_t* array,int index) +{ + return array_remove_slice(array, index, 1); +} + +/* return the index for a given member */ +static int array_index(array_t* array, void* pointer) +{ + size_t offset = (char*)pointer - array->pointer; + assert((offset % array->item_size) == 0); + assert(offset/array->item_size < array->next); + return offset/array->item_size; +} + +/* These structures are used to fake a disk and the VFAT filesystem. + * For this reason we need to use QEMU_PACKED. */ + +typedef struct bootsector_t { + uint8_t jump[3]; + uint8_t name[8]; + uint16_t sector_size; + uint8_t sectors_per_cluster; + uint16_t reserved_sectors; + uint8_t number_of_fats; + uint16_t root_entries; + uint16_t total_sectors16; + uint8_t media_type; + uint16_t sectors_per_fat; + uint16_t sectors_per_track; + uint16_t number_of_heads; + uint32_t hidden_sectors; + uint32_t total_sectors; + union { + struct { + uint8_t drive_number; + uint8_t current_head; + uint8_t signature; + uint32_t id; + uint8_t volume_label[11]; + } QEMU_PACKED fat16; + struct { + uint32_t sectors_per_fat; + uint16_t flags; + uint8_t major,minor; + uint32_t first_cluster_of_root_directory; + uint16_t info_sector; + uint16_t backup_boot_sector; + uint16_t ignored; + } QEMU_PACKED fat32; + } u; + uint8_t fat_type[8]; + uint8_t ignored[0x1c0]; + uint8_t magic[2]; +} QEMU_PACKED bootsector_t; + +typedef struct { + uint8_t head; + uint8_t sector; + uint8_t cylinder; +} mbr_chs_t; + +typedef struct partition_t { + uint8_t attributes; /* 0x80 = bootable */ + mbr_chs_t start_CHS; + uint8_t fs_type; /* 0x1 = FAT12, 0x6 = FAT16, 0xe = FAT16_LBA, 0xb = FAT32, 0xc = FAT32_LBA */ + mbr_chs_t end_CHS; + uint32_t start_sector_long; + uint32_t length_sector_long; +} QEMU_PACKED partition_t; + +typedef struct mbr_t { + uint8_t ignored[0x1b8]; + uint32_t nt_id; + uint8_t ignored2[2]; + partition_t partition[4]; + uint8_t magic[2]; +} QEMU_PACKED mbr_t; + +typedef struct direntry_t { + uint8_t name[8]; + uint8_t extension[3]; + uint8_t attributes; + uint8_t reserved[2]; + uint16_t ctime; + uint16_t cdate; + uint16_t adate; + uint16_t begin_hi; + uint16_t mtime; + uint16_t mdate; + uint16_t begin; + uint32_t size; +} QEMU_PACKED direntry_t; + +/* this structure are used to transparently access the files */ + +typedef struct mapping_t { + /* begin is the first cluster, end is the last+1 */ + uint32_t begin,end; + /* as s->directory is growable, no pointer may be used here */ + unsigned int dir_index; + /* the clusters of a file may be in any order; this points to the first */ + int first_mapping_index; + union { + /* offset is + * - the offset in the file (in clusters) for a file, or + * - the next cluster of the directory for a directory, and + * - the address of the buffer for a faked entry + */ + struct { + uint32_t offset; + } file; + struct { + int parent_mapping_index; + int first_dir_index; + } dir; + } info; + /* path contains the full path, i.e. it always starts with s->path */ + char* path; + + enum { MODE_UNDEFINED = 0, MODE_NORMAL = 1, MODE_MODIFIED = 2, + MODE_DIRECTORY = 4, MODE_FAKED = 8, + MODE_DELETED = 16, MODE_RENAMED = 32 } mode; + int read_only; +} mapping_t; + +#ifdef DEBUG +static void print_direntry(const struct direntry_t*); +static void print_mapping(const struct mapping_t* mapping); +#endif + +/* here begins the real VVFAT driver */ + +typedef struct BDRVVVFATState { + CoMutex lock; + BlockDriverState* bs; /* pointer to parent */ + unsigned int first_sectors_number; /* 1 for a single partition, 0x40 for a disk with partition table */ + unsigned char first_sectors[0x40*0x200]; + + int fat_type; /* 16 or 32 */ + array_t fat,directory,mapping; + + unsigned int cluster_size; + unsigned int sectors_per_cluster; + unsigned int sectors_per_fat; + unsigned int sectors_of_root_directory; + uint32_t last_cluster_of_root_directory; + unsigned int faked_sectors; /* how many sectors are faked before file data */ + uint32_t sector_count; /* total number of sectors of the partition */ + uint32_t cluster_count; /* total number of clusters of this partition */ + uint32_t max_fat_value; + + int current_fd; + mapping_t* current_mapping; + unsigned char* cluster; /* points to current cluster */ + unsigned char* cluster_buffer; /* points to a buffer to hold temp data */ + unsigned int current_cluster; + + /* write support */ + BlockDriverState* write_target; + char* qcow_filename; + BlockDriverState* qcow; + void* fat2; + char* used_clusters; + array_t commits; + const char* path; + int downcase_short_names; + + Error *migration_blocker; +} BDRVVVFATState; + +/* take the sector position spos and convert it to Cylinder/Head/Sector position + * if the position is outside the specified geometry, fill maximum value for CHS + * and return 1 to signal overflow. + */ +static int sector2CHS(mbr_chs_t *chs, int spos, int cyls, int heads, int secs) +{ + int head,sector; + sector = spos % secs; spos /= secs; + head = spos % heads; spos /= heads; + if (spos >= cyls) { + /* Overflow, + it happens if 32bit sector positions are used, while CHS is only 24bit. + Windows/Dos is said to take 1023/255/63 as nonrepresentable CHS */ + chs->head = 0xFF; + chs->sector = 0xFF; + chs->cylinder = 0xFF; + return 1; + } + chs->head = (uint8_t)head; + chs->sector = (uint8_t)( (sector+1) | ((spos>>8)<<6) ); + chs->cylinder = (uint8_t)spos; + return 0; +} + +static void init_mbr(BDRVVVFATState *s, int cyls, int heads, int secs) +{ + /* TODO: if the files mbr.img and bootsect.img exist, use them */ + mbr_t* real_mbr=(mbr_t*)s->first_sectors; + partition_t* partition = &(real_mbr->partition[0]); + int lba; + + memset(s->first_sectors,0,512); + + /* Win NT Disk Signature */ + real_mbr->nt_id= cpu_to_le32(0xbe1afdfa); + + partition->attributes=0x80; /* bootable */ + + /* LBA is used when partition is outside the CHS geometry */ + lba = sector2CHS(&partition->start_CHS, s->first_sectors_number - 1, + cyls, heads, secs); + lba |= sector2CHS(&partition->end_CHS, s->bs->total_sectors - 1, + cyls, heads, secs); + + /*LBA partitions are identified only by start/length_sector_long not by CHS*/ + partition->start_sector_long = cpu_to_le32(s->first_sectors_number - 1); + partition->length_sector_long = cpu_to_le32(s->bs->total_sectors + - s->first_sectors_number + 1); + + /* FAT12/FAT16/FAT32 */ + /* DOS uses different types when partition is LBA, + probably to prevent older versions from using CHS on them */ + partition->fs_type= s->fat_type==12 ? 0x1: + s->fat_type==16 ? (lba?0xe:0x06): + /*fat_tyoe==32*/ (lba?0xc:0x0b); + + real_mbr->magic[0]=0x55; real_mbr->magic[1]=0xaa; +} + +/* direntry functions */ + +/* dest is assumed to hold 258 bytes, and pads with 0xffff up to next multiple of 26 */ +static inline int short2long_name(char* dest,const char* src) +{ + int i; + int len; + for(i=0;i<129 && src[i];i++) { + dest[2*i]=src[i]; + dest[2*i+1]=0; + } + len=2*i; + dest[2*i]=dest[2*i+1]=0; + for(i=2*i+2;(i%26);i++) + dest[i]=0xff; + return len; +} + +static inline direntry_t* create_long_filename(BDRVVVFATState* s,const char* filename) +{ + char buffer[258]; + int length=short2long_name(buffer,filename), + number_of_entries=(length+25)/26,i; + direntry_t* entry; + + for(i=0;i<number_of_entries;i++) { + entry=array_get_next(&(s->directory)); + entry->attributes=0xf; + entry->reserved[0]=0; + entry->begin=0; + entry->name[0]=(number_of_entries-i)|(i==0?0x40:0); + } + for(i=0;i<26*number_of_entries;i++) { + int offset=(i%26); + if(offset<10) offset=1+offset; + else if(offset<22) offset=14+offset-10; + else offset=28+offset-22; + entry=array_get(&(s->directory),s->directory.next-1-(i/26)); + entry->name[offset]=buffer[i]; + } + return array_get(&(s->directory),s->directory.next-number_of_entries); +} + +static char is_free(const direntry_t* direntry) +{ + return direntry->name[0]==0xe5 || direntry->name[0]==0x00; +} + +static char is_volume_label(const direntry_t* direntry) +{ + return direntry->attributes == 0x28; +} + +static char is_long_name(const direntry_t* direntry) +{ + return direntry->attributes == 0xf; +} + +static char is_short_name(const direntry_t* direntry) +{ + return !is_volume_label(direntry) && !is_long_name(direntry) + && !is_free(direntry); +} + +static char is_directory(const direntry_t* direntry) +{ + return direntry->attributes & 0x10 && direntry->name[0] != 0xe5; +} + +static inline char is_dot(const direntry_t* direntry) +{ + return is_short_name(direntry) && direntry->name[0] == '.'; +} + +static char is_file(const direntry_t* direntry) +{ + return is_short_name(direntry) && !is_directory(direntry); +} + +static inline uint32_t begin_of_direntry(const direntry_t* direntry) +{ + return le16_to_cpu(direntry->begin)|(le16_to_cpu(direntry->begin_hi)<<16); +} + +static inline uint32_t filesize_of_direntry(const direntry_t* direntry) +{ + return le32_to_cpu(direntry->size); +} + +static void set_begin_of_direntry(direntry_t* direntry, uint32_t begin) +{ + direntry->begin = cpu_to_le16(begin & 0xffff); + direntry->begin_hi = cpu_to_le16((begin >> 16) & 0xffff); +} + +/* fat functions */ + +static inline uint8_t fat_chksum(const direntry_t* entry) +{ + uint8_t chksum=0; + int i; + + for(i=0;i<11;i++) { + unsigned char c; + + c = (i < 8) ? entry->name[i] : entry->extension[i-8]; + chksum=(((chksum&0xfe)>>1)|((chksum&0x01)?0x80:0)) + c; + } + + return chksum; +} + +/* if return_time==0, this returns the fat_date, else the fat_time */ +static uint16_t fat_datetime(time_t time,int return_time) { + struct tm* t; +#ifdef _WIN32 + t=localtime(&time); /* this is not thread safe */ +#else + struct tm t1; + t = &t1; + localtime_r(&time,t); +#endif + if(return_time) + return cpu_to_le16((t->tm_sec/2)|(t->tm_min<<5)|(t->tm_hour<<11)); + return cpu_to_le16((t->tm_mday)|((t->tm_mon+1)<<5)|((t->tm_year-80)<<9)); +} + +static inline void fat_set(BDRVVVFATState* s,unsigned int cluster,uint32_t value) +{ + if(s->fat_type==32) { + uint32_t* entry=array_get(&(s->fat),cluster); + *entry=cpu_to_le32(value); + } else if(s->fat_type==16) { + uint16_t* entry=array_get(&(s->fat),cluster); + *entry=cpu_to_le16(value&0xffff); + } else { + int offset = (cluster*3/2); + unsigned char* p = array_get(&(s->fat), offset); + switch (cluster&1) { + case 0: + p[0] = value&0xff; + p[1] = (p[1]&0xf0) | ((value>>8)&0xf); + break; + case 1: + p[0] = (p[0]&0xf) | ((value&0xf)<<4); + p[1] = (value>>4); + break; + } + } +} + +static inline uint32_t fat_get(BDRVVVFATState* s,unsigned int cluster) +{ + if(s->fat_type==32) { + uint32_t* entry=array_get(&(s->fat),cluster); + return le32_to_cpu(*entry); + } else if(s->fat_type==16) { + uint16_t* entry=array_get(&(s->fat),cluster); + return le16_to_cpu(*entry); + } else { + const uint8_t* x=(uint8_t*)(s->fat.pointer)+cluster*3/2; + return ((x[0]|(x[1]<<8))>>(cluster&1?4:0))&0x0fff; + } +} + +static inline int fat_eof(BDRVVVFATState* s,uint32_t fat_entry) +{ + if(fat_entry>s->max_fat_value-8) + return -1; + return 0; +} + +static inline void init_fat(BDRVVVFATState* s) +{ + if (s->fat_type == 12) { + array_init(&(s->fat),1); + array_ensure_allocated(&(s->fat), + s->sectors_per_fat * 0x200 * 3 / 2 - 1); + } else { + array_init(&(s->fat),(s->fat_type==32?4:2)); + array_ensure_allocated(&(s->fat), + s->sectors_per_fat * 0x200 / s->fat.item_size - 1); + } + memset(s->fat.pointer,0,s->fat.size); + + switch(s->fat_type) { + case 12: s->max_fat_value=0xfff; break; + case 16: s->max_fat_value=0xffff; break; + case 32: s->max_fat_value=0x0fffffff; break; + default: s->max_fat_value=0; /* error... */ + } + +} + +/* TODO: in create_short_filename, 0xe5->0x05 is not yet handled! */ +/* TODO: in parse_short_filename, 0x05->0xe5 is not yet handled! */ +static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s, + unsigned int directory_start, const char* filename, int is_dot) +{ + int i,j,long_index=s->directory.next; + direntry_t* entry = NULL; + direntry_t* entry_long = NULL; + + if(is_dot) { + entry=array_get_next(&(s->directory)); + memset(entry->name,0x20,11); + memcpy(entry->name,filename,strlen(filename)); + return entry; + } + + entry_long=create_long_filename(s,filename); + + i = strlen(filename); + for(j = i - 1; j>0 && filename[j]!='.';j--); + if (j > 0) + i = (j > 8 ? 8 : j); + else if (i > 8) + i = 8; + + entry=array_get_next(&(s->directory)); + memset(entry->name,0x20,11); + memcpy(entry->name, filename, i); + + if(j > 0) + for (i = 0; i < 3 && filename[j+1+i]; i++) + entry->extension[i] = filename[j+1+i]; + + /* upcase & remove unwanted characters */ + for(i=10;i>=0;i--) { + if(i==10 || i==7) for(;i>0 && entry->name[i]==' ';i--); + if(entry->name[i]<=' ' || entry->name[i]>0x7f + || strchr(".*?<>|\":/\\[];,+='",entry->name[i])) + entry->name[i]='_'; + else if(entry->name[i]>='a' && entry->name[i]<='z') + entry->name[i]+='A'-'a'; + } + + /* mangle duplicates */ + while(1) { + direntry_t* entry1=array_get(&(s->directory),directory_start); + int j; + + for(;entry1<entry;entry1++) + if(!is_long_name(entry1) && !memcmp(entry1->name,entry->name,11)) + break; /* found dupe */ + if(entry1==entry) /* no dupe found */ + break; + + /* use all 8 characters of name */ + if(entry->name[7]==' ') { + int j; + for(j=6;j>0 && entry->name[j]==' ';j--) + entry->name[j]='~'; + } + + /* increment number */ + for(j=7;j>0 && entry->name[j]=='9';j--) + entry->name[j]='0'; + if(j>0) { + if(entry->name[j]<'0' || entry->name[j]>'9') + entry->name[j]='0'; + else + entry->name[j]++; + } + } + + /* calculate checksum; propagate to long name */ + if(entry_long) { + uint8_t chksum=fat_chksum(entry); + + /* calculate anew, because realloc could have taken place */ + entry_long=array_get(&(s->directory),long_index); + while(entry_long<entry && is_long_name(entry_long)) { + entry_long->reserved[1]=chksum; + entry_long++; + } + } + + return entry; +} + +/* + * Read a directory. (the index of the corresponding mapping must be passed). + */ +static int read_directory(BDRVVVFATState* s, int mapping_index) +{ + mapping_t* mapping = array_get(&(s->mapping), mapping_index); + direntry_t* direntry; + const char* dirname = mapping->path; + int first_cluster = mapping->begin; + int parent_index = mapping->info.dir.parent_mapping_index; + mapping_t* parent_mapping = (mapping_t*) + (parent_index >= 0 ? array_get(&(s->mapping), parent_index) : NULL); + int first_cluster_of_parent = parent_mapping ? parent_mapping->begin : -1; + + DIR* dir=opendir(dirname); + struct dirent* entry; + int i; + + assert(mapping->mode & MODE_DIRECTORY); + + if(!dir) { + mapping->end = mapping->begin; + return -1; + } + + i = mapping->info.dir.first_dir_index = + first_cluster == 0 ? 0 : s->directory.next; + + /* actually read the directory, and allocate the mappings */ + while((entry=readdir(dir))) { + unsigned int length=strlen(dirname)+2+strlen(entry->d_name); + char* buffer; + direntry_t* direntry; + struct stat st; + int is_dot=!strcmp(entry->d_name,"."); + int is_dotdot=!strcmp(entry->d_name,".."); + + if(first_cluster == 0 && (is_dotdot || is_dot)) + continue; + + buffer=(char*)g_malloc(length); + snprintf(buffer,length,"%s/%s",dirname,entry->d_name); + + if(stat(buffer,&st)<0) { + g_free(buffer); + continue; + } + + /* create directory entry for this file */ + direntry=create_short_and_long_name(s, i, entry->d_name, + is_dot || is_dotdot); + direntry->attributes=(S_ISDIR(st.st_mode)?0x10:0x20); + direntry->reserved[0]=direntry->reserved[1]=0; + direntry->ctime=fat_datetime(st.st_ctime,1); + direntry->cdate=fat_datetime(st.st_ctime,0); + direntry->adate=fat_datetime(st.st_atime,0); + direntry->begin_hi=0; + direntry->mtime=fat_datetime(st.st_mtime,1); + direntry->mdate=fat_datetime(st.st_mtime,0); + if(is_dotdot) + set_begin_of_direntry(direntry, first_cluster_of_parent); + else if(is_dot) + set_begin_of_direntry(direntry, first_cluster); + else + direntry->begin=0; /* do that later */ + if (st.st_size > 0x7fffffff) { + fprintf(stderr, "File %s is larger than 2GB\n", buffer); + g_free(buffer); + closedir(dir); + return -2; + } + direntry->size=cpu_to_le32(S_ISDIR(st.st_mode)?0:st.st_size); + + /* create mapping for this file */ + if(!is_dot && !is_dotdot && (S_ISDIR(st.st_mode) || st.st_size)) { + s->current_mapping=(mapping_t*)array_get_next(&(s->mapping)); + s->current_mapping->begin=0; + s->current_mapping->end=st.st_size; + /* + * we get the direntry of the most recent direntry, which + * contains the short name and all the relevant information. + */ + s->current_mapping->dir_index=s->directory.next-1; + s->current_mapping->first_mapping_index = -1; + if (S_ISDIR(st.st_mode)) { + s->current_mapping->mode = MODE_DIRECTORY; + s->current_mapping->info.dir.parent_mapping_index = + mapping_index; + } else { + s->current_mapping->mode = MODE_UNDEFINED; + s->current_mapping->info.file.offset = 0; + } + s->current_mapping->path=buffer; + s->current_mapping->read_only = + (st.st_mode & (S_IWUSR | S_IWGRP | S_IWOTH)) == 0; + } + } + closedir(dir); + + /* fill with zeroes up to the end of the cluster */ + while(s->directory.next%(0x10*s->sectors_per_cluster)) { + direntry_t* direntry=array_get_next(&(s->directory)); + memset(direntry,0,sizeof(direntry_t)); + } + +/* TODO: if there are more entries, bootsector has to be adjusted! */ +#define ROOT_ENTRIES (0x02 * 0x10 * s->sectors_per_cluster) + if (mapping_index == 0 && s->directory.next < ROOT_ENTRIES) { + /* root directory */ + int cur = s->directory.next; + array_ensure_allocated(&(s->directory), ROOT_ENTRIES - 1); + s->directory.next = ROOT_ENTRIES; + memset(array_get(&(s->directory), cur), 0, + (ROOT_ENTRIES - cur) * sizeof(direntry_t)); + } + + /* reget the mapping, since s->mapping was possibly realloc()ed */ + mapping = (mapping_t*)array_get(&(s->mapping), mapping_index); + first_cluster += (s->directory.next - mapping->info.dir.first_dir_index) + * 0x20 / s->cluster_size; + mapping->end = first_cluster; + + direntry = (direntry_t*)array_get(&(s->directory), mapping->dir_index); + set_begin_of_direntry(direntry, mapping->begin); + + return 0; +} + +static inline uint32_t sector2cluster(BDRVVVFATState* s,off_t sector_num) +{ + return (sector_num-s->faked_sectors)/s->sectors_per_cluster; +} + +static inline off_t cluster2sector(BDRVVVFATState* s, uint32_t cluster_num) +{ + return s->faked_sectors + s->sectors_per_cluster * cluster_num; +} + +static int init_directories(BDRVVVFATState* s, + const char *dirname, int heads, int secs) +{ + bootsector_t* bootsector; + mapping_t* mapping; + unsigned int i; + unsigned int cluster; + + memset(&(s->first_sectors[0]),0,0x40*0x200); + + s->cluster_size=s->sectors_per_cluster*0x200; + s->cluster_buffer=g_malloc(s->cluster_size); + + /* + * The formula: sc = spf+1+spf*spc*(512*8/fat_type), + * where sc is sector_count, + * spf is sectors_per_fat, + * spc is sectors_per_clusters, and + * fat_type = 12, 16 or 32. + */ + i = 1+s->sectors_per_cluster*0x200*8/s->fat_type; + s->sectors_per_fat=(s->sector_count+i)/i; /* round up */ + + array_init(&(s->mapping),sizeof(mapping_t)); + array_init(&(s->directory),sizeof(direntry_t)); + + /* add volume label */ + { + direntry_t* entry=array_get_next(&(s->directory)); + entry->attributes=0x28; /* archive | volume label */ + memcpy(entry->name,"QEMU VVF",8); + memcpy(entry->extension,"AT ",3); + } + + /* Now build FAT, and write back information into directory */ + init_fat(s); + + s->faked_sectors=s->first_sectors_number+s->sectors_per_fat*2; + s->cluster_count=sector2cluster(s, s->sector_count); + + mapping = array_get_next(&(s->mapping)); + mapping->begin = 0; + mapping->dir_index = 0; + mapping->info.dir.parent_mapping_index = -1; + mapping->first_mapping_index = -1; + mapping->path = g_strdup(dirname); + i = strlen(mapping->path); + if (i > 0 && mapping->path[i - 1] == '/') + mapping->path[i - 1] = '\0'; + mapping->mode = MODE_DIRECTORY; + mapping->read_only = 0; + s->path = mapping->path; + + for (i = 0, cluster = 0; i < s->mapping.next; i++) { + /* MS-DOS expects the FAT to be 0 for the root directory + * (except for the media byte). */ + /* LATER TODO: still true for FAT32? */ + int fix_fat = (i != 0); + mapping = array_get(&(s->mapping), i); + + if (mapping->mode & MODE_DIRECTORY) { + mapping->begin = cluster; + if(read_directory(s, i)) { + fprintf(stderr, "Could not read directory %s\n", + mapping->path); + return -1; + } + mapping = array_get(&(s->mapping), i); + } else { + assert(mapping->mode == MODE_UNDEFINED); + mapping->mode=MODE_NORMAL; + mapping->begin = cluster; + if (mapping->end > 0) { + direntry_t* direntry = array_get(&(s->directory), + mapping->dir_index); + + mapping->end = cluster + 1 + (mapping->end-1)/s->cluster_size; + set_begin_of_direntry(direntry, mapping->begin); + } else { + mapping->end = cluster + 1; + fix_fat = 0; + } + } + + assert(mapping->begin < mapping->end); + + /* next free cluster */ + cluster = mapping->end; + + if(cluster > s->cluster_count) { + fprintf(stderr,"Directory does not fit in FAT%d (capacity %.2f MB)\n", + s->fat_type, s->sector_count / 2000.0); + return -EINVAL; + } + + /* fix fat for entry */ + if (fix_fat) { + int j; + for(j = mapping->begin; j < mapping->end - 1; j++) + fat_set(s, j, j+1); + fat_set(s, mapping->end - 1, s->max_fat_value); + } + } + + mapping = array_get(&(s->mapping), 0); + s->sectors_of_root_directory = mapping->end * s->sectors_per_cluster; + s->last_cluster_of_root_directory = mapping->end; + + /* the FAT signature */ + fat_set(s,0,s->max_fat_value); + fat_set(s,1,s->max_fat_value); + + s->current_mapping = NULL; + + bootsector=(bootsector_t*)(s->first_sectors+(s->first_sectors_number-1)*0x200); + bootsector->jump[0]=0xeb; + bootsector->jump[1]=0x3e; + bootsector->jump[2]=0x90; + memcpy(bootsector->name,"QEMU ",8); + bootsector->sector_size=cpu_to_le16(0x200); + bootsector->sectors_per_cluster=s->sectors_per_cluster; + bootsector->reserved_sectors=cpu_to_le16(1); + bootsector->number_of_fats=0x2; /* number of FATs */ + bootsector->root_entries=cpu_to_le16(s->sectors_of_root_directory*0x10); + bootsector->total_sectors16=s->sector_count>0xffff?0:cpu_to_le16(s->sector_count); + bootsector->media_type=(s->first_sectors_number>1?0xf8:0xf0); /* media descriptor (f8=hd, f0=3.5 fd)*/ + s->fat.pointer[0] = bootsector->media_type; + bootsector->sectors_per_fat=cpu_to_le16(s->sectors_per_fat); + bootsector->sectors_per_track = cpu_to_le16(secs); + bootsector->number_of_heads = cpu_to_le16(heads); + bootsector->hidden_sectors=cpu_to_le32(s->first_sectors_number==1?0:0x3f); + bootsector->total_sectors=cpu_to_le32(s->sector_count>0xffff?s->sector_count:0); + + /* LATER TODO: if FAT32, this is wrong */ + bootsector->u.fat16.drive_number=s->first_sectors_number==1?0:0x80; /* fda=0, hda=0x80 */ + bootsector->u.fat16.current_head=0; + bootsector->u.fat16.signature=0x29; + bootsector->u.fat16.id=cpu_to_le32(0xfabe1afd); + + memcpy(bootsector->u.fat16.volume_label,"QEMU VVFAT ",11); + memcpy(bootsector->fat_type,(s->fat_type==12?"FAT12 ":s->fat_type==16?"FAT16 ":"FAT32 "),8); + bootsector->magic[0]=0x55; bootsector->magic[1]=0xaa; + + return 0; +} + +#ifdef DEBUG +static BDRVVVFATState *vvv = NULL; +#endif + +static int enable_write_target(BDRVVVFATState *s); +static int is_consistent(BDRVVVFATState *s); + +static void vvfat_rebind(BlockDriverState *bs) +{ + BDRVVVFATState *s = bs->opaque; + s->bs = bs; +} + +static int vvfat_open(BlockDriverState *bs, const char* dirname, int flags) +{ + BDRVVVFATState *s = bs->opaque; + int i, cyls, heads, secs; + +#ifdef DEBUG + vvv = s; +#endif + +DLOG(if (stderr == NULL) { + stderr = fopen("vvfat.log", "a"); + setbuf(stderr, NULL); +}) + + s->bs = bs; + + /* LATER TODO: if FAT32, adjust */ + s->sectors_per_cluster=0x10; + + s->current_cluster=0xffffffff; + + s->first_sectors_number=0x40; + /* read only is the default for safety */ + bs->read_only = 1; + s->qcow = s->write_target = NULL; + s->qcow_filename = NULL; + s->fat2 = NULL; + s->downcase_short_names = 1; + + if (!strstart(dirname, "fat:", NULL)) + return -1; + + if (strstr(dirname, ":32:")) { + fprintf(stderr, "Big fat greek warning: FAT32 has not been tested. You are welcome to do so!\n"); + s->fat_type = 32; + } else if (strstr(dirname, ":16:")) { + s->fat_type = 16; + } else if (strstr(dirname, ":12:")) { + s->fat_type = 12; + } + + if (strstr(dirname, ":floppy:")) { + /* 1.44MB or 2.88MB floppy. 2.88MB can be FAT12 (default) or FAT16. */ + if (!s->fat_type) { + s->fat_type = 12; + secs = 36; + s->sectors_per_cluster=2; + } else { + secs = s->fat_type == 12 ? 18 : 36; + s->sectors_per_cluster=1; + } + s->first_sectors_number = 1; + cyls = 80; + heads = 2; + } else { + /* 32MB or 504MB disk*/ + if (!s->fat_type) { + s->fat_type = 16; + } + cyls = s->fat_type == 12 ? 64 : 1024; + heads = 16; + secs = 63; + } + fprintf(stderr, "vvfat %s chs %d,%d,%d\n", + dirname, cyls, heads, secs); + + s->sector_count = cyls * heads * secs - (s->first_sectors_number - 1); + + if (strstr(dirname, ":rw:")) { + if (enable_write_target(s)) + return -1; + bs->read_only = 0; + } + + i = strrchr(dirname, ':') - dirname; + assert(i >= 3); + if (dirname[i-2] == ':' && qemu_isalpha(dirname[i-1])) + /* workaround for DOS drive names */ + dirname += i-1; + else + dirname += i+1; + + bs->total_sectors = cyls * heads * secs; + + if (init_directories(s, dirname, heads, secs)) { + return -1; + } + + s->sector_count = s->faked_sectors + s->sectors_per_cluster*s->cluster_count; + + if (s->first_sectors_number == 0x40) { + init_mbr(s, cyls, heads, secs); + } + + // assert(is_consistent(s)); + qemu_co_mutex_init(&s->lock); + + /* Disable migration when vvfat is used rw */ + if (s->qcow) { + error_set(&s->migration_blocker, + QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + "vvfat (rw)", bs->device_name, "live migration"); + migrate_add_blocker(s->migration_blocker); + } + + return 0; +} + +static inline void vvfat_close_current_file(BDRVVVFATState *s) +{ + if(s->current_mapping) { + s->current_mapping = NULL; + if (s->current_fd) { + qemu_close(s->current_fd); + s->current_fd = 0; + } + } + s->current_cluster = -1; +} + +/* mappings between index1 and index2-1 are supposed to be ordered + * return value is the index of the last mapping for which end>cluster_num + */ +static inline int find_mapping_for_cluster_aux(BDRVVVFATState* s,int cluster_num,int index1,int index2) +{ + while(1) { + int index3; + mapping_t* mapping; + index3=(index1+index2)/2; + mapping=array_get(&(s->mapping),index3); + assert(mapping->begin < mapping->end); + if(mapping->begin>=cluster_num) { + assert(index2!=index3 || index2==0); + if(index2==index3) + return index1; + index2=index3; + } else { + if(index1==index3) + return mapping->end<=cluster_num ? index2 : index1; + index1=index3; + } + assert(index1<=index2); + DLOG(mapping=array_get(&(s->mapping),index1); + assert(mapping->begin<=cluster_num); + assert(index2 >= s->mapping.next || + ((mapping = array_get(&(s->mapping),index2)) && + mapping->end>cluster_num))); + } +} + +static inline mapping_t* find_mapping_for_cluster(BDRVVVFATState* s,int cluster_num) +{ + int index=find_mapping_for_cluster_aux(s,cluster_num,0,s->mapping.next); + mapping_t* mapping; + if(index>=s->mapping.next) + return NULL; + mapping=array_get(&(s->mapping),index); + if(mapping->begin>cluster_num) + return NULL; + assert(mapping->begin<=cluster_num && mapping->end>cluster_num); + return mapping; +} + +static int open_file(BDRVVVFATState* s,mapping_t* mapping) +{ + if(!mapping) + return -1; + if(!s->current_mapping || + strcmp(s->current_mapping->path,mapping->path)) { + /* open file */ + int fd = qemu_open(mapping->path, O_RDONLY | O_BINARY | O_LARGEFILE); + if(fd<0) + return -1; + vvfat_close_current_file(s); + s->current_fd = fd; + s->current_mapping = mapping; + } + return 0; +} + +static inline int read_cluster(BDRVVVFATState *s,int cluster_num) +{ + if(s->current_cluster != cluster_num) { + int result=0; + off_t offset; + assert(!s->current_mapping || s->current_fd || (s->current_mapping->mode & MODE_DIRECTORY)); + if(!s->current_mapping + || s->current_mapping->begin>cluster_num + || s->current_mapping->end<=cluster_num) { + /* binary search of mappings for file */ + mapping_t* mapping=find_mapping_for_cluster(s,cluster_num); + + assert(!mapping || (cluster_num>=mapping->begin && cluster_num<mapping->end)); + + if (mapping && mapping->mode & MODE_DIRECTORY) { + vvfat_close_current_file(s); + s->current_mapping = mapping; +read_cluster_directory: + offset = s->cluster_size*(cluster_num-s->current_mapping->begin); + s->cluster = (unsigned char*)s->directory.pointer+offset + + 0x20*s->current_mapping->info.dir.first_dir_index; + assert(((s->cluster-(unsigned char*)s->directory.pointer)%s->cluster_size)==0); + assert((char*)s->cluster+s->cluster_size <= s->directory.pointer+s->directory.next*s->directory.item_size); + s->current_cluster = cluster_num; + return 0; + } + + if(open_file(s,mapping)) + return -2; + } else if (s->current_mapping->mode & MODE_DIRECTORY) + goto read_cluster_directory; + + assert(s->current_fd); + + offset=s->cluster_size*(cluster_num-s->current_mapping->begin)+s->current_mapping->info.file.offset; + if(lseek(s->current_fd, offset, SEEK_SET)!=offset) + return -3; + s->cluster=s->cluster_buffer; + result=read(s->current_fd,s->cluster,s->cluster_size); + if(result<0) { + s->current_cluster = -1; + return -1; + } + s->current_cluster = cluster_num; + } + return 0; +} + +#ifdef DEBUG +static void print_direntry(const direntry_t* direntry) +{ + int j = 0; + char buffer[1024]; + + fprintf(stderr, "direntry %p: ", direntry); + if(!direntry) + return; + if(is_long_name(direntry)) { + unsigned char* c=(unsigned char*)direntry; + int i; + for(i=1;i<11 && c[i] && c[i]!=0xff;i+=2) +#define ADD_CHAR(c) {buffer[j] = (c); if (buffer[j] < ' ') buffer[j] = 0xb0; j++;} + ADD_CHAR(c[i]); + for(i=14;i<26 && c[i] && c[i]!=0xff;i+=2) + ADD_CHAR(c[i]); + for(i=28;i<32 && c[i] && c[i]!=0xff;i+=2) + ADD_CHAR(c[i]); + buffer[j] = 0; + fprintf(stderr, "%s\n", buffer); + } else { + int i; + for(i=0;i<11;i++) + ADD_CHAR(direntry->name[i]); + buffer[j] = 0; + fprintf(stderr,"%s attributes=0x%02x begin=%d size=%d\n", + buffer, + direntry->attributes, + begin_of_direntry(direntry),le32_to_cpu(direntry->size)); + } +} + +static void print_mapping(const mapping_t* mapping) +{ + fprintf(stderr, "mapping (%p): begin, end = %d, %d, dir_index = %d, " + "first_mapping_index = %d, name = %s, mode = 0x%x, " , + mapping, mapping->begin, mapping->end, mapping->dir_index, + mapping->first_mapping_index, mapping->path, mapping->mode); + + if (mapping->mode & MODE_DIRECTORY) + fprintf(stderr, "parent_mapping_index = %d, first_dir_index = %d\n", mapping->info.dir.parent_mapping_index, mapping->info.dir.first_dir_index); + else + fprintf(stderr, "offset = %d\n", mapping->info.file.offset); +} +#endif + +static int vvfat_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVVVFATState *s = bs->opaque; + int i; + + for(i=0;i<nb_sectors;i++,sector_num++) { + if (sector_num >= bs->total_sectors) + return -1; + if (s->qcow) { + int n; + if (bdrv_is_allocated(s->qcow, sector_num, nb_sectors-i, &n)) { +DLOG(fprintf(stderr, "sectors %d+%d allocated\n", (int)sector_num, n)); + if (bdrv_read(s->qcow, sector_num, buf + i*0x200, n)) { + return -1; + } + i += n - 1; + sector_num += n - 1; + continue; + } +DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num)); + } + if(sector_num<s->faked_sectors) { + if(sector_num<s->first_sectors_number) + memcpy(buf+i*0x200,&(s->first_sectors[sector_num*0x200]),0x200); + else if(sector_num-s->first_sectors_number<s->sectors_per_fat) + memcpy(buf+i*0x200,&(s->fat.pointer[(sector_num-s->first_sectors_number)*0x200]),0x200); + else if(sector_num-s->first_sectors_number-s->sectors_per_fat<s->sectors_per_fat) + memcpy(buf+i*0x200,&(s->fat.pointer[(sector_num-s->first_sectors_number-s->sectors_per_fat)*0x200]),0x200); + } else { + uint32_t sector=sector_num-s->faked_sectors, + sector_offset_in_cluster=(sector%s->sectors_per_cluster), + cluster_num=sector/s->sectors_per_cluster; + if(cluster_num > s->cluster_count || read_cluster(s, cluster_num) != 0) { + /* LATER TODO: strict: return -1; */ + memset(buf+i*0x200,0,0x200); + continue; + } + memcpy(buf+i*0x200,s->cluster+sector_offset_in_cluster*0x200,0x200); + } + } + return 0; +} + +static coroutine_fn int vvfat_co_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVVVFATState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = vvfat_read(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +/* LATER TODO: statify all functions */ + +/* + * Idea of the write support (use snapshot): + * + * 1. check if all data is consistent, recording renames, modifications, + * new files and directories (in s->commits). + * + * 2. if the data is not consistent, stop committing + * + * 3. handle renames, and create new files and directories (do not yet + * write their contents) + * + * 4. walk the directories, fixing the mapping and direntries, and marking + * the handled mappings as not deleted + * + * 5. commit the contents of the files + * + * 6. handle deleted files and directories + * + */ + +typedef struct commit_t { + char* path; + union { + struct { uint32_t cluster; } rename; + struct { int dir_index; uint32_t modified_offset; } writeout; + struct { uint32_t first_cluster; } new_file; + struct { uint32_t cluster; } mkdir; + } param; + /* DELETEs and RMDIRs are handled differently: see handle_deletes() */ + enum { + ACTION_RENAME, ACTION_WRITEOUT, ACTION_NEW_FILE, ACTION_MKDIR + } action; +} commit_t; + +static void clear_commits(BDRVVVFATState* s) +{ + int i; +DLOG(fprintf(stderr, "clear_commits (%d commits)\n", s->commits.next)); + for (i = 0; i < s->commits.next; i++) { + commit_t* commit = array_get(&(s->commits), i); + assert(commit->path || commit->action == ACTION_WRITEOUT); + if (commit->action != ACTION_WRITEOUT) { + assert(commit->path); + g_free(commit->path); + } else + assert(commit->path == NULL); + } + s->commits.next = 0; +} + +static void schedule_rename(BDRVVVFATState* s, + uint32_t cluster, char* new_path) +{ + commit_t* commit = array_get_next(&(s->commits)); + commit->path = new_path; + commit->param.rename.cluster = cluster; + commit->action = ACTION_RENAME; +} + +static void schedule_writeout(BDRVVVFATState* s, + int dir_index, uint32_t modified_offset) +{ + commit_t* commit = array_get_next(&(s->commits)); + commit->path = NULL; + commit->param.writeout.dir_index = dir_index; + commit->param.writeout.modified_offset = modified_offset; + commit->action = ACTION_WRITEOUT; +} + +static void schedule_new_file(BDRVVVFATState* s, + char* path, uint32_t first_cluster) +{ + commit_t* commit = array_get_next(&(s->commits)); + commit->path = path; + commit->param.new_file.first_cluster = first_cluster; + commit->action = ACTION_NEW_FILE; +} + +static void schedule_mkdir(BDRVVVFATState* s, uint32_t cluster, char* path) +{ + commit_t* commit = array_get_next(&(s->commits)); + commit->path = path; + commit->param.mkdir.cluster = cluster; + commit->action = ACTION_MKDIR; +} + +typedef struct { + /* + * Since the sequence number is at most 0x3f, and the filename + * length is at most 13 times the sequence number, the maximal + * filename length is 0x3f * 13 bytes. + */ + unsigned char name[0x3f * 13 + 1]; + int checksum, len; + int sequence_number; +} long_file_name; + +static void lfn_init(long_file_name* lfn) +{ + lfn->sequence_number = lfn->len = 0; + lfn->checksum = 0x100; +} + +/* return 0 if parsed successfully, > 0 if no long name, < 0 if error */ +static int parse_long_name(long_file_name* lfn, + const direntry_t* direntry) +{ + int i, j, offset; + const unsigned char* pointer = (const unsigned char*)direntry; + + if (!is_long_name(direntry)) + return 1; + + if (pointer[0] & 0x40) { + lfn->sequence_number = pointer[0] & 0x3f; + lfn->checksum = pointer[13]; + lfn->name[0] = 0; + lfn->name[lfn->sequence_number * 13] = 0; + } else if ((pointer[0] & 0x3f) != --lfn->sequence_number) + return -1; + else if (pointer[13] != lfn->checksum) + return -2; + else if (pointer[12] || pointer[26] || pointer[27]) + return -3; + + offset = 13 * (lfn->sequence_number - 1); + for (i = 0, j = 1; i < 13; i++, j+=2) { + if (j == 11) + j = 14; + else if (j == 26) + j = 28; + + if (pointer[j+1] == 0) + lfn->name[offset + i] = pointer[j]; + else if (pointer[j+1] != 0xff || (pointer[0] & 0x40) == 0) + return -4; + else + lfn->name[offset + i] = 0; + } + + if (pointer[0] & 0x40) + lfn->len = offset + strlen((char*)lfn->name + offset); + + return 0; +} + +/* returns 0 if successful, >0 if no short_name, and <0 on error */ +static int parse_short_name(BDRVVVFATState* s, + long_file_name* lfn, direntry_t* direntry) +{ + int i, j; + + if (!is_short_name(direntry)) + return 1; + + for (j = 7; j >= 0 && direntry->name[j] == ' '; j--); + for (i = 0; i <= j; i++) { + if (direntry->name[i] <= ' ' || direntry->name[i] > 0x7f) + return -1; + else if (s->downcase_short_names) + lfn->name[i] = qemu_tolower(direntry->name[i]); + else + lfn->name[i] = direntry->name[i]; + } + + for (j = 2; j >= 0 && direntry->extension[j] == ' '; j--); + if (j >= 0) { + lfn->name[i++] = '.'; + lfn->name[i + j + 1] = '\0'; + for (;j >= 0; j--) { + if (direntry->extension[j] <= ' ' || direntry->extension[j] > 0x7f) + return -2; + else if (s->downcase_short_names) + lfn->name[i + j] = qemu_tolower(direntry->extension[j]); + else + lfn->name[i + j] = direntry->extension[j]; + } + } else + lfn->name[i + j + 1] = '\0'; + + lfn->len = strlen((char*)lfn->name); + + return 0; +} + +static inline uint32_t modified_fat_get(BDRVVVFATState* s, + unsigned int cluster) +{ + if (cluster < s->last_cluster_of_root_directory) { + if (cluster + 1 == s->last_cluster_of_root_directory) + return s->max_fat_value; + else + return cluster + 1; + } + + if (s->fat_type==32) { + uint32_t* entry=((uint32_t*)s->fat2)+cluster; + return le32_to_cpu(*entry); + } else if (s->fat_type==16) { + uint16_t* entry=((uint16_t*)s->fat2)+cluster; + return le16_to_cpu(*entry); + } else { + const uint8_t* x=s->fat2+cluster*3/2; + return ((x[0]|(x[1]<<8))>>(cluster&1?4:0))&0x0fff; + } +} + +static inline int cluster_was_modified(BDRVVVFATState* s, uint32_t cluster_num) +{ + int was_modified = 0; + int i, dummy; + + if (s->qcow == NULL) + return 0; + + for (i = 0; !was_modified && i < s->sectors_per_cluster; i++) + was_modified = bdrv_is_allocated(s->qcow, + cluster2sector(s, cluster_num) + i, 1, &dummy); + + return was_modified; +} + +static const char* get_basename(const char* path) +{ + char* basename = strrchr(path, '/'); + if (basename == NULL) + return path; + else + return basename + 1; /* strip '/' */ +} + +/* + * The array s->used_clusters holds the states of the clusters. If it is + * part of a file, it has bit 2 set, in case of a directory, bit 1. If it + * was modified, bit 3 is set. + * If any cluster is allocated, but not part of a file or directory, this + * driver refuses to commit. + */ +typedef enum { + USED_DIRECTORY = 1, USED_FILE = 2, USED_ANY = 3, USED_ALLOCATED = 4 +} used_t; + +/* + * get_cluster_count_for_direntry() not only determines how many clusters + * are occupied by direntry, but also if it was renamed or modified. + * + * A file is thought to be renamed *only* if there already was a file with + * exactly the same first cluster, but a different name. + * + * Further, the files/directories handled by this function are + * assumed to be *not* deleted (and *only* those). + */ +static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s, + direntry_t* direntry, const char* path) +{ + /* + * This is a little bit tricky: + * IF the guest OS just inserts a cluster into the file chain, + * and leaves the rest alone, (i.e. the original file had clusters + * 15 -> 16, but now has 15 -> 32 -> 16), then the following happens: + * + * - do_commit will write the cluster into the file at the given + * offset, but + * + * - the cluster which is overwritten should be moved to a later + * position in the file. + * + * I am not aware that any OS does something as braindead, but this + * situation could happen anyway when not committing for a long time. + * Just to be sure that this does not bite us, detect it, and copy the + * contents of the clusters to-be-overwritten into the qcow. + */ + int copy_it = 0; + int was_modified = 0; + int32_t ret = 0; + + uint32_t cluster_num = begin_of_direntry(direntry); + uint32_t offset = 0; + int first_mapping_index = -1; + mapping_t* mapping = NULL; + const char* basename2 = NULL; + + vvfat_close_current_file(s); + + /* the root directory */ + if (cluster_num == 0) + return 0; + + /* write support */ + if (s->qcow) { + basename2 = get_basename(path); + + mapping = find_mapping_for_cluster(s, cluster_num); + + if (mapping) { + const char* basename; + + assert(mapping->mode & MODE_DELETED); + mapping->mode &= ~MODE_DELETED; + + basename = get_basename(mapping->path); + + assert(mapping->mode & MODE_NORMAL); + + /* rename */ + if (strcmp(basename, basename2)) + schedule_rename(s, cluster_num, g_strdup(path)); + } else if (is_file(direntry)) + /* new file */ + schedule_new_file(s, g_strdup(path), cluster_num); + else { + abort(); + return 0; + } + } + + while(1) { + if (s->qcow) { + if (!copy_it && cluster_was_modified(s, cluster_num)) { + if (mapping == NULL || + mapping->begin > cluster_num || + mapping->end <= cluster_num) + mapping = find_mapping_for_cluster(s, cluster_num); + + + if (mapping && + (mapping->mode & MODE_DIRECTORY) == 0) { + + /* was modified in qcow */ + if (offset != mapping->info.file.offset + s->cluster_size + * (cluster_num - mapping->begin)) { + /* offset of this cluster in file chain has changed */ + abort(); + copy_it = 1; + } else if (offset == 0) { + const char* basename = get_basename(mapping->path); + + if (strcmp(basename, basename2)) + copy_it = 1; + first_mapping_index = array_index(&(s->mapping), mapping); + } + + if (mapping->first_mapping_index != first_mapping_index + && mapping->info.file.offset > 0) { + abort(); + copy_it = 1; + } + + /* need to write out? */ + if (!was_modified && is_file(direntry)) { + was_modified = 1; + schedule_writeout(s, mapping->dir_index, offset); + } + } + } + + if (copy_it) { + int i, dummy; + /* + * This is horribly inefficient, but that is okay, since + * it is rarely executed, if at all. + */ + int64_t offset = cluster2sector(s, cluster_num); + + vvfat_close_current_file(s); + for (i = 0; i < s->sectors_per_cluster; i++) { + if (!bdrv_is_allocated(s->qcow, offset + i, 1, &dummy)) { + if (vvfat_read(s->bs, offset, s->cluster_buffer, 1)) { + return -1; + } + if (bdrv_write(s->qcow, offset, s->cluster_buffer, 1)) { + return -2; + } + } + } + } + } + + ret++; + if (s->used_clusters[cluster_num] & USED_ANY) + return 0; + s->used_clusters[cluster_num] = USED_FILE; + + cluster_num = modified_fat_get(s, cluster_num); + + if (fat_eof(s, cluster_num)) + return ret; + else if (cluster_num < 2 || cluster_num > s->max_fat_value - 16) + return -1; + + offset += s->cluster_size; + } +} + +/* + * This function looks at the modified data (qcow). + * It returns 0 upon inconsistency or error, and the number of clusters + * used by the directory, its subdirectories and their files. + */ +static int check_directory_consistency(BDRVVVFATState *s, + int cluster_num, const char* path) +{ + int ret = 0; + unsigned char* cluster = g_malloc(s->cluster_size); + direntry_t* direntries = (direntry_t*)cluster; + mapping_t* mapping = find_mapping_for_cluster(s, cluster_num); + + long_file_name lfn; + int path_len = strlen(path); + char path2[PATH_MAX + 1]; + + assert(path_len < PATH_MAX); /* len was tested before! */ + pstrcpy(path2, sizeof(path2), path); + path2[path_len] = '/'; + path2[path_len + 1] = '\0'; + + if (mapping) { + const char* basename = get_basename(mapping->path); + const char* basename2 = get_basename(path); + + assert(mapping->mode & MODE_DIRECTORY); + + assert(mapping->mode & MODE_DELETED); + mapping->mode &= ~MODE_DELETED; + + if (strcmp(basename, basename2)) + schedule_rename(s, cluster_num, g_strdup(path)); + } else + /* new directory */ + schedule_mkdir(s, cluster_num, g_strdup(path)); + + lfn_init(&lfn); + do { + int i; + int subret = 0; + + ret++; + + if (s->used_clusters[cluster_num] & USED_ANY) { + fprintf(stderr, "cluster %d used more than once\n", (int)cluster_num); + return 0; + } + s->used_clusters[cluster_num] = USED_DIRECTORY; + +DLOG(fprintf(stderr, "read cluster %d (sector %d)\n", (int)cluster_num, (int)cluster2sector(s, cluster_num))); + subret = vvfat_read(s->bs, cluster2sector(s, cluster_num), cluster, + s->sectors_per_cluster); + if (subret) { + fprintf(stderr, "Error fetching direntries\n"); + fail: + g_free(cluster); + return 0; + } + + for (i = 0; i < 0x10 * s->sectors_per_cluster; i++) { + int cluster_count = 0; + +DLOG(fprintf(stderr, "check direntry %d:\n", i); print_direntry(direntries + i)); + if (is_volume_label(direntries + i) || is_dot(direntries + i) || + is_free(direntries + i)) + continue; + + subret = parse_long_name(&lfn, direntries + i); + if (subret < 0) { + fprintf(stderr, "Error in long name\n"); + goto fail; + } + if (subret == 0 || is_free(direntries + i)) + continue; + + if (fat_chksum(direntries+i) != lfn.checksum) { + subret = parse_short_name(s, &lfn, direntries + i); + if (subret < 0) { + fprintf(stderr, "Error in short name (%d)\n", subret); + goto fail; + } + if (subret > 0 || !strcmp((char*)lfn.name, ".") + || !strcmp((char*)lfn.name, "..")) + continue; + } + lfn.checksum = 0x100; /* cannot use long name twice */ + + if (path_len + 1 + lfn.len >= PATH_MAX) { + fprintf(stderr, "Name too long: %s/%s\n", path, lfn.name); + goto fail; + } + pstrcpy(path2 + path_len + 1, sizeof(path2) - path_len - 1, + (char*)lfn.name); + + if (is_directory(direntries + i)) { + if (begin_of_direntry(direntries + i) == 0) { + DLOG(fprintf(stderr, "invalid begin for directory: %s\n", path2); print_direntry(direntries + i)); + goto fail; + } + cluster_count = check_directory_consistency(s, + begin_of_direntry(direntries + i), path2); + if (cluster_count == 0) { + DLOG(fprintf(stderr, "problem in directory %s:\n", path2); print_direntry(direntries + i)); + goto fail; + } + } else if (is_file(direntries + i)) { + /* check file size with FAT */ + cluster_count = get_cluster_count_for_direntry(s, direntries + i, path2); + if (cluster_count != + (le32_to_cpu(direntries[i].size) + s->cluster_size + - 1) / s->cluster_size) { + DLOG(fprintf(stderr, "Cluster count mismatch\n")); + goto fail; + } + } else + abort(); /* cluster_count = 0; */ + + ret += cluster_count; + } + + cluster_num = modified_fat_get(s, cluster_num); + } while(!fat_eof(s, cluster_num)); + + g_free(cluster); + return ret; +} + +/* returns 1 on success */ +static int is_consistent(BDRVVVFATState* s) +{ + int i, check; + int used_clusters_count = 0; + +DLOG(checkpoint()); + /* + * - get modified FAT + * - compare the two FATs (TODO) + * - get buffer for marking used clusters + * - recurse direntries from root (using bs->bdrv_read to make + * sure to get the new data) + * - check that the FAT agrees with the size + * - count the number of clusters occupied by this directory and + * its files + * - check that the cumulative used cluster count agrees with the + * FAT + * - if all is fine, return number of used clusters + */ + if (s->fat2 == NULL) { + int size = 0x200 * s->sectors_per_fat; + s->fat2 = g_malloc(size); + memcpy(s->fat2, s->fat.pointer, size); + } + check = vvfat_read(s->bs, + s->first_sectors_number, s->fat2, s->sectors_per_fat); + if (check) { + fprintf(stderr, "Could not copy fat\n"); + return 0; + } + assert (s->used_clusters); + for (i = 0; i < sector2cluster(s, s->sector_count); i++) + s->used_clusters[i] &= ~USED_ANY; + + clear_commits(s); + + /* mark every mapped file/directory as deleted. + * (check_directory_consistency() will unmark those still present). */ + if (s->qcow) + for (i = 0; i < s->mapping.next; i++) { + mapping_t* mapping = array_get(&(s->mapping), i); + if (mapping->first_mapping_index < 0) + mapping->mode |= MODE_DELETED; + } + + used_clusters_count = check_directory_consistency(s, 0, s->path); + if (used_clusters_count <= 0) { + DLOG(fprintf(stderr, "problem in directory\n")); + return 0; + } + + check = s->last_cluster_of_root_directory; + for (i = check; i < sector2cluster(s, s->sector_count); i++) { + if (modified_fat_get(s, i)) { + if(!s->used_clusters[i]) { + DLOG(fprintf(stderr, "FAT was modified (%d), but cluster is not used?\n", i)); + return 0; + } + check++; + } + + if (s->used_clusters[i] == USED_ALLOCATED) { + /* allocated, but not used... */ + DLOG(fprintf(stderr, "unused, modified cluster: %d\n", i)); + return 0; + } + } + + if (check != used_clusters_count) + return 0; + + return used_clusters_count; +} + +static inline void adjust_mapping_indices(BDRVVVFATState* s, + int offset, int adjust) +{ + int i; + + for (i = 0; i < s->mapping.next; i++) { + mapping_t* mapping = array_get(&(s->mapping), i); + +#define ADJUST_MAPPING_INDEX(name) \ + if (mapping->name >= offset) \ + mapping->name += adjust + + ADJUST_MAPPING_INDEX(first_mapping_index); + if (mapping->mode & MODE_DIRECTORY) + ADJUST_MAPPING_INDEX(info.dir.parent_mapping_index); + } +} + +/* insert or update mapping */ +static mapping_t* insert_mapping(BDRVVVFATState* s, + uint32_t begin, uint32_t end) +{ + /* + * - find mapping where mapping->begin >= begin, + * - if mapping->begin > begin: insert + * - adjust all references to mappings! + * - else: adjust + * - replace name + */ + int index = find_mapping_for_cluster_aux(s, begin, 0, s->mapping.next); + mapping_t* mapping = NULL; + mapping_t* first_mapping = array_get(&(s->mapping), 0); + + if (index < s->mapping.next && (mapping = array_get(&(s->mapping), index)) + && mapping->begin < begin) { + mapping->end = begin; + index++; + mapping = array_get(&(s->mapping), index); + } + if (index >= s->mapping.next || mapping->begin > begin) { + mapping = array_insert(&(s->mapping), index, 1); + mapping->path = NULL; + adjust_mapping_indices(s, index, +1); + } + + mapping->begin = begin; + mapping->end = end; + +DLOG(mapping_t* next_mapping; +assert(index + 1 >= s->mapping.next || +((next_mapping = array_get(&(s->mapping), index + 1)) && + next_mapping->begin >= end))); + + if (s->current_mapping && first_mapping != (mapping_t*)s->mapping.pointer) + s->current_mapping = array_get(&(s->mapping), + s->current_mapping - first_mapping); + + return mapping; +} + +static int remove_mapping(BDRVVVFATState* s, int mapping_index) +{ + mapping_t* mapping = array_get(&(s->mapping), mapping_index); + mapping_t* first_mapping = array_get(&(s->mapping), 0); + + /* free mapping */ + if (mapping->first_mapping_index < 0) { + g_free(mapping->path); + } + + /* remove from s->mapping */ + array_remove(&(s->mapping), mapping_index); + + /* adjust all references to mappings */ + adjust_mapping_indices(s, mapping_index, -1); + + if (s->current_mapping && first_mapping != (mapping_t*)s->mapping.pointer) + s->current_mapping = array_get(&(s->mapping), + s->current_mapping - first_mapping); + + return 0; +} + +static void adjust_dirindices(BDRVVVFATState* s, int offset, int adjust) +{ + int i; + for (i = 0; i < s->mapping.next; i++) { + mapping_t* mapping = array_get(&(s->mapping), i); + if (mapping->dir_index >= offset) + mapping->dir_index += adjust; + if ((mapping->mode & MODE_DIRECTORY) && + mapping->info.dir.first_dir_index >= offset) + mapping->info.dir.first_dir_index += adjust; + } +} + +static direntry_t* insert_direntries(BDRVVVFATState* s, + int dir_index, int count) +{ + /* + * make room in s->directory, + * adjust_dirindices + */ + direntry_t* result = array_insert(&(s->directory), dir_index, count); + if (result == NULL) + return NULL; + adjust_dirindices(s, dir_index, count); + return result; +} + +static int remove_direntries(BDRVVVFATState* s, int dir_index, int count) +{ + int ret = array_remove_slice(&(s->directory), dir_index, count); + if (ret) + return ret; + adjust_dirindices(s, dir_index, -count); + return 0; +} + +/* + * Adapt the mappings of the cluster chain starting at first cluster + * (i.e. if a file starts at first_cluster, the chain is followed according + * to the modified fat, and the corresponding entries in s->mapping are + * adjusted) + */ +static int commit_mappings(BDRVVVFATState* s, + uint32_t first_cluster, int dir_index) +{ + mapping_t* mapping = find_mapping_for_cluster(s, first_cluster); + direntry_t* direntry = array_get(&(s->directory), dir_index); + uint32_t cluster = first_cluster; + + vvfat_close_current_file(s); + + assert(mapping); + assert(mapping->begin == first_cluster); + mapping->first_mapping_index = -1; + mapping->dir_index = dir_index; + mapping->mode = (dir_index <= 0 || is_directory(direntry)) ? + MODE_DIRECTORY : MODE_NORMAL; + + while (!fat_eof(s, cluster)) { + uint32_t c, c1; + + for (c = cluster, c1 = modified_fat_get(s, c); c + 1 == c1; + c = c1, c1 = modified_fat_get(s, c1)); + + c++; + if (c > mapping->end) { + int index = array_index(&(s->mapping), mapping); + int i, max_i = s->mapping.next - index; + for (i = 1; i < max_i && mapping[i].begin < c; i++); + while (--i > 0) + remove_mapping(s, index + 1); + } + assert(mapping == array_get(&(s->mapping), s->mapping.next - 1) + || mapping[1].begin >= c); + mapping->end = c; + + if (!fat_eof(s, c1)) { + int i = find_mapping_for_cluster_aux(s, c1, 0, s->mapping.next); + mapping_t* next_mapping = i >= s->mapping.next ? NULL : + array_get(&(s->mapping), i); + + if (next_mapping == NULL || next_mapping->begin > c1) { + int i1 = array_index(&(s->mapping), mapping); + + next_mapping = insert_mapping(s, c1, c1+1); + + if (c1 < c) + i1++; + mapping = array_get(&(s->mapping), i1); + } + + next_mapping->dir_index = mapping->dir_index; + next_mapping->first_mapping_index = + mapping->first_mapping_index < 0 ? + array_index(&(s->mapping), mapping) : + mapping->first_mapping_index; + next_mapping->path = mapping->path; + next_mapping->mode = mapping->mode; + next_mapping->read_only = mapping->read_only; + if (mapping->mode & MODE_DIRECTORY) { + next_mapping->info.dir.parent_mapping_index = + mapping->info.dir.parent_mapping_index; + next_mapping->info.dir.first_dir_index = + mapping->info.dir.first_dir_index + + 0x10 * s->sectors_per_cluster * + (mapping->end - mapping->begin); + } else + next_mapping->info.file.offset = mapping->info.file.offset + + mapping->end - mapping->begin; + + mapping = next_mapping; + } + + cluster = c1; + } + + return 0; +} + +static int commit_direntries(BDRVVVFATState* s, + int dir_index, int parent_mapping_index) +{ + direntry_t* direntry = array_get(&(s->directory), dir_index); + uint32_t first_cluster = dir_index == 0 ? 0 : begin_of_direntry(direntry); + mapping_t* mapping = find_mapping_for_cluster(s, first_cluster); + + int factor = 0x10 * s->sectors_per_cluster; + int old_cluster_count, new_cluster_count; + int current_dir_index = mapping->info.dir.first_dir_index; + int first_dir_index = current_dir_index; + int ret, i; + uint32_t c; + +DLOG(fprintf(stderr, "commit_direntries for %s, parent_mapping_index %d\n", mapping->path, parent_mapping_index)); + + assert(direntry); + assert(mapping); + assert(mapping->begin == first_cluster); + assert(mapping->info.dir.first_dir_index < s->directory.next); + assert(mapping->mode & MODE_DIRECTORY); + assert(dir_index == 0 || is_directory(direntry)); + + mapping->info.dir.parent_mapping_index = parent_mapping_index; + + if (first_cluster == 0) { + old_cluster_count = new_cluster_count = + s->last_cluster_of_root_directory; + } else { + for (old_cluster_count = 0, c = first_cluster; !fat_eof(s, c); + c = fat_get(s, c)) + old_cluster_count++; + + for (new_cluster_count = 0, c = first_cluster; !fat_eof(s, c); + c = modified_fat_get(s, c)) + new_cluster_count++; + } + + if (new_cluster_count > old_cluster_count) { + if (insert_direntries(s, + current_dir_index + factor * old_cluster_count, + factor * (new_cluster_count - old_cluster_count)) == NULL) + return -1; + } else if (new_cluster_count < old_cluster_count) + remove_direntries(s, + current_dir_index + factor * new_cluster_count, + factor * (old_cluster_count - new_cluster_count)); + + for (c = first_cluster; !fat_eof(s, c); c = modified_fat_get(s, c)) { + void* direntry = array_get(&(s->directory), current_dir_index); + int ret = vvfat_read(s->bs, cluster2sector(s, c), direntry, + s->sectors_per_cluster); + if (ret) + return ret; + assert(!strncmp(s->directory.pointer, "QEMU", 4)); + current_dir_index += factor; + } + + ret = commit_mappings(s, first_cluster, dir_index); + if (ret) + return ret; + + /* recurse */ + for (i = 0; i < factor * new_cluster_count; i++) { + direntry = array_get(&(s->directory), first_dir_index + i); + if (is_directory(direntry) && !is_dot(direntry)) { + mapping = find_mapping_for_cluster(s, first_cluster); + assert(mapping->mode & MODE_DIRECTORY); + ret = commit_direntries(s, first_dir_index + i, + array_index(&(s->mapping), mapping)); + if (ret) + return ret; + } + } + + return 0; +} + +/* commit one file (adjust contents, adjust mapping), + return first_mapping_index */ +static int commit_one_file(BDRVVVFATState* s, + int dir_index, uint32_t offset) +{ + direntry_t* direntry = array_get(&(s->directory), dir_index); + uint32_t c = begin_of_direntry(direntry); + uint32_t first_cluster = c; + mapping_t* mapping = find_mapping_for_cluster(s, c); + uint32_t size = filesize_of_direntry(direntry); + char* cluster = g_malloc(s->cluster_size); + uint32_t i; + int fd = 0; + + assert(offset < size); + assert((offset % s->cluster_size) == 0); + + for (i = s->cluster_size; i < offset; i += s->cluster_size) + c = modified_fat_get(s, c); + + fd = qemu_open(mapping->path, O_RDWR | O_CREAT | O_BINARY, 0666); + if (fd < 0) { + fprintf(stderr, "Could not open %s... (%s, %d)\n", mapping->path, + strerror(errno), errno); + g_free(cluster); + return fd; + } + if (offset > 0) { + if (lseek(fd, offset, SEEK_SET) != offset) { + qemu_close(fd); + g_free(cluster); + return -3; + } + } + + while (offset < size) { + uint32_t c1; + int rest_size = (size - offset > s->cluster_size ? + s->cluster_size : size - offset); + int ret; + + c1 = modified_fat_get(s, c); + + assert((size - offset == 0 && fat_eof(s, c)) || + (size > offset && c >=2 && !fat_eof(s, c))); + + ret = vvfat_read(s->bs, cluster2sector(s, c), + (uint8_t*)cluster, (rest_size + 0x1ff) / 0x200); + + if (ret < 0) { + qemu_close(fd); + g_free(cluster); + return ret; + } + + if (write(fd, cluster, rest_size) < 0) { + qemu_close(fd); + g_free(cluster); + return -2; + } + + offset += rest_size; + c = c1; + } + + if (ftruncate(fd, size)) { + perror("ftruncate()"); + qemu_close(fd); + g_free(cluster); + return -4; + } + qemu_close(fd); + g_free(cluster); + + return commit_mappings(s, first_cluster, dir_index); +} + +#ifdef DEBUG +/* test, if all mappings point to valid direntries */ +static void check1(BDRVVVFATState* s) +{ + int i; + for (i = 0; i < s->mapping.next; i++) { + mapping_t* mapping = array_get(&(s->mapping), i); + if (mapping->mode & MODE_DELETED) { + fprintf(stderr, "deleted\n"); + continue; + } + assert(mapping->dir_index < s->directory.next); + direntry_t* direntry = array_get(&(s->directory), mapping->dir_index); + assert(mapping->begin == begin_of_direntry(direntry) || mapping->first_mapping_index >= 0); + if (mapping->mode & MODE_DIRECTORY) { + assert(mapping->info.dir.first_dir_index + 0x10 * s->sectors_per_cluster * (mapping->end - mapping->begin) <= s->directory.next); + assert((mapping->info.dir.first_dir_index % (0x10 * s->sectors_per_cluster)) == 0); + } + } +} + +/* test, if all direntries have mappings */ +static void check2(BDRVVVFATState* s) +{ + int i; + int first_mapping = -1; + + for (i = 0; i < s->directory.next; i++) { + direntry_t* direntry = array_get(&(s->directory), i); + + if (is_short_name(direntry) && begin_of_direntry(direntry)) { + mapping_t* mapping = find_mapping_for_cluster(s, begin_of_direntry(direntry)); + assert(mapping); + assert(mapping->dir_index == i || is_dot(direntry)); + assert(mapping->begin == begin_of_direntry(direntry) || is_dot(direntry)); + } + + if ((i % (0x10 * s->sectors_per_cluster)) == 0) { + /* cluster start */ + int j, count = 0; + + for (j = 0; j < s->mapping.next; j++) { + mapping_t* mapping = array_get(&(s->mapping), j); + if (mapping->mode & MODE_DELETED) + continue; + if (mapping->mode & MODE_DIRECTORY) { + if (mapping->info.dir.first_dir_index <= i && mapping->info.dir.first_dir_index + 0x10 * s->sectors_per_cluster > i) { + assert(++count == 1); + if (mapping->first_mapping_index == -1) + first_mapping = array_index(&(s->mapping), mapping); + else + assert(first_mapping == mapping->first_mapping_index); + if (mapping->info.dir.parent_mapping_index < 0) + assert(j == 0); + else { + mapping_t* parent = array_get(&(s->mapping), mapping->info.dir.parent_mapping_index); + assert(parent->mode & MODE_DIRECTORY); + assert(parent->info.dir.first_dir_index < mapping->info.dir.first_dir_index); + } + } + } + } + if (count == 0) + first_mapping = -1; + } + } +} +#endif + +static int handle_renames_and_mkdirs(BDRVVVFATState* s) +{ + int i; + +#ifdef DEBUG + fprintf(stderr, "handle_renames\n"); + for (i = 0; i < s->commits.next; i++) { + commit_t* commit = array_get(&(s->commits), i); + fprintf(stderr, "%d, %s (%d, %d)\n", i, commit->path ? commit->path : "(null)", commit->param.rename.cluster, commit->action); + } +#endif + + for (i = 0; i < s->commits.next;) { + commit_t* commit = array_get(&(s->commits), i); + if (commit->action == ACTION_RENAME) { + mapping_t* mapping = find_mapping_for_cluster(s, + commit->param.rename.cluster); + char* old_path = mapping->path; + + assert(commit->path); + mapping->path = commit->path; + if (rename(old_path, mapping->path)) + return -2; + + if (mapping->mode & MODE_DIRECTORY) { + int l1 = strlen(mapping->path); + int l2 = strlen(old_path); + int diff = l1 - l2; + direntry_t* direntry = array_get(&(s->directory), + mapping->info.dir.first_dir_index); + uint32_t c = mapping->begin; + int i = 0; + + /* recurse */ + while (!fat_eof(s, c)) { + do { + direntry_t* d = direntry + i; + + if (is_file(d) || (is_directory(d) && !is_dot(d))) { + mapping_t* m = find_mapping_for_cluster(s, + begin_of_direntry(d)); + int l = strlen(m->path); + char* new_path = g_malloc(l + diff + 1); + + assert(!strncmp(m->path, mapping->path, l2)); + + pstrcpy(new_path, l + diff + 1, mapping->path); + pstrcpy(new_path + l1, l + diff + 1 - l1, + m->path + l2); + + schedule_rename(s, m->begin, new_path); + } + i++; + } while((i % (0x10 * s->sectors_per_cluster)) != 0); + c = fat_get(s, c); + } + } + + g_free(old_path); + array_remove(&(s->commits), i); + continue; + } else if (commit->action == ACTION_MKDIR) { + mapping_t* mapping; + int j, parent_path_len; + +#ifdef __MINGW32__ + if (mkdir(commit->path)) + return -5; +#else + if (mkdir(commit->path, 0755)) + return -5; +#endif + + mapping = insert_mapping(s, commit->param.mkdir.cluster, + commit->param.mkdir.cluster + 1); + if (mapping == NULL) + return -6; + + mapping->mode = MODE_DIRECTORY; + mapping->read_only = 0; + mapping->path = commit->path; + j = s->directory.next; + assert(j); + insert_direntries(s, s->directory.next, + 0x10 * s->sectors_per_cluster); + mapping->info.dir.first_dir_index = j; + + parent_path_len = strlen(commit->path) + - strlen(get_basename(commit->path)) - 1; + for (j = 0; j < s->mapping.next; j++) { + mapping_t* m = array_get(&(s->mapping), j); + if (m->first_mapping_index < 0 && m != mapping && + !strncmp(m->path, mapping->path, parent_path_len) && + strlen(m->path) == parent_path_len) + break; + } + assert(j < s->mapping.next); + mapping->info.dir.parent_mapping_index = j; + + array_remove(&(s->commits), i); + continue; + } + + i++; + } + return 0; +} + +/* + * TODO: make sure that the short name is not matching *another* file + */ +static int handle_commits(BDRVVVFATState* s) +{ + int i, fail = 0; + + vvfat_close_current_file(s); + + for (i = 0; !fail && i < s->commits.next; i++) { + commit_t* commit = array_get(&(s->commits), i); + switch(commit->action) { + case ACTION_RENAME: case ACTION_MKDIR: + abort(); + fail = -2; + break; + case ACTION_WRITEOUT: { +#ifndef NDEBUG + /* these variables are only used by assert() below */ + direntry_t* entry = array_get(&(s->directory), + commit->param.writeout.dir_index); + uint32_t begin = begin_of_direntry(entry); + mapping_t* mapping = find_mapping_for_cluster(s, begin); +#endif + + assert(mapping); + assert(mapping->begin == begin); + assert(commit->path == NULL); + + if (commit_one_file(s, commit->param.writeout.dir_index, + commit->param.writeout.modified_offset)) + fail = -3; + + break; + } + case ACTION_NEW_FILE: { + int begin = commit->param.new_file.first_cluster; + mapping_t* mapping = find_mapping_for_cluster(s, begin); + direntry_t* entry; + int i; + + /* find direntry */ + for (i = 0; i < s->directory.next; i++) { + entry = array_get(&(s->directory), i); + if (is_file(entry) && begin_of_direntry(entry) == begin) + break; + } + + if (i >= s->directory.next) { + fail = -6; + continue; + } + + /* make sure there exists an initial mapping */ + if (mapping && mapping->begin != begin) { + mapping->end = begin; + mapping = NULL; + } + if (mapping == NULL) { + mapping = insert_mapping(s, begin, begin+1); + } + /* most members will be fixed in commit_mappings() */ + assert(commit->path); + mapping->path = commit->path; + mapping->read_only = 0; + mapping->mode = MODE_NORMAL; + mapping->info.file.offset = 0; + + if (commit_one_file(s, i, 0)) + fail = -7; + + break; + } + default: + abort(); + } + } + if (i > 0 && array_remove_slice(&(s->commits), 0, i)) + return -1; + return fail; +} + +static int handle_deletes(BDRVVVFATState* s) +{ + int i, deferred = 1, deleted = 1; + + /* delete files corresponding to mappings marked as deleted */ + /* handle DELETEs and unused mappings (modified_fat_get(s, mapping->begin) == 0) */ + while (deferred && deleted) { + deferred = 0; + deleted = 0; + + for (i = 1; i < s->mapping.next; i++) { + mapping_t* mapping = array_get(&(s->mapping), i); + if (mapping->mode & MODE_DELETED) { + direntry_t* entry = array_get(&(s->directory), + mapping->dir_index); + + if (is_free(entry)) { + /* remove file/directory */ + if (mapping->mode & MODE_DIRECTORY) { + int j, next_dir_index = s->directory.next, + first_dir_index = mapping->info.dir.first_dir_index; + + if (rmdir(mapping->path) < 0) { + if (errno == ENOTEMPTY) { + deferred++; + continue; + } else + return -5; + } + + for (j = 1; j < s->mapping.next; j++) { + mapping_t* m = array_get(&(s->mapping), j); + if (m->mode & MODE_DIRECTORY && + m->info.dir.first_dir_index > + first_dir_index && + m->info.dir.first_dir_index < + next_dir_index) + next_dir_index = + m->info.dir.first_dir_index; + } + remove_direntries(s, first_dir_index, + next_dir_index - first_dir_index); + + deleted++; + } + } else { + if (unlink(mapping->path)) + return -4; + deleted++; + } + DLOG(fprintf(stderr, "DELETE (%d)\n", i); print_mapping(mapping); print_direntry(entry)); + remove_mapping(s, i); + } + } + } + + return 0; +} + +/* + * synchronize mapping with new state: + * + * - copy FAT (with bdrv_read) + * - mark all filenames corresponding to mappings as deleted + * - recurse direntries from root (using bs->bdrv_read) + * - delete files corresponding to mappings marked as deleted + */ +static int do_commit(BDRVVVFATState* s) +{ + int ret = 0; + + /* the real meat are the commits. Nothing to do? Move along! */ + if (s->commits.next == 0) + return 0; + + vvfat_close_current_file(s); + + ret = handle_renames_and_mkdirs(s); + if (ret) { + fprintf(stderr, "Error handling renames (%d)\n", ret); + abort(); + return ret; + } + + /* copy FAT (with bdrv_read) */ + memcpy(s->fat.pointer, s->fat2, 0x200 * s->sectors_per_fat); + + /* recurse direntries from root (using bs->bdrv_read) */ + ret = commit_direntries(s, 0, -1); + if (ret) { + fprintf(stderr, "Fatal: error while committing (%d)\n", ret); + abort(); + return ret; + } + + ret = handle_commits(s); + if (ret) { + fprintf(stderr, "Error handling commits (%d)\n", ret); + abort(); + return ret; + } + + ret = handle_deletes(s); + if (ret) { + fprintf(stderr, "Error deleting\n"); + abort(); + return ret; + } + + if (s->qcow->drv->bdrv_make_empty) { + s->qcow->drv->bdrv_make_empty(s->qcow); + } + + memset(s->used_clusters, 0, sector2cluster(s, s->sector_count)); + +DLOG(checkpoint()); + return 0; +} + +static int try_commit(BDRVVVFATState* s) +{ + vvfat_close_current_file(s); +DLOG(checkpoint()); + if(!is_consistent(s)) + return -1; + return do_commit(s); +} + +static int vvfat_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVVVFATState *s = bs->opaque; + int i, ret; + +DLOG(checkpoint()); + + /* Check if we're operating in read-only mode */ + if (s->qcow == NULL) { + return -EACCES; + } + + vvfat_close_current_file(s); + + /* + * Some sanity checks: + * - do not allow writing to the boot sector + * - do not allow to write non-ASCII filenames + */ + + if (sector_num < s->first_sectors_number) + return -1; + + for (i = sector2cluster(s, sector_num); + i <= sector2cluster(s, sector_num + nb_sectors - 1);) { + mapping_t* mapping = find_mapping_for_cluster(s, i); + if (mapping) { + if (mapping->read_only) { + fprintf(stderr, "Tried to write to write-protected file %s\n", + mapping->path); + return -1; + } + + if (mapping->mode & MODE_DIRECTORY) { + int begin = cluster2sector(s, i); + int end = begin + s->sectors_per_cluster, k; + int dir_index; + const direntry_t* direntries; + long_file_name lfn; + + lfn_init(&lfn); + + if (begin < sector_num) + begin = sector_num; + if (end > sector_num + nb_sectors) + end = sector_num + nb_sectors; + dir_index = mapping->dir_index + + 0x10 * (begin - mapping->begin * s->sectors_per_cluster); + direntries = (direntry_t*)(buf + 0x200 * (begin - sector_num)); + + for (k = 0; k < (end - begin) * 0x10; k++) { + /* do not allow non-ASCII filenames */ + if (parse_long_name(&lfn, direntries + k) < 0) { + fprintf(stderr, "Warning: non-ASCII filename\n"); + return -1; + } + /* no access to the direntry of a read-only file */ + else if (is_short_name(direntries+k) && + (direntries[k].attributes & 1)) { + if (memcmp(direntries + k, + array_get(&(s->directory), dir_index + k), + sizeof(direntry_t))) { + fprintf(stderr, "Warning: tried to write to write-protected file\n"); + return -1; + } + } + } + } + i = mapping->end; + } else + i++; + } + + /* + * Use qcow backend. Commit later. + */ +DLOG(fprintf(stderr, "Write to qcow backend: %d + %d\n", (int)sector_num, nb_sectors)); + ret = bdrv_write(s->qcow, sector_num, buf, nb_sectors); + if (ret < 0) { + fprintf(stderr, "Error writing to qcow backend\n"); + return ret; + } + + for (i = sector2cluster(s, sector_num); + i <= sector2cluster(s, sector_num + nb_sectors - 1); i++) + if (i >= 0) + s->used_clusters[i] |= USED_ALLOCATED; + +DLOG(checkpoint()); + /* TODO: add timeout */ + try_commit(s); + +DLOG(checkpoint()); + return 0; +} + +static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVVVFATState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = vvfat_write(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static int coroutine_fn vvfat_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int* n) +{ + BDRVVVFATState* s = bs->opaque; + *n = s->sector_count - sector_num; + if (*n > nb_sectors) + *n = nb_sectors; + else if (*n < 0) + return 0; + return 1; +} + +static int write_target_commit(BlockDriverState *bs, int64_t sector_num, + const uint8_t* buffer, int nb_sectors) { + BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); + return try_commit(s); +} + +static void write_target_close(BlockDriverState *bs) { + BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); + bdrv_delete(s->qcow); + g_free(s->qcow_filename); +} + +static BlockDriver vvfat_write_target = { + .format_name = "vvfat_write_target", + .bdrv_write = write_target_commit, + .bdrv_close = write_target_close, +}; + +static int enable_write_target(BDRVVVFATState *s) +{ + BlockDriver *bdrv_qcow; + QEMUOptionParameter *options; + int ret; + int size = sector2cluster(s, s->sector_count); + s->used_clusters = calloc(size, 1); + + array_init(&(s->commits), sizeof(commit_t)); + + s->qcow_filename = g_malloc(1024); + ret = get_tmp_filename(s->qcow_filename, 1024); + if (ret < 0) { + g_free(s->qcow_filename); + s->qcow_filename = NULL; + return ret; + } + + bdrv_qcow = bdrv_find_format("qcow"); + options = parse_option_parameters("", bdrv_qcow->create_options, NULL); + set_option_parameter_int(options, BLOCK_OPT_SIZE, s->sector_count * 512); + set_option_parameter(options, BLOCK_OPT_BACKING_FILE, "fat:"); + + if (bdrv_create(bdrv_qcow, s->qcow_filename, options) < 0) + return -1; + + s->qcow = bdrv_new(""); + if (s->qcow == NULL) { + return -1; + } + + ret = bdrv_open(s->qcow, s->qcow_filename, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow); + if (ret < 0) { + return ret; + } + +#ifndef _WIN32 + unlink(s->qcow_filename); +#endif + + s->bs->backing_hd = calloc(sizeof(BlockDriverState), 1); + s->bs->backing_hd->drv = &vvfat_write_target; + s->bs->backing_hd->opaque = g_malloc(sizeof(void*)); + *(void**)s->bs->backing_hd->opaque = s; + + return 0; +} + +static void vvfat_close(BlockDriverState *bs) +{ + BDRVVVFATState *s = bs->opaque; + + vvfat_close_current_file(s); + array_free(&(s->fat)); + array_free(&(s->directory)); + array_free(&(s->mapping)); + g_free(s->cluster_buffer); + + if (s->qcow) { + migrate_del_blocker(s->migration_blocker); + error_free(s->migration_blocker); + } +} + +static BlockDriver bdrv_vvfat = { + .format_name = "vvfat", + .instance_size = sizeof(BDRVVVFATState), + .bdrv_file_open = vvfat_open, + .bdrv_rebind = vvfat_rebind, + .bdrv_read = vvfat_co_read, + .bdrv_write = vvfat_co_write, + .bdrv_close = vvfat_close, + .bdrv_co_is_allocated = vvfat_co_is_allocated, + .protocol_name = "fat", +}; + +static void bdrv_vvfat_init(void) +{ + bdrv_register(&bdrv_vvfat); +} + +block_init(bdrv_vvfat_init); + +#ifdef DEBUG +static void checkpoint(void) { + assert(((mapping_t*)array_get(&(vvv->mapping), 0))->end == 2); + check1(vvv); + check2(vvv); + assert(!vvv->current_mapping || vvv->current_fd || (vvv->current_mapping->mode & MODE_DIRECTORY)); +#if 0 + if (((direntry_t*)vvv->directory.pointer)[1].attributes != 0xf) + fprintf(stderr, "Nonono!\n"); + mapping_t* mapping; + direntry_t* direntry; + assert(vvv->mapping.size >= vvv->mapping.item_size * vvv->mapping.next); + assert(vvv->directory.size >= vvv->directory.item_size * vvv->directory.next); + if (vvv->mapping.next<47) + return; + assert((mapping = array_get(&(vvv->mapping), 47))); + assert(mapping->dir_index < vvv->directory.next); + direntry = array_get(&(vvv->directory), mapping->dir_index); + assert(!memcmp(direntry->name, "USB H ", 11) || direntry->name[0]==0); +#endif +} +#endif |