diff options
Diffstat (limited to 'block')
43 files changed, 7821 insertions, 1629 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs index 7f015105b..4cf9aa499 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -2,7 +2,9 @@ block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-y += qed-check.o +block-obj-y += vhdx.o block-obj-y += parallels.o blkdebug.o blkverify.o +block-obj-y += snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += raw-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o @@ -13,8 +15,12 @@ block-obj-$(CONFIG_LIBISCSI) += iscsi.o block-obj-$(CONFIG_CURL) += curl.o block-obj-$(CONFIG_RBD) += rbd.o block-obj-$(CONFIG_GLUSTERFS) += gluster.o +block-obj-$(CONFIG_LIBSSH2) += ssh.o endif common-obj-y += stream.o common-obj-y += commit.o common-obj-y += mirror.o +common-obj-y += backup.o + +$(obj)/curl.o: QEMU_CFLAGS+=$(CURL_CFLAGS) diff --git a/block/backup.c b/block/backup.c new file mode 100644 index 000000000..6ae8a05a3 --- /dev/null +++ b/block/backup.c @@ -0,0 +1,386 @@ +/* + * QEMU backup + * + * Copyright (C) 2013 Proxmox Server Solutions + * + * Authors: + * Dietmar Maurer (dietmar@proxmox.com) + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#include <stdio.h> +#include <errno.h> +#include <unistd.h> + +#include "trace.h" +#include "block/block.h" +#include "block/block_int.h" +#include "block/blockjob.h" +#include "qemu/ratelimit.h" + +#define BACKUP_CLUSTER_BITS 16 +#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS) +#define BACKUP_SECTORS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE) + +#define SLICE_TIME 100000000ULL /* ns */ + +typedef struct CowRequest { + int64_t start; + int64_t end; + QLIST_ENTRY(CowRequest) list; + CoQueue wait_queue; /* coroutines blocked on this request */ +} CowRequest; + +typedef struct BackupBlockJob { + BlockJob common; + BlockDriverState *target; + MirrorSyncMode sync_mode; + RateLimit limit; + BlockdevOnError on_source_error; + BlockdevOnError on_target_error; + CoRwlock flush_rwlock; + uint64_t sectors_read; + HBitmap *bitmap; + QLIST_HEAD(, CowRequest) inflight_reqs; +} BackupBlockJob; + +/* See if in-flight requests overlap and wait for them to complete */ +static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job, + int64_t start, + int64_t end) +{ + CowRequest *req; + bool retry; + + do { + retry = false; + QLIST_FOREACH(req, &job->inflight_reqs, list) { + if (end > req->start && start < req->end) { + qemu_co_queue_wait(&req->wait_queue); + retry = true; + break; + } + } + } while (retry); +} + +/* Keep track of an in-flight request */ +static void cow_request_begin(CowRequest *req, BackupBlockJob *job, + int64_t start, int64_t end) +{ + req->start = start; + req->end = end; + qemu_co_queue_init(&req->wait_queue); + QLIST_INSERT_HEAD(&job->inflight_reqs, req, list); +} + +/* Forget about a completed request */ +static void cow_request_end(CowRequest *req) +{ + QLIST_REMOVE(req, list); + qemu_co_queue_restart_all(&req->wait_queue); +} + +static int coroutine_fn backup_do_cow(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + bool *error_is_read) +{ + BackupBlockJob *job = (BackupBlockJob *)bs->job; + CowRequest 
cow_request; + struct iovec iov; + QEMUIOVector bounce_qiov; + void *bounce_buffer = NULL; + int ret = 0; + int64_t start, end; + int n; + + qemu_co_rwlock_rdlock(&job->flush_rwlock); + + start = sector_num / BACKUP_SECTORS_PER_CLUSTER; + end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_SECTORS_PER_CLUSTER); + + trace_backup_do_cow_enter(job, start, sector_num, nb_sectors); + + wait_for_overlapping_requests(job, start, end); + cow_request_begin(&cow_request, job, start, end); + + for (; start < end; start++) { + if (hbitmap_get(job->bitmap, start)) { + trace_backup_do_cow_skip(job, start); + continue; /* already copied */ + } + + trace_backup_do_cow_process(job, start); + + n = MIN(BACKUP_SECTORS_PER_CLUSTER, + job->common.len / BDRV_SECTOR_SIZE - + start * BACKUP_SECTORS_PER_CLUSTER); + + if (!bounce_buffer) { + bounce_buffer = qemu_blockalign(bs, BACKUP_CLUSTER_SIZE); + } + iov.iov_base = bounce_buffer; + iov.iov_len = n * BDRV_SECTOR_SIZE; + qemu_iovec_init_external(&bounce_qiov, &iov, 1); + + ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n, + &bounce_qiov); + if (ret < 0) { + trace_backup_do_cow_read_fail(job, start, ret); + if (error_is_read) { + *error_is_read = true; + } + goto out; + } + + if (buffer_is_zero(iov.iov_base, iov.iov_len)) { + ret = bdrv_co_write_zeroes(job->target, + start * BACKUP_SECTORS_PER_CLUSTER, n); + } else { + ret = bdrv_co_writev(job->target, + start * BACKUP_SECTORS_PER_CLUSTER, n, + &bounce_qiov); + } + if (ret < 0) { + trace_backup_do_cow_write_fail(job, start, ret); + if (error_is_read) { + *error_is_read = false; + } + goto out; + } + + hbitmap_set(job->bitmap, start, 1); + + /* Publish progress, guest I/O counts as progress too. Note that the + * offset field is an opaque progress value, it is not a disk offset. 
+ */ + job->sectors_read += n; + job->common.offset += n * BDRV_SECTOR_SIZE; + } + +out: + if (bounce_buffer) { + qemu_vfree(bounce_buffer); + } + + cow_request_end(&cow_request); + + trace_backup_do_cow_return(job, sector_num, nb_sectors, ret); + + qemu_co_rwlock_unlock(&job->flush_rwlock); + + return ret; +} + +static int coroutine_fn backup_before_write_notify( + NotifierWithReturn *notifier, + void *opaque) +{ + BdrvTrackedRequest *req = opaque; + + return backup_do_cow(req->bs, req->sector_num, req->nb_sectors, NULL); +} + +static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp) +{ + BackupBlockJob *s = container_of(job, BackupBlockJob, common); + + if (speed < 0) { + error_set(errp, QERR_INVALID_PARAMETER, "speed"); + return; + } + ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); +} + +static void backup_iostatus_reset(BlockJob *job) +{ + BackupBlockJob *s = container_of(job, BackupBlockJob, common); + + bdrv_iostatus_reset(s->target); +} + +static const BlockJobType backup_job_type = { + .instance_size = sizeof(BackupBlockJob), + .job_type = "backup", + .set_speed = backup_set_speed, + .iostatus_reset = backup_iostatus_reset, +}; + +static BlockErrorAction backup_error_action(BackupBlockJob *job, + bool read, int error) +{ + if (read) { + return block_job_error_action(&job->common, job->common.bs, + job->on_source_error, true, error); + } else { + return block_job_error_action(&job->common, job->target, + job->on_target_error, false, error); + } +} + +static void coroutine_fn backup_run(void *opaque) +{ + BackupBlockJob *job = opaque; + BlockDriverState *bs = job->common.bs; + BlockDriverState *target = job->target; + BlockdevOnError on_target_error = job->on_target_error; + NotifierWithReturn before_write = { + .notify = backup_before_write_notify, + }; + int64_t start, end; + int ret = 0; + + QLIST_INIT(&job->inflight_reqs); + qemu_co_rwlock_init(&job->flush_rwlock); + + start = 0; + end = DIV_ROUND_UP(job->common.len 
/ BDRV_SECTOR_SIZE, + BACKUP_SECTORS_PER_CLUSTER); + + job->bitmap = hbitmap_alloc(end, 0); + + bdrv_set_enable_write_cache(target, true); + bdrv_set_on_error(target, on_target_error, on_target_error); + bdrv_iostatus_enable(target); + + bdrv_add_before_write_notifier(bs, &before_write); + + if (job->sync_mode == MIRROR_SYNC_MODE_NONE) { + while (!block_job_is_cancelled(&job->common)) { + /* Yield until the job is cancelled. We just let our before_write + * notify callback service CoW requests. */ + job->common.busy = false; + qemu_coroutine_yield(); + job->common.busy = true; + } + } else { + /* Both FULL and TOP SYNC_MODE's require copying.. */ + for (; start < end; start++) { + bool error_is_read; + + if (block_job_is_cancelled(&job->common)) { + break; + } + + /* we need to yield so that qemu_aio_flush() returns. + * (without, VM does not reboot) + */ + if (job->common.speed) { + uint64_t delay_ns = ratelimit_calculate_delay( + &job->limit, job->sectors_read); + job->sectors_read = 0; + block_job_sleep_ns(&job->common, rt_clock, delay_ns); + } else { + block_job_sleep_ns(&job->common, rt_clock, 0); + } + + if (block_job_is_cancelled(&job->common)) { + break; + } + + if (job->sync_mode == MIRROR_SYNC_MODE_TOP) { + int i, n; + int alloced = 0; + + /* Check to see if these blocks are already in the + * backing file. */ + + for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) { + /* bdrv_co_is_allocated() only returns true/false based + * on the first set of sectors it comes accross that + * are are all in the same state. + * For that reason we must verify each sector in the + * backup cluster length. We end up copying more than + * needed but at some point that is always the case. */ + alloced = + bdrv_co_is_allocated(bs, + start * BACKUP_SECTORS_PER_CLUSTER + i, + BACKUP_SECTORS_PER_CLUSTER - i, &n); + i += n; + + if (alloced == 1) { + break; + } + } + + /* If the above loop never found any sectors that are in + * the topmost image, skip this backup. 
*/ + if (alloced == 0) { + continue; + } + } + /* FULL sync mode we copy the whole drive. */ + ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER, + BACKUP_SECTORS_PER_CLUSTER, &error_is_read); + if (ret < 0) { + /* Depending on error action, fail now or retry cluster */ + BlockErrorAction action = + backup_error_action(job, error_is_read, -ret); + if (action == BDRV_ACTION_REPORT) { + break; + } else { + start--; + continue; + } + } + } + } + + notifier_with_return_remove(&before_write); + + /* wait until pending backup_do_cow() calls have completed */ + qemu_co_rwlock_wrlock(&job->flush_rwlock); + qemu_co_rwlock_unlock(&job->flush_rwlock); + + hbitmap_free(job->bitmap); + + bdrv_iostatus_disable(target); + bdrv_delete(target); + + block_job_completed(&job->common, ret); +} + +void backup_start(BlockDriverState *bs, BlockDriverState *target, + int64_t speed, MirrorSyncMode sync_mode, + BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + BlockDriverCompletionFunc *cb, void *opaque, + Error **errp) +{ + int64_t len; + + assert(bs); + assert(target); + assert(cb); + + if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || + on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && + !bdrv_iostatus_is_enabled(bs)) { + error_set(errp, QERR_INVALID_PARAMETER, "on-source-error"); + return; + } + + len = bdrv_getlength(bs); + if (len < 0) { + error_setg_errno(errp, -len, "unable to get length for '%s'", + bdrv_get_device_name(bs)); + return; + } + + BackupBlockJob *job = block_job_create(&backup_job_type, bs, speed, + cb, opaque, errp); + if (!job) { + return; + } + + job->on_source_error = on_source_error; + job->on_target_error = on_target_error; + job->target = target; + job->sync_mode = sync_mode; + job->common.len = len; + job->common.co = qemu_coroutine_create(backup_run); + qemu_coroutine_enter(job->common.co, job); +} diff --git a/block/blkdebug.c b/block/blkdebug.c index d61ece86a..ccb627ad9 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ 
-23,14 +23,17 @@ */ #include "qemu-common.h" -#include "block_int.h" -#include "module.h" +#include "qemu/config-file.h" +#include "block/block_int.h" +#include "qemu/module.h" typedef struct BDRVBlkdebugState { int state; int new_state; + QLIST_HEAD(, BlkdebugRule) rules[BLKDBG_EVENT_MAX]; QSIMPLEQ_HEAD(, BlkdebugRule) active_rules; + QLIST_HEAD(, BlkdebugSuspendedReq) suspended_reqs; } BDRVBlkdebugState; typedef struct BlkdebugAIOCB { @@ -39,6 +42,12 @@ typedef struct BlkdebugAIOCB { int ret; } BlkdebugAIOCB; +typedef struct BlkdebugSuspendedReq { + Coroutine *co; + char *tag; + QLIST_ENTRY(BlkdebugSuspendedReq) next; +} BlkdebugSuspendedReq; + static void blkdebug_aio_cancel(BlockDriverAIOCB *blockacb); static const AIOCBInfo blkdebug_aiocb_info = { @@ -49,6 +58,7 @@ static const AIOCBInfo blkdebug_aiocb_info = { enum { ACTION_INJECT_ERROR, ACTION_SET_STATE, + ACTION_SUSPEND, }; typedef struct BlkdebugRule { @@ -65,6 +75,9 @@ typedef struct BlkdebugRule { struct { int new_state; } set_state; + struct { + char *tag; + } suspend; } options; QLIST_ENTRY(BlkdebugRule) next; QSIMPLEQ_ENTRY(BlkdebugRule) active_next; @@ -169,6 +182,9 @@ static const char *event_names[BLKDBG_EVENT_MAX] = { [BLKDBG_CLUSTER_ALLOC] = "cluster_alloc", [BLKDBG_CLUSTER_ALLOC_BYTES] = "cluster_alloc_bytes", [BLKDBG_CLUSTER_FREE] = "cluster_free", + + [BLKDBG_FLUSH_TO_OS] = "flush_to_os", + [BLKDBG_FLUSH_TO_DISK] = "flush_to_disk", }; static int get_event_by_name(const char *name, BlkDebugEvent *event) @@ -226,6 +242,11 @@ static int add_rule(QemuOpts *opts, void *opaque) rule->options.set_state.new_state = qemu_opt_get_number(opts, "new_state", 0); break; + + case ACTION_SUSPEND: + rule->options.suspend.tag = + g_strdup(qemu_opt_get(opts, "tag")); + break; }; /* Add the rule */ @@ -234,6 +255,21 @@ static int add_rule(QemuOpts *opts, void *opaque) return 0; } +static void remove_rule(BlkdebugRule *rule) +{ + switch (rule->action) { + case ACTION_INJECT_ERROR: + case ACTION_SET_STATE: + break; 
+ case ACTION_SUSPEND: + g_free(rule->options.suspend.tag); + break; + } + + QLIST_REMOVE(rule, next); + g_free(rule); +} + static int read_config(BDRVBlkdebugState *s, const char *filename) { FILE *f; @@ -266,43 +302,98 @@ fail: } /* Valid blkdebug filenames look like blkdebug:path/to/config:path/to/image */ -static int blkdebug_open(BlockDriverState *bs, const char *filename, int flags) +static void blkdebug_parse_filename(const char *filename, QDict *options, + Error **errp) { - BDRVBlkdebugState *s = bs->opaque; - int ret; - char *config, *c; + const char *c; /* Parse the blkdebug: prefix */ - if (strncmp(filename, "blkdebug:", strlen("blkdebug:"))) { - return -EINVAL; + if (!strstart(filename, "blkdebug:", &filename)) { + error_setg(errp, "File name string must start with 'blkdebug:'"); + return; } - filename += strlen("blkdebug:"); - /* Read rules from config file */ + /* Parse config file path */ c = strchr(filename, ':'); if (c == NULL) { - return -EINVAL; + error_setg(errp, "blkdebug requires both config file and image path"); + return; } - config = g_strdup(filename); - config[c - filename] = '\0'; - ret = read_config(s, config); - g_free(config); - if (ret < 0) { - return ret; + if (c != filename) { + QString *config_path; + config_path = qstring_from_substr(filename, 0, c - filename - 1); + qdict_put(options, "config", config_path); } + + /* TODO Allow multi-level nesting and set file.filename here */ filename = c + 1; + qdict_put(options, "x-image", qstring_from_str(filename)); +} + +static QemuOptsList runtime_opts = { + .name = "blkdebug", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "config", + .type = QEMU_OPT_STRING, + .help = "Path to the configuration file", + }, + { + .name = "x-image", + .type = QEMU_OPT_STRING, + .help = "[internal use only, will be removed]", + }, + { /* end of list */ } + }, +}; + +static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags) +{ + BDRVBlkdebugState *s = 
bs->opaque; + QemuOpts *opts; + Error *local_err = NULL; + const char *filename, *config; + int ret; + + opts = qemu_opts_create_nofail(&runtime_opts); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + qerror_report_err(local_err); + error_free(local_err); + ret = -EINVAL; + goto fail; + } + + /* Read rules from config file */ + config = qemu_opt_get(opts, "config"); + if (config) { + ret = read_config(s, config); + if (ret < 0) { + goto fail; + } + } /* Set initial state */ s->state = 1; /* Open the backing file */ - ret = bdrv_file_open(&bs->file, filename, flags); + filename = qemu_opt_get(opts, "x-image"); + if (filename == NULL) { + ret = -EINVAL; + goto fail; + } + + ret = bdrv_file_open(&bs->file, filename, NULL, flags); if (ret < 0) { - return ret; + goto fail; } - return 0; + ret = 0; +fail: + qemu_opts_del(opts); + return ret; } static void error_callback_bh(void *opaque) @@ -389,6 +480,7 @@ static BlockDriverAIOCB *blkdebug_aio_writev(BlockDriverState *bs, return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque); } + static void blkdebug_close(BlockDriverState *bs) { BDRVBlkdebugState *s = bs->opaque; @@ -397,12 +489,32 @@ static void blkdebug_close(BlockDriverState *bs) for (i = 0; i < BLKDBG_EVENT_MAX; i++) { QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) { - QLIST_REMOVE(rule, next); - g_free(rule); + remove_rule(rule); } } } +static void suspend_request(BlockDriverState *bs, BlkdebugRule *rule) +{ + BDRVBlkdebugState *s = bs->opaque; + BlkdebugSuspendedReq r; + + r = (BlkdebugSuspendedReq) { + .co = qemu_coroutine_self(), + .tag = g_strdup(rule->options.suspend.tag), + }; + + remove_rule(rule); + QLIST_INSERT_HEAD(&s->suspended_reqs, &r, next); + + printf("blkdebug: Suspended request '%s'\n", r.tag); + qemu_coroutine_yield(); + printf("blkdebug: Resuming request '%s'\n", r.tag); + + QLIST_REMOVE(&r, next); + g_free(r.tag); +} + static bool process_rule(BlockDriverState *bs, struct 
BlkdebugRule *rule, bool injected) { @@ -426,6 +538,10 @@ static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule, case ACTION_SET_STATE: s->new_state = rule->options.set_state.new_state; break; + + case ACTION_SUSPEND: + suspend_request(bs, rule); + break; } return injected; } @@ -433,38 +549,94 @@ static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule, static void blkdebug_debug_event(BlockDriverState *bs, BlkDebugEvent event) { BDRVBlkdebugState *s = bs->opaque; - struct BlkdebugRule *rule; + struct BlkdebugRule *rule, *next; bool injected; assert((int)event >= 0 && event < BLKDBG_EVENT_MAX); injected = false; s->new_state = s->state; - QLIST_FOREACH(rule, &s->rules[event], next) { + QLIST_FOREACH_SAFE(rule, &s->rules[event], next, next) { injected = process_rule(bs, rule, injected); } s->state = s->new_state; } -static int64_t blkdebug_getlength(BlockDriverState *bs) +static int blkdebug_debug_breakpoint(BlockDriverState *bs, const char *event, + const char *tag) { - return bdrv_getlength(bs->file); + BDRVBlkdebugState *s = bs->opaque; + struct BlkdebugRule *rule; + BlkDebugEvent blkdebug_event; + + if (get_event_by_name(event, &blkdebug_event) < 0) { + return -ENOENT; + } + + + rule = g_malloc(sizeof(*rule)); + *rule = (struct BlkdebugRule) { + .event = blkdebug_event, + .action = ACTION_SUSPEND, + .state = 0, + .options.suspend.tag = g_strdup(tag), + }; + + QLIST_INSERT_HEAD(&s->rules[blkdebug_event], rule, next); + + return 0; +} + +static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag) +{ + BDRVBlkdebugState *s = bs->opaque; + BlkdebugSuspendedReq *r; + + QLIST_FOREACH(r, &s->suspended_reqs, next) { + if (!strcmp(r->tag, tag)) { + qemu_coroutine_enter(r->co, NULL); + return 0; + } + } + return -ENOENT; } -static BlockDriver bdrv_blkdebug = { - .format_name = "blkdebug", - .protocol_name = "blkdebug", - .instance_size = sizeof(BDRVBlkdebugState), +static bool blkdebug_debug_is_suspended(BlockDriverState 
*bs, const char *tag) +{ + BDRVBlkdebugState *s = bs->opaque; + BlkdebugSuspendedReq *r; - .bdrv_file_open = blkdebug_open, - .bdrv_close = blkdebug_close, - .bdrv_getlength = blkdebug_getlength, + QLIST_FOREACH(r, &s->suspended_reqs, next) { + if (!strcmp(r->tag, tag)) { + return true; + } + } + return false; +} - .bdrv_aio_readv = blkdebug_aio_readv, - .bdrv_aio_writev = blkdebug_aio_writev, +static int64_t blkdebug_getlength(BlockDriverState *bs) +{ + return bdrv_getlength(bs->file); +} - .bdrv_debug_event = blkdebug_debug_event, +static BlockDriver bdrv_blkdebug = { + .format_name = "blkdebug", + .protocol_name = "blkdebug", + .instance_size = sizeof(BDRVBlkdebugState), + + .bdrv_parse_filename = blkdebug_parse_filename, + .bdrv_file_open = blkdebug_open, + .bdrv_close = blkdebug_close, + .bdrv_getlength = blkdebug_getlength, + + .bdrv_aio_readv = blkdebug_aio_readv, + .bdrv_aio_writev = blkdebug_aio_writev, + + .bdrv_debug_event = blkdebug_debug_event, + .bdrv_debug_breakpoint = blkdebug_debug_breakpoint, + .bdrv_debug_resume = blkdebug_debug_resume, + .bdrv_debug_is_suspended = blkdebug_debug_is_suspended, }; static void bdrv_blkdebug_init(void) diff --git a/block/blkverify.c b/block/blkverify.c index 4beede77a..1d58cc393 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -8,8 +8,8 @@ */ #include <stdarg.h> -#include "qemu_socket.h" /* for EINPROGRESS on Windows */ -#include "block_int.h" +#include "qemu/sockets.h" /* for EINPROGRESS on Windows */ +#include "block/block_int.h" typedef struct { BlockDriverState *test_file; @@ -69,43 +69,100 @@ static void GCC_FMT_ATTR(2, 3) blkverify_err(BlkverifyAIOCB *acb, } /* Valid blkverify filenames look like blkverify:path/to/raw_image:path/to/image */ -static int blkverify_open(BlockDriverState *bs, const char *filename, int flags) +static void blkverify_parse_filename(const char *filename, QDict *options, + Error **errp) { - BDRVBlkverifyState *s = bs->opaque; - int ret; - char *raw, *c; + const char *c; + 
QString *raw_path; + /* Parse the blkverify: prefix */ - if (strncmp(filename, "blkverify:", strlen("blkverify:"))) { - return -EINVAL; + if (!strstart(filename, "blkverify:", &filename)) { + error_setg(errp, "File name string must start with 'blkverify:'"); + return; } - filename += strlen("blkverify:"); /* Parse the raw image filename */ c = strchr(filename, ':'); if (c == NULL) { - return -EINVAL; + error_setg(errp, "blkverify requires raw copy and original image path"); + return; } - raw = g_strdup(filename); - raw[c - filename] = '\0'; - ret = bdrv_file_open(&bs->file, raw, flags); - g_free(raw); + /* TODO Implement option pass-through and set raw.filename here */ + raw_path = qstring_from_substr(filename, 0, c - filename - 1); + qdict_put(options, "x-raw", raw_path); + + /* TODO Allow multi-level nesting and set file.filename here */ + filename = c + 1; + qdict_put(options, "x-image", qstring_from_str(filename)); +} + +static QemuOptsList runtime_opts = { + .name = "blkverify", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "x-raw", + .type = QEMU_OPT_STRING, + .help = "[internal use only, will be removed]", + }, + { + .name = "x-image", + .type = QEMU_OPT_STRING, + .help = "[internal use only, will be removed]", + }, + { /* end of list */ } + }, +}; + +static int blkverify_open(BlockDriverState *bs, QDict *options, int flags) +{ + BDRVBlkverifyState *s = bs->opaque; + QemuOpts *opts; + Error *local_err = NULL; + const char *filename, *raw; + int ret; + + opts = qemu_opts_create_nofail(&runtime_opts); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + qerror_report_err(local_err); + error_free(local_err); + ret = -EINVAL; + goto fail; + } + + /* Parse the raw image filename */ + raw = qemu_opt_get(opts, "x-raw"); + if (raw == NULL) { + ret = -EINVAL; + goto fail; + } + + ret = bdrv_file_open(&bs->file, raw, NULL, flags); if (ret < 0) { - return ret; + goto fail; } - filename = c + 1; /* 
Open the test file */ + filename = qemu_opt_get(opts, "x-image"); + if (filename == NULL) { + ret = -EINVAL; + goto fail; + } + s->test_file = bdrv_new(""); - ret = bdrv_open(s->test_file, filename, flags, NULL); + ret = bdrv_open(s->test_file, filename, NULL, flags, NULL); if (ret < 0) { bdrv_delete(s->test_file); s->test_file = NULL; - return ret; + goto fail; } - return 0; + ret = 0; +fail: + return ret; } static void blkverify_close(BlockDriverState *bs) @@ -343,19 +400,18 @@ static BlockDriverAIOCB *blkverify_aio_flush(BlockDriverState *bs, } static BlockDriver bdrv_blkverify = { - .format_name = "blkverify", - .protocol_name = "blkverify", - - .instance_size = sizeof(BDRVBlkverifyState), - - .bdrv_getlength = blkverify_getlength, - - .bdrv_file_open = blkverify_open, - .bdrv_close = blkverify_close, - - .bdrv_aio_readv = blkverify_aio_readv, - .bdrv_aio_writev = blkverify_aio_writev, - .bdrv_aio_flush = blkverify_aio_flush, + .format_name = "blkverify", + .protocol_name = "blkverify", + .instance_size = sizeof(BDRVBlkverifyState), + + .bdrv_parse_filename = blkverify_parse_filename, + .bdrv_file_open = blkverify_open, + .bdrv_close = blkverify_close, + .bdrv_getlength = blkverify_getlength, + + .bdrv_aio_readv = blkverify_aio_readv, + .bdrv_aio_writev = blkverify_aio_writev, + .bdrv_aio_flush = blkverify_aio_flush, }; static void bdrv_blkverify_init(void) diff --git a/block/bochs.c b/block/bochs.c index ab7944dc4..d7078c077 100644 --- a/block/bochs.c +++ b/block/bochs.c @@ -23,8 +23,8 @@ * THE SOFTWARE. 
*/ #include "qemu-common.h" -#include "block_int.h" -#include "module.h" +#include "block/block_int.h" +#include "qemu/module.h" /**************************************************************/ @@ -108,17 +108,19 @@ static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int bochs_open(BlockDriverState *bs, int flags) +static int bochs_open(BlockDriverState *bs, QDict *options, int flags) { BDRVBochsState *s = bs->opaque; int i; struct bochs_header bochs; struct bochs_header_v1 header_v1; + int ret; bs->read_only = 1; // no write support yet - if (bdrv_pread(bs->file, 0, &bochs, sizeof(bochs)) != sizeof(bochs)) { - goto fail; + ret = bdrv_pread(bs->file, 0, &bochs, sizeof(bochs)); + if (ret < 0) { + return ret; } if (strcmp(bochs.magic, HEADER_MAGIC) || @@ -126,7 +128,7 @@ static int bochs_open(BlockDriverState *bs, int flags) strcmp(bochs.subtype, GROWING_TYPE) || ((le32_to_cpu(bochs.version) != HEADER_VERSION) && (le32_to_cpu(bochs.version) != HEADER_V1))) { - goto fail; + return -EMEDIUMTYPE; } if (le32_to_cpu(bochs.version) == HEADER_V1) { @@ -138,9 +140,13 @@ static int bochs_open(BlockDriverState *bs, int flags) s->catalog_size = le32_to_cpu(bochs.extra.redolog.catalog); s->catalog_bitmap = g_malloc(s->catalog_size * 4); - if (bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap, - s->catalog_size * 4) != s->catalog_size * 4) - goto fail; + + ret = bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap, + s->catalog_size * 4); + if (ret < 0) { + goto fail; + } + for (i = 0; i < s->catalog_size; i++) le32_to_cpus(&s->catalog_bitmap[i]); @@ -153,8 +159,10 @@ static int bochs_open(BlockDriverState *bs, int flags) qemu_co_mutex_init(&s->lock); return 0; - fail: - return -1; + +fail: + g_free(s->catalog_bitmap); + return ret; } static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) diff --git a/block/cloop.c b/block/cloop.c index 7570eb8e7..6ea7cf404 100644 --- a/block/cloop.c +++ 
b/block/cloop.c @@ -22,8 +22,8 @@ * THE SOFTWARE. */ #include "qemu-common.h" -#include "block_int.h" -#include "module.h" +#include "block/block_int.h" +#include "qemu/module.h" #include <zlib.h> typedef struct BDRVCloopState { @@ -53,31 +53,36 @@ static int cloop_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int cloop_open(BlockDriverState *bs, int flags) +static int cloop_open(BlockDriverState *bs, QDict *options, int flags) { BDRVCloopState *s = bs->opaque; uint32_t offsets_size, max_compressed_block_size = 1, i; + int ret; bs->read_only = 1; /* read header */ - if (bdrv_pread(bs->file, 128, &s->block_size, 4) < 4) { - goto cloop_close; + ret = bdrv_pread(bs->file, 128, &s->block_size, 4); + if (ret < 0) { + return ret; } s->block_size = be32_to_cpu(s->block_size); - if (bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4) < 4) { - goto cloop_close; + ret = bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4); + if (ret < 0) { + return ret; } s->n_blocks = be32_to_cpu(s->n_blocks); /* read offsets */ offsets_size = s->n_blocks * sizeof(uint64_t); s->offsets = g_malloc(offsets_size); - if (bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size) < - offsets_size) { - goto cloop_close; + + ret = bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size); + if (ret < 0) { + goto fail; } + for(i=0;i<s->n_blocks;i++) { s->offsets[i] = be64_to_cpu(s->offsets[i]); if (i > 0) { @@ -92,7 +97,8 @@ static int cloop_open(BlockDriverState *bs, int flags) s->compressed_block = g_malloc(max_compressed_block_size + 1); s->uncompressed_block = g_malloc(s->block_size); if (inflateInit(&s->zstream) != Z_OK) { - goto cloop_close; + ret = -EINVAL; + goto fail; } s->current_block = s->n_blocks; @@ -101,8 +107,11 @@ static int cloop_open(BlockDriverState *bs, int flags) qemu_co_mutex_init(&s->lock); return 0; -cloop_close: - return -1; +fail: + g_free(s->offsets); + g_free(s->compressed_block); + g_free(s->uncompressed_block); + return ret; } static inline 
int cloop_read_block(BlockDriverState *bs, int block_num) diff --git a/block/commit.c b/block/commit.c index fae79582d..2227fc2e6 100644 --- a/block/commit.c +++ b/block/commit.c @@ -13,8 +13,8 @@ */ #include "trace.h" -#include "block_int.h" -#include "blockjob.h" +#include "block/block_int.h" +#include "block/blockjob.h" #include "qemu/ratelimit.h" enum { @@ -65,7 +65,7 @@ static void coroutine_fn commit_run(void *opaque) BlockDriverState *active = s->active; BlockDriverState *top = s->top; BlockDriverState *base = s->base; - BlockDriverState *overlay_bs = NULL; + BlockDriverState *overlay_bs; int64_t sector_num, end; int ret = 0; int n = 0; @@ -92,8 +92,6 @@ static void coroutine_fn commit_run(void *opaque) } } - overlay_bs = bdrv_find_overlay(active, top); - end = s->common.len >> BDRV_SECTOR_BITS; buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE); @@ -103,7 +101,7 @@ static void coroutine_fn commit_run(void *opaque) wait: /* Note that even when no rate limit is applied we need to yield - * with no pending I/O here so that qemu_aio_flush() returns. + * with no pending I/O here so that bdrv_drain_all() returns. 
*/ block_job_sleep_ns(&s->common, rt_clock, delay_ns); if (block_job_is_cancelled(&s->common)) { @@ -156,7 +154,8 @@ exit_restore_reopen: if (s->base_flags != bdrv_get_flags(base)) { bdrv_reopen(base, s->base_flags, NULL); } - if (s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) { + overlay_bs = bdrv_find_overlay(active, top); + if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) { bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL); } @@ -174,7 +173,7 @@ static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp) ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); } -static BlockJobType commit_job_type = { +static const BlockJobType commit_job_type = { .instance_size = sizeof(CommitBlockJob), .job_type = "commit", .set_speed = commit_set_speed, diff --git a/block/cow.c b/block/cow.c index a5a00eb9c..1cc2e89c7 100644 --- a/block/cow.c +++ b/block/cow.c @@ -22,8 +22,8 @@ * THE SOFTWARE. */ #include "qemu-common.h" -#include "block_int.h" -#include "module.h" +#include "block/block_int.h" +#include "qemu/module.h" /**************************************************************/ /* COW block driver using file system holes */ @@ -58,7 +58,7 @@ static int cow_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int cow_open(BlockDriverState *bs, int flags) +static int cow_open(BlockDriverState *bs, QDict *options, int flags) { BDRVCowState *s = bs->opaque; struct cow_header_v2 cow_header; @@ -73,7 +73,7 @@ static int cow_open(BlockDriverState *bs, int flags) } if (be32_to_cpu(cow_header.magic) != COW_MAGIC) { - ret = -EINVAL; + ret = -EMEDIUMTYPE; goto fail; } @@ -279,7 +279,7 @@ static int cow_create(const char *filename, QEMUOptionParameter *options) return ret; } - ret = bdrv_file_open(&cow_bs, filename, BDRV_O_RDWR); + ret = bdrv_file_open(&cow_bs, filename, NULL, BDRV_O_RDWR); if (ret < 0) { return ret; } @@ -340,6 +340,7 @@ static BlockDriver bdrv_cow = { .bdrv_open = cow_open, 
.bdrv_close = cow_close, .bdrv_create = cow_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_read = cow_co_read, .bdrv_write = cow_co_write, diff --git a/block/curl.c b/block/curl.c index 1179484de..82d39ff53 100644 --- a/block/curl.c +++ b/block/curl.c @@ -22,7 +22,7 @@ * THE SOFTWARE. */ #include "qemu-common.h" -#include "block_int.h" +#include "block/block_int.h" #include <curl/curl.h> // #define DEBUG @@ -34,6 +34,10 @@ #define DPRINTF(fmt, ...) do { } while (0) #endif +#define PROTOCOLS (CURLPROTO_HTTP | CURLPROTO_HTTPS | \ + CURLPROTO_FTP | CURLPROTO_FTPS | \ + CURLPROTO_TFTP) + #define CURL_NUM_STATES 8 #define CURL_NUM_ACB 8 #define SECTOR_SIZE 512 @@ -77,6 +81,7 @@ typedef struct BDRVCURLState { CURLState states[CURL_NUM_STATES]; char *url; size_t readahead_size; + bool accept_range; } BDRVCURLState; static void curl_clean_state(CURLState *s); @@ -106,14 +111,15 @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, return 0; } -static size_t curl_size_cb(void *ptr, size_t size, size_t nmemb, void *opaque) +static size_t curl_header_cb(void *ptr, size_t size, size_t nmemb, void *opaque) { - CURLState *s = ((CURLState*)opaque); + BDRVCURLState *s = opaque; size_t realsize = size * nmemb; - size_t fsize; + const char *accept_line = "Accept-Ranges: bytes"; - if(sscanf(ptr, "Content-Length: %zd", &fsize) == 1) { - s->s->len = fsize; + if (realsize >= strlen(accept_line) + && strncmp((char *)ptr, accept_line, strlen(accept_line)) == 0) { + s->accept_range = true; } return realsize; @@ -302,6 +308,17 @@ static CURLState *curl_init_state(BDRVCURLState *s) curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg); curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1); + /* Restrict supported protocols to avoid security issues in the more + * obscure protocols. For example, do not allow POP3/SMTP/IMAP see + * CVE-2013-0249. + * + * Restricting protocols is only supported from 7.19.4 upwards. 
+ */ +#if LIBCURL_VERSION_NUM >= 0x071304 + curl_easy_setopt(state->curl, CURLOPT_PROTOCOLS, PROTOCOLS); + curl_easy_setopt(state->curl, CURLOPT_REDIR_PROTOCOLS, PROTOCOLS); +#endif + #ifdef DEBUG_VERBOSE curl_easy_setopt(state->curl, CURLOPT_VERBOSE, 1); #endif @@ -320,11 +337,9 @@ static void curl_clean_state(CURLState *s) s->in_use = 0; } -static int curl_open(BlockDriverState *bs, const char *filename, int flags) +static void curl_parse_filename(const char *filename, QDict *options, + Error **errp) { - BDRVCURLState *s = bs->opaque; - CURLState *state = NULL; - double d; #define RA_OPTSTR ":readahead=" char *file; @@ -332,19 +347,17 @@ static int curl_open(BlockDriverState *bs, const char *filename, int flags) const char *ra_val; int parse_state = 0; - static int inited = 0; - file = g_strdup(filename); - s->readahead_size = READ_AHEAD_SIZE; /* Parse a trailing ":readahead=#:" param, if present. */ ra = file + strlen(file) - 1; while (ra >= file) { if (parse_state == 0) { - if (*ra == ':') + if (*ra == ':') { parse_state++; - else + } else { break; + } } else if (parse_state == 1) { if (*ra > '9' || *ra < '0') { char *opt_start = ra - strlen(RA_OPTSTR) + 1; @@ -353,46 +366,108 @@ static int curl_open(BlockDriverState *bs, const char *filename, int flags) ra_val = ra + 1; ra -= strlen(RA_OPTSTR) - 1; *ra = '\0'; - s->readahead_size = atoi(ra_val); - break; - } else { - break; + qdict_put(options, "readahead", qstring_from_str(ra_val)); } + break; } } ra--; } + qdict_put(options, "url", qstring_from_str(file)); + + g_free(file); +} + +static QemuOptsList runtime_opts = { + .name = "curl", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "url", + .type = QEMU_OPT_STRING, + .help = "URL to open", + }, + { + .name = "readahead", + .type = QEMU_OPT_SIZE, + .help = "Readahead size", + }, + { /* end of list */ } + }, +}; + +static int curl_open(BlockDriverState *bs, QDict *options, int flags) +{ + BDRVCURLState *s = bs->opaque; + 
CURLState *state = NULL; + QemuOpts *opts; + Error *local_err = NULL; + const char *file; + double d; + + static int inited = 0; + + if (flags & BDRV_O_RDWR) { + qerror_report(ERROR_CLASS_GENERIC_ERROR, + "curl block device does not support writes"); + return -EROFS; + } + + opts = qemu_opts_create_nofail(&runtime_opts); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + qerror_report_err(local_err); + error_free(local_err); + goto out_noclean; + } + + s->readahead_size = qemu_opt_get_size(opts, "readahead", READ_AHEAD_SIZE); if ((s->readahead_size & 0x1ff) != 0) { fprintf(stderr, "HTTP_READAHEAD_SIZE %zd is not a multiple of 512\n", s->readahead_size); goto out_noclean; } + file = qemu_opt_get(opts, "url"); + if (file == NULL) { + qerror_report(ERROR_CLASS_GENERIC_ERROR, "curl block driver requires " + "an 'url' option"); + goto out_noclean; + } + if (!inited) { curl_global_init(CURL_GLOBAL_ALL); inited = 1; } DPRINTF("CURL: Opening %s\n", file); - s->url = file; + s->url = g_strdup(file); state = curl_init_state(s); if (!state) goto out_noclean; // Get file size + s->accept_range = false; curl_easy_setopt(state->curl, CURLOPT_NOBODY, 1); - curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, (void *)curl_size_cb); + curl_easy_setopt(state->curl, CURLOPT_HEADERFUNCTION, + curl_header_cb); + curl_easy_setopt(state->curl, CURLOPT_HEADERDATA, s); if (curl_easy_perform(state->curl)) goto out; curl_easy_getinfo(state->curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &d); - curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, (void *)curl_read_cb); - curl_easy_setopt(state->curl, CURLOPT_NOBODY, 0); if (d) s->len = (size_t)d; else if(!s->len) goto out; + if ((!strncasecmp(s->url, "http://", strlen("http://")) + || !strncasecmp(s->url, "https://", strlen("https://"))) + && !s->accept_range) { + pstrcpy(state->errmsg, CURL_ERROR_SIZE, + "Server does not support 'range' (byte ranges)."); + goto out; + } DPRINTF("CURL: Size = %zd\n", s->len); 
curl_clean_state(state); @@ -403,10 +478,11 @@ static int curl_open(BlockDriverState *bs, const char *filename, int flags) // initialize the multi interface! s->multi = curl_multi_init(); - curl_multi_setopt( s->multi, CURLMOPT_SOCKETDATA, s); - curl_multi_setopt( s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb ); + curl_multi_setopt(s->multi, CURLMOPT_SOCKETDATA, s); + curl_multi_setopt(s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb); curl_multi_do(s); + qemu_opts_del(opts); return 0; out: @@ -414,7 +490,8 @@ out: curl_easy_cleanup(state->curl); state->curl = NULL; out_noclean: - g_free(file); + g_free(s->url); + qemu_opts_del(opts); return -EINVAL; } @@ -552,63 +629,68 @@ static int64_t curl_getlength(BlockDriverState *bs) } static BlockDriver bdrv_http = { - .format_name = "http", - .protocol_name = "http", + .format_name = "http", + .protocol_name = "http", - .instance_size = sizeof(BDRVCURLState), - .bdrv_file_open = curl_open, - .bdrv_close = curl_close, - .bdrv_getlength = curl_getlength, + .instance_size = sizeof(BDRVCURLState), + .bdrv_parse_filename = curl_parse_filename, + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, - .bdrv_aio_readv = curl_aio_readv, + .bdrv_aio_readv = curl_aio_readv, }; static BlockDriver bdrv_https = { - .format_name = "https", - .protocol_name = "https", + .format_name = "https", + .protocol_name = "https", - .instance_size = sizeof(BDRVCURLState), - .bdrv_file_open = curl_open, - .bdrv_close = curl_close, - .bdrv_getlength = curl_getlength, + .instance_size = sizeof(BDRVCURLState), + .bdrv_parse_filename = curl_parse_filename, + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, - .bdrv_aio_readv = curl_aio_readv, + .bdrv_aio_readv = curl_aio_readv, }; static BlockDriver bdrv_ftp = { - .format_name = "ftp", - .protocol_name = "ftp", + .format_name = "ftp", + .protocol_name = "ftp", - .instance_size = sizeof(BDRVCURLState), - .bdrv_file_open 
= curl_open, - .bdrv_close = curl_close, - .bdrv_getlength = curl_getlength, + .instance_size = sizeof(BDRVCURLState), + .bdrv_parse_filename = curl_parse_filename, + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, - .bdrv_aio_readv = curl_aio_readv, + .bdrv_aio_readv = curl_aio_readv, }; static BlockDriver bdrv_ftps = { - .format_name = "ftps", - .protocol_name = "ftps", + .format_name = "ftps", + .protocol_name = "ftps", - .instance_size = sizeof(BDRVCURLState), - .bdrv_file_open = curl_open, - .bdrv_close = curl_close, - .bdrv_getlength = curl_getlength, + .instance_size = sizeof(BDRVCURLState), + .bdrv_parse_filename = curl_parse_filename, + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, - .bdrv_aio_readv = curl_aio_readv, + .bdrv_aio_readv = curl_aio_readv, }; static BlockDriver bdrv_tftp = { - .format_name = "tftp", - .protocol_name = "tftp", + .format_name = "tftp", + .protocol_name = "tftp", - .instance_size = sizeof(BDRVCURLState), - .bdrv_file_open = curl_open, - .bdrv_close = curl_close, - .bdrv_getlength = curl_getlength, + .instance_size = sizeof(BDRVCURLState), + .bdrv_parse_filename = curl_parse_filename, + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, - .bdrv_aio_readv = curl_aio_readv, + .bdrv_aio_readv = curl_aio_readv, }; static void curl_block_init(void) diff --git a/block/dmg.c b/block/dmg.c index 37902a434..3141cb5b8 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -22,9 +22,9 @@ * THE SOFTWARE. 
*/ #include "qemu-common.h" -#include "block_int.h" -#include "bswap.h" -#include "module.h" +#include "block/block_int.h" +#include "qemu/bswap.h" +#include "qemu/module.h" #include <zlib.h> typedef struct BDRVDMGState { @@ -51,35 +51,55 @@ typedef struct BDRVDMGState { static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename) { - int len=strlen(filename); - if(len>4 && !strcmp(filename+len-4,".dmg")) - return 2; + int len; + + if (!filename) { + return 0; + } + + len = strlen(filename); + if (len > 4 && !strcmp(filename + len - 4, ".dmg")) { + return 2; + } return 0; } -static off_t read_off(BlockDriverState *bs, int64_t offset) +static int read_uint64(BlockDriverState *bs, int64_t offset, uint64_t *result) { - uint64_t buffer; - if (bdrv_pread(bs->file, offset, &buffer, 8) < 8) - return 0; - return be64_to_cpu(buffer); + uint64_t buffer; + int ret; + + ret = bdrv_pread(bs->file, offset, &buffer, 8); + if (ret < 0) { + return ret; + } + + *result = be64_to_cpu(buffer); + return 0; } -static off_t read_uint32(BlockDriverState *bs, int64_t offset) +static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result) { - uint32_t buffer; - if (bdrv_pread(bs->file, offset, &buffer, 4) < 4) - return 0; - return be32_to_cpu(buffer); + uint32_t buffer; + int ret; + + ret = bdrv_pread(bs->file, offset, &buffer, 4); + if (ret < 0) { + return ret; + } + + *result = be32_to_cpu(buffer); + return 0; } -static int dmg_open(BlockDriverState *bs, int flags) +static int dmg_open(BlockDriverState *bs, QDict *options, int flags) { BDRVDMGState *s = bs->opaque; - off_t info_begin,info_end,last_in_offset,last_out_offset; - uint32_t count; + uint64_t info_begin,info_end,last_in_offset,last_out_offset; + uint32_t count, tmp; uint32_t max_compressed_size=1,max_sectors_per_chunk=1,i; int64_t offset; + int ret; bs->read_only = 1; s->n_chunks = 0; @@ -88,21 +108,32 @@ static int dmg_open(BlockDriverState *bs, int flags) /* read offset of info blocks */ offset 
= bdrv_getlength(bs->file); if (offset < 0) { + ret = offset; goto fail; } offset -= 0x1d8; - info_begin = read_off(bs, offset); - if (info_begin == 0) { - goto fail; + ret = read_uint64(bs, offset, &info_begin); + if (ret < 0) { + goto fail; + } else if (info_begin == 0) { + ret = -EINVAL; + goto fail; } - if (read_uint32(bs, info_begin) != 0x100) { + ret = read_uint32(bs, info_begin, &tmp); + if (ret < 0) { + goto fail; + } else if (tmp != 0x100) { + ret = -EINVAL; goto fail; } - count = read_uint32(bs, info_begin + 4); - if (count == 0) { + ret = read_uint32(bs, info_begin + 4, &count); + if (ret < 0) { + goto fail; + } else if (count == 0) { + ret = -EINVAL; goto fail; } info_end = info_begin + count; @@ -114,12 +145,20 @@ static int dmg_open(BlockDriverState *bs, int flags) while (offset < info_end) { uint32_t type; - count = read_uint32(bs, offset); - if(count==0) - goto fail; + ret = read_uint32(bs, offset, &count); + if (ret < 0) { + goto fail; + } else if (count == 0) { + ret = -EINVAL; + goto fail; + } offset += 4; - type = read_uint32(bs, offset); + ret = read_uint32(bs, offset, &type); + if (ret < 0) { + goto fail; + } + if (type == 0x6d697368 && count >= 244) { int new_size, chunk_count; @@ -134,8 +173,11 @@ static int dmg_open(BlockDriverState *bs, int flags) s->sectors = g_realloc(s->sectors, new_size); s->sectorcounts = g_realloc(s->sectorcounts, new_size); - for(i=s->n_chunks;i<s->n_chunks+chunk_count;i++) { - s->types[i] = read_uint32(bs, offset); + for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) { + ret = read_uint32(bs, offset, &s->types[i]); + if (ret < 0) { + goto fail; + } offset += 4; if(s->types[i]!=0x80000005 && s->types[i]!=1 && s->types[i]!=2) { if(s->types[i]==0xffffffff) { @@ -149,17 +191,31 @@ static int dmg_open(BlockDriverState *bs, int flags) } offset += 4; - s->sectors[i] = last_out_offset+read_off(bs, offset); - offset += 8; - - s->sectorcounts[i] = read_off(bs, offset); - offset += 8; - - s->offsets[i] = 
last_in_offset+read_off(bs, offset); - offset += 8; - - s->lengths[i] = read_off(bs, offset); - offset += 8; + ret = read_uint64(bs, offset, &s->sectors[i]); + if (ret < 0) { + goto fail; + } + s->sectors[i] += last_out_offset; + offset += 8; + + ret = read_uint64(bs, offset, &s->sectorcounts[i]); + if (ret < 0) { + goto fail; + } + offset += 8; + + ret = read_uint64(bs, offset, &s->offsets[i]); + if (ret < 0) { + goto fail; + } + s->offsets[i] += last_in_offset; + offset += 8; + + ret = read_uint64(bs, offset, &s->lengths[i]); + if (ret < 0) { + goto fail; + } + offset += 8; if(s->lengths[i]>max_compressed_size) max_compressed_size = s->lengths[i]; @@ -173,15 +229,25 @@ static int dmg_open(BlockDriverState *bs, int flags) /* initialize zlib engine */ s->compressed_chunk = g_malloc(max_compressed_size+1); s->uncompressed_chunk = g_malloc(512*max_sectors_per_chunk); - if(inflateInit(&s->zstream) != Z_OK) - goto fail; + if(inflateInit(&s->zstream) != Z_OK) { + ret = -EINVAL; + goto fail; + } s->current_chunk = s->n_chunks; qemu_co_mutex_init(&s->lock); return 0; + fail: - return -1; + g_free(s->types); + g_free(s->offsets); + g_free(s->lengths); + g_free(s->sectors); + g_free(s->sectorcounts); + g_free(s->compressed_chunk); + g_free(s->uncompressed_chunk); + return ret; } static inline int is_sector_in_chunk(BDRVDMGState* s, @@ -296,15 +362,15 @@ static coroutine_fn int dmg_co_read(BlockDriverState *bs, int64_t sector_num, static void dmg_close(BlockDriverState *bs) { BDRVDMGState *s = bs->opaque; - if(s->n_chunks>0) { - free(s->types); - free(s->offsets); - free(s->lengths); - free(s->sectors); - free(s->sectorcounts); - } - free(s->compressed_chunk); - free(s->uncompressed_chunk); + + g_free(s->types); + g_free(s->offsets); + g_free(s->lengths); + g_free(s->sectors); + g_free(s->sectorcounts); + g_free(s->compressed_chunk); + g_free(s->uncompressed_chunk); + inflateEnd(&s->zstream); } diff --git a/block/gluster.c b/block/gluster.c index 1c90174b1..645b7f12a 100644 
--- a/block/gluster.c +++ b/block/gluster.c @@ -16,9 +16,9 @@ * GNU GPL, version 2 or (at your option) any later version. */ #include <glusterfs/api/glfs.h> -#include "block_int.h" -#include "qemu_socket.h" -#include "uri.h" +#include "block/block_int.h" +#include "qemu/sockets.h" +#include "qemu/uri.h" typedef struct GlusterAIOCB { BlockDriverAIOCB common; @@ -217,7 +217,7 @@ static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename) ret = glfs_init(glfs); if (ret) { error_report("Gluster connection failed for server=%s port=%d " - "volume=%s image=%s transport=%s\n", gconf->server, gconf->port, + "volume=%s image=%s transport=%s", gconf->server, gconf->port, gconf->volname, gconf->image, gconf->transport); goto out; } @@ -282,13 +282,42 @@ static int qemu_gluster_aio_flush_cb(void *opaque) return (s->qemu_aio_count > 0); } -static int qemu_gluster_open(BlockDriverState *bs, const char *filename, - int bdrv_flags) +/* TODO Convert to fine grained options */ +static QemuOptsList runtime_opts = { + .name = "gluster", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "URL to the gluster image", + }, + { /* end of list */ } + }, +}; + +static int qemu_gluster_open(BlockDriverState *bs, QDict *options, + int bdrv_flags) { BDRVGlusterState *s = bs->opaque; int open_flags = O_BINARY; int ret = 0; GlusterConf *gconf = g_malloc0(sizeof(GlusterConf)); + QemuOpts *opts; + Error *local_err = NULL; + const char *filename; + + opts = qemu_opts_create_nofail(&runtime_opts); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + qerror_report_err(local_err); + error_free(local_err); + ret = -EINVAL; + goto out; + } + + filename = qemu_opt_get(opts, "filename"); + s->glfs = qemu_gluster_init(gconf, filename); if (!s->glfs) { @@ -322,6 +351,7 @@ static int qemu_gluster_open(BlockDriverState *bs, const char *filename, qemu_gluster_aio_event_reader, 
NULL, qemu_gluster_aio_flush_cb, s); out: + qemu_opts_del(opts); qemu_gluster_gconf_free(gconf); if (!ret) { return ret; @@ -463,6 +493,19 @@ out: return NULL; } +static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset) +{ + int ret; + BDRVGlusterState *s = bs->opaque; + + ret = glfs_ftruncate(s->fd, offset); + if (ret < 0) { + return -errno; + } + + return 0; +} + static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) @@ -502,6 +545,39 @@ out: return NULL; } +#ifdef CONFIG_GLUSTERFS_DISCARD +static BlockDriverAIOCB *qemu_gluster_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BlockDriverCompletionFunc *cb, + void *opaque) +{ + int ret; + GlusterAIOCB *acb; + BDRVGlusterState *s = bs->opaque; + size_t size; + off_t offset; + + offset = sector_num * BDRV_SECTOR_SIZE; + size = nb_sectors * BDRV_SECTOR_SIZE; + + acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque); + acb->size = 0; + acb->ret = 0; + acb->finished = NULL; + s->qemu_aio_count++; + + ret = glfs_discard_async(s->fd, offset, size, &gluster_finish_aiocb, acb); + if (ret < 0) { + goto out; + } + return &acb->common; + +out: + s->qemu_aio_count--; + qemu_aio_release(acb); + return NULL; +} +#endif + static int64_t qemu_gluster_getlength(BlockDriverState *bs) { BDRVGlusterState *s = bs->opaque; @@ -544,6 +620,12 @@ static void qemu_gluster_close(BlockDriverState *bs) glfs_fini(s->glfs); } +static int qemu_gluster_has_zero_init(BlockDriverState *bs) +{ + /* GlusterFS volume could be backed by a block device */ + return 0; +} + static QEMUOptionParameter qemu_gluster_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -562,9 +644,14 @@ static BlockDriver bdrv_gluster = { .bdrv_create = qemu_gluster_create, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, + .bdrv_truncate = qemu_gluster_truncate, 
.bdrv_aio_readv = qemu_gluster_aio_readv, .bdrv_aio_writev = qemu_gluster_aio_writev, .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_has_zero_init = qemu_gluster_has_zero_init, +#ifdef CONFIG_GLUSTERFS_DISCARD + .bdrv_aio_discard = qemu_gluster_aio_discard, +#endif .create_options = qemu_gluster_create_options, }; @@ -577,9 +664,14 @@ static BlockDriver bdrv_gluster_tcp = { .bdrv_create = qemu_gluster_create, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, + .bdrv_truncate = qemu_gluster_truncate, .bdrv_aio_readv = qemu_gluster_aio_readv, .bdrv_aio_writev = qemu_gluster_aio_writev, .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_has_zero_init = qemu_gluster_has_zero_init, +#ifdef CONFIG_GLUSTERFS_DISCARD + .bdrv_aio_discard = qemu_gluster_aio_discard, +#endif .create_options = qemu_gluster_create_options, }; @@ -592,9 +684,14 @@ static BlockDriver bdrv_gluster_unix = { .bdrv_create = qemu_gluster_create, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, + .bdrv_truncate = qemu_gluster_truncate, .bdrv_aio_readv = qemu_gluster_aio_readv, .bdrv_aio_writev = qemu_gluster_aio_writev, .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_has_zero_init = qemu_gluster_has_zero_init, +#ifdef CONFIG_GLUSTERFS_DISCARD + .bdrv_aio_discard = qemu_gluster_aio_discard, +#endif .create_options = qemu_gluster_create_options, }; @@ -607,9 +704,14 @@ static BlockDriver bdrv_gluster_rdma = { .bdrv_create = qemu_gluster_create, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, + .bdrv_truncate = qemu_gluster_truncate, .bdrv_aio_readv = qemu_gluster_aio_readv, .bdrv_aio_writev = qemu_gluster_aio_writev, .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_has_zero_init = qemu_gluster_has_zero_init, +#ifdef CONFIG_GLUSTERFS_DISCARD + .bdrv_aio_discard = qemu_gluster_aio_discard, +#endif .create_options = 
qemu_gluster_create_options, }; diff --git a/block/iscsi.c b/block/iscsi.c index c0b70b3d3..e7c1c2b53 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -27,17 +27,19 @@ #include <poll.h> #include <arpa/inet.h> #include "qemu-common.h" -#include "qemu-error.h" -#include "block_int.h" +#include "qemu/config-file.h" +#include "qemu/error-report.h" +#include "block/block_int.h" #include "trace.h" -#include "hw/scsi-defs.h" +#include "block/scsi.h" +#include "qemu/iov.h" #include <iscsi/iscsi.h> #include <iscsi/scsi-lowlevel.h> #ifdef __linux__ #include <scsi/sg.h> -#include <hw/scsi-defs.h> +#include <block/scsi.h> #endif typedef struct IscsiLun { @@ -47,6 +49,7 @@ typedef struct IscsiLun { int block_size; uint64_t num_blocks; int events; + QEMUTimer *nop_timer; } IscsiLun; typedef struct IscsiAIOCB { @@ -58,13 +61,18 @@ typedef struct IscsiAIOCB { uint8_t *buf; int status; int canceled; - size_t read_size; - size_t read_offset; + int retries; + int64_t sector_num; + int nb_sectors; #ifdef __linux__ sg_io_hdr_t *ioh; #endif } IscsiAIOCB; +#define NOP_INTERVAL 5000 +#define MAX_NOP_FAILURES 3 +#define ISCSI_CMD_RETRIES 5 + static void iscsi_bh_cb(void *p) { @@ -72,6 +80,9 @@ iscsi_bh_cb(void *p) qemu_bh_delete(acb->bh); + g_free(acb->buf); + acb->buf = NULL; + if (acb->canceled == 0) { acb->common.cb(acb->common.opaque, acb->status); } @@ -183,6 +194,8 @@ iscsi_process_write(void *arg) iscsi_set_events(iscsilun); } +static int +iscsi_aio_writev_acb(IscsiAIOCB *acb); static void iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status, @@ -193,13 +206,24 @@ iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status, trace_iscsi_aio_write16_cb(iscsi, status, acb, acb->canceled); g_free(acb->buf); + acb->buf = NULL; if (acb->canceled != 0) { return; } acb->status = 0; - if (status < 0) { + if (status != 0) { + if (status == SCSI_STATUS_CHECK_CONDITION + && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION + && acb->retries-- > 0) { + scsi_free_scsi_task(acb->task); + 
acb->task = NULL; + if (iscsi_aio_writev_acb(acb) == 0) { + iscsi_set_events(acb->iscsilun); + return; + } + } error_report("Failed to write16 data to iSCSI lun. %s", iscsi_get_error(iscsi)); acb->status = -EIO; @@ -208,78 +232,139 @@ iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status, iscsi_schedule_bh(acb); } +static int64_t sector_lun2qemu(int64_t sector, IscsiLun *iscsilun) +{ + return sector * iscsilun->block_size / BDRV_SECTOR_SIZE; +} + static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun) { return sector * BDRV_SECTOR_SIZE / iscsilun->block_size; } -static BlockDriverAIOCB * -iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque) +static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors, + IscsiLun *iscsilun) { - IscsiLun *iscsilun = bs->opaque; - struct iscsi_context *iscsi = iscsilun->iscsi; - IscsiAIOCB *acb; + if ((sector_num * BDRV_SECTOR_SIZE) % iscsilun->block_size || + (nb_sectors * BDRV_SECTOR_SIZE) % iscsilun->block_size) { + error_report("iSCSI misaligned request: " + "iscsilun->block_size %u, sector_num %" PRIi64 + ", nb_sectors %d", + iscsilun->block_size, sector_num, nb_sectors); + return 0; + } + return 1; +} + +static int +iscsi_aio_writev_acb(IscsiAIOCB *acb) +{ + struct iscsi_context *iscsi = acb->iscsilun->iscsi; size_t size; uint32_t num_sectors; uint64_t lba; +#if !defined(LIBISCSI_FEATURE_IOVECTOR) struct iscsi_data data; - - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); - trace_iscsi_aio_writev(iscsi, sector_num, nb_sectors, opaque, acb); - - acb->iscsilun = iscsilun; - acb->qiov = qiov; +#endif + int ret; acb->canceled = 0; acb->bh = NULL; acb->status = -EINPROGRESS; + acb->buf = NULL; - /* XXX we should pass the iovec to write16 to avoid the extra copy */ /* this will allow us to get rid of 'buf' completely */ - size = nb_sectors * BDRV_SECTOR_SIZE; - acb->buf = g_malloc(size); - 
qemu_iovec_to_buf(acb->qiov, 0, acb->buf, size); + size = acb->nb_sectors * BDRV_SECTOR_SIZE; + +#if !defined(LIBISCSI_FEATURE_IOVECTOR) + data.size = MIN(size, acb->qiov->size); + + /* if the iovec only contains one buffer we can pass it directly */ + if (acb->qiov->niov == 1) { + data.data = acb->qiov->iov[0].iov_base; + } else { + acb->buf = g_malloc(data.size); + qemu_iovec_to_buf(acb->qiov, 0, acb->buf, data.size); + data.data = acb->buf; + } +#endif acb->task = malloc(sizeof(struct scsi_task)); if (acb->task == NULL) { error_report("iSCSI: Failed to allocate task for scsi WRITE16 " "command. %s", iscsi_get_error(iscsi)); - qemu_aio_release(acb); - return NULL; + return -1; } memset(acb->task, 0, sizeof(struct scsi_task)); acb->task->xfer_dir = SCSI_XFER_WRITE; acb->task->cdb_size = 16; acb->task->cdb[0] = 0x8a; - lba = sector_qemu2lun(sector_num, iscsilun); + lba = sector_qemu2lun(acb->sector_num, acb->iscsilun); *(uint32_t *)&acb->task->cdb[2] = htonl(lba >> 32); *(uint32_t *)&acb->task->cdb[6] = htonl(lba & 0xffffffff); - num_sectors = size / iscsilun->block_size; + num_sectors = sector_qemu2lun(acb->nb_sectors, acb->iscsilun); *(uint32_t *)&acb->task->cdb[10] = htonl(num_sectors); acb->task->expxferlen = size; - data.data = acb->buf; - data.size = size; - - if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task, - iscsi_aio_write16_cb, - &data, - acb) != 0) { +#if defined(LIBISCSI_FEATURE_IOVECTOR) + ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task, + iscsi_aio_write16_cb, + NULL, + acb); +#else + ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task, + iscsi_aio_write16_cb, + &data, + acb); +#endif + if (ret != 0) { scsi_free_scsi_task(acb->task); g_free(acb->buf); + return -1; + } + +#if defined(LIBISCSI_FEATURE_IOVECTOR) + scsi_task_set_iov_out(acb->task, (struct scsi_iovec*) acb->qiov->iov, acb->qiov->niov); +#endif + + return 0; +} + +static BlockDriverAIOCB * +iscsi_aio_writev(BlockDriverState *bs, int64_t 
sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + IscsiLun *iscsilun = bs->opaque; + IscsiAIOCB *acb; + + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + return NULL; + } + + acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); + trace_iscsi_aio_writev(iscsilun->iscsi, sector_num, nb_sectors, opaque, acb); + + acb->iscsilun = iscsilun; + acb->qiov = qiov; + acb->nb_sectors = nb_sectors; + acb->sector_num = sector_num; + acb->retries = ISCSI_CMD_RETRIES; + + if (iscsi_aio_writev_acb(acb) != 0) { qemu_aio_release(acb); return NULL; } iscsi_set_events(iscsilun); - return &acb->common; } +static int +iscsi_aio_readv_acb(IscsiAIOCB *acb); + static void iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status, void *command_data, void *opaque) @@ -294,6 +379,16 @@ iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status, acb->status = 0; if (status != 0) { + if (status == SCSI_STATUS_CHECK_CONDITION + && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION + && acb->retries-- > 0) { + scsi_free_scsi_task(acb->task); + acb->task = NULL; + if (iscsi_aio_readv_acb(acb) == 0) { + iscsi_set_events(acb->iscsilun); + return; + } + } error_report("Failed to read16 data from iSCSI lun. 
%s", iscsi_get_error(iscsi)); acb->status = -EIO; @@ -302,63 +397,39 @@ iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status, iscsi_schedule_bh(acb); } -static BlockDriverAIOCB * -iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque) +static int +iscsi_aio_readv_acb(IscsiAIOCB *acb) { - IscsiLun *iscsilun = bs->opaque; - struct iscsi_context *iscsi = iscsilun->iscsi; - IscsiAIOCB *acb; - size_t qemu_read_size; - int i; + struct iscsi_context *iscsi = acb->iscsilun->iscsi; + size_t size; uint64_t lba; uint32_t num_sectors; - - qemu_read_size = BDRV_SECTOR_SIZE * (size_t)nb_sectors; - - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); - trace_iscsi_aio_readv(iscsi, sector_num, nb_sectors, opaque, acb); - - acb->iscsilun = iscsilun; - acb->qiov = qiov; + int ret; +#if !defined(LIBISCSI_FEATURE_IOVECTOR) + int i; +#endif acb->canceled = 0; acb->bh = NULL; acb->status = -EINPROGRESS; - acb->read_size = qemu_read_size; acb->buf = NULL; - /* If LUN blocksize is bigger than BDRV_BLOCK_SIZE a read from QEMU - * may be misaligned to the LUN, so we may need to read some extra - * data. - */ - acb->read_offset = 0; - if (iscsilun->block_size > BDRV_SECTOR_SIZE) { - uint64_t bdrv_offset = BDRV_SECTOR_SIZE * sector_num; - - acb->read_offset = bdrv_offset % iscsilun->block_size; - } - - num_sectors = (qemu_read_size + iscsilun->block_size - + acb->read_offset - 1) - / iscsilun->block_size; + size = acb->nb_sectors * BDRV_SECTOR_SIZE; acb->task = malloc(sizeof(struct scsi_task)); if (acb->task == NULL) { error_report("iSCSI: Failed to allocate task for scsi READ16 " "command. 
%s", iscsi_get_error(iscsi)); - qemu_aio_release(acb); - return NULL; + return -1; } memset(acb->task, 0, sizeof(struct scsi_task)); acb->task->xfer_dir = SCSI_XFER_READ; - lba = sector_qemu2lun(sector_num, iscsilun); - acb->task->expxferlen = qemu_read_size; + acb->task->expxferlen = size; + lba = sector_qemu2lun(acb->sector_num, acb->iscsilun); + num_sectors = sector_qemu2lun(acb->nb_sectors, acb->iscsilun); - switch (iscsilun->type) { + switch (acb->iscsilun->type) { case TYPE_DISK: acb->task->cdb_size = 16; acb->task->cdb[0] = 0x88; @@ -374,26 +445,60 @@ iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num, break; } - if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task, - iscsi_aio_read16_cb, - NULL, - acb) != 0) { + ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task, + iscsi_aio_read16_cb, + NULL, + acb); + if (ret != 0) { scsi_free_scsi_task(acb->task); - qemu_aio_release(acb); - return NULL; + return -1; } +#if defined(LIBISCSI_FEATURE_IOVECTOR) + scsi_task_set_iov_in(acb->task, (struct scsi_iovec*) acb->qiov->iov, acb->qiov->niov); +#else for (i = 0; i < acb->qiov->niov; i++) { scsi_task_add_data_in_buffer(acb->task, acb->qiov->iov[i].iov_len, acb->qiov->iov[i].iov_base); } +#endif + return 0; +} - iscsi_set_events(iscsilun); +static BlockDriverAIOCB * +iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + IscsiLun *iscsilun = bs->opaque; + IscsiAIOCB *acb; + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + return NULL; + } + + acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); + trace_iscsi_aio_readv(iscsilun->iscsi, sector_num, nb_sectors, opaque, acb); + + acb->nb_sectors = nb_sectors; + acb->sector_num = sector_num; + acb->iscsilun = iscsilun; + acb->qiov = qiov; + acb->retries = ISCSI_CMD_RETRIES; + + if (iscsi_aio_readv_acb(acb) != 0) { + qemu_aio_release(acb); + return NULL; + } + + 
iscsi_set_events(iscsilun); return &acb->common; } +static int +iscsi_aio_flush_acb(IscsiAIOCB *acb); static void iscsi_synccache10_cb(struct iscsi_context *iscsi, int status, @@ -406,7 +511,17 @@ iscsi_synccache10_cb(struct iscsi_context *iscsi, int status, } acb->status = 0; - if (status < 0) { + if (status != 0) { + if (status == SCSI_STATUS_CHECK_CONDITION + && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION + && acb->retries-- > 0) { + scsi_free_scsi_task(acb->task); + acb->task = NULL; + if (iscsi_aio_flush_acb(acb) == 0) { + iscsi_set_events(acb->iscsilun); + return; + } + } error_report("Failed to sync10 data on iSCSI lun. %s", iscsi_get_error(iscsi)); acb->status = -EIO; @@ -415,28 +530,43 @@ iscsi_synccache10_cb(struct iscsi_context *iscsi, int status, iscsi_schedule_bh(acb); } -static BlockDriverAIOCB * -iscsi_aio_flush(BlockDriverState *bs, - BlockDriverCompletionFunc *cb, void *opaque) +static int +iscsi_aio_flush_acb(IscsiAIOCB *acb) { - IscsiLun *iscsilun = bs->opaque; - struct iscsi_context *iscsi = iscsilun->iscsi; - IscsiAIOCB *acb; + struct iscsi_context *iscsi = acb->iscsilun->iscsi; - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); - - acb->iscsilun = iscsilun; acb->canceled = 0; acb->bh = NULL; acb->status = -EINPROGRESS; + acb->buf = NULL; - acb->task = iscsi_synchronizecache10_task(iscsi, iscsilun->lun, + acb->task = iscsi_synchronizecache10_task(iscsi, acb->iscsilun->lun, 0, 0, 0, 0, iscsi_synccache10_cb, acb); if (acb->task == NULL) { error_report("iSCSI: Failed to send synchronizecache10 command. 
%s", iscsi_get_error(iscsi)); + return -1; + } + + return 0; +} + +static BlockDriverAIOCB * +iscsi_aio_flush(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque) +{ + IscsiLun *iscsilun = bs->opaque; + + IscsiAIOCB *acb; + + acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); + + acb->iscsilun = iscsilun; + acb->retries = ISCSI_CMD_RETRIES; + + if (iscsi_aio_flush_acb(acb) != 0) { qemu_aio_release(acb); return NULL; } @@ -446,6 +576,8 @@ iscsi_aio_flush(BlockDriverState *bs, return &acb->common; } +static int iscsi_aio_discard_acb(IscsiAIOCB *acb); + static void iscsi_unmap_cb(struct iscsi_context *iscsi, int status, void *command_data, void *opaque) @@ -457,7 +589,17 @@ iscsi_unmap_cb(struct iscsi_context *iscsi, int status, } acb->status = 0; - if (status < 0) { + if (status != 0) { + if (status == SCSI_STATUS_CHECK_CONDITION + && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION + && acb->retries-- > 0) { + scsi_free_scsi_task(acb->task); + acb->task = NULL; + if (iscsi_aio_discard_acb(acb) == 0) { + iscsi_set_events(acb->iscsilun); + return; + } + } error_report("Failed to unmap data on iSCSI lun. 
%s", iscsi_get_error(iscsi)); acb->status = -EIO; @@ -466,33 +608,47 @@ iscsi_unmap_cb(struct iscsi_context *iscsi, int status, iscsi_schedule_bh(acb); } -static BlockDriverAIOCB * -iscsi_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) -{ - IscsiLun *iscsilun = bs->opaque; - struct iscsi_context *iscsi = iscsilun->iscsi; - IscsiAIOCB *acb; +static int iscsi_aio_discard_acb(IscsiAIOCB *acb) { + struct iscsi_context *iscsi = acb->iscsilun->iscsi; struct unmap_list list[1]; - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); - - acb->iscsilun = iscsilun; acb->canceled = 0; acb->bh = NULL; acb->status = -EINPROGRESS; + acb->buf = NULL; - list[0].lba = sector_qemu2lun(sector_num, iscsilun); - list[0].num = nb_sectors * BDRV_SECTOR_SIZE / iscsilun->block_size; + list[0].lba = sector_qemu2lun(acb->sector_num, acb->iscsilun); + list[0].num = acb->nb_sectors * BDRV_SECTOR_SIZE / acb->iscsilun->block_size; - acb->task = iscsi_unmap_task(iscsi, iscsilun->lun, + acb->task = iscsi_unmap_task(iscsi, acb->iscsilun->lun, 0, 0, &list[0], 1, iscsi_unmap_cb, acb); if (acb->task == NULL) { error_report("iSCSI: Failed to send unmap command. 
%s", iscsi_get_error(iscsi)); + return -1; + } + + return 0; +} + +static BlockDriverAIOCB * +iscsi_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + IscsiLun *iscsilun = bs->opaque; + IscsiAIOCB *acb; + + acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); + + acb->iscsilun = iscsilun; + acb->nb_sectors = nb_sectors; + acb->sector_num = sector_num; + acb->retries = ISCSI_CMD_RETRIES; + + if (iscsi_aio_discard_acb(acb) != 0) { qemu_aio_release(acb); return NULL; } @@ -509,6 +665,9 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status, { IscsiAIOCB *acb = opaque; + g_free(acb->buf); + acb->buf = NULL; + if (acb->canceled != 0) { return; } @@ -585,14 +744,30 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, memcpy(&acb->task->cdb[0], acb->ioh->cmdp, acb->ioh->cmd_len); acb->task->expxferlen = acb->ioh->dxfer_len; + data.size = 0; if (acb->task->xfer_dir == SCSI_XFER_WRITE) { - data.data = acb->ioh->dxferp; - data.size = acb->ioh->dxfer_len; + if (acb->ioh->iovec_count == 0) { + data.data = acb->ioh->dxferp; + data.size = acb->ioh->dxfer_len; + } else { +#if defined(LIBISCSI_FEATURE_IOVECTOR) + scsi_task_set_iov_out(acb->task, + (struct scsi_iovec *) acb->ioh->dxferp, + acb->ioh->iovec_count); +#else + struct iovec *iov = (struct iovec *)acb->ioh->dxferp; + + acb->buf = g_malloc(acb->ioh->dxfer_len); + data.data = acb->buf; + data.size = iov_to_buf(iov, acb->ioh->iovec_count, 0, + acb->buf, acb->ioh->dxfer_len); +#endif + } } + if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task, iscsi_aio_ioctl_cb, - (acb->task->xfer_dir == SCSI_XFER_WRITE) ? - &data : NULL, + (data.size > 0) ? 
&data : NULL, acb) != 0) { scsi_free_scsi_task(acb->task); qemu_aio_release(acb); @@ -601,9 +776,26 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, /* tell libiscsi to read straight into the buffer we got from ioctl */ if (acb->task->xfer_dir == SCSI_XFER_READ) { - scsi_task_add_data_in_buffer(acb->task, - acb->ioh->dxfer_len, - acb->ioh->dxferp); + if (acb->ioh->iovec_count == 0) { + scsi_task_add_data_in_buffer(acb->task, + acb->ioh->dxfer_len, + acb->ioh->dxferp); + } else { +#if defined(LIBISCSI_FEATURE_IOVECTOR) + scsi_task_set_iov_in(acb->task, + (struct scsi_iovec *) acb->ioh->dxferp, + acb->ioh->iovec_count); +#else + int i; + for (i = 0; i < acb->ioh->iovec_count; i++) { + struct iovec *iov = (struct iovec *)acb->ioh->dxferp; + + scsi_task_add_data_in_buffer(acb->task, + iov[i].iov_len, + iov[i].iov_base); + } +#endif + } } iscsi_set_events(iscsilun); @@ -761,20 +953,118 @@ static char *parse_initiator_name(const char *target) } } +#if defined(LIBISCSI_FEATURE_NOP_COUNTER) +static void iscsi_nop_timed_event(void *opaque) +{ + IscsiLun *iscsilun = opaque; + + if (iscsi_get_nops_in_flight(iscsilun->iscsi) > MAX_NOP_FAILURES) { + error_report("iSCSI: NOP timeout. Reconnecting..."); + iscsi_reconnect(iscsilun->iscsi); + } + + if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) { + error_report("iSCSI: failed to sent NOP-Out. 
Disabling NOP messages."); + return; + } + + qemu_mod_timer(iscsilun->nop_timer, qemu_get_clock_ms(rt_clock) + NOP_INTERVAL); + iscsi_set_events(iscsilun); +} +#endif + +static int iscsi_readcapacity_sync(IscsiLun *iscsilun) +{ + struct scsi_task *task = NULL; + struct scsi_readcapacity10 *rc10 = NULL; + struct scsi_readcapacity16 *rc16 = NULL; + int ret = 0; + int retries = ISCSI_CMD_RETRIES; + + do { + if (task != NULL) { + scsi_free_scsi_task(task); + task = NULL; + } + + switch (iscsilun->type) { + case TYPE_DISK: + task = iscsi_readcapacity16_sync(iscsilun->iscsi, iscsilun->lun); + if (task != NULL && task->status == SCSI_STATUS_GOOD) { + rc16 = scsi_datain_unmarshall(task); + if (rc16 == NULL) { + error_report("iSCSI: Failed to unmarshall readcapacity16 data."); + ret = -EINVAL; + } else { + iscsilun->block_size = rc16->block_length; + iscsilun->num_blocks = rc16->returned_lba + 1; + } + } + break; + case TYPE_ROM: + task = iscsi_readcapacity10_sync(iscsilun->iscsi, iscsilun->lun, 0, 0); + if (task != NULL && task->status == SCSI_STATUS_GOOD) { + rc10 = scsi_datain_unmarshall(task); + if (rc10 == NULL) { + error_report("iSCSI: Failed to unmarshall readcapacity10 data."); + ret = -EINVAL; + } else { + iscsilun->block_size = rc10->block_size; + if (rc10->lba == 0) { + /* blank disk loaded */ + iscsilun->num_blocks = 0; + } else { + iscsilun->num_blocks = rc10->lba + 1; + } + } + } + break; + default: + return 0; + } + } while (task != NULL && task->status == SCSI_STATUS_CHECK_CONDITION + && task->sense.key == SCSI_SENSE_UNIT_ATTENTION + && retries-- > 0); + + if (task == NULL || task->status != SCSI_STATUS_GOOD) { + error_report("iSCSI: failed to send readcapacity10 command."); + ret = -EINVAL; + } + if (task) { + scsi_free_scsi_task(task); + } + return ret; +} + +/* TODO Convert to fine grained options */ +static QemuOptsList runtime_opts = { + .name = "iscsi", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "filename", + .type 
= QEMU_OPT_STRING, + .help = "URL to the iscsi image", + }, + { /* end of list */ } + }, +}; + /* * We support iscsi url's on the form * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun> */ -static int iscsi_open(BlockDriverState *bs, const char *filename, int flags) +static int iscsi_open(BlockDriverState *bs, QDict *options, int flags) { IscsiLun *iscsilun = bs->opaque; struct iscsi_context *iscsi = NULL; struct iscsi_url *iscsi_url = NULL; struct scsi_task *task = NULL; struct scsi_inquiry_standard *inq = NULL; - struct scsi_readcapacity10 *rc10 = NULL; - struct scsi_readcapacity16 *rc16 = NULL; char *initiator_name = NULL; + QemuOpts *opts; + Error *local_err = NULL; + const char *filename; int ret; if ((BDRV_SECTOR_SIZE % 512) != 0) { @@ -784,6 +1074,18 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags) return -EINVAL; } + opts = qemu_opts_create_nofail(&runtime_opts); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + qerror_report_err(local_err); + error_free(local_err); + ret = -EINVAL; + goto out; + } + + filename = qemu_opt_get(opts, "filename"); + + iscsi_url = iscsi_parse_full_url(iscsi, filename); if (iscsi_url == NULL) { error_report("Failed to parse URL : %s", filename); @@ -863,52 +1165,10 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags) iscsilun->type = inq->periperal_device_type; - scsi_free_scsi_task(task); - - switch (iscsilun->type) { - case TYPE_DISK: - task = iscsi_readcapacity16_sync(iscsi, iscsilun->lun); - if (task == NULL || task->status != SCSI_STATUS_GOOD) { - error_report("iSCSI: failed to send readcapacity16 command."); - ret = -EINVAL; - goto out; - } - rc16 = scsi_datain_unmarshall(task); - if (rc16 == NULL) { - error_report("iSCSI: Failed to unmarshall readcapacity16 data."); - ret = -EINVAL; - goto out; - } - iscsilun->block_size = rc16->block_length; - iscsilun->num_blocks = rc16->returned_lba + 1; - break; - case 
TYPE_ROM: - task = iscsi_readcapacity10_sync(iscsi, iscsilun->lun, 0, 0); - if (task == NULL || task->status != SCSI_STATUS_GOOD) { - error_report("iSCSI: failed to send readcapacity10 command."); - ret = -EINVAL; - goto out; - } - rc10 = scsi_datain_unmarshall(task); - if (rc10 == NULL) { - error_report("iSCSI: Failed to unmarshall readcapacity10 data."); - ret = -EINVAL; - goto out; - } - iscsilun->block_size = rc10->block_size; - if (rc10->lba == 0) { - /* blank disk loaded */ - iscsilun->num_blocks = 0; - } else { - iscsilun->num_blocks = rc10->lba + 1; - } - break; - default: - break; + if ((ret = iscsi_readcapacity_sync(iscsilun)) != 0) { + goto out; } - - bs->total_sectors = iscsilun->num_blocks * - iscsilun->block_size / BDRV_SECTOR_SIZE ; + bs->total_sectors = sector_lun2qemu(iscsilun->num_blocks, iscsilun); /* Medium changer or tape. We dont have any emulation for this so this must * be sg ioctl compatible. We force it to be sg, otherwise qemu will try @@ -919,9 +1179,14 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags) bs->sg = 1; } - ret = 0; +#if defined(LIBISCSI_FEATURE_NOP_COUNTER) + /* Set up a timer for sending out iSCSI NOPs */ + iscsilun->nop_timer = qemu_new_timer_ms(rt_clock, iscsi_nop_timed_event, iscsilun); + qemu_mod_timer(iscsilun->nop_timer, qemu_get_clock_ms(rt_clock) + NOP_INTERVAL); +#endif out: + qemu_opts_del(opts); if (initiator_name != NULL) { g_free(initiator_name); } @@ -946,16 +1211,100 @@ static void iscsi_close(BlockDriverState *bs) IscsiLun *iscsilun = bs->opaque; struct iscsi_context *iscsi = iscsilun->iscsi; + if (iscsilun->nop_timer) { + qemu_del_timer(iscsilun->nop_timer); + qemu_free_timer(iscsilun->nop_timer); + } qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL, NULL); iscsi_destroy_context(iscsi); memset(iscsilun, 0, sizeof(IscsiLun)); } +static int iscsi_truncate(BlockDriverState *bs, int64_t offset) +{ + IscsiLun *iscsilun = bs->opaque; + int ret = 0; + + if (iscsilun->type 
!= TYPE_DISK) { + return -ENOTSUP; + } + + if ((ret = iscsi_readcapacity_sync(iscsilun)) != 0) { + return ret; + } + + if (offset > iscsi_getlength(bs)) { + return -EINVAL; + } + + return 0; +} + static int iscsi_has_zero_init(BlockDriverState *bs) { return 0; } +static int iscsi_create(const char *filename, QEMUOptionParameter *options) +{ + int ret = 0; + int64_t total_size = 0; + BlockDriverState bs; + IscsiLun *iscsilun = NULL; + QDict *bs_options; + + memset(&bs, 0, sizeof(BlockDriverState)); + + /* Read out options */ + while (options && options->name) { + if (!strcmp(options->name, "size")) { + total_size = options->value.n / BDRV_SECTOR_SIZE; + } + options++; + } + + bs.opaque = g_malloc0(sizeof(struct IscsiLun)); + iscsilun = bs.opaque; + + bs_options = qdict_new(); + qdict_put(bs_options, "filename", qstring_from_str(filename)); + ret = iscsi_open(&bs, bs_options, 0); + QDECREF(bs_options); + + if (ret != 0) { + goto out; + } + if (iscsilun->nop_timer) { + qemu_del_timer(iscsilun->nop_timer); + qemu_free_timer(iscsilun->nop_timer); + } + if (iscsilun->type != TYPE_DISK) { + ret = -ENODEV; + goto out; + } + if (bs.total_sectors < total_size) { + ret = -ENOSPC; + goto out; + } + + ret = 0; +out: + if (iscsilun->iscsi != NULL) { + iscsi_destroy_context(iscsilun->iscsi); + } + g_free(bs.opaque); + return ret; +} + +static QEMUOptionParameter iscsi_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { NULL } +}; + static BlockDriver bdrv_iscsi = { .format_name = "iscsi", .protocol_name = "iscsi", @@ -963,8 +1312,11 @@ static BlockDriver bdrv_iscsi = { .instance_size = sizeof(IscsiLun), .bdrv_file_open = iscsi_open, .bdrv_close = iscsi_close, + .bdrv_create = iscsi_create, + .create_options = iscsi_create_options, .bdrv_getlength = iscsi_getlength, + .bdrv_truncate = iscsi_truncate, .bdrv_aio_readv = iscsi_aio_readv, .bdrv_aio_writev = iscsi_aio_writev, @@ -979,9 +1331,36 @@ static BlockDriver bdrv_iscsi = 
{ #endif }; +static QemuOptsList qemu_iscsi_opts = { + .name = "iscsi", + .head = QTAILQ_HEAD_INITIALIZER(qemu_iscsi_opts.head), + .desc = { + { + .name = "user", + .type = QEMU_OPT_STRING, + .help = "username for CHAP authentication to target", + },{ + .name = "password", + .type = QEMU_OPT_STRING, + .help = "password for CHAP authentication to target", + },{ + .name = "header-digest", + .type = QEMU_OPT_STRING, + .help = "HeaderDigest setting. " + "{CRC32C|CRC32C-NONE|NONE-CRC32C|NONE}", + },{ + .name = "initiator-name", + .type = QEMU_OPT_STRING, + .help = "Initiator iqn name to use when connecting", + }, + { /* end of list */ } + }, +}; + static void iscsi_block_init(void) { bdrv_register(&bdrv_iscsi); + qemu_add_opts(&qemu_iscsi_opts); } block_init(iscsi_block_init); diff --git a/block/linux-aio.c b/block/linux-aio.c index 91ef86324..ee0f8d10c 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -8,10 +8,10 @@ * See the COPYING file in the top-level directory. */ #include "qemu-common.h" -#include "qemu-aio.h" -#include "qemu-queue.h" +#include "block/aio.h" +#include "qemu/queue.h" #include "block/raw-aio.h" -#include "event_notifier.h" +#include "qemu/event_notifier.h" #include <libaio.h> diff --git a/block/mirror.c b/block/mirror.c index d6618a4b3..bed4a7ead 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -12,20 +12,20 @@ */ #include "trace.h" -#include "blockjob.h" -#include "block_int.h" +#include "block/blockjob.h" +#include "block/block_int.h" #include "qemu/ratelimit.h" +#include "qemu/bitmap.h" -enum { - /* - * Size of data buffer for populating the image file. This should be large - * enough to process multiple clusters in a single call, so that populating - * contiguous regions of the image is efficient. 
- */ - BLOCK_SIZE = 512 * BDRV_SECTORS_PER_DIRTY_CHUNK, /* in bytes */ -}; +#define SLICE_TIME 100000000ULL /* ns */ +#define MAX_IN_FLIGHT 16 -#define SLICE_TIME 100000000ULL /* ns */ +/* The mirroring buffer is a list of granularity-sized chunks. + * Free chunks are organized in a list. + */ +typedef struct MirrorBuffer { + QSIMPLEQ_ENTRY(MirrorBuffer) next; +} MirrorBuffer; typedef struct MirrorBlockJob { BlockJob common; @@ -36,9 +36,26 @@ typedef struct MirrorBlockJob { bool synced; bool should_complete; int64_t sector_num; + int64_t granularity; + size_t buf_size; + unsigned long *cow_bitmap; + HBitmapIter hbi; uint8_t *buf; + QSIMPLEQ_HEAD(, MirrorBuffer) buf_free; + int buf_free_count; + + unsigned long *in_flight_bitmap; + int in_flight; + int ret; } MirrorBlockJob; +typedef struct MirrorOp { + MirrorBlockJob *s; + QEMUIOVector qiov; + int64_t sector_num; + int nb_sectors; +} MirrorOp; + static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read, int error) { @@ -52,51 +69,234 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read, } } -static int coroutine_fn mirror_iteration(MirrorBlockJob *s, - BlockErrorAction *p_action) +static void mirror_iteration_done(MirrorOp *op, int ret) { - BlockDriverState *source = s->common.bs; - BlockDriverState *target = s->target; - QEMUIOVector qiov; - int ret, nb_sectors; - int64_t end; - struct iovec iov; + MirrorBlockJob *s = op->s; + struct iovec *iov; + int64_t chunk_num; + int i, nb_chunks, sectors_per_chunk; + + trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret); + + s->in_flight--; + iov = op->qiov.iov; + for (i = 0; i < op->qiov.niov; i++) { + MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base; + QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next); + s->buf_free_count++; + } - end = s->common.len >> BDRV_SECTOR_BITS; - s->sector_num = bdrv_get_next_dirty(source, s->sector_num); - nb_sectors = MIN(BDRV_SECTORS_PER_DIRTY_CHUNK, end - s->sector_num); - 
bdrv_reset_dirty(source, s->sector_num, nb_sectors); + sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; + chunk_num = op->sector_num / sectors_per_chunk; + nb_chunks = op->nb_sectors / sectors_per_chunk; + bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks); + if (s->cow_bitmap && ret >= 0) { + bitmap_set(s->cow_bitmap, chunk_num, nb_chunks); + } - /* Copy the dirty cluster. */ - iov.iov_base = s->buf; - iov.iov_len = nb_sectors * 512; - qemu_iovec_init_external(&qiov, &iov, 1); + g_slice_free(MirrorOp, op); + qemu_coroutine_enter(s->common.co, NULL); +} - trace_mirror_one_iteration(s, s->sector_num, nb_sectors); - ret = bdrv_co_readv(source, s->sector_num, nb_sectors, &qiov); +static void mirror_write_complete(void *opaque, int ret) +{ + MirrorOp *op = opaque; + MirrorBlockJob *s = op->s; if (ret < 0) { - *p_action = mirror_error_action(s, true, -ret); - goto fail; + BlockDriverState *source = s->common.bs; + BlockErrorAction action; + + bdrv_set_dirty(source, op->sector_num, op->nb_sectors); + action = mirror_error_action(s, false, -ret); + if (action == BDRV_ACTION_REPORT && s->ret >= 0) { + s->ret = ret; + } } - ret = bdrv_co_writev(target, s->sector_num, nb_sectors, &qiov); + mirror_iteration_done(op, ret); +} + +static void mirror_read_complete(void *opaque, int ret) +{ + MirrorOp *op = opaque; + MirrorBlockJob *s = op->s; if (ret < 0) { - *p_action = mirror_error_action(s, false, -ret); - s->synced = false; - goto fail; + BlockDriverState *source = s->common.bs; + BlockErrorAction action; + + bdrv_set_dirty(source, op->sector_num, op->nb_sectors); + action = mirror_error_action(s, true, -ret); + if (action == BDRV_ACTION_REPORT && s->ret >= 0) { + s->ret = ret; + } + + mirror_iteration_done(op, ret); + return; + } + bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors, + mirror_write_complete, op); +} + +static void coroutine_fn mirror_iteration(MirrorBlockJob *s) +{ + BlockDriverState *source = s->common.bs; + int nb_sectors, 
sectors_per_chunk, nb_chunks; + int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector; + MirrorOp *op; + + s->sector_num = hbitmap_iter_next(&s->hbi); + if (s->sector_num < 0) { + bdrv_dirty_iter_init(source, &s->hbi); + s->sector_num = hbitmap_iter_next(&s->hbi); + trace_mirror_restart_iter(s, bdrv_get_dirty_count(source)); + assert(s->sector_num >= 0); + } + + hbitmap_next_sector = s->sector_num; + sector_num = s->sector_num; + sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; + end = s->common.len >> BDRV_SECTOR_BITS; + + /* Extend the QEMUIOVector to include all adjacent blocks that will + * be copied in this operation. + * + * We have to do this if we have no backing file yet in the destination, + * and the cluster size is very large. Then we need to do COW ourselves. + * The first time a cluster is copied, copy it entirely. Note that, + * because both the granularity and the cluster size are powers of two, + * the number of sectors to copy cannot exceed one cluster. + * + * We also want to extend the QEMUIOVector to include more adjacent + * dirty blocks if possible, to limit the number of I/O operations and + * run efficiently even with a small granularity. + */ + nb_chunks = 0; + nb_sectors = 0; + next_sector = sector_num; + next_chunk = sector_num / sectors_per_chunk; + + /* Wait for I/O to this cluster (from a previous iteration) to be done. 
*/ + while (test_bit(next_chunk, s->in_flight_bitmap)) { + trace_mirror_yield_in_flight(s, sector_num, s->in_flight); + qemu_coroutine_yield(); + } + + do { + int added_sectors, added_chunks; + + if (!bdrv_get_dirty(source, next_sector) || + test_bit(next_chunk, s->in_flight_bitmap)) { + assert(nb_sectors > 0); + break; + } + + added_sectors = sectors_per_chunk; + if (s->cow_bitmap && !test_bit(next_chunk, s->cow_bitmap)) { + bdrv_round_to_clusters(s->target, + next_sector, added_sectors, + &next_sector, &added_sectors); + + /* On the first iteration, the rounding may make us copy + * sectors before the first dirty one. + */ + if (next_sector < sector_num) { + assert(nb_sectors == 0); + sector_num = next_sector; + next_chunk = next_sector / sectors_per_chunk; + } + } + + added_sectors = MIN(added_sectors, end - (sector_num + nb_sectors)); + added_chunks = (added_sectors + sectors_per_chunk - 1) / sectors_per_chunk; + + /* When doing COW, it may happen that there is not enough space for + * a full cluster. Wait if that is the case. + */ + while (nb_chunks == 0 && s->buf_free_count < added_chunks) { + trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight); + qemu_coroutine_yield(); + } + if (s->buf_free_count < nb_chunks + added_chunks) { + trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight); + break; + } + + /* We have enough free space to copy these sectors. */ + bitmap_set(s->in_flight_bitmap, next_chunk, added_chunks); + + nb_sectors += added_sectors; + nb_chunks += added_chunks; + next_sector += added_sectors; + next_chunk += added_chunks; + } while (next_sector < end); + + /* Allocate a MirrorOp that is used as an AIO callback. */ + op = g_slice_new(MirrorOp); + op->s = s; + op->sector_num = sector_num; + op->nb_sectors = nb_sectors; + + /* Now make a QEMUIOVector taking enough granularity-sized chunks + * from s->buf_free. 
+ */ + qemu_iovec_init(&op->qiov, nb_chunks); + next_sector = sector_num; + while (nb_chunks-- > 0) { + MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free); + QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next); + s->buf_free_count--; + qemu_iovec_add(&op->qiov, buf, s->granularity); + + /* Advance the HBitmapIter in parallel, so that we do not examine + * the same sector twice. + */ + if (next_sector > hbitmap_next_sector && bdrv_get_dirty(source, next_sector)) { + hbitmap_next_sector = hbitmap_iter_next(&s->hbi); + } + + next_sector += sectors_per_chunk; } - return 0; -fail: - /* Try again later. */ - bdrv_set_dirty(source, s->sector_num, nb_sectors); - return ret; + bdrv_reset_dirty(source, sector_num, nb_sectors); + + /* Copy the dirty cluster. */ + s->in_flight++; + trace_mirror_one_iteration(s, sector_num, nb_sectors); + bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, + mirror_read_complete, op); +} + +static void mirror_free_init(MirrorBlockJob *s) +{ + int granularity = s->granularity; + size_t buf_size = s->buf_size; + uint8_t *buf = s->buf; + + assert(s->buf_free_count == 0); + QSIMPLEQ_INIT(&s->buf_free); + while (buf_size != 0) { + MirrorBuffer *cur = (MirrorBuffer *)buf; + QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next); + s->buf_free_count++; + buf_size -= granularity; + buf += granularity; + } +} + +static void mirror_drain(MirrorBlockJob *s) +{ + while (s->in_flight > 0) { + qemu_coroutine_yield(); + } } static void coroutine_fn mirror_run(void *opaque) { MirrorBlockJob *s = opaque; BlockDriverState *bs = s->common.bs; - int64_t sector_num, end; + int64_t sector_num, end, sectors_per_chunk, length; + uint64_t last_pause_ns; + BlockDriverInfo bdi; + char backing_filename[1024]; int ret = 0; int n; @@ -105,20 +305,39 @@ static void coroutine_fn mirror_run(void *opaque) } s->common.len = bdrv_getlength(bs); - if (s->common.len < 0) { + if (s->common.len <= 0) { block_job_completed(&s->common, s->common.len); return; } + length = (bdrv_getlength(bs) + 
s->granularity - 1) / s->granularity; + s->in_flight_bitmap = bitmap_new(length); + + /* If we have no backing file yet in the destination, we cannot let + * the destination do COW. Instead, we copy sectors around the + * dirty data if needed. We need a bitmap to do that. + */ + bdrv_get_backing_filename(s->target, backing_filename, + sizeof(backing_filename)); + if (backing_filename[0] && !s->target->backing_hd) { + bdrv_get_info(s->target, &bdi); + if (s->granularity < bdi.cluster_size) { + s->buf_size = MAX(s->buf_size, bdi.cluster_size); + s->cow_bitmap = bitmap_new(length); + } + } + end = s->common.len >> BDRV_SECTOR_BITS; - s->buf = qemu_blockalign(bs, BLOCK_SIZE); + s->buf = qemu_blockalign(bs, s->buf_size); + sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; + mirror_free_init(s); if (s->mode != MIRROR_SYNC_MODE_NONE) { /* First part, loop on the sectors and initialize the dirty bitmap. */ BlockDriverState *base; base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd; for (sector_num = 0; sector_num < end; ) { - int64_t next = (sector_num | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1; + int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1; ret = bdrv_co_is_allocated_above(bs, base, sector_num, next - sector_num, &n); @@ -136,24 +355,40 @@ static void coroutine_fn mirror_run(void *opaque) } } - s->sector_num = -1; + bdrv_dirty_iter_init(bs, &s->hbi); + last_pause_ns = qemu_get_clock_ns(rt_clock); for (;;) { uint64_t delay_ns; int64_t cnt; bool should_complete; + if (s->ret < 0) { + ret = s->ret; + goto immediate_exit; + } + cnt = bdrv_get_dirty_count(bs); - if (cnt != 0) { - BlockErrorAction action = BDRV_ACTION_REPORT; - ret = mirror_iteration(s, &action); - if (ret < 0 && action == BDRV_ACTION_REPORT) { - goto immediate_exit; + + /* Note that even when no rate limit is applied we need to yield + * periodically with no pending I/O so that qemu_aio_flush() returns. 
+ * We do so every SLICE_TIME nanoseconds, or when there is an error, + * or when the source is clean, whichever comes first. + */ + if (qemu_get_clock_ns(rt_clock) - last_pause_ns < SLICE_TIME && + s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) { + if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 || + (cnt == 0 && s->in_flight > 0)) { + trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt); + qemu_coroutine_yield(); + continue; + } else if (cnt != 0) { + mirror_iteration(s); + continue; } - cnt = bdrv_get_dirty_count(bs); } should_complete = false; - if (cnt == 0) { + if (s->in_flight == 0 && cnt == 0) { trace_mirror_before_flush(s); ret = bdrv_flush(s->target); if (ret < 0) { @@ -196,23 +431,20 @@ static void coroutine_fn mirror_run(void *opaque) trace_mirror_before_sleep(s, cnt, s->synced); if (!s->synced) { /* Publish progress */ - s->common.offset = end * BDRV_SECTOR_SIZE - cnt * BLOCK_SIZE; + s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE; if (s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, BDRV_SECTORS_PER_DIRTY_CHUNK); + delay_ns = ratelimit_calculate_delay(&s->limit, sectors_per_chunk); } else { delay_ns = 0; } - /* Note that even when no rate limit is applied we need to yield - * with no pending I/O here so that qemu_aio_flush() returns. - */ block_job_sleep_ns(&s->common, rt_clock, delay_ns); if (block_job_is_cancelled(&s->common)) { break; } } else if (!should_complete) { - delay_ns = (cnt == 0 ? SLICE_TIME : 0); + delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0); block_job_sleep_ns(&s->common, rt_clock, delay_ns); } else if (cnt == 0) { /* The two disks are in sync. Exit and report successful @@ -222,11 +454,24 @@ static void coroutine_fn mirror_run(void *opaque) s->common.cancelled = false; break; } + last_pause_ns = qemu_get_clock_ns(rt_clock); } immediate_exit: - g_free(s->buf); - bdrv_set_dirty_tracking(bs, false); + if (s->in_flight > 0) { + /* We get here only if something went wrong. 
Either the job failed, + * or it was cancelled prematurely so that we do not guarantee that + * the target is a copy of the source. + */ + assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common))); + mirror_drain(s); + } + + assert(s->in_flight == 0); + qemu_vfree(s->buf); + g_free(s->cow_bitmap); + g_free(s->in_flight_bitmap); + bdrv_set_dirty_tracking(bs, 0); bdrv_iostatus_disable(s->target); if (s->should_complete && ret == 0) { if (bdrv_get_flags(s->target) != bdrv_get_flags(s->common.bs)) { @@ -262,12 +507,12 @@ static void mirror_complete(BlockJob *job, Error **errp) MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); int ret; - ret = bdrv_open_backing_file(s->target); + ret = bdrv_open_backing_file(s->target, NULL); if (ret < 0) { char backing_filename[PATH_MAX]; bdrv_get_full_backing_filename(s->target, backing_filename, sizeof(backing_filename)); - error_set(errp, QERR_OPEN_FILE_FAILED, backing_filename); + error_setg_file_open(errp, -ret, backing_filename); return; } if (!s->synced) { @@ -279,7 +524,7 @@ static void mirror_complete(BlockJob *job, Error **errp) block_job_resume(job); } -static BlockJobType mirror_job_type = { +static const BlockJobType mirror_job_type = { .instance_size = sizeof(MirrorBlockJob), .job_type = "mirror", .set_speed = mirror_set_speed, @@ -288,14 +533,28 @@ static BlockJobType mirror_job_type = { }; void mirror_start(BlockDriverState *bs, BlockDriverState *target, - int64_t speed, MirrorSyncMode mode, - BlockdevOnError on_source_error, + int64_t speed, int64_t granularity, int64_t buf_size, + MirrorSyncMode mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockDriverCompletionFunc *cb, void *opaque, Error **errp) { MirrorBlockJob *s; + if (granularity == 0) { + /* Choose the default granularity based on the target file's cluster + * size, clamped between 4k and 64k. 
*/ + BlockDriverInfo bdi; + if (bdrv_get_info(target, &bdi) >= 0 && bdi.cluster_size != 0) { + granularity = MAX(4096, bdi.cluster_size); + granularity = MIN(65536, granularity); + } else { + granularity = 65536; + } + } + + assert ((granularity & (granularity - 1)) == 0); + if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && !bdrv_iostatus_is_enabled(bs)) { @@ -312,7 +571,10 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, s->on_target_error = on_target_error; s->target = target; s->mode = mode; - bdrv_set_dirty_tracking(bs, true); + s->granularity = granularity; + s->buf_size = MAX(buf_size, granularity); + + bdrv_set_dirty_tracking(bs, granularity); bdrv_set_enable_write_cache(s->target, true); bdrv_set_on_error(s->target, on_target_error, on_target_error); bdrv_iostatus_enable(s->target); diff --git a/block/nbd.c b/block/nbd.c index e87c24817..9c480b8f2 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -27,11 +27,13 @@ */ #include "qemu-common.h" -#include "nbd.h" -#include "uri.h" -#include "block_int.h" -#include "module.h" -#include "qemu_socket.h" +#include "block/nbd.h" +#include "qemu/uri.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "qemu/sockets.h" +#include "qapi/qmp/qjson.h" +#include "qapi/qmp/qint.h" #include <sys/types.h> #include <unistd.h> @@ -65,17 +67,19 @@ typedef struct BDRVNBDState { Coroutine *recv_coroutine[MAX_NBD_REQUESTS]; struct nbd_reply reply; - int is_unix; - char *host_spec; + bool is_unix; + QemuOpts *socket_opts; + char *export_name; /* An NBD server may export several devices */ } BDRVNBDState; -static int nbd_parse_uri(BDRVNBDState *s, const char *filename) +static int nbd_parse_uri(const char *filename, QDict *options) { URI *uri; const char *p; QueryParams *qp = NULL; int ret = 0; + bool is_unix; uri = uri_parse(filename); if (!uri) { @@ -84,11 +88,11 @@ static int nbd_parse_uri(BDRVNBDState *s, const char *filename) /* transport */ if 
(!strcmp(uri->scheme, "nbd")) { - s->is_unix = false; + is_unix = false; } else if (!strcmp(uri->scheme, "nbd+tcp")) { - s->is_unix = false; + is_unix = false; } else if (!strcmp(uri->scheme, "nbd+unix")) { - s->is_unix = true; + is_unix = true; } else { ret = -EINVAL; goto out; @@ -97,32 +101,44 @@ static int nbd_parse_uri(BDRVNBDState *s, const char *filename) p = uri->path ? uri->path : "/"; p += strspn(p, "/"); if (p[0]) { - s->export_name = g_strdup(p); + qdict_put(options, "export", qstring_from_str(p)); } qp = query_params_parse(uri->query); - if (qp->n > 1 || (s->is_unix && !qp->n) || (!s->is_unix && qp->n)) { + if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) { ret = -EINVAL; goto out; } - if (s->is_unix) { + if (is_unix) { /* nbd+unix:///export?socket=path */ if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) { ret = -EINVAL; goto out; } - s->host_spec = g_strdup(qp->p[0].value); + qdict_put(options, "path", qstring_from_str(qp->p[0].value)); } else { - /* nbd[+tcp]://host:port/export */ + QString *host; + /* nbd[+tcp]://host[:port]/export */ if (!uri->server) { ret = -EINVAL; goto out; } - if (!uri->port) { - uri->port = NBD_DEFAULT_PORT; + + /* strip braces from literal IPv6 address */ + if (uri->server[0] == '[') { + host = qstring_from_substr(uri->server, 1, + strlen(uri->server) - 2); + } else { + host = qstring_from_str(uri->server); + } + + qdict_put(options, "host", host); + if (uri->port) { + char* port_str = g_strdup_printf("%d", uri->port); + qdict_put(options, "port", qstring_from_str(port_str)); + g_free(port_str); } - s->host_spec = g_strdup_printf("%s:%d", uri->server, uri->port); } out: @@ -133,16 +149,29 @@ out: return ret; } -static int nbd_config(BDRVNBDState *s, const char *filename) +static void nbd_parse_filename(const char *filename, QDict *options, + Error **errp) { char *file; char *export_name; const char *host_spec; const char *unixpath; - int err = -EINVAL; + + if (qdict_haskey(options, "host") + || 
qdict_haskey(options, "port") + || qdict_haskey(options, "path")) + { + error_setg(errp, "host/port/path and a file name may not be specified " + "at the same time"); + return; + } if (strstr(filename, "://")) { - return nbd_parse_uri(s, filename); + int ret = nbd_parse_uri(filename, options); + if (ret < 0) { + error_setg(errp, "No valid URL specified"); + } + return; } file = g_strdup(filename); @@ -154,34 +183,79 @@ static int nbd_config(BDRVNBDState *s, const char *filename) } export_name[0] = 0; /* truncate 'file' */ export_name += strlen(EN_OPTSTR); - s->export_name = g_strdup(export_name); + + qdict_put(options, "export", qstring_from_str(export_name)); } /* extract the host_spec - fail if it's not nbd:... */ if (!strstart(file, "nbd:", &host_spec)) { + error_setg(errp, "File name string for NBD must start with 'nbd:'"); + goto out; + } + + if (!*host_spec) { goto out; } /* are we a UNIX or TCP socket? */ if (strstart(host_spec, "unix:", &unixpath)) { - s->is_unix = true; - s->host_spec = g_strdup(unixpath); + qdict_put(options, "path", qstring_from_str(unixpath)); } else { - s->is_unix = false; - s->host_spec = g_strdup(host_spec); - } + InetSocketAddress *addr = NULL; - err = 0; + addr = inet_parse(host_spec, errp); + if (error_is_set(errp)) { + goto out; + } + + qdict_put(options, "host", qstring_from_str(addr->host)); + qdict_put(options, "port", qstring_from_str(addr->port)); + qapi_free_InetSocketAddress(addr); + } out: g_free(file); - if (err != 0) { - g_free(s->export_name); - g_free(s->host_spec); +} + +static int nbd_config(BDRVNBDState *s, QDict *options) +{ + Error *local_err = NULL; + + if (qdict_haskey(options, "path")) { + if (qdict_haskey(options, "host")) { + qerror_report(ERROR_CLASS_GENERIC_ERROR, "path and host may not " + "be used at the same time."); + return -EINVAL; + } + s->is_unix = true; + } else if (qdict_haskey(options, "host")) { + s->is_unix = false; + } else { + return -EINVAL; + } + + s->socket_opts = 
qemu_opts_create_nofail(&socket_optslist); + + qemu_opts_absorb_qdict(s->socket_opts, options, &local_err); + if (error_is_set(&local_err)) { + qerror_report_err(local_err); + error_free(local_err); + return -EINVAL; + } + + if (!qemu_opt_get(s->socket_opts, "port")) { + qemu_opt_set_number(s->socket_opts, "port", NBD_DEFAULT_PORT); } - return err; + + s->export_name = g_strdup(qdict_get_try_str(options, "export")); + if (s->export_name) { + qdict_del(options, "export"); + } + + return 0; } + static void nbd_coroutine_start(BDRVNBDState *s, struct nbd_request *request) { int i; @@ -269,13 +343,23 @@ static int nbd_co_send_request(BDRVNBDState *s, struct nbd_request *request, s->send_coroutine = qemu_coroutine_self(); qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write, nbd_have_request, s); - rc = nbd_send_request(s->sock, request); - if (rc >= 0 && qiov) { - ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov, - offset, request->len); - if (ret != request->len) { - return -EIO; + if (qiov) { + if (!s->is_unix) { + socket_set_cork(s->sock, 1); } + rc = nbd_send_request(s->sock, request); + if (rc >= 0) { + ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov, + offset, request->len); + if (ret != request->len) { + rc = -EIO; + } + } + if (!s->is_unix) { + socket_set_cork(s->sock, 0); + } + } else { + rc = nbd_send_request(s->sock, request); } qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL, nbd_have_request, s); @@ -328,9 +412,12 @@ static int nbd_establish_connection(BlockDriverState *bs) size_t blocksize; if (s->is_unix) { - sock = unix_socket_outgoing(s->host_spec); + sock = unix_socket_outgoing(qemu_opt_get(s->socket_opts, "path")); } else { - sock = tcp_socket_outgoing_spec(s->host_spec); + sock = tcp_socket_outgoing_opts(s->socket_opts); + if (sock >= 0) { + socket_set_nodelay(sock); + } } /* Failed to establish connection */ @@ -350,7 +437,7 @@ static int nbd_establish_connection(BlockDriverState *bs) /* Now that we're connected, set the 
socket to be non-blocking and * kick the reply mechanism. */ - socket_set_nonblock(sock); + qemu_set_nonblock(sock); qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL, nbd_have_request, s); @@ -376,7 +463,7 @@ static void nbd_teardown_connection(BlockDriverState *bs) closesocket(s->sock); } -static int nbd_open(BlockDriverState *bs, const char* filename, int flags) +static int nbd_open(BlockDriverState *bs, QDict *options, int flags) { BDRVNBDState *s = bs->opaque; int result; @@ -385,7 +472,7 @@ static int nbd_open(BlockDriverState *bs, const char* filename, int flags) qemu_co_mutex_init(&s->free_sema); /* Pop the config into our state object. Exit if invalid. */ - result = nbd_config(s, filename); + result = nbd_config(s, options); if (result != 0) { return result; } @@ -531,7 +618,7 @@ static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num, return 0; } request.type = NBD_CMD_TRIM; - request.from = sector_num * 512;; + request.from = sector_num * 512; request.len = nb_sectors * 512; nbd_coroutine_start(s, &request); @@ -549,7 +636,7 @@ static void nbd_close(BlockDriverState *bs) { BDRVNBDState *s = bs->opaque; g_free(s->export_name); - g_free(s->host_spec); + qemu_opts_del(s->socket_opts); nbd_teardown_connection(bs); } @@ -565,6 +652,7 @@ static BlockDriver bdrv_nbd = { .format_name = "nbd", .protocol_name = "nbd", .instance_size = sizeof(BDRVNBDState), + .bdrv_parse_filename = nbd_parse_filename, .bdrv_file_open = nbd_open, .bdrv_co_readv = nbd_co_readv, .bdrv_co_writev = nbd_co_writev, @@ -578,6 +666,7 @@ static BlockDriver bdrv_nbd_tcp = { .format_name = "nbd", .protocol_name = "nbd+tcp", .instance_size = sizeof(BDRVNBDState), + .bdrv_parse_filename = nbd_parse_filename, .bdrv_file_open = nbd_open, .bdrv_co_readv = nbd_co_readv, .bdrv_co_writev = nbd_co_writev, @@ -591,6 +680,7 @@ static BlockDriver bdrv_nbd_unix = { .format_name = "nbd", .protocol_name = "nbd+unix", .instance_size = sizeof(BDRVNBDState), + .bdrv_parse_filename = 
nbd_parse_filename, .bdrv_file_open = nbd_open, .bdrv_co_readv = nbd_co_readv, .bdrv_co_writev = nbd_co_writev, diff --git a/block/parallels.c b/block/parallels.c index d30f0ecf7..18b3ac0b2 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -24,8 +24,8 @@ * THE SOFTWARE. */ #include "qemu-common.h" -#include "block_int.h" -#include "module.h" +#include "block/block_int.h" +#include "qemu/module.h" /**************************************************************/ @@ -68,19 +68,23 @@ static int parallels_probe(const uint8_t *buf, int buf_size, const char *filenam return 0; } -static int parallels_open(BlockDriverState *bs, int flags) +static int parallels_open(BlockDriverState *bs, QDict *options, int flags) { BDRVParallelsState *s = bs->opaque; int i; struct parallels_header ph; + int ret; bs->read_only = 1; // no write support yet - if (bdrv_pread(bs->file, 0, &ph, sizeof(ph)) != sizeof(ph)) + ret = bdrv_pread(bs->file, 0, &ph, sizeof(ph)); + if (ret < 0) { goto fail; + } if (memcmp(ph.magic, HEADER_MAGIC, 16) || - (le32_to_cpu(ph.version) != HEADER_VERSION)) { + (le32_to_cpu(ph.version) != HEADER_VERSION)) { + ret = -EMEDIUMTYPE; goto fail; } @@ -90,18 +94,21 @@ static int parallels_open(BlockDriverState *bs, int flags) s->catalog_size = le32_to_cpu(ph.catalog_entries); s->catalog_bitmap = g_malloc(s->catalog_size * 4); - if (bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4) != - s->catalog_size * 4) - goto fail; + + ret = bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4); + if (ret < 0) { + goto fail; + } + for (i = 0; i < s->catalog_size; i++) le32_to_cpus(&s->catalog_bitmap[i]); qemu_co_mutex_init(&s->lock); return 0; + fail: - if (s->catalog_bitmap) - g_free(s->catalog_bitmap); - return -1; + g_free(s->catalog_bitmap); + return ret; } static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) diff --git a/block/qapi.c b/block/qapi.c new file mode 100644 index 000000000..a4bc4113b --- /dev/null +++ b/block/qapi.c 
@@ -0,0 +1,470 @@ +/* + * Block layer qmp and info dump related functions + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "block/qapi.h" +#include "block/block_int.h" +#include "qmp-commands.h" + +/* + * Returns 0 on success, with *p_list either set to describe snapshot + * information, or NULL because there are no snapshots. Returns -errno on + * error, with *p_list untouched. 
+ */ +int bdrv_query_snapshot_info_list(BlockDriverState *bs, + SnapshotInfoList **p_list, + Error **errp) +{ + int i, sn_count; + QEMUSnapshotInfo *sn_tab = NULL; + SnapshotInfoList *info_list, *cur_item = NULL, *head = NULL; + SnapshotInfo *info; + + sn_count = bdrv_snapshot_list(bs, &sn_tab); + if (sn_count < 0) { + const char *dev = bdrv_get_device_name(bs); + switch (sn_count) { + case -ENOMEDIUM: + error_setg(errp, "Device '%s' is not inserted", dev); + break; + case -ENOTSUP: + error_setg(errp, + "Device '%s' does not support internal snapshots", + dev); + break; + default: + error_setg_errno(errp, -sn_count, + "Can't list snapshots of device '%s'", dev); + break; + } + return sn_count; + } + + for (i = 0; i < sn_count; i++) { + info = g_new0(SnapshotInfo, 1); + info->id = g_strdup(sn_tab[i].id_str); + info->name = g_strdup(sn_tab[i].name); + info->vm_state_size = sn_tab[i].vm_state_size; + info->date_sec = sn_tab[i].date_sec; + info->date_nsec = sn_tab[i].date_nsec; + info->vm_clock_sec = sn_tab[i].vm_clock_nsec / 1000000000; + info->vm_clock_nsec = sn_tab[i].vm_clock_nsec % 1000000000; + + info_list = g_new0(SnapshotInfoList, 1); + info_list->value = info; + + /* XXX: waiting for the qapi to support qemu-queue.h types */ + if (!cur_item) { + head = cur_item = info_list; + } else { + cur_item->next = info_list; + cur_item = info_list; + } + + } + + g_free(sn_tab); + *p_list = head; + return 0; +} + +/** + * bdrv_query_image_info: + * @bs: block device to examine + * @p_info: location to store image information + * @errp: location to store error information + * + * Store "flat" image information in @p_info. + * + * "Flat" means it does *not* query backing image information, + * i.e. (*pinfo)->has_backing_image will be set to false and + * (*pinfo)->backing_image to NULL even when the image does in fact have + * a backing image. + * + * @p_info will be set only on success. On error, store error in @errp. 
+ */ +void bdrv_query_image_info(BlockDriverState *bs, + ImageInfo **p_info, + Error **errp) +{ + uint64_t total_sectors; + const char *backing_filename; + char backing_filename2[1024]; + BlockDriverInfo bdi; + int ret; + Error *err = NULL; + ImageInfo *info = g_new0(ImageInfo, 1); + + bdrv_get_geometry(bs, &total_sectors); + + info->filename = g_strdup(bs->filename); + info->format = g_strdup(bdrv_get_format_name(bs)); + info->virtual_size = total_sectors * 512; + info->actual_size = bdrv_get_allocated_file_size(bs); + info->has_actual_size = info->actual_size >= 0; + if (bdrv_is_encrypted(bs)) { + info->encrypted = true; + info->has_encrypted = true; + } + if (bdrv_get_info(bs, &bdi) >= 0) { + if (bdi.cluster_size != 0) { + info->cluster_size = bdi.cluster_size; + info->has_cluster_size = true; + } + info->dirty_flag = bdi.is_dirty; + info->has_dirty_flag = true; + } + backing_filename = bs->backing_file; + if (backing_filename[0] != '\0') { + info->backing_filename = g_strdup(backing_filename); + info->has_backing_filename = true; + bdrv_get_full_backing_filename(bs, backing_filename2, + sizeof(backing_filename2)); + + if (strcmp(backing_filename, backing_filename2) != 0) { + info->full_backing_filename = + g_strdup(backing_filename2); + info->has_full_backing_filename = true; + } + + if (bs->backing_format[0]) { + info->backing_filename_format = g_strdup(bs->backing_format); + info->has_backing_filename_format = true; + } + } + + ret = bdrv_query_snapshot_info_list(bs, &info->snapshots, &err); + switch (ret) { + case 0: + if (info->snapshots) { + info->has_snapshots = true; + } + break; + /* recoverable error */ + case -ENOMEDIUM: + case -ENOTSUP: + error_free(err); + break; + default: + error_propagate(errp, err); + qapi_free_ImageInfo(info); + return; + } + + *p_info = info; +} + +/* @p_info will be set only on success. 
*/ +void bdrv_query_info(BlockDriverState *bs, + BlockInfo **p_info, + Error **errp) +{ + BlockInfo *info = g_malloc0(sizeof(*info)); + BlockDriverState *bs0; + ImageInfo **p_image_info; + Error *local_err = NULL; + info->device = g_strdup(bs->device_name); + info->type = g_strdup("unknown"); + info->locked = bdrv_dev_is_medium_locked(bs); + info->removable = bdrv_dev_has_removable_media(bs); + + if (bdrv_dev_has_removable_media(bs)) { + info->has_tray_open = true; + info->tray_open = bdrv_dev_is_tray_open(bs); + } + + if (bdrv_iostatus_is_enabled(bs)) { + info->has_io_status = true; + info->io_status = bs->iostatus; + } + + if (bs->dirty_bitmap) { + info->has_dirty = true; + info->dirty = g_malloc0(sizeof(*info->dirty)); + info->dirty->count = bdrv_get_dirty_count(bs) * BDRV_SECTOR_SIZE; + info->dirty->granularity = + ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bs->dirty_bitmap)); + } + + if (bs->drv) { + info->has_inserted = true; + info->inserted = g_malloc0(sizeof(*info->inserted)); + info->inserted->file = g_strdup(bs->filename); + info->inserted->ro = bs->read_only; + info->inserted->drv = g_strdup(bs->drv->format_name); + info->inserted->encrypted = bs->encrypted; + info->inserted->encryption_key_missing = bdrv_key_required(bs); + + if (bs->backing_file[0]) { + info->inserted->has_backing_file = true; + info->inserted->backing_file = g_strdup(bs->backing_file); + } + + info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs); + + if (bs->io_limits_enabled) { + info->inserted->bps = + bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; + info->inserted->bps_rd = + bs->io_limits.bps[BLOCK_IO_LIMIT_READ]; + info->inserted->bps_wr = + bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE]; + info->inserted->iops = + bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; + info->inserted->iops_rd = + bs->io_limits.iops[BLOCK_IO_LIMIT_READ]; + info->inserted->iops_wr = + bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE]; + } + + bs0 = bs; + p_image_info = &info->inserted->image; + while (1) { 
+ bdrv_query_image_info(bs0, p_image_info, &local_err); + if (error_is_set(&local_err)) { + error_propagate(errp, local_err); + goto err; + } + if (bs0->drv && bs0->backing_hd) { + bs0 = bs0->backing_hd; + (*p_image_info)->has_backing_image = true; + p_image_info = &((*p_image_info)->backing_image); + } else { + break; + } + } + } + + *p_info = info; + return; + + err: + qapi_free_BlockInfo(info); +} + +BlockStats *bdrv_query_stats(const BlockDriverState *bs) +{ + BlockStats *s; + + s = g_malloc0(sizeof(*s)); + + if (bs->device_name[0]) { + s->has_device = true; + s->device = g_strdup(bs->device_name); + } + + s->stats = g_malloc0(sizeof(*s->stats)); + s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ]; + s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE]; + s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ]; + s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE]; + s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE; + s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH]; + s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE]; + s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ]; + s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH]; + + if (bs->file) { + s->has_parent = true; + s->parent = bdrv_query_stats(bs->file); + } + + return s; +} + +BlockInfoList *qmp_query_block(Error **errp) +{ + BlockInfoList *head = NULL, **p_next = &head; + BlockDriverState *bs = NULL; + Error *local_err = NULL; + + while ((bs = bdrv_next(bs))) { + BlockInfoList *info = g_malloc0(sizeof(*info)); + bdrv_query_info(bs, &info->value, &local_err); + if (error_is_set(&local_err)) { + error_propagate(errp, local_err); + goto err; + } + + *p_next = info; + p_next = &info->next; + } + + return head; + + err: + qapi_free_BlockInfoList(head); + return NULL; +} + +BlockStatsList *qmp_query_blockstats(Error **errp) +{ + BlockStatsList *head = NULL, **p_next = &head; + BlockDriverState *bs = NULL; + + while ((bs = bdrv_next(bs))) { + 
BlockStatsList *info = g_malloc0(sizeof(*info)); + info->value = bdrv_query_stats(bs); + + *p_next = info; + p_next = &info->next; + } + + return head; +} + +#define NB_SUFFIXES 4 + +static char *get_human_readable_size(char *buf, int buf_size, int64_t size) +{ + static const char suffixes[NB_SUFFIXES] = "KMGT"; + int64_t base; + int i; + + if (size <= 999) { + snprintf(buf, buf_size, "%" PRId64, size); + } else { + base = 1024; + for (i = 0; i < NB_SUFFIXES; i++) { + if (size < (10 * base)) { + snprintf(buf, buf_size, "%0.1f%c", + (double)size / base, + suffixes[i]); + break; + } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) { + snprintf(buf, buf_size, "%" PRId64 "%c", + ((size + (base >> 1)) / base), + suffixes[i]); + break; + } + base = base * 1024; + } + } + return buf; +} + +void bdrv_snapshot_dump(fprintf_function func_fprintf, void *f, + QEMUSnapshotInfo *sn) +{ + char buf1[128], date_buf[128], clock_buf[128]; + struct tm tm; + time_t ti; + int64_t secs; + + if (!sn) { + func_fprintf(f, + "%-10s%-20s%7s%20s%15s", + "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK"); + } else { + ti = sn->date_sec; + localtime_r(&ti, &tm); + strftime(date_buf, sizeof(date_buf), + "%Y-%m-%d %H:%M:%S", &tm); + secs = sn->vm_clock_nsec / 1000000000; + snprintf(clock_buf, sizeof(clock_buf), + "%02d:%02d:%02d.%03d", + (int)(secs / 3600), + (int)((secs / 60) % 60), + (int)(secs % 60), + (int)((sn->vm_clock_nsec / 1000000) % 1000)); + func_fprintf(f, + "%-10s%-20s%7s%20s%15s", + sn->id_str, sn->name, + get_human_readable_size(buf1, sizeof(buf1), + sn->vm_state_size), + date_buf, + clock_buf); + } +} + +void bdrv_image_info_dump(fprintf_function func_fprintf, void *f, + ImageInfo *info) +{ + char size_buf[128], dsize_buf[128]; + if (!info->has_actual_size) { + snprintf(dsize_buf, sizeof(dsize_buf), "unavailable"); + } else { + get_human_readable_size(dsize_buf, sizeof(dsize_buf), + info->actual_size); + } + get_human_readable_size(size_buf, sizeof(size_buf), info->virtual_size); 
+ func_fprintf(f, + "image: %s\n" + "file format: %s\n" + "virtual size: %s (%" PRId64 " bytes)\n" + "disk size: %s\n", + info->filename, info->format, size_buf, + info->virtual_size, + dsize_buf); + + if (info->has_encrypted && info->encrypted) { + func_fprintf(f, "encrypted: yes\n"); + } + + if (info->has_cluster_size) { + func_fprintf(f, "cluster_size: %" PRId64 "\n", + info->cluster_size); + } + + if (info->has_dirty_flag && info->dirty_flag) { + func_fprintf(f, "cleanly shut down: no\n"); + } + + if (info->has_backing_filename) { + func_fprintf(f, "backing file: %s", info->backing_filename); + if (info->has_full_backing_filename) { + func_fprintf(f, " (actual path: %s)", info->full_backing_filename); + } + func_fprintf(f, "\n"); + if (info->has_backing_filename_format) { + func_fprintf(f, "backing file format: %s\n", + info->backing_filename_format); + } + } + + if (info->has_snapshots) { + SnapshotInfoList *elem; + + func_fprintf(f, "Snapshot list:\n"); + bdrv_snapshot_dump(func_fprintf, f, NULL); + func_fprintf(f, "\n"); + + /* Ideally bdrv_snapshot_dump() would operate on SnapshotInfoList but + * we convert to the block layer's native QEMUSnapshotInfo for now. + */ + for (elem = info->snapshots; elem; elem = elem->next) { + QEMUSnapshotInfo sn = { + .vm_state_size = elem->value->vm_state_size, + .date_sec = elem->value->date_sec, + .date_nsec = elem->value->date_nsec, + .vm_clock_nsec = elem->value->vm_clock_sec * 1000000000ULL + + elem->value->vm_clock_nsec, + }; + + pstrcpy(sn.id_str, sizeof(sn.id_str), elem->value->id); + pstrcpy(sn.name, sizeof(sn.name), elem->value->name); + bdrv_snapshot_dump(func_fprintf, f, &sn); + func_fprintf(f, "\n"); + } + } +} diff --git a/block/qcow.c b/block/qcow.c index b239c82ae..5239bd68f 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -22,11 +22,11 @@ * THE SOFTWARE. 
*/ #include "qemu-common.h" -#include "block_int.h" -#include "module.h" +#include "block/block_int.h" +#include "qemu/module.h" #include <zlib.h> -#include "aes.h" -#include "migration.h" +#include "qemu/aes.h" +#include "migration/migration.h" /**************************************************************/ /* QEMU COW block driver with compression and encryption support */ @@ -92,7 +92,7 @@ static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int qcow_open(BlockDriverState *bs, int flags) +static int qcow_open(BlockDriverState *bs, QDict *options, int flags) { BDRVQcowState *s = bs->opaque; int len, i, shift, ret; @@ -112,7 +112,7 @@ static int qcow_open(BlockDriverState *bs, int flags) be64_to_cpus(&header.l1_table_offset); if (header.magic != QCOW_MAGIC) { - ret = -EINVAL; + ret = -EMEDIUMTYPE; goto fail; } if (header.version != QCOW_VERSION) { @@ -679,7 +679,7 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) return ret; } - ret = bdrv_file_open(&qcow_bs, filename, BDRV_O_RDWR); + ret = bdrv_file_open(&qcow_bs, filename, NULL, BDRV_O_RDWR); if (ret < 0) { return ret; } @@ -787,8 +787,21 @@ static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, uint8_t *out_buf; uint64_t cluster_offset; - if (nb_sectors != s->cluster_sectors) - return -EINVAL; + if (nb_sectors != s->cluster_sectors) { + ret = -EINVAL; + + /* Zero-pad last write if image size is not cluster aligned */ + if (sector_num + nb_sectors == bs->total_sectors && + nb_sectors < s->cluster_sectors) { + uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size); + memset(pad_buf, 0, s->cluster_size); + memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE); + ret = qcow_write_compressed(bs, sector_num, + pad_buf, s->cluster_sectors); + qemu_vfree(pad_buf); + } + return ret; + } out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); @@ -879,6 +892,7 @@ static BlockDriver bdrv_qcow = { .bdrv_close = 
qcow_close, .bdrv_reopen_prepare = qcow_reopen_prepare, .bdrv_create = qcow_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_co_readv = qcow_co_readv, .bdrv_co_writev = qcow_co_writev, diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c index 2d4322a8d..2f3114ecc 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -22,7 +22,7 @@ * THE SOFTWARE. */ -#include "block_int.h" +#include "block/block_int.h" #include "qemu-common.h" #include "qcow2.h" #include "trace.h" diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index e179211c5..cca76d4fc 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -25,16 +25,17 @@ #include <zlib.h> #include "qemu-common.h" -#include "block_int.h" +#include "block/block_int.h" #include "block/qcow2.h" #include "trace.h" -int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size) +int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, + bool exact_size) { BDRVQcowState *s = bs->opaque; - int new_l1_size, new_l1_size2, ret, i; + int new_l1_size2, ret, i; uint64_t *new_l1_table; - int64_t new_l1_table_offset; + int64_t new_l1_table_offset, new_l1_size; uint8_t data[12]; if (min_size <= s->l1_size) @@ -53,8 +54,13 @@ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size) } } + if (new_l1_size > INT_MAX) { + return -EFBIG; + } + #ifdef DEBUG_ALLOC2 - fprintf(stderr, "grow l1_table from %d to %d\n", s->l1_size, new_l1_size); + fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n", + s->l1_size, new_l1_size); #endif new_l1_size2 = sizeof(uint64_t) * new_l1_size; @@ -92,14 +98,16 @@ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size) goto fail; } g_free(s->l1_table); - qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t)); + qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t), + QCOW2_DISCARD_OTHER); s->l1_table_offset = new_l1_table_offset; s->l1_table = new_l1_table; s->l1_size = 
new_l1_size; return 0; fail: g_free(new_l1_table); - qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2); + qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2, + QCOW2_DISCARD_OTHER); return ret; } @@ -391,8 +399,8 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, int *num, uint64_t *cluster_offset) { BDRVQcowState *s = bs->opaque; - unsigned int l1_index, l2_index; - uint64_t l2_offset, *l2_table; + unsigned int l2_index; + uint64_t l1_index, l2_offset, *l2_table; int l1_bits, c; unsigned int index_in_cluster, nb_clusters; uint64_t nb_available, nb_needed; @@ -454,6 +462,9 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK; break; case QCOW2_CLUSTER_ZERO: + if (s->qcow_version < 3) { + return -EIO; + } c = count_contiguous_clusters(nb_clusters, s->cluster_size, &l2_table[l2_index], 0, QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); @@ -504,8 +515,8 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset, int *new_l2_index) { BDRVQcowState *s = bs->opaque; - unsigned int l1_index, l2_index; - uint64_t l2_offset; + unsigned int l2_index; + uint64_t l1_index, l2_offset; uint64_t *l2_table = NULL; int ret; @@ -519,6 +530,7 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset, } } + assert(l1_index < s->l1_size); l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; /* seek the l2 table of the given l2 offset */ @@ -538,7 +550,8 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset, /* Then decrease the refcount of the old table */ if (l2_offset) { - qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t)); + qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t), + QCOW2_DISCARD_OTHER); } } @@ -615,57 +628,67 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, return cluster_offset; } -int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) +static int 
perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r) { BDRVQcowState *s = bs->opaque; - int i, j = 0, l2_index, ret; - uint64_t *old_cluster, start_sect, *l2_table; - uint64_t cluster_offset = m->alloc_offset; - bool cow = false; - - trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters); + int ret; - if (m->nb_clusters == 0) + if (r->nb_sectors == 0) { return 0; + } - old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t)); + qemu_co_mutex_unlock(&s->lock); + ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset, + r->offset / BDRV_SECTOR_SIZE, + r->offset / BDRV_SECTOR_SIZE + r->nb_sectors); + qemu_co_mutex_lock(&s->lock); - /* copy content of unmodified sectors */ - start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9; - if (m->n_start) { - cow = true; - qemu_co_mutex_unlock(&s->lock); - ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) - goto err; - } - - if (m->nb_available & (s->cluster_sectors - 1)) { - cow = true; - qemu_co_mutex_unlock(&s->lock); - ret = copy_sectors(bs, start_sect, cluster_offset, m->nb_available, - align_offset(m->nb_available, s->cluster_sectors)); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) - goto err; + if (ret < 0) { + return ret; } /* - * Update L2 table. - * * Before we update the L2 table to actually point to the new cluster, we * need to be sure that the refcounts have been increased and COW was * handled. 
*/ - if (cow) { - qcow2_cache_depends_on_flush(s->l2_table_cache); + qcow2_cache_depends_on_flush(s->l2_table_cache); + + return 0; +} + +int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) +{ + BDRVQcowState *s = bs->opaque; + int i, j = 0, l2_index, ret; + uint64_t *old_cluster, *l2_table; + uint64_t cluster_offset = m->alloc_offset; + + trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters); + assert(m->nb_clusters > 0); + + old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t)); + + /* copy content of unmodified sectors */ + ret = perform_cow(bs, m, &m->cow_start); + if (ret < 0) { + goto err; } + ret = perform_cow(bs, m, &m->cow_end); + if (ret < 0) { + goto err; + } + + /* Update L2 table. */ + if (s->use_lazy_refcounts) { + qcow2_mark_dirty(bs); + } if (qcow2_need_accurate_refcounts(s)) { qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache); } + ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index); if (ret < 0) { goto err; @@ -695,10 +718,14 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) /* * If this was a COW, we need to decrease the refcount of the old cluster. * Also flush bs->file to get the right order for L2 and refcount update. + * + * Don't discard clusters that reach a refcount of 0 (e.g. compressed + * clusters), the next write will reuse them anyway. */ if (j != 0) { for (i = 0; i < j; i++) { - qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1); + qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1, + QCOW2_DISCARD_NEVER); } } @@ -743,56 +770,53 @@ out: } /* - * Allocates new clusters for the given guest_offset. - * - * At most *nb_clusters are allocated, and on return *nb_clusters is updated to - * contain the number of clusters that have been allocated and are contiguous - * in the image file. + * Check if there already is an AIO write request in flight which allocates + * the same cluster. 
In this case we need to wait until the previous + * request has completed and updated the L2 table accordingly. * - * If *host_offset is non-zero, it specifies the offset in the image file at - * which the new clusters must start. *nb_clusters can be 0 on return in this - * case if the cluster at host_offset is already in use. If *host_offset is - * zero, the clusters can be allocated anywhere in the image file. + * Returns: + * 0 if there was no dependency. *cur_bytes indicates the number of + * bytes from guest_offset that can be read before the next + * dependency must be processed (or the request is complete) * - * *host_offset is updated to contain the offset into the image file at which - * the first allocated cluster starts. - * - * Return 0 on success and -errno in error cases. -EAGAIN means that the - * function has been waiting for another request and the allocation must be - * restarted, but the whole request should not be failed. + * -EAGAIN if we had to wait for another request, previously gathered + * information on cluster allocation may be invalid now. The caller + * must start over anyway, so consider *cur_bytes undefined. */ -static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, - uint64_t *host_offset, unsigned int *nb_clusters) +static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, + uint64_t *cur_bytes, QCowL2Meta **m) { BDRVQcowState *s = bs->opaque; QCowL2Meta *old_alloc; + uint64_t bytes = *cur_bytes; - trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset, - *host_offset, *nb_clusters); - - /* - * Check if there already is an AIO write request in flight which allocates - * the same cluster. In this case we need to wait until the previous - * request has completed and updated the L2 table accordingly. 
- */ QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) { - uint64_t start = guest_offset >> s->cluster_bits; - uint64_t end = start + *nb_clusters; - uint64_t old_start = old_alloc->offset >> s->cluster_bits; - uint64_t old_end = old_start + old_alloc->nb_clusters; + uint64_t start = guest_offset; + uint64_t end = start + bytes; + uint64_t old_start = l2meta_cow_start(old_alloc); + uint64_t old_end = l2meta_cow_end(old_alloc); - if (end < old_start || start > old_end) { + if (end <= old_start || start >= old_end) { /* No intersection */ } else { if (start < old_start) { /* Stop at the start of a running allocation */ - *nb_clusters = old_start - start; + bytes = old_start - start; } else { - *nb_clusters = 0; + bytes = 0; } - if (*nb_clusters == 0) { + /* Stop if already an l2meta exists. After yielding, it wouldn't + * be valid any more, so we'd have to clean up the old L2Metas + * and deal with requests depending on them before starting to + * gather new ones. Not worth the trouble. */ + if (bytes == 0 && *m) { + *cur_bytes = 0; + return 0; + } + + if (bytes == 0) { /* Wait for the dependency to complete. We need to recheck * the free/allocated clusters when we continue. */ qemu_co_mutex_unlock(&s->lock); @@ -803,10 +827,144 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, } } - if (!*nb_clusters) { - abort(); + /* Make sure that existing clusters and new allocations are only used up to + * the next dependency if we shortened the request above */ + *cur_bytes = bytes; + + return 0; +} + +/* + * Checks how many already allocated clusters that don't require a copy on + * write there are at the given guest_offset (up to *bytes). If + * *host_offset is not zero, only physically contiguous clusters beginning at + * this host offset are counted. + * + * Note that guest_offset may not be cluster aligned. 
In this case, the + * returned *host_offset points to exact byte referenced by guest_offset and + * therefore isn't cluster aligned as well. + * + * Returns: + * 0: if no allocated clusters are available at the given offset. + * *bytes is normally unchanged. It is set to 0 if the cluster + * is allocated and doesn't need COW, but doesn't have the right + * physical offset. + * + * 1: if allocated clusters that don't require a COW are available at + * the requested offset. *bytes may have decreased and describes + * the length of the area that can be written to. + * + * -errno: in error cases + */ +static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, + uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) +{ + BDRVQcowState *s = bs->opaque; + int l2_index; + uint64_t cluster_offset; + uint64_t *l2_table; + unsigned int nb_clusters; + unsigned int keep_clusters; + int ret, pret; + + trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset, + *bytes); + + assert(*host_offset == 0 || offset_into_cluster(s, guest_offset) + == offset_into_cluster(s, *host_offset)); + + /* + * Calculate the number of clusters to look for. We stop at L2 table + * boundaries to keep things simple. 
+ */ + nb_clusters = + size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); + + l2_index = offset_to_l2_index(s, guest_offset); + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + + /* Find L2 entry for the first involved cluster */ + ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); + if (ret < 0) { + return ret; + } + + cluster_offset = be64_to_cpu(l2_table[l2_index]); + + /* Check how many clusters are already allocated and don't need COW */ + if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL + && (cluster_offset & QCOW_OFLAG_COPIED)) + { + /* If a specific host_offset is required, check it */ + bool offset_matches = + (cluster_offset & L2E_OFFSET_MASK) == *host_offset; + + if (*host_offset != 0 && !offset_matches) { + *bytes = 0; + ret = 0; + goto out; + } + + /* We keep all QCOW_OFLAG_COPIED clusters */ + keep_clusters = + count_contiguous_clusters(nb_clusters, s->cluster_size, + &l2_table[l2_index], 0, + QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO); + assert(keep_clusters <= nb_clusters); + + *bytes = MIN(*bytes, + keep_clusters * s->cluster_size + - offset_into_cluster(s, guest_offset)); + + ret = 1; + } else { + ret = 0; + } + + /* Cleanup */ +out: + pret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (pret < 0) { + return pret; } + /* Only return a host offset if we actually made progress. Otherwise we + * would make requirements for handle_alloc() that it can't fulfill */ + if (ret) { + *host_offset = (cluster_offset & L2E_OFFSET_MASK) + + offset_into_cluster(s, guest_offset); + } + + return ret; +} + +/* + * Allocates new clusters for the given guest_offset. + * + * At most *nb_clusters are allocated, and on return *nb_clusters is updated to + * contain the number of clusters that have been allocated and are contiguous + * in the image file. + * + * If *host_offset is non-zero, it specifies the offset in the image file at + * which the new clusters must start. 
*nb_clusters can be 0 on return in this + * case if the cluster at host_offset is already in use. If *host_offset is + * zero, the clusters can be allocated anywhere in the image file. + * + * *host_offset is updated to contain the offset into the image file at which + * the first allocated cluster starts. + * + * Return 0 on success and -errno in error cases. -EAGAIN means that the + * function has been waiting for another request and the allocation must be + * restarted, but the whole request should not be failed. + */ +static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, + uint64_t *host_offset, unsigned int *nb_clusters) +{ + BDRVQcowState *s = bs->opaque; + + trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset, + *host_offset, *nb_clusters); + /* Allocate new clusters */ trace_qcow2_cluster_alloc_phys(qemu_coroutine_self()); if (*host_offset == 0) { @@ -828,6 +986,151 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, } /* + * Allocates new clusters for an area that either is yet unallocated or needs a + * copy on write. If *host_offset is non-zero, clusters are only allocated if + * the new allocation can match the specified host offset. + * + * Note that guest_offset may not be cluster aligned. In this case, the + * returned *host_offset points to exact byte referenced by guest_offset and + * therefore isn't cluster aligned as well. + * + * Returns: + * 0: if no clusters could be allocated. *bytes is set to 0, + * *host_offset is left unchanged. + * + * 1: if new clusters were allocated. *bytes may be decreased if the + * new allocation doesn't cover all of the requested area. + * *host_offset is updated to contain the host offset of the first + * newly allocated cluster. 
+ * + * -errno: in error cases + */ +static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, + uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) +{ + BDRVQcowState *s = bs->opaque; + int l2_index; + uint64_t *l2_table; + uint64_t entry; + unsigned int nb_clusters; + int ret; + + uint64_t alloc_cluster_offset; + + trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset, + *bytes); + assert(*bytes > 0); + + /* + * Calculate the number of clusters to look for. We stop at L2 table + * boundaries to keep things simple. + */ + nb_clusters = + size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); + + l2_index = offset_to_l2_index(s, guest_offset); + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + + /* Find L2 entry for the first involved cluster */ + ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); + if (ret < 0) { + return ret; + } + + entry = be64_to_cpu(l2_table[l2_index]); + + /* For the moment, overwrite compressed clusters one by one */ + if (entry & QCOW_OFLAG_COMPRESSED) { + nb_clusters = 1; + } else { + nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index); + } + + /* This function is only called when there were no non-COW clusters, so if + * we can't find any unallocated or COW clusters either, something is + * wrong with our code. */ + assert(nb_clusters > 0); + + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + return ret; + } + + /* Allocate, if necessary at a given offset in the image file */ + alloc_cluster_offset = start_of_cluster(s, *host_offset); + ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset, + &nb_clusters); + if (ret < 0) { + goto fail; + } + + /* Can't extend contiguous allocation */ + if (nb_clusters == 0) { + *bytes = 0; + return 0; + } + + /* + * Save info needed for meta data update. 
+ * + * requested_sectors: Number of sectors from the start of the first + * newly allocated cluster to the end of the (possibly shortened + * before) write request. + * + * avail_sectors: Number of sectors from the start of the first + * newly allocated to the end of the last newly allocated cluster. + * + * nb_sectors: The number of sectors from the start of the first + * newly allocated cluster to the end of the area that the write + * request actually writes to (excluding COW at the end) + */ + int requested_sectors = + (*bytes + offset_into_cluster(s, guest_offset)) + >> BDRV_SECTOR_BITS; + int avail_sectors = nb_clusters + << (s->cluster_bits - BDRV_SECTOR_BITS); + int alloc_n_start = offset_into_cluster(s, guest_offset) + >> BDRV_SECTOR_BITS; + int nb_sectors = MIN(requested_sectors, avail_sectors); + QCowL2Meta *old_m = *m; + + *m = g_malloc0(sizeof(**m)); + + **m = (QCowL2Meta) { + .next = old_m, + + .alloc_offset = alloc_cluster_offset, + .offset = start_of_cluster(s, guest_offset), + .nb_clusters = nb_clusters, + .nb_available = nb_sectors, + + .cow_start = { + .offset = 0, + .nb_sectors = alloc_n_start, + }, + .cow_end = { + .offset = nb_sectors * BDRV_SECTOR_SIZE, + .nb_sectors = avail_sectors - nb_sectors, + }, + }; + qemu_co_queue_init(&(*m)->dependent_requests); + QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight); + + *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset); + *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE) + - offset_into_cluster(s, guest_offset)); + assert(*bytes != 0); + + return 1; + +fail: + if (*m && (*m)->nb_clusters > 0) { + QLIST_REMOVE(*m, next_in_flight); + } + return ret; +} + +/* * alloc_cluster_offset * * For a given offset on the virtual disk, find the cluster offset in qcow2 @@ -847,151 +1150,113 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, * Return 0 on success and -errno in error cases */ int qcow2_alloc_cluster_offset(BlockDriverState *bs, 
uint64_t offset, - int n_start, int n_end, int *num, QCowL2Meta *m) + int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m) { BDRVQcowState *s = bs->opaque; - int l2_index, ret, sectors; - uint64_t *l2_table; - unsigned int nb_clusters, keep_clusters; + uint64_t start, remaining; uint64_t cluster_offset; + uint64_t cur_bytes; + int ret; trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, n_start, n_end); - /* Find L2 entry for the first involved cluster */ -again: - ret = get_cluster_table(bs, offset, &l2_table, &l2_index); - if (ret < 0) { - return ret; - } + assert(n_start * BDRV_SECTOR_SIZE == offset_into_cluster(s, offset)); + offset = start_of_cluster(s, offset); - /* - * Calculate the number of clusters to look for. We stop at L2 table - * boundaries to keep things simple. - */ - nb_clusters = MIN(size_to_clusters(s, n_end << BDRV_SECTOR_BITS), - s->l2_size - l2_index); +again: + start = offset + (n_start << BDRV_SECTOR_BITS); + remaining = (n_end - n_start) << BDRV_SECTOR_BITS; + cluster_offset = 0; + *host_offset = 0; + cur_bytes = 0; + *m = NULL; - cluster_offset = be64_to_cpu(l2_table[l2_index]); + while (true) { - /* - * Check how many clusters are already allocated and don't need COW, and how - * many need a new allocation. 
- */ - if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL - && (cluster_offset & QCOW_OFLAG_COPIED)) - { - /* We keep all QCOW_OFLAG_COPIED clusters */ - keep_clusters = - count_contiguous_clusters(nb_clusters, s->cluster_size, - &l2_table[l2_index], 0, - QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO); - assert(keep_clusters <= nb_clusters); - nb_clusters -= keep_clusters; - } else { - keep_clusters = 0; - cluster_offset = 0; - } - - if (nb_clusters > 0) { - /* For the moment, overwrite compressed clusters one by one */ - uint64_t entry = be64_to_cpu(l2_table[l2_index + keep_clusters]); - if (entry & QCOW_OFLAG_COMPRESSED) { - nb_clusters = 1; - } else { - nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, - l2_index + keep_clusters); + if (!*host_offset) { + *host_offset = start_of_cluster(s, cluster_offset); } - } - cluster_offset &= L2E_OFFSET_MASK; + assert(remaining >= cur_bytes); - /* - * The L2 table isn't used any more after this. As long as the cache works - * synchronously, it's important to release it before calling - * do_alloc_cluster_offset, which may yield if we need to wait for another - * request to complete. If we still had the reference, we could use up the - * whole cache with sleeping requests. 
- */ - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - return ret; - } - - /* If there is something left to allocate, do that now */ - *m = (QCowL2Meta) { - .cluster_offset = cluster_offset, - .nb_clusters = 0, - }; - qemu_co_queue_init(&m->dependent_requests); + start += cur_bytes; + remaining -= cur_bytes; + cluster_offset += cur_bytes; - if (nb_clusters > 0) { - uint64_t alloc_offset; - uint64_t alloc_cluster_offset; - uint64_t keep_bytes = keep_clusters * s->cluster_size; - - /* Calculate start and size of allocation */ - alloc_offset = offset + keep_bytes; - - if (keep_clusters == 0) { - alloc_cluster_offset = 0; - } else { - alloc_cluster_offset = cluster_offset + keep_bytes; + if (remaining == 0) { + break; } - /* Allocate, if necessary at a given offset in the image file */ - ret = do_alloc_cluster_offset(bs, alloc_offset, &alloc_cluster_offset, - &nb_clusters); + cur_bytes = remaining; + + /* + * Now start gathering as many contiguous clusters as possible: + * + * 1. Check for overlaps with in-flight allocations + * + * a) Overlap not in the first cluster -> shorten this request and + * let the caller handle the rest in its next loop iteration. + * + * b) Real overlaps of two requests. Yield and restart the search + * for contiguous clusters (the situation could have changed + * while we were sleeping) + * + * c) TODO: Request starts in the same cluster as the in-flight + * allocation ends. Shorten the COW of the in-fight allocation, + * set cluster_offset to write to the same cluster and set up + * the right synchronisation between the in-flight request and + * the new one. + */ + ret = handle_dependencies(bs, start, &cur_bytes, m); if (ret == -EAGAIN) { + /* Currently handle_dependencies() doesn't yield if we already had + * an allocation. If it did, we would have to clean up the L2Meta + * structs before starting over. 
*/ + assert(*m == NULL); goto again; } else if (ret < 0) { - goto fail; + return ret; + } else if (cur_bytes == 0) { + break; + } else { + /* handle_dependencies() may have decreased cur_bytes (shortened + * the allocations below) so that the next dependency is processed + * correctly during the next loop iteration. */ } - /* save info needed for meta data update */ - if (nb_clusters > 0) { - /* - * requested_sectors: Number of sectors from the start of the first - * newly allocated cluster to the end of the (possibly shortened - * before) write request. - * - * avail_sectors: Number of sectors from the start of the first - * newly allocated to the end of the last newly allocated cluster. - */ - int requested_sectors = n_end - keep_clusters * s->cluster_sectors; - int avail_sectors = nb_clusters - << (s->cluster_bits - BDRV_SECTOR_BITS); - - *m = (QCowL2Meta) { - .cluster_offset = keep_clusters == 0 ? - alloc_cluster_offset : cluster_offset, - .alloc_offset = alloc_cluster_offset, - .offset = alloc_offset, - .n_start = keep_clusters == 0 ? n_start : 0, - .nb_clusters = nb_clusters, - .nb_available = MIN(requested_sectors, avail_sectors), - }; - qemu_co_queue_init(&m->dependent_requests); - QLIST_INSERT_HEAD(&s->cluster_allocs, m, next_in_flight); + /* + * 2. Count contiguous COPIED clusters. + */ + ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m); + if (ret < 0) { + return ret; + } else if (ret) { + continue; + } else if (cur_bytes == 0) { + break; } - } - /* Some cleanup work */ - sectors = (keep_clusters + nb_clusters) << (s->cluster_bits - 9); - if (sectors > n_end) { - sectors = n_end; + /* + * 3. If the request still hasn't completed, allocate new clusters, + * considering any cluster_offset of steps 1c or 2. 
+ */ + ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m); + if (ret < 0) { + return ret; + } else if (ret) { + continue; + } else { + assert(cur_bytes == 0); + break; + } } - assert(sectors > n_start); - *num = sectors - n_start; + *num = (n_end - n_start) - (remaining >> BDRV_SECTOR_BITS); + assert(*num > 0); + assert(*host_offset != 0); return 0; - -fail: - if (m->nb_clusters > 0) { - QLIST_REMOVE(m, next_in_flight); - } - return ret; } static int decompress_buffer(uint8_t *out_buf, int out_buf_size, @@ -1081,7 +1346,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, l2_table[l2_index + i] = cpu_to_be64(0); /* Then decrease the refcount */ - qcow2_free_any_clusters(bs, old_offset, 1); + qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); } ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); @@ -1112,18 +1377,25 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, nb_clusters = size_to_clusters(s, end_offset - offset); + s->cache_discards = true; + /* Each L2 table is handled by its own loop iteration */ while (nb_clusters > 0) { ret = discard_single_l2(bs, offset, nb_clusters); if (ret < 0) { - return ret; + goto fail; } nb_clusters -= ret; offset += (ret * s->cluster_size); } - return 0; + ret = 0; +fail: + s->cache_discards = false; + qcow2_process_discards(bs, ret); + + return ret; } /* @@ -1157,7 +1429,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset, qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); if (old_offset & QCOW_OFLAG_COMPRESSED) { l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); - qcow2_free_any_clusters(bs, old_offset, 1); + qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); } else { l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO); } @@ -1185,15 +1457,22 @@ int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors) /* Each L2 table is handled by its own loop iteration */ nb_clusters = 
size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS); + s->cache_discards = true; + while (nb_clusters > 0) { ret = zero_single_l2(bs, offset, nb_clusters); if (ret < 0) { - return ret; + goto fail; } nb_clusters -= ret; offset += (ret * s->cluster_size); } - return 0; + ret = 0; +fail: + s->cache_discards = false; + qcow2_process_discards(bs, ret); + + return ret; } diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 96224d1af..1244693f3 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -23,13 +23,13 @@ */ #include "qemu-common.h" -#include "block_int.h" +#include "block/block_int.h" #include "block/qcow2.h" static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size); static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, int64_t offset, int64_t length, - int addend); + int addend, enum qcow2_discard_type type); /*********************************************************/ @@ -201,7 +201,10 @@ static int alloc_refcount_block(BlockDriverState *bs, *refcount_block = NULL; /* We write to the refcount table, so we might depend on L2 tables */ - qcow2_cache_flush(bs, s->l2_table_cache); + ret = qcow2_cache_flush(bs, s->l2_table_cache); + if (ret < 0) { + return ret; + } /* Allocate the refcount block itself and mark it as used */ int64_t new_block = alloc_clusters_noref(bs, s->cluster_size); @@ -232,12 +235,16 @@ static int alloc_refcount_block(BlockDriverState *bs, } else { /* Described somewhere else. This can recurse at most twice before we * arrive at a block that describes itself. 
*/ - ret = update_refcount(bs, new_block, s->cluster_size, 1); + ret = update_refcount(bs, new_block, s->cluster_size, 1, + QCOW2_DISCARD_NEVER); if (ret < 0) { goto fail_block; } - bdrv_flush(bs->file); + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail_block; + } /* Initialize the new refcount block only after updating its refcount, * update_refcount uses the refcount cache itself */ @@ -393,7 +400,8 @@ static int alloc_refcount_block(BlockDriverState *bs, /* Free old table. Remember, we must not change free_cluster_index */ uint64_t old_free_cluster_index = s->free_cluster_index; - qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t)); + qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t), + QCOW2_DISCARD_OTHER); s->free_cluster_index = old_free_cluster_index; ret = load_refcount_block(bs, new_block, (void**) refcount_block); @@ -412,9 +420,77 @@ fail_block: return ret; } +void qcow2_process_discards(BlockDriverState *bs, int ret) +{ + BDRVQcowState *s = bs->opaque; + Qcow2DiscardRegion *d, *next; + + QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) { + QTAILQ_REMOVE(&s->discards, d, next); + + /* Discard is optional, ignore the return value */ + if (ret >= 0) { + bdrv_discard(bs->file, + d->offset >> BDRV_SECTOR_BITS, + d->bytes >> BDRV_SECTOR_BITS); + } + + g_free(d); + } +} + +static void update_refcount_discard(BlockDriverState *bs, + uint64_t offset, uint64_t length) +{ + BDRVQcowState *s = bs->opaque; + Qcow2DiscardRegion *d, *p, *next; + + QTAILQ_FOREACH(d, &s->discards, next) { + uint64_t new_start = MIN(offset, d->offset); + uint64_t new_end = MAX(offset + length, d->offset + d->bytes); + + if (new_end - new_start <= length + d->bytes) { + /* There can't be any overlap, areas ending up here have no + * references any more and therefore shouldn't get freed another + * time. 
*/ + assert(d->bytes + length == new_end - new_start); + d->offset = new_start; + d->bytes = new_end - new_start; + goto found; + } + } + + d = g_malloc(sizeof(*d)); + *d = (Qcow2DiscardRegion) { + .bs = bs, + .offset = offset, + .bytes = length, + }; + QTAILQ_INSERT_TAIL(&s->discards, d, next); + +found: + /* Merge discard requests if they are adjacent now */ + QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) { + if (p == d + || p->offset > d->offset + d->bytes + || d->offset > p->offset + p->bytes) + { + continue; + } + + /* Still no overlap possible */ + assert(p->offset == d->offset + d->bytes + || d->offset == p->offset + p->bytes); + + QTAILQ_REMOVE(&s->discards, p, next); + d->offset = MIN(d->offset, p->offset); + d->bytes += p->bytes; + } +} + /* XXX: cache several refcount block clusters ? */ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, - int64_t offset, int64_t length, int addend) + int64_t offset, int64_t length, int addend, enum qcow2_discard_type type) { BDRVQcowState *s = bs->opaque; int64_t start, last, cluster_offset; @@ -480,10 +556,18 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, s->free_cluster_index = cluster_index; } refcount_block[block_index] = cpu_to_be16(refcount); + + if (refcount == 0 && s->discard_passthrough[type]) { + update_refcount_discard(bs, cluster_offset, s->cluster_size); + } } ret = 0; fail: + if (!s->cache_discards) { + qcow2_process_discards(bs, ret); + } + /* Write last changed block to disk */ if (refcount_block) { int wret; @@ -500,7 +584,8 @@ fail: */ if (ret < 0) { int dummy; - dummy = update_refcount(bs, offset, cluster_offset - offset, -addend); + dummy = update_refcount(bs, offset, cluster_offset - offset, -addend, + QCOW2_DISCARD_NEVER); (void)dummy; } @@ -516,18 +601,18 @@ fail: */ static int update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index, - int addend) + int addend, + enum qcow2_discard_type type) { BDRVQcowState *s = bs->opaque; int 
ret; - ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend); + ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend, + type); if (ret < 0) { return ret; } - bdrv_flush(bs->file); - return get_refcount(bs, cluster_index); } @@ -575,7 +660,7 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size) return offset; } - ret = update_refcount(bs, offset, size, 1); + ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER); if (ret < 0) { return ret; } @@ -607,7 +692,8 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, old_free_cluster_index = s->free_cluster_index; s->free_cluster_index = cluster_index + i; - ret = update_refcount(bs, offset, i << s->cluster_bits, 1); + ret = update_refcount(bs, offset, i << s->cluster_bits, 1, + QCOW2_DISCARD_NEVER); if (ret < 0) { return ret; } @@ -645,7 +731,8 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) if (free_in_cluster == 0) s->free_byte_offset = 0; if ((offset & (s->cluster_size - 1)) != 0) - update_cluster_refcount(bs, offset >> s->cluster_bits, 1); + update_cluster_refcount(bs, offset >> s->cluster_bits, 1, + QCOW2_DISCARD_NEVER); } else { offset = qcow2_alloc_clusters(bs, s->cluster_size); if (offset < 0) { @@ -655,7 +742,8 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) if ((cluster_offset + s->cluster_size) == offset) { /* we are lucky: contiguous data */ offset = s->free_byte_offset; - update_cluster_refcount(bs, offset >> s->cluster_bits, 1); + update_cluster_refcount(bs, offset >> s->cluster_bits, 1, + QCOW2_DISCARD_NEVER); s->free_byte_offset += size; } else { s->free_byte_offset = offset; @@ -663,17 +751,22 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) } } - bdrv_flush(bs->file); + /* The cluster refcount was incremented, either by qcow2_alloc_clusters() + * or explicitly by update_cluster_refcount(). Refcount blocks must be + * flushed before the caller's L2 table updates. 
+ */ + qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache); return offset; } void qcow2_free_clusters(BlockDriverState *bs, - int64_t offset, int64_t size) + int64_t offset, int64_t size, + enum qcow2_discard_type type) { int ret; BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE); - ret = update_refcount(bs, offset, size, -1); + ret = update_refcount(bs, offset, size, -1, type); if (ret < 0) { fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret)); /* TODO Remember the clusters to free them later and avoid leaking */ @@ -684,8 +777,8 @@ void qcow2_free_clusters(BlockDriverState *bs, * Free a cluster using its L2 entry (handles clusters of all types, e.g. * normal cluster, compressed cluster, etc.) */ -void qcow2_free_any_clusters(BlockDriverState *bs, - uint64_t l2_entry, int nb_clusters) +void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, + int nb_clusters, enum qcow2_discard_type type) { BDRVQcowState *s = bs->opaque; @@ -697,12 +790,12 @@ void qcow2_free_any_clusters(BlockDriverState *bs, s->csize_mask) + 1; qcow2_free_clusters(bs, (l2_entry & s->cluster_offset_mask) & ~511, - nb_csectors * 512); + nb_csectors * 512, type); } break; case QCOW2_CLUSTER_NORMAL: qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, - nb_clusters << s->cluster_bits); + nb_clusters << s->cluster_bits, type); break; case QCOW2_CLUSTER_UNALLOCATED: case QCOW2_CLUSTER_ZERO: @@ -733,20 +826,17 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, l1_table = NULL; l1_size2 = l1_size * sizeof(uint64_t); + s->cache_discards = true; + /* WARNING: qcow2_snapshot_goto relies on this function not using the * l1_table_offset when it is the current s->l1_table_offset! Be careful * when changing this! 
*/ if (l1_table_offset != s->l1_table_offset) { - if (l1_size2 != 0) { - l1_table = g_malloc0(align_offset(l1_size2, 512)); - } else { - l1_table = NULL; - } + l1_table = g_malloc0(align_offset(l1_size2, 512)); l1_allocated = 1; - if (bdrv_pread(bs->file, l1_table_offset, - l1_table, l1_size2) != l1_size2) - { - ret = -EIO; + + ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); + if (ret < 0) { goto fail; } @@ -782,27 +872,25 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, int ret; ret = update_refcount(bs, (offset & s->cluster_offset_mask) & ~511, - nb_csectors * 512, addend); + nb_csectors * 512, addend, + QCOW2_DISCARD_SNAPSHOT); if (ret < 0) { goto fail; } - - /* TODO Flushing once for the whole function should - * be enough */ - bdrv_flush(bs->file); } /* compressed clusters are never modified */ refcount = 2; } else { uint64_t cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits; if (addend != 0) { - refcount = update_cluster_refcount(bs, cluster_index, addend); + refcount = update_cluster_refcount(bs, cluster_index, addend, + QCOW2_DISCARD_SNAPSHOT); } else { refcount = get_refcount(bs, cluster_index); } if (refcount < 0) { - ret = -EIO; + ret = refcount; goto fail; } } @@ -828,12 +916,13 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, if (addend != 0) { - refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend); + refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend, + QCOW2_DISCARD_SNAPSHOT); } else { refcount = get_refcount(bs, l2_offset >> s->cluster_bits); } if (refcount < 0) { - ret = -EIO; + ret = refcount; goto fail; } else if (refcount == 1) { l2_offset |= QCOW_OFLAG_COPIED; @@ -845,21 +934,26 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, } } - ret = 0; + ret = bdrv_flush(bs); fail: if (l2_table) { qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); } + s->cache_discards = false; + qcow2_process_discards(bs, ret); + /* Update L1 only if it 
isn't deleted anyway (addend = -1) */ - if (addend >= 0 && l1_modified) { - for(i = 0; i < l1_size; i++) + if (ret == 0 && addend >= 0 && l1_modified) { + for (i = 0; i < l1_size; i++) { cpu_to_be64s(&l1_table[i]); - if (bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, - l1_size2) < 0) - goto fail; - for(i = 0; i < l1_size; i++) + } + + ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2); + + for (i = 0; i < l1_size; i++) { be64_to_cpus(&l1_table[i]); + } } if (l1_allocated) g_free(l1_table); @@ -918,6 +1012,12 @@ static void inc_refcounts(BlockDriverState *bs, } } +/* Flags for check_refcounts_l1() and check_refcounts_l2() */ +enum { + CHECK_OFLAG_COPIED = 0x1, /* check QCOW_OFLAG_COPIED matches refcount */ + CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */ +}; + /* * Increases the refcount in the given refcount table for the all clusters * referenced in the L2 table. While doing so, performs some checks on L2 @@ -928,10 +1028,11 @@ static void inc_refcounts(BlockDriverState *bs, */ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset, - int check_copied) + int flags) { BDRVQcowState *s = bs->opaque; uint64_t *l2_table, l2_entry; + uint64_t next_contiguous_offset = 0; int i, l2_size, nb_csectors, refcount; /* Read L2 table from disk */ @@ -962,6 +1063,18 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, l2_entry &= s->cluster_offset_mask; inc_refcounts(bs, res, refcount_table, refcount_table_size, l2_entry & ~511, nb_csectors * 512); + + if (flags & CHECK_FRAG_INFO) { + res->bfi.allocated_clusters++; + res->bfi.compressed_clusters++; + + /* Compressed clusters are fragmented by nature. Since they + * take up sub-sector space but we only have sector granularity + * I/O we need to re-read the same sectors even for adjacent + * compressed clusters. 
+ */ + res->bfi.fragmented_clusters++; + } break; case QCOW2_CLUSTER_ZERO: @@ -975,7 +1088,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ uint64_t offset = l2_entry & L2E_OFFSET_MASK; - if (check_copied) { + if (flags & CHECK_OFLAG_COPIED) { refcount = get_refcount(bs, offset >> s->cluster_bits); if (refcount < 0) { fprintf(stderr, "Can't get refcount for offset %" @@ -989,6 +1102,15 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, } } + if (flags & CHECK_FRAG_INFO) { + res->bfi.allocated_clusters++; + if (next_contiguous_offset && + offset != next_contiguous_offset) { + res->bfi.fragmented_clusters++; + } + next_contiguous_offset = offset + s->cluster_size; + } + /* Mark cluster as used */ inc_refcounts(bs, res, refcount_table,refcount_table_size, offset, s->cluster_size); @@ -1032,7 +1154,7 @@ static int check_refcounts_l1(BlockDriverState *bs, uint16_t *refcount_table, int refcount_table_size, int64_t l1_table_offset, int l1_size, - int check_copied) + int flags) { BDRVQcowState *s = bs->opaque; uint64_t *l1_table, l2_offset, l1_size2; @@ -1061,7 +1183,7 @@ static int check_refcounts_l1(BlockDriverState *bs, l2_offset = l1_table[i]; if (l2_offset) { /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ - if (check_copied) { + if (flags & CHECK_OFLAG_COPIED) { refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) >> s->cluster_bits); if (refcount < 0) { @@ -1090,7 +1212,7 @@ static int check_refcounts_l1(BlockDriverState *bs, /* Process and check L2 entries */ ret = check_refcounts_l2(bs, res, refcount_table, - refcount_table_size, l2_offset, check_copied); + refcount_table_size, l2_offset, flags); if (ret < 0) { goto fail; } @@ -1116,7 +1238,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) { BDRVQcowState *s = bs->opaque; - int64_t size, i; + int64_t size, i, highest_cluster; int nb_clusters, 
refcount1, refcount2; QCowSnapshot *sn; uint16_t *refcount_table; @@ -1126,13 +1248,17 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, nb_clusters = size_to_clusters(s, size); refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t)); + res->bfi.total_clusters = + size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE); + /* header */ inc_refcounts(bs, res, refcount_table, nb_clusters, 0, s->cluster_size); /* current L1 table */ ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, - s->l1_table_offset, s->l1_size, 1); + s->l1_table_offset, s->l1_size, + CHECK_OFLAG_COPIED | CHECK_FRAG_INFO); if (ret < 0) { goto fail; } @@ -1187,7 +1313,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, } /* compare ref counts */ - for(i = 0; i < nb_clusters; i++) { + for (i = 0, highest_cluster = 0; i < nb_clusters; i++) { refcount1 = get_refcount(bs, i); if (refcount1 < 0) { fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n", @@ -1197,6 +1323,11 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, } refcount2 = refcount_table[i]; + + if (refcount1 > 0 || refcount2 > 0) { + highest_cluster = i; + } + if (refcount1 != refcount2) { /* Check if we're allowed to fix the mismatch */ @@ -1215,7 +1346,8 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, if (num_fixed) { ret = update_refcount(bs, i << s->cluster_bits, 1, - refcount2 - refcount1); + refcount2 - refcount1, + QCOW2_DISCARD_ALWAYS); if (ret >= 0) { (*num_fixed)++; continue; @@ -1231,6 +1363,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, } } + res->image_end_offset = (highest_cluster + 1) * s->cluster_size; ret = 0; fail: diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 4e7c93b8b..0caac9055 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -23,7 +23,7 @@ */ #include "qemu-common.h" -#include "block_int.h" +#include "block/block_int.h" #include 
"block/qcow2.h" typedef struct QEMU_PACKED QCowSnapshotHeader { @@ -180,11 +180,14 @@ static int qcow2_write_snapshots(BlockDriverState *bs) /* Allocate space for the new snapshot list */ snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size); - bdrv_flush(bs->file); offset = snapshots_offset; if (offset < 0) { return offset; } + ret = bdrv_flush(bs); + if (ret < 0) { + return ret; + } /* Write all snapshots to the new list */ for(i = 0; i < s->nb_snapshots; i++) { @@ -259,7 +262,8 @@ static int qcow2_write_snapshots(BlockDriverState *bs) } /* free the old snapshot table */ - qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size); + qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size, + QCOW2_DISCARD_SNAPSHOT); s->snapshots_offset = snapshots_offset; s->snapshots_size = snapshots_size; return 0; @@ -378,11 +382,6 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) goto fail; } - ret = bdrv_flush(bs); - if (ret < 0) { - goto fail; - } - /* Append the new snapshot to the snapshot list */ new_snapshot_list = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot)); if (s->snapshots) { @@ -571,7 +570,8 @@ int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) if (ret < 0) { return ret; } - qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t)); + qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t), + QCOW2_DISCARD_SNAPSHOT); /* must update the copied flag on the current cluster offsets */ ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); diff --git a/block/qcow2.c b/block/qcow2.c index c1ff31f48..3376901bd 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -22,13 +22,14 @@ * THE SOFTWARE. 
*/ #include "qemu-common.h" -#include "block_int.h" -#include "module.h" +#include "block/block_int.h" +#include "qemu/module.h" #include <zlib.h> -#include "aes.h" +#include "qemu/aes.h" #include "block/qcow2.h" -#include "qemu-error.h" -#include "qerror.h" +#include "qemu/error-report.h" +#include "qapi/qmp/qerror.h" +#include "qapi/qmp/qbool.h" #include "trace.h" /* @@ -222,7 +223,7 @@ static void report_unsupported_feature(BlockDriverState *bs, * updated successfully. Therefore it is not required to check the return * value of this function. */ -static int qcow2_mark_dirty(BlockDriverState *bs) +int qcow2_mark_dirty(BlockDriverState *bs) { BDRVQcowState *s = bs->opaque; uint64_t val; @@ -285,12 +286,44 @@ static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, return ret; } -static int qcow2_open(BlockDriverState *bs, int flags) +static QemuOptsList qcow2_runtime_opts = { + .name = "qcow2", + .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head), + .desc = { + { + .name = QCOW2_OPT_LAZY_REFCOUNTS, + .type = QEMU_OPT_BOOL, + .help = "Postpone refcount updates", + }, + { + .name = QCOW2_OPT_DISCARD_REQUEST, + .type = QEMU_OPT_BOOL, + .help = "Pass guest discard requests to the layer below", + }, + { + .name = QCOW2_OPT_DISCARD_SNAPSHOT, + .type = QEMU_OPT_BOOL, + .help = "Generate discard requests when snapshot related space " + "is freed", + }, + { + .name = QCOW2_OPT_DISCARD_OTHER, + .type = QEMU_OPT_BOOL, + .help = "Generate discard requests when other clusters are freed", + }, + { /* end of list */ } + }, +}; + +static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) { BDRVQcowState *s = bs->opaque; int len, i, ret = 0; QCowHeader header; + QemuOpts *opts; + Error *local_err = NULL; uint64_t ext_end; + uint64_t l1_vm_state_index; ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); if (ret < 0) { @@ -311,7 +344,7 @@ static int qcow2_open(BlockDriverState *bs, int flags) be32_to_cpus(&header.nb_snapshots); if (header.magic != 
QCOW_MAGIC) { - ret = -EINVAL; + ret = -EMEDIUMTYPE; goto fail; } if (header.version < 2 || header.version > 3) { @@ -408,7 +441,14 @@ static int qcow2_open(BlockDriverState *bs, int flags) /* read the level 1 table */ s->l1_size = header.l1_size; - s->l1_vm_state_index = size_to_l1(s, header.size); + + l1_vm_state_index = size_to_l1(s, header.size); + if (l1_vm_state_index > INT_MAX) { + ret = -EFBIG; + goto fail; + } + s->l1_vm_state_index = l1_vm_state_index; + /* the L1 table must contain at least enough entries to put header.size bytes */ if (s->l1_size < s->l1_vm_state_index) { @@ -446,6 +486,7 @@ static int qcow2_open(BlockDriverState *bs, int flags) } QLIST_INIT(&s->cluster_allocs); + QTAILQ_INIT(&s->discards); /* read qcow2 extensions */ if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) { @@ -495,6 +536,38 @@ static int qcow2_open(BlockDriverState *bs, int flags) } } + /* Enable lazy_refcounts according to image and command line options */ + opts = qemu_opts_create_nofail(&qcow2_runtime_opts); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + qerror_report_err(local_err); + error_free(local_err); + ret = -EINVAL; + goto fail; + } + + s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS, + (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)); + + s->discard_passthrough[QCOW2_DISCARD_NEVER] = false; + s->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true; + s->discard_passthrough[QCOW2_DISCARD_REQUEST] = + qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST, + flags & BDRV_O_UNMAP); + s->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] = + qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true); + s->discard_passthrough[QCOW2_DISCARD_OTHER] = + qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false); + + qemu_opts_del(opts); + + if (s->use_lazy_refcounts && s->qcow_version < 3) { + qerror_report(ERROR_CLASS_GENERIC_ERROR, "Lazy refcounts require " + "a qcow2 image with at least qemu 1.1 
compatibility level"); + ret = -EINVAL; + goto fail; + } + #ifdef DEBUG_ALLOC { BdrvCheckResult result = {0}; @@ -584,7 +657,7 @@ static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs, *pnum = 0; } - return (cluster_offset != 0); + return (cluster_offset != 0) || (ret == QCOW2_CLUSTER_ZERO); } /* handle reading after the end of the backing file */ @@ -665,10 +738,6 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, break; case QCOW2_CLUSTER_ZERO: - if (s->qcow_version < 3) { - ret = -EIO; - goto fail; - } qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); break; @@ -745,21 +814,6 @@ fail: return ret; } -static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m) -{ - /* Take the request off the list of running requests */ - if (m->nb_clusters != 0) { - QLIST_REMOVE(m, next_in_flight); - } - - /* Restart all dependent requests */ - if (!qemu_co_queue_empty(&m->dependent_requests)) { - qemu_co_mutex_unlock(&s->lock); - qemu_co_queue_restart_all(&m->dependent_requests); - qemu_co_mutex_lock(&s->lock); - } -} - static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, int64_t sector_num, int remaining_sectors, @@ -774,15 +828,11 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, QEMUIOVector hd_qiov; uint64_t bytes_done = 0; uint8_t *cluster_data = NULL; - QCowL2Meta l2meta = { - .nb_clusters = 0, - }; + QCowL2Meta *l2meta = NULL; trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num, remaining_sectors); - qemu_co_queue_init(&l2meta.dependent_requests); - qemu_iovec_init(&hd_qiov, qiov->niov); s->cluster_cache_offset = -1; /* disable compressed cache */ @@ -791,6 +841,8 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, while (remaining_sectors != 0) { + l2meta = NULL; + trace_qcow2_writev_start_part(qemu_coroutine_self()); index_in_cluster = sector_num & (s->cluster_sectors - 1); n_end = index_in_cluster + remaining_sectors; @@ -800,17 +852,11 @@ static coroutine_fn 
int qcow2_co_writev(BlockDriverState *bs, } ret = qcow2_alloc_cluster_offset(bs, sector_num << 9, - index_in_cluster, n_end, &cur_nr_sectors, &l2meta); + index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta); if (ret < 0) { goto fail; } - if (l2meta.nb_clusters > 0 && - (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)) { - qcow2_mark_dirty(bs); - } - - cluster_offset = l2meta.cluster_offset; assert((cluster_offset & 511) == 0); qemu_iovec_reset(&hd_qiov); @@ -835,8 +881,8 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, cur_nr_sectors * 512); } - BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); qemu_co_mutex_unlock(&s->lock); + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); trace_qcow2_writev_data(qemu_coroutine_self(), (cluster_offset >> 9) + index_in_cluster); ret = bdrv_co_writev(bs->file, @@ -847,12 +893,25 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, goto fail; } - ret = qcow2_alloc_cluster_link_l2(bs, &l2meta); - if (ret < 0) { - goto fail; - } + while (l2meta != NULL) { + QCowL2Meta *next; + + ret = qcow2_alloc_cluster_link_l2(bs, l2meta); + if (ret < 0) { + goto fail; + } + + /* Take the request off the list of running requests */ + if (l2meta->nb_clusters != 0) { + QLIST_REMOVE(l2meta, next_in_flight); + } + + qemu_co_queue_restart_all(&l2meta->dependent_requests); - run_dependent_requests(s, &l2meta); + next = l2meta->next; + g_free(l2meta); + l2meta = next; + } remaining_sectors -= cur_nr_sectors; sector_num += cur_nr_sectors; @@ -862,10 +921,21 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, ret = 0; fail: - run_dependent_requests(s, &l2meta); - qemu_co_mutex_unlock(&s->lock); + while (l2meta != NULL) { + QCowL2Meta *next; + + if (l2meta->nb_clusters != 0) { + QLIST_REMOVE(l2meta, next_in_flight); + } + qemu_co_queue_restart_all(&l2meta->dependent_requests); + + next = l2meta->next; + g_free(l2meta); + l2meta = next; + } + qemu_iovec_destroy(&hd_qiov); qemu_vfree(cluster_data); 
trace_qcow2_writev_done_req(qemu_coroutine_self(), ret); @@ -902,6 +972,7 @@ static void qcow2_invalidate_cache(BlockDriverState *bs) AES_KEY aes_encrypt_key; AES_KEY aes_decrypt_key; uint32_t crypt_method = 0; + QDict *options; /* * Backing files are read-only which makes all of their metadata immutable, @@ -916,8 +987,14 @@ static void qcow2_invalidate_cache(BlockDriverState *bs) qcow2_close(bs); + options = qdict_new(); + qdict_put(options, QCOW2_OPT_LAZY_REFCOUNTS, + qbool_from_int(s->use_lazy_refcounts)); + memset(s, 0, sizeof(BDRVQcowState)); - qcow2_open(bs, flags); + qcow2_open(bs, options, flags); + + QDECREF(options); if (crypt_method) { s->crypt_method = crypt_method; @@ -1128,31 +1205,34 @@ static int preallocate(BlockDriverState *bs) { uint64_t nb_sectors; uint64_t offset; + uint64_t host_offset = 0; int num; int ret; - QCowL2Meta meta; + QCowL2Meta *meta; nb_sectors = bdrv_getlength(bs) >> 9; offset = 0; - qemu_co_queue_init(&meta.dependent_requests); - meta.cluster_offset = 0; while (nb_sectors) { num = MIN(nb_sectors, INT_MAX >> 9); - ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, &meta); + ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, + &host_offset, &meta); if (ret < 0) { return ret; } - ret = qcow2_alloc_cluster_link_l2(bs, &meta); + ret = qcow2_alloc_cluster_link_l2(bs, meta); if (ret < 0) { - qcow2_free_any_clusters(bs, meta.cluster_offset, meta.nb_clusters); + qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters, + QCOW2_DISCARD_NEVER); return ret; } /* There are no dependent requests, but we need to remove our request * from the list of in-flight requests */ - run_dependent_requests(bs->opaque, &meta); + if (meta != NULL) { + QLIST_REMOVE(meta, next_in_flight); + } /* TODO Preallocate data if requested */ @@ -1165,10 +1245,10 @@ static int preallocate(BlockDriverState *bs) * all of the allocated clusters (otherwise we get failing reads after * EOF). Extend the image to the last allocated sector. 
*/ - if (meta.cluster_offset != 0) { + if (host_offset != 0) { uint8_t buf[512]; memset(buf, 0, 512); - ret = bdrv_write(bs->file, (meta.cluster_offset >> 9) + num - 1, buf, 1); + ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1); if (ret < 0) { return ret; } @@ -1216,7 +1296,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, return ret; } - ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR); + ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR); if (ret < 0) { return ret; } @@ -1268,7 +1348,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, */ BlockDriver* drv = bdrv_find_format("qcow2"); assert(drv != NULL); - ret = bdrv_open(bs, filename, + ret = bdrv_open(bs, filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv); if (ret < 0) { goto out; @@ -1436,7 +1516,8 @@ static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, static int qcow2_truncate(BlockDriverState *bs, int64_t offset) { BDRVQcowState *s = bs->opaque; - int ret, new_l1_size; + int64_t new_l1_size; + int ret; if (offset & 511) { error_report("The new size must be a multiple of 512"); @@ -1493,8 +1574,21 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, return 0; } - if (nb_sectors != s->cluster_sectors) - return -EINVAL; + if (nb_sectors != s->cluster_sectors) { + ret = -EINVAL; + + /* Zero-pad last write if image size is not cluster aligned */ + if (sector_num + nb_sectors == bs->total_sectors && + nb_sectors < s->cluster_sectors) { + uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size); + memset(pad_buf, 0, s->cluster_size); + memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE); + ret = qcow2_write_compressed(bs, sector_num, + pad_buf, s->cluster_sectors); + qemu_vfree(pad_buf); + } + return ret; + } out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); @@ -1608,8 +1702,8 @@ static void dump_refcounts(BlockDriverState *bs) } #endif -static int 
qcow2_save_vmstate(BlockDriverState *bs, const uint8_t *buf, - int64_t pos, int size) +static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t pos) { BDRVQcowState *s = bs->opaque; int growable = bs->growable; @@ -1617,7 +1711,7 @@ static int qcow2_save_vmstate(BlockDriverState *bs, const uint8_t *buf, BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); bs->growable = 1; - ret = bdrv_pwrite(bs, qcow2_vm_state_offset(s) + pos, buf, size); + ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov); bs->growable = growable; return ret; @@ -1691,6 +1785,7 @@ static BlockDriver bdrv_qcow2 = { .bdrv_close = qcow2_close, .bdrv_reopen_prepare = qcow2_reopen_prepare, .bdrv_create = qcow2_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_co_is_allocated = qcow2_co_is_allocated, .bdrv_set_key = qcow2_set_key, .bdrv_make_empty = qcow2_make_empty, diff --git a/block/qcow2.h b/block/qcow2.h index b4eb65470..dba977141 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -25,8 +25,8 @@ #ifndef BLOCK_QCOW2_H #define BLOCK_QCOW2_H -#include "aes.h" -#include "qemu-coroutine.h" +#include "qemu/aes.h" +#include "block/coroutine.h" //#define DEBUG_ALLOC //#define DEBUG_ALLOC2 @@ -58,6 +58,12 @@ #define DEFAULT_CLUSTER_SIZE 65536 + +#define QCOW2_OPT_LAZY_REFCOUNTS "lazy-refcounts" +#define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request" +#define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot" +#define QCOW2_OPT_DISCARD_OTHER "pass-discard-other" + typedef struct QCowHeader { uint32_t magic; uint32_t version; @@ -126,12 +132,28 @@ enum { QCOW2_COMPAT_FEAT_MASK = QCOW2_COMPAT_LAZY_REFCOUNTS, }; +enum qcow2_discard_type { + QCOW2_DISCARD_NEVER = 0, + QCOW2_DISCARD_ALWAYS, + QCOW2_DISCARD_REQUEST, + QCOW2_DISCARD_SNAPSHOT, + QCOW2_DISCARD_OTHER, + QCOW2_DISCARD_MAX +}; + typedef struct Qcow2Feature { uint8_t type; uint8_t bit; char name[46]; } QEMU_PACKED Qcow2Feature; +typedef struct Qcow2DiscardRegion { + BlockDriverState *bs; + uint64_t offset; + 
uint64_t bytes; + QTAILQ_ENTRY(Qcow2DiscardRegion) next; +} Qcow2DiscardRegion; + typedef struct BDRVQcowState { int cluster_bits; int cluster_size; @@ -173,6 +195,9 @@ typedef struct BDRVQcowState { int flags; int qcow_version; + bool use_lazy_refcounts; + + bool discard_passthrough[QCOW2_DISCARD_MAX]; uint64_t incompatible_features; uint64_t compatible_features; @@ -181,6 +206,8 @@ typedef struct BDRVQcowState { size_t unknown_header_fields_size; void* unknown_header_fields; QLIST_HEAD(, Qcow2UnknownHeaderExtension) unknown_header_ext; + QTAILQ_HEAD (, Qcow2DiscardRegion) discards; + bool cache_discards; } BDRVQcowState; /* XXX: use std qcow open function ? */ @@ -196,17 +223,59 @@ typedef struct QCowCreateState { struct QCowAIOCB; -/* XXX This could be private for qcow2-cluster.c */ +typedef struct Qcow2COWRegion { + /** + * Offset of the COW region in bytes from the start of the first cluster + * touched by the request. + */ + uint64_t offset; + + /** Number of sectors to copy */ + int nb_sectors; +} Qcow2COWRegion; + +/** + * Describes an in-flight (part of a) write request that writes to clusters + * that are not referenced in their L2 table yet. + */ typedef struct QCowL2Meta { + /** Guest offset of the first newly allocated cluster */ uint64_t offset; - uint64_t cluster_offset; + + /** Host offset of the first newly allocated cluster */ uint64_t alloc_offset; - int n_start; + + /** + * Number of sectors from the start of the first allocated cluster to + * the end of the (possibly shortened) request + */ int nb_available; + + /** Number of newly allocated clusters */ int nb_clusters; + + /** + * Requests that overlap with this allocation and wait to be restarted + * when the allocating request has completed. + */ CoQueue dependent_requests; + /** + * The COW Region between the start of the first allocated cluster and the + * area the guest actually writes to. 
+ */ + Qcow2COWRegion cow_start; + + /** + * The COW Region between the area the guest actually writes to and the + * end of the last allocated cluster. + */ + Qcow2COWRegion cow_end; + + /** Pointer to next L2Meta of the same write request */ + struct QCowL2Meta *next; + QLIST_ENTRY(QCowL2Meta) next_in_flight; } QCowL2Meta; @@ -223,17 +292,32 @@ enum { #define REFT_OFFSET_MASK 0xffffffffffffff00ULL +static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset) +{ + return offset & ~(s->cluster_size - 1); +} + +static inline int64_t offset_into_cluster(BDRVQcowState *s, int64_t offset) +{ + return offset & (s->cluster_size - 1); +} + static inline int size_to_clusters(BDRVQcowState *s, int64_t size) { return (size + (s->cluster_size - 1)) >> s->cluster_bits; } -static inline int size_to_l1(BDRVQcowState *s, int64_t size) +static inline int64_t size_to_l1(BDRVQcowState *s, int64_t size) { int shift = s->cluster_bits + s->l2_bits; return (size + (1ULL << shift) - 1) >> shift; } +static inline int offset_to_l2_index(BDRVQcowState *s, int64_t offset) +{ + return (offset >> s->cluster_bits) & (s->l2_size - 1); +} + static inline int64_t align_offset(int64_t offset, int n) { offset = (offset + n - 1) & ~(n - 1); @@ -259,11 +343,24 @@ static inline bool qcow2_need_accurate_refcounts(BDRVQcowState *s) return !(s->incompatible_features & QCOW2_INCOMPAT_DIRTY); } +static inline uint64_t l2meta_cow_start(QCowL2Meta *m) +{ + return m->offset + m->cow_start.offset; +} + +static inline uint64_t l2meta_cow_end(QCowL2Meta *m) +{ + return m->offset + m->cow_end.offset + + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS); +} + // FIXME Need qcow2_ prefix to global functions /* qcow2.c functions */ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, int64_t sector_num, int nb_sectors); + +int qcow2_mark_dirty(BlockDriverState *bs); int qcow2_update_header(BlockDriverState *bs); /* qcow2-refcount.c functions */ @@ -275,9 +372,10 @@ int 
qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, int nb_clusters); int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size); void qcow2_free_clusters(BlockDriverState *bs, - int64_t offset, int64_t size); -void qcow2_free_any_clusters(BlockDriverState *bs, - uint64_t cluster_offset, int nb_clusters); + int64_t offset, int64_t size, + enum qcow2_discard_type type); +void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, + int nb_clusters, enum qcow2_discard_type type); int qcow2_update_snapshot_refcount(BlockDriverState *bs, int64_t l1_table_offset, int l1_size, int addend); @@ -285,8 +383,11 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix); +void qcow2_process_discards(BlockDriverState *bs, int ret); + /* qcow2-cluster.c functions */ -int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size); +int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, + bool exact_size); void qcow2_l2_cache_reset(BlockDriverState *bs); int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, @@ -297,7 +398,7 @@ void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, int *num, uint64_t *cluster_offset); int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, - int n_start, int n_end, int *num, QCowL2Meta *m); + int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m); uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset, int compressed_size); diff --git a/block/qed-table.c b/block/qed-table.c index de845ec3d..76d2dcccf 100644 --- a/block/qed-table.c +++ b/block/qed-table.c @@ -13,7 +13,7 @@ */ #include "trace.h" -#include "qemu_socket.h" /* for EINPROGRESS on Windows */ +#include "qemu/sockets.h" /* for 
EINPROGRESS on Windows */ #include "qed.h" typedef struct { diff --git a/block/qed.c b/block/qed.c index 0b5374a20..f767b0528 100644 --- a/block/qed.c +++ b/block/qed.c @@ -12,11 +12,11 @@ * */ -#include "qemu-timer.h" +#include "qemu/timer.h" #include "trace.h" #include "qed.h" -#include "qerror.h" -#include "migration.h" +#include "qapi/qmp/qerror.h" +#include "migration/migration.h" static void qed_aio_cancel(BlockDriverAIOCB *blockacb) { @@ -373,7 +373,7 @@ static void bdrv_qed_rebind(BlockDriverState *bs) s->bs = bs; } -static int bdrv_qed_open(BlockDriverState *bs, int flags) +static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags) { BDRVQEDState *s = bs->opaque; QEDHeader le_header; @@ -390,7 +390,7 @@ static int bdrv_qed_open(BlockDriverState *bs, int flags) qed_header_le_to_cpu(&le_header, &s->header); if (s->header.magic != QED_MAGIC) { - return -EINVAL; + return -EMEDIUMTYPE; } if (s->header.features & ~QED_FEATURE_MASK) { /* image uses unsupported feature bits */ @@ -558,7 +558,7 @@ static int qed_create(const char *filename, uint32_t cluster_size, return ret; } - ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR | BDRV_O_CACHE_WB); + ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB); if (ret < 0) { return ret; } @@ -1526,7 +1526,7 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs) bdrv_qed_close(bs); memset(s, 0, sizeof(BDRVQEDState)); - bdrv_qed_open(bs, bs->open_flags); + bdrv_qed_open(bs, NULL, bs->open_flags); } static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result, @@ -1574,6 +1574,7 @@ static BlockDriver bdrv_qed = { .bdrv_close = bdrv_qed_close, .bdrv_reopen_prepare = bdrv_qed_reopen_prepare, .bdrv_create = bdrv_qed_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_co_is_allocated = bdrv_qed_co_is_allocated, .bdrv_make_empty = bdrv_qed_make_empty, .bdrv_aio_readv = bdrv_qed_aio_readv, diff --git a/block/qed.h b/block/qed.h index a063bf70a..2b4ddedf3 100644 --- 
a/block/qed.h +++ b/block/qed.h @@ -15,7 +15,7 @@ #ifndef BLOCK_QED_H #define BLOCK_QED_H -#include "block_int.h" +#include "block/block_int.h" /* The layout of a QED file is as follows: * diff --git a/block/raw-aio.h b/block/raw-aio.h index e77f36114..c61f1595d 100644 --- a/block/raw-aio.h +++ b/block/raw-aio.h @@ -20,11 +20,14 @@ #define QEMU_AIO_WRITE 0x0002 #define QEMU_AIO_IOCTL 0x0004 #define QEMU_AIO_FLUSH 0x0008 +#define QEMU_AIO_DISCARD 0x0010 #define QEMU_AIO_TYPE_MASK \ - (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH) + (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \ + QEMU_AIO_DISCARD) /* AIO flags */ #define QEMU_AIO_MISALIGNED 0x1000 +#define QEMU_AIO_BLKDEV 0x2000 /* linux-aio.c - Linux native implementation */ diff --git a/block/raw-posix.c b/block/raw-posix.c index 550c81f22..ba721d3f5 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -22,14 +22,13 @@ * THE SOFTWARE. */ #include "qemu-common.h" -#include "qemu-timer.h" -#include "qemu-char.h" -#include "qemu-log.h" -#include "block_int.h" -#include "module.h" +#include "qemu/timer.h" +#include "qemu/log.h" +#include "block/block_int.h" +#include "qemu/module.h" #include "trace.h" -#include "thread-pool.h" -#include "iov.h" +#include "block/thread-pool.h" +#include "qemu/iov.h" #include "raw-aio.h" #if defined(__APPLE__) && (__MACH__) @@ -60,6 +59,9 @@ #ifdef CONFIG_FIEMAP #include <linux/fiemap.h> #endif +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE +#include <linux/falloc.h> +#endif #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) #include <sys/disk.h> #include <sys/cdio.h> @@ -139,6 +141,7 @@ typedef struct BDRVRawState { #ifdef CONFIG_XFS bool is_xfs : 1; #endif + bool has_discard : 1; } BDRVRawState; typedef struct BDRVRawReopenState { @@ -160,7 +163,7 @@ typedef struct RawPosixAIOData { void *aio_ioctl_buf; }; int aio_niov; - size_t aio_nbytes; + uint64_t aio_nbytes; #define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */ off_t aio_offset; int aio_type; @@ 
-259,15 +262,42 @@ error: } #endif -static int raw_open_common(BlockDriverState *bs, const char *filename, +static QemuOptsList raw_runtime_opts = { + .name = "raw", + .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "File name of the image", + }, + { /* end of list */ } + }, +}; + +static int raw_open_common(BlockDriverState *bs, QDict *options, int bdrv_flags, int open_flags) { BDRVRawState *s = bs->opaque; + QemuOpts *opts; + Error *local_err = NULL; + const char *filename; int fd, ret; + opts = qemu_opts_create_nofail(&raw_runtime_opts); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + qerror_report_err(local_err); + error_free(local_err); + ret = -EINVAL; + goto fail; + } + + filename = qemu_opt_get(opts, "filename"); + ret = raw_normalize_devicepath(&filename); if (ret != 0) { - return ret; + goto fail; } s->open_flags = open_flags; @@ -277,34 +307,40 @@ static int raw_open_common(BlockDriverState *bs, const char *filename, fd = qemu_open(filename, s->open_flags, 0644); if (fd < 0) { ret = -errno; - if (ret == -EROFS) + if (ret == -EROFS) { ret = -EACCES; - return ret; + } + goto fail; } s->fd = fd; #ifdef CONFIG_LINUX_AIO if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) { qemu_close(fd); - return -errno; + ret = -errno; + goto fail; } #endif + s->has_discard = 1; #ifdef CONFIG_XFS if (platform_test_xfs_fd(s->fd)) { s->is_xfs = 1; } #endif - return 0; + ret = 0; +fail: + qemu_opts_del(opts); + return ret; } -static int raw_open(BlockDriverState *bs, const char *filename, int flags) +static int raw_open(BlockDriverState *bs, QDict *options, int flags) { BDRVRawState *s = bs->opaque; s->type = FTYPE_FILE; - return raw_open_common(bs, filename, flags, 0); + return raw_open_common(bs, options, flags, 0); } static int raw_reopen_prepare(BDRVReopenState *state, @@ -341,11 +377,20 @@ static int raw_reopen_prepare(BDRVReopenState *state, 
raw_s->fd = -1; - int fcntl_flags = O_APPEND | O_ASYNC | O_NONBLOCK; + int fcntl_flags = O_APPEND | O_NONBLOCK; #ifdef O_NOATIME fcntl_flags |= O_NOATIME; #endif +#ifdef O_ASYNC + /* Not all operating systems have O_ASYNC, and those that don't + * will not let us track the state into raw_s->open_flags (typically + * you achieve the same effect with an ioctl, for example I_SETSIG + * on Solaris). But we do not use O_ASYNC, so that's fine. + */ + assert((s->open_flags & O_ASYNC) == 0); +#endif + if ((raw_s->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) { /* dup the original fd */ /* TODO: use qemu fcntl wrapper */ @@ -431,22 +476,6 @@ static void raw_reopen_abort(BDRVReopenState *state) #endif */ -/* - * Check if all memory in this vector is sector aligned. - */ -static int qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) -{ - int i; - - for (i = 0; i < qiov->niov; i++) { - if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) { - return 0; - } - } - - return 1; -} - static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) { int ret; @@ -456,15 +485,7 @@ static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) return -errno; } - /* - * This looks weird, but the aio code only considers a request - * successful if it has written the full number of bytes. - * - * Now we overload aio_nbytes as aio_ioctl_cmd for the ioctl command, - * so in fact we return the ioctl command here to make posix_aio_read() - * happy.. 
- */ - return aiocb->aio_nbytes; + return 0; } static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb) @@ -643,6 +664,72 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) return nbytes; } +#ifdef CONFIG_XFS +static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) +{ + struct xfs_flock64 fl; + + memset(&fl, 0, sizeof(fl)); + fl.l_whence = SEEK_SET; + fl.l_start = offset; + fl.l_len = bytes; + + if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) { + DEBUG_BLOCK_PRINT("cannot punch hole (%s)\n", strerror(errno)); + return -errno; + } + + return 0; +} +#endif + +static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) +{ + int ret = -EOPNOTSUPP; + BDRVRawState *s = aiocb->bs->opaque; + + if (s->has_discard == 0) { + return 0; + } + + if (aiocb->aio_type & QEMU_AIO_BLKDEV) { +#ifdef BLKDISCARD + do { + uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; + if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) { + return 0; + } + } while (errno == EINTR); + + ret = -errno; +#endif + } else { +#ifdef CONFIG_XFS + if (s->is_xfs) { + return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes); + } +#endif + +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE + do { + if (fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + aiocb->aio_offset, aiocb->aio_nbytes) == 0) { + return 0; + } + } while (errno == EINTR); + + ret = -errno; +#endif + } + + if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP || + ret == -ENOTTY) { + s->has_discard = 0; + ret = 0; + } + return ret; +} + static int aio_worker(void *arg) { RawPosixAIOData *aiocb = arg; @@ -677,6 +764,9 @@ static int aio_worker(void *arg) case QEMU_AIO_IOCTL: ret = handle_aiocb_ioctl(aiocb); break; + case QEMU_AIO_DISCARD: + ret = handle_aiocb_discard(aiocb); + break; default: fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); ret = -EINVAL; @@ -692,6 +782,7 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd, BlockDriverCompletionFunc *cb, void 
*opaque, int type) { RawPosixAIOData *acb = g_slice_new(RawPosixAIOData); + ThreadPool *pool; acb->bs = bs; acb->aio_type = type; @@ -705,23 +796,8 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd, acb->aio_offset = sector_num * 512; trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); - return thread_pool_submit_aio(aio_worker, acb, cb, opaque); -} - -static BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd, - unsigned long int req, void *buf, - BlockDriverCompletionFunc *cb, void *opaque) -{ - RawPosixAIOData *acb = g_slice_new(RawPosixAIOData); - - acb->bs = bs; - acb->aio_type = QEMU_AIO_IOCTL; - acb->aio_fildes = fd; - acb->aio_offset = 0; - acb->aio_ioctl_buf = buf; - acb->aio_ioctl_cmd = req; - - return thread_pool_submit_aio(aio_worker, acb, cb, opaque); + pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); + return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); } static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs, @@ -739,7 +815,7 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs, * driver that it needs to copy the buffer. 
*/ if ((bs->open_flags & BDRV_O_NOCACHE)) { - if (!qiov_is_aligned(bs, qiov)) { + if (!bdrv_qiov_is_aligned(bs, qiov)) { type |= QEMU_AIO_MISALIGNED; #ifdef CONFIG_LINUX_AIO } else if (s->use_aio) { @@ -1093,37 +1169,14 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, } } -#ifdef CONFIG_XFS -static int xfs_discard(BDRVRawState *s, int64_t sector_num, int nb_sectors) -{ - struct xfs_flock64 fl; - - memset(&fl, 0, sizeof(fl)); - fl.l_whence = SEEK_SET; - fl.l_start = sector_num << 9; - fl.l_len = (int64_t)nb_sectors << 9; - - if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) { - DEBUG_BLOCK_PRINT("cannot punch hole (%s)\n", strerror(errno)); - return -errno; - } - - return 0; -} -#endif - -static coroutine_fn int raw_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) +static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) { -#ifdef CONFIG_XFS BDRVRawState *s = bs->opaque; - if (s->is_xfs) { - return xfs_discard(s, sector_num, nb_sectors); - } -#endif - - return 0; + return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors, + cb, opaque, QEMU_AIO_DISCARD); } static QEMUOptionParameter raw_create_options[] = { @@ -1146,12 +1199,13 @@ static BlockDriver bdrv_file = { .bdrv_reopen_abort = raw_reopen_abort, .bdrv_close = raw_close, .bdrv_create = raw_create, - .bdrv_co_discard = raw_co_discard, + .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_co_is_allocated = raw_co_is_allocated, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, + .bdrv_aio_discard = raw_aio_discard, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -1238,9 +1292,44 @@ static int hdev_probe_device(const char *filename) return 0; } -static int hdev_open(BlockDriverState *bs, const char *filename, int flags) +static int check_hdev_writable(BDRVRawState *s) +{ +#if defined(BLKROGET) + /* 
Linux block devices can be configured "read-only" using blockdev(8). + * This is independent of device node permissions and therefore open(2) + * with O_RDWR succeeds. Actual writes fail with EPERM. + * + * bdrv_open() is supposed to fail if the disk is read-only. Explicitly + * check for read-only block devices so that Linux block devices behave + * properly. + */ + struct stat st; + int readonly = 0; + + if (fstat(s->fd, &st)) { + return -errno; + } + + if (!S_ISBLK(st.st_mode)) { + return 0; + } + + if (ioctl(s->fd, BLKROGET, &readonly) < 0) { + return -errno; + } + + if (readonly) { + return -EACCES; + } +#endif /* defined(BLKROGET) */ + return 0; +} + +static int hdev_open(BlockDriverState *bs, QDict *options, int flags) { BDRVRawState *s = bs->opaque; + int ret; + const char *filename = qdict_get_str(options, "filename"); #if defined(__APPLE__) && defined(__MACH__) if (strstart(filename, "/dev/cdrom", NULL)) { @@ -1262,6 +1351,7 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags) qemu_close(fd); } filename = bsdPath; + qdict_put(options, "filename", qstring_from_str(filename)); } if ( mediaIterator ) @@ -1281,7 +1371,20 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags) } #endif - return raw_open_common(bs, filename, flags, 0); + ret = raw_open_common(bs, options, flags, 0); + if (ret < 0) { + return ret; + } + + if (flags & BDRV_O_RDWR) { + ret = check_hdev_writable(s); + if (ret < 0) { + raw_close(bs); + return ret; + } + } + + return ret; } #if defined(__linux__) @@ -1346,10 +1449,21 @@ static BlockDriverAIOCB *hdev_aio_ioctl(BlockDriverState *bs, BlockDriverCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; + RawPosixAIOData *acb; + ThreadPool *pool; if (fd_open(bs) < 0) return NULL; - return paio_ioctl(bs, s->fd, req, buf, cb, opaque); + + acb = g_slice_new(RawPosixAIOData); + acb->bs = bs; + acb->aio_type = QEMU_AIO_IOCTL; + acb->aio_fildes = s->fd; + acb->aio_offset = 0; + 
acb->aio_ioctl_buf = buf; + acb->aio_ioctl_cmd = req; + pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); + return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); } #elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) @@ -1371,6 +1485,19 @@ static int fd_open(BlockDriverState *bs) #endif /* !linux && !FreeBSD */ +static coroutine_fn BlockDriverAIOCB *hdev_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BDRVRawState *s = bs->opaque; + + if (fd_open(bs) < 0) { + return NULL; + } + return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors, + cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); +} + static int hdev_create(const char *filename, QEMUOptionParameter *options) { int fd; @@ -1401,11 +1528,6 @@ static int hdev_create(const char *filename, QEMUOptionParameter *options) return ret; } -static int hdev_has_zero_init(BlockDriverState *bs) -{ - return 0; -} - static BlockDriver bdrv_host_device = { .format_name = "host_device", .protocol_name = "host_device", @@ -1418,11 +1540,11 @@ static BlockDriver bdrv_host_device = { .bdrv_reopen_abort = raw_reopen_abort, .bdrv_create = hdev_create, .create_options = raw_create_options, - .bdrv_has_zero_init = hdev_has_zero_init, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, + .bdrv_aio_discard = hdev_aio_discard, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -1437,7 +1559,7 @@ static BlockDriver bdrv_host_device = { }; #ifdef __linux__ -static int floppy_open(BlockDriverState *bs, const char *filename, int flags) +static int floppy_open(BlockDriverState *bs, QDict *options, int flags) { BDRVRawState *s = bs->opaque; int ret; @@ -1445,7 +1567,7 @@ static int floppy_open(BlockDriverState *bs, const char *filename, int flags) s->type = FTYPE_FD; /* open will not fail even if no floppy is inserted, so add O_NONBLOCK */ - ret = raw_open_common(bs, filename, flags, 
O_NONBLOCK); + ret = raw_open_common(bs, options, flags, O_NONBLOCK); if (ret) return ret; @@ -1542,7 +1664,6 @@ static BlockDriver bdrv_host_floppy = { .bdrv_reopen_abort = raw_reopen_abort, .bdrv_create = hdev_create, .create_options = raw_create_options, - .bdrv_has_zero_init = hdev_has_zero_init, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, @@ -1559,14 +1680,14 @@ static BlockDriver bdrv_host_floppy = { .bdrv_eject = floppy_eject, }; -static int cdrom_open(BlockDriverState *bs, const char *filename, int flags) +static int cdrom_open(BlockDriverState *bs, QDict *options, int flags) { BDRVRawState *s = bs->opaque; s->type = FTYPE_CD; /* open will not fail even if no CD is inserted, so add O_NONBLOCK */ - return raw_open_common(bs, filename, flags, O_NONBLOCK); + return raw_open_common(bs, options, flags, O_NONBLOCK); } static int cdrom_probe_device(const char *filename) @@ -1644,7 +1765,6 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_reopen_abort = raw_reopen_abort, .bdrv_create = hdev_create, .create_options = raw_create_options, - .bdrv_has_zero_init = hdev_has_zero_init, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, @@ -1667,14 +1787,14 @@ static BlockDriver bdrv_host_cdrom = { #endif /* __linux__ */ #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) -static int cdrom_open(BlockDriverState *bs, const char *filename, int flags) +static int cdrom_open(BlockDriverState *bs, QDict *options, int flags) { BDRVRawState *s = bs->opaque; int ret; s->type = FTYPE_CD; - ret = raw_open_common(bs, filename, flags, 0); + ret = raw_open_common(bs, options, flags, 0); if (ret) return ret; @@ -1766,7 +1886,6 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_reopen_abort = raw_reopen_abort, .bdrv_create = hdev_create, .create_options = raw_create_options, - .bdrv_has_zero_init = hdev_has_zero_init, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, @@ -1784,6 +1903,40 @@ static BlockDriver bdrv_host_cdrom = { 
}; #endif /* __FreeBSD__ */ +#ifdef CONFIG_LINUX_AIO +/** + * Return the file descriptor for Linux AIO + * + * This function is a layering violation and should be removed when it becomes + * possible to call the block layer outside the global mutex. It allows the + * caller to hijack the file descriptor so I/O can be performed outside the + * block layer. + */ +int raw_get_aio_fd(BlockDriverState *bs) +{ + BDRVRawState *s; + + if (!bs->drv) { + return -ENOMEDIUM; + } + + if (bs->drv == bdrv_find_format("raw")) { + bs = bs->file; + } + + /* raw-posix has several protocols so just check for raw_aio_readv */ + if (bs->drv->bdrv_aio_readv != raw_aio_readv) { + return -ENOTSUP; + } + + s = bs->opaque; + if (!s->use_aio) { + return -ENOTSUP; + } + return s->fd; +} +#endif /* CONFIG_LINUX_AIO */ + static void bdrv_file_init(void) { /* diff --git a/block/raw-win32.c b/block/raw-win32.c index 0c05c58c5..9b5b2af4e 100644 --- a/block/raw-win32.c +++ b/block/raw-win32.c @@ -22,13 +22,13 @@ * THE SOFTWARE. 
*/ #include "qemu-common.h" -#include "qemu-timer.h" -#include "block_int.h" -#include "module.h" +#include "qemu/timer.h" +#include "block/block_int.h" +#include "qemu/module.h" #include "raw-aio.h" #include "trace.h" -#include "thread-pool.h" -#include "iov.h" +#include "block/thread-pool.h" +#include "qemu/iov.h" #include <windows.h> #include <winioctl.h> @@ -144,6 +144,7 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile, BlockDriverCompletionFunc *cb, void *opaque, int type) { RawWin32AIOData *acb = g_slice_new(RawWin32AIOData); + ThreadPool *pool; acb->bs = bs; acb->hfile = hfile; @@ -157,7 +158,8 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile, acb->aio_offset = sector_num * 512; trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); - return thread_pool_submit_aio(aio_worker, acb, cb, opaque); + pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); + return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); } int qemu_ftruncate64(int fd, int64_t length) @@ -219,20 +221,49 @@ static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped) } } -static int raw_open(BlockDriverState *bs, const char *filename, int flags) +static QemuOptsList raw_runtime_opts = { + .name = "raw", + .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "File name of the image", + }, + { /* end of list */ } + }, +}; + +static int raw_open(BlockDriverState *bs, QDict *options, int flags) { BDRVRawState *s = bs->opaque; int access_flags; DWORD overlapped; + QemuOpts *opts; + Error *local_err = NULL; + const char *filename; + int ret; s->type = FTYPE_FILE; + opts = qemu_opts_create_nofail(&raw_runtime_opts); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + qerror_report_err(local_err); + error_free(local_err); + ret = -EINVAL; + goto fail; + } + + filename = qemu_opt_get(opts, "filename"); + 
raw_parse_flags(flags, &access_flags, &overlapped); - + if ((flags & BDRV_O_NATIVE_AIO) && aio == NULL) { aio = win32_aio_init(); if (aio == NULL) { - return -EINVAL; + ret = -EINVAL; + goto fail; } } @@ -242,20 +273,27 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) if (s->hfile == INVALID_HANDLE_VALUE) { int err = GetLastError(); - if (err == ERROR_ACCESS_DENIED) - return -EACCES; - return -EINVAL; + if (err == ERROR_ACCESS_DENIED) { + ret = -EACCES; + } else { + ret = -EINVAL; + } + goto fail; } if (flags & BDRV_O_NATIVE_AIO) { - int ret = win32_aio_attach(aio, s->hfile); + ret = win32_aio_attach(aio, s->hfile); if (ret < 0) { CloseHandle(s->hfile); - return ret; + goto fail; } s->aio = aio; } - return 0; + + ret = 0; +fail: + qemu_opts_del(opts); + return ret; } static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs, @@ -303,13 +341,24 @@ static int raw_truncate(BlockDriverState *bs, int64_t offset) { BDRVRawState *s = bs->opaque; LONG low, high; + DWORD dwPtrLow; low = offset; high = offset >> 32; - if (!SetFilePointer(s->hfile, low, &high, FILE_BEGIN)) - return -EIO; - if (!SetEndOfFile(s->hfile)) + + /* + * An error has occurred if the return value is INVALID_SET_FILE_POINTER + * and GetLastError doesn't return NO_ERROR. 
+ */ + dwPtrLow = SetFilePointer(s->hfile, low, &high, FILE_BEGIN); + if (dwPtrLow == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) { + fprintf(stderr, "SetFilePointer error: %lu\n", GetLastError()); + return -EIO; + } + if (SetEndOfFile(s->hfile) == 0) { + fprintf(stderr, "SetEndOfFile error: %lu\n", GetLastError()); return -EIO; + } return 0; } @@ -410,6 +459,7 @@ static BlockDriver bdrv_file = { .bdrv_file_open = raw_open, .bdrv_close = raw_close, .bdrv_create = raw_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, @@ -481,12 +531,13 @@ static int hdev_probe_device(const char *filename) return 0; } -static int hdev_open(BlockDriverState *bs, const char *filename, int flags) +static int hdev_open(BlockDriverState *bs, QDict *options, int flags) { BDRVRawState *s = bs->opaque; int access_flags, create_flags; DWORD overlapped; char device_name[64]; + const char *filename = qdict_get_str(options, "filename"); if (strstart(filename, "/dev/cdrom", NULL)) { if (find_cdrom(device_name, sizeof(device_name)) < 0) @@ -520,11 +571,6 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags) return 0; } -static int hdev_has_zero_init(BlockDriverState *bs) -{ - return 0; -} - static BlockDriver bdrv_host_device = { .format_name = "host_device", .protocol_name = "host_device", @@ -532,7 +578,6 @@ static BlockDriver bdrv_host_device = { .bdrv_probe_device = hdev_probe_device, .bdrv_file_open = hdev_open, .bdrv_close = raw_close, - .bdrv_has_zero_init = hdev_has_zero_init, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, diff --git a/block/raw.c b/block/raw.c index 253e949b8..47518253f 100644 --- a/block/raw.c +++ b/block/raw.c @@ -1,9 +1,32 @@ +/* + * Block driver for RAW format + * + * Copyright (c) 2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files 
(the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ #include "qemu-common.h" -#include "block_int.h" -#include "module.h" +#include "block/block_int.h" +#include "qemu/module.h" -static int raw_open(BlockDriverState *bs, int flags) +static int raw_open(BlockDriverState *bs, QDict *options, int flags) { bs->sg = bs->file->sg; return 0; @@ -42,6 +65,13 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, return bdrv_co_is_allocated(bs->file, sector_num, nb_sectors, pnum); } +static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors) +{ + return bdrv_co_write_zeroes(bs->file, sector_num, nb_sectors); +} + static int64_t raw_getlength(BlockDriverState *bs) { return bdrv_getlength(bs->file); @@ -114,6 +144,11 @@ static int raw_has_zero_init(BlockDriverState *bs) return bdrv_has_zero_init(bs->file); } +static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + return bdrv_get_info(bs->file, bdi); +} + static BlockDriver bdrv_raw = { .format_name = "raw", @@ -128,10 +163,12 @@ static BlockDriver bdrv_raw = { 
.bdrv_co_readv = raw_co_readv, .bdrv_co_writev = raw_co_writev, .bdrv_co_is_allocated = raw_co_is_allocated, + .bdrv_co_write_zeroes = raw_co_write_zeroes, .bdrv_co_discard = raw_co_discard, .bdrv_probe = raw_probe, .bdrv_getlength = raw_getlength, + .bdrv_get_info = raw_get_info, .bdrv_truncate = raw_truncate, .bdrv_is_inserted = raw_is_inserted, diff --git a/block/rbd.c b/block/rbd.c index f3becc7a8..cb7175121 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -14,8 +14,8 @@ #include <inttypes.h> #include "qemu-common.h" -#include "qemu-error.h" -#include "block_int.h" +#include "qemu/error-report.h" +#include "block/block_int.h" #include <rbd/librbd.h> @@ -63,7 +63,8 @@ typedef enum { RBD_AIO_READ, RBD_AIO_WRITE, - RBD_AIO_DISCARD + RBD_AIO_DISCARD, + RBD_AIO_FLUSH } RBDAIOCmd; typedef struct RBDAIOCB { @@ -77,6 +78,7 @@ typedef struct RBDAIOCB { int error; struct BDRVRBDState *s; int cancelled; + int status; } RBDAIOCB; typedef struct RADOSCB { @@ -376,16 +378,9 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb) RBDAIOCB *acb = rcb->acb; int64_t r; - if (acb->cancelled) { - qemu_vfree(acb->bounce); - qemu_aio_release(acb); - goto done; - } - r = rcb->ret; - if (acb->cmd == RBD_AIO_WRITE || - acb->cmd == RBD_AIO_DISCARD) { + if (acb->cmd != RBD_AIO_READ) { if (r < 0) { acb->ret = r; acb->error = 1; @@ -409,7 +404,6 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb) /* Note that acb->bh can be NULL in case where the aio was cancelled */ acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb); qemu_bh_schedule(acb->bh); -done: g_free(rcb); } @@ -447,7 +441,21 @@ static int qemu_rbd_aio_flush_cb(void *opaque) return (s->qemu_aio_count > 0); } -static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags) +/* TODO Convert to fine grained options */ +static QemuOptsList runtime_opts = { + .name = "rbd", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "Specification of the rbd image", + }, 
+ { /* end of list */ } + }, +}; + +static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags) { BDRVRBDState *s = bs->opaque; char pool[RBD_MAX_POOL_NAME_SIZE]; @@ -455,20 +463,35 @@ static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags) char conf[RBD_MAX_CONF_SIZE]; char clientname_buf[RBD_MAX_CONF_SIZE]; char *clientname; + QemuOpts *opts; + Error *local_err = NULL; + const char *filename; int r; + opts = qemu_opts_create_nofail(&runtime_opts); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + qerror_report_err(local_err); + error_free(local_err); + qemu_opts_del(opts); + return -EINVAL; + } + + filename = qemu_opt_get(opts, "filename"); + if (qemu_rbd_parsename(filename, pool, sizeof(pool), snap_buf, sizeof(snap_buf), s->name, sizeof(s->name), conf, sizeof(conf)) < 0) { - return -EINVAL; + r = -EINVAL; + goto failed_opts; } clientname = qemu_rbd_parse_clientname(conf, clientname_buf); r = rados_create(&s->cluster, clientname); if (r < 0) { error_report("error initializing"); - return r; + goto failed_opts; } s->snap = NULL; @@ -534,6 +557,7 @@ static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags) NULL, qemu_rbd_aio_flush_cb, s); + qemu_opts_del(opts); return 0; failed: @@ -543,6 +567,8 @@ failed_open: failed_shutdown: rados_shutdown(s->cluster); g_free(s->snap); +failed_opts: + qemu_opts_del(opts); return r; } @@ -568,6 +594,12 @@ static void qemu_rbd_aio_cancel(BlockDriverAIOCB *blockacb) { RBDAIOCB *acb = (RBDAIOCB *) blockacb; acb->cancelled = 1; + + while (acb->status == -EINPROGRESS) { + qemu_aio_wait(); + } + + qemu_aio_release(acb); } static const AIOCBInfo rbd_aiocb_info = { @@ -639,8 +671,11 @@ static void rbd_aio_bh_cb(void *opaque) acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 
0 : acb->ret)); qemu_bh_delete(acb->bh); acb->bh = NULL; + acb->status = 0; - qemu_aio_release(acb); + if (!acb->cancelled) { + qemu_aio_release(acb); + } } static int rbd_aio_discard_wrapper(rbd_image_t image, @@ -655,6 +690,16 @@ static int rbd_aio_discard_wrapper(rbd_image_t image, #endif } +static int rbd_aio_flush_wrapper(rbd_image_t image, + rbd_completion_t comp) +{ +#ifdef LIBRBD_SUPPORTS_AIO_FLUSH + return rbd_aio_flush(image, comp); +#else + return -ENOTSUP; +#endif +} + static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, @@ -675,7 +720,7 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs, acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque); acb->cmd = cmd; acb->qiov = qiov; - if (cmd == RBD_AIO_DISCARD) { + if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) { acb->bounce = NULL; } else { acb->bounce = qemu_blockalign(bs, qiov->size); @@ -685,6 +730,7 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs, acb->s = s; acb->cancelled = 0; acb->bh = NULL; + acb->status = -EINPROGRESS; if (cmd == RBD_AIO_WRITE) { qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); @@ -718,6 +764,9 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs, case RBD_AIO_DISCARD: r = rbd_aio_discard_wrapper(s->image, off, size, c); break; + case RBD_AIO_FLUSH: + r = rbd_aio_flush_wrapper(s->image, c); + break; default: r = -EINVAL; } @@ -757,6 +806,16 @@ static BlockDriverAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs, RBD_AIO_WRITE); } +#ifdef LIBRBD_SUPPORTS_AIO_FLUSH +static BlockDriverAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH); +} + +#else + static int qemu_rbd_co_flush(BlockDriverState *bs) { #if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 1) @@ -767,6 +826,7 @@ static int qemu_rbd_co_flush(BlockDriverState *bs) return 0; #endif } +#endif static int 
qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi) { @@ -936,6 +996,7 @@ static BlockDriver bdrv_rbd = { .bdrv_file_open = qemu_rbd_open, .bdrv_close = qemu_rbd_close, .bdrv_create = qemu_rbd_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_get_info = qemu_rbd_getinfo, .create_options = qemu_rbd_create_options, .bdrv_getlength = qemu_rbd_getlength, @@ -944,7 +1005,12 @@ static BlockDriver bdrv_rbd = { .bdrv_aio_readv = qemu_rbd_aio_readv, .bdrv_aio_writev = qemu_rbd_aio_writev, + +#ifdef LIBRBD_SUPPORTS_AIO_FLUSH + .bdrv_aio_flush = qemu_rbd_aio_flush, +#else .bdrv_co_flush_to_disk = qemu_rbd_co_flush, +#endif #ifdef LIBRBD_SUPPORTS_DISCARD .bdrv_aio_discard = qemu_rbd_aio_discard, diff --git a/block/sheepdog.c b/block/sheepdog.c index a48f58cfe..afe053376 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -13,19 +13,22 @@ */ #include "qemu-common.h" -#include "qemu-error.h" -#include "qemu_socket.h" -#include "block_int.h" -#include "bitops.h" +#include "qemu/uri.h" +#include "qemu/error-report.h" +#include "qemu/sockets.h" +#include "block/block_int.h" +#include "qemu/bitops.h" #define SD_PROTO_VER 0x01 #define SD_DEFAULT_ADDR "localhost" -#define SD_DEFAULT_PORT "7000" +#define SD_DEFAULT_PORT 7000 #define SD_OP_CREATE_AND_WRITE_OBJ 0x01 #define SD_OP_READ_OBJ 0x02 #define SD_OP_WRITE_OBJ 0x03 +/* 0x04 is used internally by Sheepdog */ +#define SD_OP_DISCARD_OBJ 0x05 #define SD_OP_NEW_VDI 0x11 #define SD_OP_LOCK_VDI 0x12 @@ -33,10 +36,12 @@ #define SD_OP_GET_VDI_INFO 0x14 #define SD_OP_READ_VDIS 0x15 #define SD_OP_FLUSH_VDI 0x16 +#define SD_OP_DEL_VDI 0x17 #define SD_FLAG_CMD_WRITE 0x01 #define SD_FLAG_CMD_COW 0x02 -#define SD_FLAG_CMD_CACHE 0x04 +#define SD_FLAG_CMD_CACHE 0x04 /* Writeback mode for cache */ +#define SD_FLAG_CMD_DIRECT 0x08 /* Don't use cache */ #define SD_RES_SUCCESS 0x00 /* Success */ #define SD_RES_UNKNOWN 0x01 /* Unknown error */ @@ -63,6 +68,8 @@ #define SD_RES_WAIT_FOR_FORMAT 0x16 /* Waiting for a format operation 
*/ #define SD_RES_WAIT_FOR_JOIN 0x17 /* Waiting for other nodes joining */ #define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */ +#define SD_RES_HALT 0x19 /* Sheepdog is stopped serving IO request */ +#define SD_RES_READONLY 0x1A /* Object is read-only */ /* * Object ID rules @@ -84,7 +91,6 @@ #define SD_NR_VDIS (1U << 24) #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22) #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS) -#define SECTOR_SIZE 512 #define SD_INODE_SIZE (sizeof(SheepdogInode)) #define CURRENT_VDI_ID 0 @@ -144,7 +150,7 @@ typedef struct SheepdogVdiReq { uint32_t id; uint32_t data_length; uint64_t vdi_size; - uint32_t base_vdi_id; + uint32_t vdi_id; uint32_t copies; uint32_t snapid; uint32_t pad[3]; @@ -236,14 +242,14 @@ static inline bool is_snapshot(struct SheepdogInode *inode) return !!inode->snap_ctime; } -#undef dprintf +#undef DPRINTF #ifdef DEBUG_SDOG -#define dprintf(fmt, args...) \ +#define DPRINTF(fmt, args...) \ do { \ fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \ } while (0) #else -#define dprintf(fmt, args...) +#define DPRINTF(fmt, args...) 
#endif typedef struct SheepdogAIOCB SheepdogAIOCB; @@ -265,6 +271,8 @@ typedef struct AIOReq { enum AIOCBState { AIOCB_WRITE_UDATA, AIOCB_READ_UDATA, + AIOCB_FLUSH_CACHE, + AIOCB_DISCARD_OBJ, }; struct SheepdogAIOCB { @@ -293,12 +301,12 @@ typedef struct BDRVSheepdogState { char name[SD_MAX_VDI_LEN]; bool is_snapshot; - bool cache_enabled; + uint32_t cache_flags; + bool discard_supported; - char *addr; - char *port; + char *host_spec; + bool is_unix; int fd; - int flush_fd; CoMutex lock; Coroutine *co_send; @@ -342,6 +350,8 @@ static const char * sd_strerror(int err) {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"}, {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"}, {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"}, + {SD_RES_HALT, "Sheepdog is stopped serving IO request"}, + {SD_RES_READONLY, "Object is read-only"}, }; for (i = 0; i < ARRAY_SIZE(errors); ++i) { @@ -426,12 +436,11 @@ static const AIOCBInfo sd_aiocb_info = { }; static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, - int64_t sector_num, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) + int64_t sector_num, int nb_sectors) { SheepdogAIOCB *acb; - acb = qemu_aio_get(&sd_aiocb_info, bs, cb, opaque); + acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL); acb->qiov = qiov; @@ -446,56 +455,31 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, return acb; } -static int connect_to_sdog(const char *addr, const char *port) +static int connect_to_sdog(BDRVSheepdogState *s) { - char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV]; - int fd, ret; - struct addrinfo hints, *res, *res0; - - if (!addr) { - addr = SD_DEFAULT_ADDR; - port = SD_DEFAULT_PORT; - } - - memset(&hints, 0, sizeof(hints)); - hints.ai_socktype = SOCK_STREAM; - - ret = getaddrinfo(addr, port, &hints, &res0); - if (ret) { - error_report("unable to get address info %s, %s", - addr, strerror(errno)); - return -errno; - } - - for (res = res0; 
res; res = res->ai_next) { - ret = getnameinfo(res->ai_addr, res->ai_addrlen, hbuf, sizeof(hbuf), - sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV); - if (ret) { - continue; - } + int fd; + Error *err = NULL; - fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol); - if (fd < 0) { - continue; - } + if (s->is_unix) { + fd = unix_connect(s->host_spec, &err); + } else { + fd = inet_connect(s->host_spec, &err); - reconnect: - ret = connect(fd, res->ai_addr, res->ai_addrlen); - if (ret < 0) { - if (errno == EINTR) { - goto reconnect; + if (err == NULL) { + int ret = socket_set_nodelay(fd); + if (ret < 0) { + error_report("%s", strerror(errno)); } - close(fd); - break; } + } - dprintf("connected to %s:%s\n", addr, port); - goto success; + if (err != NULL) { + qerror_report_err(err); + error_free(err); + } else { + qemu_set_nonblock(fd); } - fd = -errno; - error_report("failed connect to %s:%s", addr, port); -success: - freeaddrinfo(res0); + return fd; } @@ -525,6 +509,13 @@ static void restart_co_req(void *opaque) qemu_coroutine_enter(co, NULL); } +static int have_co_req(void *opaque) +{ + /* this handler is set only when there is a pending request, so + * always returns 1. 
*/ + return 1; +} + typedef struct SheepdogReqCo { int sockfd; SheepdogReq *hdr; @@ -547,15 +538,14 @@ static coroutine_fn void do_co_req(void *opaque) unsigned int *rlen = srco->rlen; co = qemu_coroutine_self(); - qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, NULL, co); + qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, have_co_req, co); - socket_set_block(sockfd); ret = send_co_req(sockfd, hdr, data, wlen); if (ret < 0) { goto out; } - qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, NULL, co); + qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, have_co_req, co); ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr)); if (ret < sizeof(*hdr)) { @@ -578,8 +568,9 @@ static coroutine_fn void do_co_req(void *opaque) } ret = 0; out: + /* there is at most one request for this sockfd, so it is safe to + * set each handler to NULL. */ qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL, NULL); - socket_set_nonblock(sockfd); srco->ret = ret; srco->finished = true; @@ -615,6 +606,7 @@ static int do_req(int sockfd, SheepdogReq *hdr, void *data, static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, struct iovec *iov, int niov, bool create, enum AIOCBState aiocb_type); +static int coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req); static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid) @@ -671,7 +663,7 @@ static void coroutine_fn aio_read_response(void *opaque) int ret; AIOReq *aio_req = NULL; SheepdogAIOCB *acb; - unsigned long idx; + uint64_t idx; if (QLIST_EMPTY(&s->inflight_aio_head)) { goto out; @@ -714,16 +706,17 @@ static void coroutine_fn aio_read_response(void *opaque) * and max_dirty_data_idx are changed to include updated * index between them. 
*/ - s->inode.data_vdi_id[idx] = s->inode.vdi_id; - s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx); - s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx); - + if (rsp.result == SD_RES_SUCCESS) { + s->inode.data_vdi_id[idx] = s->inode.vdi_id; + s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx); + s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx); + } /* * Some requests may be blocked because simultaneous * create requests are not allowed, so we search the * pending requests here. */ - send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx)); + send_pending_req(s, aio_req->oid); } break; case AIOCB_READ_UDATA: @@ -734,11 +727,43 @@ static void coroutine_fn aio_read_response(void *opaque) goto out; } break; + case AIOCB_FLUSH_CACHE: + if (rsp.result == SD_RES_INVALID_PARMS) { + DPRINTF("disable cache since the server doesn't support it\n"); + s->cache_flags = SD_FLAG_CMD_DIRECT; + rsp.result = SD_RES_SUCCESS; + } + break; + case AIOCB_DISCARD_OBJ: + switch (rsp.result) { + case SD_RES_INVALID_PARMS: + error_report("sheep(%s) doesn't support discard command", + s->host_spec); + rsp.result = SD_RES_SUCCESS; + s->discard_supported = false; + break; + case SD_RES_SUCCESS: + idx = data_oid_to_idx(aio_req->oid); + s->inode.data_vdi_id[idx] = 0; + break; + default: + break; + } } - if (rsp.result != SD_RES_SUCCESS) { + switch (rsp.result) { + case SD_RES_SUCCESS: + break; + case SD_RES_READONLY: + ret = resend_aioreq(s, aio_req); + if (ret == SD_RES_SUCCESS) { + goto out; + } + /* fall through */ + default: acb->ret = -EIO; error_report("%s", sd_strerror(rsp.result)); + break; } free_aio_req(s, aio_req); @@ -779,15 +804,6 @@ static int aio_flush_request(void *opaque) !QLIST_EMPTY(&s->pending_aio_head); } -static int set_nodelay(int fd) -{ - int ret, opt; - - opt = 1; - ret = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)&opt, sizeof(opt)); - return ret; -} - /* * Return a socket discriptor to read/write objects. 
* @@ -796,29 +812,86 @@ static int set_nodelay(int fd) */ static int get_sheep_fd(BDRVSheepdogState *s) { - int ret, fd; + int fd; - fd = connect_to_sdog(s->addr, s->port); + fd = connect_to_sdog(s); if (fd < 0) { - error_report("%s", strerror(errno)); return fd; } - socket_set_nonblock(fd); + qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request, s); + return fd; +} - ret = set_nodelay(fd); - if (ret) { - error_report("%s", strerror(errno)); - closesocket(fd); - return -errno; +static int sd_parse_uri(BDRVSheepdogState *s, const char *filename, + char *vdi, uint32_t *snapid, char *tag) +{ + URI *uri; + QueryParams *qp = NULL; + int ret = 0; + + uri = uri_parse(filename); + if (!uri) { + return -EINVAL; } - qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request, s); - return fd; + /* transport */ + if (!strcmp(uri->scheme, "sheepdog")) { + s->is_unix = false; + } else if (!strcmp(uri->scheme, "sheepdog+tcp")) { + s->is_unix = false; + } else if (!strcmp(uri->scheme, "sheepdog+unix")) { + s->is_unix = true; + } else { + ret = -EINVAL; + goto out; + } + + if (uri->path == NULL || !strcmp(uri->path, "/")) { + ret = -EINVAL; + goto out; + } + pstrcpy(vdi, SD_MAX_VDI_LEN, uri->path + 1); + + qp = query_params_parse(uri->query); + if (qp->n > 1 || (s->is_unix && !qp->n) || (!s->is_unix && qp->n)) { + ret = -EINVAL; + goto out; + } + + if (s->is_unix) { + /* sheepdog+unix:///vdiname?socket=path */ + if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) { + ret = -EINVAL; + goto out; + } + s->host_spec = g_strdup(qp->p[0].value); + } else { + /* sheepdog[+tcp]://[host:port]/vdiname */ + s->host_spec = g_strdup_printf("%s:%d", uri->server ?: SD_DEFAULT_ADDR, + uri->port ?: SD_DEFAULT_PORT); + } + + /* snapshot tag */ + if (uri->fragment) { + *snapid = strtoul(uri->fragment, NULL, 10); + if (*snapid == 0) { + pstrcpy(tag, SD_MAX_VDI_TAG_LEN, uri->fragment); + } + } else { + *snapid = CURRENT_VDI_ID; /* search current vdi */ + } + 
+out: + if (qp) { + query_params_free(qp); + } + uri_free(uri); + return ret; } /* - * Parse a filename + * Parse a filename (old syntax) * * filename must be one of the following formats: * 1. [vdiname] @@ -837,9 +910,11 @@ static int get_sheep_fd(BDRVSheepdogState *s) static int parse_vdiname(BDRVSheepdogState *s, const char *filename, char *vdi, uint32_t *snapid, char *tag) { - char *p, *q; - int nr_sep; + char *p, *q, *uri; + const char *host_spec, *vdi_spec; + int nr_sep, ret; + strstart(filename, "sheepdog:", (const char **)&filename); p = q = g_strdup(filename); /* count the number of separators */ @@ -852,42 +927,37 @@ static int parse_vdiname(BDRVSheepdogState *s, const char *filename, } p = q; - /* use the first two tokens as hostname and port number. */ + /* use the first two tokens as host_spec. */ if (nr_sep >= 2) { - s->addr = p; + host_spec = p; p = strchr(p, ':'); - *p++ = '\0'; - - s->port = p; + p++; p = strchr(p, ':'); *p++ = '\0'; } else { - s->addr = NULL; - s->port = 0; + host_spec = ""; } - pstrcpy(vdi, SD_MAX_VDI_LEN, p); + vdi_spec = p; - p = strchr(vdi, ':'); + p = strchr(vdi_spec, ':'); if (p) { - *p++ = '\0'; - *snapid = strtoul(p, NULL, 10); - if (*snapid == 0) { - pstrcpy(tag, SD_MAX_VDI_TAG_LEN, p); - } - } else { - *snapid = CURRENT_VDI_ID; /* search current vdi */ + *p++ = '#'; } - if (s->addr == NULL) { - g_free(q); - } + uri = g_strdup_printf("sheepdog://%s/%s", host_spec, vdi_spec); - return 0; + ret = sd_parse_uri(s, uri, vdi, snapid, tag); + + g_free(q); + g_free(uri); + + return ret; } -static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid, - char *tag, uint32_t *vid, int for_snapshot) +static int find_vdi_name(BDRVSheepdogState *s, const char *filename, + uint32_t snapid, const char *tag, uint32_t *vid, + bool lock) { int ret, fd; SheepdogVdiReq hdr; @@ -895,7 +965,7 @@ static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid, unsigned int wlen, rlen = 0; char buf[SD_MAX_VDI_LEN 
+ SD_MAX_VDI_TAG_LEN]; - fd = connect_to_sdog(s->addr, s->port); + fd = connect_to_sdog(s); if (fd < 0) { return fd; } @@ -908,10 +978,10 @@ static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid, strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN); memset(&hdr, 0, sizeof(hdr)); - if (for_snapshot) { - hdr.opcode = SD_OP_GET_VDI_INFO; - } else { + if (lock) { hdr.opcode = SD_OP_LOCK_VDI; + } else { + hdr.opcode = SD_OP_GET_VDI_INFO; } wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN; hdr.proto_ver = SD_PROTO_VER; @@ -948,7 +1018,7 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, { int nr_copies = s->inode.nr_copies; SheepdogObjReq hdr; - unsigned int wlen; + unsigned int wlen = 0; int ret; uint64_t oid = aio_req->oid; unsigned int datalen = aio_req->data_len; @@ -962,22 +1032,30 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, memset(&hdr, 0, sizeof(hdr)); - if (aiocb_type == AIOCB_READ_UDATA) { - wlen = 0; + switch (aiocb_type) { + case AIOCB_FLUSH_CACHE: + hdr.opcode = SD_OP_FLUSH_VDI; + break; + case AIOCB_READ_UDATA: hdr.opcode = SD_OP_READ_OBJ; hdr.flags = flags; - } else if (create) { - wlen = datalen; - hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ; - hdr.flags = SD_FLAG_CMD_WRITE | flags; - } else { + break; + case AIOCB_WRITE_UDATA: + if (create) { + hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ; + } else { + hdr.opcode = SD_OP_WRITE_OBJ; + } wlen = datalen; - hdr.opcode = SD_OP_WRITE_OBJ; hdr.flags = SD_FLAG_CMD_WRITE | flags; + break; + case AIOCB_DISCARD_OBJ: + hdr.opcode = SD_OP_DISCARD_OBJ; + break; } - if (s->cache_enabled) { - hdr.flags |= SD_FLAG_CMD_CACHE; + if (s->cache_flags) { + hdr.flags |= s->cache_flags; } hdr.oid = oid; @@ -1022,7 +1100,7 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, static int read_write_object(int fd, char *buf, uint64_t oid, int copies, unsigned int datalen, uint64_t offset, - bool write, bool create, bool 
cache) + bool write, bool create, uint32_t cache_flags) { SheepdogObjReq hdr; SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr; @@ -1046,9 +1124,7 @@ static int read_write_object(int fd, char *buf, uint64_t oid, int copies, hdr.opcode = SD_OP_READ_OBJ; } - if (cache) { - hdr.flags |= SD_FLAG_CMD_CACHE; - } + hdr.flags |= cache_flags; hdr.oid = oid; hdr.data_length = datalen; @@ -1071,21 +1147,119 @@ static int read_write_object(int fd, char *buf, uint64_t oid, int copies, } static int read_object(int fd, char *buf, uint64_t oid, int copies, - unsigned int datalen, uint64_t offset, bool cache) + unsigned int datalen, uint64_t offset, + uint32_t cache_flags) { return read_write_object(fd, buf, oid, copies, datalen, offset, false, - false, cache); + false, cache_flags); } static int write_object(int fd, char *buf, uint64_t oid, int copies, unsigned int datalen, uint64_t offset, bool create, - bool cache) + uint32_t cache_flags) { return read_write_object(fd, buf, oid, copies, datalen, offset, true, - create, cache); + create, cache_flags); } -static int sd_open(BlockDriverState *bs, const char *filename, int flags) +/* update inode with the latest state */ +static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag) +{ + SheepdogInode *inode; + int ret = 0, fd; + uint32_t vid = 0; + + fd = connect_to_sdog(s); + if (fd < 0) { + return -EIO; + } + + inode = g_malloc(sizeof(s->inode)); + + ret = find_vdi_name(s, s->name, snapid, tag, &vid, false); + if (ret) { + goto out; + } + + ret = read_object(fd, (char *)inode, vid_to_vdi_oid(vid), + s->inode.nr_copies, sizeof(*inode), 0, s->cache_flags); + if (ret < 0) { + goto out; + } + + if (inode->vdi_id != s->inode.vdi_id) { + memcpy(&s->inode, inode, sizeof(s->inode)); + } + +out: + g_free(inode); + closesocket(fd); + + return ret; +} + +static int coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req) +{ + SheepdogAIOCB *acb = aio_req->aiocb; + bool create = false; + int ret; + + ret = 
reload_inode(s, 0, ""); + if (ret < 0) { + return ret; + } + + aio_req->oid = vid_to_data_oid(s->inode.vdi_id, + data_oid_to_idx(aio_req->oid)); + + /* check whether this request becomes a CoW one */ + if (acb->aiocb_type == AIOCB_WRITE_UDATA) { + int idx = data_oid_to_idx(aio_req->oid); + AIOReq *areq; + + if (s->inode.data_vdi_id[idx] == 0) { + create = true; + goto out; + } + if (is_data_obj_writable(&s->inode, idx)) { + goto out; + } + + /* link to the pending list if there is another CoW request to + * the same object */ + QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) { + if (areq != aio_req && areq->oid == aio_req->oid) { + DPRINTF("simultaneous CoW to %" PRIx64 "\n", aio_req->oid); + QLIST_REMOVE(aio_req, aio_siblings); + QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings); + return SD_RES_SUCCESS; + } + } + + aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx); + aio_req->flags |= SD_FLAG_CMD_COW; + create = true; + } +out: + return add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, + create, acb->aiocb_type); +} + +/* TODO Convert to fine grained options */ +static QemuOptsList runtime_opts = { + .name = "sheepdog", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "URL to the sheepdog image", + }, + { /* end of list */ } + }, +}; + +static int sd_open(BlockDriverState *bs, QDict *options, int flags) { int ret, fd; uint32_t vid = 0; @@ -1093,8 +1267,20 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags) char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN]; uint32_t snapid; char *buf = NULL; + QemuOpts *opts; + Error *local_err = NULL; + const char *filename; + + opts = qemu_opts_create_nofail(&runtime_opts); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + qerror_report_err(local_err); + error_free(local_err); + ret = -EINVAL; + goto out; + } - strstart(filename, 
"sheepdog:", (const char **)&filename); + filename = qemu_opt_get(opts, "filename"); QLIST_INIT(&s->inflight_aio_head); QLIST_INIT(&s->pending_aio_head); @@ -1102,8 +1288,13 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags) memset(vdi, 0, sizeof(vdi)); memset(tag, 0, sizeof(tag)); - if (parse_vdiname(s, filename, vdi, &snapid, tag) < 0) { - ret = -EINVAL; + + if (strstr(filename, "://")) { + ret = sd_parse_uri(s, filename, vdi, &snapid, tag); + } else { + ret = parse_vdiname(s, filename, vdi, &snapid, tag); + } + if (ret < 0) { goto out; } s->fd = get_sheep_fd(s); @@ -1112,34 +1303,35 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags) goto out; } - ret = find_vdi_name(s, vdi, snapid, tag, &vid, 0); + ret = find_vdi_name(s, vdi, snapid, tag, &vid, true); if (ret) { goto out; } - s->cache_enabled = true; - s->flush_fd = connect_to_sdog(s->addr, s->port); - if (s->flush_fd < 0) { - error_report("failed to connect"); - ret = s->flush_fd; - goto out; + /* + * QEMU block layer emulates writethrough cache as 'writeback + flush', so + * we always set SD_FLAG_CMD_CACHE (writeback cache) as default. 
+ */ + s->cache_flags = SD_FLAG_CMD_CACHE; + if (flags & BDRV_O_NOCACHE) { + s->cache_flags = SD_FLAG_CMD_DIRECT; } + s->discard_supported = true; if (snapid || tag[0] != '\0') { - dprintf("%" PRIx32 " snapshot inode was open.\n", vid); + DPRINTF("%" PRIx32 " snapshot inode was open.\n", vid); s->is_snapshot = true; } - fd = connect_to_sdog(s->addr, s->port); + fd = connect_to_sdog(s); if (fd < 0) { - error_report("failed to connect"); ret = fd; goto out; } buf = g_malloc(SD_INODE_SIZE); ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0, - s->cache_enabled); + s->cache_flags); closesocket(fd); @@ -1151,9 +1343,10 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags) s->min_dirty_data_idx = UINT32_MAX; s->max_dirty_data_idx = 0; - bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE; + bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE; pstrcpy(s->name, sizeof(s->name), vdi); qemu_co_mutex_init(&s->lock); + qemu_opts_del(opts); g_free(buf); return 0; out: @@ -1161,13 +1354,13 @@ out: if (s->fd >= 0) { closesocket(s->fd); } + qemu_opts_del(opts); g_free(buf); return ret; } -static int do_sd_create(char *filename, int64_t vdi_size, - uint32_t base_vid, uint32_t *vdi_id, int snapshot, - const char *addr, const char *port) +static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size, + uint32_t base_vid, uint32_t *vdi_id, int snapshot) { SheepdogVdiReq hdr; SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; @@ -1175,7 +1368,7 @@ static int do_sd_create(char *filename, int64_t vdi_size, unsigned int wlen, rlen = 0; char buf[SD_MAX_VDI_LEN]; - fd = connect_to_sdog(addr, port); + fd = connect_to_sdog(s); if (fd < 0) { return fd; } @@ -1188,7 +1381,7 @@ static int do_sd_create(char *filename, int64_t vdi_size, memset(&hdr, 0, sizeof(hdr)); hdr.opcode = SD_OP_NEW_VDI; - hdr.base_vdi_id = base_vid; + hdr.vdi_id = base_vid; wlen = SD_MAX_VDI_LEN; @@ -1226,7 +1419,7 @@ static int sd_prealloc(const char *filename) void 
*buf = g_malloc0(SD_DATA_OBJ_SIZE); int ret; - ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR); + ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR); if (ret < 0) { goto out; } @@ -1271,17 +1464,17 @@ static int sd_create(const char *filename, QEMUOptionParameter *options) char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN]; uint32_t snapid; bool prealloc = false; - const char *vdiname; s = g_malloc0(sizeof(BDRVSheepdogState)); - strstart(filename, "sheepdog:", &vdiname); - memset(vdi, 0, sizeof(vdi)); memset(tag, 0, sizeof(tag)); - if (parse_vdiname(s, vdiname, vdi, &snapid, tag) < 0) { - error_report("invalid filename"); - ret = -EINVAL; + if (strstr(filename, "://")) { + ret = sd_parse_uri(s, filename, vdi, &snapid, tag); + } else { + ret = parse_vdiname(s, filename, vdi, &snapid, tag); + } + if (ret < 0) { goto out; } @@ -1317,14 +1510,14 @@ static int sd_create(const char *filename, QEMUOptionParameter *options) BlockDriver *drv; /* Currently, only Sheepdog backing image is supported. 
*/ - drv = bdrv_find_protocol(backing_file); + drv = bdrv_find_protocol(backing_file, true); if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) { error_report("backing_file must be a sheepdog image"); ret = -EINVAL; goto out; } - ret = bdrv_file_open(&bs, backing_file, 0); + ret = bdrv_file_open(&bs, backing_file, NULL, 0); if (ret < 0) { goto out; } @@ -1342,7 +1535,7 @@ static int sd_create(const char *filename, QEMUOptionParameter *options) bdrv_delete(bs); } - ret = do_sd_create(vdi, vdi_size, base_vid, &vid, 0, s->addr, s->port); + ret = do_sd_create(s, vdi, vdi_size, base_vid, &vid, 0); if (!prealloc || ret) { goto out; } @@ -1361,9 +1554,9 @@ static void sd_close(BlockDriverState *bs) unsigned int wlen, rlen = 0; int fd, ret; - dprintf("%s\n", s->name); + DPRINTF("%s\n", s->name); - fd = connect_to_sdog(s->addr, s->port); + fd = connect_to_sdog(s); if (fd < 0) { return; } @@ -1371,6 +1564,7 @@ static void sd_close(BlockDriverState *bs) memset(&hdr, 0, sizeof(hdr)); hdr.opcode = SD_OP_RELEASE_VDI; + hdr.vdi_id = s->inode.vdi_id; wlen = strlen(s->name) + 1; hdr.data_length = wlen; hdr.flags = SD_FLAG_CMD_WRITE; @@ -1386,10 +1580,7 @@ static void sd_close(BlockDriverState *bs) qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL); closesocket(s->fd); - if (s->cache_enabled) { - closesocket(s->flush_fd); - } - g_free(s->addr); + g_free(s->host_spec); } static int64_t sd_getlength(BlockDriverState *bs) @@ -1413,7 +1604,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset) return -EINVAL; } - fd = connect_to_sdog(s->addr, s->port); + fd = connect_to_sdog(s); if (fd < 0) { return fd; } @@ -1422,7 +1613,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset) datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id); s->inode.vdi_size = offset; ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id), - s->inode.nr_copies, datalen, 0, false, s->cache_enabled); + s->inode.nr_copies, datalen, 0, false, s->cache_flags); 
close(fd); if (ret < 0) { @@ -1476,6 +1667,43 @@ out: sd_finish_aiocb(acb); } +/* Delete current working VDI on the snapshot chain */ +static bool sd_delete(BDRVSheepdogState *s) +{ + unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0; + SheepdogVdiReq hdr = { + .opcode = SD_OP_DEL_VDI, + .vdi_id = s->inode.vdi_id, + .data_length = wlen, + .flags = SD_FLAG_CMD_WRITE, + }; + SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; + int fd, ret; + + fd = connect_to_sdog(s); + if (fd < 0) { + return false; + } + + ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen); + closesocket(fd); + if (ret) { + return false; + } + switch (rsp->result) { + case SD_RES_NO_VDI: + error_report("%s was already deleted", s->name); + /* fall through */ + case SD_RES_SUCCESS: + break; + default: + error_report("%s, %s", sd_strerror(rsp->result), s->name); + return false; + } + + return true; +} + /* * Create a writable VDI from a snapshot */ @@ -1484,28 +1712,34 @@ static int sd_create_branch(BDRVSheepdogState *s) int ret, fd; uint32_t vid; char *buf; + bool deleted; - dprintf("%" PRIx32 " is snapshot.\n", s->inode.vdi_id); + DPRINTF("%" PRIx32 " is snapshot.\n", s->inode.vdi_id); buf = g_malloc(SD_INODE_SIZE); - ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, 1, - s->addr, s->port); + /* + * Even If deletion fails, we will just create extra snapshot based on + * the workding VDI which was supposed to be deleted. So no need to + * false bail out. 
+ */ + deleted = sd_delete(s); + ret = do_sd_create(s, s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, + !deleted); if (ret) { goto out; } - dprintf("%" PRIx32 " is created.\n", vid); + DPRINTF("%" PRIx32 " is created.\n", vid); - fd = connect_to_sdog(s->addr, s->port); + fd = connect_to_sdog(s); if (fd < 0) { - error_report("failed to connect"); ret = fd; goto out; } ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies, - SD_INODE_SIZE, 0, s->cache_enabled); + SD_INODE_SIZE, 0, s->cache_flags); closesocket(fd); @@ -1517,7 +1751,7 @@ static int sd_create_branch(BDRVSheepdogState *s) s->is_snapshot = false; ret = 0; - dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id); + DPRINTF("%" PRIx32 " was newly created.\n", s->inode.vdi_id); out: g_free(buf); @@ -1541,10 +1775,10 @@ static int coroutine_fn sd_co_rw_vector(void *p) { SheepdogAIOCB *acb = p; int ret = 0; - unsigned long len, done = 0, total = acb->nb_sectors * SECTOR_SIZE; - unsigned long idx = acb->sector_num * SECTOR_SIZE / SD_DATA_OBJ_SIZE; + unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE; + unsigned long idx = acb->sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE; uint64_t oid; - uint64_t offset = (acb->sector_num * SECTOR_SIZE) % SD_DATA_OBJ_SIZE; + uint64_t offset = (acb->sector_num * BDRV_SECTOR_SIZE) % SD_DATA_OBJ_SIZE; BDRVSheepdogState *s = acb->common.bs->opaque; SheepdogInode *inode = &s->inode; AIOReq *aio_req; @@ -1593,16 +1827,25 @@ static int coroutine_fn sd_co_rw_vector(void *p) flags = SD_FLAG_CMD_COW; } break; + case AIOCB_DISCARD_OBJ: + /* + * We discard the object only when the whole object is + * 1) allocated 2) trimmed. Otherwise, simply skip it. 
+ */ + if (len != SD_DATA_OBJ_SIZE || inode->data_vdi_id[idx] == 0) { + goto done; + } + break; default: break; } if (create) { - dprintf("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n", + DPRINTF("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n", inode->vdi_id, oid, vid_to_data_oid(inode->data_vdi_id[idx], idx), idx); oid = vid_to_data_oid(inode->vdi_id, idx); - dprintf("new oid %" PRIx64 "\n", oid); + DPRINTF("new oid %" PRIx64 "\n", oid); } aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done); @@ -1654,14 +1897,14 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, int ret; if (bs->growable && sector_num + nb_sectors > bs->total_sectors) { - ret = sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE); + ret = sd_truncate(bs, (sector_num + nb_sectors) * BDRV_SECTOR_SIZE); if (ret < 0) { return ret; } bs->total_sectors = sector_num + nb_sectors; } - acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL); + acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors); acb->aio_done_func = sd_write_done; acb->aiocb_type = AIOCB_WRITE_UDATA; @@ -1682,7 +1925,7 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num, SheepdogAIOCB *acb; int ret; - acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL); + acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors); acb->aiocb_type = AIOCB_READ_UDATA; acb->aio_done_func = sd_finish_aiocb; @@ -1700,39 +1943,31 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num, static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs) { BDRVSheepdogState *s = bs->opaque; - SheepdogObjReq hdr = { 0 }; - SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr; - SheepdogInode *inode = &s->inode; + SheepdogAIOCB *acb; + AIOReq *aio_req; int ret; - unsigned int wlen = 0, rlen = 0; - if (!s->cache_enabled) { + if (s->cache_flags != SD_FLAG_CMD_CACHE) { return 0; } - hdr.opcode = SD_OP_FLUSH_VDI; - hdr.oid = 
vid_to_vdi_oid(inode->vdi_id); + acb = sd_aio_setup(bs, NULL, 0, 0); + acb->aiocb_type = AIOCB_FLUSH_CACHE; + acb->aio_done_func = sd_finish_aiocb; - ret = do_req(s->flush_fd, (SheepdogReq *)&hdr, NULL, &wlen, &rlen); - if (ret) { - error_report("failed to send a request to the sheep"); + aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), + 0, 0, 0, 0, 0); + QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); + ret = add_aio_request(s, aio_req, NULL, 0, false, acb->aiocb_type); + if (ret < 0) { + error_report("add_aio_request is failed"); + free_aio_req(s, aio_req); + qemu_aio_release(acb); return ret; } - if (rsp->result == SD_RES_INVALID_PARMS) { - dprintf("disable write cache since the server doesn't support it\n"); - - s->cache_enabled = false; - closesocket(s->flush_fd); - return 0; - } - - if (rsp->result != SD_RES_SUCCESS) { - error_report("%s", sd_strerror(rsp->result)); - return -EIO; - } - - return 0; + qemu_coroutine_yield(); + return acb->ret; } static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) @@ -1743,7 +1978,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) SheepdogInode *inode; unsigned int datalen; - dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " " + DPRINTF("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " " "is_snapshot %d\n", sn_info->name, sn_info->id_str, s->name, sn_info->vm_state_size, s->is_snapshot); @@ -1754,7 +1989,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) return -EINVAL; } - dprintf("%s %s\n", sn_info->name, sn_info->id_str); + DPRINTF("%s %s\n", sn_info->name, sn_info->id_str); s->inode.vm_state_size = sn_info->vm_state_size; s->inode.vm_clock_nsec = sn_info->vm_clock_nsec; @@ -1766,21 +2001,21 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id); /* refresh inode. 
*/ - fd = connect_to_sdog(s->addr, s->port); + fd = connect_to_sdog(s); if (fd < 0) { ret = fd; goto cleanup; } ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id), - s->inode.nr_copies, datalen, 0, false, s->cache_enabled); + s->inode.nr_copies, datalen, 0, false, s->cache_flags); if (ret < 0) { error_report("failed to write snapshot's inode."); goto cleanup; } - ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, 1, - s->addr, s->port); + ret = do_sd_create(s, s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, + 1); if (ret < 0) { error_report("failed to create inode for snapshot. %s", strerror(errno)); @@ -1790,7 +2025,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) inode = (SheepdogInode *)g_malloc(datalen); ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid), - s->inode.nr_copies, datalen, 0, s->cache_enabled); + s->inode.nr_copies, datalen, 0, s->cache_flags); if (ret < 0) { error_report("failed to read new inode info. 
%s", strerror(errno)); @@ -1798,7 +2033,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) } memcpy(&s->inode, inode, datalen); - dprintf("s->inode: name %s snap_id %x oid %x\n", + DPRINTF("s->inode: name %s snap_id %x oid %x\n", s->inode.name, s->inode.snap_id, s->inode.vdi_id); cleanup: @@ -1806,70 +2041,47 @@ cleanup: return ret; } +/* + * We implement rollback(loadvm) operation to the specified snapshot by + * 1) switch to the snapshot + * 2) rely on sd_create_branch to delete working VDI and + * 3) create a new working VDI based on the speicified snapshot + */ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) { BDRVSheepdogState *s = bs->opaque; BDRVSheepdogState *old_s; - char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN]; - char *buf = NULL; - uint32_t vid; + char tag[SD_MAX_VDI_TAG_LEN]; uint32_t snapid = 0; - int ret = 0, fd; + int ret = 0; old_s = g_malloc(sizeof(BDRVSheepdogState)); memcpy(old_s, s, sizeof(BDRVSheepdogState)); - pstrcpy(vdi, sizeof(vdi), s->name); - snapid = strtoul(snapshot_id, NULL, 10); if (snapid) { tag[0] = 0; } else { - pstrcpy(tag, sizeof(tag), s->name); + pstrcpy(tag, sizeof(tag), snapshot_id); } - ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1); + ret = reload_inode(s, snapid, tag); if (ret) { - error_report("Failed to find_vdi_name"); goto out; } - fd = connect_to_sdog(s->addr, s->port); - if (fd < 0) { - error_report("failed to connect"); - ret = fd; - goto out; - } - - buf = g_malloc(SD_INODE_SIZE); - ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies, - SD_INODE_SIZE, 0, s->cache_enabled); - - closesocket(fd); - + ret = sd_create_branch(s); if (ret) { goto out; } - memcpy(&s->inode, buf, sizeof(s->inode)); - - if (!s->inode.vm_state_size) { - error_report("Invalid snapshot"); - ret = -ENOENT; - goto out; - } - - s->is_snapshot = true; - - g_free(buf); g_free(old_s); return 0; out: /* recover bdrv_sd_state */ memcpy(s, old_s, 
sizeof(BDRVSheepdogState)); - g_free(buf); g_free(old_s); error_report("failed to open. recover old bdrv_sd_state."); @@ -1899,7 +2111,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) vdi_inuse = g_malloc(max); - fd = connect_to_sdog(s->addr, s->port); + fd = connect_to_sdog(s); if (fd < 0) { ret = fd; goto out; @@ -1926,9 +2138,8 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT); start_nr = hval & (SD_NR_VDIS - 1); - fd = connect_to_sdog(s->addr, s->port); + fd = connect_to_sdog(s); if (fd < 0) { - error_report("failed to connect"); ret = fd; goto out; } @@ -1941,7 +2152,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) /* we don't need to read entire object */ ret = read_object(fd, (char *)&inode, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0, - s->cache_enabled); + s->cache_flags); if (ret) { continue; @@ -1982,10 +2193,11 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data, int fd, ret = 0, remaining = size; unsigned int data_len; uint64_t vmstate_oid; - uint32_t vdi_index; uint64_t offset; + uint32_t vdi_index; + uint32_t vdi_id = load ? 
s->inode.parent_vdi_id : s->inode.vdi_id; - fd = connect_to_sdog(s->addr, s->port); + fd = connect_to_sdog(s); if (fd < 0) { return fd; } @@ -1996,17 +2208,17 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data, data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset); - vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index); + vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index); create = (offset == 0); if (load) { ret = read_object(fd, (char *)data, vmstate_oid, s->inode.nr_copies, data_len, offset, - s->cache_enabled); + s->cache_flags); } else { ret = write_object(fd, (char *)data, vmstate_oid, s->inode.nr_copies, data_len, offset, create, - s->cache_enabled); + s->cache_flags); } if (ret < 0) { @@ -2024,12 +2236,19 @@ cleanup: return ret; } -static int sd_save_vmstate(BlockDriverState *bs, const uint8_t *data, - int64_t pos, int size) +static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t pos) { BDRVSheepdogState *s = bs->opaque; + void *buf; + int ret; + + buf = qemu_blockalign(bs, qiov->size); + qemu_iovec_to_buf(qiov, 0, buf, qiov->size); + ret = do_load_save_vmstate(s, (uint8_t *) buf, pos, qiov->size, 0); + qemu_vfree(buf); - return do_load_save_vmstate(s, (uint8_t *)data, pos, size, 0); + return ret; } static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data, @@ -2041,6 +2260,67 @@ static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data, } +static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + SheepdogAIOCB *acb; + QEMUIOVector dummy; + BDRVSheepdogState *s = bs->opaque; + int ret; + + if (!s->discard_supported) { + return 0; + } + + acb = sd_aio_setup(bs, &dummy, sector_num, nb_sectors); + acb->aiocb_type = AIOCB_DISCARD_OBJ; + acb->aio_done_func = sd_finish_aiocb; + + ret = sd_co_rw_vector(acb); + if (ret <= 0) { + qemu_aio_release(acb); + return ret; + } + + qemu_coroutine_yield(); + + return acb->ret; +} + +static coroutine_fn int 
+sd_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, + int *pnum) +{ + BDRVSheepdogState *s = bs->opaque; + SheepdogInode *inode = &s->inode; + unsigned long start = sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE, + end = DIV_ROUND_UP((sector_num + nb_sectors) * + BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE); + unsigned long idx; + int ret = 1; + + for (idx = start; idx < end; idx++) { + if (inode->data_vdi_id[idx] == 0) { + break; + } + } + if (idx == start) { + /* Get the longest length of unallocated sectors */ + ret = 0; + for (idx = start + 1; idx < end; idx++) { + if (inode->data_vdi_id[idx] != 0) { + break; + } + } + } + + *pnum = (idx - start) * SD_DATA_OBJ_SIZE / BDRV_SECTOR_SIZE; + if (*pnum > nb_sectors) { + *pnum = nb_sectors; + } + return ret; +} + static QEMUOptionParameter sd_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -2060,19 +2340,78 @@ static QEMUOptionParameter sd_create_options[] = { { NULL } }; -BlockDriver bdrv_sheepdog = { +static BlockDriver bdrv_sheepdog = { .format_name = "sheepdog", .protocol_name = "sheepdog", .instance_size = sizeof(BDRVSheepdogState), .bdrv_file_open = sd_open, .bdrv_close = sd_close, .bdrv_create = sd_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .bdrv_getlength = sd_getlength, + .bdrv_truncate = sd_truncate, + + .bdrv_co_readv = sd_co_readv, + .bdrv_co_writev = sd_co_writev, + .bdrv_co_flush_to_disk = sd_co_flush_to_disk, + .bdrv_co_discard = sd_co_discard, + .bdrv_co_is_allocated = sd_co_is_allocated, + + .bdrv_snapshot_create = sd_snapshot_create, + .bdrv_snapshot_goto = sd_snapshot_goto, + .bdrv_snapshot_delete = sd_snapshot_delete, + .bdrv_snapshot_list = sd_snapshot_list, + + .bdrv_save_vmstate = sd_save_vmstate, + .bdrv_load_vmstate = sd_load_vmstate, + + .create_options = sd_create_options, +}; + +static BlockDriver bdrv_sheepdog_tcp = { + .format_name = "sheepdog", + .protocol_name = "sheepdog+tcp", + .instance_size = sizeof(BDRVSheepdogState), + .bdrv_file_open = sd_open, + 
.bdrv_close = sd_close, + .bdrv_create = sd_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .bdrv_getlength = sd_getlength, + .bdrv_truncate = sd_truncate, + + .bdrv_co_readv = sd_co_readv, + .bdrv_co_writev = sd_co_writev, + .bdrv_co_flush_to_disk = sd_co_flush_to_disk, + .bdrv_co_discard = sd_co_discard, + .bdrv_co_is_allocated = sd_co_is_allocated, + + .bdrv_snapshot_create = sd_snapshot_create, + .bdrv_snapshot_goto = sd_snapshot_goto, + .bdrv_snapshot_delete = sd_snapshot_delete, + .bdrv_snapshot_list = sd_snapshot_list, + + .bdrv_save_vmstate = sd_save_vmstate, + .bdrv_load_vmstate = sd_load_vmstate, + + .create_options = sd_create_options, +}; + +static BlockDriver bdrv_sheepdog_unix = { + .format_name = "sheepdog", + .protocol_name = "sheepdog+unix", + .instance_size = sizeof(BDRVSheepdogState), + .bdrv_file_open = sd_open, + .bdrv_close = sd_close, + .bdrv_create = sd_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_getlength = sd_getlength, .bdrv_truncate = sd_truncate, .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, + .bdrv_co_discard = sd_co_discard, + .bdrv_co_is_allocated = sd_co_is_allocated, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, @@ -2088,5 +2427,7 @@ BlockDriver bdrv_sheepdog = { static void bdrv_sheepdog_init(void) { bdrv_register(&bdrv_sheepdog); + bdrv_register(&bdrv_sheepdog_tcp); + bdrv_register(&bdrv_sheepdog_unix); } block_init(bdrv_sheepdog_init); diff --git a/block/snapshot.c b/block/snapshot.c new file mode 100644 index 000000000..6c6d9deea --- /dev/null +++ b/block/snapshot.c @@ -0,0 +1,157 @@ +/* + * Block layer snapshot related functions + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without 
limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "block/snapshot.h" +#include "block/block_int.h" + +int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info, + const char *name) +{ + QEMUSnapshotInfo *sn_tab, *sn; + int nb_sns, i, ret; + + ret = -ENOENT; + nb_sns = bdrv_snapshot_list(bs, &sn_tab); + if (nb_sns < 0) { + return ret; + } + for (i = 0; i < nb_sns; i++) { + sn = &sn_tab[i]; + if (!strcmp(sn->id_str, name) || !strcmp(sn->name, name)) { + *sn_info = *sn; + ret = 0; + break; + } + } + g_free(sn_tab); + return ret; +} + +int bdrv_can_snapshot(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { + return 0; + } + + if (!drv->bdrv_snapshot_create) { + if (bs->file != NULL) { + return bdrv_can_snapshot(bs->file); + } + return 0; + } + + return 1; +} + +int bdrv_snapshot_create(BlockDriverState *bs, + QEMUSnapshotInfo *sn_info) +{ + BlockDriver *drv = bs->drv; + if (!drv) { + return -ENOMEDIUM; + } + if (drv->bdrv_snapshot_create) { + return drv->bdrv_snapshot_create(bs, sn_info); + } + if (bs->file) { + return bdrv_snapshot_create(bs->file, sn_info); + } + 
return -ENOTSUP; +} + +int bdrv_snapshot_goto(BlockDriverState *bs, + const char *snapshot_id) +{ + BlockDriver *drv = bs->drv; + int ret, open_ret; + + if (!drv) { + return -ENOMEDIUM; + } + if (drv->bdrv_snapshot_goto) { + return drv->bdrv_snapshot_goto(bs, snapshot_id); + } + + if (bs->file) { + drv->bdrv_close(bs); + ret = bdrv_snapshot_goto(bs->file, snapshot_id); + open_ret = drv->bdrv_open(bs, NULL, bs->open_flags); + if (open_ret < 0) { + bdrv_delete(bs->file); + bs->drv = NULL; + return open_ret; + } + return ret; + } + + return -ENOTSUP; +} + +int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +{ + BlockDriver *drv = bs->drv; + if (!drv) { + return -ENOMEDIUM; + } + if (drv->bdrv_snapshot_delete) { + return drv->bdrv_snapshot_delete(bs, snapshot_id); + } + if (bs->file) { + return bdrv_snapshot_delete(bs->file, snapshot_id); + } + return -ENOTSUP; +} + +int bdrv_snapshot_list(BlockDriverState *bs, + QEMUSnapshotInfo **psn_info) +{ + BlockDriver *drv = bs->drv; + if (!drv) { + return -ENOMEDIUM; + } + if (drv->bdrv_snapshot_list) { + return drv->bdrv_snapshot_list(bs, psn_info); + } + if (bs->file) { + return bdrv_snapshot_list(bs->file, psn_info); + } + return -ENOTSUP; +} + +int bdrv_snapshot_load_tmp(BlockDriverState *bs, + const char *snapshot_name) +{ + BlockDriver *drv = bs->drv; + if (!drv) { + return -ENOMEDIUM; + } + if (!bs->read_only) { + return -EINVAL; + } + if (drv->bdrv_snapshot_load_tmp) { + return drv->bdrv_snapshot_load_tmp(bs, snapshot_name); + } + return -ENOTSUP; +} diff --git a/block/ssh.c b/block/ssh.c new file mode 100644 index 000000000..d7e7bf8dd --- /dev/null +++ b/block/ssh.c @@ -0,0 +1,1076 @@ +/* + * Secure Shell (ssh) backend for QEMU. + * + * Copyright (C) 2013 Red Hat Inc., Richard W.M. 
Jones <rjones@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> + +#include <libssh2.h> +#include <libssh2_sftp.h> + +#include "block/block_int.h" +#include "qemu/sockets.h" +#include "qemu/uri.h" +#include "qapi/qmp/qint.h" + +/* DEBUG_SSH=1 enables the DPRINTF (debugging printf) statements in + * this block driver code. + * + * TRACE_LIBSSH2=<bitmask> enables tracing in libssh2 itself. Note + * that this requires that libssh2 was specially compiled with the + * `./configure --enable-debug' option, so most likely you will have + * to compile it yourself. The meaning of <bitmask> is described + * here: http://www.libssh2.org/libssh2_trace.html + */ +#define DEBUG_SSH 0 +#define TRACE_LIBSSH2 0 /* or try: LIBSSH2_TRACE_SFTP */ + +#define DPRINTF(fmt, ...) 
\ + do { \ + if (DEBUG_SSH) { \ + fprintf(stderr, "ssh: %-15s " fmt "\n", \ + __func__, ##__VA_ARGS__); \ + } \ + } while (0) + +typedef struct BDRVSSHState { + /* Coroutine. */ + CoMutex lock; + + /* SSH connection. */ + int sock; /* socket */ + LIBSSH2_SESSION *session; /* ssh session */ + LIBSSH2_SFTP *sftp; /* sftp session */ + LIBSSH2_SFTP_HANDLE *sftp_handle; /* sftp remote file handle */ + + /* See ssh_seek() function below. */ + int64_t offset; + bool offset_op_read; + + /* File attributes at open. We try to keep the .filesize field + * updated if it changes (eg by writing at the end of the file). + */ + LIBSSH2_SFTP_ATTRIBUTES attrs; + + /* Used to warn if 'flush' is not supported. */ + char *hostport; + bool unsafe_flush_warning; +} BDRVSSHState; + +static void ssh_state_init(BDRVSSHState *s) +{ + memset(s, 0, sizeof *s); + s->sock = -1; + s->offset = -1; + qemu_co_mutex_init(&s->lock); +} + +static void ssh_state_free(BDRVSSHState *s) +{ + g_free(s->hostport); + if (s->sftp_handle) { + libssh2_sftp_close(s->sftp_handle); + } + if (s->sftp) { + libssh2_sftp_shutdown(s->sftp); + } + if (s->session) { + libssh2_session_disconnect(s->session, + "from qemu ssh client: " + "user closed the connection"); + libssh2_session_free(s->session); + } + if (s->sock >= 0) { + close(s->sock); + } +} + +/* Wrappers around error_report which make sure to dump as much + * information from libssh2 as possible. + */ +static void GCC_FMT_ATTR(2, 3) +session_error_report(BDRVSSHState *s, const char *fs, ...) +{ + va_list args; + + va_start(args, fs); + error_vprintf(fs, args); + + if ((s)->session) { + char *ssh_err; + int ssh_err_code; + + libssh2_session_last_error((s)->session, &ssh_err, NULL, 0); + /* This is not an errno. See <libssh2.h>. 
*/ + ssh_err_code = libssh2_session_last_errno((s)->session); + + error_printf(": %s (libssh2 error code: %d)", ssh_err, ssh_err_code); + } + + va_end(args); + error_printf("\n"); +} + +static void GCC_FMT_ATTR(2, 3) +sftp_error_report(BDRVSSHState *s, const char *fs, ...) +{ + va_list args; + + va_start(args, fs); + error_vprintf(fs, args); + + if ((s)->sftp) { + char *ssh_err; + int ssh_err_code; + unsigned long sftp_err_code; + + libssh2_session_last_error((s)->session, &ssh_err, NULL, 0); + /* This is not an errno. See <libssh2.h>. */ + ssh_err_code = libssh2_session_last_errno((s)->session); + /* See <libssh2_sftp.h>. */ + sftp_err_code = libssh2_sftp_last_error((s)->sftp); + + error_printf(": %s (libssh2 error code: %d, sftp error code: %lu)", + ssh_err, ssh_err_code, sftp_err_code); + } + + va_end(args); + error_printf("\n"); +} + +static int parse_uri(const char *filename, QDict *options, Error **errp) +{ + URI *uri = NULL; + QueryParams *qp = NULL; + int i; + + uri = uri_parse(filename); + if (!uri) { + return -EINVAL; + } + + if (strcmp(uri->scheme, "ssh") != 0) { + error_setg(errp, "URI scheme must be 'ssh'"); + goto err; + } + + if (!uri->server || strcmp(uri->server, "") == 0) { + error_setg(errp, "missing hostname in URI"); + goto err; + } + + if (!uri->path || strcmp(uri->path, "") == 0) { + error_setg(errp, "missing remote path in URI"); + goto err; + } + + qp = query_params_parse(uri->query); + if (!qp) { + error_setg(errp, "could not parse query parameters"); + goto err; + } + + if(uri->user && strcmp(uri->user, "") != 0) { + qdict_put(options, "user", qstring_from_str(uri->user)); + } + + qdict_put(options, "host", qstring_from_str(uri->server)); + + if (uri->port) { + qdict_put(options, "port", qint_from_int(uri->port)); + } + + qdict_put(options, "path", qstring_from_str(uri->path)); + + /* Pick out any query parameters that we understand, and ignore + * the rest. 
+ */ + for (i = 0; i < qp->n; ++i) { + if (strcmp(qp->p[i].name, "host_key_check") == 0) { + qdict_put(options, "host_key_check", + qstring_from_str(qp->p[i].value)); + } + } + + query_params_free(qp); + uri_free(uri); + return 0; + + err: + if (qp) { + query_params_free(qp); + } + if (uri) { + uri_free(uri); + } + return -EINVAL; +} + +static void ssh_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + if (qdict_haskey(options, "user") || + qdict_haskey(options, "host") || + qdict_haskey(options, "port") || + qdict_haskey(options, "path") || + qdict_haskey(options, "host_key_check")) { + error_setg(errp, "user, host, port, path, host_key_check cannot be used at the same time as a file option"); + return; + } + + parse_uri(filename, options, errp); +} + +static int check_host_key_knownhosts(BDRVSSHState *s, + const char *host, int port) +{ + const char *home; + char *knh_file = NULL; + LIBSSH2_KNOWNHOSTS *knh = NULL; + struct libssh2_knownhost *found; + int ret, r; + const char *hostkey; + size_t len; + int type; + + hostkey = libssh2_session_hostkey(s->session, &len, &type); + if (!hostkey) { + ret = -EINVAL; + session_error_report(s, "failed to read remote host key"); + goto out; + } + + knh = libssh2_knownhost_init(s->session); + if (!knh) { + ret = -EINVAL; + session_error_report(s, "failed to initialize known hosts support"); + goto out; + } + + home = getenv("HOME"); + if (home) { + knh_file = g_strdup_printf("%s/.ssh/known_hosts", home); + } else { + knh_file = g_strdup_printf("/root/.ssh/known_hosts"); + } + + /* Read all known hosts from OpenSSH-style known_hosts file. 
*/ + libssh2_knownhost_readfile(knh, knh_file, LIBSSH2_KNOWNHOST_FILE_OPENSSH); + + r = libssh2_knownhost_checkp(knh, host, port, hostkey, len, + LIBSSH2_KNOWNHOST_TYPE_PLAIN| + LIBSSH2_KNOWNHOST_KEYENC_RAW, + &found); + switch (r) { + case LIBSSH2_KNOWNHOST_CHECK_MATCH: + /* OK */ + DPRINTF("host key OK: %s", found->key); + break; + case LIBSSH2_KNOWNHOST_CHECK_MISMATCH: + ret = -EINVAL; + session_error_report(s, "host key does not match the one in known_hosts (found key %s)", + found->key); + goto out; + case LIBSSH2_KNOWNHOST_CHECK_NOTFOUND: + ret = -EINVAL; + session_error_report(s, "no host key was found in known_hosts"); + goto out; + case LIBSSH2_KNOWNHOST_CHECK_FAILURE: + ret = -EINVAL; + session_error_report(s, "failure matching the host key with known_hosts"); + goto out; + default: + ret = -EINVAL; + session_error_report(s, "unknown error matching the host key with known_hosts (%d)", + r); + goto out; + } + + /* known_hosts checking successful. */ + ret = 0; + + out: + if (knh != NULL) { + libssh2_knownhost_free(knh); + } + g_free(knh_file); + return ret; +} + +static unsigned hex2decimal(char ch) +{ + if (ch >= '0' && ch <= '9') { + return (ch - '0'); + } else if (ch >= 'a' && ch <= 'f') { + return 10 + (ch - 'a'); + } else if (ch >= 'A' && ch <= 'F') { + return 10 + (ch - 'A'); + } + + return -1; +} + +/* Compare the binary fingerprint (hash of host key) with the + * host_key_check parameter. 
+ */ +static int compare_fingerprint(const unsigned char *fingerprint, size_t len, + const char *host_key_check) +{ + unsigned c; + + while (len > 0) { + while (*host_key_check == ':') + host_key_check++; + if (!qemu_isxdigit(host_key_check[0]) || + !qemu_isxdigit(host_key_check[1])) + return 1; + c = hex2decimal(host_key_check[0]) * 16 + + hex2decimal(host_key_check[1]); + if (c - *fingerprint != 0) + return c - *fingerprint; + fingerprint++; + len--; + host_key_check += 2; + } + return *host_key_check - '\0'; +} + +static int +check_host_key_hash(BDRVSSHState *s, const char *hash, + int hash_type, size_t fingerprint_len) +{ + const char *fingerprint; + + fingerprint = libssh2_hostkey_hash(s->session, hash_type); + if (!fingerprint) { + session_error_report(s, "failed to read remote host key"); + return -EINVAL; + } + + if(compare_fingerprint((unsigned char *) fingerprint, fingerprint_len, + hash) != 0) { + error_report("remote host key does not match host_key_check '%s'", + hash); + return -EPERM; + } + + return 0; +} + +static int check_host_key(BDRVSSHState *s, const char *host, int port, + const char *host_key_check) +{ + /* host_key_check=no */ + if (strcmp(host_key_check, "no") == 0) { + return 0; + } + + /* host_key_check=md5:xx:yy:zz:... */ + if (strncmp(host_key_check, "md5:", 4) == 0) { + return check_host_key_hash(s, &host_key_check[4], + LIBSSH2_HOSTKEY_HASH_MD5, 16); + } + + /* host_key_check=sha1:xx:yy:zz:... 
*/ + if (strncmp(host_key_check, "sha1:", 5) == 0) { + return check_host_key_hash(s, &host_key_check[5], + LIBSSH2_HOSTKEY_HASH_SHA1, 20); + } + + /* host_key_check=yes */ + if (strcmp(host_key_check, "yes") == 0) { + return check_host_key_knownhosts(s, host, port); + } + + error_report("unknown host_key_check setting (%s)", host_key_check); + return -EINVAL; +} + +static int authenticate(BDRVSSHState *s, const char *user) +{ + int r, ret; + const char *userauthlist; + LIBSSH2_AGENT *agent = NULL; + struct libssh2_agent_publickey *identity; + struct libssh2_agent_publickey *prev_identity = NULL; + + userauthlist = libssh2_userauth_list(s->session, user, strlen(user)); + if (strstr(userauthlist, "publickey") == NULL) { + ret = -EPERM; + error_report("remote server does not support \"publickey\" authentication"); + goto out; + } + + /* Connect to ssh-agent and try each identity in turn. */ + agent = libssh2_agent_init(s->session); + if (!agent) { + ret = -EINVAL; + session_error_report(s, "failed to initialize ssh-agent support"); + goto out; + } + if (libssh2_agent_connect(agent)) { + ret = -ECONNREFUSED; + session_error_report(s, "failed to connect to ssh-agent"); + goto out; + } + if (libssh2_agent_list_identities(agent)) { + ret = -EINVAL; + session_error_report(s, "failed requesting identities from ssh-agent"); + goto out; + } + + for(;;) { + r = libssh2_agent_get_identity(agent, &identity, prev_identity); + if (r == 1) { /* end of list */ + break; + } + if (r < 0) { + ret = -EINVAL; + session_error_report(s, "failed to obtain identity from ssh-agent"); + goto out; + } + r = libssh2_agent_userauth(agent, user, identity); + if (r == 0) { + /* Authenticated! */ + ret = 0; + goto out; + } + /* Failed to authenticate with this identity, try the next one. 
*/ + prev_identity = identity; + } + + ret = -EPERM; + error_report("failed to authenticate using publickey authentication " + "and the identities held by your ssh-agent"); + + out: + if (agent != NULL) { + /* Note: libssh2 implementation implicitly calls + * libssh2_agent_disconnect if necessary. + */ + libssh2_agent_free(agent); + } + + return ret; +} + +static int connect_to_ssh(BDRVSSHState *s, QDict *options, + int ssh_flags, int creat_mode) +{ + int r, ret; + Error *err = NULL; + const char *host, *user, *path, *host_key_check; + int port; + + host = qdict_get_str(options, "host"); + + if (qdict_haskey(options, "port")) { + port = qdict_get_int(options, "port"); + } else { + port = 22; + } + + path = qdict_get_str(options, "path"); + + if (qdict_haskey(options, "user")) { + user = qdict_get_str(options, "user"); + } else { + user = g_get_user_name(); + if (!user) { + ret = -errno; + goto err; + } + } + + if (qdict_haskey(options, "host_key_check")) { + host_key_check = qdict_get_str(options, "host_key_check"); + } else { + host_key_check = "yes"; + } + + /* Construct the host:port name for inet_connect. */ + g_free(s->hostport); + s->hostport = g_strdup_printf("%s:%d", host, port); + + /* Open the socket and connect. */ + s->sock = inet_connect(s->hostport, &err); + if (err != NULL) { + ret = -errno; + qerror_report_err(err); + error_free(err); + goto err; + } + + /* Create SSH session. */ + s->session = libssh2_session_init(); + if (!s->session) { + ret = -EINVAL; + session_error_report(s, "failed to initialize libssh2 session"); + goto err; + } + +#if TRACE_LIBSSH2 != 0 + libssh2_trace(s->session, TRACE_LIBSSH2); +#endif + + r = libssh2_session_handshake(s->session, s->sock); + if (r != 0) { + ret = -EINVAL; + session_error_report(s, "failed to establish SSH session"); + goto err; + } + + /* Check the remote host's key against known_hosts. */ + ret = check_host_key(s, host, port, host_key_check); + if (ret < 0) { + goto err; + } + + /* Authenticate. 
*/ + ret = authenticate(s, user); + if (ret < 0) { + goto err; + } + + /* Start SFTP. */ + s->sftp = libssh2_sftp_init(s->session); + if (!s->sftp) { + session_error_report(s, "failed to initialize sftp handle"); + ret = -EINVAL; + goto err; + } + + /* Open the remote file. */ + DPRINTF("opening file %s flags=0x%x creat_mode=0%o", + path, ssh_flags, creat_mode); + s->sftp_handle = libssh2_sftp_open(s->sftp, path, ssh_flags, creat_mode); + if (!s->sftp_handle) { + session_error_report(s, "failed to open remote file '%s'", path); + ret = -EINVAL; + goto err; + } + + r = libssh2_sftp_fstat(s->sftp_handle, &s->attrs); + if (r < 0) { + sftp_error_report(s, "failed to read file attributes"); + return -EINVAL; + } + + /* Delete the options we've used; any not deleted will cause the + * block layer to give an error about unused options. + */ + qdict_del(options, "host"); + qdict_del(options, "port"); + qdict_del(options, "user"); + qdict_del(options, "path"); + qdict_del(options, "host_key_check"); + + return 0; + + err: + if (s->sftp_handle) { + libssh2_sftp_close(s->sftp_handle); + } + s->sftp_handle = NULL; + if (s->sftp) { + libssh2_sftp_shutdown(s->sftp); + } + s->sftp = NULL; + if (s->session) { + libssh2_session_disconnect(s->session, + "from qemu ssh client: " + "error opening connection"); + libssh2_session_free(s->session); + } + s->session = NULL; + + return ret; +} + +static int ssh_file_open(BlockDriverState *bs, QDict *options, int bdrv_flags) +{ + BDRVSSHState *s = bs->opaque; + int ret; + int ssh_flags; + + ssh_state_init(s); + + ssh_flags = LIBSSH2_FXF_READ; + if (bdrv_flags & BDRV_O_RDWR) { + ssh_flags |= LIBSSH2_FXF_WRITE; + } + + /* Start up SSH. */ + ret = connect_to_ssh(s, options, ssh_flags, 0); + if (ret < 0) { + goto err; + } + + /* Go non-blocking. 
*/ + libssh2_session_set_blocking(s->session, 0); + + return 0; + + err: + if (s->sock >= 0) { + close(s->sock); + } + s->sock = -1; + + return ret; +} + +static QEMUOptionParameter ssh_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { NULL } +}; + +static int ssh_create(const char *filename, QEMUOptionParameter *options) +{ + int r, ret; + Error *local_err = NULL; + int64_t total_size = 0; + QDict *uri_options = NULL; + BDRVSSHState s; + ssize_t r2; + char c[1] = { '\0' }; + + ssh_state_init(&s); + + /* Get desired file size. */ + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + total_size = options->value.n; + } + options++; + } + DPRINTF("total_size=%" PRIi64, total_size); + + uri_options = qdict_new(); + r = parse_uri(filename, uri_options, &local_err); + if (r < 0) { + qerror_report_err(local_err); + error_free(local_err); + ret = r; + goto out; + } + + r = connect_to_ssh(&s, uri_options, + LIBSSH2_FXF_READ|LIBSSH2_FXF_WRITE| + LIBSSH2_FXF_CREAT|LIBSSH2_FXF_TRUNC, 0644); + if (r < 0) { + ret = r; + goto out; + } + + if (total_size > 0) { + libssh2_sftp_seek64(s.sftp_handle, total_size-1); + r2 = libssh2_sftp_write(s.sftp_handle, c, 1); + if (r2 < 0) { + sftp_error_report(&s, "truncate failed"); + ret = -EINVAL; + goto out; + } + s.attrs.filesize = total_size; + } + + ret = 0; + + out: + ssh_state_free(&s); + if (uri_options != NULL) { + QDECREF(uri_options); + } + return ret; +} + +static void ssh_close(BlockDriverState *bs) +{ + BDRVSSHState *s = bs->opaque; + + ssh_state_free(s); +} + +static int ssh_has_zero_init(BlockDriverState *bs) +{ + BDRVSSHState *s = bs->opaque; + /* Assume false, unless we can positively prove it's true. 
*/ + int has_zero_init = 0; + + if (s->attrs.flags & LIBSSH2_SFTP_ATTR_PERMISSIONS) { + if (s->attrs.permissions & LIBSSH2_SFTP_S_IFREG) { + has_zero_init = 1; + } + } + + return has_zero_init; +} + +static void restart_coroutine(void *opaque) +{ + Coroutine *co = opaque; + + DPRINTF("co=%p", co); + + qemu_coroutine_enter(co, NULL); +} + +/* Always true because when we have called set_fd_handler there is + * always a request being processed. + */ +static int return_true(void *opaque) +{ + return 1; +} + +static coroutine_fn void set_fd_handler(BDRVSSHState *s) +{ + int r; + IOHandler *rd_handler = NULL, *wr_handler = NULL; + Coroutine *co = qemu_coroutine_self(); + + r = libssh2_session_block_directions(s->session); + + if (r & LIBSSH2_SESSION_BLOCK_INBOUND) { + rd_handler = restart_coroutine; + } + if (r & LIBSSH2_SESSION_BLOCK_OUTBOUND) { + wr_handler = restart_coroutine; + } + + DPRINTF("s->sock=%d rd_handler=%p wr_handler=%p", s->sock, + rd_handler, wr_handler); + + qemu_aio_set_fd_handler(s->sock, rd_handler, wr_handler, return_true, co); +} + +static coroutine_fn void clear_fd_handler(BDRVSSHState *s) +{ + DPRINTF("s->sock=%d", s->sock); + qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL, NULL); +} + +/* A non-blocking call returned EAGAIN, so yield, ensuring the + * handlers are set up so that we'll be rescheduled when there is an + * interesting event on the socket. + */ +static coroutine_fn void co_yield(BDRVSSHState *s) +{ + set_fd_handler(s); + qemu_coroutine_yield(); + clear_fd_handler(s); +} + +/* SFTP has a function `libssh2_sftp_seek64' which seeks to a position + * in the remote file. Notice that it just updates a field in the + * sftp_handle structure, so there is no network traffic and it cannot + * fail. + * + * However, `libssh2_sftp_seek64' does have a catastrophic effect on + * performance since it causes the handle to throw away all in-flight + * reads and buffered readahead data. 
Therefore this function tries + * to be intelligent about when to call the underlying libssh2 function. + */ +#define SSH_SEEK_WRITE 0 +#define SSH_SEEK_READ 1 +#define SSH_SEEK_FORCE 2 + +static void ssh_seek(BDRVSSHState *s, int64_t offset, int flags) +{ + bool op_read = (flags & SSH_SEEK_READ) != 0; + bool force = (flags & SSH_SEEK_FORCE) != 0; + + if (force || op_read != s->offset_op_read || offset != s->offset) { + DPRINTF("seeking to offset=%" PRIi64, offset); + libssh2_sftp_seek64(s->sftp_handle, offset); + s->offset = offset; + s->offset_op_read = op_read; + } +} + +static coroutine_fn int ssh_read(BDRVSSHState *s, + int64_t offset, size_t size, + QEMUIOVector *qiov) +{ + ssize_t r; + size_t got; + char *buf, *end_of_vec; + struct iovec *i; + + DPRINTF("offset=%" PRIi64 " size=%zu", offset, size); + + ssh_seek(s, offset, SSH_SEEK_READ); + + /* This keeps track of the current iovec element ('i'), where we + * will write to next ('buf'), and the end of the current iovec + * ('end_of_vec'). + */ + i = &qiov->iov[0]; + buf = i->iov_base; + end_of_vec = i->iov_base + i->iov_len; + + /* libssh2 has a hard-coded limit of 2000 bytes per request, + * although it will also do readahead behind our backs. Therefore + * we may have to do repeated reads here until we have read 'size' + * bytes. + */ + for (got = 0; got < size; ) { + again: + DPRINTF("sftp_read buf=%p size=%zu", buf, end_of_vec - buf); + r = libssh2_sftp_read(s->sftp_handle, buf, end_of_vec - buf); + DPRINTF("sftp_read returned %zd", r); + + if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) { + co_yield(s); + goto again; + } + if (r < 0) { + sftp_error_report(s, "read failed"); + s->offset = -1; + return -EIO; + } + if (r == 0) { + /* EOF: Short read so pad the buffer with zeroes and return it. 
*/ + qemu_iovec_memset(qiov, got, 0, size - got); + return 0; + } + + got += r; + buf += r; + s->offset += r; + if (buf >= end_of_vec && got < size) { + i++; + buf = i->iov_base; + end_of_vec = i->iov_base + i->iov_len; + } + } + + return 0; +} + +static coroutine_fn int ssh_co_readv(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BDRVSSHState *s = bs->opaque; + int ret; + + qemu_co_mutex_lock(&s->lock); + ret = ssh_read(s, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE, qiov); + qemu_co_mutex_unlock(&s->lock); + + return ret; +} + +static int ssh_write(BDRVSSHState *s, + int64_t offset, size_t size, + QEMUIOVector *qiov) +{ + ssize_t r; + size_t written; + char *buf, *end_of_vec; + struct iovec *i; + + DPRINTF("offset=%" PRIi64 " size=%zu", offset, size); + + ssh_seek(s, offset, SSH_SEEK_WRITE); + + /* This keeps track of the current iovec element ('i'), where we + * will read from next ('buf'), and the end of the current iovec + * ('end_of_vec'). + */ + i = &qiov->iov[0]; + buf = i->iov_base; + end_of_vec = i->iov_base + i->iov_len; + + for (written = 0; written < size; ) { + again: + DPRINTF("sftp_write buf=%p size=%zu", buf, end_of_vec - buf); + r = libssh2_sftp_write(s->sftp_handle, buf, end_of_vec - buf); + DPRINTF("sftp_write returned %zd", r); + + if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) { + co_yield(s); + goto again; + } + if (r < 0) { + sftp_error_report(s, "write failed"); + s->offset = -1; + return -EIO; + } + /* The libssh2 API is very unclear about this. A comment in + * the code says "nothing was acked, and no EAGAIN was + * received!" which apparently means that no data got sent + * out, and the underlying channel didn't return any EAGAIN + * indication. I think this is a bug in either libssh2 or + * OpenSSH (server-side). In any case, forcing a seek (to + * discard libssh2 internal buffers), and then trying again + * works for me. 
+ */ + if (r == 0) { + ssh_seek(s, offset + written, SSH_SEEK_WRITE|SSH_SEEK_FORCE); + co_yield(s); + goto again; + } + + written += r; + buf += r; + s->offset += r; + if (buf >= end_of_vec && written < size) { + i++; + buf = i->iov_base; + end_of_vec = i->iov_base + i->iov_len; + } + + if (offset + written > s->attrs.filesize) + s->attrs.filesize = offset + written; + } + + return 0; +} + +static coroutine_fn int ssh_co_writev(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BDRVSSHState *s = bs->opaque; + int ret; + + qemu_co_mutex_lock(&s->lock); + ret = ssh_write(s, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE, qiov); + qemu_co_mutex_unlock(&s->lock); + + return ret; +} + +static void unsafe_flush_warning(BDRVSSHState *s, const char *what) +{ + if (!s->unsafe_flush_warning) { + error_report("warning: ssh server %s does not support fsync", + s->hostport); + if (what) { + error_report("to support fsync, you need %s", what); + } + s->unsafe_flush_warning = true; + } +} + +#ifdef HAS_LIBSSH2_SFTP_FSYNC + +static coroutine_fn int ssh_flush(BDRVSSHState *s) +{ + int r; + + DPRINTF("fsync"); + again: + r = libssh2_sftp_fsync(s->sftp_handle); + if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) { + co_yield(s); + goto again; + } + if (r == LIBSSH2_ERROR_SFTP_PROTOCOL && + libssh2_sftp_last_error(s->sftp) == LIBSSH2_FX_OP_UNSUPPORTED) { + unsafe_flush_warning(s, "OpenSSH >= 6.3"); + return 0; + } + if (r < 0) { + sftp_error_report(s, "fsync failed"); + return -EIO; + } + + return 0; +} + +static coroutine_fn int ssh_co_flush(BlockDriverState *bs) +{ + BDRVSSHState *s = bs->opaque; + int ret; + + qemu_co_mutex_lock(&s->lock); + ret = ssh_flush(s); + qemu_co_mutex_unlock(&s->lock); + + return ret; +} + +#else /* !HAS_LIBSSH2_SFTP_FSYNC */ + +static coroutine_fn int ssh_co_flush(BlockDriverState *bs) +{ + BDRVSSHState *s = bs->opaque; + + unsafe_flush_warning(s, "libssh2 >= 1.4.4"); + return 0; +} + +#endif 
/* !HAS_LIBSSH2_SFTP_FSYNC */ + +static int64_t ssh_getlength(BlockDriverState *bs) +{ + BDRVSSHState *s = bs->opaque; + int64_t length; + + /* Note we cannot make a libssh2 call here. */ + length = (int64_t) s->attrs.filesize; + DPRINTF("length=%" PRIi64, length); + + return length; +} + +static BlockDriver bdrv_ssh = { + .format_name = "ssh", + .protocol_name = "ssh", + .instance_size = sizeof(BDRVSSHState), + .bdrv_parse_filename = ssh_parse_filename, + .bdrv_file_open = ssh_file_open, + .bdrv_create = ssh_create, + .bdrv_close = ssh_close, + .bdrv_has_zero_init = ssh_has_zero_init, + .bdrv_co_readv = ssh_co_readv, + .bdrv_co_writev = ssh_co_writev, + .bdrv_getlength = ssh_getlength, + .bdrv_co_flush_to_disk = ssh_co_flush, + .create_options = ssh_create_options, +}; + +static void bdrv_ssh_init(void) +{ + int r; + + r = libssh2_init(0); + if (r != 0) { + fprintf(stderr, "libssh2 initialization failed, %d\n", r); + exit(EXIT_FAILURE); + } + + bdrv_register(&bdrv_ssh); +} + +block_init(bdrv_ssh_init); diff --git a/block/stream.c b/block/stream.c index 0c0fc7a13..7fe9e486b 100644 --- a/block/stream.c +++ b/block/stream.c @@ -12,8 +12,8 @@ */ #include "trace.h" -#include "block_int.h" -#include "blockjob.h" +#include "block/block_int.h" +#include "block/blockjob.h" #include "qemu/ratelimit.h" enum { @@ -108,7 +108,7 @@ static void coroutine_fn stream_run(void *opaque) wait: /* Note that even when no rate limit is applied we need to yield - * with no pending I/O here so that qemu_aio_flush() returns. + * with no pending I/O here so that bdrv_drain_all() returns. 
*/ block_job_sleep_ns(&s->common, rt_clock, delay_ns); if (block_job_is_cancelled(&s->common)) { @@ -198,7 +198,7 @@ static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp) ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); } -static BlockJobType stream_job_type = { +static const BlockJobType stream_job_type = { .instance_size = sizeof(StreamBlockJob), .job_type = "stream", .set_speed = stream_set_speed, diff --git a/block/vdi.c b/block/vdi.c index c8330b7ea..8a915257e 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -50,15 +50,15 @@ */ #include "qemu-common.h" -#include "block_int.h" -#include "module.h" -#include "migration.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "migration/migration.h" #if defined(CONFIG_UUID) #include <uuid/uuid.h> #else /* TODO: move uuid emulation to some central place in QEMU. */ -#include "sysemu.h" /* UUID_FMT */ +#include "sysemu/sysemu.h" /* UUID_FMT */ typedef unsigned char uuid_t[16]; #endif @@ -246,7 +246,7 @@ static void vdi_header_print(VdiHeader *header) { char uuid[37]; logout("text %s", header->text); - logout("signature 0x%04x\n", header->signature); + logout("signature 0x%08x\n", header->signature); logout("header size 0x%04x\n", header->header_size); logout("image type 0x%04x\n", header->image_type); logout("image flags 0x%04x\n", header->image_flags); @@ -364,15 +364,17 @@ static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename) return result; } -static int vdi_open(BlockDriverState *bs, int flags) +static int vdi_open(BlockDriverState *bs, QDict *options, int flags) { BDRVVdiState *s = bs->opaque; VdiHeader header; size_t bmap_size; + int ret; logout("\n"); - if (bdrv_read(bs->file, 0, (uint8_t *)&header, 1) < 0) { + ret = bdrv_read(bs->file, 0, (uint8_t *)&header, 1); + if (ret < 0) { goto fail; } @@ -390,33 +392,45 @@ static int vdi_open(BlockDriverState *bs, int flags) header.disk_size &= ~(SECTOR_SIZE - 1); } - if (header.version != 
VDI_VERSION_1_1) { + if (header.signature != VDI_SIGNATURE) { + logout("bad vdi signature %08x\n", header.signature); + ret = -EMEDIUMTYPE; + goto fail; + } else if (header.version != VDI_VERSION_1_1) { logout("unsupported version %u.%u\n", header.version >> 16, header.version & 0xffff); + ret = -ENOTSUP; goto fail; } else if (header.offset_bmap % SECTOR_SIZE != 0) { /* We only support block maps which start on a sector boundary. */ logout("unsupported block map offset 0x%x B\n", header.offset_bmap); + ret = -ENOTSUP; goto fail; } else if (header.offset_data % SECTOR_SIZE != 0) { /* We only support data blocks which start on a sector boundary. */ logout("unsupported data offset 0x%x B\n", header.offset_data); + ret = -ENOTSUP; goto fail; } else if (header.sector_size != SECTOR_SIZE) { logout("unsupported sector size %u B\n", header.sector_size); + ret = -ENOTSUP; goto fail; } else if (header.block_size != 1 * MiB) { logout("unsupported block size %u B\n", header.block_size); + ret = -ENOTSUP; goto fail; } else if (header.disk_size > (uint64_t)header.blocks_in_image * header.block_size) { logout("unsupported disk size %" PRIu64 " B\n", header.disk_size); + ret = -ENOTSUP; goto fail; } else if (!uuid_is_null(header.uuid_link)) { logout("link uuid != 0, unsupported\n"); + ret = -ENOTSUP; goto fail; } else if (!uuid_is_null(header.uuid_parent)) { logout("parent uuid != 0, unsupported\n"); + ret = -ENOTSUP; goto fail; } @@ -429,10 +443,9 @@ static int vdi_open(BlockDriverState *bs, int flags) bmap_size = header.blocks_in_image * sizeof(uint32_t); bmap_size = (bmap_size + SECTOR_SIZE - 1) / SECTOR_SIZE; - if (bmap_size > 0) { - s->bmap = g_malloc(bmap_size * SECTOR_SIZE); - } - if (bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size) < 0) { + s->bmap = g_malloc(bmap_size * SECTOR_SIZE); + ret = bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size); + if (ret < 0) { goto fail_free_bmap; } @@ -448,7 +461,7 @@ static int vdi_open(BlockDriverState 
*bs, int flags) g_free(s->bmap); fail: - return -1; + return ret; } static int vdi_reopen_prepare(BDRVReopenState *state, @@ -766,6 +779,7 @@ static BlockDriver bdrv_vdi = { .bdrv_close = vdi_close, .bdrv_reopen_prepare = vdi_reopen_prepare, .bdrv_create = vdi_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_co_is_allocated = vdi_co_is_allocated, .bdrv_make_empty = vdi_make_empty, diff --git a/block/vhdx.c b/block/vhdx.c new file mode 100644 index 000000000..e9704b1fd --- /dev/null +++ b/block/vhdx.c @@ -0,0 +1,972 @@ +/* + * Block driver for Hyper-V VHDX Images + * + * Copyright (c) 2013 Red Hat, Inc., + * + * Authors: + * Jeff Cody <jcody@redhat.com> + * + * This is based on the "VHDX Format Specification v0.95", published 4/12/2012 + * by Microsoft: + * https://www.microsoft.com/en-us/download/details.aspx?id=29681 + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "qemu/crc32c.h" +#include "block/vhdx.h" + + +/* Several metadata and region table data entries are identified by + * guids in a MS-specific GUID format. 
*/ + + +/* ------- Known Region Table GUIDs ---------------------- */ +static const MSGUID bat_guid = { .data1 = 0x2dc27766, + .data2 = 0xf623, + .data3 = 0x4200, + .data4 = { 0x9d, 0x64, 0x11, 0x5e, + 0x9b, 0xfd, 0x4a, 0x08} }; + +static const MSGUID metadata_guid = { .data1 = 0x8b7ca206, + .data2 = 0x4790, + .data3 = 0x4b9a, + .data4 = { 0xb8, 0xfe, 0x57, 0x5f, + 0x05, 0x0f, 0x88, 0x6e} }; + + + +/* ------- Known Metadata Entry GUIDs ---------------------- */ +static const MSGUID file_param_guid = { .data1 = 0xcaa16737, + .data2 = 0xfa36, + .data3 = 0x4d43, + .data4 = { 0xb3, 0xb6, 0x33, 0xf0, + 0xaa, 0x44, 0xe7, 0x6b} }; + +static const MSGUID virtual_size_guid = { .data1 = 0x2FA54224, + .data2 = 0xcd1b, + .data3 = 0x4876, + .data4 = { 0xb2, 0x11, 0x5d, 0xbe, + 0xd8, 0x3b, 0xf4, 0xb8} }; + +static const MSGUID page83_guid = { .data1 = 0xbeca12ab, + .data2 = 0xb2e6, + .data3 = 0x4523, + .data4 = { 0x93, 0xef, 0xc3, 0x09, + 0xe0, 0x00, 0xc7, 0x46} }; + + +static const MSGUID phys_sector_guid = { .data1 = 0xcda348c7, + .data2 = 0x445d, + .data3 = 0x4471, + .data4 = { 0x9c, 0xc9, 0xe9, 0x88, + 0x52, 0x51, 0xc5, 0x56} }; + +static const MSGUID parent_locator_guid = { .data1 = 0xa8d35f2d, + .data2 = 0xb30b, + .data3 = 0x454d, + .data4 = { 0xab, 0xf7, 0xd3, + 0xd8, 0x48, 0x34, + 0xab, 0x0c} }; + +static const MSGUID logical_sector_guid = { .data1 = 0x8141bf1d, + .data2 = 0xa96f, + .data3 = 0x4709, + .data4 = { 0xba, 0x47, 0xf2, + 0x33, 0xa8, 0xfa, + 0xab, 0x5f} }; + +/* Each parent type must have a valid GUID; this is for parent images + * of type 'VHDX'. If we were to allow e.g. 
a QCOW2 parent, we would + * need to make up our own QCOW2 GUID type */ +static const MSGUID parent_vhdx_guid = { .data1 = 0xb04aefb7, + .data2 = 0xd19e, + .data3 = 0x4a81, + .data4 = { 0xb7, 0x89, 0x25, 0xb8, + 0xe9, 0x44, 0x59, 0x13} }; + + +#define META_FILE_PARAMETER_PRESENT 0x01 +#define META_VIRTUAL_DISK_SIZE_PRESENT 0x02 +#define META_PAGE_83_PRESENT 0x04 +#define META_LOGICAL_SECTOR_SIZE_PRESENT 0x08 +#define META_PHYS_SECTOR_SIZE_PRESENT 0x10 +#define META_PARENT_LOCATOR_PRESENT 0x20 + +#define META_ALL_PRESENT \ + (META_FILE_PARAMETER_PRESENT | META_VIRTUAL_DISK_SIZE_PRESENT | \ + META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \ + META_PHYS_SECTOR_SIZE_PRESENT) + +typedef struct VHDXMetadataEntries { + VHDXMetadataTableEntry file_parameters_entry; + VHDXMetadataTableEntry virtual_disk_size_entry; + VHDXMetadataTableEntry page83_data_entry; + VHDXMetadataTableEntry logical_sector_size_entry; + VHDXMetadataTableEntry phys_sector_size_entry; + VHDXMetadataTableEntry parent_locator_entry; + uint16_t present; +} VHDXMetadataEntries; + + +typedef struct VHDXSectorInfo { + uint32_t bat_idx; /* BAT entry index */ + uint32_t sectors_avail; /* sectors available in payload block */ + uint32_t bytes_left; /* bytes left in the block after data to r/w */ + uint32_t bytes_avail; /* bytes available in payload block */ + uint64_t file_offset; /* absolute offset in bytes, in file */ + uint64_t block_offset; /* block offset, in bytes */ +} VHDXSectorInfo; + + + +typedef struct BDRVVHDXState { + CoMutex lock; + + int curr_header; + VHDXHeader *headers[2]; + + VHDXRegionTableHeader rt; + VHDXRegionTableEntry bat_rt; /* region table for the BAT */ + VHDXRegionTableEntry metadata_rt; /* region table for the metadata */ + + VHDXMetadataTableHeader metadata_hdr; + VHDXMetadataEntries metadata_entries; + + VHDXFileParameters params; + uint32_t block_size; + uint32_t block_size_bits; + uint32_t sectors_per_block; + uint32_t sectors_per_block_bits; + + uint64_t 
virtual_disk_size; + uint32_t logical_sector_size; + uint32_t physical_sector_size; + + uint64_t chunk_ratio; + uint32_t chunk_ratio_bits; + uint32_t logical_sector_size_bits; + + uint32_t bat_entries; + VHDXBatEntry *bat; + uint64_t bat_offset; + + VHDXParentLocatorHeader parent_header; + VHDXParentLocatorEntry *parent_entries; + +} BDRVVHDXState; + +uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size, + int crc_offset) +{ + uint32_t crc_new; + uint32_t crc_orig; + assert(buf != NULL); + + if (crc_offset > 0) { + memcpy(&crc_orig, buf + crc_offset, sizeof(crc_orig)); + memset(buf + crc_offset, 0, sizeof(crc_orig)); + } + + crc_new = crc32c(crc, buf, size); + if (crc_offset > 0) { + memcpy(buf + crc_offset, &crc_orig, sizeof(crc_orig)); + } + + return crc_new; +} + +/* Validates the checksum of the buffer, with an in-place CRC. + * + * Zero is substituted during crc calculation for the original crc field, + * and the crc field is restored afterwards. But the buffer will be modified + * during the calculation, so this may not be suitable for multi-threaded + * use.
+ * + * crc_offset: byte offset in buf of the buffer crc + * buf: buffer pointer + * size: size of buffer (must be > crc_offset+4) + * + * returns true if checksum is valid, false otherwise + */ +bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset) +{ + uint32_t crc_orig; + uint32_t crc; + + assert(buf != NULL); + assert(size > (crc_offset + 4)); + + memcpy(&crc_orig, buf + crc_offset, sizeof(crc_orig)); + crc_orig = le32_to_cpu(crc_orig); + + crc = vhdx_checksum_calc(0xffffffff, buf, size, crc_offset); + + return crc == crc_orig; +} + + +/* + * Per the MS VHDX Specification, for every VHDX file: + * - The header section is fixed size - 1 MB + * - The header section is always the first "object" + * - The first 64KB of the header is the File Identifier + * - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile") + * - The following 512 bytes constitute a UTF-16 string identifying the + * software that created the file, and is optional and diagnostic only. + * + * Therefore, we probe by looking for the vhdxfile signature "vhdxfile" + */ +static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + if (buf_size >= 8 && !memcmp(buf, "vhdxfile", 8)) { + return 100; + } + return 0; +} + +/* All VHDX structures on disk are little endian */ +static void vhdx_header_le_import(VHDXHeader *h) +{ + assert(h != NULL); + + le32_to_cpus(&h->signature); + le32_to_cpus(&h->checksum); + le64_to_cpus(&h->sequence_number); + + leguid_to_cpus(&h->file_write_guid); + leguid_to_cpus(&h->data_write_guid); + leguid_to_cpus(&h->log_guid); + + le16_to_cpus(&h->log_version); + le16_to_cpus(&h->version); + le32_to_cpus(&h->log_length); + le64_to_cpus(&h->log_offset); +} + + +/* opens the specified header block from the VHDX file header section */ +static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s) +{ + int ret = 0; + VHDXHeader *header1; + VHDXHeader *header2; + bool h1_valid = false; + bool h2_valid = false; + uint64_t h1_seq =
0; + uint64_t h2_seq = 0; + uint8_t *buffer; + + header1 = qemu_blockalign(bs, sizeof(VHDXHeader)); + header2 = qemu_blockalign(bs, sizeof(VHDXHeader)); + + buffer = qemu_blockalign(bs, VHDX_HEADER_SIZE); + + s->headers[0] = header1; + s->headers[1] = header2; + + /* We have to read the whole VHDX_HEADER_SIZE instead of + * sizeof(VHDXHeader), because the checksum is over the whole + * region */ + ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE); + if (ret < 0) { + goto fail; + } + /* copy over just the relevant portion that we need */ + memcpy(header1, buffer, sizeof(VHDXHeader)); + vhdx_header_le_import(header1); + + if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) && + !memcmp(&header1->signature, "head", 4) && + header1->version == 1) { + h1_seq = header1->sequence_number; + h1_valid = true; + } + + ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE); + if (ret < 0) { + goto fail; + } + /* copy over just the relevant portion that we need */ + memcpy(header2, buffer, sizeof(VHDXHeader)); + vhdx_header_le_import(header2); + + if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) && + !memcmp(&header2->signature, "head", 4) && + header2->version == 1) { + h2_seq = header2->sequence_number; + h2_valid = true; + } + + /* If there is only 1 valid header (or no valid headers), we + * don't care what the sequence numbers are */ + if (h1_valid && !h2_valid) { + s->curr_header = 0; + } else if (!h1_valid && h2_valid) { + s->curr_header = 1; + } else if (!h1_valid && !h2_valid) { + ret = -EINVAL; + goto fail; + } else { + /* If both headers are valid, then we choose the active one by the + * highest sequence number. 
If the sequence numbers are equal, that is + * invalid */ + if (h1_seq > h2_seq) { + s->curr_header = 0; + } else if (h2_seq > h1_seq) { + s->curr_header = 1; + } else { + ret = -EINVAL; + goto fail; + } + } + + ret = 0; + + goto exit; + +fail: + qerror_report(ERROR_CLASS_GENERIC_ERROR, "No valid VHDX header found"); + qemu_vfree(header1); + qemu_vfree(header2); + s->headers[0] = NULL; + s->headers[1] = NULL; +exit: + qemu_vfree(buffer); + return ret; +} + + +static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s) +{ + int ret = 0; + uint8_t *buffer; + int offset = 0; + VHDXRegionTableEntry rt_entry; + uint32_t i; + bool bat_rt_found = false; + bool metadata_rt_found = false; + + /* We have to read the whole 64KB block, because the crc32 is over the + * whole block */ + buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE); + + ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer, + VHDX_HEADER_BLOCK_SIZE); + if (ret < 0) { + goto fail; + } + memcpy(&s->rt, buffer, sizeof(s->rt)); + le32_to_cpus(&s->rt.signature); + le32_to_cpus(&s->rt.checksum); + le32_to_cpus(&s->rt.entry_count); + le32_to_cpus(&s->rt.reserved); + offset += sizeof(s->rt); + + if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) || + memcmp(&s->rt.signature, "regi", 4)) { + ret = -EINVAL; + goto fail; + } + + /* Per spec, maximum region table entry count is 2047 */ + if (s->rt.entry_count > 2047) { + ret = -EINVAL; + goto fail; + } + + for (i = 0; i < s->rt.entry_count; i++) { + memcpy(&rt_entry, buffer + offset, sizeof(rt_entry)); + offset += sizeof(rt_entry); + + leguid_to_cpus(&rt_entry.guid); + le64_to_cpus(&rt_entry.file_offset); + le32_to_cpus(&rt_entry.length); + le32_to_cpus(&rt_entry.data_bits); + + /* see if we recognize the entry */ + if (guid_eq(rt_entry.guid, bat_guid)) { + /* must be unique; if we have already found it this is invalid */ + if (bat_rt_found) { + ret = -EINVAL; + goto fail; + } + bat_rt_found = true; + s->bat_rt = rt_entry; + 
continue; + } + + if (guid_eq(rt_entry.guid, metadata_guid)) { + /* must be unique; if we have already found it this is invalid */ + if (metadata_rt_found) { + ret = -EINVAL; + goto fail; + } + metadata_rt_found = true; + s->metadata_rt = rt_entry; + continue; + } + + if (rt_entry.data_bits & VHDX_REGION_ENTRY_REQUIRED) { + /* cannot read vhdx file - required region table entry that + * we do not understand. per spec, we must fail to open */ + ret = -ENOTSUP; + goto fail; + } + } + ret = 0; + +fail: + qemu_vfree(buffer); + return ret; +} + + + +/* Metadata initial parser + * + * This loads all the metadata entry fields. This may cause additional + * fields to be processed (e.g. parent locator, etc..). + * + * There are 5 Metadata items that are always required: + * - File Parameters (block size, has a parent) + * - Virtual Disk Size (size, in bytes, of the virtual drive) + * - Page 83 Data (scsi page 83 guid) + * - Logical Sector Size (logical sector size in bytes, either 512 or + * 4096. We only support 512 currently) + * - Physical Sector Size (512 or 4096) + * + * Also, if the File Parameters indicate this is a differencing file, + * we must also look for the Parent Locator metadata item. 
+ */ +static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) +{ + int ret = 0; + uint8_t *buffer; + int offset = 0; + uint32_t i = 0; + VHDXMetadataTableEntry md_entry; + + buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE); + + ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer, + VHDX_METADATA_TABLE_MAX_SIZE); + if (ret < 0) { + goto exit; + } + memcpy(&s->metadata_hdr, buffer, sizeof(s->metadata_hdr)); + offset += sizeof(s->metadata_hdr); + + le64_to_cpus(&s->metadata_hdr.signature); + le16_to_cpus(&s->metadata_hdr.reserved); + le16_to_cpus(&s->metadata_hdr.entry_count); + + if (memcmp(&s->metadata_hdr.signature, "metadata", 8)) { + ret = -EINVAL; + goto exit; + } + + s->metadata_entries.present = 0; + + if ((s->metadata_hdr.entry_count * sizeof(md_entry)) > + (VHDX_METADATA_TABLE_MAX_SIZE - offset)) { + ret = -EINVAL; + goto exit; + } + + for (i = 0; i < s->metadata_hdr.entry_count; i++) { + memcpy(&md_entry, buffer + offset, sizeof(md_entry)); + offset += sizeof(md_entry); + + leguid_to_cpus(&md_entry.item_id); + le32_to_cpus(&md_entry.offset); + le32_to_cpus(&md_entry.length); + le32_to_cpus(&md_entry.data_bits); + le32_to_cpus(&md_entry.reserved2); + + if (guid_eq(md_entry.item_id, file_param_guid)) { + if (s->metadata_entries.present & META_FILE_PARAMETER_PRESENT) { + ret = -EINVAL; + goto exit; + } + s->metadata_entries.file_parameters_entry = md_entry; + s->metadata_entries.present |= META_FILE_PARAMETER_PRESENT; + continue; + } + + if (guid_eq(md_entry.item_id, virtual_size_guid)) { + if (s->metadata_entries.present & META_VIRTUAL_DISK_SIZE_PRESENT) { + ret = -EINVAL; + goto exit; + } + s->metadata_entries.virtual_disk_size_entry = md_entry; + s->metadata_entries.present |= META_VIRTUAL_DISK_SIZE_PRESENT; + continue; + } + + if (guid_eq(md_entry.item_id, page83_guid)) { + if (s->metadata_entries.present & META_PAGE_83_PRESENT) { + ret = -EINVAL; + goto exit; + } + s->metadata_entries.page83_data_entry = md_entry; + 
s->metadata_entries.present |= META_PAGE_83_PRESENT; + continue; + } + + if (guid_eq(md_entry.item_id, logical_sector_guid)) { + if (s->metadata_entries.present & + META_LOGICAL_SECTOR_SIZE_PRESENT) { + ret = -EINVAL; + goto exit; + } + s->metadata_entries.logical_sector_size_entry = md_entry; + s->metadata_entries.present |= META_LOGICAL_SECTOR_SIZE_PRESENT; + continue; + } + + if (guid_eq(md_entry.item_id, phys_sector_guid)) { + if (s->metadata_entries.present & META_PHYS_SECTOR_SIZE_PRESENT) { + ret = -EINVAL; + goto exit; + } + s->metadata_entries.phys_sector_size_entry = md_entry; + s->metadata_entries.present |= META_PHYS_SECTOR_SIZE_PRESENT; + continue; + } + + if (guid_eq(md_entry.item_id, parent_locator_guid)) { + if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) { + ret = -EINVAL; + goto exit; + } + s->metadata_entries.parent_locator_entry = md_entry; + s->metadata_entries.present |= META_PARENT_LOCATOR_PRESENT; + continue; + } + + if (md_entry.data_bits & VHDX_META_FLAGS_IS_REQUIRED) { + /* cannot read vhdx file - required region table entry that + * we do not understand. per spec, we must fail to open */ + ret = -ENOTSUP; + goto exit; + } + } + + if (s->metadata_entries.present != META_ALL_PRESENT) { + ret = -ENOTSUP; + goto exit; + } + + ret = bdrv_pread(bs->file, + s->metadata_entries.file_parameters_entry.offset + + s->metadata_rt.file_offset, + &s->params, + sizeof(s->params)); + + if (ret < 0) { + goto exit; + } + + le32_to_cpus(&s->params.block_size); + le32_to_cpus(&s->params.data_bits); + + + /* We now have the file parameters, so we can tell if this is a + * differencing file (i.e.. 
has_parent), is dynamic or fixed + * sized (leave_blocks_allocated), and the block size */ + + /* The parent locator required iff the file parameters has_parent set */ + if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) { + if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) { + /* TODO: parse parent locator fields */ + ret = -ENOTSUP; /* temp, until differencing files are supported */ + goto exit; + } else { + /* if has_parent is set, but there is not parent locator present, + * then that is an invalid combination */ + ret = -EINVAL; + goto exit; + } + } + + /* determine virtual disk size, logical sector size, + * and phys sector size */ + + ret = bdrv_pread(bs->file, + s->metadata_entries.virtual_disk_size_entry.offset + + s->metadata_rt.file_offset, + &s->virtual_disk_size, + sizeof(uint64_t)); + if (ret < 0) { + goto exit; + } + ret = bdrv_pread(bs->file, + s->metadata_entries.logical_sector_size_entry.offset + + s->metadata_rt.file_offset, + &s->logical_sector_size, + sizeof(uint32_t)); + if (ret < 0) { + goto exit; + } + ret = bdrv_pread(bs->file, + s->metadata_entries.phys_sector_size_entry.offset + + s->metadata_rt.file_offset, + &s->physical_sector_size, + sizeof(uint32_t)); + if (ret < 0) { + goto exit; + } + + le64_to_cpus(&s->virtual_disk_size); + le32_to_cpus(&s->logical_sector_size); + le32_to_cpus(&s->physical_sector_size); + + if (s->logical_sector_size == 0 || s->params.block_size == 0) { + ret = -EINVAL; + goto exit; + } + + /* both block_size and sector_size are guaranteed powers of 2 */ + s->sectors_per_block = s->params.block_size / s->logical_sector_size; + s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) * + (uint64_t)s->logical_sector_size / + (uint64_t)s->params.block_size; + + /* These values are ones we will want to use for division / multiplication + * later on, and they are all guaranteed (per the spec) to be powers of 2, + * so we can take advantage of that for shift operations during + * reads/writes */ + if 
(s->logical_sector_size & (s->logical_sector_size - 1)) { + ret = -EINVAL; + goto exit; + } + if (s->sectors_per_block & (s->sectors_per_block - 1)) { + ret = -EINVAL; + goto exit; + } + if (s->chunk_ratio & (s->chunk_ratio - 1)) { + ret = -EINVAL; + goto exit; + } + s->block_size = s->params.block_size; + if (s->block_size & (s->block_size - 1)) { + ret = -EINVAL; + goto exit; + } + + s->logical_sector_size_bits = 31 - clz32(s->logical_sector_size); + s->sectors_per_block_bits = 31 - clz32(s->sectors_per_block); + s->chunk_ratio_bits = 63 - clz64(s->chunk_ratio); + s->block_size_bits = 31 - clz32(s->block_size); + + ret = 0; + +exit: + qemu_vfree(buffer); + return ret; +} + +/* Parse the replay log. Per the VHDX spec, if the log is present + * it must be replayed prior to opening the file, even read-only. + * + * If read-only, we must replay the log in RAM (or refuse to open + * a dirty VHDX file read-only). + * + * Returns 0 when there is no log to replay, -EINVAL when the log version + * is not 0, and -ENOTSUP when a log is present, since log replay is not + * implemented here. */ +static int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s) +{ + int ret = 0; + int i; + VHDXHeader *hdr; + + hdr = s->headers[s->curr_header]; + + /* If either the log guid or the log length is zero, + * then no replay log is present. 'ret' is borrowed here as an + * OR-accumulator for the data4 bytes of the guid. */ + for (i = 0; i < sizeof(hdr->log_guid.data4); i++) { + ret |= hdr->log_guid.data4[i]; + } + if (hdr->log_guid.data1 == 0 && + hdr->log_guid.data2 == 0 && + hdr->log_guid.data3 == 0 && + ret == 0) { + goto exit; + } + + /* The guid is non-zero, so 'ret' may now hold guid bits rather than a + * status code; reset it so the zero-length exit below returns 0. */ + ret = 0; + + /* per spec, only log version of 0 is supported */ + if (hdr->log_version != 0) { + ret = -EINVAL; + goto exit; + } + + if (hdr->log_length == 0) { + goto exit; + } + + /* We currently do not support images with logs to replay */ + ret = -ENOTSUP; + +exit: + return ret; +} + + +static int vhdx_open(BlockDriverState *bs, QDict *options, int flags) +{ + BDRVVHDXState *s = bs->opaque; + int ret = 0; + uint32_t i; + uint64_t signature; + uint32_t data_blocks_cnt, bitmap_blocks_cnt; + + + s->bat = NULL; + + qemu_co_mutex_init(&s->lock); + + /* validate the file signature */ + ret = bdrv_pread(bs->file, 0,
&signature, sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + if (memcmp(&signature, "vhdxfile", 8)) { + ret = -EINVAL; + goto fail; + } + + ret = vhdx_parse_header(bs, s); + if (ret) { + goto fail; + } + + ret = vhdx_parse_log(bs, s); + if (ret) { + goto fail; + } + + ret = vhdx_open_region_tables(bs, s); + if (ret) { + goto fail; + } + + ret = vhdx_parse_metadata(bs, s); + if (ret) { + goto fail; + } + s->block_size = s->params.block_size; + + /* the VHDX spec dictates that virtual_disk_size is always a multiple of + * logical_sector_size */ + bs->total_sectors = s->virtual_disk_size >> s->logical_sector_size_bits; + + /* Number of payload blocks, rounded up to cover a final partial block. + * The shift back must be done in 64 bits: data_blocks_cnt is 32 bits + * wide, so an uncast shift would wrap for disks of 4 GiB or more and + * wrongly report a remainder. */ + data_blocks_cnt = s->virtual_disk_size >> s->block_size_bits; + if (s->virtual_disk_size - + ((uint64_t)data_blocks_cnt << s->block_size_bits)) { + data_blocks_cnt++; + } + /* Number of sector bitmap blocks, rounded up the same way */ + bitmap_blocks_cnt = data_blocks_cnt >> s->chunk_ratio_bits; + if (data_blocks_cnt - (bitmap_blocks_cnt << s->chunk_ratio_bits)) { + bitmap_blocks_cnt++; + } + + if (s->parent_entries) { + s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1); + } else { + s->bat_entries = data_blocks_cnt + + ((data_blocks_cnt - 1) >> s->chunk_ratio_bits); + } + + s->bat_offset = s->bat_rt.file_offset; + + if (s->bat_entries > s->bat_rt.length / sizeof(VHDXBatEntry)) { + /* BAT allocation is not large enough for all entries */ + ret = -EINVAL; + goto fail; + } + + s->bat = qemu_blockalign(bs, s->bat_rt.length); + + ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length); + if (ret < 0) { + goto fail; + } + + /* BAT entries are stored little endian on disk */ + for (i = 0; i < s->bat_entries; i++) { + le64_to_cpus(&s->bat[i]); + } + + /* writing is not implemented, so refuse read-write opens */ + if (flags & BDRV_O_RDWR) { + ret = -ENOTSUP; + goto fail; + } + + /* TODO: differencing files, write */ + + return 0; +fail: + qemu_vfree(s->headers[0]); + qemu_vfree(s->headers[1]); + qemu_vfree(s->bat); + qemu_vfree(s->parent_entries); + return ret; +} + +static int vhdx_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + + +/* + * Perform sector to block
offset translations, to get various + * sector and file offsets into the image. See VHDXSectorInfo + */ +static void vhdx_block_translate(BDRVVHDXState *s, int64_t sector_num, + int nb_sectors, VHDXSectorInfo *sinfo) +{ + uint32_t block_offset; + + sinfo->bat_idx = sector_num >> s->sectors_per_block_bits; + /* effectively a modulo - this gives us the offset into the block + * (in sector sizes) for our sector number */ + block_offset = sector_num - (sinfo->bat_idx << s->sectors_per_block_bits); + /* the chunk ratio gives us the interleaving of the sector + * bitmaps, so we need to advance our page block index by the + * sector bitmaps entry number */ + sinfo->bat_idx += sinfo->bat_idx >> s->chunk_ratio_bits; + + /* the number of sectors we can read/write in this cycle */ + sinfo->sectors_avail = s->sectors_per_block - block_offset; + + sinfo->bytes_left = sinfo->sectors_avail << s->logical_sector_size_bits; + + if (sinfo->sectors_avail > nb_sectors) { + sinfo->sectors_avail = nb_sectors; + } + + sinfo->bytes_avail = sinfo->sectors_avail << s->logical_sector_size_bits; + + sinfo->file_offset = s->bat[sinfo->bat_idx] >> VHDX_BAT_FILE_OFF_BITS; + + sinfo->block_offset = block_offset << s->logical_sector_size_bits; + + /* The file offset must be past the header section, so must be > 0 */ + if (sinfo->file_offset == 0) { + return; + } + + /* block offset is the offset in vhdx logical sectors, in + * the payload data block. 
Convert that to a byte offset + * in the block, and add in the payload data block offset + * in the file, in bytes, to get the final read address */ + + sinfo->file_offset <<= 20; /* now in bytes, rather than 1MB units */ + sinfo->file_offset += sinfo->block_offset; +} + + + +static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BDRVVHDXState *s = bs->opaque; + int ret = 0; + VHDXSectorInfo sinfo; + uint64_t bytes_done = 0; + QEMUIOVector hd_qiov; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + qemu_co_mutex_lock(&s->lock); + + while (nb_sectors > 0) { + /* We are a differencing file, so we need to inspect the sector bitmap + * to see if we have the data or not */ + if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) { + /* not supported yet */ + ret = -ENOTSUP; + goto exit; + } else { + vhdx_block_translate(s, sector_num, nb_sectors, &sinfo); + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, sinfo.bytes_avail); + + /* check the payload block state */ + switch (s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK) { + case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */ + case PAYLOAD_BLOCK_UNDEFINED: /* fall through */ + case PAYLOAD_BLOCK_UNMAPPED: /* fall through */ + case PAYLOAD_BLOCK_ZERO: + /* return zero */ + qemu_iovec_memset(&hd_qiov, 0, 0, sinfo.bytes_avail); + break; + case PAYLOAD_BLOCK_FULL_PRESENT: + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->file, + sinfo.file_offset >> BDRV_SECTOR_BITS, + sinfo.sectors_avail, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto exit; + } + break; + case PAYLOAD_BLOCK_PARTIALLY_PRESENT: + /* we don't yet support difference files, fall through + * to error */ + default: + ret = -EIO; + goto exit; + break; + } + nb_sectors -= sinfo.sectors_avail; + sector_num += sinfo.sectors_avail; + bytes_done += sinfo.bytes_avail; + } + } + ret = 0; +exit: + qemu_co_mutex_unlock(&s->lock); + 
qemu_iovec_destroy(&hd_qiov); + return ret; +} + + + +static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + return -ENOTSUP; +} + + +static void vhdx_close(BlockDriverState *bs) +{ + BDRVVHDXState *s = bs->opaque; + qemu_vfree(s->headers[0]); + qemu_vfree(s->headers[1]); + qemu_vfree(s->bat); + qemu_vfree(s->parent_entries); +} + +static BlockDriver bdrv_vhdx = { + .format_name = "vhdx", + .instance_size = sizeof(BDRVVHDXState), + .bdrv_probe = vhdx_probe, + .bdrv_open = vhdx_open, + .bdrv_close = vhdx_close, + .bdrv_reopen_prepare = vhdx_reopen_prepare, + .bdrv_co_readv = vhdx_co_readv, + .bdrv_co_writev = vhdx_co_writev, +}; + +static void bdrv_vhdx_init(void) +{ + bdrv_register(&bdrv_vhdx); +} + +block_init(bdrv_vhdx_init); diff --git a/block/vhdx.h b/block/vhdx.h new file mode 100644 index 000000000..fb687ed2d --- /dev/null +++ b/block/vhdx.h @@ -0,0 +1,325 @@ +/* + * Block driver for Hyper-V VHDX Images + * + * Copyright (c) 2013 Red Hat, Inc., + * + * Authors: + * Jeff Cody <jcody@redhat.com> + * + * This is based on the "VHDX Format Specification v0.95", published 4/12/2012 + * by Microsoft: + * https://www.microsoft.com/en-us/download/details.aspx?id=29681 + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#ifndef BLOCK_VHDX_H +#define BLOCK_VHDX_H + +/* Structures and fields present in the VHDX file */ + +/* The header section has the following blocks, + * each block is 64KB: + * + * _____________________________________________________________________________ + * | File Id. 
| Header 1 | Header 2 | Region Table | Reserved (768KB) | + * |----------|---------------|------------|--------------|--------------------| + * | | | | | | + * 0.........64KB...........128KB........192KB..........256KB................1MB + */ + +#define VHDX_HEADER_BLOCK_SIZE (64*1024) + +#define VHDX_FILE_ID_OFFSET 0 +#define VHDX_HEADER1_OFFSET (VHDX_HEADER_BLOCK_SIZE*1) +#define VHDX_HEADER2_OFFSET (VHDX_HEADER_BLOCK_SIZE*2) +#define VHDX_REGION_TABLE_OFFSET (VHDX_HEADER_BLOCK_SIZE*3) + + +/* + * A note on the use of MS-GUID fields. For more details on the GUID, + * please see: https://en.wikipedia.org/wiki/Globally_unique_identifier. + * + * The VHDX specification only states that these are MS GUIDs, and which + * bytes are data1-data4. It makes no mention of what algorithm should be used + * to generate the GUID, nor what standard. However, looking at the specified + * known GUID fields, it appears the GUIDs are: + * Standard/DCE GUID type (noted by 10b in the MSB of byte 0 of .data4) + * Random algorithm (noted by 0x4XXX for .data3) + */ + +/* ---- HEADER SECTION STRUCTURES ---- */ + +/* These structures are ones that are defined in the VHDX specification + * document */ + +typedef struct VHDXFileIdentifier { + uint64_t signature; /* "vhdxfile" in ASCII */ + uint16_t creator[256]; /* optional; utf-16 string to identify + the vhdx file creator. 
Diagnotistic + only */ +} VHDXFileIdentifier; + + +/* the guid is a 16 byte unique ID - the definition for this used by + * Microsoft is not just 16 bytes though - it is a structure that is defined, + * so we need to follow it here so that endianness does not trip us up */ + +typedef struct MSGUID { + uint32_t data1; + uint16_t data2; + uint16_t data3; + uint8_t data4[8]; +} MSGUID; + +#define guid_eq(a, b) \ + (memcmp(&(a), &(b), sizeof(MSGUID)) == 0) + +#define VHDX_HEADER_SIZE (4*1024) /* although the vhdx_header struct in disk + is only 582 bytes, for purposes of crc + the header is the first 4KB of the 64KB + block */ + +/* The full header is 4KB, although the actual header data is much smaller. + * But for the checksum calculation, it is over the entire 4KB structure, + * not just the defined portion of it */ +typedef struct QEMU_PACKED VHDXHeader { + uint32_t signature; /* "head" in ASCII */ + uint32_t checksum; /* CRC-32C hash of the whole header */ + uint64_t sequence_number; /* Seq number of this header. Each + VHDX file has 2 of these headers, + and only the header with the highest + sequence number is valid */ + MSGUID file_write_guid; /* 128 bit unique identifier. Must be + updated to new, unique value before + the first modification is made to + file */ + MSGUID data_write_guid; /* 128 bit unique identifier. Must be + updated to new, unique value before + the first modification is made to + visible data. Visbile data is + defined as: + - system & user metadata + - raw block data + - disk size + - any change that will + cause the virtual disk + sector read to differ + + This does not need to change if + blocks are re-arranged */ + MSGUID log_guid; /* 128 bit unique identifier. If zero, + there is no valid log. If non-zero, + log entries with this guid are + valid. */ + uint16_t log_version; /* version of the log format. Mustn't be + zero, unless log_guid is also zero */ + uint16_t version; /* version of th evhdx file. 
Currently, + only supported version is "1" */ + uint32_t log_length; /* length of the log. Must be multiple + of 1MB */ + uint64_t log_offset; /* byte offset in the file of the log. + Must also be a multiple of 1MB */ +} VHDXHeader; + +/* Header for the region table block */ +typedef struct QEMU_PACKED VHDXRegionTableHeader { + uint32_t signature; /* "regi" in ASCII */ + uint32_t checksum; /* CRC-32C hash of the 64KB table */ + uint32_t entry_count; /* number of valid entries */ + uint32_t reserved; +} VHDXRegionTableHeader; + +/* Individual region table entry. There may be a maximum of 2047 of these + * + * There are two known region table properties. Both are required. + * BAT (block allocation table): 2DC27766F62342009D64115E9BFD4A08 + * Metadata: 8B7CA20647904B9AB8FE575F050F886E + */ +#define VHDX_REGION_ENTRY_REQUIRED 0x01 /* if set, parser must understand + this entry in order to open + file */ +typedef struct QEMU_PACKED VHDXRegionTableEntry { + MSGUID guid; /* 128-bit unique identifier */ + uint64_t file_offset; /* offset of the object in the file. + Must be multiple of 1MB */ + uint32_t length; /* length, in bytes, of the object */ + uint32_t data_bits; +} VHDXRegionTableEntry; + + +/* ---- LOG ENTRY STRUCTURES ---- */ +#define VHDX_LOG_HDR_SIZE 64 +typedef struct QEMU_PACKED VHDXLogEntryHeader { + uint32_t signature; /* "loge" in ASCII */ + uint32_t checksum; /* CRC-32C hash of the 64KB table */ + uint32_t entry_length; /* length in bytes, multiple of 1MB */ + uint32_t tail; /* byte offset of first log entry of a + seq, where this entry is the last + entry */ + uint64_t sequence_number; /* incremented with each log entry. + May not be zero. */ + uint32_t descriptor_count; /* number of descriptors in this log + entry, must be >= 0 */ + uint32_t reserved; + MSGUID log_guid; /* value of the log_guid from + vhdx_header. 
If not found in + vhdx_header, it is invalid */ + uint64_t flushed_file_offset; /* see spec for full details - this + should be vhdx file size in bytes */ + uint64_t last_file_offset; /* size in bytes that all allocated + file structures fit into */ +} VHDXLogEntryHeader; + +#define VHDX_LOG_DESC_SIZE 32 + +typedef struct QEMU_PACKED VHDXLogDescriptor { + uint32_t signature; /* "zero" or "desc" in ASCII */ + union { + uint32_t reserved; /* zero desc */ + uint32_t trailing_bytes; /* data desc: bytes 4092-4096 of the + data sector */ + }; + union { + uint64_t zero_length; /* zero desc: length of the section to + zero */ + uint64_t leading_bytes; /* data desc: bytes 0-7 of the data + sector */ + }; + uint64_t file_offset; /* file offset to write zeros - multiple + of 4kB */ + uint64_t sequence_number; /* must match same field in + vhdx_log_entry_header */ +} VHDXLogDescriptor; + +typedef struct QEMU_PACKED VHDXLogDataSector { + uint32_t data_signature; /* "data" in ASCII */ + uint32_t sequence_high; /* 4 MSB of 8 byte sequence_number */ + uint8_t data[4084]; /* raw data, bytes 8-4091 (inclusive). + see the data descriptor field for the + other mising bytes */ + uint32_t sequence_low; /* 4 LSB of 8 byte sequence_number */ +} VHDXLogDataSector; + + + +/* block states - different state values depending on whether it is a + * payload block, or a sector block. 
*/ + +#define PAYLOAD_BLOCK_NOT_PRESENT 0 +#define PAYLOAD_BLOCK_UNDEFINED 1 +#define PAYLOAD_BLOCK_ZERO 2 +#define PAYLOAD_BLOCK_UNMAPPED 5 +#define PAYLOAD_BLOCK_FULL_PRESENT 6 +#define PAYLOAD_BLOCK_PARTIALLY_PRESENT 7 + +#define SB_BLOCK_NOT_PRESENT 0 +#define SB_BLOCK_PRESENT 6 + +/* per the spec */ +#define VHDX_MAX_SECTORS_PER_BLOCK (1<<23) + +/* upper 44 bits are the file offset in 1MB units lower 3 bits are the state + other bits are reserved */ +#define VHDX_BAT_STATE_BIT_MASK 0x07 +#define VHDX_BAT_FILE_OFF_BITS (64-44) +typedef uint64_t VHDXBatEntry; + +/* ---- METADATA REGION STRUCTURES ---- */ + +#define VHDX_METADATA_ENTRY_SIZE 32 +#define VHDX_METADATA_MAX_ENTRIES 2047 /* not including the header */ +#define VHDX_METADATA_TABLE_MAX_SIZE \ + (VHDX_METADATA_ENTRY_SIZE * (VHDX_METADATA_MAX_ENTRIES+1)) +typedef struct QEMU_PACKED VHDXMetadataTableHeader { + uint64_t signature; /* "metadata" in ASCII */ + uint16_t reserved; + uint16_t entry_count; /* number table entries. <= 2047 */ + uint32_t reserved2[5]; +} VHDXMetadataTableHeader; + +#define VHDX_META_FLAGS_IS_USER 0x01 /* max 1024 entries */ +#define VHDX_META_FLAGS_IS_VIRTUAL_DISK 0x02 /* virtual disk metadata if set, + otherwise file metdata */ +#define VHDX_META_FLAGS_IS_REQUIRED 0x04 /* parse must understand this + entry to open the file */ +typedef struct QEMU_PACKED VHDXMetadataTableEntry { + MSGUID item_id; /* 128-bit identifier for metadata */ + uint32_t offset; /* byte offset of the metadata. At + least 64kB. Relative to start of + metadata region */ + /* note: if length = 0, so is offset */ + uint32_t length; /* length of metadata. <= 1MB. */ + uint32_t data_bits; /* least-significant 3 bits are flags, the + rest are reserved (see above) */ + uint32_t reserved2; +} VHDXMetadataTableEntry; + +#define VHDX_PARAMS_LEAVE_BLOCKS_ALLOCED 0x01 /* Do not change any blocks to + be BLOCK_NOT_PRESENT. 
+ If set indicates a fixed + size VHDX file */ +#define VHDX_PARAMS_HAS_PARENT 0x02 /* has parent / backing file */ +typedef struct QEMU_PACKED VHDXFileParameters { + uint32_t block_size; /* size of each payload block, always + power of 2, <= 256MB and >= 1MB. */ + uint32_t data_bits; /* least-significant 2 bits are flags, the rest + are reserved (see above) */ +} VHDXFileParameters; + +typedef struct QEMU_PACKED VHDXVirtualDiskSize { + uint64_t virtual_disk_size; /* Size of the virtual disk, in bytes. + Must be multiple of the sector size, + max of 64TB */ +} VHDXVirtualDiskSize; + +typedef struct QEMU_PACKED VHDXPage83Data { + MSGUID page_83_data[16]; /* unique id for scsi devices that + support page 0x83 */ +} VHDXPage83Data; + +typedef struct QEMU_PACKED VHDXVirtualDiskLogicalSectorSize { + uint32_t logical_sector_size; /* virtual disk sector size (in bytes). + Can only be 512 or 4096 bytes */ +} VHDXVirtualDiskLogicalSectorSize; + +typedef struct QEMU_PACKED VHDXVirtualDiskPhysicalSectorSize { + uint32_t physical_sector_size; /* physical sector size (in bytes). + Can only be 512 or 4096 bytes */ +} VHDXVirtualDiskPhysicalSectorSize; + +typedef struct QEMU_PACKED VHDXParentLocatorHeader { + MSGUID locator_type[16]; /* type of the parent virtual disk. 
*/ + uint16_t reserved; + uint16_t key_value_count; /* number of key/value pairs for this + locator */ +} VHDXParentLocatorHeader; + +/* key and value strings are UNICODE strings, UTF-16 LE encoding, no NULs */ +typedef struct QEMU_PACKED VHDXParentLocatorEntry { + uint32_t key_offset; /* offset in metadata for key, > 0 */ + uint32_t value_offset; /* offset in metadata for value, >0 */ + uint16_t key_length; /* length of entry key, > 0 */ + uint16_t value_length; /* length of entry value, > 0 */ +} VHDXParentLocatorEntry; + + +/* ----- END VHDX SPECIFICATION STRUCTURES ---- */ + + +uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size, + int crc_offset); + +bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset); + + +static void leguid_to_cpus(MSGUID *guid) +{ + le32_to_cpus(&guid->data1); + le16_to_cpus(&guid->data2); + le16_to_cpus(&guid->data3); +} + +#endif diff --git a/block/vmdk.c b/block/vmdk.c index 51398c0c0..346bb5cad 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -24,19 +24,33 @@ */ #include "qemu-common.h" -#include "block_int.h" -#include "module.h" -#include "migration.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "migration/migration.h" #include <zlib.h> #define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D') #define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V') #define VMDK4_COMPRESSION_DEFLATE 1 +#define VMDK4_FLAG_NL_DETECT (1 << 0) #define VMDK4_FLAG_RGD (1 << 1) +/* Zeroed-grain enable bit */ +#define VMDK4_FLAG_ZERO_GRAIN (1 << 2) #define VMDK4_FLAG_COMPRESS (1 << 16) #define VMDK4_FLAG_MARKER (1 << 17) #define VMDK4_GD_AT_END 0xffffffffffffffffULL +#define VMDK_GTE_ZEROED 0x1 + +/* VMDK internal error codes */ +#define VMDK_OK 0 +#define VMDK_ERROR (-1) +/* Cluster not allocated */ +#define VMDK_UNALLOC (-2) +#define VMDK_ZEROED (-3) + +#define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain" + typedef struct { uint32_t version; uint32_t flags; @@ -48,19 +62,20 @@ typedef 
struct { uint32_t cylinders; uint32_t heads; uint32_t sectors_per_track; -} VMDK3Header; +} QEMU_PACKED VMDK3Header; typedef struct { uint32_t version; uint32_t flags; - int64_t capacity; - int64_t granularity; - int64_t desc_offset; - int64_t desc_size; - int32_t num_gtes_per_gte; - int64_t rgd_offset; - int64_t gd_offset; - int64_t grain_offset; + uint64_t capacity; + uint64_t granularity; + uint64_t desc_offset; + uint64_t desc_size; + /* Number of GrainTableEntries per GrainTable */ + uint32_t num_gtes_per_gt; + uint64_t rgd_offset; + uint64_t gd_offset; + uint64_t grain_offset; char filler[1]; char check_bytes[4]; uint16_t compressAlgorithm; @@ -73,6 +88,8 @@ typedef struct VmdkExtent { bool flat; bool compressed; bool has_marker; + bool has_zero_grain; + int version; int64_t sectors; int64_t end_sector; int64_t flat_start_offset; @@ -93,7 +110,7 @@ typedef struct VmdkExtent { typedef struct BDRVVmdkState { CoMutex lock; - int desc_offset; + uint64_t desc_offset; bool cid_updated; uint32_t parent_cid; int num_extents; @@ -108,13 +125,14 @@ typedef struct VmdkMetaData { unsigned int l2_index; unsigned int l2_offset; int valid; + uint32_t *l2_cache_entry; } VmdkMetaData; typedef struct VmdkGrainMarker { uint64_t lba; uint32_t size; uint8_t data[0]; -} VmdkGrainMarker; +} QEMU_PACKED VmdkGrainMarker; enum { MARKER_END_OF_STREAM = 0, @@ -368,15 +386,22 @@ static int vmdk_parent_open(BlockDriverState *bs) /* Create and append extent to the extent array. Return the added VmdkExtent * address. return NULL if allocation failed. 
*/ -static VmdkExtent *vmdk_add_extent(BlockDriverState *bs, +static int vmdk_add_extent(BlockDriverState *bs, BlockDriverState *file, bool flat, int64_t sectors, int64_t l1_offset, int64_t l1_backup_offset, uint32_t l1_size, - int l2_size, unsigned int cluster_sectors) + int l2_size, uint64_t cluster_sectors, + VmdkExtent **new_extent) { VmdkExtent *extent; BDRVVmdkState *s = bs->opaque; + if (cluster_sectors > 0x200000) { + /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */ + error_report("invalid granularity, image may be corrupt"); + return -EINVAL; + } + s->extents = g_realloc(s->extents, (s->num_extents + 1) * sizeof(VmdkExtent)); extent = &s->extents[s->num_extents]; @@ -399,7 +424,10 @@ static VmdkExtent *vmdk_add_extent(BlockDriverState *bs, extent->end_sector = extent->sectors; } bs->total_sectors = extent->end_sector; - return extent; + if (new_extent) { + *new_extent = extent; + } + return 0; } static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent) @@ -458,12 +486,17 @@ static int vmdk_open_vmdk3(BlockDriverState *bs, if (ret < 0) { return ret; } - extent = vmdk_add_extent(bs, + + ret = vmdk_add_extent(bs, bs->file, false, le32_to_cpu(header.disk_sectors), le32_to_cpu(header.l1dir_offset) << 9, 0, 1 << 6, 1 << 9, - le32_to_cpu(header.granularity)); + le32_to_cpu(header.granularity), + &extent); + if (ret < 0) { + return ret; + } ret = vmdk_init_tables(bs, extent); if (ret) { /* free extent allocated by vmdk_add_extent */ @@ -473,7 +506,7 @@ static int vmdk_open_vmdk3(BlockDriverState *bs, } static int vmdk_open_desc_file(BlockDriverState *bs, int flags, - int64_t desc_offset); + uint64_t desc_offset); static int vmdk_open_vmdk4(BlockDriverState *bs, BlockDriverState *file, @@ -490,8 +523,11 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, if (ret < 0) { return ret; } - if (header.capacity == 0 && header.desc_offset) { - return vmdk_open_desc_file(bs, flags, header.desc_offset << 9); + if (header.capacity == 0) { + uint64_t 
desc_offset = le64_to_cpu(header.desc_offset); + if (desc_offset) { + return vmdk_open_desc_file(bs, flags, desc_offset << 9); + } } if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) { @@ -541,26 +577,54 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, header = footer.header; } - l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gte) + if (le32_to_cpu(header.version) >= 3) { + char buf[64]; + snprintf(buf, sizeof(buf), "VMDK version %d", + le32_to_cpu(header.version)); + qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bs->device_name, "vmdk", buf); + return -ENOTSUP; + } + + if (le32_to_cpu(header.num_gtes_per_gt) > 512) { + error_report("L2 table size too big"); + return -EINVAL; + } + + l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gt) * le64_to_cpu(header.granularity); if (l1_entry_sectors == 0) { return -EINVAL; } l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1) / l1_entry_sectors; + if (l1_size > 512 * 1024 * 1024) { + /* although with big capacity and small l1_entry_sectors, we can get a + * big l1_size, we don't want unbounded value to allocate the table. 
+ * Limit it to 512M, which is 16PB for default cluster and L2 table + * size */ + error_report("L1 size too big"); + return -EFBIG; + } if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) { l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9; } - extent = vmdk_add_extent(bs, file, false, + ret = vmdk_add_extent(bs, file, false, le64_to_cpu(header.capacity), le64_to_cpu(header.gd_offset) << 9, l1_backup_offset, l1_size, - le32_to_cpu(header.num_gtes_per_gte), - le64_to_cpu(header.granularity)); + le32_to_cpu(header.num_gtes_per_gt), + le64_to_cpu(header.granularity), + &extent); + if (ret < 0) { + return ret; + } extent->compressed = le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE; extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER; + extent->version = le32_to_cpu(header.version); + extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN; ret = vmdk_init_tables(bs, extent); if (ret) { /* free extent allocated by vmdk_add_extent */ @@ -578,22 +642,22 @@ static int vmdk_parse_description(const char *desc, const char *opt_name, opt_pos = strstr(desc, opt_name); if (!opt_pos) { - return -1; + return VMDK_ERROR; } /* Skip "=\"" following opt_name */ opt_pos += strlen(opt_name) + 2; if (opt_pos >= end) { - return -1; + return VMDK_ERROR; } opt_end = opt_pos; while (opt_end < end && *opt_end != '"') { opt_end++; } if (opt_end == end || buf_size < opt_end - opt_pos + 1) { - return -1; + return VMDK_ERROR; } pstrcpy(buf, opt_end - opt_pos + 1, opt_pos); - return 0; + return VMDK_OK; } /* Open an extent file and append to bs array */ @@ -616,7 +680,7 @@ static int vmdk_open_sparse(BlockDriverState *bs, return vmdk_open_vmdk4(bs, file, flags); break; default: - return -EINVAL; + return -EMEDIUMTYPE; break; } } @@ -641,7 +705,7 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, * RW [size in sectors] SPARSE "file-name.vmdk" */ flat_offset = -1; - ret = sscanf(p, "%10s %" SCNd64 " %10s %511s %" SCNd64, + 
ret = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64, access, §ors, type, fname, &flat_offset); if (ret < 4 || strcmp(access, "RW")) { goto next_line; @@ -653,14 +717,6 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, return -EINVAL; } - /* trim the quotation marks around */ - if (fname[0] == '"') { - memmove(fname, fname + 1, strlen(fname)); - if (strlen(fname) <= 1 || fname[strlen(fname) - 1] != '"') { - return -EINVAL; - } - fname[strlen(fname) - 1] = '\0'; - } if (sectors <= 0 || (strcmp(type, "FLAT") && strcmp(type, "SPARSE")) || (strcmp(access, "RW"))) { @@ -669,7 +725,7 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, path_combine(extent_path, sizeof(extent_path), desc_file_path, fname); - ret = bdrv_file_open(&extent_file, extent_path, bs->open_flags); + ret = bdrv_file_open(&extent_file, extent_path, NULL, bs->open_flags); if (ret) { return ret; } @@ -679,8 +735,11 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, /* FLAT extent */ VmdkExtent *extent; - extent = vmdk_add_extent(bs, extent_file, true, sectors, - 0, 0, 0, 0, sectors); + ret = vmdk_add_extent(bs, extent_file, true, sectors, + 0, 0, 0, 0, sectors, &extent); + if (ret < 0) { + return ret; + } extent->flat_start_offset = flat_offset << 9; } else if (!strcmp(type, "SPARSE")) { /* SPARSE extent */ @@ -705,33 +764,46 @@ next_line: } static int vmdk_open_desc_file(BlockDriverState *bs, int flags, - int64_t desc_offset) + uint64_t desc_offset) { int ret; - char buf[2048]; + char *buf = NULL; char ct[128]; BDRVVmdkState *s = bs->opaque; + int64_t size; - ret = bdrv_pread(bs->file, desc_offset, buf, sizeof(buf)); + size = bdrv_getlength(bs->file); + if (size < 0) { + return -EINVAL; + } + + size = MIN(size, 1 << 20); /* avoid unbounded allocation */ + buf = g_malloc0(size + 1); + + ret = bdrv_pread(bs->file, desc_offset, buf, size); if (ret < 0) { - return ret; + goto exit; } - buf[2047] = '\0'; if 
(vmdk_parse_description(buf, "createType", ct, sizeof(ct))) { - return -EINVAL; + ret = -EMEDIUMTYPE; + goto exit; } if (strcmp(ct, "monolithicFlat") && strcmp(ct, "twoGbMaxExtentSparse") && strcmp(ct, "twoGbMaxExtentFlat")) { fprintf(stderr, "VMDK: Not supported image type \"%s\""".\n", ct); - return -ENOTSUP; + ret = -ENOTSUP; + goto exit; } s->desc_offset = 0; - return vmdk_parse_extents(buf, bs, bs->file->filename); + ret = vmdk_parse_extents(buf, bs, bs->file->filename); +exit: + g_free(buf); + return ret; } -static int vmdk_open(BlockDriverState *bs, int flags) +static int vmdk_open(BlockDriverState *bs, QDict *options, int flags) { int ret; BDRVVmdkState *s = bs->opaque; @@ -771,16 +843,17 @@ static int get_whole_cluster(BlockDriverState *bs, uint64_t offset, bool allocate) { - /* 128 sectors * 512 bytes each = grain size 64KB */ - uint8_t whole_grain[extent->cluster_sectors * 512]; + int ret = VMDK_OK; + uint8_t *whole_grain = NULL; /* we will be here if it's first write on non-exist grain(cluster). 
* try to read from parent image, if exist */ if (bs->backing_hd) { - int ret; - + whole_grain = + qemu_blockalign(bs, extent->cluster_sectors << BDRV_SECTOR_BITS); if (!vmdk_is_cid_valid(bs)) { - return -1; + ret = VMDK_ERROR; + goto exit; } /* floor offset to cluster */ @@ -788,30 +861,35 @@ static int get_whole_cluster(BlockDriverState *bs, ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain, extent->cluster_sectors); if (ret < 0) { - return -1; + ret = VMDK_ERROR; + goto exit; } /* Write grain only into the active image */ ret = bdrv_write(extent->file, cluster_offset, whole_grain, extent->cluster_sectors); if (ret < 0) { - return -1; + ret = VMDK_ERROR; + goto exit; } } - return 0; +exit: + qemu_vfree(whole_grain); + return ret; } static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data) { + uint32_t offset; + QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset)); + offset = cpu_to_le32(m_data->offset); /* update L2 table */ if (bdrv_pwrite_sync( extent->file, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(m_data->offset)), - &(m_data->offset), - sizeof(m_data->offset) - ) < 0) { - return -1; + &offset, sizeof(offset)) < 0) { + return VMDK_ERROR; } /* update backup L2 table */ if (extent->l1_backup_table_offset != 0) { @@ -820,13 +898,15 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data) extent->file, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(m_data->offset)), - &(m_data->offset), sizeof(m_data->offset) - ) < 0) { - return -1; + &offset, sizeof(offset)) < 0) { + return VMDK_ERROR; } } + if (m_data->l2_cache_entry) { + *m_data->l2_cache_entry = offset; + } - return 0; + return VMDK_OK; } static int get_cluster_offset(BlockDriverState *bs, @@ -838,24 +918,25 @@ static int get_cluster_offset(BlockDriverState *bs, { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; - uint32_t min_count, *l2_table, tmp = 0; + uint32_t min_count, *l2_table; + bool zeroed = false; if (m_data) 
{ m_data->valid = 0; } if (extent->flat) { *cluster_offset = extent->flat_start_offset; - return 0; + return VMDK_OK; } offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; l1_index = (offset >> 9) / extent->l1_entry_sectors; if (l1_index >= extent->l1_size) { - return -1; + return VMDK_ERROR; } l2_offset = extent->l1_table[l1_index]; if (!l2_offset) { - return -1; + return VMDK_UNALLOC; } for (i = 0; i < L2_CACHE_SIZE; i++) { if (l2_offset == extent->l2_cache_offsets[i]) { @@ -885,7 +966,7 @@ static int get_cluster_offset(BlockDriverState *bs, l2_table, extent->l2_size * sizeof(uint32_t) ) != extent->l2_size * sizeof(uint32_t)) { - return -1; + return VMDK_ERROR; } extent->l2_cache_offsets[min_index] = l2_offset; @@ -894,9 +975,21 @@ static int get_cluster_offset(BlockDriverState *bs, l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; *cluster_offset = le32_to_cpu(l2_table[l2_index]); - if (!*cluster_offset) { + if (m_data) { + m_data->valid = 1; + m_data->l1_index = l1_index; + m_data->l2_index = l2_index; + m_data->offset = *cluster_offset; + m_data->l2_offset = l2_offset; + m_data->l2_cache_entry = &l2_table[l2_index]; + } + if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) { + zeroed = true; + } + + if (!*cluster_offset || zeroed) { if (!allocate) { - return -1; + return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; } /* Avoid the L2 tables update for the images that have snapshots. */ @@ -909,8 +1002,7 @@ static int get_cluster_offset(BlockDriverState *bs, } *cluster_offset >>= 9; - tmp = cpu_to_le32(*cluster_offset); - l2_table[l2_index] = tmp; + l2_table[l2_index] = cpu_to_le32(*cluster_offset); /* First of all we write grain itself, to avoid race condition * that may to corrupt the image. 
@@ -919,19 +1011,15 @@ static int get_cluster_offset(BlockDriverState *bs, */ if (get_whole_cluster( bs, extent, *cluster_offset, offset, allocate) == -1) { - return -1; + return VMDK_ERROR; } if (m_data) { - m_data->offset = tmp; - m_data->l1_index = l1_index; - m_data->l2_index = l2_index; - m_data->l2_offset = l2_offset; - m_data->valid = 1; + m_data->offset = *cluster_offset; } } *cluster_offset <<= 9; - return 0; + return VMDK_OK; } static VmdkExtent *find_extent(BDRVVmdkState *s, @@ -967,8 +1055,8 @@ static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs, ret = get_cluster_offset(bs, extent, NULL, sector_num * 512, 0, &offset); qemu_co_mutex_unlock(&s->lock); - /* get_cluster_offset returning 0 means success */ - ret = !ret; + + ret = (ret == VMDK_OK || ret == VMDK_ZEROED); index_in_cluster = sector_num % extent->cluster_sectors; n = extent->cluster_sectors - index_in_cluster; @@ -1111,9 +1199,9 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num, if (n > nb_sectors) { n = nb_sectors; } - if (ret) { + if (ret != VMDK_OK) { /* if not allocated, try to read from parent image, if exist */ - if (bs->backing_hd) { + if (bs->backing_hd && ret != VMDK_ZEROED) { if (!vmdk_is_cid_valid(bs)) { return -EINVAL; } @@ -1150,8 +1238,19 @@ static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num, return ret; } +/** + * vmdk_write: + * @zeroed: buf is ignored (data is zero), use zeroed_grain GTE feature + * if possible, otherwise return -ENOTSUP. + * @zero_dry_run: used for zeroed == true only, don't update L2 table, just try + * with each cluster. By dry run we can find if the zero write + * is possible without modifying image data. + * + * Returns: error code with 0 for success. 
+ */ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) + const uint8_t *buf, int nb_sectors, + bool zeroed, bool zero_dry_run) { BDRVVmdkState *s = bs->opaque; VmdkExtent *extent = NULL; @@ -1181,7 +1280,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, sector_num << 9, !extent->compressed, &cluster_offset); if (extent->compressed) { - if (ret == 0) { + if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ fprintf(stderr, "VMDK: can't write to allocated cluster" @@ -1197,7 +1296,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, &cluster_offset); } } - if (ret) { + if (ret == VMDK_ERROR) { return -EINVAL; } extent_begin_sector = extent->end_sector - extent->sectors; @@ -1207,17 +1306,34 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, if (n > nb_sectors) { n = nb_sectors; } - - ret = vmdk_write_extent(extent, - cluster_offset, index_in_cluster * 512, - buf, n, sector_num); - if (ret) { - return ret; - } - if (m_data.valid) { - /* update L2 tables */ - if (vmdk_L2update(extent, &m_data) == -1) { - return -EIO; + if (zeroed) { + /* Do zeroed write, buf is ignored */ + if (extent->has_zero_grain && + index_in_cluster == 0 && + n >= extent->cluster_sectors) { + n = extent->cluster_sectors; + if (!zero_dry_run) { + m_data.offset = VMDK_GTE_ZEROED; + /* update L2 tables */ + if (vmdk_L2update(extent, &m_data) != VMDK_OK) { + return -EIO; + } + } + } else { + return -ENOTSUP; + } + } else { + ret = vmdk_write_extent(extent, + cluster_offset, index_in_cluster * 512, + buf, n, sector_num); + if (ret) { + return ret; + } + if (m_data.valid) { + /* update L2 tables */ + if (vmdk_L2update(extent, &m_data) != VMDK_OK) { + return -EIO; + } } } nb_sectors -= n; @@ -1243,14 +1359,31 @@ static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num, int ret; BDRVVmdkState *s = bs->opaque; qemu_co_mutex_lock(&s->lock); - ret = 
vmdk_write(bs, sector_num, buf, nb_sectors); + ret = vmdk_write(bs, sector_num, buf, nb_sectors, false, false); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors) +{ + int ret; + BDRVVmdkState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + /* write zeroes could fail if sectors not aligned to cluster, test it with + * dry_run == true before really updating image */ + ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true); + if (!ret) { + ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false); + } qemu_co_mutex_unlock(&s->lock); return ret; } static int vmdk_create_extent(const char *filename, int64_t filesize, - bool flat, bool compress) + bool flat, bool compress, bool zeroed_grain) { int ret, i; int fd = 0; @@ -1272,18 +1405,19 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, } magic = cpu_to_be32(VMDK4_MAGIC); memset(&header, 0, sizeof(header)); - header.version = 1; - header.flags = - 3 | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0); + header.version = zeroed_grain ? 2 : 1; + header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT + | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0) + | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0); header.compressAlgorithm = compress ? 
VMDK4_COMPRESSION_DEFLATE : 0; header.capacity = filesize / 512; header.granularity = 128; - header.num_gtes_per_gte = 512; + header.num_gtes_per_gt = 512; grains = (filesize / 512 + header.granularity - 1) / header.granularity; - gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9; + gt_size = ((header.num_gtes_per_gt * sizeof(uint32_t)) + 511) >> 9; gt_count = - (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte; + (grains + header.num_gtes_per_gt - 1) / header.num_gtes_per_gt; gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9; header.desc_offset = 1; @@ -1299,7 +1433,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, header.flags = cpu_to_le32(header.flags); header.capacity = cpu_to_le64(header.capacity); header.granularity = cpu_to_le64(header.granularity); - header.num_gtes_per_gte = cpu_to_le32(header.num_gtes_per_gte); + header.num_gtes_per_gt = cpu_to_le32(header.num_gtes_per_gt); header.desc_offset = cpu_to_le64(header.desc_offset); header.desc_size = cpu_to_le64(header.desc_size); header.rgd_offset = cpu_to_le64(header.rgd_offset); @@ -1365,7 +1499,7 @@ static int filename_decompose(const char *filename, char *path, char *prefix, if (filename == NULL || !strlen(filename)) { fprintf(stderr, "Vmdk: no filename provided.\n"); - return -1; + return VMDK_ERROR; } p = strrchr(filename, '/'); if (p == NULL) { @@ -1377,7 +1511,7 @@ static int filename_decompose(const char *filename, char *path, char *prefix, if (p != NULL) { p++; if (p - filename >= buf_len) { - return -1; + return VMDK_ERROR; } pstrcpy(path, p - filename + 1, filename); } else { @@ -1390,51 +1524,12 @@ static int filename_decompose(const char *filename, char *path, char *prefix, postfix[0] = '\0'; } else { if (q - p >= buf_len) { - return -1; + return VMDK_ERROR; } pstrcpy(prefix, q - p + 1, p); pstrcpy(postfix, buf_len, q); } - return 0; -} - -static int relative_path(char *dest, int dest_size, - const char *base, const char *target) -{ - 
int i = 0; - int n = 0; - const char *p, *q; -#ifdef _WIN32 - const char *sep = "\\"; -#else - const char *sep = "/"; -#endif - - if (!(dest && base && target)) { - return -1; - } - if (path_is_absolute(target)) { - pstrcpy(dest, dest_size, target); - return 0; - } - while (base[i] == target[i]) { - i++; - } - p = &base[i]; - q = &target[i]; - while (*p) { - if (*p == *sep) { - n++; - } - p++; - } - dest[0] = '\0'; - for (; n; n--) { - pstrcat(dest, dest_size, ".."); - pstrcat(dest, dest_size, sep); - } - pstrcat(dest, dest_size, q); - return 0; + return VMDK_OK; } static int vmdk_create(const char *filename, QEMUOptionParameter *options) @@ -1442,6 +1537,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) int fd, idx = 0; char desc[BUF_SIZE]; int64_t total_size = 0, filesize; + const char *adapter_type = NULL; const char *backing_file = NULL; const char *fmt = NULL; int flags = 0; @@ -1453,6 +1549,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) const char *desc_extent_line; char parent_desc_line[BUF_SIZE] = ""; uint32_t parent_cid = 0xffffffff; + uint32_t number_heads = 16; + bool zeroed_grain = false; const char desc_template[] = "# Disk DescriptorFile\n" "version=1\n" @@ -1469,9 +1567,9 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) "\n" "ddb.virtualHWVersion = \"%d\"\n" "ddb.geometry.cylinders = \"%" PRId64 "\"\n" - "ddb.geometry.heads = \"16\"\n" + "ddb.geometry.heads = \"%d\"\n" "ddb.geometry.sectors = \"63\"\n" - "ddb.adapterType = \"ide\"\n"; + "ddb.adapterType = \"%s\"\n"; if (filename_decompose(filename, path, prefix, postfix, PATH_MAX)) { return -EINVAL; @@ -1480,15 +1578,33 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) while (options && options->name) { if (!strcmp(options->name, BLOCK_OPT_SIZE)) { total_size = options->value.n; + } else if (!strcmp(options->name, BLOCK_OPT_ADAPTER_TYPE)) { + adapter_type = options->value.s; } else 
if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { backing_file = options->value.s; } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) { flags |= options->value.n ? BLOCK_FLAG_COMPAT6 : 0; } else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) { fmt = options->value.s; + } else if (!strcmp(options->name, BLOCK_OPT_ZEROED_GRAIN)) { + zeroed_grain |= options->value.n; } options++; } + if (!adapter_type) { + adapter_type = "ide"; + } else if (strcmp(adapter_type, "ide") && + strcmp(adapter_type, "buslogic") && + strcmp(adapter_type, "lsilogic") && + strcmp(adapter_type, "legacyESX")) { + fprintf(stderr, "VMDK: Unknown adapter type: '%s'.\n", adapter_type); + return -EINVAL; + } + if (strcmp(adapter_type, "ide") != 0) { + /* that's the number of heads with which vmware operates when + creating, exporting, etc. vmdk files with a non-ide adapter type */ + number_heads = 255; + } if (!fmt) { /* Default format to monolithicSparse */ fmt = "monolithicSparse"; @@ -1515,9 +1631,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) return -ENOTSUP; } if (backing_file) { - char parent_filename[PATH_MAX]; BlockDriverState *bs = bdrv_new(""); - ret = bdrv_open(bs, backing_file, 0, NULL); + ret = bdrv_open(bs, backing_file, NULL, 0, NULL); if (ret != 0) { bdrv_delete(bs); return ret; @@ -1528,10 +1643,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) } parent_cid = vmdk_read_cid(bs, 0); bdrv_delete(bs); - relative_path(parent_filename, sizeof(parent_filename), - filename, backing_file); snprintf(parent_desc_line, sizeof(parent_desc_line), - "parentFileNameHint=\"%s\"", parent_filename); + "parentFileNameHint=\"%s\"", backing_file); } /* Create extents */ @@ -1558,7 +1671,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) snprintf(ext_filename, sizeof(ext_filename), "%s%s", path, desc_filename); - if (vmdk_create_extent(ext_filename, size, flat, compress)) { + if 
(vmdk_create_extent(ext_filename, size, + flat, compress, zeroed_grain)) { return -EINVAL; } filesize -= size; @@ -1576,7 +1690,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) parent_desc_line, ext_desc_lines, (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4), - total_size / (int64_t)(63 * 16 * 512)); + total_size / (int64_t)(63 * number_heads * 512), number_heads, + adapter_type); if (split || flat) { fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, @@ -1654,6 +1769,23 @@ static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs) return ret; } +static int vmdk_has_zero_init(BlockDriverState *bs) +{ + int i; + BDRVVmdkState *s = bs->opaque; + + /* If has a flat extent and its underlying storage doesn't have zero init, + * return 0. */ + for (i = 0; i < s->num_extents; i++) { + if (s->extents[i].flat) { + if (!bdrv_has_zero_init(s->extents[i].file)) { + return 0; + } + } + } + return 1; +} + static QEMUOptionParameter vmdk_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -1661,6 +1793,12 @@ static QEMUOptionParameter vmdk_create_options[] = { .help = "Virtual disk size" }, { + .name = BLOCK_OPT_ADAPTER_TYPE, + .type = OPT_STRING, + .help = "Virtual adapter type, can be one of " + "ide (default), lsilogic, buslogic or legacyESX" + }, + { .name = BLOCK_OPT_BACKING_FILE, .type = OPT_STRING, .help = "File name of a base image" @@ -1677,24 +1815,31 @@ static QEMUOptionParameter vmdk_create_options[] = { "VMDK flat extent format, can be one of " "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} " }, + { + .name = BLOCK_OPT_ZEROED_GRAIN, + .type = OPT_FLAG, + .help = "Enable efficient zero writes using the zeroed-grain GTE feature" + }, { NULL } }; static BlockDriver bdrv_vmdk = { - .format_name = "vmdk", - .instance_size = sizeof(BDRVVmdkState), - .bdrv_probe = vmdk_probe, - .bdrv_open = vmdk_open, - .bdrv_reopen_prepare = vmdk_reopen_prepare, - .bdrv_read 
= vmdk_co_read, - .bdrv_write = vmdk_co_write, - .bdrv_close = vmdk_close, - .bdrv_create = vmdk_create, - .bdrv_co_flush_to_disk = vmdk_co_flush, - .bdrv_co_is_allocated = vmdk_co_is_allocated, - .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size, - - .create_options = vmdk_create_options, + .format_name = "vmdk", + .instance_size = sizeof(BDRVVmdkState), + .bdrv_probe = vmdk_probe, + .bdrv_open = vmdk_open, + .bdrv_reopen_prepare = vmdk_reopen_prepare, + .bdrv_read = vmdk_co_read, + .bdrv_write = vmdk_co_write, + .bdrv_co_write_zeroes = vmdk_co_write_zeroes, + .bdrv_close = vmdk_close, + .bdrv_create = vmdk_create, + .bdrv_co_flush_to_disk = vmdk_co_flush, + .bdrv_co_is_allocated = vmdk_co_is_allocated, + .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size, + .bdrv_has_zero_init = vmdk_has_zero_init, + + .create_options = vmdk_create_options, }; static void bdrv_vmdk_init(void) diff --git a/block/vpc.c b/block/vpc.c index b6bf52f14..fe4f311d5 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -23,9 +23,12 @@ * THE SOFTWARE. 
*/ #include "qemu-common.h" -#include "block_int.h" -#include "module.h" -#include "migration.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "migration/migration.h" +#if defined(CONFIG_UUID) +#include <uuid/uuid.h> +#endif /**************************************************************/ @@ -152,7 +155,7 @@ static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int vpc_open(BlockDriverState *bs, int flags) +static int vpc_open(BlockDriverState *bs, QDict *options, int flags) { BDRVVPCState *s = bs->opaque; int i; @@ -160,24 +163,33 @@ static int vpc_open(BlockDriverState *bs, int flags) struct vhd_dyndisk_header* dyndisk_header; uint8_t buf[HEADER_SIZE]; uint32_t checksum; - int err = -1; int disk_type = VHD_DYNAMIC; + int ret; - if (bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE) != HEADER_SIZE) + ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE); + if (ret < 0) { goto fail; + } footer = (struct vhd_footer*) s->footer_buf; if (strncmp(footer->creator, "conectix", 8)) { int64_t offset = bdrv_getlength(bs->file); - if (offset < HEADER_SIZE) { + if (offset < 0) { + ret = offset; + goto fail; + } else if (offset < HEADER_SIZE) { + ret = -EINVAL; goto fail; } + /* If a fixed disk, the footer is found only at the end of the file */ - if (bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf, HEADER_SIZE) - != HEADER_SIZE) { + ret = bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf, + HEADER_SIZE); + if (ret < 0) { goto fail; } if (strncmp(footer->creator, "conectix", 8)) { + ret = -EMEDIUMTYPE; goto fail; } disk_type = VHD_FIXED; @@ -198,20 +210,23 @@ static int vpc_open(BlockDriverState *bs, int flags) bs->total_sectors = (int64_t) be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl; - if (bs->total_sectors >= 65535 * 16 * 255) { - err = -EFBIG; + /* Allow a maximum disk size of approximately 2 TB */ + if (bs->total_sectors >= 65535LL * 255 * 255) { + ret = -EFBIG; goto fail; 
} if (disk_type == VHD_DYNAMIC) { - if (bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf, - HEADER_SIZE) != HEADER_SIZE) { + ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf, + HEADER_SIZE); + if (ret < 0) { goto fail; } dyndisk_header = (struct vhd_dyndisk_header *) buf; if (strncmp(dyndisk_header->magic, "cxsparse", 8)) { + ret = -EINVAL; goto fail; } @@ -222,8 +237,10 @@ static int vpc_open(BlockDriverState *bs, int flags) s->pagetable = g_malloc(s->max_table_entries * 4); s->bat_offset = be64_to_cpu(dyndisk_header->table_offset); - if (bdrv_pread(bs->file, s->bat_offset, s->pagetable, - s->max_table_entries * 4) != s->max_table_entries * 4) { + + ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable, + s->max_table_entries * 4); + if (ret < 0) { goto fail; } @@ -261,8 +278,13 @@ static int vpc_open(BlockDriverState *bs, int flags) migrate_add_blocker(s->migration_blocker); return 0; - fail: - return err; + +fail: + g_free(s->pagetable); +#ifdef CACHE + g_free(s->pageentry_u8); +#endif + return ret; } static int vpc_reopen_prepare(BDRVReopenState *state, @@ -524,19 +546,27 @@ static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num, * Note that the geometry doesn't always exactly match total_sectors but * may round it down. * - * Returns 0 on success, -EFBIG if the size is larger than 127 GB + * Returns 0 on success, -EFBIG if the size is larger than ~2 TB. Override + * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB) + * and instead allow up to 255 heads. 
*/ static int calculate_geometry(int64_t total_sectors, uint16_t* cyls, uint8_t* heads, uint8_t* secs_per_cyl) { uint32_t cyls_times_heads; - if (total_sectors > 65535 * 16 * 255) + /* Allow a maximum disk size of approximately 2 TB */ + if (total_sectors > 65535LL * 255 * 255) { return -EFBIG; + } if (total_sectors > 65535 * 16 * 63) { *secs_per_cyl = 255; - *heads = 16; + if (total_sectors > 65535 * 16 * 255) { + *heads = 255; + } else { + *heads = 16; + } cyls_times_heads = total_sectors / *secs_per_cyl; } else { *secs_per_cyl = 17; @@ -739,7 +769,9 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options) footer->type = be32_to_cpu(disk_type); - /* TODO uuid is missing */ +#if defined(CONFIG_UUID) + uuid_generate(footer->uuid); +#endif footer->checksum = be32_to_cpu(vpc_checksum(buf, HEADER_SIZE)); @@ -754,6 +786,18 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options) return ret; } +static int vpc_has_zero_init(BlockDriverState *bs) +{ + BDRVVPCState *s = bs->opaque; + struct vhd_footer *footer = (struct vhd_footer *) s->footer_buf; + + if (cpu_to_be32(footer->type) == VHD_FIXED) { + return bdrv_has_zero_init(bs->file); + } else { + return 1; + } +} + static void vpc_close(BlockDriverState *bs) { BDRVVPCState *s = bs->opaque; @@ -786,16 +830,17 @@ static BlockDriver bdrv_vpc = { .format_name = "vpc", .instance_size = sizeof(BDRVVPCState), - .bdrv_probe = vpc_probe, - .bdrv_open = vpc_open, - .bdrv_close = vpc_close, - .bdrv_reopen_prepare = vpc_reopen_prepare, - .bdrv_create = vpc_create, + .bdrv_probe = vpc_probe, + .bdrv_open = vpc_open, + .bdrv_close = vpc_close, + .bdrv_reopen_prepare = vpc_reopen_prepare, + .bdrv_create = vpc_create, .bdrv_read = vpc_co_read, .bdrv_write = vpc_co_write, - .create_options = vpc_create_options, + .create_options = vpc_create_options, + .bdrv_has_zero_init = vpc_has_zero_init, }; static void bdrv_vpc_init(void) diff --git a/block/vvfat.c b/block/vvfat.c index 59d3c5b8a..cd3b8edd9 
100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -1,4 +1,4 @@ -/* vim:set shiftwidth=4 ts=8: */ +/* vim:set shiftwidth=4 ts=4: */ /* * QEMU Block driver for virtual VFAT (shadows a local directory) * @@ -25,9 +25,11 @@ #include <sys/stat.h> #include <dirent.h> #include "qemu-common.h" -#include "block_int.h" -#include "module.h" -#include "migration.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "migration/migration.h" +#include "qapi/qmp/qint.h" +#include "qapi/qmp/qbool.h" #ifndef S_IWGRP #define S_IWGRP 0 @@ -529,13 +531,9 @@ static inline uint8_t fat_chksum(const direntry_t* entry) /* if return_time==0, this returns the fat_date, else the fat_time */ static uint16_t fat_datetime(time_t time,int return_time) { struct tm* t; -#ifdef _WIN32 - t=localtime(&time); /* this is not thread safe */ -#else struct tm t1; t = &t1; localtime_r(&time,t); -#endif if(return_time) return cpu_to_le16((t->tm_sec/2)|(t->tm_min<<5)|(t->tm_hour<<11)); return cpu_to_le16((t->tm_mday)|((t->tm_mon+1)<<5)|((t->tm_year-80)<<9)); @@ -992,10 +990,90 @@ static void vvfat_rebind(BlockDriverState *bs) s->bs = bs; } -static int vvfat_open(BlockDriverState *bs, const char* dirname, int flags) +static QemuOptsList runtime_opts = { + .name = "vvfat", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "dir", + .type = QEMU_OPT_STRING, + .help = "Host directory to map to the vvfat device", + }, + { + .name = "fat-type", + .type = QEMU_OPT_NUMBER, + .help = "FAT type (12, 16 or 32)", + }, + { + .name = "floppy", + .type = QEMU_OPT_BOOL, + .help = "Create a floppy rather than a hard disk image", + }, + { + .name = "rw", + .type = QEMU_OPT_BOOL, + .help = "Make the image writable", + }, + { /* end of list */ } + }, +}; + +static void vvfat_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + int fat_type = 0; + bool floppy = false; + bool rw = false; + int i; + + if (!strstart(filename, "fat:", NULL)) { + error_setg(errp, "File 
name string must start with 'fat:'"); + return; + } + + /* Parse options */ + if (strstr(filename, ":32:")) { + fat_type = 32; + } else if (strstr(filename, ":16:")) { + fat_type = 16; + } else if (strstr(filename, ":12:")) { + fat_type = 12; + } + + if (strstr(filename, ":floppy:")) { + floppy = true; + } + + if (strstr(filename, ":rw:")) { + rw = true; + } + + /* Get the directory name without options */ + i = strrchr(filename, ':') - filename; + assert(i >= 3); + if (filename[i - 2] == ':' && qemu_isalpha(filename[i - 1])) { + /* workaround for DOS drive names */ + filename += i - 1; + } else { + filename += i + 1; + } + + /* Fill in the options QDict */ + qdict_put(options, "dir", qstring_from_str(filename)); + qdict_put(options, "fat-type", qint_from_int(fat_type)); + qdict_put(options, "floppy", qbool_from_int(floppy)); + qdict_put(options, "rw", qbool_from_int(rw)); +} + +static int vvfat_open(BlockDriverState *bs, QDict *options, int flags) { BDRVVVFATState *s = bs->opaque; - int i, cyls, heads, secs; + int cyls, heads, secs; + bool floppy; + const char *dirname; + QemuOpts *opts; + Error *local_err = NULL; + int ret; #ifdef DEBUG vvv = s; @@ -1006,6 +1084,65 @@ DLOG(if (stderr == NULL) { setbuf(stderr, NULL); }) + opts = qemu_opts_create_nofail(&runtime_opts); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + qerror_report_err(local_err); + error_free(local_err); + ret = -EINVAL; + goto fail; + } + + dirname = qemu_opt_get(opts, "dir"); + if (!dirname) { + qerror_report(ERROR_CLASS_GENERIC_ERROR, "vvfat block driver requires " + "a 'dir' option"); + ret = -EINVAL; + goto fail; + } + + s->fat_type = qemu_opt_get_number(opts, "fat-type", 0); + floppy = qemu_opt_get_bool(opts, "floppy", false); + + if (floppy) { + /* 1.44MB or 2.88MB floppy. 2.88MB can be FAT12 (default) or FAT16. */ + if (!s->fat_type) { + s->fat_type = 12; + secs = 36; + s->sectors_per_cluster = 2; + } else { + secs = s->fat_type == 12 ? 
18 : 36; + s->sectors_per_cluster = 1; + } + s->first_sectors_number = 1; + cyls = 80; + heads = 2; + } else { + /* 32MB or 504MB disk*/ + if (!s->fat_type) { + s->fat_type = 16; + } + cyls = s->fat_type == 12 ? 64 : 1024; + heads = 16; + secs = 63; + } + + switch (s->fat_type) { + case 32: + fprintf(stderr, "Big fat greek warning: FAT32 has not been tested. " + "You are welcome to do so!\n"); + break; + case 16: + case 12: + break; + default: + qerror_report(ERROR_CLASS_GENERIC_ERROR, "Valid FAT types are only " + "12, 16 and 32"); + ret = -EINVAL; + goto fail; + } + + s->bs = bs; /* LATER TODO: if FAT32, adjust */ @@ -1021,63 +1158,24 @@ DLOG(if (stderr == NULL) { s->fat2 = NULL; s->downcase_short_names = 1; - if (!strstart(dirname, "fat:", NULL)) - return -1; - - if (strstr(dirname, ":32:")) { - fprintf(stderr, "Big fat greek warning: FAT32 has not been tested. You are welcome to do so!\n"); - s->fat_type = 32; - } else if (strstr(dirname, ":16:")) { - s->fat_type = 16; - } else if (strstr(dirname, ":12:")) { - s->fat_type = 12; - } - - if (strstr(dirname, ":floppy:")) { - /* 1.44MB or 2.88MB floppy. 2.88MB can be FAT12 (default) or FAT16. */ - if (!s->fat_type) { - s->fat_type = 12; - secs = 36; - s->sectors_per_cluster=2; - } else { - secs = s->fat_type == 12 ? 18 : 36; - s->sectors_per_cluster=1; - } - s->first_sectors_number = 1; - cyls = 80; - heads = 2; - } else { - /* 32MB or 504MB disk*/ - if (!s->fat_type) { - s->fat_type = 16; - } - cyls = s->fat_type == 12 ? 
64 : 1024; - heads = 16; - secs = 63; - } fprintf(stderr, "vvfat %s chs %d,%d,%d\n", dirname, cyls, heads, secs); s->sector_count = cyls * heads * secs - (s->first_sectors_number - 1); - if (strstr(dirname, ":rw:")) { - if (enable_write_target(s)) - return -1; - bs->read_only = 0; + if (qemu_opt_get_bool(opts, "rw", false)) { + ret = enable_write_target(s); + if (ret < 0) { + goto fail; + } + bs->read_only = 0; } - i = strrchr(dirname, ':') - dirname; - assert(i >= 3); - if (dirname[i-2] == ':' && qemu_isalpha(dirname[i-1])) - /* workaround for DOS drive names */ - dirname += i-1; - else - dirname += i+1; - bs->total_sectors = cyls * heads * secs; if (init_directories(s, dirname, heads, secs)) { - return -1; + ret = -EIO; + goto fail; } s->sector_count = s->faked_sectors + s->sectors_per_cluster*s->cluster_count; @@ -1097,7 +1195,10 @@ DLOG(if (stderr == NULL) { migrate_add_blocker(s->migration_blocker); } - return 0; + ret = 0; +fail: + qemu_opts_del(opts); + return ret; } static inline void vvfat_close_current_file(BDRVVVFATState *s) @@ -2816,9 +2917,7 @@ static int enable_write_target(BDRVVVFATState *s) s->qcow_filename = g_malloc(1024); ret = get_tmp_filename(s->qcow_filename, 1024); if (ret < 0) { - g_free(s->qcow_filename); - s->qcow_filename = NULL; - return ret; + goto err; } bdrv_qcow = bdrv_find_format("qcow"); @@ -2826,18 +2925,18 @@ static int enable_write_target(BDRVVVFATState *s) set_option_parameter_int(options, BLOCK_OPT_SIZE, s->sector_count * 512); set_option_parameter(options, BLOCK_OPT_BACKING_FILE, "fat:"); - if (bdrv_create(bdrv_qcow, s->qcow_filename, options) < 0) - return -1; + ret = bdrv_create(bdrv_qcow, s->qcow_filename, options); + if (ret < 0) { + goto err; + } s->qcow = bdrv_new(""); - if (s->qcow == NULL) { - return -1; - } - ret = bdrv_open(s->qcow, s->qcow_filename, + ret = bdrv_open(s->qcow, s->qcow_filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow); if (ret < 0) { - return ret; + bdrv_delete(s->qcow); + 
goto err; } #ifndef _WIN32 @@ -2850,6 +2949,11 @@ static int enable_write_target(BDRVVVFATState *s) *(void**)s->bs->backing_hd->opaque = s; return 0; + +err: + g_free(s->qcow_filename); + s->qcow_filename = NULL; + return ret; } static void vvfat_close(BlockDriverState *bs) @@ -2869,15 +2973,18 @@ static void vvfat_close(BlockDriverState *bs) } static BlockDriver bdrv_vvfat = { - .format_name = "vvfat", - .instance_size = sizeof(BDRVVVFATState), - .bdrv_file_open = vvfat_open, - .bdrv_rebind = vvfat_rebind, - .bdrv_read = vvfat_co_read, - .bdrv_write = vvfat_co_write, - .bdrv_close = vvfat_close, - .bdrv_co_is_allocated = vvfat_co_is_allocated, - .protocol_name = "fat", + .format_name = "vvfat", + .protocol_name = "fat", + .instance_size = sizeof(BDRVVVFATState), + + .bdrv_parse_filename = vvfat_parse_filename, + .bdrv_file_open = vvfat_open, + .bdrv_close = vvfat_close, + .bdrv_rebind = vvfat_rebind, + + .bdrv_read = vvfat_co_read, + .bdrv_write = vvfat_co_write, + .bdrv_co_is_allocated = vvfat_co_is_allocated, }; static void bdrv_vvfat_init(void) diff --git a/block/win32-aio.c b/block/win32-aio.c index 4704ee06c..fcb7c754d 100644 --- a/block/win32-aio.c +++ b/block/win32-aio.c @@ -22,13 +22,13 @@ * THE SOFTWARE. 
*/ #include "qemu-common.h" -#include "qemu-timer.h" -#include "block_int.h" -#include "module.h" -#include "qemu-common.h" -#include "qemu-aio.h" +#include "qemu/timer.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "block/aio.h" #include "raw-aio.h" -#include "event_notifier.h" +#include "qemu/event_notifier.h" +#include "qemu/iov.h" #include <windows.h> #include <winioctl.h> @@ -80,15 +80,9 @@ static void win32_aio_process_completion(QEMUWin32AIOState *s, if (!waiocb->is_linear) { if (ret == 0 && waiocb->is_read) { QEMUIOVector *qiov = waiocb->qiov; - char *p = waiocb->buf; - int i; - - for (i = 0; i < qiov->niov; ++i) { - memcpy(p, qiov->iov[i].iov_base, qiov->iov[i].iov_len); - p += qiov->iov[i].iov_len; - } - g_free(waiocb->buf); + iov_from_buf(qiov->iov, qiov->niov, 0, waiocb->buf, qiov->size); } + qemu_vfree(waiocb->buf); } @@ -153,13 +147,7 @@ BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs, if (qiov->niov > 1) { waiocb->buf = qemu_blockalign(bs, qiov->size); if (type & QEMU_AIO_WRITE) { - char *p = waiocb->buf; - int i; - - for (i = 0; i < qiov->niov; ++i) { - memcpy(p, qiov->iov[i].iov_base, qiov->iov[i].iov_len); - p += qiov->iov[i].iov_len; - } + iov_to_buf(qiov->iov, qiov->niov, 0, waiocb->buf, qiov->size); } waiocb->is_linear = false; } else { |