summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
authorJunfeng Dong <junfeng.dong@intel.com>2013-11-19 17:45:23 +0800
committerJunfeng Dong <junfeng.dong@intel.com>2013-11-19 17:45:23 +0800
commit340f06c9eaee097e626c251bf7a013350649c091 (patch)
tree107e5705050a12da68fc80a56ae37afd50a2cc94 /block
parent42bf3037d458a330856a0be584200c1e41c3f417 (diff)
downloadqemu-340f06c9eaee097e626c251bf7a013350649c091.tar.gz
qemu-340f06c9eaee097e626c251bf7a013350649c091.tar.bz2
qemu-340f06c9eaee097e626c251bf7a013350649c091.zip
Import upstream 1.6.0.upstream/1.6.0
Change-Id: Icf52b556470cac8677297f2ef14ded16684f7887 Signed-off-by: Junfeng Dong <junfeng.dong@intel.com>
Diffstat (limited to 'block')
-rw-r--r--block/Makefile.objs6
-rw-r--r--block/backup.c386
-rw-r--r--block/blkdebug.c244
-rw-r--r--block/blkverify.c120
-rw-r--r--block/bochs.c30
-rw-r--r--block/cloop.c35
-rw-r--r--block/commit.c15
-rw-r--r--block/cow.c11
-rw-r--r--block/curl.c204
-rw-r--r--block/dmg.c174
-rw-r--r--block/gluster.c114
-rw-r--r--block/iscsi.c707
-rw-r--r--block/linux-aio.c6
-rw-r--r--block/mirror.c394
-rw-r--r--block/nbd.c182
-rw-r--r--block/parallels.c29
-rw-r--r--block/qapi.c470
-rw-r--r--block/qcow.c32
-rw-r--r--block/qcow2-cache.c2
-rw-r--r--block/qcow2-cluster.c687
-rw-r--r--block/qcow2-refcount.c247
-rw-r--r--block/qcow2-snapshot.c18
-rw-r--r--block/qcow2.c229
-rw-r--r--block/qcow2.h123
-rw-r--r--block/qed-table.c2
-rw-r--r--block/qed.c15
-rw-r--r--block/qed.h2
-rw-r--r--block/raw-aio.h5
-rw-r--r--block/raw-posix.c367
-rw-r--r--block/raw-win32.c95
-rw-r--r--block/raw.c43
-rw-r--r--block/rbd.c100
-rw-r--r--block/sheepdog.c889
-rw-r--r--block/snapshot.c157
-rw-r--r--block/ssh.c1076
-rw-r--r--block/stream.c8
-rw-r--r--block/vdi.c40
-rw-r--r--block/vhdx.c972
-rw-r--r--block/vhdx.h325
-rw-r--r--block/vmdk.c499
-rw-r--r--block/vpc.c99
-rw-r--r--block/vvfat.c261
-rw-r--r--block/win32-aio.c30
43 files changed, 7821 insertions, 1629 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs
index 7f015105b..4cf9aa499 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -2,7 +2,9 @@ block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat
block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
block-obj-y += qed-check.o
+block-obj-y += vhdx.o
block-obj-y += parallels.o blkdebug.o blkverify.o
+block-obj-y += snapshot.o qapi.o
block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
block-obj-$(CONFIG_POSIX) += raw-posix.o
block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
@@ -13,8 +15,12 @@ block-obj-$(CONFIG_LIBISCSI) += iscsi.o
block-obj-$(CONFIG_CURL) += curl.o
block-obj-$(CONFIG_RBD) += rbd.o
block-obj-$(CONFIG_GLUSTERFS) += gluster.o
+block-obj-$(CONFIG_LIBSSH2) += ssh.o
endif
common-obj-y += stream.o
common-obj-y += commit.o
common-obj-y += mirror.o
+common-obj-y += backup.o
+
+$(obj)/curl.o: QEMU_CFLAGS+=$(CURL_CFLAGS)
diff --git a/block/backup.c b/block/backup.c
new file mode 100644
index 000000000..6ae8a05a3
--- /dev/null
+++ b/block/backup.c
@@ -0,0 +1,386 @@
+/*
+ * QEMU backup
+ *
+ * Copyright (C) 2013 Proxmox Server Solutions
+ *
+ * Authors:
+ * Dietmar Maurer (dietmar@proxmox.com)
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include "trace.h"
+#include "block/block.h"
+#include "block/block_int.h"
+#include "block/blockjob.h"
+#include "qemu/ratelimit.h"
+
+#define BACKUP_CLUSTER_BITS 16
+#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS)
+#define BACKUP_SECTORS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE)
+
+#define SLICE_TIME 100000000ULL /* ns */
+
+typedef struct CowRequest {
+ int64_t start;
+ int64_t end;
+ QLIST_ENTRY(CowRequest) list;
+ CoQueue wait_queue; /* coroutines blocked on this request */
+} CowRequest;
+
+typedef struct BackupBlockJob {
+ BlockJob common;
+ BlockDriverState *target;
+ MirrorSyncMode sync_mode;
+ RateLimit limit;
+ BlockdevOnError on_source_error;
+ BlockdevOnError on_target_error;
+ CoRwlock flush_rwlock;
+ uint64_t sectors_read;
+ HBitmap *bitmap;
+ QLIST_HEAD(, CowRequest) inflight_reqs;
+} BackupBlockJob;
+
+/* See if in-flight requests overlap and wait for them to complete */
+static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
+ int64_t start,
+ int64_t end)
+{
+ CowRequest *req;
+ bool retry;
+
+ do {
+ retry = false;
+ QLIST_FOREACH(req, &job->inflight_reqs, list) {
+ if (end > req->start && start < req->end) {
+ qemu_co_queue_wait(&req->wait_queue);
+ retry = true;
+ break;
+ }
+ }
+ } while (retry);
+}
+
+/* Keep track of an in-flight request */
+static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
+ int64_t start, int64_t end)
+{
+ req->start = start;
+ req->end = end;
+ qemu_co_queue_init(&req->wait_queue);
+ QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
+}
+
+/* Forget about a completed request */
+static void cow_request_end(CowRequest *req)
+{
+ QLIST_REMOVE(req, list);
+ qemu_co_queue_restart_all(&req->wait_queue);
+}
+
+static int coroutine_fn backup_do_cow(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ bool *error_is_read)
+{
+ BackupBlockJob *job = (BackupBlockJob *)bs->job;
+ CowRequest cow_request;
+ struct iovec iov;
+ QEMUIOVector bounce_qiov;
+ void *bounce_buffer = NULL;
+ int ret = 0;
+ int64_t start, end;
+ int n;
+
+ qemu_co_rwlock_rdlock(&job->flush_rwlock);
+
+ start = sector_num / BACKUP_SECTORS_PER_CLUSTER;
+ end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_SECTORS_PER_CLUSTER);
+
+ trace_backup_do_cow_enter(job, start, sector_num, nb_sectors);
+
+ wait_for_overlapping_requests(job, start, end);
+ cow_request_begin(&cow_request, job, start, end);
+
+ for (; start < end; start++) {
+ if (hbitmap_get(job->bitmap, start)) {
+ trace_backup_do_cow_skip(job, start);
+ continue; /* already copied */
+ }
+
+ trace_backup_do_cow_process(job, start);
+
+ n = MIN(BACKUP_SECTORS_PER_CLUSTER,
+ job->common.len / BDRV_SECTOR_SIZE -
+ start * BACKUP_SECTORS_PER_CLUSTER);
+
+ if (!bounce_buffer) {
+ bounce_buffer = qemu_blockalign(bs, BACKUP_CLUSTER_SIZE);
+ }
+ iov.iov_base = bounce_buffer;
+ iov.iov_len = n * BDRV_SECTOR_SIZE;
+ qemu_iovec_init_external(&bounce_qiov, &iov, 1);
+
+ ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
+ &bounce_qiov);
+ if (ret < 0) {
+ trace_backup_do_cow_read_fail(job, start, ret);
+ if (error_is_read) {
+ *error_is_read = true;
+ }
+ goto out;
+ }
+
+ if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
+ ret = bdrv_co_write_zeroes(job->target,
+ start * BACKUP_SECTORS_PER_CLUSTER, n);
+ } else {
+ ret = bdrv_co_writev(job->target,
+ start * BACKUP_SECTORS_PER_CLUSTER, n,
+ &bounce_qiov);
+ }
+ if (ret < 0) {
+ trace_backup_do_cow_write_fail(job, start, ret);
+ if (error_is_read) {
+ *error_is_read = false;
+ }
+ goto out;
+ }
+
+ hbitmap_set(job->bitmap, start, 1);
+
+ /* Publish progress, guest I/O counts as progress too. Note that the
+ * offset field is an opaque progress value, it is not a disk offset.
+ */
+ job->sectors_read += n;
+ job->common.offset += n * BDRV_SECTOR_SIZE;
+ }
+
+out:
+ if (bounce_buffer) {
+ qemu_vfree(bounce_buffer);
+ }
+
+ cow_request_end(&cow_request);
+
+ trace_backup_do_cow_return(job, sector_num, nb_sectors, ret);
+
+ qemu_co_rwlock_unlock(&job->flush_rwlock);
+
+ return ret;
+}
+
+static int coroutine_fn backup_before_write_notify(
+ NotifierWithReturn *notifier,
+ void *opaque)
+{
+ BdrvTrackedRequest *req = opaque;
+
+ return backup_do_cow(req->bs, req->sector_num, req->nb_sectors, NULL);
+}
+
+static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
+{
+ BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+
+ if (speed < 0) {
+ error_set(errp, QERR_INVALID_PARAMETER, "speed");
+ return;
+ }
+ ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
+}
+
+static void backup_iostatus_reset(BlockJob *job)
+{
+ BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+
+ bdrv_iostatus_reset(s->target);
+}
+
+static const BlockJobType backup_job_type = {
+ .instance_size = sizeof(BackupBlockJob),
+ .job_type = "backup",
+ .set_speed = backup_set_speed,
+ .iostatus_reset = backup_iostatus_reset,
+};
+
+static BlockErrorAction backup_error_action(BackupBlockJob *job,
+ bool read, int error)
+{
+ if (read) {
+ return block_job_error_action(&job->common, job->common.bs,
+ job->on_source_error, true, error);
+ } else {
+ return block_job_error_action(&job->common, job->target,
+ job->on_target_error, false, error);
+ }
+}
+
+static void coroutine_fn backup_run(void *opaque)
+{
+ BackupBlockJob *job = opaque;
+ BlockDriverState *bs = job->common.bs;
+ BlockDriverState *target = job->target;
+ BlockdevOnError on_target_error = job->on_target_error;
+ NotifierWithReturn before_write = {
+ .notify = backup_before_write_notify,
+ };
+ int64_t start, end;
+ int ret = 0;
+
+ QLIST_INIT(&job->inflight_reqs);
+ qemu_co_rwlock_init(&job->flush_rwlock);
+
+ start = 0;
+ end = DIV_ROUND_UP(job->common.len / BDRV_SECTOR_SIZE,
+ BACKUP_SECTORS_PER_CLUSTER);
+
+ job->bitmap = hbitmap_alloc(end, 0);
+
+ bdrv_set_enable_write_cache(target, true);
+ bdrv_set_on_error(target, on_target_error, on_target_error);
+ bdrv_iostatus_enable(target);
+
+ bdrv_add_before_write_notifier(bs, &before_write);
+
+ if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
+ while (!block_job_is_cancelled(&job->common)) {
+ /* Yield until the job is cancelled. We just let our before_write
+ * notify callback service CoW requests. */
+ job->common.busy = false;
+ qemu_coroutine_yield();
+ job->common.busy = true;
+ }
+ } else {
+ /* Both FULL and TOP SYNC_MODE's require copying.. */
+ for (; start < end; start++) {
+ bool error_is_read;
+
+ if (block_job_is_cancelled(&job->common)) {
+ break;
+ }
+
+ /* we need to yield so that qemu_aio_flush() returns.
+ * (without, VM does not reboot)
+ */
+ if (job->common.speed) {
+ uint64_t delay_ns = ratelimit_calculate_delay(
+ &job->limit, job->sectors_read);
+ job->sectors_read = 0;
+ block_job_sleep_ns(&job->common, rt_clock, delay_ns);
+ } else {
+ block_job_sleep_ns(&job->common, rt_clock, 0);
+ }
+
+ if (block_job_is_cancelled(&job->common)) {
+ break;
+ }
+
+ if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
+ int i, n;
+ int alloced = 0;
+
+ /* Check to see if these blocks are already in the
+ * backing file. */
+
+ for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) {
+ /* bdrv_co_is_allocated() only returns true/false based
+ * on the first set of sectors it comes accross that
+ * are are all in the same state.
+ * For that reason we must verify each sector in the
+ * backup cluster length. We end up copying more than
+ * needed but at some point that is always the case. */
+ alloced =
+ bdrv_co_is_allocated(bs,
+ start * BACKUP_SECTORS_PER_CLUSTER + i,
+ BACKUP_SECTORS_PER_CLUSTER - i, &n);
+ i += n;
+
+ if (alloced == 1) {
+ break;
+ }
+ }
+
+ /* If the above loop never found any sectors that are in
+ * the topmost image, skip this backup. */
+ if (alloced == 0) {
+ continue;
+ }
+ }
+ /* FULL sync mode we copy the whole drive. */
+ ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER,
+ BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
+ if (ret < 0) {
+ /* Depending on error action, fail now or retry cluster */
+ BlockErrorAction action =
+ backup_error_action(job, error_is_read, -ret);
+ if (action == BDRV_ACTION_REPORT) {
+ break;
+ } else {
+ start--;
+ continue;
+ }
+ }
+ }
+ }
+
+ notifier_with_return_remove(&before_write);
+
+ /* wait until pending backup_do_cow() calls have completed */
+ qemu_co_rwlock_wrlock(&job->flush_rwlock);
+ qemu_co_rwlock_unlock(&job->flush_rwlock);
+
+ hbitmap_free(job->bitmap);
+
+ bdrv_iostatus_disable(target);
+ bdrv_delete(target);
+
+ block_job_completed(&job->common, ret);
+}
+
+void backup_start(BlockDriverState *bs, BlockDriverState *target,
+ int64_t speed, MirrorSyncMode sync_mode,
+ BlockdevOnError on_source_error,
+ BlockdevOnError on_target_error,
+ BlockDriverCompletionFunc *cb, void *opaque,
+ Error **errp)
+{
+ int64_t len;
+
+ assert(bs);
+ assert(target);
+ assert(cb);
+
+ if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
+ on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
+ !bdrv_iostatus_is_enabled(bs)) {
+ error_set(errp, QERR_INVALID_PARAMETER, "on-source-error");
+ return;
+ }
+
+ len = bdrv_getlength(bs);
+ if (len < 0) {
+ error_setg_errno(errp, -len, "unable to get length for '%s'",
+ bdrv_get_device_name(bs));
+ return;
+ }
+
+ BackupBlockJob *job = block_job_create(&backup_job_type, bs, speed,
+ cb, opaque, errp);
+ if (!job) {
+ return;
+ }
+
+ job->on_source_error = on_source_error;
+ job->on_target_error = on_target_error;
+ job->target = target;
+ job->sync_mode = sync_mode;
+ job->common.len = len;
+ job->common.co = qemu_coroutine_create(backup_run);
+ qemu_coroutine_enter(job->common.co, job);
+}
diff --git a/block/blkdebug.c b/block/blkdebug.c
index d61ece86a..ccb627ad9 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -23,14 +23,17 @@
*/
#include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "qemu/config-file.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
typedef struct BDRVBlkdebugState {
int state;
int new_state;
+
QLIST_HEAD(, BlkdebugRule) rules[BLKDBG_EVENT_MAX];
QSIMPLEQ_HEAD(, BlkdebugRule) active_rules;
+ QLIST_HEAD(, BlkdebugSuspendedReq) suspended_reqs;
} BDRVBlkdebugState;
typedef struct BlkdebugAIOCB {
@@ -39,6 +42,12 @@ typedef struct BlkdebugAIOCB {
int ret;
} BlkdebugAIOCB;
+typedef struct BlkdebugSuspendedReq {
+ Coroutine *co;
+ char *tag;
+ QLIST_ENTRY(BlkdebugSuspendedReq) next;
+} BlkdebugSuspendedReq;
+
static void blkdebug_aio_cancel(BlockDriverAIOCB *blockacb);
static const AIOCBInfo blkdebug_aiocb_info = {
@@ -49,6 +58,7 @@ static const AIOCBInfo blkdebug_aiocb_info = {
enum {
ACTION_INJECT_ERROR,
ACTION_SET_STATE,
+ ACTION_SUSPEND,
};
typedef struct BlkdebugRule {
@@ -65,6 +75,9 @@ typedef struct BlkdebugRule {
struct {
int new_state;
} set_state;
+ struct {
+ char *tag;
+ } suspend;
} options;
QLIST_ENTRY(BlkdebugRule) next;
QSIMPLEQ_ENTRY(BlkdebugRule) active_next;
@@ -169,6 +182,9 @@ static const char *event_names[BLKDBG_EVENT_MAX] = {
[BLKDBG_CLUSTER_ALLOC] = "cluster_alloc",
[BLKDBG_CLUSTER_ALLOC_BYTES] = "cluster_alloc_bytes",
[BLKDBG_CLUSTER_FREE] = "cluster_free",
+
+ [BLKDBG_FLUSH_TO_OS] = "flush_to_os",
+ [BLKDBG_FLUSH_TO_DISK] = "flush_to_disk",
};
static int get_event_by_name(const char *name, BlkDebugEvent *event)
@@ -226,6 +242,11 @@ static int add_rule(QemuOpts *opts, void *opaque)
rule->options.set_state.new_state =
qemu_opt_get_number(opts, "new_state", 0);
break;
+
+ case ACTION_SUSPEND:
+ rule->options.suspend.tag =
+ g_strdup(qemu_opt_get(opts, "tag"));
+ break;
};
/* Add the rule */
@@ -234,6 +255,21 @@ static int add_rule(QemuOpts *opts, void *opaque)
return 0;
}
+static void remove_rule(BlkdebugRule *rule)
+{
+ switch (rule->action) {
+ case ACTION_INJECT_ERROR:
+ case ACTION_SET_STATE:
+ break;
+ case ACTION_SUSPEND:
+ g_free(rule->options.suspend.tag);
+ break;
+ }
+
+ QLIST_REMOVE(rule, next);
+ g_free(rule);
+}
+
static int read_config(BDRVBlkdebugState *s, const char *filename)
{
FILE *f;
@@ -266,43 +302,98 @@ fail:
}
/* Valid blkdebug filenames look like blkdebug:path/to/config:path/to/image */
-static int blkdebug_open(BlockDriverState *bs, const char *filename, int flags)
+static void blkdebug_parse_filename(const char *filename, QDict *options,
+ Error **errp)
{
- BDRVBlkdebugState *s = bs->opaque;
- int ret;
- char *config, *c;
+ const char *c;
/* Parse the blkdebug: prefix */
- if (strncmp(filename, "blkdebug:", strlen("blkdebug:"))) {
- return -EINVAL;
+ if (!strstart(filename, "blkdebug:", &filename)) {
+ error_setg(errp, "File name string must start with 'blkdebug:'");
+ return;
}
- filename += strlen("blkdebug:");
- /* Read rules from config file */
+ /* Parse config file path */
c = strchr(filename, ':');
if (c == NULL) {
- return -EINVAL;
+ error_setg(errp, "blkdebug requires both config file and image path");
+ return;
}
- config = g_strdup(filename);
- config[c - filename] = '\0';
- ret = read_config(s, config);
- g_free(config);
- if (ret < 0) {
- return ret;
+ if (c != filename) {
+ QString *config_path;
+ config_path = qstring_from_substr(filename, 0, c - filename - 1);
+ qdict_put(options, "config", config_path);
}
+
+ /* TODO Allow multi-level nesting and set file.filename here */
filename = c + 1;
+ qdict_put(options, "x-image", qstring_from_str(filename));
+}
+
+static QemuOptsList runtime_opts = {
+ .name = "blkdebug",
+ .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+ .desc = {
+ {
+ .name = "config",
+ .type = QEMU_OPT_STRING,
+ .help = "Path to the configuration file",
+ },
+ {
+ .name = "x-image",
+ .type = QEMU_OPT_STRING,
+ .help = "[internal use only, will be removed]",
+ },
+ { /* end of list */ }
+ },
+};
+
+static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags)
+{
+ BDRVBlkdebugState *s = bs->opaque;
+ QemuOpts *opts;
+ Error *local_err = NULL;
+ const char *filename, *config;
+ int ret;
+
+ opts = qemu_opts_create_nofail(&runtime_opts);
+ qemu_opts_absorb_qdict(opts, options, &local_err);
+ if (error_is_set(&local_err)) {
+ qerror_report_err(local_err);
+ error_free(local_err);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ /* Read rules from config file */
+ config = qemu_opt_get(opts, "config");
+ if (config) {
+ ret = read_config(s, config);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
/* Set initial state */
s->state = 1;
/* Open the backing file */
- ret = bdrv_file_open(&bs->file, filename, flags);
+ filename = qemu_opt_get(opts, "x-image");
+ if (filename == NULL) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ ret = bdrv_file_open(&bs->file, filename, NULL, flags);
if (ret < 0) {
- return ret;
+ goto fail;
}
- return 0;
+ ret = 0;
+fail:
+ qemu_opts_del(opts);
+ return ret;
}
static void error_callback_bh(void *opaque)
@@ -389,6 +480,7 @@ static BlockDriverAIOCB *blkdebug_aio_writev(BlockDriverState *bs,
return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
}
+
static void blkdebug_close(BlockDriverState *bs)
{
BDRVBlkdebugState *s = bs->opaque;
@@ -397,12 +489,32 @@ static void blkdebug_close(BlockDriverState *bs)
for (i = 0; i < BLKDBG_EVENT_MAX; i++) {
QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) {
- QLIST_REMOVE(rule, next);
- g_free(rule);
+ remove_rule(rule);
}
}
}
+static void suspend_request(BlockDriverState *bs, BlkdebugRule *rule)
+{
+ BDRVBlkdebugState *s = bs->opaque;
+ BlkdebugSuspendedReq r;
+
+ r = (BlkdebugSuspendedReq) {
+ .co = qemu_coroutine_self(),
+ .tag = g_strdup(rule->options.suspend.tag),
+ };
+
+ remove_rule(rule);
+ QLIST_INSERT_HEAD(&s->suspended_reqs, &r, next);
+
+ printf("blkdebug: Suspended request '%s'\n", r.tag);
+ qemu_coroutine_yield();
+ printf("blkdebug: Resuming request '%s'\n", r.tag);
+
+ QLIST_REMOVE(&r, next);
+ g_free(r.tag);
+}
+
static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
bool injected)
{
@@ -426,6 +538,10 @@ static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
case ACTION_SET_STATE:
s->new_state = rule->options.set_state.new_state;
break;
+
+ case ACTION_SUSPEND:
+ suspend_request(bs, rule);
+ break;
}
return injected;
}
@@ -433,38 +549,94 @@ static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
static void blkdebug_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
BDRVBlkdebugState *s = bs->opaque;
- struct BlkdebugRule *rule;
+ struct BlkdebugRule *rule, *next;
bool injected;
assert((int)event >= 0 && event < BLKDBG_EVENT_MAX);
injected = false;
s->new_state = s->state;
- QLIST_FOREACH(rule, &s->rules[event], next) {
+ QLIST_FOREACH_SAFE(rule, &s->rules[event], next, next) {
injected = process_rule(bs, rule, injected);
}
s->state = s->new_state;
}
-static int64_t blkdebug_getlength(BlockDriverState *bs)
+static int blkdebug_debug_breakpoint(BlockDriverState *bs, const char *event,
+ const char *tag)
{
- return bdrv_getlength(bs->file);
+ BDRVBlkdebugState *s = bs->opaque;
+ struct BlkdebugRule *rule;
+ BlkDebugEvent blkdebug_event;
+
+ if (get_event_by_name(event, &blkdebug_event) < 0) {
+ return -ENOENT;
+ }
+
+
+ rule = g_malloc(sizeof(*rule));
+ *rule = (struct BlkdebugRule) {
+ .event = blkdebug_event,
+ .action = ACTION_SUSPEND,
+ .state = 0,
+ .options.suspend.tag = g_strdup(tag),
+ };
+
+ QLIST_INSERT_HEAD(&s->rules[blkdebug_event], rule, next);
+
+ return 0;
+}
+
+static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag)
+{
+ BDRVBlkdebugState *s = bs->opaque;
+ BlkdebugSuspendedReq *r;
+
+ QLIST_FOREACH(r, &s->suspended_reqs, next) {
+ if (!strcmp(r->tag, tag)) {
+ qemu_coroutine_enter(r->co, NULL);
+ return 0;
+ }
+ }
+ return -ENOENT;
}
-static BlockDriver bdrv_blkdebug = {
- .format_name = "blkdebug",
- .protocol_name = "blkdebug",
- .instance_size = sizeof(BDRVBlkdebugState),
+static bool blkdebug_debug_is_suspended(BlockDriverState *bs, const char *tag)
+{
+ BDRVBlkdebugState *s = bs->opaque;
+ BlkdebugSuspendedReq *r;
- .bdrv_file_open = blkdebug_open,
- .bdrv_close = blkdebug_close,
- .bdrv_getlength = blkdebug_getlength,
+ QLIST_FOREACH(r, &s->suspended_reqs, next) {
+ if (!strcmp(r->tag, tag)) {
+ return true;
+ }
+ }
+ return false;
+}
- .bdrv_aio_readv = blkdebug_aio_readv,
- .bdrv_aio_writev = blkdebug_aio_writev,
+static int64_t blkdebug_getlength(BlockDriverState *bs)
+{
+ return bdrv_getlength(bs->file);
+}
- .bdrv_debug_event = blkdebug_debug_event,
+static BlockDriver bdrv_blkdebug = {
+ .format_name = "blkdebug",
+ .protocol_name = "blkdebug",
+ .instance_size = sizeof(BDRVBlkdebugState),
+
+ .bdrv_parse_filename = blkdebug_parse_filename,
+ .bdrv_file_open = blkdebug_open,
+ .bdrv_close = blkdebug_close,
+ .bdrv_getlength = blkdebug_getlength,
+
+ .bdrv_aio_readv = blkdebug_aio_readv,
+ .bdrv_aio_writev = blkdebug_aio_writev,
+
+ .bdrv_debug_event = blkdebug_debug_event,
+ .bdrv_debug_breakpoint = blkdebug_debug_breakpoint,
+ .bdrv_debug_resume = blkdebug_debug_resume,
+ .bdrv_debug_is_suspended = blkdebug_debug_is_suspended,
};
static void bdrv_blkdebug_init(void)
diff --git a/block/blkverify.c b/block/blkverify.c
index 4beede77a..1d58cc393 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -8,8 +8,8 @@
*/
#include <stdarg.h>
-#include "qemu_socket.h" /* for EINPROGRESS on Windows */
-#include "block_int.h"
+#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
+#include "block/block_int.h"
typedef struct {
BlockDriverState *test_file;
@@ -69,43 +69,100 @@ static void GCC_FMT_ATTR(2, 3) blkverify_err(BlkverifyAIOCB *acb,
}
/* Valid blkverify filenames look like blkverify:path/to/raw_image:path/to/image */
-static int blkverify_open(BlockDriverState *bs, const char *filename, int flags)
+static void blkverify_parse_filename(const char *filename, QDict *options,
+ Error **errp)
{
- BDRVBlkverifyState *s = bs->opaque;
- int ret;
- char *raw, *c;
+ const char *c;
+ QString *raw_path;
+
/* Parse the blkverify: prefix */
- if (strncmp(filename, "blkverify:", strlen("blkverify:"))) {
- return -EINVAL;
+ if (!strstart(filename, "blkverify:", &filename)) {
+ error_setg(errp, "File name string must start with 'blkverify:'");
+ return;
}
- filename += strlen("blkverify:");
/* Parse the raw image filename */
c = strchr(filename, ':');
if (c == NULL) {
- return -EINVAL;
+ error_setg(errp, "blkverify requires raw copy and original image path");
+ return;
}
- raw = g_strdup(filename);
- raw[c - filename] = '\0';
- ret = bdrv_file_open(&bs->file, raw, flags);
- g_free(raw);
+ /* TODO Implement option pass-through and set raw.filename here */
+ raw_path = qstring_from_substr(filename, 0, c - filename - 1);
+ qdict_put(options, "x-raw", raw_path);
+
+ /* TODO Allow multi-level nesting and set file.filename here */
+ filename = c + 1;
+ qdict_put(options, "x-image", qstring_from_str(filename));
+}
+
+static QemuOptsList runtime_opts = {
+ .name = "blkverify",
+ .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+ .desc = {
+ {
+ .name = "x-raw",
+ .type = QEMU_OPT_STRING,
+ .help = "[internal use only, will be removed]",
+ },
+ {
+ .name = "x-image",
+ .type = QEMU_OPT_STRING,
+ .help = "[internal use only, will be removed]",
+ },
+ { /* end of list */ }
+ },
+};
+
+static int blkverify_open(BlockDriverState *bs, QDict *options, int flags)
+{
+ BDRVBlkverifyState *s = bs->opaque;
+ QemuOpts *opts;
+ Error *local_err = NULL;
+ const char *filename, *raw;
+ int ret;
+
+ opts = qemu_opts_create_nofail(&runtime_opts);
+ qemu_opts_absorb_qdict(opts, options, &local_err);
+ if (error_is_set(&local_err)) {
+ qerror_report_err(local_err);
+ error_free(local_err);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ /* Parse the raw image filename */
+ raw = qemu_opt_get(opts, "x-raw");
+ if (raw == NULL) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ ret = bdrv_file_open(&bs->file, raw, NULL, flags);
if (ret < 0) {
- return ret;
+ goto fail;
}
- filename = c + 1;
/* Open the test file */
+ filename = qemu_opt_get(opts, "x-image");
+ if (filename == NULL) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
s->test_file = bdrv_new("");
- ret = bdrv_open(s->test_file, filename, flags, NULL);
+ ret = bdrv_open(s->test_file, filename, NULL, flags, NULL);
if (ret < 0) {
bdrv_delete(s->test_file);
s->test_file = NULL;
- return ret;
+ goto fail;
}
- return 0;
+ ret = 0;
+fail:
+ return ret;
}
static void blkverify_close(BlockDriverState *bs)
@@ -343,19 +400,18 @@ static BlockDriverAIOCB *blkverify_aio_flush(BlockDriverState *bs,
}
static BlockDriver bdrv_blkverify = {
- .format_name = "blkverify",
- .protocol_name = "blkverify",
-
- .instance_size = sizeof(BDRVBlkverifyState),
-
- .bdrv_getlength = blkverify_getlength,
-
- .bdrv_file_open = blkverify_open,
- .bdrv_close = blkverify_close,
-
- .bdrv_aio_readv = blkverify_aio_readv,
- .bdrv_aio_writev = blkverify_aio_writev,
- .bdrv_aio_flush = blkverify_aio_flush,
+ .format_name = "blkverify",
+ .protocol_name = "blkverify",
+ .instance_size = sizeof(BDRVBlkverifyState),
+
+ .bdrv_parse_filename = blkverify_parse_filename,
+ .bdrv_file_open = blkverify_open,
+ .bdrv_close = blkverify_close,
+ .bdrv_getlength = blkverify_getlength,
+
+ .bdrv_aio_readv = blkverify_aio_readv,
+ .bdrv_aio_writev = blkverify_aio_writev,
+ .bdrv_aio_flush = blkverify_aio_flush,
};
static void bdrv_blkverify_init(void)
diff --git a/block/bochs.c b/block/bochs.c
index ab7944dc4..d7078c077 100644
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -23,8 +23,8 @@
* THE SOFTWARE.
*/
#include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
/**************************************************************/
@@ -108,17 +108,19 @@ static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename)
return 0;
}
-static int bochs_open(BlockDriverState *bs, int flags)
+static int bochs_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVBochsState *s = bs->opaque;
int i;
struct bochs_header bochs;
struct bochs_header_v1 header_v1;
+ int ret;
bs->read_only = 1; // no write support yet
- if (bdrv_pread(bs->file, 0, &bochs, sizeof(bochs)) != sizeof(bochs)) {
- goto fail;
+ ret = bdrv_pread(bs->file, 0, &bochs, sizeof(bochs));
+ if (ret < 0) {
+ return ret;
}
if (strcmp(bochs.magic, HEADER_MAGIC) ||
@@ -126,7 +128,7 @@ static int bochs_open(BlockDriverState *bs, int flags)
strcmp(bochs.subtype, GROWING_TYPE) ||
((le32_to_cpu(bochs.version) != HEADER_VERSION) &&
(le32_to_cpu(bochs.version) != HEADER_V1))) {
- goto fail;
+ return -EMEDIUMTYPE;
}
if (le32_to_cpu(bochs.version) == HEADER_V1) {
@@ -138,9 +140,13 @@ static int bochs_open(BlockDriverState *bs, int flags)
s->catalog_size = le32_to_cpu(bochs.extra.redolog.catalog);
s->catalog_bitmap = g_malloc(s->catalog_size * 4);
- if (bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap,
- s->catalog_size * 4) != s->catalog_size * 4)
- goto fail;
+
+ ret = bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap,
+ s->catalog_size * 4);
+ if (ret < 0) {
+ goto fail;
+ }
+
for (i = 0; i < s->catalog_size; i++)
le32_to_cpus(&s->catalog_bitmap[i]);
@@ -153,8 +159,10 @@ static int bochs_open(BlockDriverState *bs, int flags)
qemu_co_mutex_init(&s->lock);
return 0;
- fail:
- return -1;
+
+fail:
+ g_free(s->catalog_bitmap);
+ return ret;
}
static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
diff --git a/block/cloop.c b/block/cloop.c
index 7570eb8e7..6ea7cf404 100644
--- a/block/cloop.c
+++ b/block/cloop.c
@@ -22,8 +22,8 @@
* THE SOFTWARE.
*/
#include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
#include <zlib.h>
typedef struct BDRVCloopState {
@@ -53,31 +53,36 @@ static int cloop_probe(const uint8_t *buf, int buf_size, const char *filename)
return 0;
}
-static int cloop_open(BlockDriverState *bs, int flags)
+static int cloop_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVCloopState *s = bs->opaque;
uint32_t offsets_size, max_compressed_block_size = 1, i;
+ int ret;
bs->read_only = 1;
/* read header */
- if (bdrv_pread(bs->file, 128, &s->block_size, 4) < 4) {
- goto cloop_close;
+ ret = bdrv_pread(bs->file, 128, &s->block_size, 4);
+ if (ret < 0) {
+ return ret;
}
s->block_size = be32_to_cpu(s->block_size);
- if (bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4) < 4) {
- goto cloop_close;
+ ret = bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4);
+ if (ret < 0) {
+ return ret;
}
s->n_blocks = be32_to_cpu(s->n_blocks);
/* read offsets */
offsets_size = s->n_blocks * sizeof(uint64_t);
s->offsets = g_malloc(offsets_size);
- if (bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size) <
- offsets_size) {
- goto cloop_close;
+
+ ret = bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size);
+ if (ret < 0) {
+ goto fail;
}
+
for(i=0;i<s->n_blocks;i++) {
s->offsets[i] = be64_to_cpu(s->offsets[i]);
if (i > 0) {
@@ -92,7 +97,8 @@ static int cloop_open(BlockDriverState *bs, int flags)
s->compressed_block = g_malloc(max_compressed_block_size + 1);
s->uncompressed_block = g_malloc(s->block_size);
if (inflateInit(&s->zstream) != Z_OK) {
- goto cloop_close;
+ ret = -EINVAL;
+ goto fail;
}
s->current_block = s->n_blocks;
@@ -101,8 +107,11 @@ static int cloop_open(BlockDriverState *bs, int flags)
qemu_co_mutex_init(&s->lock);
return 0;
-cloop_close:
- return -1;
+fail:
+ g_free(s->offsets);
+ g_free(s->compressed_block);
+ g_free(s->uncompressed_block);
+ return ret;
}
static inline int cloop_read_block(BlockDriverState *bs, int block_num)
diff --git a/block/commit.c b/block/commit.c
index fae79582d..2227fc2e6 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -13,8 +13,8 @@
*/
#include "trace.h"
-#include "block_int.h"
-#include "blockjob.h"
+#include "block/block_int.h"
+#include "block/blockjob.h"
#include "qemu/ratelimit.h"
enum {
@@ -65,7 +65,7 @@ static void coroutine_fn commit_run(void *opaque)
BlockDriverState *active = s->active;
BlockDriverState *top = s->top;
BlockDriverState *base = s->base;
- BlockDriverState *overlay_bs = NULL;
+ BlockDriverState *overlay_bs;
int64_t sector_num, end;
int ret = 0;
int n = 0;
@@ -92,8 +92,6 @@ static void coroutine_fn commit_run(void *opaque)
}
}
- overlay_bs = bdrv_find_overlay(active, top);
-
end = s->common.len >> BDRV_SECTOR_BITS;
buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE);
@@ -103,7 +101,7 @@ static void coroutine_fn commit_run(void *opaque)
wait:
/* Note that even when no rate limit is applied we need to yield
- * with no pending I/O here so that qemu_aio_flush() returns.
+ * with no pending I/O here so that bdrv_drain_all() returns.
*/
block_job_sleep_ns(&s->common, rt_clock, delay_ns);
if (block_job_is_cancelled(&s->common)) {
@@ -156,7 +154,8 @@ exit_restore_reopen:
if (s->base_flags != bdrv_get_flags(base)) {
bdrv_reopen(base, s->base_flags, NULL);
}
- if (s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
+ overlay_bs = bdrv_find_overlay(active, top);
+ if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
}
@@ -174,7 +173,7 @@ static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}
-static BlockJobType commit_job_type = {
+static const BlockJobType commit_job_type = {
.instance_size = sizeof(CommitBlockJob),
.job_type = "commit",
.set_speed = commit_set_speed,
diff --git a/block/cow.c b/block/cow.c
index a5a00eb9c..1cc2e89c7 100644
--- a/block/cow.c
+++ b/block/cow.c
@@ -22,8 +22,8 @@
* THE SOFTWARE.
*/
#include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
/**************************************************************/
/* COW block driver using file system holes */
@@ -58,7 +58,7 @@ static int cow_probe(const uint8_t *buf, int buf_size, const char *filename)
return 0;
}
-static int cow_open(BlockDriverState *bs, int flags)
+static int cow_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVCowState *s = bs->opaque;
struct cow_header_v2 cow_header;
@@ -73,7 +73,7 @@ static int cow_open(BlockDriverState *bs, int flags)
}
if (be32_to_cpu(cow_header.magic) != COW_MAGIC) {
- ret = -EINVAL;
+ ret = -EMEDIUMTYPE;
goto fail;
}
@@ -279,7 +279,7 @@ static int cow_create(const char *filename, QEMUOptionParameter *options)
return ret;
}
- ret = bdrv_file_open(&cow_bs, filename, BDRV_O_RDWR);
+ ret = bdrv_file_open(&cow_bs, filename, NULL, BDRV_O_RDWR);
if (ret < 0) {
return ret;
}
@@ -340,6 +340,7 @@ static BlockDriver bdrv_cow = {
.bdrv_open = cow_open,
.bdrv_close = cow_close,
.bdrv_create = cow_create,
+ .bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_read = cow_co_read,
.bdrv_write = cow_co_write,
diff --git a/block/curl.c b/block/curl.c
index 1179484de..82d39ff53 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -22,7 +22,7 @@
* THE SOFTWARE.
*/
#include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
#include <curl/curl.h>
// #define DEBUG
@@ -34,6 +34,10 @@
#define DPRINTF(fmt, ...) do { } while (0)
#endif
+#define PROTOCOLS (CURLPROTO_HTTP | CURLPROTO_HTTPS | \
+ CURLPROTO_FTP | CURLPROTO_FTPS | \
+ CURLPROTO_TFTP)
+
#define CURL_NUM_STATES 8
#define CURL_NUM_ACB 8
#define SECTOR_SIZE 512
@@ -77,6 +81,7 @@ typedef struct BDRVCURLState {
CURLState states[CURL_NUM_STATES];
char *url;
size_t readahead_size;
+ bool accept_range;
} BDRVCURLState;
static void curl_clean_state(CURLState *s);
@@ -106,14 +111,15 @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
return 0;
}
-static size_t curl_size_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
+static size_t curl_header_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
{
- CURLState *s = ((CURLState*)opaque);
+ BDRVCURLState *s = opaque;
size_t realsize = size * nmemb;
- size_t fsize;
+ const char *accept_line = "Accept-Ranges: bytes";
- if(sscanf(ptr, "Content-Length: %zd", &fsize) == 1) {
- s->s->len = fsize;
+ if (realsize >= strlen(accept_line)
+ && strncmp((char *)ptr, accept_line, strlen(accept_line)) == 0) {
+ s->accept_range = true;
}
return realsize;
@@ -302,6 +308,17 @@ static CURLState *curl_init_state(BDRVCURLState *s)
curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg);
curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1);
+ /* Restrict supported protocols to avoid security issues in the more
+ * obscure protocols. For example, do not allow POP3/SMTP/IMAP see
+ * CVE-2013-0249.
+ *
+ * Restricting protocols is only supported from 7.19.4 upwards.
+ */
+#if LIBCURL_VERSION_NUM >= 0x071304
+ curl_easy_setopt(state->curl, CURLOPT_PROTOCOLS, PROTOCOLS);
+ curl_easy_setopt(state->curl, CURLOPT_REDIR_PROTOCOLS, PROTOCOLS);
+#endif
+
#ifdef DEBUG_VERBOSE
curl_easy_setopt(state->curl, CURLOPT_VERBOSE, 1);
#endif
@@ -320,11 +337,9 @@ static void curl_clean_state(CURLState *s)
s->in_use = 0;
}
-static int curl_open(BlockDriverState *bs, const char *filename, int flags)
+static void curl_parse_filename(const char *filename, QDict *options,
+ Error **errp)
{
- BDRVCURLState *s = bs->opaque;
- CURLState *state = NULL;
- double d;
#define RA_OPTSTR ":readahead="
char *file;
@@ -332,19 +347,17 @@ static int curl_open(BlockDriverState *bs, const char *filename, int flags)
const char *ra_val;
int parse_state = 0;
- static int inited = 0;
-
file = g_strdup(filename);
- s->readahead_size = READ_AHEAD_SIZE;
/* Parse a trailing ":readahead=#:" param, if present. */
ra = file + strlen(file) - 1;
while (ra >= file) {
if (parse_state == 0) {
- if (*ra == ':')
+ if (*ra == ':') {
parse_state++;
- else
+ } else {
break;
+ }
} else if (parse_state == 1) {
if (*ra > '9' || *ra < '0') {
char *opt_start = ra - strlen(RA_OPTSTR) + 1;
@@ -353,46 +366,108 @@ static int curl_open(BlockDriverState *bs, const char *filename, int flags)
ra_val = ra + 1;
ra -= strlen(RA_OPTSTR) - 1;
*ra = '\0';
- s->readahead_size = atoi(ra_val);
- break;
- } else {
- break;
+ qdict_put(options, "readahead", qstring_from_str(ra_val));
}
+ break;
}
}
ra--;
}
+ qdict_put(options, "url", qstring_from_str(file));
+
+ g_free(file);
+}
+
+static QemuOptsList runtime_opts = {
+ .name = "curl",
+ .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+ .desc = {
+ {
+ .name = "url",
+ .type = QEMU_OPT_STRING,
+ .help = "URL to open",
+ },
+ {
+ .name = "readahead",
+ .type = QEMU_OPT_SIZE,
+ .help = "Readahead size",
+ },
+ { /* end of list */ }
+ },
+};
+
+static int curl_open(BlockDriverState *bs, QDict *options, int flags)
+{
+ BDRVCURLState *s = bs->opaque;
+ CURLState *state = NULL;
+ QemuOpts *opts;
+ Error *local_err = NULL;
+ const char *file;
+ double d;
+
+ static int inited = 0;
+
+ if (flags & BDRV_O_RDWR) {
+ qerror_report(ERROR_CLASS_GENERIC_ERROR,
+ "curl block device does not support writes");
+ return -EROFS;
+ }
+
+ opts = qemu_opts_create_nofail(&runtime_opts);
+ qemu_opts_absorb_qdict(opts, options, &local_err);
+ if (error_is_set(&local_err)) {
+ qerror_report_err(local_err);
+ error_free(local_err);
+ goto out_noclean;
+ }
+
+ s->readahead_size = qemu_opt_get_size(opts, "readahead", READ_AHEAD_SIZE);
if ((s->readahead_size & 0x1ff) != 0) {
fprintf(stderr, "HTTP_READAHEAD_SIZE %zd is not a multiple of 512\n",
s->readahead_size);
goto out_noclean;
}
+ file = qemu_opt_get(opts, "url");
+ if (file == NULL) {
+ qerror_report(ERROR_CLASS_GENERIC_ERROR, "curl block driver requires "
+ "an 'url' option");
+ goto out_noclean;
+ }
+
if (!inited) {
curl_global_init(CURL_GLOBAL_ALL);
inited = 1;
}
DPRINTF("CURL: Opening %s\n", file);
- s->url = file;
+ s->url = g_strdup(file);
state = curl_init_state(s);
if (!state)
goto out_noclean;
// Get file size
+ s->accept_range = false;
curl_easy_setopt(state->curl, CURLOPT_NOBODY, 1);
- curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, (void *)curl_size_cb);
+ curl_easy_setopt(state->curl, CURLOPT_HEADERFUNCTION,
+ curl_header_cb);
+ curl_easy_setopt(state->curl, CURLOPT_HEADERDATA, s);
if (curl_easy_perform(state->curl))
goto out;
curl_easy_getinfo(state->curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &d);
- curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, (void *)curl_read_cb);
- curl_easy_setopt(state->curl, CURLOPT_NOBODY, 0);
if (d)
s->len = (size_t)d;
else if(!s->len)
goto out;
+ if ((!strncasecmp(s->url, "http://", strlen("http://"))
+ || !strncasecmp(s->url, "https://", strlen("https://")))
+ && !s->accept_range) {
+ pstrcpy(state->errmsg, CURL_ERROR_SIZE,
+ "Server does not support 'range' (byte ranges).");
+ goto out;
+ }
DPRINTF("CURL: Size = %zd\n", s->len);
curl_clean_state(state);
@@ -403,10 +478,11 @@ static int curl_open(BlockDriverState *bs, const char *filename, int flags)
// initialize the multi interface!
s->multi = curl_multi_init();
- curl_multi_setopt( s->multi, CURLMOPT_SOCKETDATA, s);
- curl_multi_setopt( s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb );
+ curl_multi_setopt(s->multi, CURLMOPT_SOCKETDATA, s);
+ curl_multi_setopt(s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb);
curl_multi_do(s);
+ qemu_opts_del(opts);
return 0;
out:
@@ -414,7 +490,8 @@ out:
curl_easy_cleanup(state->curl);
state->curl = NULL;
out_noclean:
- g_free(file);
+ g_free(s->url);
+ qemu_opts_del(opts);
return -EINVAL;
}
@@ -552,63 +629,68 @@ static int64_t curl_getlength(BlockDriverState *bs)
}
static BlockDriver bdrv_http = {
- .format_name = "http",
- .protocol_name = "http",
+ .format_name = "http",
+ .protocol_name = "http",
- .instance_size = sizeof(BDRVCURLState),
- .bdrv_file_open = curl_open,
- .bdrv_close = curl_close,
- .bdrv_getlength = curl_getlength,
+ .instance_size = sizeof(BDRVCURLState),
+ .bdrv_parse_filename = curl_parse_filename,
+ .bdrv_file_open = curl_open,
+ .bdrv_close = curl_close,
+ .bdrv_getlength = curl_getlength,
- .bdrv_aio_readv = curl_aio_readv,
+ .bdrv_aio_readv = curl_aio_readv,
};
static BlockDriver bdrv_https = {
- .format_name = "https",
- .protocol_name = "https",
+ .format_name = "https",
+ .protocol_name = "https",
- .instance_size = sizeof(BDRVCURLState),
- .bdrv_file_open = curl_open,
- .bdrv_close = curl_close,
- .bdrv_getlength = curl_getlength,
+ .instance_size = sizeof(BDRVCURLState),
+ .bdrv_parse_filename = curl_parse_filename,
+ .bdrv_file_open = curl_open,
+ .bdrv_close = curl_close,
+ .bdrv_getlength = curl_getlength,
- .bdrv_aio_readv = curl_aio_readv,
+ .bdrv_aio_readv = curl_aio_readv,
};
static BlockDriver bdrv_ftp = {
- .format_name = "ftp",
- .protocol_name = "ftp",
+ .format_name = "ftp",
+ .protocol_name = "ftp",
- .instance_size = sizeof(BDRVCURLState),
- .bdrv_file_open = curl_open,
- .bdrv_close = curl_close,
- .bdrv_getlength = curl_getlength,
+ .instance_size = sizeof(BDRVCURLState),
+ .bdrv_parse_filename = curl_parse_filename,
+ .bdrv_file_open = curl_open,
+ .bdrv_close = curl_close,
+ .bdrv_getlength = curl_getlength,
- .bdrv_aio_readv = curl_aio_readv,
+ .bdrv_aio_readv = curl_aio_readv,
};
static BlockDriver bdrv_ftps = {
- .format_name = "ftps",
- .protocol_name = "ftps",
+ .format_name = "ftps",
+ .protocol_name = "ftps",
- .instance_size = sizeof(BDRVCURLState),
- .bdrv_file_open = curl_open,
- .bdrv_close = curl_close,
- .bdrv_getlength = curl_getlength,
+ .instance_size = sizeof(BDRVCURLState),
+ .bdrv_parse_filename = curl_parse_filename,
+ .bdrv_file_open = curl_open,
+ .bdrv_close = curl_close,
+ .bdrv_getlength = curl_getlength,
- .bdrv_aio_readv = curl_aio_readv,
+ .bdrv_aio_readv = curl_aio_readv,
};
static BlockDriver bdrv_tftp = {
- .format_name = "tftp",
- .protocol_name = "tftp",
+ .format_name = "tftp",
+ .protocol_name = "tftp",
- .instance_size = sizeof(BDRVCURLState),
- .bdrv_file_open = curl_open,
- .bdrv_close = curl_close,
- .bdrv_getlength = curl_getlength,
+ .instance_size = sizeof(BDRVCURLState),
+ .bdrv_parse_filename = curl_parse_filename,
+ .bdrv_file_open = curl_open,
+ .bdrv_close = curl_close,
+ .bdrv_getlength = curl_getlength,
- .bdrv_aio_readv = curl_aio_readv,
+ .bdrv_aio_readv = curl_aio_readv,
};
static void curl_block_init(void)
diff --git a/block/dmg.c b/block/dmg.c
index 37902a434..3141cb5b8 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -22,9 +22,9 @@
* THE SOFTWARE.
*/
#include "qemu-common.h"
-#include "block_int.h"
-#include "bswap.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/bswap.h"
+#include "qemu/module.h"
#include <zlib.h>
typedef struct BDRVDMGState {
@@ -51,35 +51,55 @@ typedef struct BDRVDMGState {
static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename)
{
- int len=strlen(filename);
- if(len>4 && !strcmp(filename+len-4,".dmg"))
- return 2;
+ int len;
+
+ if (!filename) {
+ return 0;
+ }
+
+ len = strlen(filename);
+ if (len > 4 && !strcmp(filename + len - 4, ".dmg")) {
+ return 2;
+ }
return 0;
}
-static off_t read_off(BlockDriverState *bs, int64_t offset)
+static int read_uint64(BlockDriverState *bs, int64_t offset, uint64_t *result)
{
- uint64_t buffer;
- if (bdrv_pread(bs->file, offset, &buffer, 8) < 8)
- return 0;
- return be64_to_cpu(buffer);
+ uint64_t buffer;
+ int ret;
+
+ ret = bdrv_pread(bs->file, offset, &buffer, 8);
+ if (ret < 0) {
+ return ret;
+ }
+
+ *result = be64_to_cpu(buffer);
+ return 0;
}
-static off_t read_uint32(BlockDriverState *bs, int64_t offset)
+static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result)
{
- uint32_t buffer;
- if (bdrv_pread(bs->file, offset, &buffer, 4) < 4)
- return 0;
- return be32_to_cpu(buffer);
+ uint32_t buffer;
+ int ret;
+
+ ret = bdrv_pread(bs->file, offset, &buffer, 4);
+ if (ret < 0) {
+ return ret;
+ }
+
+ *result = be32_to_cpu(buffer);
+ return 0;
}
-static int dmg_open(BlockDriverState *bs, int flags)
+static int dmg_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVDMGState *s = bs->opaque;
- off_t info_begin,info_end,last_in_offset,last_out_offset;
- uint32_t count;
+ uint64_t info_begin,info_end,last_in_offset,last_out_offset;
+ uint32_t count, tmp;
uint32_t max_compressed_size=1,max_sectors_per_chunk=1,i;
int64_t offset;
+ int ret;
bs->read_only = 1;
s->n_chunks = 0;
@@ -88,21 +108,32 @@ static int dmg_open(BlockDriverState *bs, int flags)
/* read offset of info blocks */
offset = bdrv_getlength(bs->file);
if (offset < 0) {
+ ret = offset;
goto fail;
}
offset -= 0x1d8;
- info_begin = read_off(bs, offset);
- if (info_begin == 0) {
- goto fail;
+ ret = read_uint64(bs, offset, &info_begin);
+ if (ret < 0) {
+ goto fail;
+ } else if (info_begin == 0) {
+ ret = -EINVAL;
+ goto fail;
}
- if (read_uint32(bs, info_begin) != 0x100) {
+ ret = read_uint32(bs, info_begin, &tmp);
+ if (ret < 0) {
+ goto fail;
+ } else if (tmp != 0x100) {
+ ret = -EINVAL;
goto fail;
}
- count = read_uint32(bs, info_begin + 4);
- if (count == 0) {
+ ret = read_uint32(bs, info_begin + 4, &count);
+ if (ret < 0) {
+ goto fail;
+ } else if (count == 0) {
+ ret = -EINVAL;
goto fail;
}
info_end = info_begin + count;
@@ -114,12 +145,20 @@ static int dmg_open(BlockDriverState *bs, int flags)
while (offset < info_end) {
uint32_t type;
- count = read_uint32(bs, offset);
- if(count==0)
- goto fail;
+ ret = read_uint32(bs, offset, &count);
+ if (ret < 0) {
+ goto fail;
+ } else if (count == 0) {
+ ret = -EINVAL;
+ goto fail;
+ }
offset += 4;
- type = read_uint32(bs, offset);
+ ret = read_uint32(bs, offset, &type);
+ if (ret < 0) {
+ goto fail;
+ }
+
if (type == 0x6d697368 && count >= 244) {
int new_size, chunk_count;
@@ -134,8 +173,11 @@ static int dmg_open(BlockDriverState *bs, int flags)
s->sectors = g_realloc(s->sectors, new_size);
s->sectorcounts = g_realloc(s->sectorcounts, new_size);
- for(i=s->n_chunks;i<s->n_chunks+chunk_count;i++) {
- s->types[i] = read_uint32(bs, offset);
+ for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) {
+ ret = read_uint32(bs, offset, &s->types[i]);
+ if (ret < 0) {
+ goto fail;
+ }
offset += 4;
if(s->types[i]!=0x80000005 && s->types[i]!=1 && s->types[i]!=2) {
if(s->types[i]==0xffffffff) {
@@ -149,17 +191,31 @@ static int dmg_open(BlockDriverState *bs, int flags)
}
offset += 4;
- s->sectors[i] = last_out_offset+read_off(bs, offset);
- offset += 8;
-
- s->sectorcounts[i] = read_off(bs, offset);
- offset += 8;
-
- s->offsets[i] = last_in_offset+read_off(bs, offset);
- offset += 8;
-
- s->lengths[i] = read_off(bs, offset);
- offset += 8;
+ ret = read_uint64(bs, offset, &s->sectors[i]);
+ if (ret < 0) {
+ goto fail;
+ }
+ s->sectors[i] += last_out_offset;
+ offset += 8;
+
+ ret = read_uint64(bs, offset, &s->sectorcounts[i]);
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += 8;
+
+ ret = read_uint64(bs, offset, &s->offsets[i]);
+ if (ret < 0) {
+ goto fail;
+ }
+ s->offsets[i] += last_in_offset;
+ offset += 8;
+
+ ret = read_uint64(bs, offset, &s->lengths[i]);
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += 8;
if(s->lengths[i]>max_compressed_size)
max_compressed_size = s->lengths[i];
@@ -173,15 +229,25 @@ static int dmg_open(BlockDriverState *bs, int flags)
/* initialize zlib engine */
s->compressed_chunk = g_malloc(max_compressed_size+1);
s->uncompressed_chunk = g_malloc(512*max_sectors_per_chunk);
- if(inflateInit(&s->zstream) != Z_OK)
- goto fail;
+ if(inflateInit(&s->zstream) != Z_OK) {
+ ret = -EINVAL;
+ goto fail;
+ }
s->current_chunk = s->n_chunks;
qemu_co_mutex_init(&s->lock);
return 0;
+
fail:
- return -1;
+ g_free(s->types);
+ g_free(s->offsets);
+ g_free(s->lengths);
+ g_free(s->sectors);
+ g_free(s->sectorcounts);
+ g_free(s->compressed_chunk);
+ g_free(s->uncompressed_chunk);
+ return ret;
}
static inline int is_sector_in_chunk(BDRVDMGState* s,
@@ -296,15 +362,15 @@ static coroutine_fn int dmg_co_read(BlockDriverState *bs, int64_t sector_num,
static void dmg_close(BlockDriverState *bs)
{
BDRVDMGState *s = bs->opaque;
- if(s->n_chunks>0) {
- free(s->types);
- free(s->offsets);
- free(s->lengths);
- free(s->sectors);
- free(s->sectorcounts);
- }
- free(s->compressed_chunk);
- free(s->uncompressed_chunk);
+
+ g_free(s->types);
+ g_free(s->offsets);
+ g_free(s->lengths);
+ g_free(s->sectors);
+ g_free(s->sectorcounts);
+ g_free(s->compressed_chunk);
+ g_free(s->uncompressed_chunk);
+
inflateEnd(&s->zstream);
}
diff --git a/block/gluster.c b/block/gluster.c
index 1c90174b1..645b7f12a 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -16,9 +16,9 @@
* GNU GPL, version 2 or (at your option) any later version.
*/
#include <glusterfs/api/glfs.h>
-#include "block_int.h"
-#include "qemu_socket.h"
-#include "uri.h"
+#include "block/block_int.h"
+#include "qemu/sockets.h"
+#include "qemu/uri.h"
typedef struct GlusterAIOCB {
BlockDriverAIOCB common;
@@ -217,7 +217,7 @@ static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename)
ret = glfs_init(glfs);
if (ret) {
error_report("Gluster connection failed for server=%s port=%d "
- "volume=%s image=%s transport=%s\n", gconf->server, gconf->port,
+ "volume=%s image=%s transport=%s", gconf->server, gconf->port,
gconf->volname, gconf->image, gconf->transport);
goto out;
}
@@ -282,13 +282,42 @@ static int qemu_gluster_aio_flush_cb(void *opaque)
return (s->qemu_aio_count > 0);
}
-static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
- int bdrv_flags)
+/* TODO Convert to fine grained options */
+static QemuOptsList runtime_opts = {
+ .name = "gluster",
+ .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+ .desc = {
+ {
+ .name = "filename",
+ .type = QEMU_OPT_STRING,
+ .help = "URL to the gluster image",
+ },
+ { /* end of list */ }
+ },
+};
+
+static int qemu_gluster_open(BlockDriverState *bs, QDict *options,
+ int bdrv_flags)
{
BDRVGlusterState *s = bs->opaque;
int open_flags = O_BINARY;
int ret = 0;
GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));
+ QemuOpts *opts;
+ Error *local_err = NULL;
+ const char *filename;
+
+ opts = qemu_opts_create_nofail(&runtime_opts);
+ qemu_opts_absorb_qdict(opts, options, &local_err);
+ if (error_is_set(&local_err)) {
+ qerror_report_err(local_err);
+ error_free(local_err);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ filename = qemu_opt_get(opts, "filename");
+
s->glfs = qemu_gluster_init(gconf, filename);
if (!s->glfs) {
@@ -322,6 +351,7 @@ static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s);
out:
+ qemu_opts_del(opts);
qemu_gluster_gconf_free(gconf);
if (!ret) {
return ret;
@@ -463,6 +493,19 @@ out:
return NULL;
}
+static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset)
+{
+ int ret;
+ BDRVGlusterState *s = bs->opaque;
+
+ ret = glfs_ftruncate(s->fd, offset);
+ if (ret < 0) {
+ return -errno;
+ }
+
+ return 0;
+}
+
static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
BlockDriverCompletionFunc *cb, void *opaque)
@@ -502,6 +545,39 @@ out:
return NULL;
}
+#ifdef CONFIG_GLUSTERFS_DISCARD
+static BlockDriverAIOCB *qemu_gluster_aio_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ int ret;
+ GlusterAIOCB *acb;
+ BDRVGlusterState *s = bs->opaque;
+ size_t size;
+ off_t offset;
+
+ offset = sector_num * BDRV_SECTOR_SIZE;
+ size = nb_sectors * BDRV_SECTOR_SIZE;
+
+ acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque);
+ acb->size = 0;
+ acb->ret = 0;
+ acb->finished = NULL;
+ s->qemu_aio_count++;
+
+ ret = glfs_discard_async(s->fd, offset, size, &gluster_finish_aiocb, acb);
+ if (ret < 0) {
+ goto out;
+ }
+ return &acb->common;
+
+out:
+ s->qemu_aio_count--;
+ qemu_aio_release(acb);
+ return NULL;
+}
+#endif
+
static int64_t qemu_gluster_getlength(BlockDriverState *bs)
{
BDRVGlusterState *s = bs->opaque;
@@ -544,6 +620,12 @@ static void qemu_gluster_close(BlockDriverState *bs)
glfs_fini(s->glfs);
}
+static int qemu_gluster_has_zero_init(BlockDriverState *bs)
+{
+ /* GlusterFS volume could be backed by a block device */
+ return 0;
+}
+
static QEMUOptionParameter qemu_gluster_create_options[] = {
{
.name = BLOCK_OPT_SIZE,
@@ -562,9 +644,14 @@ static BlockDriver bdrv_gluster = {
.bdrv_create = qemu_gluster_create,
.bdrv_getlength = qemu_gluster_getlength,
.bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+ .bdrv_truncate = qemu_gluster_truncate,
.bdrv_aio_readv = qemu_gluster_aio_readv,
.bdrv_aio_writev = qemu_gluster_aio_writev,
.bdrv_aio_flush = qemu_gluster_aio_flush,
+ .bdrv_has_zero_init = qemu_gluster_has_zero_init,
+#ifdef CONFIG_GLUSTERFS_DISCARD
+ .bdrv_aio_discard = qemu_gluster_aio_discard,
+#endif
.create_options = qemu_gluster_create_options,
};
@@ -577,9 +664,14 @@ static BlockDriver bdrv_gluster_tcp = {
.bdrv_create = qemu_gluster_create,
.bdrv_getlength = qemu_gluster_getlength,
.bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+ .bdrv_truncate = qemu_gluster_truncate,
.bdrv_aio_readv = qemu_gluster_aio_readv,
.bdrv_aio_writev = qemu_gluster_aio_writev,
.bdrv_aio_flush = qemu_gluster_aio_flush,
+ .bdrv_has_zero_init = qemu_gluster_has_zero_init,
+#ifdef CONFIG_GLUSTERFS_DISCARD
+ .bdrv_aio_discard = qemu_gluster_aio_discard,
+#endif
.create_options = qemu_gluster_create_options,
};
@@ -592,9 +684,14 @@ static BlockDriver bdrv_gluster_unix = {
.bdrv_create = qemu_gluster_create,
.bdrv_getlength = qemu_gluster_getlength,
.bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+ .bdrv_truncate = qemu_gluster_truncate,
.bdrv_aio_readv = qemu_gluster_aio_readv,
.bdrv_aio_writev = qemu_gluster_aio_writev,
.bdrv_aio_flush = qemu_gluster_aio_flush,
+ .bdrv_has_zero_init = qemu_gluster_has_zero_init,
+#ifdef CONFIG_GLUSTERFS_DISCARD
+ .bdrv_aio_discard = qemu_gluster_aio_discard,
+#endif
.create_options = qemu_gluster_create_options,
};
@@ -607,9 +704,14 @@ static BlockDriver bdrv_gluster_rdma = {
.bdrv_create = qemu_gluster_create,
.bdrv_getlength = qemu_gluster_getlength,
.bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+ .bdrv_truncate = qemu_gluster_truncate,
.bdrv_aio_readv = qemu_gluster_aio_readv,
.bdrv_aio_writev = qemu_gluster_aio_writev,
.bdrv_aio_flush = qemu_gluster_aio_flush,
+ .bdrv_has_zero_init = qemu_gluster_has_zero_init,
+#ifdef CONFIG_GLUSTERFS_DISCARD
+ .bdrv_aio_discard = qemu_gluster_aio_discard,
+#endif
.create_options = qemu_gluster_create_options,
};
diff --git a/block/iscsi.c b/block/iscsi.c
index c0b70b3d3..e7c1c2b53 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -27,17 +27,19 @@
#include <poll.h>
#include <arpa/inet.h>
#include "qemu-common.h"
-#include "qemu-error.h"
-#include "block_int.h"
+#include "qemu/config-file.h"
+#include "qemu/error-report.h"
+#include "block/block_int.h"
#include "trace.h"
-#include "hw/scsi-defs.h"
+#include "block/scsi.h"
+#include "qemu/iov.h"
#include <iscsi/iscsi.h>
#include <iscsi/scsi-lowlevel.h>
#ifdef __linux__
#include <scsi/sg.h>
-#include <hw/scsi-defs.h>
+#include <block/scsi.h>
#endif
typedef struct IscsiLun {
@@ -47,6 +49,7 @@ typedef struct IscsiLun {
int block_size;
uint64_t num_blocks;
int events;
+ QEMUTimer *nop_timer;
} IscsiLun;
typedef struct IscsiAIOCB {
@@ -58,13 +61,18 @@ typedef struct IscsiAIOCB {
uint8_t *buf;
int status;
int canceled;
- size_t read_size;
- size_t read_offset;
+ int retries;
+ int64_t sector_num;
+ int nb_sectors;
#ifdef __linux__
sg_io_hdr_t *ioh;
#endif
} IscsiAIOCB;
+#define NOP_INTERVAL 5000
+#define MAX_NOP_FAILURES 3
+#define ISCSI_CMD_RETRIES 5
+
static void
iscsi_bh_cb(void *p)
{
@@ -72,6 +80,9 @@ iscsi_bh_cb(void *p)
qemu_bh_delete(acb->bh);
+ g_free(acb->buf);
+ acb->buf = NULL;
+
if (acb->canceled == 0) {
acb->common.cb(acb->common.opaque, acb->status);
}
@@ -183,6 +194,8 @@ iscsi_process_write(void *arg)
iscsi_set_events(iscsilun);
}
+static int
+iscsi_aio_writev_acb(IscsiAIOCB *acb);
static void
iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status,
@@ -193,13 +206,24 @@ iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status,
trace_iscsi_aio_write16_cb(iscsi, status, acb, acb->canceled);
g_free(acb->buf);
+ acb->buf = NULL;
if (acb->canceled != 0) {
return;
}
acb->status = 0;
- if (status < 0) {
+ if (status != 0) {
+ if (status == SCSI_STATUS_CHECK_CONDITION
+ && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION
+ && acb->retries-- > 0) {
+ scsi_free_scsi_task(acb->task);
+ acb->task = NULL;
+ if (iscsi_aio_writev_acb(acb) == 0) {
+ iscsi_set_events(acb->iscsilun);
+ return;
+ }
+ }
error_report("Failed to write16 data to iSCSI lun. %s",
iscsi_get_error(iscsi));
acb->status = -EIO;
@@ -208,78 +232,139 @@ iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status,
iscsi_schedule_bh(acb);
}
+static int64_t sector_lun2qemu(int64_t sector, IscsiLun *iscsilun)
+{
+ return sector * iscsilun->block_size / BDRV_SECTOR_SIZE;
+}
+
static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun)
{
return sector * BDRV_SECTOR_SIZE / iscsilun->block_size;
}
-static BlockDriverAIOCB *
-iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num,
- QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb,
- void *opaque)
+static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors,
+ IscsiLun *iscsilun)
{
- IscsiLun *iscsilun = bs->opaque;
- struct iscsi_context *iscsi = iscsilun->iscsi;
- IscsiAIOCB *acb;
+ if ((sector_num * BDRV_SECTOR_SIZE) % iscsilun->block_size ||
+ (nb_sectors * BDRV_SECTOR_SIZE) % iscsilun->block_size) {
+ error_report("iSCSI misaligned request: "
+ "iscsilun->block_size %u, sector_num %" PRIi64
+ ", nb_sectors %d",
+ iscsilun->block_size, sector_num, nb_sectors);
+ return 0;
+ }
+ return 1;
+}
+
+static int
+iscsi_aio_writev_acb(IscsiAIOCB *acb)
+{
+ struct iscsi_context *iscsi = acb->iscsilun->iscsi;
size_t size;
uint32_t num_sectors;
uint64_t lba;
+#if !defined(LIBISCSI_FEATURE_IOVECTOR)
struct iscsi_data data;
-
- acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
- trace_iscsi_aio_writev(iscsi, sector_num, nb_sectors, opaque, acb);
-
- acb->iscsilun = iscsilun;
- acb->qiov = qiov;
+#endif
+ int ret;
acb->canceled = 0;
acb->bh = NULL;
acb->status = -EINPROGRESS;
+ acb->buf = NULL;
- /* XXX we should pass the iovec to write16 to avoid the extra copy */
/* this will allow us to get rid of 'buf' completely */
- size = nb_sectors * BDRV_SECTOR_SIZE;
- acb->buf = g_malloc(size);
- qemu_iovec_to_buf(acb->qiov, 0, acb->buf, size);
+ size = acb->nb_sectors * BDRV_SECTOR_SIZE;
+
+#if !defined(LIBISCSI_FEATURE_IOVECTOR)
+ data.size = MIN(size, acb->qiov->size);
+
+ /* if the iovec only contains one buffer we can pass it directly */
+ if (acb->qiov->niov == 1) {
+ data.data = acb->qiov->iov[0].iov_base;
+ } else {
+ acb->buf = g_malloc(data.size);
+ qemu_iovec_to_buf(acb->qiov, 0, acb->buf, data.size);
+ data.data = acb->buf;
+ }
+#endif
acb->task = malloc(sizeof(struct scsi_task));
if (acb->task == NULL) {
error_report("iSCSI: Failed to allocate task for scsi WRITE16 "
"command. %s", iscsi_get_error(iscsi));
- qemu_aio_release(acb);
- return NULL;
+ return -1;
}
memset(acb->task, 0, sizeof(struct scsi_task));
acb->task->xfer_dir = SCSI_XFER_WRITE;
acb->task->cdb_size = 16;
acb->task->cdb[0] = 0x8a;
- lba = sector_qemu2lun(sector_num, iscsilun);
+ lba = sector_qemu2lun(acb->sector_num, acb->iscsilun);
*(uint32_t *)&acb->task->cdb[2] = htonl(lba >> 32);
*(uint32_t *)&acb->task->cdb[6] = htonl(lba & 0xffffffff);
- num_sectors = size / iscsilun->block_size;
+ num_sectors = sector_qemu2lun(acb->nb_sectors, acb->iscsilun);
*(uint32_t *)&acb->task->cdb[10] = htonl(num_sectors);
acb->task->expxferlen = size;
- data.data = acb->buf;
- data.size = size;
-
- if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task,
- iscsi_aio_write16_cb,
- &data,
- acb) != 0) {
+#if defined(LIBISCSI_FEATURE_IOVECTOR)
+ ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task,
+ iscsi_aio_write16_cb,
+ NULL,
+ acb);
+#else
+ ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task,
+ iscsi_aio_write16_cb,
+ &data,
+ acb);
+#endif
+ if (ret != 0) {
scsi_free_scsi_task(acb->task);
g_free(acb->buf);
+ return -1;
+ }
+
+#if defined(LIBISCSI_FEATURE_IOVECTOR)
+ scsi_task_set_iov_out(acb->task, (struct scsi_iovec*) acb->qiov->iov, acb->qiov->niov);
+#endif
+
+ return 0;
+}
+
+static BlockDriverAIOCB *
+iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ IscsiLun *iscsilun = bs->opaque;
+ IscsiAIOCB *acb;
+
+ if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+ return NULL;
+ }
+
+ acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
+ trace_iscsi_aio_writev(iscsilun->iscsi, sector_num, nb_sectors, opaque, acb);
+
+ acb->iscsilun = iscsilun;
+ acb->qiov = qiov;
+ acb->nb_sectors = nb_sectors;
+ acb->sector_num = sector_num;
+ acb->retries = ISCSI_CMD_RETRIES;
+
+ if (iscsi_aio_writev_acb(acb) != 0) {
qemu_aio_release(acb);
return NULL;
}
iscsi_set_events(iscsilun);
-
return &acb->common;
}
+static int
+iscsi_aio_readv_acb(IscsiAIOCB *acb);
+
static void
iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status,
void *command_data, void *opaque)
@@ -294,6 +379,16 @@ iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status,
acb->status = 0;
if (status != 0) {
+ if (status == SCSI_STATUS_CHECK_CONDITION
+ && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION
+ && acb->retries-- > 0) {
+ scsi_free_scsi_task(acb->task);
+ acb->task = NULL;
+ if (iscsi_aio_readv_acb(acb) == 0) {
+ iscsi_set_events(acb->iscsilun);
+ return;
+ }
+ }
error_report("Failed to read16 data from iSCSI lun. %s",
iscsi_get_error(iscsi));
acb->status = -EIO;
@@ -302,63 +397,39 @@ iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status,
iscsi_schedule_bh(acb);
}
-static BlockDriverAIOCB *
-iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num,
- QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb,
- void *opaque)
+static int
+iscsi_aio_readv_acb(IscsiAIOCB *acb)
{
- IscsiLun *iscsilun = bs->opaque;
- struct iscsi_context *iscsi = iscsilun->iscsi;
- IscsiAIOCB *acb;
- size_t qemu_read_size;
- int i;
+ struct iscsi_context *iscsi = acb->iscsilun->iscsi;
+ size_t size;
uint64_t lba;
uint32_t num_sectors;
-
- qemu_read_size = BDRV_SECTOR_SIZE * (size_t)nb_sectors;
-
- acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
- trace_iscsi_aio_readv(iscsi, sector_num, nb_sectors, opaque, acb);
-
- acb->iscsilun = iscsilun;
- acb->qiov = qiov;
+ int ret;
+#if !defined(LIBISCSI_FEATURE_IOVECTOR)
+ int i;
+#endif
acb->canceled = 0;
acb->bh = NULL;
acb->status = -EINPROGRESS;
- acb->read_size = qemu_read_size;
acb->buf = NULL;
- /* If LUN blocksize is bigger than BDRV_BLOCK_SIZE a read from QEMU
- * may be misaligned to the LUN, so we may need to read some extra
- * data.
- */
- acb->read_offset = 0;
- if (iscsilun->block_size > BDRV_SECTOR_SIZE) {
- uint64_t bdrv_offset = BDRV_SECTOR_SIZE * sector_num;
-
- acb->read_offset = bdrv_offset % iscsilun->block_size;
- }
-
- num_sectors = (qemu_read_size + iscsilun->block_size
- + acb->read_offset - 1)
- / iscsilun->block_size;
+ size = acb->nb_sectors * BDRV_SECTOR_SIZE;
acb->task = malloc(sizeof(struct scsi_task));
if (acb->task == NULL) {
error_report("iSCSI: Failed to allocate task for scsi READ16 "
"command. %s", iscsi_get_error(iscsi));
- qemu_aio_release(acb);
- return NULL;
+ return -1;
}
memset(acb->task, 0, sizeof(struct scsi_task));
acb->task->xfer_dir = SCSI_XFER_READ;
- lba = sector_qemu2lun(sector_num, iscsilun);
- acb->task->expxferlen = qemu_read_size;
+ acb->task->expxferlen = size;
+ lba = sector_qemu2lun(acb->sector_num, acb->iscsilun);
+ num_sectors = sector_qemu2lun(acb->nb_sectors, acb->iscsilun);
- switch (iscsilun->type) {
+ switch (acb->iscsilun->type) {
case TYPE_DISK:
acb->task->cdb_size = 16;
acb->task->cdb[0] = 0x88;
@@ -374,26 +445,60 @@ iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num,
break;
}
- if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task,
- iscsi_aio_read16_cb,
- NULL,
- acb) != 0) {
+ ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task,
+ iscsi_aio_read16_cb,
+ NULL,
+ acb);
+ if (ret != 0) {
scsi_free_scsi_task(acb->task);
- qemu_aio_release(acb);
- return NULL;
+ return -1;
}
+#if defined(LIBISCSI_FEATURE_IOVECTOR)
+ scsi_task_set_iov_in(acb->task, (struct scsi_iovec*) acb->qiov->iov, acb->qiov->niov);
+#else
for (i = 0; i < acb->qiov->niov; i++) {
scsi_task_add_data_in_buffer(acb->task,
acb->qiov->iov[i].iov_len,
acb->qiov->iov[i].iov_base);
}
+#endif
+ return 0;
+}
- iscsi_set_events(iscsilun);
+static BlockDriverAIOCB *
+iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ IscsiLun *iscsilun = bs->opaque;
+ IscsiAIOCB *acb;
+ if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+ return NULL;
+ }
+
+ acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
+ trace_iscsi_aio_readv(iscsilun->iscsi, sector_num, nb_sectors, opaque, acb);
+
+ acb->nb_sectors = nb_sectors;
+ acb->sector_num = sector_num;
+ acb->iscsilun = iscsilun;
+ acb->qiov = qiov;
+ acb->retries = ISCSI_CMD_RETRIES;
+
+ if (iscsi_aio_readv_acb(acb) != 0) {
+ qemu_aio_release(acb);
+ return NULL;
+ }
+
+ iscsi_set_events(iscsilun);
return &acb->common;
}
+static int
+iscsi_aio_flush_acb(IscsiAIOCB *acb);
static void
iscsi_synccache10_cb(struct iscsi_context *iscsi, int status,
@@ -406,7 +511,17 @@ iscsi_synccache10_cb(struct iscsi_context *iscsi, int status,
}
acb->status = 0;
- if (status < 0) {
+ if (status != 0) {
+ if (status == SCSI_STATUS_CHECK_CONDITION
+ && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION
+ && acb->retries-- > 0) {
+ scsi_free_scsi_task(acb->task);
+ acb->task = NULL;
+ if (iscsi_aio_flush_acb(acb) == 0) {
+ iscsi_set_events(acb->iscsilun);
+ return;
+ }
+ }
error_report("Failed to sync10 data on iSCSI lun. %s",
iscsi_get_error(iscsi));
acb->status = -EIO;
@@ -415,28 +530,43 @@ iscsi_synccache10_cb(struct iscsi_context *iscsi, int status,
iscsi_schedule_bh(acb);
}
-static BlockDriverAIOCB *
-iscsi_aio_flush(BlockDriverState *bs,
- BlockDriverCompletionFunc *cb, void *opaque)
+static int
+iscsi_aio_flush_acb(IscsiAIOCB *acb)
{
- IscsiLun *iscsilun = bs->opaque;
- struct iscsi_context *iscsi = iscsilun->iscsi;
- IscsiAIOCB *acb;
+ struct iscsi_context *iscsi = acb->iscsilun->iscsi;
- acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
-
- acb->iscsilun = iscsilun;
acb->canceled = 0;
acb->bh = NULL;
acb->status = -EINPROGRESS;
+ acb->buf = NULL;
- acb->task = iscsi_synchronizecache10_task(iscsi, iscsilun->lun,
+ acb->task = iscsi_synchronizecache10_task(iscsi, acb->iscsilun->lun,
0, 0, 0, 0,
iscsi_synccache10_cb,
acb);
if (acb->task == NULL) {
error_report("iSCSI: Failed to send synchronizecache10 command. %s",
iscsi_get_error(iscsi));
+ return -1;
+ }
+
+ return 0;
+}
+
+static BlockDriverAIOCB *
+iscsi_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ IscsiLun *iscsilun = bs->opaque;
+
+ IscsiAIOCB *acb;
+
+ acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
+
+ acb->iscsilun = iscsilun;
+ acb->retries = ISCSI_CMD_RETRIES;
+
+ if (iscsi_aio_flush_acb(acb) != 0) {
qemu_aio_release(acb);
return NULL;
}
@@ -446,6 +576,8 @@ iscsi_aio_flush(BlockDriverState *bs,
return &acb->common;
}
+static int iscsi_aio_discard_acb(IscsiAIOCB *acb);
+
static void
iscsi_unmap_cb(struct iscsi_context *iscsi, int status,
void *command_data, void *opaque)
@@ -457,7 +589,17 @@ iscsi_unmap_cb(struct iscsi_context *iscsi, int status,
}
acb->status = 0;
- if (status < 0) {
+ if (status != 0) {
+ if (status == SCSI_STATUS_CHECK_CONDITION
+ && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION
+ && acb->retries-- > 0) {
+ scsi_free_scsi_task(acb->task);
+ acb->task = NULL;
+ if (iscsi_aio_discard_acb(acb) == 0) {
+ iscsi_set_events(acb->iscsilun);
+ return;
+ }
+ }
error_report("Failed to unmap data on iSCSI lun. %s",
iscsi_get_error(iscsi));
acb->status = -EIO;
@@ -466,33 +608,47 @@ iscsi_unmap_cb(struct iscsi_context *iscsi, int status,
iscsi_schedule_bh(acb);
}
-static BlockDriverAIOCB *
-iscsi_aio_discard(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque)
-{
- IscsiLun *iscsilun = bs->opaque;
- struct iscsi_context *iscsi = iscsilun->iscsi;
- IscsiAIOCB *acb;
+static int iscsi_aio_discard_acb(IscsiAIOCB *acb) {
+ struct iscsi_context *iscsi = acb->iscsilun->iscsi;
struct unmap_list list[1];
- acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
-
- acb->iscsilun = iscsilun;
acb->canceled = 0;
acb->bh = NULL;
acb->status = -EINPROGRESS;
+ acb->buf = NULL;
- list[0].lba = sector_qemu2lun(sector_num, iscsilun);
- list[0].num = nb_sectors * BDRV_SECTOR_SIZE / iscsilun->block_size;
+ list[0].lba = sector_qemu2lun(acb->sector_num, acb->iscsilun);
+ list[0].num = acb->nb_sectors * BDRV_SECTOR_SIZE / acb->iscsilun->block_size;
- acb->task = iscsi_unmap_task(iscsi, iscsilun->lun,
+ acb->task = iscsi_unmap_task(iscsi, acb->iscsilun->lun,
0, 0, &list[0], 1,
iscsi_unmap_cb,
acb);
if (acb->task == NULL) {
error_report("iSCSI: Failed to send unmap command. %s",
iscsi_get_error(iscsi));
+ return -1;
+ }
+
+ return 0;
+}
+
+static BlockDriverAIOCB *
+iscsi_aio_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ IscsiLun *iscsilun = bs->opaque;
+ IscsiAIOCB *acb;
+
+ acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
+
+ acb->iscsilun = iscsilun;
+ acb->nb_sectors = nb_sectors;
+ acb->sector_num = sector_num;
+ acb->retries = ISCSI_CMD_RETRIES;
+
+ if (iscsi_aio_discard_acb(acb) != 0) {
qemu_aio_release(acb);
return NULL;
}
@@ -509,6 +665,9 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
{
IscsiAIOCB *acb = opaque;
+ g_free(acb->buf);
+ acb->buf = NULL;
+
if (acb->canceled != 0) {
return;
}
@@ -585,14 +744,30 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
memcpy(&acb->task->cdb[0], acb->ioh->cmdp, acb->ioh->cmd_len);
acb->task->expxferlen = acb->ioh->dxfer_len;
+ data.size = 0;
if (acb->task->xfer_dir == SCSI_XFER_WRITE) {
- data.data = acb->ioh->dxferp;
- data.size = acb->ioh->dxfer_len;
+ if (acb->ioh->iovec_count == 0) {
+ data.data = acb->ioh->dxferp;
+ data.size = acb->ioh->dxfer_len;
+ } else {
+#if defined(LIBISCSI_FEATURE_IOVECTOR)
+ scsi_task_set_iov_out(acb->task,
+ (struct scsi_iovec *) acb->ioh->dxferp,
+ acb->ioh->iovec_count);
+#else
+ struct iovec *iov = (struct iovec *)acb->ioh->dxferp;
+
+ acb->buf = g_malloc(acb->ioh->dxfer_len);
+ data.data = acb->buf;
+ data.size = iov_to_buf(iov, acb->ioh->iovec_count, 0,
+ acb->buf, acb->ioh->dxfer_len);
+#endif
+ }
}
+
if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task,
iscsi_aio_ioctl_cb,
- (acb->task->xfer_dir == SCSI_XFER_WRITE) ?
- &data : NULL,
+ (data.size > 0) ? &data : NULL,
acb) != 0) {
scsi_free_scsi_task(acb->task);
qemu_aio_release(acb);
@@ -601,9 +776,26 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
/* tell libiscsi to read straight into the buffer we got from ioctl */
if (acb->task->xfer_dir == SCSI_XFER_READ) {
- scsi_task_add_data_in_buffer(acb->task,
- acb->ioh->dxfer_len,
- acb->ioh->dxferp);
+ if (acb->ioh->iovec_count == 0) {
+ scsi_task_add_data_in_buffer(acb->task,
+ acb->ioh->dxfer_len,
+ acb->ioh->dxferp);
+ } else {
+#if defined(LIBISCSI_FEATURE_IOVECTOR)
+ scsi_task_set_iov_in(acb->task,
+ (struct scsi_iovec *) acb->ioh->dxferp,
+ acb->ioh->iovec_count);
+#else
+ int i;
+ for (i = 0; i < acb->ioh->iovec_count; i++) {
+ struct iovec *iov = (struct iovec *)acb->ioh->dxferp;
+
+ scsi_task_add_data_in_buffer(acb->task,
+ iov[i].iov_len,
+ iov[i].iov_base);
+ }
+#endif
+ }
}
iscsi_set_events(iscsilun);
@@ -761,20 +953,118 @@ static char *parse_initiator_name(const char *target)
}
}
+#if defined(LIBISCSI_FEATURE_NOP_COUNTER)
+static void iscsi_nop_timed_event(void *opaque)
+{
+ IscsiLun *iscsilun = opaque;
+
+ if (iscsi_get_nops_in_flight(iscsilun->iscsi) > MAX_NOP_FAILURES) {
+ error_report("iSCSI: NOP timeout. Reconnecting...");
+ iscsi_reconnect(iscsilun->iscsi);
+ }
+
+ if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) {
+ error_report("iSCSI: failed to sent NOP-Out. Disabling NOP messages.");
+ return;
+ }
+
+ qemu_mod_timer(iscsilun->nop_timer, qemu_get_clock_ms(rt_clock) + NOP_INTERVAL);
+ iscsi_set_events(iscsilun);
+}
+#endif
+
+static int iscsi_readcapacity_sync(IscsiLun *iscsilun)
+{
+ struct scsi_task *task = NULL;
+ struct scsi_readcapacity10 *rc10 = NULL;
+ struct scsi_readcapacity16 *rc16 = NULL;
+ int ret = 0;
+ int retries = ISCSI_CMD_RETRIES;
+
+ do {
+ if (task != NULL) {
+ scsi_free_scsi_task(task);
+ task = NULL;
+ }
+
+ switch (iscsilun->type) {
+ case TYPE_DISK:
+ task = iscsi_readcapacity16_sync(iscsilun->iscsi, iscsilun->lun);
+ if (task != NULL && task->status == SCSI_STATUS_GOOD) {
+ rc16 = scsi_datain_unmarshall(task);
+ if (rc16 == NULL) {
+ error_report("iSCSI: Failed to unmarshall readcapacity16 data.");
+ ret = -EINVAL;
+ } else {
+ iscsilun->block_size = rc16->block_length;
+ iscsilun->num_blocks = rc16->returned_lba + 1;
+ }
+ }
+ break;
+ case TYPE_ROM:
+ task = iscsi_readcapacity10_sync(iscsilun->iscsi, iscsilun->lun, 0, 0);
+ if (task != NULL && task->status == SCSI_STATUS_GOOD) {
+ rc10 = scsi_datain_unmarshall(task);
+ if (rc10 == NULL) {
+ error_report("iSCSI: Failed to unmarshall readcapacity10 data.");
+ ret = -EINVAL;
+ } else {
+ iscsilun->block_size = rc10->block_size;
+ if (rc10->lba == 0) {
+ /* blank disk loaded */
+ iscsilun->num_blocks = 0;
+ } else {
+ iscsilun->num_blocks = rc10->lba + 1;
+ }
+ }
+ }
+ break;
+ default:
+ return 0;
+ }
+ } while (task != NULL && task->status == SCSI_STATUS_CHECK_CONDITION
+ && task->sense.key == SCSI_SENSE_UNIT_ATTENTION
+ && retries-- > 0);
+
+ if (task == NULL || task->status != SCSI_STATUS_GOOD) {
+ error_report("iSCSI: failed to send readcapacity10 command.");
+ ret = -EINVAL;
+ }
+ if (task) {
+ scsi_free_scsi_task(task);
+ }
+ return ret;
+}
+
+/* TODO Convert to fine grained options */
+static QemuOptsList runtime_opts = {
+ .name = "iscsi",
+ .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+ .desc = {
+ {
+ .name = "filename",
+ .type = QEMU_OPT_STRING,
+ .help = "URL to the iscsi image",
+ },
+ { /* end of list */ }
+ },
+};
+
/*
* We support iscsi url's on the form
* iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun>
*/
-static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
+static int iscsi_open(BlockDriverState *bs, QDict *options, int flags)
{
IscsiLun *iscsilun = bs->opaque;
struct iscsi_context *iscsi = NULL;
struct iscsi_url *iscsi_url = NULL;
struct scsi_task *task = NULL;
struct scsi_inquiry_standard *inq = NULL;
- struct scsi_readcapacity10 *rc10 = NULL;
- struct scsi_readcapacity16 *rc16 = NULL;
char *initiator_name = NULL;
+ QemuOpts *opts;
+ Error *local_err = NULL;
+ const char *filename;
int ret;
if ((BDRV_SECTOR_SIZE % 512) != 0) {
@@ -784,6 +1074,18 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
return -EINVAL;
}
+ opts = qemu_opts_create_nofail(&runtime_opts);
+ qemu_opts_absorb_qdict(opts, options, &local_err);
+ if (error_is_set(&local_err)) {
+ qerror_report_err(local_err);
+ error_free(local_err);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ filename = qemu_opt_get(opts, "filename");
+
+
iscsi_url = iscsi_parse_full_url(iscsi, filename);
if (iscsi_url == NULL) {
error_report("Failed to parse URL : %s", filename);
@@ -863,52 +1165,10 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
iscsilun->type = inq->periperal_device_type;
- scsi_free_scsi_task(task);
-
- switch (iscsilun->type) {
- case TYPE_DISK:
- task = iscsi_readcapacity16_sync(iscsi, iscsilun->lun);
- if (task == NULL || task->status != SCSI_STATUS_GOOD) {
- error_report("iSCSI: failed to send readcapacity16 command.");
- ret = -EINVAL;
- goto out;
- }
- rc16 = scsi_datain_unmarshall(task);
- if (rc16 == NULL) {
- error_report("iSCSI: Failed to unmarshall readcapacity16 data.");
- ret = -EINVAL;
- goto out;
- }
- iscsilun->block_size = rc16->block_length;
- iscsilun->num_blocks = rc16->returned_lba + 1;
- break;
- case TYPE_ROM:
- task = iscsi_readcapacity10_sync(iscsi, iscsilun->lun, 0, 0);
- if (task == NULL || task->status != SCSI_STATUS_GOOD) {
- error_report("iSCSI: failed to send readcapacity10 command.");
- ret = -EINVAL;
- goto out;
- }
- rc10 = scsi_datain_unmarshall(task);
- if (rc10 == NULL) {
- error_report("iSCSI: Failed to unmarshall readcapacity10 data.");
- ret = -EINVAL;
- goto out;
- }
- iscsilun->block_size = rc10->block_size;
- if (rc10->lba == 0) {
- /* blank disk loaded */
- iscsilun->num_blocks = 0;
- } else {
- iscsilun->num_blocks = rc10->lba + 1;
- }
- break;
- default:
- break;
+ if ((ret = iscsi_readcapacity_sync(iscsilun)) != 0) {
+ goto out;
}
-
- bs->total_sectors = iscsilun->num_blocks *
- iscsilun->block_size / BDRV_SECTOR_SIZE ;
+ bs->total_sectors = sector_lun2qemu(iscsilun->num_blocks, iscsilun);
/* Medium changer or tape. We dont have any emulation for this so this must
* be sg ioctl compatible. We force it to be sg, otherwise qemu will try
@@ -919,9 +1179,14 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
bs->sg = 1;
}
- ret = 0;
+#if defined(LIBISCSI_FEATURE_NOP_COUNTER)
+ /* Set up a timer for sending out iSCSI NOPs */
+ iscsilun->nop_timer = qemu_new_timer_ms(rt_clock, iscsi_nop_timed_event, iscsilun);
+ qemu_mod_timer(iscsilun->nop_timer, qemu_get_clock_ms(rt_clock) + NOP_INTERVAL);
+#endif
out:
+ qemu_opts_del(opts);
if (initiator_name != NULL) {
g_free(initiator_name);
}
@@ -946,16 +1211,100 @@ static void iscsi_close(BlockDriverState *bs)
IscsiLun *iscsilun = bs->opaque;
struct iscsi_context *iscsi = iscsilun->iscsi;
+ if (iscsilun->nop_timer) {
+ qemu_del_timer(iscsilun->nop_timer);
+ qemu_free_timer(iscsilun->nop_timer);
+ }
qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL, NULL);
iscsi_destroy_context(iscsi);
memset(iscsilun, 0, sizeof(IscsiLun));
}
+static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
+{
+ IscsiLun *iscsilun = bs->opaque;
+ int ret = 0;
+
+ if (iscsilun->type != TYPE_DISK) {
+ return -ENOTSUP;
+ }
+
+ if ((ret = iscsi_readcapacity_sync(iscsilun)) != 0) {
+ return ret;
+ }
+
+ if (offset > iscsi_getlength(bs)) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int iscsi_has_zero_init(BlockDriverState *bs)
{
return 0;
}
+static int iscsi_create(const char *filename, QEMUOptionParameter *options)
+{
+ int ret = 0;
+ int64_t total_size = 0;
+ BlockDriverState bs;
+ IscsiLun *iscsilun = NULL;
+ QDict *bs_options;
+
+ memset(&bs, 0, sizeof(BlockDriverState));
+
+ /* Read out options */
+ while (options && options->name) {
+ if (!strcmp(options->name, "size")) {
+ total_size = options->value.n / BDRV_SECTOR_SIZE;
+ }
+ options++;
+ }
+
+ bs.opaque = g_malloc0(sizeof(struct IscsiLun));
+ iscsilun = bs.opaque;
+
+ bs_options = qdict_new();
+ qdict_put(bs_options, "filename", qstring_from_str(filename));
+ ret = iscsi_open(&bs, bs_options, 0);
+ QDECREF(bs_options);
+
+ if (ret != 0) {
+ goto out;
+ }
+ if (iscsilun->nop_timer) {
+ qemu_del_timer(iscsilun->nop_timer);
+ qemu_free_timer(iscsilun->nop_timer);
+ }
+ if (iscsilun->type != TYPE_DISK) {
+ ret = -ENODEV;
+ goto out;
+ }
+ if (bs.total_sectors < total_size) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (iscsilun->iscsi != NULL) {
+ iscsi_destroy_context(iscsilun->iscsi);
+ }
+ g_free(bs.opaque);
+ return ret;
+}
+
+static QEMUOptionParameter iscsi_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ { NULL }
+};
+
static BlockDriver bdrv_iscsi = {
.format_name = "iscsi",
.protocol_name = "iscsi",
@@ -963,8 +1312,11 @@ static BlockDriver bdrv_iscsi = {
.instance_size = sizeof(IscsiLun),
.bdrv_file_open = iscsi_open,
.bdrv_close = iscsi_close,
+ .bdrv_create = iscsi_create,
+ .create_options = iscsi_create_options,
.bdrv_getlength = iscsi_getlength,
+ .bdrv_truncate = iscsi_truncate,
.bdrv_aio_readv = iscsi_aio_readv,
.bdrv_aio_writev = iscsi_aio_writev,
@@ -979,9 +1331,36 @@ static BlockDriver bdrv_iscsi = {
#endif
};
+static QemuOptsList qemu_iscsi_opts = {
+ .name = "iscsi",
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_iscsi_opts.head),
+ .desc = {
+ {
+ .name = "user",
+ .type = QEMU_OPT_STRING,
+ .help = "username for CHAP authentication to target",
+ },{
+ .name = "password",
+ .type = QEMU_OPT_STRING,
+ .help = "password for CHAP authentication to target",
+ },{
+ .name = "header-digest",
+ .type = QEMU_OPT_STRING,
+ .help = "HeaderDigest setting. "
+ "{CRC32C|CRC32C-NONE|NONE-CRC32C|NONE}",
+ },{
+ .name = "initiator-name",
+ .type = QEMU_OPT_STRING,
+ .help = "Initiator iqn name to use when connecting",
+ },
+ { /* end of list */ }
+ },
+};
+
static void iscsi_block_init(void)
{
bdrv_register(&bdrv_iscsi);
+ qemu_add_opts(&qemu_iscsi_opts);
}
block_init(iscsi_block_init);
diff --git a/block/linux-aio.c b/block/linux-aio.c
index 91ef86324..ee0f8d10c 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -8,10 +8,10 @@
* See the COPYING file in the top-level directory.
*/
#include "qemu-common.h"
-#include "qemu-aio.h"
-#include "qemu-queue.h"
+#include "block/aio.h"
+#include "qemu/queue.h"
#include "block/raw-aio.h"
-#include "event_notifier.h"
+#include "qemu/event_notifier.h"
#include <libaio.h>
diff --git a/block/mirror.c b/block/mirror.c
index d6618a4b3..bed4a7ead 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -12,20 +12,20 @@
*/
#include "trace.h"
-#include "blockjob.h"
-#include "block_int.h"
+#include "block/blockjob.h"
+#include "block/block_int.h"
#include "qemu/ratelimit.h"
+#include "qemu/bitmap.h"
-enum {
- /*
- * Size of data buffer for populating the image file. This should be large
- * enough to process multiple clusters in a single call, so that populating
- * contiguous regions of the image is efficient.
- */
- BLOCK_SIZE = 512 * BDRV_SECTORS_PER_DIRTY_CHUNK, /* in bytes */
-};
+#define SLICE_TIME 100000000ULL /* ns */
+#define MAX_IN_FLIGHT 16
-#define SLICE_TIME 100000000ULL /* ns */
+/* The mirroring buffer is a list of granularity-sized chunks.
+ * Free chunks are organized in a list.
+ */
+typedef struct MirrorBuffer {
+ QSIMPLEQ_ENTRY(MirrorBuffer) next;
+} MirrorBuffer;
typedef struct MirrorBlockJob {
BlockJob common;
@@ -36,9 +36,26 @@ typedef struct MirrorBlockJob {
bool synced;
bool should_complete;
int64_t sector_num;
+ int64_t granularity;
+ size_t buf_size;
+ unsigned long *cow_bitmap;
+ HBitmapIter hbi;
uint8_t *buf;
+ QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
+ int buf_free_count;
+
+ unsigned long *in_flight_bitmap;
+ int in_flight;
+ int ret;
} MirrorBlockJob;
+typedef struct MirrorOp {
+ MirrorBlockJob *s;
+ QEMUIOVector qiov;
+ int64_t sector_num;
+ int nb_sectors;
+} MirrorOp;
+
static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
int error)
{
@@ -52,51 +69,234 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
}
}
-static int coroutine_fn mirror_iteration(MirrorBlockJob *s,
- BlockErrorAction *p_action)
+static void mirror_iteration_done(MirrorOp *op, int ret)
{
- BlockDriverState *source = s->common.bs;
- BlockDriverState *target = s->target;
- QEMUIOVector qiov;
- int ret, nb_sectors;
- int64_t end;
- struct iovec iov;
+ MirrorBlockJob *s = op->s;
+ struct iovec *iov;
+ int64_t chunk_num;
+ int i, nb_chunks, sectors_per_chunk;
+
+ trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);
+
+ s->in_flight--;
+ iov = op->qiov.iov;
+ for (i = 0; i < op->qiov.niov; i++) {
+ MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
+ QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
+ s->buf_free_count++;
+ }
- end = s->common.len >> BDRV_SECTOR_BITS;
- s->sector_num = bdrv_get_next_dirty(source, s->sector_num);
- nb_sectors = MIN(BDRV_SECTORS_PER_DIRTY_CHUNK, end - s->sector_num);
- bdrv_reset_dirty(source, s->sector_num, nb_sectors);
+ sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
+ chunk_num = op->sector_num / sectors_per_chunk;
+ nb_chunks = op->nb_sectors / sectors_per_chunk;
+ bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
+ if (s->cow_bitmap && ret >= 0) {
+ bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
+ }
- /* Copy the dirty cluster. */
- iov.iov_base = s->buf;
- iov.iov_len = nb_sectors * 512;
- qemu_iovec_init_external(&qiov, &iov, 1);
+ g_slice_free(MirrorOp, op);
+ qemu_coroutine_enter(s->common.co, NULL);
+}
- trace_mirror_one_iteration(s, s->sector_num, nb_sectors);
- ret = bdrv_co_readv(source, s->sector_num, nb_sectors, &qiov);
+static void mirror_write_complete(void *opaque, int ret)
+{
+ MirrorOp *op = opaque;
+ MirrorBlockJob *s = op->s;
if (ret < 0) {
- *p_action = mirror_error_action(s, true, -ret);
- goto fail;
+ BlockDriverState *source = s->common.bs;
+ BlockErrorAction action;
+
+ bdrv_set_dirty(source, op->sector_num, op->nb_sectors);
+ action = mirror_error_action(s, false, -ret);
+ if (action == BDRV_ACTION_REPORT && s->ret >= 0) {
+ s->ret = ret;
+ }
}
- ret = bdrv_co_writev(target, s->sector_num, nb_sectors, &qiov);
+ mirror_iteration_done(op, ret);
+}
+
+static void mirror_read_complete(void *opaque, int ret)
+{
+ MirrorOp *op = opaque;
+ MirrorBlockJob *s = op->s;
if (ret < 0) {
- *p_action = mirror_error_action(s, false, -ret);
- s->synced = false;
- goto fail;
+ BlockDriverState *source = s->common.bs;
+ BlockErrorAction action;
+
+ bdrv_set_dirty(source, op->sector_num, op->nb_sectors);
+ action = mirror_error_action(s, true, -ret);
+ if (action == BDRV_ACTION_REPORT && s->ret >= 0) {
+ s->ret = ret;
+ }
+
+ mirror_iteration_done(op, ret);
+ return;
+ }
+ bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors,
+ mirror_write_complete, op);
+}
+
+static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
+{
+ BlockDriverState *source = s->common.bs;
+ int nb_sectors, sectors_per_chunk, nb_chunks;
+ int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
+ MirrorOp *op;
+
+ s->sector_num = hbitmap_iter_next(&s->hbi);
+ if (s->sector_num < 0) {
+ bdrv_dirty_iter_init(source, &s->hbi);
+ s->sector_num = hbitmap_iter_next(&s->hbi);
+ trace_mirror_restart_iter(s, bdrv_get_dirty_count(source));
+ assert(s->sector_num >= 0);
+ }
+
+ hbitmap_next_sector = s->sector_num;
+ sector_num = s->sector_num;
+ sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
+ end = s->common.len >> BDRV_SECTOR_BITS;
+
+ /* Extend the QEMUIOVector to include all adjacent blocks that will
+ * be copied in this operation.
+ *
+ * We have to do this if we have no backing file yet in the destination,
+ * and the cluster size is very large. Then we need to do COW ourselves.
+ * The first time a cluster is copied, copy it entirely. Note that,
+ * because both the granularity and the cluster size are powers of two,
+ * the number of sectors to copy cannot exceed one cluster.
+ *
+ * We also want to extend the QEMUIOVector to include more adjacent
+ * dirty blocks if possible, to limit the number of I/O operations and
+ * run efficiently even with a small granularity.
+ */
+ nb_chunks = 0;
+ nb_sectors = 0;
+ next_sector = sector_num;
+ next_chunk = sector_num / sectors_per_chunk;
+
+ /* Wait for I/O to this cluster (from a previous iteration) to be done. */
+ while (test_bit(next_chunk, s->in_flight_bitmap)) {
+ trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
+ qemu_coroutine_yield();
+ }
+
+ do {
+ int added_sectors, added_chunks;
+
+ if (!bdrv_get_dirty(source, next_sector) ||
+ test_bit(next_chunk, s->in_flight_bitmap)) {
+ assert(nb_sectors > 0);
+ break;
+ }
+
+ added_sectors = sectors_per_chunk;
+ if (s->cow_bitmap && !test_bit(next_chunk, s->cow_bitmap)) {
+ bdrv_round_to_clusters(s->target,
+ next_sector, added_sectors,
+ &next_sector, &added_sectors);
+
+ /* On the first iteration, the rounding may make us copy
+ * sectors before the first dirty one.
+ */
+ if (next_sector < sector_num) {
+ assert(nb_sectors == 0);
+ sector_num = next_sector;
+ next_chunk = next_sector / sectors_per_chunk;
+ }
+ }
+
+ added_sectors = MIN(added_sectors, end - (sector_num + nb_sectors));
+ added_chunks = (added_sectors + sectors_per_chunk - 1) / sectors_per_chunk;
+
+ /* When doing COW, it may happen that there is not enough space for
+ * a full cluster. Wait if that is the case.
+ */
+ while (nb_chunks == 0 && s->buf_free_count < added_chunks) {
+ trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight);
+ qemu_coroutine_yield();
+ }
+ if (s->buf_free_count < nb_chunks + added_chunks) {
+ trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight);
+ break;
+ }
+
+ /* We have enough free space to copy these sectors. */
+ bitmap_set(s->in_flight_bitmap, next_chunk, added_chunks);
+
+ nb_sectors += added_sectors;
+ nb_chunks += added_chunks;
+ next_sector += added_sectors;
+ next_chunk += added_chunks;
+ } while (next_sector < end);
+
+ /* Allocate a MirrorOp that is used as an AIO callback. */
+ op = g_slice_new(MirrorOp);
+ op->s = s;
+ op->sector_num = sector_num;
+ op->nb_sectors = nb_sectors;
+
+ /* Now make a QEMUIOVector taking enough granularity-sized chunks
+ * from s->buf_free.
+ */
+ qemu_iovec_init(&op->qiov, nb_chunks);
+ next_sector = sector_num;
+ while (nb_chunks-- > 0) {
+ MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
+ QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
+ s->buf_free_count--;
+ qemu_iovec_add(&op->qiov, buf, s->granularity);
+
+ /* Advance the HBitmapIter in parallel, so that we do not examine
+ * the same sector twice.
+ */
+ if (next_sector > hbitmap_next_sector && bdrv_get_dirty(source, next_sector)) {
+ hbitmap_next_sector = hbitmap_iter_next(&s->hbi);
+ }
+
+ next_sector += sectors_per_chunk;
}
- return 0;
-fail:
- /* Try again later. */
- bdrv_set_dirty(source, s->sector_num, nb_sectors);
- return ret;
+ bdrv_reset_dirty(source, sector_num, nb_sectors);
+
+ /* Copy the dirty cluster. */
+ s->in_flight++;
+ trace_mirror_one_iteration(s, sector_num, nb_sectors);
+ bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
+ mirror_read_complete, op);
+}
+
+static void mirror_free_init(MirrorBlockJob *s)
+{
+ int granularity = s->granularity;
+ size_t buf_size = s->buf_size;
+ uint8_t *buf = s->buf;
+
+ assert(s->buf_free_count == 0);
+ QSIMPLEQ_INIT(&s->buf_free);
+ while (buf_size != 0) {
+ MirrorBuffer *cur = (MirrorBuffer *)buf;
+ QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next);
+ s->buf_free_count++;
+ buf_size -= granularity;
+ buf += granularity;
+ }
+}
+
+static void mirror_drain(MirrorBlockJob *s)
+{
+ while (s->in_flight > 0) {
+ qemu_coroutine_yield();
+ }
}
static void coroutine_fn mirror_run(void *opaque)
{
MirrorBlockJob *s = opaque;
BlockDriverState *bs = s->common.bs;
- int64_t sector_num, end;
+ int64_t sector_num, end, sectors_per_chunk, length;
+ uint64_t last_pause_ns;
+ BlockDriverInfo bdi;
+ char backing_filename[1024];
int ret = 0;
int n;
@@ -105,20 +305,39 @@ static void coroutine_fn mirror_run(void *opaque)
}
s->common.len = bdrv_getlength(bs);
- if (s->common.len < 0) {
+ if (s->common.len <= 0) {
block_job_completed(&s->common, s->common.len);
return;
}
+ length = (bdrv_getlength(bs) + s->granularity - 1) / s->granularity;
+ s->in_flight_bitmap = bitmap_new(length);
+
+ /* If we have no backing file yet in the destination, we cannot let
+ * the destination do COW. Instead, we copy sectors around the
+ * dirty data if needed. We need a bitmap to do that.
+ */
+ bdrv_get_backing_filename(s->target, backing_filename,
+ sizeof(backing_filename));
+ if (backing_filename[0] && !s->target->backing_hd) {
+ bdrv_get_info(s->target, &bdi);
+ if (s->granularity < bdi.cluster_size) {
+ s->buf_size = MAX(s->buf_size, bdi.cluster_size);
+ s->cow_bitmap = bitmap_new(length);
+ }
+ }
+
end = s->common.len >> BDRV_SECTOR_BITS;
- s->buf = qemu_blockalign(bs, BLOCK_SIZE);
+ s->buf = qemu_blockalign(bs, s->buf_size);
+ sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
+ mirror_free_init(s);
if (s->mode != MIRROR_SYNC_MODE_NONE) {
/* First part, loop on the sectors and initialize the dirty bitmap. */
BlockDriverState *base;
base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd;
for (sector_num = 0; sector_num < end; ) {
- int64_t next = (sector_num | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1;
+ int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1;
ret = bdrv_co_is_allocated_above(bs, base,
sector_num, next - sector_num, &n);
@@ -136,24 +355,40 @@ static void coroutine_fn mirror_run(void *opaque)
}
}
- s->sector_num = -1;
+ bdrv_dirty_iter_init(bs, &s->hbi);
+ last_pause_ns = qemu_get_clock_ns(rt_clock);
for (;;) {
uint64_t delay_ns;
int64_t cnt;
bool should_complete;
+ if (s->ret < 0) {
+ ret = s->ret;
+ goto immediate_exit;
+ }
+
cnt = bdrv_get_dirty_count(bs);
- if (cnt != 0) {
- BlockErrorAction action = BDRV_ACTION_REPORT;
- ret = mirror_iteration(s, &action);
- if (ret < 0 && action == BDRV_ACTION_REPORT) {
- goto immediate_exit;
+
+ /* Note that even when no rate limit is applied we need to yield
+ * periodically with no pending I/O so that qemu_aio_flush() returns.
+ * We do so every SLICE_TIME nanoseconds, or when there is an error,
+ * or when the source is clean, whichever comes first.
+ */
+ if (qemu_get_clock_ns(rt_clock) - last_pause_ns < SLICE_TIME &&
+ s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
+ if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
+ (cnt == 0 && s->in_flight > 0)) {
+ trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
+ qemu_coroutine_yield();
+ continue;
+ } else if (cnt != 0) {
+ mirror_iteration(s);
+ continue;
}
- cnt = bdrv_get_dirty_count(bs);
}
should_complete = false;
- if (cnt == 0) {
+ if (s->in_flight == 0 && cnt == 0) {
trace_mirror_before_flush(s);
ret = bdrv_flush(s->target);
if (ret < 0) {
@@ -196,23 +431,20 @@ static void coroutine_fn mirror_run(void *opaque)
trace_mirror_before_sleep(s, cnt, s->synced);
if (!s->synced) {
/* Publish progress */
- s->common.offset = end * BDRV_SECTOR_SIZE - cnt * BLOCK_SIZE;
+ s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE;
if (s->common.speed) {
- delay_ns = ratelimit_calculate_delay(&s->limit, BDRV_SECTORS_PER_DIRTY_CHUNK);
+ delay_ns = ratelimit_calculate_delay(&s->limit, sectors_per_chunk);
} else {
delay_ns = 0;
}
- /* Note that even when no rate limit is applied we need to yield
- * with no pending I/O here so that qemu_aio_flush() returns.
- */
block_job_sleep_ns(&s->common, rt_clock, delay_ns);
if (block_job_is_cancelled(&s->common)) {
break;
}
} else if (!should_complete) {
- delay_ns = (cnt == 0 ? SLICE_TIME : 0);
+ delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
block_job_sleep_ns(&s->common, rt_clock, delay_ns);
} else if (cnt == 0) {
/* The two disks are in sync. Exit and report successful
@@ -222,11 +454,24 @@ static void coroutine_fn mirror_run(void *opaque)
s->common.cancelled = false;
break;
}
+ last_pause_ns = qemu_get_clock_ns(rt_clock);
}
immediate_exit:
- g_free(s->buf);
- bdrv_set_dirty_tracking(bs, false);
+ if (s->in_flight > 0) {
+ /* We get here only if something went wrong. Either the job failed,
+ * or it was cancelled prematurely so that we do not guarantee that
+ * the target is a copy of the source.
+ */
+ assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
+ mirror_drain(s);
+ }
+
+ assert(s->in_flight == 0);
+ qemu_vfree(s->buf);
+ g_free(s->cow_bitmap);
+ g_free(s->in_flight_bitmap);
+ bdrv_set_dirty_tracking(bs, 0);
bdrv_iostatus_disable(s->target);
if (s->should_complete && ret == 0) {
if (bdrv_get_flags(s->target) != bdrv_get_flags(s->common.bs)) {
@@ -262,12 +507,12 @@ static void mirror_complete(BlockJob *job, Error **errp)
MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
int ret;
- ret = bdrv_open_backing_file(s->target);
+ ret = bdrv_open_backing_file(s->target, NULL);
if (ret < 0) {
char backing_filename[PATH_MAX];
bdrv_get_full_backing_filename(s->target, backing_filename,
sizeof(backing_filename));
- error_set(errp, QERR_OPEN_FILE_FAILED, backing_filename);
+ error_setg_file_open(errp, -ret, backing_filename);
return;
}
if (!s->synced) {
@@ -279,7 +524,7 @@ static void mirror_complete(BlockJob *job, Error **errp)
block_job_resume(job);
}
-static BlockJobType mirror_job_type = {
+static const BlockJobType mirror_job_type = {
.instance_size = sizeof(MirrorBlockJob),
.job_type = "mirror",
.set_speed = mirror_set_speed,
@@ -288,14 +533,28 @@ static BlockJobType mirror_job_type = {
};
void mirror_start(BlockDriverState *bs, BlockDriverState *target,
- int64_t speed, MirrorSyncMode mode,
- BlockdevOnError on_source_error,
+ int64_t speed, int64_t granularity, int64_t buf_size,
+ MirrorSyncMode mode, BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
BlockDriverCompletionFunc *cb,
void *opaque, Error **errp)
{
MirrorBlockJob *s;
+ if (granularity == 0) {
+ /* Choose the default granularity based on the target file's cluster
+ * size, clamped between 4k and 64k. */
+ BlockDriverInfo bdi;
+ if (bdrv_get_info(target, &bdi) >= 0 && bdi.cluster_size != 0) {
+ granularity = MAX(4096, bdi.cluster_size);
+ granularity = MIN(65536, granularity);
+ } else {
+ granularity = 65536;
+ }
+ }
+
+ assert ((granularity & (granularity - 1)) == 0);
+
if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
!bdrv_iostatus_is_enabled(bs)) {
@@ -312,7 +571,10 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
s->on_target_error = on_target_error;
s->target = target;
s->mode = mode;
- bdrv_set_dirty_tracking(bs, true);
+ s->granularity = granularity;
+ s->buf_size = MAX(buf_size, granularity);
+
+ bdrv_set_dirty_tracking(bs, granularity);
bdrv_set_enable_write_cache(s->target, true);
bdrv_set_on_error(s->target, on_target_error, on_target_error);
bdrv_iostatus_enable(s->target);
diff --git a/block/nbd.c b/block/nbd.c
index e87c24817..9c480b8f2 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -27,11 +27,13 @@
*/
#include "qemu-common.h"
-#include "nbd.h"
-#include "uri.h"
-#include "block_int.h"
-#include "module.h"
-#include "qemu_socket.h"
+#include "block/nbd.h"
+#include "qemu/uri.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "qemu/sockets.h"
+#include "qapi/qmp/qjson.h"
+#include "qapi/qmp/qint.h"
#include <sys/types.h>
#include <unistd.h>
@@ -65,17 +67,19 @@ typedef struct BDRVNBDState {
Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
struct nbd_reply reply;
- int is_unix;
- char *host_spec;
+ bool is_unix;
+ QemuOpts *socket_opts;
+
char *export_name; /* An NBD server may export several devices */
} BDRVNBDState;
-static int nbd_parse_uri(BDRVNBDState *s, const char *filename)
+static int nbd_parse_uri(const char *filename, QDict *options)
{
URI *uri;
const char *p;
QueryParams *qp = NULL;
int ret = 0;
+ bool is_unix;
uri = uri_parse(filename);
if (!uri) {
@@ -84,11 +88,11 @@ static int nbd_parse_uri(BDRVNBDState *s, const char *filename)
/* transport */
if (!strcmp(uri->scheme, "nbd")) {
- s->is_unix = false;
+ is_unix = false;
} else if (!strcmp(uri->scheme, "nbd+tcp")) {
- s->is_unix = false;
+ is_unix = false;
} else if (!strcmp(uri->scheme, "nbd+unix")) {
- s->is_unix = true;
+ is_unix = true;
} else {
ret = -EINVAL;
goto out;
@@ -97,32 +101,44 @@ static int nbd_parse_uri(BDRVNBDState *s, const char *filename)
p = uri->path ? uri->path : "/";
p += strspn(p, "/");
if (p[0]) {
- s->export_name = g_strdup(p);
+ qdict_put(options, "export", qstring_from_str(p));
}
qp = query_params_parse(uri->query);
- if (qp->n > 1 || (s->is_unix && !qp->n) || (!s->is_unix && qp->n)) {
+ if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) {
ret = -EINVAL;
goto out;
}
- if (s->is_unix) {
+ if (is_unix) {
/* nbd+unix:///export?socket=path */
if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) {
ret = -EINVAL;
goto out;
}
- s->host_spec = g_strdup(qp->p[0].value);
+ qdict_put(options, "path", qstring_from_str(qp->p[0].value));
} else {
- /* nbd[+tcp]://host:port/export */
+ QString *host;
+ /* nbd[+tcp]://host[:port]/export */
if (!uri->server) {
ret = -EINVAL;
goto out;
}
- if (!uri->port) {
- uri->port = NBD_DEFAULT_PORT;
+
+ /* strip braces from literal IPv6 address */
+ if (uri->server[0] == '[') {
+ host = qstring_from_substr(uri->server, 1,
+ strlen(uri->server) - 2);
+ } else {
+ host = qstring_from_str(uri->server);
+ }
+
+ qdict_put(options, "host", host);
+ if (uri->port) {
+ char* port_str = g_strdup_printf("%d", uri->port);
+ qdict_put(options, "port", qstring_from_str(port_str));
+ g_free(port_str);
}
- s->host_spec = g_strdup_printf("%s:%d", uri->server, uri->port);
}
out:
@@ -133,16 +149,29 @@ out:
return ret;
}
-static int nbd_config(BDRVNBDState *s, const char *filename)
+static void nbd_parse_filename(const char *filename, QDict *options,
+ Error **errp)
{
char *file;
char *export_name;
const char *host_spec;
const char *unixpath;
- int err = -EINVAL;
+
+ if (qdict_haskey(options, "host")
+ || qdict_haskey(options, "port")
+ || qdict_haskey(options, "path"))
+ {
+ error_setg(errp, "host/port/path and a file name may not be specified "
+ "at the same time");
+ return;
+ }
if (strstr(filename, "://")) {
- return nbd_parse_uri(s, filename);
+ int ret = nbd_parse_uri(filename, options);
+ if (ret < 0) {
+ error_setg(errp, "No valid URL specified");
+ }
+ return;
}
file = g_strdup(filename);
@@ -154,34 +183,79 @@ static int nbd_config(BDRVNBDState *s, const char *filename)
}
export_name[0] = 0; /* truncate 'file' */
export_name += strlen(EN_OPTSTR);
- s->export_name = g_strdup(export_name);
+
+ qdict_put(options, "export", qstring_from_str(export_name));
}
/* extract the host_spec - fail if it's not nbd:... */
if (!strstart(file, "nbd:", &host_spec)) {
+ error_setg(errp, "File name string for NBD must start with 'nbd:'");
+ goto out;
+ }
+
+ if (!*host_spec) {
goto out;
}
/* are we a UNIX or TCP socket? */
if (strstart(host_spec, "unix:", &unixpath)) {
- s->is_unix = true;
- s->host_spec = g_strdup(unixpath);
+ qdict_put(options, "path", qstring_from_str(unixpath));
} else {
- s->is_unix = false;
- s->host_spec = g_strdup(host_spec);
- }
+ InetSocketAddress *addr = NULL;
- err = 0;
+ addr = inet_parse(host_spec, errp);
+ if (error_is_set(errp)) {
+ goto out;
+ }
+
+ qdict_put(options, "host", qstring_from_str(addr->host));
+ qdict_put(options, "port", qstring_from_str(addr->port));
+ qapi_free_InetSocketAddress(addr);
+ }
out:
g_free(file);
- if (err != 0) {
- g_free(s->export_name);
- g_free(s->host_spec);
+}
+
+static int nbd_config(BDRVNBDState *s, QDict *options)
+{
+ Error *local_err = NULL;
+
+ if (qdict_haskey(options, "path")) {
+ if (qdict_haskey(options, "host")) {
+ qerror_report(ERROR_CLASS_GENERIC_ERROR, "path and host may not "
+ "be used at the same time.");
+ return -EINVAL;
+ }
+ s->is_unix = true;
+ } else if (qdict_haskey(options, "host")) {
+ s->is_unix = false;
+ } else {
+ return -EINVAL;
+ }
+
+ s->socket_opts = qemu_opts_create_nofail(&socket_optslist);
+
+ qemu_opts_absorb_qdict(s->socket_opts, options, &local_err);
+ if (error_is_set(&local_err)) {
+ qerror_report_err(local_err);
+ error_free(local_err);
+ return -EINVAL;
+ }
+
+ if (!qemu_opt_get(s->socket_opts, "port")) {
+ qemu_opt_set_number(s->socket_opts, "port", NBD_DEFAULT_PORT);
}
- return err;
+
+ s->export_name = g_strdup(qdict_get_try_str(options, "export"));
+ if (s->export_name) {
+ qdict_del(options, "export");
+ }
+
+ return 0;
}
+
static void nbd_coroutine_start(BDRVNBDState *s, struct nbd_request *request)
{
int i;
@@ -269,13 +343,23 @@ static int nbd_co_send_request(BDRVNBDState *s, struct nbd_request *request,
s->send_coroutine = qemu_coroutine_self();
qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write,
nbd_have_request, s);
- rc = nbd_send_request(s->sock, request);
- if (rc >= 0 && qiov) {
- ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov,
- offset, request->len);
- if (ret != request->len) {
- return -EIO;
+ if (qiov) {
+ if (!s->is_unix) {
+ socket_set_cork(s->sock, 1);
}
+ rc = nbd_send_request(s->sock, request);
+ if (rc >= 0) {
+ ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov,
+ offset, request->len);
+ if (ret != request->len) {
+ rc = -EIO;
+ }
+ }
+ if (!s->is_unix) {
+ socket_set_cork(s->sock, 0);
+ }
+ } else {
+ rc = nbd_send_request(s->sock, request);
}
qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL,
nbd_have_request, s);
@@ -328,9 +412,12 @@ static int nbd_establish_connection(BlockDriverState *bs)
size_t blocksize;
if (s->is_unix) {
- sock = unix_socket_outgoing(s->host_spec);
+ sock = unix_socket_outgoing(qemu_opt_get(s->socket_opts, "path"));
} else {
- sock = tcp_socket_outgoing_spec(s->host_spec);
+ sock = tcp_socket_outgoing_opts(s->socket_opts);
+ if (sock >= 0) {
+ socket_set_nodelay(sock);
+ }
}
/* Failed to establish connection */
@@ -350,7 +437,7 @@ static int nbd_establish_connection(BlockDriverState *bs)
/* Now that we're connected, set the socket to be non-blocking and
* kick the reply mechanism. */
- socket_set_nonblock(sock);
+ qemu_set_nonblock(sock);
qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL,
nbd_have_request, s);
@@ -376,7 +463,7 @@ static void nbd_teardown_connection(BlockDriverState *bs)
closesocket(s->sock);
}
-static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
+static int nbd_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVNBDState *s = bs->opaque;
int result;
@@ -385,7 +472,7 @@ static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
qemu_co_mutex_init(&s->free_sema);
/* Pop the config into our state object. Exit if invalid. */
- result = nbd_config(s, filename);
+ result = nbd_config(s, options);
if (result != 0) {
return result;
}
@@ -531,7 +618,7 @@ static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
return 0;
}
request.type = NBD_CMD_TRIM;
- request.from = sector_num * 512;;
+ request.from = sector_num * 512;
request.len = nb_sectors * 512;
nbd_coroutine_start(s, &request);
@@ -549,7 +636,7 @@ static void nbd_close(BlockDriverState *bs)
{
BDRVNBDState *s = bs->opaque;
g_free(s->export_name);
- g_free(s->host_spec);
+ qemu_opts_del(s->socket_opts);
nbd_teardown_connection(bs);
}
@@ -565,6 +652,7 @@ static BlockDriver bdrv_nbd = {
.format_name = "nbd",
.protocol_name = "nbd",
.instance_size = sizeof(BDRVNBDState),
+ .bdrv_parse_filename = nbd_parse_filename,
.bdrv_file_open = nbd_open,
.bdrv_co_readv = nbd_co_readv,
.bdrv_co_writev = nbd_co_writev,
@@ -578,6 +666,7 @@ static BlockDriver bdrv_nbd_tcp = {
.format_name = "nbd",
.protocol_name = "nbd+tcp",
.instance_size = sizeof(BDRVNBDState),
+ .bdrv_parse_filename = nbd_parse_filename,
.bdrv_file_open = nbd_open,
.bdrv_co_readv = nbd_co_readv,
.bdrv_co_writev = nbd_co_writev,
@@ -591,6 +680,7 @@ static BlockDriver bdrv_nbd_unix = {
.format_name = "nbd",
.protocol_name = "nbd+unix",
.instance_size = sizeof(BDRVNBDState),
+ .bdrv_parse_filename = nbd_parse_filename,
.bdrv_file_open = nbd_open,
.bdrv_co_readv = nbd_co_readv,
.bdrv_co_writev = nbd_co_writev,
diff --git a/block/parallels.c b/block/parallels.c
index d30f0ecf7..18b3ac0b2 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -24,8 +24,8 @@
* THE SOFTWARE.
*/
#include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
/**************************************************************/
@@ -68,19 +68,23 @@ static int parallels_probe(const uint8_t *buf, int buf_size, const char *filenam
return 0;
}
-static int parallels_open(BlockDriverState *bs, int flags)
+static int parallels_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVParallelsState *s = bs->opaque;
int i;
struct parallels_header ph;
+ int ret;
bs->read_only = 1; // no write support yet
- if (bdrv_pread(bs->file, 0, &ph, sizeof(ph)) != sizeof(ph))
+ ret = bdrv_pread(bs->file, 0, &ph, sizeof(ph));
+ if (ret < 0) {
goto fail;
+ }
if (memcmp(ph.magic, HEADER_MAGIC, 16) ||
- (le32_to_cpu(ph.version) != HEADER_VERSION)) {
+ (le32_to_cpu(ph.version) != HEADER_VERSION)) {
+ ret = -EMEDIUMTYPE;
goto fail;
}
@@ -90,18 +94,21 @@ static int parallels_open(BlockDriverState *bs, int flags)
s->catalog_size = le32_to_cpu(ph.catalog_entries);
s->catalog_bitmap = g_malloc(s->catalog_size * 4);
- if (bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4) !=
- s->catalog_size * 4)
- goto fail;
+
+ ret = bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4);
+ if (ret < 0) {
+ goto fail;
+ }
+
for (i = 0; i < s->catalog_size; i++)
le32_to_cpus(&s->catalog_bitmap[i]);
qemu_co_mutex_init(&s->lock);
return 0;
+
fail:
- if (s->catalog_bitmap)
- g_free(s->catalog_bitmap);
- return -1;
+ g_free(s->catalog_bitmap);
+ return ret;
}
static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
diff --git a/block/qapi.c b/block/qapi.c
new file mode 100644
index 000000000..a4bc4113b
--- /dev/null
+++ b/block/qapi.c
@@ -0,0 +1,470 @@
+/*
+ * Block layer qmp and info dump related functions
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "block/qapi.h"
+#include "block/block_int.h"
+#include "qmp-commands.h"
+
+/*
+ * Returns 0 on success, with *p_list either set to describe snapshot
+ * information, or NULL because there are no snapshots. Returns -errno on
+ * error, with *p_list untouched.
+ */
+int bdrv_query_snapshot_info_list(BlockDriverState *bs,
+ SnapshotInfoList **p_list,
+ Error **errp)
+{
+ int i, sn_count;
+ QEMUSnapshotInfo *sn_tab = NULL;
+ SnapshotInfoList *info_list, *cur_item = NULL, *head = NULL;
+ SnapshotInfo *info;
+
+ sn_count = bdrv_snapshot_list(bs, &sn_tab);
+ if (sn_count < 0) {
+ const char *dev = bdrv_get_device_name(bs);
+ switch (sn_count) {
+ case -ENOMEDIUM:
+ error_setg(errp, "Device '%s' is not inserted", dev);
+ break;
+ case -ENOTSUP:
+ error_setg(errp,
+ "Device '%s' does not support internal snapshots",
+ dev);
+ break;
+ default:
+ error_setg_errno(errp, -sn_count,
+ "Can't list snapshots of device '%s'", dev);
+ break;
+ }
+ return sn_count;
+ }
+
+ for (i = 0; i < sn_count; i++) {
+ info = g_new0(SnapshotInfo, 1);
+ info->id = g_strdup(sn_tab[i].id_str);
+ info->name = g_strdup(sn_tab[i].name);
+ info->vm_state_size = sn_tab[i].vm_state_size;
+ info->date_sec = sn_tab[i].date_sec;
+ info->date_nsec = sn_tab[i].date_nsec;
+ info->vm_clock_sec = sn_tab[i].vm_clock_nsec / 1000000000;
+ info->vm_clock_nsec = sn_tab[i].vm_clock_nsec % 1000000000;
+
+ info_list = g_new0(SnapshotInfoList, 1);
+ info_list->value = info;
+
+ /* XXX: waiting for the qapi to support qemu-queue.h types */
+ if (!cur_item) {
+ head = cur_item = info_list;
+ } else {
+ cur_item->next = info_list;
+ cur_item = info_list;
+ }
+
+ }
+
+ g_free(sn_tab);
+ *p_list = head;
+ return 0;
+}
+
+/**
+ * bdrv_query_image_info:
+ * @bs: block device to examine
+ * @p_info: location to store image information
+ * @errp: location to store error information
+ *
+ * Store "flat" image information in @p_info.
+ *
+ * "Flat" means it does *not* query backing image information,
+ * i.e. (*pinfo)->has_backing_image will be set to false and
+ * (*pinfo)->backing_image to NULL even when the image does in fact have
+ * a backing image.
+ *
+ * @p_info will be set only on success. On error, store error in @errp.
+ */
+void bdrv_query_image_info(BlockDriverState *bs,
+ ImageInfo **p_info,
+ Error **errp)
+{
+ uint64_t total_sectors;
+ const char *backing_filename;
+ char backing_filename2[1024];
+ BlockDriverInfo bdi;
+ int ret;
+ Error *err = NULL;
+ ImageInfo *info = g_new0(ImageInfo, 1);
+
+ bdrv_get_geometry(bs, &total_sectors);
+
+ info->filename = g_strdup(bs->filename);
+ info->format = g_strdup(bdrv_get_format_name(bs));
+ info->virtual_size = total_sectors * 512;
+ info->actual_size = bdrv_get_allocated_file_size(bs);
+ info->has_actual_size = info->actual_size >= 0;
+ if (bdrv_is_encrypted(bs)) {
+ info->encrypted = true;
+ info->has_encrypted = true;
+ }
+ if (bdrv_get_info(bs, &bdi) >= 0) {
+ if (bdi.cluster_size != 0) {
+ info->cluster_size = bdi.cluster_size;
+ info->has_cluster_size = true;
+ }
+ info->dirty_flag = bdi.is_dirty;
+ info->has_dirty_flag = true;
+ }
+ backing_filename = bs->backing_file;
+ if (backing_filename[0] != '\0') {
+ info->backing_filename = g_strdup(backing_filename);
+ info->has_backing_filename = true;
+ bdrv_get_full_backing_filename(bs, backing_filename2,
+ sizeof(backing_filename2));
+
+ if (strcmp(backing_filename, backing_filename2) != 0) {
+ info->full_backing_filename =
+ g_strdup(backing_filename2);
+ info->has_full_backing_filename = true;
+ }
+
+ if (bs->backing_format[0]) {
+ info->backing_filename_format = g_strdup(bs->backing_format);
+ info->has_backing_filename_format = true;
+ }
+ }
+
+ ret = bdrv_query_snapshot_info_list(bs, &info->snapshots, &err);
+ switch (ret) {
+ case 0:
+ if (info->snapshots) {
+ info->has_snapshots = true;
+ }
+ break;
+ /* recoverable error */
+ case -ENOMEDIUM:
+ case -ENOTSUP:
+ error_free(err);
+ break;
+ default:
+ error_propagate(errp, err);
+ qapi_free_ImageInfo(info);
+ return;
+ }
+
+ *p_info = info;
+}
+
+/* @p_info will be set only on success. */
+void bdrv_query_info(BlockDriverState *bs,
+ BlockInfo **p_info,
+ Error **errp)
+{
+ BlockInfo *info = g_malloc0(sizeof(*info));
+ BlockDriverState *bs0;
+ ImageInfo **p_image_info;
+ Error *local_err = NULL;
+ info->device = g_strdup(bs->device_name);
+ info->type = g_strdup("unknown");
+ info->locked = bdrv_dev_is_medium_locked(bs);
+ info->removable = bdrv_dev_has_removable_media(bs);
+
+ if (bdrv_dev_has_removable_media(bs)) {
+ info->has_tray_open = true;
+ info->tray_open = bdrv_dev_is_tray_open(bs);
+ }
+
+ if (bdrv_iostatus_is_enabled(bs)) {
+ info->has_io_status = true;
+ info->io_status = bs->iostatus;
+ }
+
+ if (bs->dirty_bitmap) {
+ info->has_dirty = true;
+ info->dirty = g_malloc0(sizeof(*info->dirty));
+ info->dirty->count = bdrv_get_dirty_count(bs) * BDRV_SECTOR_SIZE;
+ info->dirty->granularity =
+ ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bs->dirty_bitmap));
+ }
+
+ if (bs->drv) {
+ info->has_inserted = true;
+ info->inserted = g_malloc0(sizeof(*info->inserted));
+ info->inserted->file = g_strdup(bs->filename);
+ info->inserted->ro = bs->read_only;
+ info->inserted->drv = g_strdup(bs->drv->format_name);
+ info->inserted->encrypted = bs->encrypted;
+ info->inserted->encryption_key_missing = bdrv_key_required(bs);
+
+ if (bs->backing_file[0]) {
+ info->inserted->has_backing_file = true;
+ info->inserted->backing_file = g_strdup(bs->backing_file);
+ }
+
+ info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs);
+
+ if (bs->io_limits_enabled) {
+ info->inserted->bps =
+ bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
+ info->inserted->bps_rd =
+ bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
+ info->inserted->bps_wr =
+ bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
+ info->inserted->iops =
+ bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
+ info->inserted->iops_rd =
+ bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
+ info->inserted->iops_wr =
+ bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
+ }
+
+ bs0 = bs;
+ p_image_info = &info->inserted->image;
+ while (1) {
+ bdrv_query_image_info(bs0, p_image_info, &local_err);
+ if (error_is_set(&local_err)) {
+ error_propagate(errp, local_err);
+ goto err;
+ }
+ if (bs0->drv && bs0->backing_hd) {
+ bs0 = bs0->backing_hd;
+ (*p_image_info)->has_backing_image = true;
+ p_image_info = &((*p_image_info)->backing_image);
+ } else {
+ break;
+ }
+ }
+ }
+
+ *p_info = info;
+ return;
+
+ err:
+ qapi_free_BlockInfo(info);
+}
+
+BlockStats *bdrv_query_stats(const BlockDriverState *bs)
+{
+ BlockStats *s;
+
+ s = g_malloc0(sizeof(*s));
+
+ if (bs->device_name[0]) {
+ s->has_device = true;
+ s->device = g_strdup(bs->device_name);
+ }
+
+ s->stats = g_malloc0(sizeof(*s->stats));
+ s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
+ s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
+ s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
+ s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
+ s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
+ s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
+ s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
+ s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
+ s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
+
+ if (bs->file) {
+ s->has_parent = true;
+ s->parent = bdrv_query_stats(bs->file);
+ }
+
+ return s;
+}
+
+BlockInfoList *qmp_query_block(Error **errp)
+{
+ BlockInfoList *head = NULL, **p_next = &head;
+ BlockDriverState *bs = NULL;
+ Error *local_err = NULL;
+
+ while ((bs = bdrv_next(bs))) {
+ BlockInfoList *info = g_malloc0(sizeof(*info));
+ bdrv_query_info(bs, &info->value, &local_err);
+ if (error_is_set(&local_err)) {
+ error_propagate(errp, local_err);
+ goto err;
+ }
+
+ *p_next = info;
+ p_next = &info->next;
+ }
+
+ return head;
+
+ err:
+ qapi_free_BlockInfoList(head);
+ return NULL;
+}
+
+BlockStatsList *qmp_query_blockstats(Error **errp)
+{
+ BlockStatsList *head = NULL, **p_next = &head;
+ BlockDriverState *bs = NULL;
+
+ while ((bs = bdrv_next(bs))) {
+ BlockStatsList *info = g_malloc0(sizeof(*info));
+ info->value = bdrv_query_stats(bs);
+
+ *p_next = info;
+ p_next = &info->next;
+ }
+
+ return head;
+}
+
+#define NB_SUFFIXES 4
+
+static char *get_human_readable_size(char *buf, int buf_size, int64_t size)
+{
+ static const char suffixes[NB_SUFFIXES] = "KMGT";
+ int64_t base;
+ int i;
+
+ if (size <= 999) {
+ snprintf(buf, buf_size, "%" PRId64, size);
+ } else {
+ base = 1024;
+ for (i = 0; i < NB_SUFFIXES; i++) {
+ if (size < (10 * base)) {
+ snprintf(buf, buf_size, "%0.1f%c",
+ (double)size / base,
+ suffixes[i]);
+ break;
+ } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
+ snprintf(buf, buf_size, "%" PRId64 "%c",
+ ((size + (base >> 1)) / base),
+ suffixes[i]);
+ break;
+ }
+ base = base * 1024;
+ }
+ }
+ return buf;
+}
+
+void bdrv_snapshot_dump(fprintf_function func_fprintf, void *f,
+ QEMUSnapshotInfo *sn)
+{
+ char buf1[128], date_buf[128], clock_buf[128];
+ struct tm tm;
+ time_t ti;
+ int64_t secs;
+
+ if (!sn) {
+ func_fprintf(f,
+ "%-10s%-20s%7s%20s%15s",
+ "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
+ } else {
+ ti = sn->date_sec;
+ localtime_r(&ti, &tm);
+ strftime(date_buf, sizeof(date_buf),
+ "%Y-%m-%d %H:%M:%S", &tm);
+ secs = sn->vm_clock_nsec / 1000000000;
+ snprintf(clock_buf, sizeof(clock_buf),
+ "%02d:%02d:%02d.%03d",
+ (int)(secs / 3600),
+ (int)((secs / 60) % 60),
+ (int)(secs % 60),
+ (int)((sn->vm_clock_nsec / 1000000) % 1000));
+ func_fprintf(f,
+ "%-10s%-20s%7s%20s%15s",
+ sn->id_str, sn->name,
+ get_human_readable_size(buf1, sizeof(buf1),
+ sn->vm_state_size),
+ date_buf,
+ clock_buf);
+ }
+}
+
+void bdrv_image_info_dump(fprintf_function func_fprintf, void *f,
+ ImageInfo *info)
+{
+ char size_buf[128], dsize_buf[128];
+ if (!info->has_actual_size) {
+ snprintf(dsize_buf, sizeof(dsize_buf), "unavailable");
+ } else {
+ get_human_readable_size(dsize_buf, sizeof(dsize_buf),
+ info->actual_size);
+ }
+ get_human_readable_size(size_buf, sizeof(size_buf), info->virtual_size);
+ func_fprintf(f,
+ "image: %s\n"
+ "file format: %s\n"
+ "virtual size: %s (%" PRId64 " bytes)\n"
+ "disk size: %s\n",
+ info->filename, info->format, size_buf,
+ info->virtual_size,
+ dsize_buf);
+
+ if (info->has_encrypted && info->encrypted) {
+ func_fprintf(f, "encrypted: yes\n");
+ }
+
+ if (info->has_cluster_size) {
+ func_fprintf(f, "cluster_size: %" PRId64 "\n",
+ info->cluster_size);
+ }
+
+ if (info->has_dirty_flag && info->dirty_flag) {
+ func_fprintf(f, "cleanly shut down: no\n");
+ }
+
+ if (info->has_backing_filename) {
+ func_fprintf(f, "backing file: %s", info->backing_filename);
+ if (info->has_full_backing_filename) {
+ func_fprintf(f, " (actual path: %s)", info->full_backing_filename);
+ }
+ func_fprintf(f, "\n");
+ if (info->has_backing_filename_format) {
+ func_fprintf(f, "backing file format: %s\n",
+ info->backing_filename_format);
+ }
+ }
+
+ if (info->has_snapshots) {
+ SnapshotInfoList *elem;
+
+ func_fprintf(f, "Snapshot list:\n");
+ bdrv_snapshot_dump(func_fprintf, f, NULL);
+ func_fprintf(f, "\n");
+
+ /* Ideally bdrv_snapshot_dump() would operate on SnapshotInfoList but
+ * we convert to the block layer's native QEMUSnapshotInfo for now.
+ */
+ for (elem = info->snapshots; elem; elem = elem->next) {
+ QEMUSnapshotInfo sn = {
+ .vm_state_size = elem->value->vm_state_size,
+ .date_sec = elem->value->date_sec,
+ .date_nsec = elem->value->date_nsec,
+ .vm_clock_nsec = elem->value->vm_clock_sec * 1000000000ULL +
+ elem->value->vm_clock_nsec,
+ };
+
+ pstrcpy(sn.id_str, sizeof(sn.id_str), elem->value->id);
+ pstrcpy(sn.name, sizeof(sn.name), elem->value->name);
+ bdrv_snapshot_dump(func_fprintf, f, &sn);
+ func_fprintf(f, "\n");
+ }
+ }
+}
diff --git a/block/qcow.c b/block/qcow.c
index b239c82ae..5239bd68f 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -22,11 +22,11 @@
* THE SOFTWARE.
*/
#include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
#include <zlib.h>
-#include "aes.h"
-#include "migration.h"
+#include "qemu/aes.h"
+#include "migration/migration.h"
/**************************************************************/
/* QEMU COW block driver with compression and encryption support */
@@ -92,7 +92,7 @@ static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
return 0;
}
-static int qcow_open(BlockDriverState *bs, int flags)
+static int qcow_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVQcowState *s = bs->opaque;
int len, i, shift, ret;
@@ -112,7 +112,7 @@ static int qcow_open(BlockDriverState *bs, int flags)
be64_to_cpus(&header.l1_table_offset);
if (header.magic != QCOW_MAGIC) {
- ret = -EINVAL;
+ ret = -EMEDIUMTYPE;
goto fail;
}
if (header.version != QCOW_VERSION) {
@@ -679,7 +679,7 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options)
return ret;
}
- ret = bdrv_file_open(&qcow_bs, filename, BDRV_O_RDWR);
+ ret = bdrv_file_open(&qcow_bs, filename, NULL, BDRV_O_RDWR);
if (ret < 0) {
return ret;
}
@@ -787,8 +787,21 @@ static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
uint8_t *out_buf;
uint64_t cluster_offset;
- if (nb_sectors != s->cluster_sectors)
- return -EINVAL;
+ if (nb_sectors != s->cluster_sectors) {
+ ret = -EINVAL;
+
+ /* Zero-pad last write if image size is not cluster aligned */
+ if (sector_num + nb_sectors == bs->total_sectors &&
+ nb_sectors < s->cluster_sectors) {
+ uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size);
+ memset(pad_buf, 0, s->cluster_size);
+ memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE);
+ ret = qcow_write_compressed(bs, sector_num,
+ pad_buf, s->cluster_sectors);
+ qemu_vfree(pad_buf);
+ }
+ return ret;
+ }
out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
@@ -879,6 +892,7 @@ static BlockDriver bdrv_qcow = {
.bdrv_close = qcow_close,
.bdrv_reopen_prepare = qcow_reopen_prepare,
.bdrv_create = qcow_create,
+ .bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_co_readv = qcow_co_readv,
.bdrv_co_writev = qcow_co_writev,
diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index 2d4322a8d..2f3114ecc 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -22,7 +22,7 @@
* THE SOFTWARE.
*/
-#include "block_int.h"
+#include "block/block_int.h"
#include "qemu-common.h"
#include "qcow2.h"
#include "trace.h"
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index e179211c5..cca76d4fc 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -25,16 +25,17 @@
#include <zlib.h>
#include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
#include "block/qcow2.h"
#include "trace.h"
-int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size)
+int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
+ bool exact_size)
{
BDRVQcowState *s = bs->opaque;
- int new_l1_size, new_l1_size2, ret, i;
+ int new_l1_size2, ret, i;
uint64_t *new_l1_table;
- int64_t new_l1_table_offset;
+ int64_t new_l1_table_offset, new_l1_size;
uint8_t data[12];
if (min_size <= s->l1_size)
@@ -53,8 +54,13 @@ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size)
}
}
+ if (new_l1_size > INT_MAX) {
+ return -EFBIG;
+ }
+
#ifdef DEBUG_ALLOC2
- fprintf(stderr, "grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
+ fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
+ s->l1_size, new_l1_size);
#endif
new_l1_size2 = sizeof(uint64_t) * new_l1_size;
@@ -92,14 +98,16 @@ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size)
goto fail;
}
g_free(s->l1_table);
- qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
+ qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t),
+ QCOW2_DISCARD_OTHER);
s->l1_table_offset = new_l1_table_offset;
s->l1_table = new_l1_table;
s->l1_size = new_l1_size;
return 0;
fail:
g_free(new_l1_table);
- qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2);
+ qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
+ QCOW2_DISCARD_OTHER);
return ret;
}
@@ -391,8 +399,8 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
int *num, uint64_t *cluster_offset)
{
BDRVQcowState *s = bs->opaque;
- unsigned int l1_index, l2_index;
- uint64_t l2_offset, *l2_table;
+ unsigned int l2_index;
+ uint64_t l1_index, l2_offset, *l2_table;
int l1_bits, c;
unsigned int index_in_cluster, nb_clusters;
uint64_t nb_available, nb_needed;
@@ -454,6 +462,9 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
*cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
break;
case QCOW2_CLUSTER_ZERO:
+ if (s->qcow_version < 3) {
+ return -EIO;
+ }
c = count_contiguous_clusters(nb_clusters, s->cluster_size,
&l2_table[l2_index], 0,
QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO);
@@ -504,8 +515,8 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
int *new_l2_index)
{
BDRVQcowState *s = bs->opaque;
- unsigned int l1_index, l2_index;
- uint64_t l2_offset;
+ unsigned int l2_index;
+ uint64_t l1_index, l2_offset;
uint64_t *l2_table = NULL;
int ret;
@@ -519,6 +530,7 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
}
}
+ assert(l1_index < s->l1_size);
l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
/* seek the l2 table of the given l2 offset */
@@ -538,7 +550,8 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
/* Then decrease the refcount of the old table */
if (l2_offset) {
- qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
+ qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
+ QCOW2_DISCARD_OTHER);
}
}
@@ -615,57 +628,67 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
return cluster_offset;
}
-int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
+static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
{
BDRVQcowState *s = bs->opaque;
- int i, j = 0, l2_index, ret;
- uint64_t *old_cluster, start_sect, *l2_table;
- uint64_t cluster_offset = m->alloc_offset;
- bool cow = false;
-
- trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
+ int ret;
- if (m->nb_clusters == 0)
+ if (r->nb_sectors == 0) {
return 0;
+ }
- old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t));
+ qemu_co_mutex_unlock(&s->lock);
+ ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset,
+ r->offset / BDRV_SECTOR_SIZE,
+ r->offset / BDRV_SECTOR_SIZE + r->nb_sectors);
+ qemu_co_mutex_lock(&s->lock);
- /* copy content of unmodified sectors */
- start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
- if (m->n_start) {
- cow = true;
- qemu_co_mutex_unlock(&s->lock);
- ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
- qemu_co_mutex_lock(&s->lock);
- if (ret < 0)
- goto err;
- }
-
- if (m->nb_available & (s->cluster_sectors - 1)) {
- cow = true;
- qemu_co_mutex_unlock(&s->lock);
- ret = copy_sectors(bs, start_sect, cluster_offset, m->nb_available,
- align_offset(m->nb_available, s->cluster_sectors));
- qemu_co_mutex_lock(&s->lock);
- if (ret < 0)
- goto err;
+ if (ret < 0) {
+ return ret;
}
/*
- * Update L2 table.
- *
* Before we update the L2 table to actually point to the new cluster, we
* need to be sure that the refcounts have been increased and COW was
* handled.
*/
- if (cow) {
- qcow2_cache_depends_on_flush(s->l2_table_cache);
+ qcow2_cache_depends_on_flush(s->l2_table_cache);
+
+ return 0;
+}
+
+int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i, j = 0, l2_index, ret;
+ uint64_t *old_cluster, *l2_table;
+ uint64_t cluster_offset = m->alloc_offset;
+
+ trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
+ assert(m->nb_clusters > 0);
+
+ old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t));
+
+ /* copy content of unmodified sectors */
+ ret = perform_cow(bs, m, &m->cow_start);
+ if (ret < 0) {
+ goto err;
}
+ ret = perform_cow(bs, m, &m->cow_end);
+ if (ret < 0) {
+ goto err;
+ }
+
+ /* Update L2 table. */
+ if (s->use_lazy_refcounts) {
+ qcow2_mark_dirty(bs);
+ }
if (qcow2_need_accurate_refcounts(s)) {
qcow2_cache_set_dependency(bs, s->l2_table_cache,
s->refcount_block_cache);
}
+
ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
if (ret < 0) {
goto err;
@@ -695,10 +718,14 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
/*
* If this was a COW, we need to decrease the refcount of the old cluster.
* Also flush bs->file to get the right order for L2 and refcount update.
+ *
+ * Don't discard clusters that reach a refcount of 0 (e.g. compressed
+ * clusters), the next write will reuse them anyway.
*/
if (j != 0) {
for (i = 0; i < j; i++) {
- qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1);
+ qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1,
+ QCOW2_DISCARD_NEVER);
}
}
@@ -743,56 +770,53 @@ out:
}
/*
- * Allocates new clusters for the given guest_offset.
- *
- * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
- * contain the number of clusters that have been allocated and are contiguous
- * in the image file.
+ * Check if there already is an AIO write request in flight which allocates
+ * the same cluster. In this case we need to wait until the previous
+ * request has completed and updated the L2 table accordingly.
*
- * If *host_offset is non-zero, it specifies the offset in the image file at
- * which the new clusters must start. *nb_clusters can be 0 on return in this
- * case if the cluster at host_offset is already in use. If *host_offset is
- * zero, the clusters can be allocated anywhere in the image file.
+ * Returns:
+ * 0 if there was no dependency. *cur_bytes indicates the number of
+ * bytes from guest_offset that can be read before the next
+ * dependency must be processed (or the request is complete)
*
- * *host_offset is updated to contain the offset into the image file at which
- * the first allocated cluster starts.
- *
- * Return 0 on success and -errno in error cases. -EAGAIN means that the
- * function has been waiting for another request and the allocation must be
- * restarted, but the whole request should not be failed.
+ * -EAGAIN if we had to wait for another request, previously gathered
+ * information on cluster allocation may be invalid now. The caller
+ * must start over anyway, so consider *cur_bytes undefined.
*/
-static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
- uint64_t *host_offset, unsigned int *nb_clusters)
+static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
+ uint64_t *cur_bytes, QCowL2Meta **m)
{
BDRVQcowState *s = bs->opaque;
QCowL2Meta *old_alloc;
+ uint64_t bytes = *cur_bytes;
- trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
- *host_offset, *nb_clusters);
-
- /*
- * Check if there already is an AIO write request in flight which allocates
- * the same cluster. In this case we need to wait until the previous
- * request has completed and updated the L2 table accordingly.
- */
QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
- uint64_t start = guest_offset >> s->cluster_bits;
- uint64_t end = start + *nb_clusters;
- uint64_t old_start = old_alloc->offset >> s->cluster_bits;
- uint64_t old_end = old_start + old_alloc->nb_clusters;
+ uint64_t start = guest_offset;
+ uint64_t end = start + bytes;
+ uint64_t old_start = l2meta_cow_start(old_alloc);
+ uint64_t old_end = l2meta_cow_end(old_alloc);
- if (end < old_start || start > old_end) {
+ if (end <= old_start || start >= old_end) {
/* No intersection */
} else {
if (start < old_start) {
/* Stop at the start of a running allocation */
- *nb_clusters = old_start - start;
+ bytes = old_start - start;
} else {
- *nb_clusters = 0;
+ bytes = 0;
}
- if (*nb_clusters == 0) {
+ /* Stop if already an l2meta exists. After yielding, it wouldn't
+ * be valid any more, so we'd have to clean up the old L2Metas
+ * and deal with requests depending on them before starting to
+ * gather new ones. Not worth the trouble. */
+ if (bytes == 0 && *m) {
+ *cur_bytes = 0;
+ return 0;
+ }
+
+ if (bytes == 0) {
/* Wait for the dependency to complete. We need to recheck
* the free/allocated clusters when we continue. */
qemu_co_mutex_unlock(&s->lock);
@@ -803,10 +827,144 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
}
}
- if (!*nb_clusters) {
- abort();
+ /* Make sure that existing clusters and new allocations are only used up to
+ * the next dependency if we shortened the request above */
+ *cur_bytes = bytes;
+
+ return 0;
+}
+
+/*
+ * Checks how many already allocated clusters that don't require a copy on
+ * write there are at the given guest_offset (up to *bytes). If
+ * *host_offset is not zero, only physically contiguous clusters beginning at
+ * this host offset are counted.
+ *
+ * Note that guest_offset may not be cluster aligned. In this case, the
+ * returned *host_offset points to exact byte referenced by guest_offset and
+ * therefore isn't cluster aligned as well.
+ *
+ * Returns:
+ * 0: if no allocated clusters are available at the given offset.
+ * *bytes is normally unchanged. It is set to 0 if the cluster
+ * is allocated and doesn't need COW, but doesn't have the right
+ * physical offset.
+ *
+ * 1: if allocated clusters that don't require a COW are available at
+ * the requested offset. *bytes may have decreased and describes
+ * the length of the area that can be written to.
+ *
+ * -errno: in error cases
+ */
+static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
+ uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
+{
+ BDRVQcowState *s = bs->opaque;
+ int l2_index;
+ uint64_t cluster_offset;
+ uint64_t *l2_table;
+ unsigned int nb_clusters;
+ unsigned int keep_clusters;
+ int ret, pret;
+
+ trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
+ *bytes);
+
+ assert(*host_offset == 0 || offset_into_cluster(s, guest_offset)
+ == offset_into_cluster(s, *host_offset));
+
+ /*
+ * Calculate the number of clusters to look for. We stop at L2 table
+ * boundaries to keep things simple.
+ */
+ nb_clusters =
+ size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
+
+ l2_index = offset_to_l2_index(s, guest_offset);
+ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+
+ /* Find L2 entry for the first involved cluster */
+ ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ return ret;
+ }
+
+ cluster_offset = be64_to_cpu(l2_table[l2_index]);
+
+ /* Check how many clusters are already allocated and don't need COW */
+ if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
+ && (cluster_offset & QCOW_OFLAG_COPIED))
+ {
+ /* If a specific host_offset is required, check it */
+ bool offset_matches =
+ (cluster_offset & L2E_OFFSET_MASK) == *host_offset;
+
+ if (*host_offset != 0 && !offset_matches) {
+ *bytes = 0;
+ ret = 0;
+ goto out;
+ }
+
+ /* We keep all QCOW_OFLAG_COPIED clusters */
+ keep_clusters =
+ count_contiguous_clusters(nb_clusters, s->cluster_size,
+ &l2_table[l2_index], 0,
+ QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
+ assert(keep_clusters <= nb_clusters);
+
+ *bytes = MIN(*bytes,
+ keep_clusters * s->cluster_size
+ - offset_into_cluster(s, guest_offset));
+
+ ret = 1;
+ } else {
+ ret = 0;
+ }
+
+ /* Cleanup */
+out:
+ pret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (pret < 0) {
+ return pret;
}
+ /* Only return a host offset if we actually made progress. Otherwise we
+ * would make requirements for handle_alloc() that it can't fulfill */
+ if (ret) {
+ *host_offset = (cluster_offset & L2E_OFFSET_MASK)
+ + offset_into_cluster(s, guest_offset);
+ }
+
+ return ret;
+}
+
+/*
+ * Allocates new clusters for the given guest_offset.
+ *
+ * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
+ * contain the number of clusters that have been allocated and are contiguous
+ * in the image file.
+ *
+ * If *host_offset is non-zero, it specifies the offset in the image file at
+ * which the new clusters must start. *nb_clusters can be 0 on return in this
+ * case if the cluster at host_offset is already in use. If *host_offset is
+ * zero, the clusters can be allocated anywhere in the image file.
+ *
+ * *host_offset is updated to contain the offset into the image file at which
+ * the first allocated cluster starts.
+ *
+ * Return 0 on success and -errno in error cases. -EAGAIN means that the
+ * function has been waiting for another request and the allocation must be
+ * restarted, but the whole request should not be failed.
+ */
+static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
+ uint64_t *host_offset, unsigned int *nb_clusters)
+{
+ BDRVQcowState *s = bs->opaque;
+
+ trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
+ *host_offset, *nb_clusters);
+
/* Allocate new clusters */
trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
if (*host_offset == 0) {
@@ -828,6 +986,151 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
}
/*
+ * Allocates new clusters for an area that either is yet unallocated or needs a
+ * copy on write. If *host_offset is non-zero, clusters are only allocated if
+ * the new allocation can match the specified host offset.
+ *
+ * Note that guest_offset may not be cluster aligned. In this case, the
+ * returned *host_offset points to exact byte referenced by guest_offset and
+ * therefore isn't cluster aligned as well.
+ *
+ * Returns:
+ * 0: if no clusters could be allocated. *bytes is set to 0,
+ * *host_offset is left unchanged.
+ *
+ * 1: if new clusters were allocated. *bytes may be decreased if the
+ * new allocation doesn't cover all of the requested area.
+ * *host_offset is updated to contain the host offset of the first
+ * newly allocated cluster.
+ *
+ * -errno: in error cases
+ */
+static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
+ uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
+{
+ BDRVQcowState *s = bs->opaque;
+ int l2_index;
+ uint64_t *l2_table;
+ uint64_t entry;
+ unsigned int nb_clusters;
+ int ret;
+
+ uint64_t alloc_cluster_offset;
+
+ trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
+ *bytes);
+ assert(*bytes > 0);
+
+ /*
+ * Calculate the number of clusters to look for. We stop at L2 table
+ * boundaries to keep things simple.
+ */
+ nb_clusters =
+ size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
+
+ l2_index = offset_to_l2_index(s, guest_offset);
+ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+
+ /* Find L2 entry for the first involved cluster */
+ ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ return ret;
+ }
+
+ entry = be64_to_cpu(l2_table[l2_index]);
+
+ /* For the moment, overwrite compressed clusters one by one */
+ if (entry & QCOW_OFLAG_COMPRESSED) {
+ nb_clusters = 1;
+ } else {
+ nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
+ }
+
+ /* This function is only called when there were no non-COW clusters, so if
+ * we can't find any unallocated or COW clusters either, something is
+ * wrong with our code. */
+ assert(nb_clusters > 0);
+
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Allocate, if necessary at a given offset in the image file */
+ alloc_cluster_offset = start_of_cluster(s, *host_offset);
+ ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
+ &nb_clusters);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Can't extend contiguous allocation */
+ if (nb_clusters == 0) {
+ *bytes = 0;
+ return 0;
+ }
+
+ /*
+ * Save info needed for meta data update.
+ *
+ * requested_sectors: Number of sectors from the start of the first
+ * newly allocated cluster to the end of the (possibly shortened
+ * before) write request.
+ *
+ * avail_sectors: Number of sectors from the start of the first
+ * newly allocated to the end of the last newly allocated cluster.
+ *
+ * nb_sectors: The number of sectors from the start of the first
+ * newly allocated cluster to the end of the area that the write
+ * request actually writes to (excluding COW at the end)
+ */
+ int requested_sectors =
+ (*bytes + offset_into_cluster(s, guest_offset))
+ >> BDRV_SECTOR_BITS;
+ int avail_sectors = nb_clusters
+ << (s->cluster_bits - BDRV_SECTOR_BITS);
+ int alloc_n_start = offset_into_cluster(s, guest_offset)
+ >> BDRV_SECTOR_BITS;
+ int nb_sectors = MIN(requested_sectors, avail_sectors);
+ QCowL2Meta *old_m = *m;
+
+ *m = g_malloc0(sizeof(**m));
+
+ **m = (QCowL2Meta) {
+ .next = old_m,
+
+ .alloc_offset = alloc_cluster_offset,
+ .offset = start_of_cluster(s, guest_offset),
+ .nb_clusters = nb_clusters,
+ .nb_available = nb_sectors,
+
+ .cow_start = {
+ .offset = 0,
+ .nb_sectors = alloc_n_start,
+ },
+ .cow_end = {
+ .offset = nb_sectors * BDRV_SECTOR_SIZE,
+ .nb_sectors = avail_sectors - nb_sectors,
+ },
+ };
+ qemu_co_queue_init(&(*m)->dependent_requests);
+ QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
+
+ *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
+ *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
+ - offset_into_cluster(s, guest_offset));
+ assert(*bytes != 0);
+
+ return 1;
+
+fail:
+ if (*m && (*m)->nb_clusters > 0) {
+ QLIST_REMOVE(*m, next_in_flight);
+ }
+ return ret;
+}
+
+/*
* alloc_cluster_offset
*
* For a given offset on the virtual disk, find the cluster offset in qcow2
@@ -847,151 +1150,113 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
* Return 0 on success and -errno in error cases
*/
int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
- int n_start, int n_end, int *num, QCowL2Meta *m)
+ int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m)
{
BDRVQcowState *s = bs->opaque;
- int l2_index, ret, sectors;
- uint64_t *l2_table;
- unsigned int nb_clusters, keep_clusters;
+ uint64_t start, remaining;
uint64_t cluster_offset;
+ uint64_t cur_bytes;
+ int ret;
trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset,
n_start, n_end);
- /* Find L2 entry for the first involved cluster */
-again:
- ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
- if (ret < 0) {
- return ret;
- }
+ assert(n_start * BDRV_SECTOR_SIZE == offset_into_cluster(s, offset));
+ offset = start_of_cluster(s, offset);
- /*
- * Calculate the number of clusters to look for. We stop at L2 table
- * boundaries to keep things simple.
- */
- nb_clusters = MIN(size_to_clusters(s, n_end << BDRV_SECTOR_BITS),
- s->l2_size - l2_index);
+again:
+ start = offset + (n_start << BDRV_SECTOR_BITS);
+ remaining = (n_end - n_start) << BDRV_SECTOR_BITS;
+ cluster_offset = 0;
+ *host_offset = 0;
+ cur_bytes = 0;
+ *m = NULL;
- cluster_offset = be64_to_cpu(l2_table[l2_index]);
+ while (true) {
- /*
- * Check how many clusters are already allocated and don't need COW, and how
- * many need a new allocation.
- */
- if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
- && (cluster_offset & QCOW_OFLAG_COPIED))
- {
- /* We keep all QCOW_OFLAG_COPIED clusters */
- keep_clusters =
- count_contiguous_clusters(nb_clusters, s->cluster_size,
- &l2_table[l2_index], 0,
- QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
- assert(keep_clusters <= nb_clusters);
- nb_clusters -= keep_clusters;
- } else {
- keep_clusters = 0;
- cluster_offset = 0;
- }
-
- if (nb_clusters > 0) {
- /* For the moment, overwrite compressed clusters one by one */
- uint64_t entry = be64_to_cpu(l2_table[l2_index + keep_clusters]);
- if (entry & QCOW_OFLAG_COMPRESSED) {
- nb_clusters = 1;
- } else {
- nb_clusters = count_cow_clusters(s, nb_clusters, l2_table,
- l2_index + keep_clusters);
+ if (!*host_offset) {
+ *host_offset = start_of_cluster(s, cluster_offset);
}
- }
- cluster_offset &= L2E_OFFSET_MASK;
+ assert(remaining >= cur_bytes);
- /*
- * The L2 table isn't used any more after this. As long as the cache works
- * synchronously, it's important to release it before calling
- * do_alloc_cluster_offset, which may yield if we need to wait for another
- * request to complete. If we still had the reference, we could use up the
- * whole cache with sleeping requests.
- */
- ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
- if (ret < 0) {
- return ret;
- }
-
- /* If there is something left to allocate, do that now */
- *m = (QCowL2Meta) {
- .cluster_offset = cluster_offset,
- .nb_clusters = 0,
- };
- qemu_co_queue_init(&m->dependent_requests);
+ start += cur_bytes;
+ remaining -= cur_bytes;
+ cluster_offset += cur_bytes;
- if (nb_clusters > 0) {
- uint64_t alloc_offset;
- uint64_t alloc_cluster_offset;
- uint64_t keep_bytes = keep_clusters * s->cluster_size;
-
- /* Calculate start and size of allocation */
- alloc_offset = offset + keep_bytes;
-
- if (keep_clusters == 0) {
- alloc_cluster_offset = 0;
- } else {
- alloc_cluster_offset = cluster_offset + keep_bytes;
+ if (remaining == 0) {
+ break;
}
- /* Allocate, if necessary at a given offset in the image file */
- ret = do_alloc_cluster_offset(bs, alloc_offset, &alloc_cluster_offset,
- &nb_clusters);
+ cur_bytes = remaining;
+
+ /*
+ * Now start gathering as many contiguous clusters as possible:
+ *
+ * 1. Check for overlaps with in-flight allocations
+ *
+ * a) Overlap not in the first cluster -> shorten this request and
+ * let the caller handle the rest in its next loop iteration.
+ *
+ * b) Real overlaps of two requests. Yield and restart the search
+ * for contiguous clusters (the situation could have changed
+ * while we were sleeping)
+ *
+ * c) TODO: Request starts in the same cluster as the in-flight
+ * allocation ends. Shorten the COW of the in-fight allocation,
+ * set cluster_offset to write to the same cluster and set up
+ * the right synchronisation between the in-flight request and
+ * the new one.
+ */
+ ret = handle_dependencies(bs, start, &cur_bytes, m);
if (ret == -EAGAIN) {
+ /* Currently handle_dependencies() doesn't yield if we already had
+ * an allocation. If it did, we would have to clean up the L2Meta
+ * structs before starting over. */
+ assert(*m == NULL);
goto again;
} else if (ret < 0) {
- goto fail;
+ return ret;
+ } else if (cur_bytes == 0) {
+ break;
+ } else {
+ /* handle_dependencies() may have decreased cur_bytes (shortened
+ * the allocations below) so that the next dependency is processed
+ * correctly during the next loop iteration. */
}
- /* save info needed for meta data update */
- if (nb_clusters > 0) {
- /*
- * requested_sectors: Number of sectors from the start of the first
- * newly allocated cluster to the end of the (possibly shortened
- * before) write request.
- *
- * avail_sectors: Number of sectors from the start of the first
- * newly allocated to the end of the last newly allocated cluster.
- */
- int requested_sectors = n_end - keep_clusters * s->cluster_sectors;
- int avail_sectors = nb_clusters
- << (s->cluster_bits - BDRV_SECTOR_BITS);
-
- *m = (QCowL2Meta) {
- .cluster_offset = keep_clusters == 0 ?
- alloc_cluster_offset : cluster_offset,
- .alloc_offset = alloc_cluster_offset,
- .offset = alloc_offset,
- .n_start = keep_clusters == 0 ? n_start : 0,
- .nb_clusters = nb_clusters,
- .nb_available = MIN(requested_sectors, avail_sectors),
- };
- qemu_co_queue_init(&m->dependent_requests);
- QLIST_INSERT_HEAD(&s->cluster_allocs, m, next_in_flight);
+ /*
+ * 2. Count contiguous COPIED clusters.
+ */
+ ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
+ if (ret < 0) {
+ return ret;
+ } else if (ret) {
+ continue;
+ } else if (cur_bytes == 0) {
+ break;
}
- }
- /* Some cleanup work */
- sectors = (keep_clusters + nb_clusters) << (s->cluster_bits - 9);
- if (sectors > n_end) {
- sectors = n_end;
+ /*
+ * 3. If the request still hasn't completed, allocate new clusters,
+ * considering any cluster_offset of steps 1c or 2.
+ */
+ ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
+ if (ret < 0) {
+ return ret;
+ } else if (ret) {
+ continue;
+ } else {
+ assert(cur_bytes == 0);
+ break;
+ }
}
- assert(sectors > n_start);
- *num = sectors - n_start;
+ *num = (n_end - n_start) - (remaining >> BDRV_SECTOR_BITS);
+ assert(*num > 0);
+ assert(*host_offset != 0);
return 0;
-
-fail:
- if (m->nb_clusters > 0) {
- QLIST_REMOVE(m, next_in_flight);
- }
- return ret;
}
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
@@ -1081,7 +1346,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
l2_table[l2_index + i] = cpu_to_be64(0);
/* Then decrease the refcount */
- qcow2_free_any_clusters(bs, old_offset, 1);
+ qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
}
ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
@@ -1112,18 +1377,25 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
nb_clusters = size_to_clusters(s, end_offset - offset);
+ s->cache_discards = true;
+
/* Each L2 table is handled by its own loop iteration */
while (nb_clusters > 0) {
ret = discard_single_l2(bs, offset, nb_clusters);
if (ret < 0) {
- return ret;
+ goto fail;
}
nb_clusters -= ret;
offset += (ret * s->cluster_size);
}
- return 0;
+ ret = 0;
+fail:
+ s->cache_discards = false;
+ qcow2_process_discards(bs, ret);
+
+ return ret;
}
/*
@@ -1157,7 +1429,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
if (old_offset & QCOW_OFLAG_COMPRESSED) {
l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
- qcow2_free_any_clusters(bs, old_offset, 1);
+ qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
} else {
l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
}
@@ -1185,15 +1457,22 @@ int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
/* Each L2 table is handled by its own loop iteration */
nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS);
+ s->cache_discards = true;
+
while (nb_clusters > 0) {
ret = zero_single_l2(bs, offset, nb_clusters);
if (ret < 0) {
- return ret;
+ goto fail;
}
nb_clusters -= ret;
offset += (ret * s->cluster_size);
}
- return 0;
+ ret = 0;
+fail:
+ s->cache_discards = false;
+ qcow2_process_discards(bs, ret);
+
+ return ret;
}
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 96224d1af..1244693f3 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -23,13 +23,13 @@
*/
#include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
#include "block/qcow2.h"
static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
int64_t offset, int64_t length,
- int addend);
+ int addend, enum qcow2_discard_type type);
/*********************************************************/
@@ -201,7 +201,10 @@ static int alloc_refcount_block(BlockDriverState *bs,
*refcount_block = NULL;
/* We write to the refcount table, so we might depend on L2 tables */
- qcow2_cache_flush(bs, s->l2_table_cache);
+ ret = qcow2_cache_flush(bs, s->l2_table_cache);
+ if (ret < 0) {
+ return ret;
+ }
/* Allocate the refcount block itself and mark it as used */
int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
@@ -232,12 +235,16 @@ static int alloc_refcount_block(BlockDriverState *bs,
} else {
/* Described somewhere else. This can recurse at most twice before we
* arrive at a block that describes itself. */
- ret = update_refcount(bs, new_block, s->cluster_size, 1);
+ ret = update_refcount(bs, new_block, s->cluster_size, 1,
+ QCOW2_DISCARD_NEVER);
if (ret < 0) {
goto fail_block;
}
- bdrv_flush(bs->file);
+ ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+ if (ret < 0) {
+ goto fail_block;
+ }
/* Initialize the new refcount block only after updating its refcount,
* update_refcount uses the refcount cache itself */
@@ -393,7 +400,8 @@ static int alloc_refcount_block(BlockDriverState *bs,
/* Free old table. Remember, we must not change free_cluster_index */
uint64_t old_free_cluster_index = s->free_cluster_index;
- qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
+ qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
+ QCOW2_DISCARD_OTHER);
s->free_cluster_index = old_free_cluster_index;
ret = load_refcount_block(bs, new_block, (void**) refcount_block);
@@ -412,9 +420,77 @@ fail_block:
return ret;
}
+void qcow2_process_discards(BlockDriverState *bs, int ret)
+{
+ BDRVQcowState *s = bs->opaque;
+ Qcow2DiscardRegion *d, *next;
+
+ QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
+ QTAILQ_REMOVE(&s->discards, d, next);
+
+ /* Discard is optional, ignore the return value */
+ if (ret >= 0) {
+ bdrv_discard(bs->file,
+ d->offset >> BDRV_SECTOR_BITS,
+ d->bytes >> BDRV_SECTOR_BITS);
+ }
+
+ g_free(d);
+ }
+}
+
+static void update_refcount_discard(BlockDriverState *bs,
+ uint64_t offset, uint64_t length)
+{
+ BDRVQcowState *s = bs->opaque;
+ Qcow2DiscardRegion *d, *p, *next;
+
+ QTAILQ_FOREACH(d, &s->discards, next) {
+ uint64_t new_start = MIN(offset, d->offset);
+ uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
+
+ if (new_end - new_start <= length + d->bytes) {
+ /* There can't be any overlap, areas ending up here have no
+ * references any more and therefore shouldn't get freed another
+ * time. */
+ assert(d->bytes + length == new_end - new_start);
+ d->offset = new_start;
+ d->bytes = new_end - new_start;
+ goto found;
+ }
+ }
+
+ d = g_malloc(sizeof(*d));
+ *d = (Qcow2DiscardRegion) {
+ .bs = bs,
+ .offset = offset,
+ .bytes = length,
+ };
+ QTAILQ_INSERT_TAIL(&s->discards, d, next);
+
+found:
+ /* Merge discard requests if they are adjacent now */
+ QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
+ if (p == d
+ || p->offset > d->offset + d->bytes
+ || d->offset > p->offset + p->bytes)
+ {
+ continue;
+ }
+
+ /* Still no overlap possible */
+ assert(p->offset == d->offset + d->bytes
+ || d->offset == p->offset + p->bytes);
+
+ QTAILQ_REMOVE(&s->discards, p, next);
+ d->offset = MIN(d->offset, p->offset);
+ d->bytes += p->bytes;
+ }
+}
+
/* XXX: cache several refcount block clusters ? */
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
- int64_t offset, int64_t length, int addend)
+ int64_t offset, int64_t length, int addend, enum qcow2_discard_type type)
{
BDRVQcowState *s = bs->opaque;
int64_t start, last, cluster_offset;
@@ -480,10 +556,18 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
s->free_cluster_index = cluster_index;
}
refcount_block[block_index] = cpu_to_be16(refcount);
+
+ if (refcount == 0 && s->discard_passthrough[type]) {
+ update_refcount_discard(bs, cluster_offset, s->cluster_size);
+ }
}
ret = 0;
fail:
+ if (!s->cache_discards) {
+ qcow2_process_discards(bs, ret);
+ }
+
/* Write last changed block to disk */
if (refcount_block) {
int wret;
@@ -500,7 +584,8 @@ fail:
*/
if (ret < 0) {
int dummy;
- dummy = update_refcount(bs, offset, cluster_offset - offset, -addend);
+ dummy = update_refcount(bs, offset, cluster_offset - offset, -addend,
+ QCOW2_DISCARD_NEVER);
(void)dummy;
}
@@ -516,18 +601,18 @@ fail:
*/
static int update_cluster_refcount(BlockDriverState *bs,
int64_t cluster_index,
- int addend)
+ int addend,
+ enum qcow2_discard_type type)
{
BDRVQcowState *s = bs->opaque;
int ret;
- ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend);
+ ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
+ type);
if (ret < 0) {
return ret;
}
- bdrv_flush(bs->file);
-
return get_refcount(bs, cluster_index);
}
@@ -575,7 +660,7 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
return offset;
}
- ret = update_refcount(bs, offset, size, 1);
+ ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
if (ret < 0) {
return ret;
}
@@ -607,7 +692,8 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
old_free_cluster_index = s->free_cluster_index;
s->free_cluster_index = cluster_index + i;
- ret = update_refcount(bs, offset, i << s->cluster_bits, 1);
+ ret = update_refcount(bs, offset, i << s->cluster_bits, 1,
+ QCOW2_DISCARD_NEVER);
if (ret < 0) {
return ret;
}
@@ -645,7 +731,8 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
if (free_in_cluster == 0)
s->free_byte_offset = 0;
if ((offset & (s->cluster_size - 1)) != 0)
- update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
+ update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
+ QCOW2_DISCARD_NEVER);
} else {
offset = qcow2_alloc_clusters(bs, s->cluster_size);
if (offset < 0) {
@@ -655,7 +742,8 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
if ((cluster_offset + s->cluster_size) == offset) {
/* we are lucky: contiguous data */
offset = s->free_byte_offset;
- update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
+ update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
+ QCOW2_DISCARD_NEVER);
s->free_byte_offset += size;
} else {
s->free_byte_offset = offset;
@@ -663,17 +751,22 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
}
}
- bdrv_flush(bs->file);
+ /* The cluster refcount was incremented, either by qcow2_alloc_clusters()
+ * or explicitly by update_cluster_refcount(). Refcount blocks must be
+ * flushed before the caller's L2 table updates.
+ */
+ qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
return offset;
}
void qcow2_free_clusters(BlockDriverState *bs,
- int64_t offset, int64_t size)
+ int64_t offset, int64_t size,
+ enum qcow2_discard_type type)
{
int ret;
BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
- ret = update_refcount(bs, offset, size, -1);
+ ret = update_refcount(bs, offset, size, -1, type);
if (ret < 0) {
fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
/* TODO Remember the clusters to free them later and avoid leaking */
@@ -684,8 +777,8 @@ void qcow2_free_clusters(BlockDriverState *bs,
* Free a cluster using its L2 entry (handles clusters of all types, e.g.
* normal cluster, compressed cluster, etc.)
*/
-void qcow2_free_any_clusters(BlockDriverState *bs,
- uint64_t l2_entry, int nb_clusters)
+void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
+ int nb_clusters, enum qcow2_discard_type type)
{
BDRVQcowState *s = bs->opaque;
@@ -697,12 +790,12 @@ void qcow2_free_any_clusters(BlockDriverState *bs,
s->csize_mask) + 1;
qcow2_free_clusters(bs,
(l2_entry & s->cluster_offset_mask) & ~511,
- nb_csectors * 512);
+ nb_csectors * 512, type);
}
break;
case QCOW2_CLUSTER_NORMAL:
qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
- nb_clusters << s->cluster_bits);
+ nb_clusters << s->cluster_bits, type);
break;
case QCOW2_CLUSTER_UNALLOCATED:
case QCOW2_CLUSTER_ZERO:
@@ -733,20 +826,17 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
l1_table = NULL;
l1_size2 = l1_size * sizeof(uint64_t);
+ s->cache_discards = true;
+
/* WARNING: qcow2_snapshot_goto relies on this function not using the
* l1_table_offset when it is the current s->l1_table_offset! Be careful
* when changing this! */
if (l1_table_offset != s->l1_table_offset) {
- if (l1_size2 != 0) {
- l1_table = g_malloc0(align_offset(l1_size2, 512));
- } else {
- l1_table = NULL;
- }
+ l1_table = g_malloc0(align_offset(l1_size2, 512));
l1_allocated = 1;
- if (bdrv_pread(bs->file, l1_table_offset,
- l1_table, l1_size2) != l1_size2)
- {
- ret = -EIO;
+
+ ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
+ if (ret < 0) {
goto fail;
}
@@ -782,27 +872,25 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
int ret;
ret = update_refcount(bs,
(offset & s->cluster_offset_mask) & ~511,
- nb_csectors * 512, addend);
+ nb_csectors * 512, addend,
+ QCOW2_DISCARD_SNAPSHOT);
if (ret < 0) {
goto fail;
}
-
- /* TODO Flushing once for the whole function should
- * be enough */
- bdrv_flush(bs->file);
}
/* compressed clusters are never modified */
refcount = 2;
} else {
uint64_t cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
if (addend != 0) {
- refcount = update_cluster_refcount(bs, cluster_index, addend);
+ refcount = update_cluster_refcount(bs, cluster_index, addend,
+ QCOW2_DISCARD_SNAPSHOT);
} else {
refcount = get_refcount(bs, cluster_index);
}
if (refcount < 0) {
- ret = -EIO;
+ ret = refcount;
goto fail;
}
}
@@ -828,12 +916,13 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
if (addend != 0) {
- refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
+ refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend,
+ QCOW2_DISCARD_SNAPSHOT);
} else {
refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
}
if (refcount < 0) {
- ret = -EIO;
+ ret = refcount;
goto fail;
} else if (refcount == 1) {
l2_offset |= QCOW_OFLAG_COPIED;
@@ -845,21 +934,26 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
}
}
- ret = 0;
+ ret = bdrv_flush(bs);
fail:
if (l2_table) {
qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
}
+ s->cache_discards = false;
+ qcow2_process_discards(bs, ret);
+
/* Update L1 only if it isn't deleted anyway (addend = -1) */
- if (addend >= 0 && l1_modified) {
- for(i = 0; i < l1_size; i++)
+ if (ret == 0 && addend >= 0 && l1_modified) {
+ for (i = 0; i < l1_size; i++) {
cpu_to_be64s(&l1_table[i]);
- if (bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table,
- l1_size2) < 0)
- goto fail;
- for(i = 0; i < l1_size; i++)
+ }
+
+ ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2);
+
+ for (i = 0; i < l1_size; i++) {
be64_to_cpus(&l1_table[i]);
+ }
}
if (l1_allocated)
g_free(l1_table);
@@ -918,6 +1012,12 @@ static void inc_refcounts(BlockDriverState *bs,
}
}
+/* Flags for check_refcounts_l1() and check_refcounts_l2() */
+enum {
+ CHECK_OFLAG_COPIED = 0x1, /* check QCOW_OFLAG_COPIED matches refcount */
+ CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */
+};
+
/*
* Increases the refcount in the given refcount table for the all clusters
* referenced in the L2 table. While doing so, performs some checks on L2
@@ -928,10 +1028,11 @@ static void inc_refcounts(BlockDriverState *bs,
*/
static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
- int check_copied)
+ int flags)
{
BDRVQcowState *s = bs->opaque;
uint64_t *l2_table, l2_entry;
+ uint64_t next_contiguous_offset = 0;
int i, l2_size, nb_csectors, refcount;
/* Read L2 table from disk */
@@ -962,6 +1063,18 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
l2_entry &= s->cluster_offset_mask;
inc_refcounts(bs, res, refcount_table, refcount_table_size,
l2_entry & ~511, nb_csectors * 512);
+
+ if (flags & CHECK_FRAG_INFO) {
+ res->bfi.allocated_clusters++;
+ res->bfi.compressed_clusters++;
+
+ /* Compressed clusters are fragmented by nature. Since they
+ * take up sub-sector space but we only have sector granularity
+ * I/O we need to re-read the same sectors even for adjacent
+ * compressed clusters.
+ */
+ res->bfi.fragmented_clusters++;
+ }
break;
case QCOW2_CLUSTER_ZERO:
@@ -975,7 +1088,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
/* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
uint64_t offset = l2_entry & L2E_OFFSET_MASK;
- if (check_copied) {
+ if (flags & CHECK_OFLAG_COPIED) {
refcount = get_refcount(bs, offset >> s->cluster_bits);
if (refcount < 0) {
fprintf(stderr, "Can't get refcount for offset %"
@@ -989,6 +1102,15 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
}
}
+ if (flags & CHECK_FRAG_INFO) {
+ res->bfi.allocated_clusters++;
+ if (next_contiguous_offset &&
+ offset != next_contiguous_offset) {
+ res->bfi.fragmented_clusters++;
+ }
+ next_contiguous_offset = offset + s->cluster_size;
+ }
+
/* Mark cluster as used */
inc_refcounts(bs, res, refcount_table,refcount_table_size,
offset, s->cluster_size);
@@ -1032,7 +1154,7 @@ static int check_refcounts_l1(BlockDriverState *bs,
uint16_t *refcount_table,
int refcount_table_size,
int64_t l1_table_offset, int l1_size,
- int check_copied)
+ int flags)
{
BDRVQcowState *s = bs->opaque;
uint64_t *l1_table, l2_offset, l1_size2;
@@ -1061,7 +1183,7 @@ static int check_refcounts_l1(BlockDriverState *bs,
l2_offset = l1_table[i];
if (l2_offset) {
/* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
- if (check_copied) {
+ if (flags & CHECK_OFLAG_COPIED) {
refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED)
>> s->cluster_bits);
if (refcount < 0) {
@@ -1090,7 +1212,7 @@ static int check_refcounts_l1(BlockDriverState *bs,
/* Process and check L2 entries */
ret = check_refcounts_l2(bs, res, refcount_table,
- refcount_table_size, l2_offset, check_copied);
+ refcount_table_size, l2_offset, flags);
if (ret < 0) {
goto fail;
}
@@ -1116,7 +1238,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
BdrvCheckMode fix)
{
BDRVQcowState *s = bs->opaque;
- int64_t size, i;
+ int64_t size, i, highest_cluster;
int nb_clusters, refcount1, refcount2;
QCowSnapshot *sn;
uint16_t *refcount_table;
@@ -1126,13 +1248,17 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
nb_clusters = size_to_clusters(s, size);
refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t));
+ res->bfi.total_clusters =
+ size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
+
/* header */
inc_refcounts(bs, res, refcount_table, nb_clusters,
0, s->cluster_size);
/* current L1 table */
ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
- s->l1_table_offset, s->l1_size, 1);
+ s->l1_table_offset, s->l1_size,
+ CHECK_OFLAG_COPIED | CHECK_FRAG_INFO);
if (ret < 0) {
goto fail;
}
@@ -1187,7 +1313,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
}
/* compare ref counts */
- for(i = 0; i < nb_clusters; i++) {
+ for (i = 0, highest_cluster = 0; i < nb_clusters; i++) {
refcount1 = get_refcount(bs, i);
if (refcount1 < 0) {
fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
@@ -1197,6 +1323,11 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
}
refcount2 = refcount_table[i];
+
+ if (refcount1 > 0 || refcount2 > 0) {
+ highest_cluster = i;
+ }
+
if (refcount1 != refcount2) {
/* Check if we're allowed to fix the mismatch */
@@ -1215,7 +1346,8 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
if (num_fixed) {
ret = update_refcount(bs, i << s->cluster_bits, 1,
- refcount2 - refcount1);
+ refcount2 - refcount1,
+ QCOW2_DISCARD_ALWAYS);
if (ret >= 0) {
(*num_fixed)++;
continue;
@@ -1231,6 +1363,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
}
}
+ res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
ret = 0;
fail:
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 4e7c93b8b..0caac9055 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -23,7 +23,7 @@
*/
#include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
#include "block/qcow2.h"
typedef struct QEMU_PACKED QCowSnapshotHeader {
@@ -180,11 +180,14 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
/* Allocate space for the new snapshot list */
snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size);
- bdrv_flush(bs->file);
offset = snapshots_offset;
if (offset < 0) {
return offset;
}
+ ret = bdrv_flush(bs);
+ if (ret < 0) {
+ return ret;
+ }
/* Write all snapshots to the new list */
for(i = 0; i < s->nb_snapshots; i++) {
@@ -259,7 +262,8 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
}
/* free the old snapshot table */
- qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size);
+ qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size,
+ QCOW2_DISCARD_SNAPSHOT);
s->snapshots_offset = snapshots_offset;
s->snapshots_size = snapshots_size;
return 0;
@@ -378,11 +382,6 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
goto fail;
}
- ret = bdrv_flush(bs);
- if (ret < 0) {
- goto fail;
- }
-
/* Append the new snapshot to the snapshot list */
new_snapshot_list = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
if (s->snapshots) {
@@ -571,7 +570,8 @@ int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
if (ret < 0) {
return ret;
}
- qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t));
+ qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t),
+ QCOW2_DISCARD_SNAPSHOT);
/* must update the copied flag on the current cluster offsets */
ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
diff --git a/block/qcow2.c b/block/qcow2.c
index c1ff31f48..3376901bd 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -22,13 +22,14 @@
* THE SOFTWARE.
*/
#include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
#include <zlib.h>
-#include "aes.h"
+#include "qemu/aes.h"
#include "block/qcow2.h"
-#include "qemu-error.h"
-#include "qerror.h"
+#include "qemu/error-report.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi/qmp/qbool.h"
#include "trace.h"
/*
@@ -222,7 +223,7 @@ static void report_unsupported_feature(BlockDriverState *bs,
* updated successfully. Therefore it is not required to check the return
* value of this function.
*/
-static int qcow2_mark_dirty(BlockDriverState *bs)
+int qcow2_mark_dirty(BlockDriverState *bs)
{
BDRVQcowState *s = bs->opaque;
uint64_t val;
@@ -285,12 +286,44 @@ static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
return ret;
}
-static int qcow2_open(BlockDriverState *bs, int flags)
+static QemuOptsList qcow2_runtime_opts = {
+ .name = "qcow2",
+ .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head),
+ .desc = {
+ {
+ .name = QCOW2_OPT_LAZY_REFCOUNTS,
+ .type = QEMU_OPT_BOOL,
+ .help = "Postpone refcount updates",
+ },
+ {
+ .name = QCOW2_OPT_DISCARD_REQUEST,
+ .type = QEMU_OPT_BOOL,
+ .help = "Pass guest discard requests to the layer below",
+ },
+ {
+ .name = QCOW2_OPT_DISCARD_SNAPSHOT,
+ .type = QEMU_OPT_BOOL,
+ .help = "Generate discard requests when snapshot related space "
+ "is freed",
+ },
+ {
+ .name = QCOW2_OPT_DISCARD_OTHER,
+ .type = QEMU_OPT_BOOL,
+ .help = "Generate discard requests when other clusters are freed",
+ },
+ { /* end of list */ }
+ },
+};
+
+static int qcow2_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVQcowState *s = bs->opaque;
int len, i, ret = 0;
QCowHeader header;
+ QemuOpts *opts;
+ Error *local_err = NULL;
uint64_t ext_end;
+ uint64_t l1_vm_state_index;
ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
if (ret < 0) {
@@ -311,7 +344,7 @@ static int qcow2_open(BlockDriverState *bs, int flags)
be32_to_cpus(&header.nb_snapshots);
if (header.magic != QCOW_MAGIC) {
- ret = -EINVAL;
+ ret = -EMEDIUMTYPE;
goto fail;
}
if (header.version < 2 || header.version > 3) {
@@ -408,7 +441,14 @@ static int qcow2_open(BlockDriverState *bs, int flags)
/* read the level 1 table */
s->l1_size = header.l1_size;
- s->l1_vm_state_index = size_to_l1(s, header.size);
+
+ l1_vm_state_index = size_to_l1(s, header.size);
+ if (l1_vm_state_index > INT_MAX) {
+ ret = -EFBIG;
+ goto fail;
+ }
+ s->l1_vm_state_index = l1_vm_state_index;
+
/* the L1 table must contain at least enough entries to put
header.size bytes */
if (s->l1_size < s->l1_vm_state_index) {
@@ -446,6 +486,7 @@ static int qcow2_open(BlockDriverState *bs, int flags)
}
QLIST_INIT(&s->cluster_allocs);
+ QTAILQ_INIT(&s->discards);
/* read qcow2 extensions */
if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) {
@@ -495,6 +536,38 @@ static int qcow2_open(BlockDriverState *bs, int flags)
}
}
+ /* Enable lazy_refcounts according to image and command line options */
+ opts = qemu_opts_create_nofail(&qcow2_runtime_opts);
+ qemu_opts_absorb_qdict(opts, options, &local_err);
+ if (error_is_set(&local_err)) {
+ qerror_report_err(local_err);
+ error_free(local_err);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
+ (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));
+
+ s->discard_passthrough[QCOW2_DISCARD_NEVER] = false;
+ s->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true;
+ s->discard_passthrough[QCOW2_DISCARD_REQUEST] =
+ qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST,
+ flags & BDRV_O_UNMAP);
+ s->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] =
+ qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true);
+ s->discard_passthrough[QCOW2_DISCARD_OTHER] =
+ qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);
+
+ qemu_opts_del(opts);
+
+ if (s->use_lazy_refcounts && s->qcow_version < 3) {
+ qerror_report(ERROR_CLASS_GENERIC_ERROR, "Lazy refcounts require "
+ "a qcow2 image with at least qemu 1.1 compatibility level");
+ ret = -EINVAL;
+ goto fail;
+ }
+
#ifdef DEBUG_ALLOC
{
BdrvCheckResult result = {0};
@@ -584,7 +657,7 @@ static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
*pnum = 0;
}
- return (cluster_offset != 0);
+ return (cluster_offset != 0) || (ret == QCOW2_CLUSTER_ZERO);
}
/* handle reading after the end of the backing file */
@@ -665,10 +738,6 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
break;
case QCOW2_CLUSTER_ZERO:
- if (s->qcow_version < 3) {
- ret = -EIO;
- goto fail;
- }
qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
break;
@@ -745,21 +814,6 @@ fail:
return ret;
}
-static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m)
-{
- /* Take the request off the list of running requests */
- if (m->nb_clusters != 0) {
- QLIST_REMOVE(m, next_in_flight);
- }
-
- /* Restart all dependent requests */
- if (!qemu_co_queue_empty(&m->dependent_requests)) {
- qemu_co_mutex_unlock(&s->lock);
- qemu_co_queue_restart_all(&m->dependent_requests);
- qemu_co_mutex_lock(&s->lock);
- }
-}
-
static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
int64_t sector_num,
int remaining_sectors,
@@ -774,15 +828,11 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
QEMUIOVector hd_qiov;
uint64_t bytes_done = 0;
uint8_t *cluster_data = NULL;
- QCowL2Meta l2meta = {
- .nb_clusters = 0,
- };
+ QCowL2Meta *l2meta = NULL;
trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
remaining_sectors);
- qemu_co_queue_init(&l2meta.dependent_requests);
-
qemu_iovec_init(&hd_qiov, qiov->niov);
s->cluster_cache_offset = -1; /* disable compressed cache */
@@ -791,6 +841,8 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
while (remaining_sectors != 0) {
+ l2meta = NULL;
+
trace_qcow2_writev_start_part(qemu_coroutine_self());
index_in_cluster = sector_num & (s->cluster_sectors - 1);
n_end = index_in_cluster + remaining_sectors;
@@ -800,17 +852,11 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
}
ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
- index_in_cluster, n_end, &cur_nr_sectors, &l2meta);
+ index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta);
if (ret < 0) {
goto fail;
}
- if (l2meta.nb_clusters > 0 &&
- (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)) {
- qcow2_mark_dirty(bs);
- }
-
- cluster_offset = l2meta.cluster_offset;
assert((cluster_offset & 511) == 0);
qemu_iovec_reset(&hd_qiov);
@@ -835,8 +881,8 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
cur_nr_sectors * 512);
}
- BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
qemu_co_mutex_unlock(&s->lock);
+ BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
trace_qcow2_writev_data(qemu_coroutine_self(),
(cluster_offset >> 9) + index_in_cluster);
ret = bdrv_co_writev(bs->file,
@@ -847,12 +893,25 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
goto fail;
}
- ret = qcow2_alloc_cluster_link_l2(bs, &l2meta);
- if (ret < 0) {
- goto fail;
- }
+ while (l2meta != NULL) {
+ QCowL2Meta *next;
+
+ ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Take the request off the list of running requests */
+ if (l2meta->nb_clusters != 0) {
+ QLIST_REMOVE(l2meta, next_in_flight);
+ }
+
+ qemu_co_queue_restart_all(&l2meta->dependent_requests);
- run_dependent_requests(s, &l2meta);
+ next = l2meta->next;
+ g_free(l2meta);
+ l2meta = next;
+ }
remaining_sectors -= cur_nr_sectors;
sector_num += cur_nr_sectors;
@@ -862,10 +921,21 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
ret = 0;
fail:
- run_dependent_requests(s, &l2meta);
-
qemu_co_mutex_unlock(&s->lock);
+ while (l2meta != NULL) {
+ QCowL2Meta *next;
+
+ if (l2meta->nb_clusters != 0) {
+ QLIST_REMOVE(l2meta, next_in_flight);
+ }
+ qemu_co_queue_restart_all(&l2meta->dependent_requests);
+
+ next = l2meta->next;
+ g_free(l2meta);
+ l2meta = next;
+ }
+
qemu_iovec_destroy(&hd_qiov);
qemu_vfree(cluster_data);
trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
@@ -902,6 +972,7 @@ static void qcow2_invalidate_cache(BlockDriverState *bs)
AES_KEY aes_encrypt_key;
AES_KEY aes_decrypt_key;
uint32_t crypt_method = 0;
+ QDict *options;
/*
* Backing files are read-only which makes all of their metadata immutable,
@@ -916,8 +987,14 @@ static void qcow2_invalidate_cache(BlockDriverState *bs)
qcow2_close(bs);
+ options = qdict_new();
+ qdict_put(options, QCOW2_OPT_LAZY_REFCOUNTS,
+ qbool_from_int(s->use_lazy_refcounts));
+
memset(s, 0, sizeof(BDRVQcowState));
- qcow2_open(bs, flags);
+ qcow2_open(bs, options, flags);
+
+ QDECREF(options);
if (crypt_method) {
s->crypt_method = crypt_method;
@@ -1128,31 +1205,34 @@ static int preallocate(BlockDriverState *bs)
{
uint64_t nb_sectors;
uint64_t offset;
+ uint64_t host_offset = 0;
int num;
int ret;
- QCowL2Meta meta;
+ QCowL2Meta *meta;
nb_sectors = bdrv_getlength(bs) >> 9;
offset = 0;
- qemu_co_queue_init(&meta.dependent_requests);
- meta.cluster_offset = 0;
while (nb_sectors) {
num = MIN(nb_sectors, INT_MAX >> 9);
- ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, &meta);
+ ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num,
+ &host_offset, &meta);
if (ret < 0) {
return ret;
}
- ret = qcow2_alloc_cluster_link_l2(bs, &meta);
+ ret = qcow2_alloc_cluster_link_l2(bs, meta);
if (ret < 0) {
- qcow2_free_any_clusters(bs, meta.cluster_offset, meta.nb_clusters);
+ qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters,
+ QCOW2_DISCARD_NEVER);
return ret;
}
/* There are no dependent requests, but we need to remove our request
* from the list of in-flight requests */
- run_dependent_requests(bs->opaque, &meta);
+ if (meta != NULL) {
+ QLIST_REMOVE(meta, next_in_flight);
+ }
/* TODO Preallocate data if requested */
@@ -1165,10 +1245,10 @@ static int preallocate(BlockDriverState *bs)
* all of the allocated clusters (otherwise we get failing reads after
* EOF). Extend the image to the last allocated sector.
*/
- if (meta.cluster_offset != 0) {
+ if (host_offset != 0) {
uint8_t buf[512];
memset(buf, 0, 512);
- ret = bdrv_write(bs->file, (meta.cluster_offset >> 9) + num - 1, buf, 1);
+ ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1);
if (ret < 0) {
return ret;
}
@@ -1216,7 +1296,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
return ret;
}
- ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
+ ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR);
if (ret < 0) {
return ret;
}
@@ -1268,7 +1348,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
*/
BlockDriver* drv = bdrv_find_format("qcow2");
assert(drv != NULL);
- ret = bdrv_open(bs, filename,
+ ret = bdrv_open(bs, filename, NULL,
BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv);
if (ret < 0) {
goto out;
@@ -1436,7 +1516,8 @@ static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
{
BDRVQcowState *s = bs->opaque;
- int ret, new_l1_size;
+ int64_t new_l1_size;
+ int ret;
if (offset & 511) {
error_report("The new size must be a multiple of 512");
@@ -1493,8 +1574,21 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
return 0;
}
- if (nb_sectors != s->cluster_sectors)
- return -EINVAL;
+ if (nb_sectors != s->cluster_sectors) {
+ ret = -EINVAL;
+
+ /* Zero-pad last write if image size is not cluster aligned */
+ if (sector_num + nb_sectors == bs->total_sectors &&
+ nb_sectors < s->cluster_sectors) {
+ uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size);
+ memset(pad_buf, 0, s->cluster_size);
+ memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE);
+ ret = qcow2_write_compressed(bs, sector_num,
+ pad_buf, s->cluster_sectors);
+ qemu_vfree(pad_buf);
+ }
+ return ret;
+ }
out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
@@ -1608,8 +1702,8 @@ static void dump_refcounts(BlockDriverState *bs)
}
#endif
-static int qcow2_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
- int64_t pos, int size)
+static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
+ int64_t pos)
{
BDRVQcowState *s = bs->opaque;
int growable = bs->growable;
@@ -1617,7 +1711,7 @@ static int qcow2_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
bs->growable = 1;
- ret = bdrv_pwrite(bs, qcow2_vm_state_offset(s) + pos, buf, size);
+ ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov);
bs->growable = growable;
return ret;
@@ -1691,6 +1785,7 @@ static BlockDriver bdrv_qcow2 = {
.bdrv_close = qcow2_close,
.bdrv_reopen_prepare = qcow2_reopen_prepare,
.bdrv_create = qcow2_create,
+ .bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_co_is_allocated = qcow2_co_is_allocated,
.bdrv_set_key = qcow2_set_key,
.bdrv_make_empty = qcow2_make_empty,
diff --git a/block/qcow2.h b/block/qcow2.h
index b4eb65470..dba977141 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -25,8 +25,8 @@
#ifndef BLOCK_QCOW2_H
#define BLOCK_QCOW2_H
-#include "aes.h"
-#include "qemu-coroutine.h"
+#include "qemu/aes.h"
+#include "block/coroutine.h"
//#define DEBUG_ALLOC
//#define DEBUG_ALLOC2
@@ -58,6 +58,12 @@
#define DEFAULT_CLUSTER_SIZE 65536
+
+#define QCOW2_OPT_LAZY_REFCOUNTS "lazy-refcounts"
+#define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request"
+#define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot"
+#define QCOW2_OPT_DISCARD_OTHER "pass-discard-other"
+
typedef struct QCowHeader {
uint32_t magic;
uint32_t version;
@@ -126,12 +132,28 @@ enum {
QCOW2_COMPAT_FEAT_MASK = QCOW2_COMPAT_LAZY_REFCOUNTS,
};
+enum qcow2_discard_type {
+ QCOW2_DISCARD_NEVER = 0,
+ QCOW2_DISCARD_ALWAYS,
+ QCOW2_DISCARD_REQUEST,
+ QCOW2_DISCARD_SNAPSHOT,
+ QCOW2_DISCARD_OTHER,
+ QCOW2_DISCARD_MAX
+};
+
typedef struct Qcow2Feature {
uint8_t type;
uint8_t bit;
char name[46];
} QEMU_PACKED Qcow2Feature;
+typedef struct Qcow2DiscardRegion {
+ BlockDriverState *bs;
+ uint64_t offset;
+ uint64_t bytes;
+ QTAILQ_ENTRY(Qcow2DiscardRegion) next;
+} Qcow2DiscardRegion;
+
typedef struct BDRVQcowState {
int cluster_bits;
int cluster_size;
@@ -173,6 +195,9 @@ typedef struct BDRVQcowState {
int flags;
int qcow_version;
+ bool use_lazy_refcounts;
+
+ bool discard_passthrough[QCOW2_DISCARD_MAX];
uint64_t incompatible_features;
uint64_t compatible_features;
@@ -181,6 +206,8 @@ typedef struct BDRVQcowState {
size_t unknown_header_fields_size;
void* unknown_header_fields;
QLIST_HEAD(, Qcow2UnknownHeaderExtension) unknown_header_ext;
+ QTAILQ_HEAD (, Qcow2DiscardRegion) discards;
+ bool cache_discards;
} BDRVQcowState;
/* XXX: use std qcow open function ? */
@@ -196,17 +223,59 @@ typedef struct QCowCreateState {
struct QCowAIOCB;
-/* XXX This could be private for qcow2-cluster.c */
+typedef struct Qcow2COWRegion {
+ /**
+ * Offset of the COW region in bytes from the start of the first cluster
+ * touched by the request.
+ */
+ uint64_t offset;
+
+ /** Number of sectors to copy */
+ int nb_sectors;
+} Qcow2COWRegion;
+
+/**
+ * Describes an in-flight (part of a) write request that writes to clusters
+ * that are not referenced in their L2 table yet.
+ */
typedef struct QCowL2Meta
{
+ /** Guest offset of the first newly allocated cluster */
uint64_t offset;
- uint64_t cluster_offset;
+
+ /** Host offset of the first newly allocated cluster */
uint64_t alloc_offset;
- int n_start;
+
+ /**
+ * Number of sectors from the start of the first allocated cluster to
+ * the end of the (possibly shortened) request
+ */
int nb_available;
+
+ /** Number of newly allocated clusters */
int nb_clusters;
+
+ /**
+ * Requests that overlap with this allocation and wait to be restarted
+ * when the allocating request has completed.
+ */
CoQueue dependent_requests;
+ /**
+ * The COW Region between the start of the first allocated cluster and the
+ * area the guest actually writes to.
+ */
+ Qcow2COWRegion cow_start;
+
+ /**
+ * The COW Region between the area the guest actually writes to and the
+ * end of the last allocated cluster.
+ */
+ Qcow2COWRegion cow_end;
+
+ /** Pointer to next L2Meta of the same write request */
+ struct QCowL2Meta *next;
+
QLIST_ENTRY(QCowL2Meta) next_in_flight;
} QCowL2Meta;
@@ -223,17 +292,32 @@ enum {
#define REFT_OFFSET_MASK 0xffffffffffffff00ULL
+static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset)
+{
+ return offset & ~(s->cluster_size - 1);
+}
+
+static inline int64_t offset_into_cluster(BDRVQcowState *s, int64_t offset)
+{
+ return offset & (s->cluster_size - 1);
+}
+
static inline int size_to_clusters(BDRVQcowState *s, int64_t size)
{
return (size + (s->cluster_size - 1)) >> s->cluster_bits;
}
-static inline int size_to_l1(BDRVQcowState *s, int64_t size)
+static inline int64_t size_to_l1(BDRVQcowState *s, int64_t size)
{
int shift = s->cluster_bits + s->l2_bits;
return (size + (1ULL << shift) - 1) >> shift;
}
+static inline int offset_to_l2_index(BDRVQcowState *s, int64_t offset)
+{
+ return (offset >> s->cluster_bits) & (s->l2_size - 1);
+}
+
static inline int64_t align_offset(int64_t offset, int n)
{
offset = (offset + n - 1) & ~(n - 1);
@@ -259,11 +343,24 @@ static inline bool qcow2_need_accurate_refcounts(BDRVQcowState *s)
return !(s->incompatible_features & QCOW2_INCOMPAT_DIRTY);
}
+static inline uint64_t l2meta_cow_start(QCowL2Meta *m)
+{
+ return m->offset + m->cow_start.offset;
+}
+
+static inline uint64_t l2meta_cow_end(QCowL2Meta *m)
+{
+ return m->offset + m->cow_end.offset
+ + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS);
+}
+
// FIXME Need qcow2_ prefix to global functions
/* qcow2.c functions */
int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
int64_t sector_num, int nb_sectors);
+
+int qcow2_mark_dirty(BlockDriverState *bs);
int qcow2_update_header(BlockDriverState *bs);
/* qcow2-refcount.c functions */
@@ -275,9 +372,10 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
int nb_clusters);
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size);
void qcow2_free_clusters(BlockDriverState *bs,
- int64_t offset, int64_t size);
-void qcow2_free_any_clusters(BlockDriverState *bs,
- uint64_t cluster_offset, int nb_clusters);
+ int64_t offset, int64_t size,
+ enum qcow2_discard_type type);
+void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
+ int nb_clusters, enum qcow2_discard_type type);
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
int64_t l1_table_offset, int l1_size, int addend);
@@ -285,8 +383,11 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
BdrvCheckMode fix);
+void qcow2_process_discards(BlockDriverState *bs, int ret);
+
/* qcow2-cluster.c functions */
-int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size);
+int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
+ bool exact_size);
void qcow2_l2_cache_reset(BlockDriverState *bs);
int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
@@ -297,7 +398,7 @@ void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
int *num, uint64_t *cluster_offset);
int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
- int n_start, int n_end, int *num, QCowL2Meta *m);
+ int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m);
uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
uint64_t offset,
int compressed_size);
diff --git a/block/qed-table.c b/block/qed-table.c
index de845ec3d..76d2dcccf 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -13,7 +13,7 @@
*/
#include "trace.h"
-#include "qemu_socket.h" /* for EINPROGRESS on Windows */
+#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
#include "qed.h"
typedef struct {
diff --git a/block/qed.c b/block/qed.c
index 0b5374a20..f767b0528 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -12,11 +12,11 @@
*
*/
-#include "qemu-timer.h"
+#include "qemu/timer.h"
#include "trace.h"
#include "qed.h"
-#include "qerror.h"
-#include "migration.h"
+#include "qapi/qmp/qerror.h"
+#include "migration/migration.h"
static void qed_aio_cancel(BlockDriverAIOCB *blockacb)
{
@@ -373,7 +373,7 @@ static void bdrv_qed_rebind(BlockDriverState *bs)
s->bs = bs;
}
-static int bdrv_qed_open(BlockDriverState *bs, int flags)
+static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVQEDState *s = bs->opaque;
QEDHeader le_header;
@@ -390,7 +390,7 @@ static int bdrv_qed_open(BlockDriverState *bs, int flags)
qed_header_le_to_cpu(&le_header, &s->header);
if (s->header.magic != QED_MAGIC) {
- return -EINVAL;
+ return -EMEDIUMTYPE;
}
if (s->header.features & ~QED_FEATURE_MASK) {
/* image uses unsupported feature bits */
@@ -558,7 +558,7 @@ static int qed_create(const char *filename, uint32_t cluster_size,
return ret;
}
- ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR | BDRV_O_CACHE_WB);
+ ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB);
if (ret < 0) {
return ret;
}
@@ -1526,7 +1526,7 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs)
bdrv_qed_close(bs);
memset(s, 0, sizeof(BDRVQEDState));
- bdrv_qed_open(bs, bs->open_flags);
+ bdrv_qed_open(bs, NULL, bs->open_flags);
}
static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result,
@@ -1574,6 +1574,7 @@ static BlockDriver bdrv_qed = {
.bdrv_close = bdrv_qed_close,
.bdrv_reopen_prepare = bdrv_qed_reopen_prepare,
.bdrv_create = bdrv_qed_create,
+ .bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_co_is_allocated = bdrv_qed_co_is_allocated,
.bdrv_make_empty = bdrv_qed_make_empty,
.bdrv_aio_readv = bdrv_qed_aio_readv,
diff --git a/block/qed.h b/block/qed.h
index a063bf70a..2b4ddedf3 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -15,7 +15,7 @@
#ifndef BLOCK_QED_H
#define BLOCK_QED_H
-#include "block_int.h"
+#include "block/block_int.h"
/* The layout of a QED file is as follows:
*
diff --git a/block/raw-aio.h b/block/raw-aio.h
index e77f36114..c61f1595d 100644
--- a/block/raw-aio.h
+++ b/block/raw-aio.h
@@ -20,11 +20,14 @@
#define QEMU_AIO_WRITE 0x0002
#define QEMU_AIO_IOCTL 0x0004
#define QEMU_AIO_FLUSH 0x0008
+#define QEMU_AIO_DISCARD 0x0010
#define QEMU_AIO_TYPE_MASK \
- (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH)
+ (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \
+ QEMU_AIO_DISCARD)
/* AIO flags */
#define QEMU_AIO_MISALIGNED 0x1000
+#define QEMU_AIO_BLKDEV 0x2000
/* linux-aio.c - Linux native implementation */
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 550c81f22..ba721d3f5 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -22,14 +22,13 @@
* THE SOFTWARE.
*/
#include "qemu-common.h"
-#include "qemu-timer.h"
-#include "qemu-char.h"
-#include "qemu-log.h"
-#include "block_int.h"
-#include "module.h"
+#include "qemu/timer.h"
+#include "qemu/log.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
#include "trace.h"
-#include "thread-pool.h"
-#include "iov.h"
+#include "block/thread-pool.h"
+#include "qemu/iov.h"
#include "raw-aio.h"
#if defined(__APPLE__) && (__MACH__)
@@ -60,6 +59,9 @@
#ifdef CONFIG_FIEMAP
#include <linux/fiemap.h>
#endif
+#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
+#include <linux/falloc.h>
+#endif
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
#include <sys/disk.h>
#include <sys/cdio.h>
@@ -139,6 +141,7 @@ typedef struct BDRVRawState {
#ifdef CONFIG_XFS
bool is_xfs : 1;
#endif
+ bool has_discard : 1;
} BDRVRawState;
typedef struct BDRVRawReopenState {
@@ -160,7 +163,7 @@ typedef struct RawPosixAIOData {
void *aio_ioctl_buf;
};
int aio_niov;
- size_t aio_nbytes;
+ uint64_t aio_nbytes;
#define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */
off_t aio_offset;
int aio_type;
@@ -259,15 +262,42 @@ error:
}
#endif
-static int raw_open_common(BlockDriverState *bs, const char *filename,
+static QemuOptsList raw_runtime_opts = {
+ .name = "raw",
+ .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
+ .desc = {
+ {
+ .name = "filename",
+ .type = QEMU_OPT_STRING,
+ .help = "File name of the image",
+ },
+ { /* end of list */ }
+ },
+};
+
+static int raw_open_common(BlockDriverState *bs, QDict *options,
int bdrv_flags, int open_flags)
{
BDRVRawState *s = bs->opaque;
+ QemuOpts *opts;
+ Error *local_err = NULL;
+ const char *filename;
int fd, ret;
+ opts = qemu_opts_create_nofail(&raw_runtime_opts);
+ qemu_opts_absorb_qdict(opts, options, &local_err);
+ if (error_is_set(&local_err)) {
+ qerror_report_err(local_err);
+ error_free(local_err);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ filename = qemu_opt_get(opts, "filename");
+
ret = raw_normalize_devicepath(&filename);
if (ret != 0) {
- return ret;
+ goto fail;
}
s->open_flags = open_flags;
@@ -277,34 +307,40 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
fd = qemu_open(filename, s->open_flags, 0644);
if (fd < 0) {
ret = -errno;
- if (ret == -EROFS)
+ if (ret == -EROFS) {
ret = -EACCES;
- return ret;
+ }
+ goto fail;
}
s->fd = fd;
#ifdef CONFIG_LINUX_AIO
if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) {
qemu_close(fd);
- return -errno;
+ ret = -errno;
+ goto fail;
}
#endif
+ s->has_discard = 1;
#ifdef CONFIG_XFS
if (platform_test_xfs_fd(s->fd)) {
s->is_xfs = 1;
}
#endif
- return 0;
+ ret = 0;
+fail:
+ qemu_opts_del(opts);
+ return ret;
}
-static int raw_open(BlockDriverState *bs, const char *filename, int flags)
+static int raw_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVRawState *s = bs->opaque;
s->type = FTYPE_FILE;
- return raw_open_common(bs, filename, flags, 0);
+ return raw_open_common(bs, options, flags, 0);
}
static int raw_reopen_prepare(BDRVReopenState *state,
@@ -341,11 +377,20 @@ static int raw_reopen_prepare(BDRVReopenState *state,
raw_s->fd = -1;
- int fcntl_flags = O_APPEND | O_ASYNC | O_NONBLOCK;
+ int fcntl_flags = O_APPEND | O_NONBLOCK;
#ifdef O_NOATIME
fcntl_flags |= O_NOATIME;
#endif
+#ifdef O_ASYNC
+ /* Not all operating systems have O_ASYNC, and those that don't
+ * will not let us track the state into raw_s->open_flags (typically
+ * you achieve the same effect with an ioctl, for example I_SETSIG
+ * on Solaris). But we do not use O_ASYNC, so that's fine.
+ */
+ assert((s->open_flags & O_ASYNC) == 0);
+#endif
+
if ((raw_s->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
/* dup the original fd */
/* TODO: use qemu fcntl wrapper */
@@ -431,22 +476,6 @@ static void raw_reopen_abort(BDRVReopenState *state)
#endif
*/
-/*
- * Check if all memory in this vector is sector aligned.
- */
-static int qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
-{
- int i;
-
- for (i = 0; i < qiov->niov; i++) {
- if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
- return 0;
- }
- }
-
- return 1;
-}
-
static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
{
int ret;
@@ -456,15 +485,7 @@ static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
return -errno;
}
- /*
- * This looks weird, but the aio code only considers a request
- * successful if it has written the full number of bytes.
- *
- * Now we overload aio_nbytes as aio_ioctl_cmd for the ioctl command,
- * so in fact we return the ioctl command here to make posix_aio_read()
- * happy..
- */
- return aiocb->aio_nbytes;
+ return 0;
}
static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb)
@@ -643,6 +664,72 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
return nbytes;
}
+#ifdef CONFIG_XFS
+static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
+{
+ struct xfs_flock64 fl;
+
+ memset(&fl, 0, sizeof(fl));
+ fl.l_whence = SEEK_SET;
+ fl.l_start = offset;
+ fl.l_len = bytes;
+
+ if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) {
+ DEBUG_BLOCK_PRINT("cannot punch hole (%s)\n", strerror(errno));
+ return -errno;
+ }
+
+ return 0;
+}
+#endif
+
+static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
+{
+ int ret = -EOPNOTSUPP;
+ BDRVRawState *s = aiocb->bs->opaque;
+
+ if (s->has_discard == 0) {
+ return 0;
+ }
+
+ if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
+#ifdef BLKDISCARD
+ do {
+ uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
+ if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
+ return 0;
+ }
+ } while (errno == EINTR);
+
+ ret = -errno;
+#endif
+ } else {
+#ifdef CONFIG_XFS
+ if (s->is_xfs) {
+ return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
+ }
+#endif
+
+#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
+ do {
+ if (fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ aiocb->aio_offset, aiocb->aio_nbytes) == 0) {
+ return 0;
+ }
+ } while (errno == EINTR);
+
+ ret = -errno;
+#endif
+ }
+
+ if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP ||
+ ret == -ENOTTY) {
+ s->has_discard = 0;
+ ret = 0;
+ }
+ return ret;
+}
+
static int aio_worker(void *arg)
{
RawPosixAIOData *aiocb = arg;
@@ -677,6 +764,9 @@ static int aio_worker(void *arg)
case QEMU_AIO_IOCTL:
ret = handle_aiocb_ioctl(aiocb);
break;
+ case QEMU_AIO_DISCARD:
+ ret = handle_aiocb_discard(aiocb);
+ break;
default:
fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
ret = -EINVAL;
@@ -692,6 +782,7 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
BlockDriverCompletionFunc *cb, void *opaque, int type)
{
RawPosixAIOData *acb = g_slice_new(RawPosixAIOData);
+ ThreadPool *pool;
acb->bs = bs;
acb->aio_type = type;
@@ -705,23 +796,8 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
acb->aio_offset = sector_num * 512;
trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
- return thread_pool_submit_aio(aio_worker, acb, cb, opaque);
-}
-
-static BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd,
- unsigned long int req, void *buf,
- BlockDriverCompletionFunc *cb, void *opaque)
-{
- RawPosixAIOData *acb = g_slice_new(RawPosixAIOData);
-
- acb->bs = bs;
- acb->aio_type = QEMU_AIO_IOCTL;
- acb->aio_fildes = fd;
- acb->aio_offset = 0;
- acb->aio_ioctl_buf = buf;
- acb->aio_ioctl_cmd = req;
-
- return thread_pool_submit_aio(aio_worker, acb, cb, opaque);
+ pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
+ return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
}
static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
@@ -739,7 +815,7 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
* driver that it needs to copy the buffer.
*/
if ((bs->open_flags & BDRV_O_NOCACHE)) {
- if (!qiov_is_aligned(bs, qiov)) {
+ if (!bdrv_qiov_is_aligned(bs, qiov)) {
type |= QEMU_AIO_MISALIGNED;
#ifdef CONFIG_LINUX_AIO
} else if (s->use_aio) {
@@ -1093,37 +1169,14 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
}
}
-#ifdef CONFIG_XFS
-static int xfs_discard(BDRVRawState *s, int64_t sector_num, int nb_sectors)
-{
- struct xfs_flock64 fl;
-
- memset(&fl, 0, sizeof(fl));
- fl.l_whence = SEEK_SET;
- fl.l_start = sector_num << 9;
- fl.l_len = (int64_t)nb_sectors << 9;
-
- if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) {
- DEBUG_BLOCK_PRINT("cannot punch hole (%s)\n", strerror(errno));
- return -errno;
- }
-
- return 0;
-}
-#endif
-
-static coroutine_fn int raw_co_discard(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors)
+static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
{
-#ifdef CONFIG_XFS
BDRVRawState *s = bs->opaque;
- if (s->is_xfs) {
- return xfs_discard(s, sector_num, nb_sectors);
- }
-#endif
-
- return 0;
+ return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors,
+ cb, opaque, QEMU_AIO_DISCARD);
}
static QEMUOptionParameter raw_create_options[] = {
@@ -1146,12 +1199,13 @@ static BlockDriver bdrv_file = {
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_close = raw_close,
.bdrv_create = raw_create,
- .bdrv_co_discard = raw_co_discard,
+ .bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_co_is_allocated = raw_co_is_allocated,
.bdrv_aio_readv = raw_aio_readv,
.bdrv_aio_writev = raw_aio_writev,
.bdrv_aio_flush = raw_aio_flush,
+ .bdrv_aio_discard = raw_aio_discard,
.bdrv_truncate = raw_truncate,
.bdrv_getlength = raw_getlength,
@@ -1238,9 +1292,44 @@ static int hdev_probe_device(const char *filename)
return 0;
}
-static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
+static int check_hdev_writable(BDRVRawState *s)
+{
+#if defined(BLKROGET)
+ /* Linux block devices can be configured "read-only" using blockdev(8).
+ * This is independent of device node permissions and therefore open(2)
+ * with O_RDWR succeeds. Actual writes fail with EPERM.
+ *
+ * bdrv_open() is supposed to fail if the disk is read-only. Explicitly
+ * check for read-only block devices so that Linux block devices behave
+ * properly.
+ */
+ struct stat st;
+ int readonly = 0;
+
+ if (fstat(s->fd, &st)) {
+ return -errno;
+ }
+
+ if (!S_ISBLK(st.st_mode)) {
+ return 0;
+ }
+
+ if (ioctl(s->fd, BLKROGET, &readonly) < 0) {
+ return -errno;
+ }
+
+ if (readonly) {
+ return -EACCES;
+ }
+#endif /* defined(BLKROGET) */
+ return 0;
+}
+
+static int hdev_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVRawState *s = bs->opaque;
+ int ret;
+ const char *filename = qdict_get_str(options, "filename");
#if defined(__APPLE__) && defined(__MACH__)
if (strstart(filename, "/dev/cdrom", NULL)) {
@@ -1262,6 +1351,7 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
qemu_close(fd);
}
filename = bsdPath;
+ qdict_put(options, "filename", qstring_from_str(filename));
}
if ( mediaIterator )
@@ -1281,7 +1371,20 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
}
#endif
- return raw_open_common(bs, filename, flags, 0);
+ ret = raw_open_common(bs, options, flags, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (flags & BDRV_O_RDWR) {
+ ret = check_hdev_writable(s);
+ if (ret < 0) {
+ raw_close(bs);
+ return ret;
+ }
+ }
+
+ return ret;
}
#if defined(__linux__)
@@ -1346,10 +1449,21 @@ static BlockDriverAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
BlockDriverCompletionFunc *cb, void *opaque)
{
BDRVRawState *s = bs->opaque;
+ RawPosixAIOData *acb;
+ ThreadPool *pool;
if (fd_open(bs) < 0)
return NULL;
- return paio_ioctl(bs, s->fd, req, buf, cb, opaque);
+
+ acb = g_slice_new(RawPosixAIOData);
+ acb->bs = bs;
+ acb->aio_type = QEMU_AIO_IOCTL;
+ acb->aio_fildes = s->fd;
+ acb->aio_offset = 0;
+ acb->aio_ioctl_buf = buf;
+ acb->aio_ioctl_cmd = req;
+ pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
+ return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
}
#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
@@ -1371,6 +1485,19 @@ static int fd_open(BlockDriverState *bs)
#endif /* !linux && !FreeBSD */
+static coroutine_fn BlockDriverAIOCB *hdev_aio_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BDRVRawState *s = bs->opaque;
+
+ if (fd_open(bs) < 0) {
+ return NULL;
+ }
+ return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors,
+ cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
+}
+
static int hdev_create(const char *filename, QEMUOptionParameter *options)
{
int fd;
@@ -1401,11 +1528,6 @@ static int hdev_create(const char *filename, QEMUOptionParameter *options)
return ret;
}
-static int hdev_has_zero_init(BlockDriverState *bs)
-{
- return 0;
-}
-
static BlockDriver bdrv_host_device = {
.format_name = "host_device",
.protocol_name = "host_device",
@@ -1418,11 +1540,11 @@ static BlockDriver bdrv_host_device = {
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_create = hdev_create,
.create_options = raw_create_options,
- .bdrv_has_zero_init = hdev_has_zero_init,
.bdrv_aio_readv = raw_aio_readv,
.bdrv_aio_writev = raw_aio_writev,
.bdrv_aio_flush = raw_aio_flush,
+ .bdrv_aio_discard = hdev_aio_discard,
.bdrv_truncate = raw_truncate,
.bdrv_getlength = raw_getlength,
@@ -1437,7 +1559,7 @@ static BlockDriver bdrv_host_device = {
};
#ifdef __linux__
-static int floppy_open(BlockDriverState *bs, const char *filename, int flags)
+static int floppy_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVRawState *s = bs->opaque;
int ret;
@@ -1445,7 +1567,7 @@ static int floppy_open(BlockDriverState *bs, const char *filename, int flags)
s->type = FTYPE_FD;
/* open will not fail even if no floppy is inserted, so add O_NONBLOCK */
- ret = raw_open_common(bs, filename, flags, O_NONBLOCK);
+ ret = raw_open_common(bs, options, flags, O_NONBLOCK);
if (ret)
return ret;
@@ -1542,7 +1664,6 @@ static BlockDriver bdrv_host_floppy = {
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_create = hdev_create,
.create_options = raw_create_options,
- .bdrv_has_zero_init = hdev_has_zero_init,
.bdrv_aio_readv = raw_aio_readv,
.bdrv_aio_writev = raw_aio_writev,
@@ -1559,14 +1680,14 @@ static BlockDriver bdrv_host_floppy = {
.bdrv_eject = floppy_eject,
};
-static int cdrom_open(BlockDriverState *bs, const char *filename, int flags)
+static int cdrom_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVRawState *s = bs->opaque;
s->type = FTYPE_CD;
/* open will not fail even if no CD is inserted, so add O_NONBLOCK */
- return raw_open_common(bs, filename, flags, O_NONBLOCK);
+ return raw_open_common(bs, options, flags, O_NONBLOCK);
}
static int cdrom_probe_device(const char *filename)
@@ -1644,7 +1765,6 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_create = hdev_create,
.create_options = raw_create_options,
- .bdrv_has_zero_init = hdev_has_zero_init,
.bdrv_aio_readv = raw_aio_readv,
.bdrv_aio_writev = raw_aio_writev,
@@ -1667,14 +1787,14 @@ static BlockDriver bdrv_host_cdrom = {
#endif /* __linux__ */
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
-static int cdrom_open(BlockDriverState *bs, const char *filename, int flags)
+static int cdrom_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVRawState *s = bs->opaque;
int ret;
s->type = FTYPE_CD;
- ret = raw_open_common(bs, filename, flags, 0);
+ ret = raw_open_common(bs, options, flags, 0);
if (ret)
return ret;
@@ -1766,7 +1886,6 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_create = hdev_create,
.create_options = raw_create_options,
- .bdrv_has_zero_init = hdev_has_zero_init,
.bdrv_aio_readv = raw_aio_readv,
.bdrv_aio_writev = raw_aio_writev,
@@ -1784,6 +1903,40 @@ static BlockDriver bdrv_host_cdrom = {
};
#endif /* __FreeBSD__ */
+#ifdef CONFIG_LINUX_AIO
+/**
+ * Return the file descriptor for Linux AIO
+ *
+ * This function is a layering violation and should be removed when it becomes
+ * possible to call the block layer outside the global mutex. It allows the
+ * caller to hijack the file descriptor so I/O can be performed outside the
+ * block layer.
+ */
+int raw_get_aio_fd(BlockDriverState *bs)
+{
+ BDRVRawState *s;
+
+ if (!bs->drv) {
+ return -ENOMEDIUM;
+ }
+
+ if (bs->drv == bdrv_find_format("raw")) {
+ bs = bs->file;
+ }
+
+ /* raw-posix has several protocols so just check for raw_aio_readv */
+ if (bs->drv->bdrv_aio_readv != raw_aio_readv) {
+ return -ENOTSUP;
+ }
+
+ s = bs->opaque;
+ if (!s->use_aio) {
+ return -ENOTSUP;
+ }
+ return s->fd;
+}
+#endif /* CONFIG_LINUX_AIO */
+
static void bdrv_file_init(void)
{
/*
diff --git a/block/raw-win32.c b/block/raw-win32.c
index 0c05c58c5..9b5b2af4e 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -22,13 +22,13 @@
* THE SOFTWARE.
*/
#include "qemu-common.h"
-#include "qemu-timer.h"
-#include "block_int.h"
-#include "module.h"
+#include "qemu/timer.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
#include "raw-aio.h"
#include "trace.h"
-#include "thread-pool.h"
-#include "iov.h"
+#include "block/thread-pool.h"
+#include "qemu/iov.h"
#include <windows.h>
#include <winioctl.h>
@@ -144,6 +144,7 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile,
BlockDriverCompletionFunc *cb, void *opaque, int type)
{
RawWin32AIOData *acb = g_slice_new(RawWin32AIOData);
+ ThreadPool *pool;
acb->bs = bs;
acb->hfile = hfile;
@@ -157,7 +158,8 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile,
acb->aio_offset = sector_num * 512;
trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
- return thread_pool_submit_aio(aio_worker, acb, cb, opaque);
+ pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
+ return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
}
int qemu_ftruncate64(int fd, int64_t length)
@@ -219,20 +221,49 @@ static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped)
}
}
-static int raw_open(BlockDriverState *bs, const char *filename, int flags)
+static QemuOptsList raw_runtime_opts = {
+ .name = "raw",
+ .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
+ .desc = {
+ {
+ .name = "filename",
+ .type = QEMU_OPT_STRING,
+ .help = "File name of the image",
+ },
+ { /* end of list */ }
+ },
+};
+
+static int raw_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVRawState *s = bs->opaque;
int access_flags;
DWORD overlapped;
+ QemuOpts *opts;
+ Error *local_err = NULL;
+ const char *filename;
+ int ret;
s->type = FTYPE_FILE;
+ opts = qemu_opts_create_nofail(&raw_runtime_opts);
+ qemu_opts_absorb_qdict(opts, options, &local_err);
+ if (error_is_set(&local_err)) {
+ qerror_report_err(local_err);
+ error_free(local_err);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ filename = qemu_opt_get(opts, "filename");
+
raw_parse_flags(flags, &access_flags, &overlapped);
-
+
if ((flags & BDRV_O_NATIVE_AIO) && aio == NULL) {
aio = win32_aio_init();
if (aio == NULL) {
- return -EINVAL;
+ ret = -EINVAL;
+ goto fail;
}
}
@@ -242,20 +273,27 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
if (s->hfile == INVALID_HANDLE_VALUE) {
int err = GetLastError();
- if (err == ERROR_ACCESS_DENIED)
- return -EACCES;
- return -EINVAL;
+ if (err == ERROR_ACCESS_DENIED) {
+ ret = -EACCES;
+ } else {
+ ret = -EINVAL;
+ }
+ goto fail;
}
if (flags & BDRV_O_NATIVE_AIO) {
- int ret = win32_aio_attach(aio, s->hfile);
+ ret = win32_aio_attach(aio, s->hfile);
if (ret < 0) {
CloseHandle(s->hfile);
- return ret;
+ goto fail;
}
s->aio = aio;
}
- return 0;
+
+ ret = 0;
+fail:
+ qemu_opts_del(opts);
+ return ret;
}
static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
@@ -303,13 +341,24 @@ static int raw_truncate(BlockDriverState *bs, int64_t offset)
{
BDRVRawState *s = bs->opaque;
LONG low, high;
+ DWORD dwPtrLow;
low = offset;
high = offset >> 32;
- if (!SetFilePointer(s->hfile, low, &high, FILE_BEGIN))
- return -EIO;
- if (!SetEndOfFile(s->hfile))
+
+ /*
+ * An error has occurred if the return value is INVALID_SET_FILE_POINTER
+ * and GetLastError doesn't return NO_ERROR.
+ */
+ dwPtrLow = SetFilePointer(s->hfile, low, &high, FILE_BEGIN);
+ if (dwPtrLow == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) {
+ fprintf(stderr, "SetFilePointer error: %lu\n", GetLastError());
+ return -EIO;
+ }
+ if (SetEndOfFile(s->hfile) == 0) {
+ fprintf(stderr, "SetEndOfFile error: %lu\n", GetLastError());
return -EIO;
+ }
return 0;
}
@@ -410,6 +459,7 @@ static BlockDriver bdrv_file = {
.bdrv_file_open = raw_open,
.bdrv_close = raw_close,
.bdrv_create = raw_create,
+ .bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_aio_readv = raw_aio_readv,
.bdrv_aio_writev = raw_aio_writev,
@@ -481,12 +531,13 @@ static int hdev_probe_device(const char *filename)
return 0;
}
-static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
+static int hdev_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVRawState *s = bs->opaque;
int access_flags, create_flags;
DWORD overlapped;
char device_name[64];
+ const char *filename = qdict_get_str(options, "filename");
if (strstart(filename, "/dev/cdrom", NULL)) {
if (find_cdrom(device_name, sizeof(device_name)) < 0)
@@ -520,11 +571,6 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
return 0;
}
-static int hdev_has_zero_init(BlockDriverState *bs)
-{
- return 0;
-}
-
static BlockDriver bdrv_host_device = {
.format_name = "host_device",
.protocol_name = "host_device",
@@ -532,7 +578,6 @@ static BlockDriver bdrv_host_device = {
.bdrv_probe_device = hdev_probe_device,
.bdrv_file_open = hdev_open,
.bdrv_close = raw_close,
- .bdrv_has_zero_init = hdev_has_zero_init,
.bdrv_aio_readv = raw_aio_readv,
.bdrv_aio_writev = raw_aio_writev,
diff --git a/block/raw.c b/block/raw.c
index 253e949b8..47518253f 100644
--- a/block/raw.c
+++ b/block/raw.c
@@ -1,9 +1,32 @@
+/*
+ * Block driver for RAW format
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
#include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
-static int raw_open(BlockDriverState *bs, int flags)
+static int raw_open(BlockDriverState *bs, QDict *options, int flags)
{
bs->sg = bs->file->sg;
return 0;
@@ -42,6 +65,13 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
return bdrv_co_is_allocated(bs->file, sector_num, nb_sectors, pnum);
}
+static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors)
+{
+ return bdrv_co_write_zeroes(bs->file, sector_num, nb_sectors);
+}
+
static int64_t raw_getlength(BlockDriverState *bs)
{
return bdrv_getlength(bs->file);
@@ -114,6 +144,11 @@ static int raw_has_zero_init(BlockDriverState *bs)
return bdrv_has_zero_init(bs->file);
}
+static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+ return bdrv_get_info(bs->file, bdi);
+}
+
static BlockDriver bdrv_raw = {
.format_name = "raw",
@@ -128,10 +163,12 @@ static BlockDriver bdrv_raw = {
.bdrv_co_readv = raw_co_readv,
.bdrv_co_writev = raw_co_writev,
.bdrv_co_is_allocated = raw_co_is_allocated,
+ .bdrv_co_write_zeroes = raw_co_write_zeroes,
.bdrv_co_discard = raw_co_discard,
.bdrv_probe = raw_probe,
.bdrv_getlength = raw_getlength,
+ .bdrv_get_info = raw_get_info,
.bdrv_truncate = raw_truncate,
.bdrv_is_inserted = raw_is_inserted,
diff --git a/block/rbd.c b/block/rbd.c
index f3becc7a8..cb7175121 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -14,8 +14,8 @@
#include <inttypes.h>
#include "qemu-common.h"
-#include "qemu-error.h"
-#include "block_int.h"
+#include "qemu/error-report.h"
+#include "block/block_int.h"
#include <rbd/librbd.h>
@@ -63,7 +63,8 @@
typedef enum {
RBD_AIO_READ,
RBD_AIO_WRITE,
- RBD_AIO_DISCARD
+ RBD_AIO_DISCARD,
+ RBD_AIO_FLUSH
} RBDAIOCmd;
typedef struct RBDAIOCB {
@@ -77,6 +78,7 @@ typedef struct RBDAIOCB {
int error;
struct BDRVRBDState *s;
int cancelled;
+ int status;
} RBDAIOCB;
typedef struct RADOSCB {
@@ -376,16 +378,9 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
RBDAIOCB *acb = rcb->acb;
int64_t r;
- if (acb->cancelled) {
- qemu_vfree(acb->bounce);
- qemu_aio_release(acb);
- goto done;
- }
-
r = rcb->ret;
- if (acb->cmd == RBD_AIO_WRITE ||
- acb->cmd == RBD_AIO_DISCARD) {
+ if (acb->cmd != RBD_AIO_READ) {
if (r < 0) {
acb->ret = r;
acb->error = 1;
@@ -409,7 +404,6 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
/* Note that acb->bh can be NULL in case where the aio was cancelled */
acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
qemu_bh_schedule(acb->bh);
-done:
g_free(rcb);
}
@@ -447,7 +441,21 @@ static int qemu_rbd_aio_flush_cb(void *opaque)
return (s->qemu_aio_count > 0);
}
-static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags)
+/* TODO Convert to fine grained options */
+static QemuOptsList runtime_opts = {
+ .name = "rbd",
+ .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+ .desc = {
+ {
+ .name = "filename",
+ .type = QEMU_OPT_STRING,
+ .help = "Specification of the rbd image",
+ },
+ { /* end of list */ }
+ },
+};
+
+static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVRBDState *s = bs->opaque;
char pool[RBD_MAX_POOL_NAME_SIZE];
@@ -455,20 +463,35 @@ static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags)
char conf[RBD_MAX_CONF_SIZE];
char clientname_buf[RBD_MAX_CONF_SIZE];
char *clientname;
+ QemuOpts *opts;
+ Error *local_err = NULL;
+ const char *filename;
int r;
+ opts = qemu_opts_create_nofail(&runtime_opts);
+ qemu_opts_absorb_qdict(opts, options, &local_err);
+ if (error_is_set(&local_err)) {
+ qerror_report_err(local_err);
+ error_free(local_err);
+ qemu_opts_del(opts);
+ return -EINVAL;
+ }
+
+ filename = qemu_opt_get(opts, "filename");
+
if (qemu_rbd_parsename(filename, pool, sizeof(pool),
snap_buf, sizeof(snap_buf),
s->name, sizeof(s->name),
conf, sizeof(conf)) < 0) {
- return -EINVAL;
+ r = -EINVAL;
+ goto failed_opts;
}
clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
r = rados_create(&s->cluster, clientname);
if (r < 0) {
error_report("error initializing");
- return r;
+ goto failed_opts;
}
s->snap = NULL;
@@ -534,6 +557,7 @@ static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags)
NULL, qemu_rbd_aio_flush_cb, s);
+ qemu_opts_del(opts);
return 0;
failed:
@@ -543,6 +567,8 @@ failed_open:
failed_shutdown:
rados_shutdown(s->cluster);
g_free(s->snap);
+failed_opts:
+ qemu_opts_del(opts);
return r;
}
@@ -568,6 +594,12 @@ static void qemu_rbd_aio_cancel(BlockDriverAIOCB *blockacb)
{
RBDAIOCB *acb = (RBDAIOCB *) blockacb;
acb->cancelled = 1;
+
+ while (acb->status == -EINPROGRESS) {
+ qemu_aio_wait();
+ }
+
+ qemu_aio_release(acb);
}
static const AIOCBInfo rbd_aiocb_info = {
@@ -639,8 +671,11 @@ static void rbd_aio_bh_cb(void *opaque)
acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
qemu_bh_delete(acb->bh);
acb->bh = NULL;
+ acb->status = 0;
- qemu_aio_release(acb);
+ if (!acb->cancelled) {
+ qemu_aio_release(acb);
+ }
}
static int rbd_aio_discard_wrapper(rbd_image_t image,
@@ -655,6 +690,16 @@ static int rbd_aio_discard_wrapper(rbd_image_t image,
#endif
}
+static int rbd_aio_flush_wrapper(rbd_image_t image,
+ rbd_completion_t comp)
+{
+#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
+ return rbd_aio_flush(image, comp);
+#else
+ return -ENOTSUP;
+#endif
+}
+
static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
int64_t sector_num,
QEMUIOVector *qiov,
@@ -675,7 +720,7 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque);
acb->cmd = cmd;
acb->qiov = qiov;
- if (cmd == RBD_AIO_DISCARD) {
+ if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) {
acb->bounce = NULL;
} else {
acb->bounce = qemu_blockalign(bs, qiov->size);
@@ -685,6 +730,7 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
acb->s = s;
acb->cancelled = 0;
acb->bh = NULL;
+ acb->status = -EINPROGRESS;
if (cmd == RBD_AIO_WRITE) {
qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
@@ -718,6 +764,9 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
case RBD_AIO_DISCARD:
r = rbd_aio_discard_wrapper(s->image, off, size, c);
break;
+ case RBD_AIO_FLUSH:
+ r = rbd_aio_flush_wrapper(s->image, c);
+ break;
default:
r = -EINVAL;
}
@@ -757,6 +806,16 @@ static BlockDriverAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
RBD_AIO_WRITE);
}
+#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
+static BlockDriverAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH);
+}
+
+#else
+
static int qemu_rbd_co_flush(BlockDriverState *bs)
{
#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 1)
@@ -767,6 +826,7 @@ static int qemu_rbd_co_flush(BlockDriverState *bs)
return 0;
#endif
}
+#endif
static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
{
@@ -936,6 +996,7 @@ static BlockDriver bdrv_rbd = {
.bdrv_file_open = qemu_rbd_open,
.bdrv_close = qemu_rbd_close,
.bdrv_create = qemu_rbd_create,
+ .bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_get_info = qemu_rbd_getinfo,
.create_options = qemu_rbd_create_options,
.bdrv_getlength = qemu_rbd_getlength,
@@ -944,7 +1005,12 @@ static BlockDriver bdrv_rbd = {
.bdrv_aio_readv = qemu_rbd_aio_readv,
.bdrv_aio_writev = qemu_rbd_aio_writev,
+
+#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
+ .bdrv_aio_flush = qemu_rbd_aio_flush,
+#else
.bdrv_co_flush_to_disk = qemu_rbd_co_flush,
+#endif
#ifdef LIBRBD_SUPPORTS_DISCARD
.bdrv_aio_discard = qemu_rbd_aio_discard,
diff --git a/block/sheepdog.c b/block/sheepdog.c
index a48f58cfe..afe053376 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -13,19 +13,22 @@
*/
#include "qemu-common.h"
-#include "qemu-error.h"
-#include "qemu_socket.h"
-#include "block_int.h"
-#include "bitops.h"
+#include "qemu/uri.h"
+#include "qemu/error-report.h"
+#include "qemu/sockets.h"
+#include "block/block_int.h"
+#include "qemu/bitops.h"
#define SD_PROTO_VER 0x01
#define SD_DEFAULT_ADDR "localhost"
-#define SD_DEFAULT_PORT "7000"
+#define SD_DEFAULT_PORT 7000
#define SD_OP_CREATE_AND_WRITE_OBJ 0x01
#define SD_OP_READ_OBJ 0x02
#define SD_OP_WRITE_OBJ 0x03
+/* 0x04 is used internally by Sheepdog */
+#define SD_OP_DISCARD_OBJ 0x05
#define SD_OP_NEW_VDI 0x11
#define SD_OP_LOCK_VDI 0x12
@@ -33,10 +36,12 @@
#define SD_OP_GET_VDI_INFO 0x14
#define SD_OP_READ_VDIS 0x15
#define SD_OP_FLUSH_VDI 0x16
+#define SD_OP_DEL_VDI 0x17
#define SD_FLAG_CMD_WRITE 0x01
#define SD_FLAG_CMD_COW 0x02
-#define SD_FLAG_CMD_CACHE 0x04
+#define SD_FLAG_CMD_CACHE 0x04 /* Writeback mode for cache */
+#define SD_FLAG_CMD_DIRECT 0x08 /* Don't use cache */
#define SD_RES_SUCCESS 0x00 /* Success */
#define SD_RES_UNKNOWN 0x01 /* Unknown error */
@@ -63,6 +68,8 @@
#define SD_RES_WAIT_FOR_FORMAT 0x16 /* Waiting for a format operation */
#define SD_RES_WAIT_FOR_JOIN 0x17 /* Waiting for other nodes joining */
#define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */
+#define SD_RES_HALT 0x19 /* Sheepdog is stopped serving IO request */
+#define SD_RES_READONLY 0x1A /* Object is read-only */
/*
* Object ID rules
@@ -84,7 +91,6 @@
#define SD_NR_VDIS (1U << 24)
#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
-#define SECTOR_SIZE 512
#define SD_INODE_SIZE (sizeof(SheepdogInode))
#define CURRENT_VDI_ID 0
@@ -144,7 +150,7 @@ typedef struct SheepdogVdiReq {
uint32_t id;
uint32_t data_length;
uint64_t vdi_size;
- uint32_t base_vdi_id;
+ uint32_t vdi_id;
uint32_t copies;
uint32_t snapid;
uint32_t pad[3];
@@ -236,14 +242,14 @@ static inline bool is_snapshot(struct SheepdogInode *inode)
return !!inode->snap_ctime;
}
-#undef dprintf
+#undef DPRINTF
#ifdef DEBUG_SDOG
-#define dprintf(fmt, args...) \
+#define DPRINTF(fmt, args...) \
do { \
fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
} while (0)
#else
-#define dprintf(fmt, args...)
+#define DPRINTF(fmt, args...)
#endif
typedef struct SheepdogAIOCB SheepdogAIOCB;
@@ -265,6 +271,8 @@ typedef struct AIOReq {
enum AIOCBState {
AIOCB_WRITE_UDATA,
AIOCB_READ_UDATA,
+ AIOCB_FLUSH_CACHE,
+ AIOCB_DISCARD_OBJ,
};
struct SheepdogAIOCB {
@@ -293,12 +301,12 @@ typedef struct BDRVSheepdogState {
char name[SD_MAX_VDI_LEN];
bool is_snapshot;
- bool cache_enabled;
+ uint32_t cache_flags;
+ bool discard_supported;
- char *addr;
- char *port;
+ char *host_spec;
+ bool is_unix;
int fd;
- int flush_fd;
CoMutex lock;
Coroutine *co_send;
@@ -342,6 +350,8 @@ static const char * sd_strerror(int err)
{SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
{SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
{SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
+ {SD_RES_HALT, "Sheepdog is stopped serving IO request"},
+ {SD_RES_READONLY, "Object is read-only"},
};
for (i = 0; i < ARRAY_SIZE(errors); ++i) {
@@ -426,12 +436,11 @@ static const AIOCBInfo sd_aiocb_info = {
};
static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
- int64_t sector_num, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque)
+ int64_t sector_num, int nb_sectors)
{
SheepdogAIOCB *acb;
- acb = qemu_aio_get(&sd_aiocb_info, bs, cb, opaque);
+ acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL);
acb->qiov = qiov;
@@ -446,56 +455,31 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
return acb;
}
-static int connect_to_sdog(const char *addr, const char *port)
+static int connect_to_sdog(BDRVSheepdogState *s)
{
- char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
- int fd, ret;
- struct addrinfo hints, *res, *res0;
-
- if (!addr) {
- addr = SD_DEFAULT_ADDR;
- port = SD_DEFAULT_PORT;
- }
-
- memset(&hints, 0, sizeof(hints));
- hints.ai_socktype = SOCK_STREAM;
-
- ret = getaddrinfo(addr, port, &hints, &res0);
- if (ret) {
- error_report("unable to get address info %s, %s",
- addr, strerror(errno));
- return -errno;
- }
-
- for (res = res0; res; res = res->ai_next) {
- ret = getnameinfo(res->ai_addr, res->ai_addrlen, hbuf, sizeof(hbuf),
- sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV);
- if (ret) {
- continue;
- }
+ int fd;
+ Error *err = NULL;
- fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
- if (fd < 0) {
- continue;
- }
+ if (s->is_unix) {
+ fd = unix_connect(s->host_spec, &err);
+ } else {
+ fd = inet_connect(s->host_spec, &err);
- reconnect:
- ret = connect(fd, res->ai_addr, res->ai_addrlen);
- if (ret < 0) {
- if (errno == EINTR) {
- goto reconnect;
+ if (err == NULL) {
+ int ret = socket_set_nodelay(fd);
+ if (ret < 0) {
+ error_report("%s", strerror(errno));
}
- close(fd);
- break;
}
+ }
- dprintf("connected to %s:%s\n", addr, port);
- goto success;
+ if (err != NULL) {
+ qerror_report_err(err);
+ error_free(err);
+ } else {
+ qemu_set_nonblock(fd);
}
- fd = -errno;
- error_report("failed connect to %s:%s", addr, port);
-success:
- freeaddrinfo(res0);
+
return fd;
}
@@ -525,6 +509,13 @@ static void restart_co_req(void *opaque)
qemu_coroutine_enter(co, NULL);
}
+static int have_co_req(void *opaque)
+{
+ /* this handler is set only when there is a pending request, so
+ * always returns 1. */
+ return 1;
+}
+
typedef struct SheepdogReqCo {
int sockfd;
SheepdogReq *hdr;
@@ -547,15 +538,14 @@ static coroutine_fn void do_co_req(void *opaque)
unsigned int *rlen = srco->rlen;
co = qemu_coroutine_self();
- qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, NULL, co);
+ qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, have_co_req, co);
- socket_set_block(sockfd);
ret = send_co_req(sockfd, hdr, data, wlen);
if (ret < 0) {
goto out;
}
- qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, NULL, co);
+ qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, have_co_req, co);
ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
if (ret < sizeof(*hdr)) {
@@ -578,8 +568,9 @@ static coroutine_fn void do_co_req(void *opaque)
}
ret = 0;
out:
+ /* there is at most one request for this sockfd, so it is safe to
+ * set each handler to NULL. */
qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL, NULL);
- socket_set_nonblock(sockfd);
srco->ret = ret;
srco->finished = true;
@@ -615,6 +606,7 @@ static int do_req(int sockfd, SheepdogReq *hdr, void *data,
static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
struct iovec *iov, int niov, bool create,
enum AIOCBState aiocb_type);
+static int coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req);
static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid)
@@ -671,7 +663,7 @@ static void coroutine_fn aio_read_response(void *opaque)
int ret;
AIOReq *aio_req = NULL;
SheepdogAIOCB *acb;
- unsigned long idx;
+ uint64_t idx;
if (QLIST_EMPTY(&s->inflight_aio_head)) {
goto out;
@@ -714,16 +706,17 @@ static void coroutine_fn aio_read_response(void *opaque)
* and max_dirty_data_idx are changed to include updated
* index between them.
*/
- s->inode.data_vdi_id[idx] = s->inode.vdi_id;
- s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
- s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
-
+ if (rsp.result == SD_RES_SUCCESS) {
+ s->inode.data_vdi_id[idx] = s->inode.vdi_id;
+ s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
+ s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
+ }
/*
* Some requests may be blocked because simultaneous
* create requests are not allowed, so we search the
* pending requests here.
*/
- send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx));
+ send_pending_req(s, aio_req->oid);
}
break;
case AIOCB_READ_UDATA:
@@ -734,11 +727,43 @@ static void coroutine_fn aio_read_response(void *opaque)
goto out;
}
break;
+ case AIOCB_FLUSH_CACHE:
+ if (rsp.result == SD_RES_INVALID_PARMS) {
+ DPRINTF("disable cache since the server doesn't support it\n");
+ s->cache_flags = SD_FLAG_CMD_DIRECT;
+ rsp.result = SD_RES_SUCCESS;
+ }
+ break;
+ case AIOCB_DISCARD_OBJ:
+ switch (rsp.result) {
+ case SD_RES_INVALID_PARMS:
+ error_report("sheep(%s) doesn't support discard command",
+ s->host_spec);
+ rsp.result = SD_RES_SUCCESS;
+ s->discard_supported = false;
+ break;
+ case SD_RES_SUCCESS:
+ idx = data_oid_to_idx(aio_req->oid);
+ s->inode.data_vdi_id[idx] = 0;
+ break;
+ default:
+ break;
+ }
}
- if (rsp.result != SD_RES_SUCCESS) {
+ switch (rsp.result) {
+ case SD_RES_SUCCESS:
+ break;
+ case SD_RES_READONLY:
+ ret = resend_aioreq(s, aio_req);
+ if (ret == SD_RES_SUCCESS) {
+ goto out;
+ }
+ /* fall through */
+ default:
acb->ret = -EIO;
error_report("%s", sd_strerror(rsp.result));
+ break;
}
free_aio_req(s, aio_req);
@@ -779,15 +804,6 @@ static int aio_flush_request(void *opaque)
!QLIST_EMPTY(&s->pending_aio_head);
}
-static int set_nodelay(int fd)
-{
- int ret, opt;
-
- opt = 1;
- ret = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)&opt, sizeof(opt));
- return ret;
-}
-
/*
* Return a socket discriptor to read/write objects.
*
@@ -796,29 +812,86 @@ static int set_nodelay(int fd)
*/
static int get_sheep_fd(BDRVSheepdogState *s)
{
- int ret, fd;
+ int fd;
- fd = connect_to_sdog(s->addr, s->port);
+ fd = connect_to_sdog(s);
if (fd < 0) {
- error_report("%s", strerror(errno));
return fd;
}
- socket_set_nonblock(fd);
+ qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request, s);
+ return fd;
+}
- ret = set_nodelay(fd);
- if (ret) {
- error_report("%s", strerror(errno));
- closesocket(fd);
- return -errno;
+static int sd_parse_uri(BDRVSheepdogState *s, const char *filename,
+ char *vdi, uint32_t *snapid, char *tag)
+{
+ URI *uri;
+ QueryParams *qp = NULL;
+ int ret = 0;
+
+ uri = uri_parse(filename);
+ if (!uri) {
+ return -EINVAL;
}
- qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request, s);
- return fd;
+ /* transport */
+ if (!strcmp(uri->scheme, "sheepdog")) {
+ s->is_unix = false;
+ } else if (!strcmp(uri->scheme, "sheepdog+tcp")) {
+ s->is_unix = false;
+ } else if (!strcmp(uri->scheme, "sheepdog+unix")) {
+ s->is_unix = true;
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (uri->path == NULL || !strcmp(uri->path, "/")) {
+ ret = -EINVAL;
+ goto out;
+ }
+ pstrcpy(vdi, SD_MAX_VDI_LEN, uri->path + 1);
+
+ qp = query_params_parse(uri->query);
+ if (qp->n > 1 || (s->is_unix && !qp->n) || (!s->is_unix && qp->n)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (s->is_unix) {
+ /* sheepdog+unix:///vdiname?socket=path */
+ if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) {
+ ret = -EINVAL;
+ goto out;
+ }
+ s->host_spec = g_strdup(qp->p[0].value);
+ } else {
+ /* sheepdog[+tcp]://[host:port]/vdiname */
+ s->host_spec = g_strdup_printf("%s:%d", uri->server ?: SD_DEFAULT_ADDR,
+ uri->port ?: SD_DEFAULT_PORT);
+ }
+
+ /* snapshot tag */
+ if (uri->fragment) {
+ *snapid = strtoul(uri->fragment, NULL, 10);
+ if (*snapid == 0) {
+ pstrcpy(tag, SD_MAX_VDI_TAG_LEN, uri->fragment);
+ }
+ } else {
+ *snapid = CURRENT_VDI_ID; /* search current vdi */
+ }
+
+out:
+ if (qp) {
+ query_params_free(qp);
+ }
+ uri_free(uri);
+ return ret;
}
/*
- * Parse a filename
+ * Parse a filename (old syntax)
*
* filename must be one of the following formats:
* 1. [vdiname]
@@ -837,9 +910,11 @@ static int get_sheep_fd(BDRVSheepdogState *s)
static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
char *vdi, uint32_t *snapid, char *tag)
{
- char *p, *q;
- int nr_sep;
+ char *p, *q, *uri;
+ const char *host_spec, *vdi_spec;
+ int nr_sep, ret;
+ strstart(filename, "sheepdog:", (const char **)&filename);
p = q = g_strdup(filename);
/* count the number of separators */
@@ -852,42 +927,37 @@ static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
}
p = q;
- /* use the first two tokens as hostname and port number. */
+ /* use the first two tokens as host_spec. */
if (nr_sep >= 2) {
- s->addr = p;
+ host_spec = p;
p = strchr(p, ':');
- *p++ = '\0';
-
- s->port = p;
+ p++;
p = strchr(p, ':');
*p++ = '\0';
} else {
- s->addr = NULL;
- s->port = 0;
+ host_spec = "";
}
- pstrcpy(vdi, SD_MAX_VDI_LEN, p);
+ vdi_spec = p;
- p = strchr(vdi, ':');
+ p = strchr(vdi_spec, ':');
if (p) {
- *p++ = '\0';
- *snapid = strtoul(p, NULL, 10);
- if (*snapid == 0) {
- pstrcpy(tag, SD_MAX_VDI_TAG_LEN, p);
- }
- } else {
- *snapid = CURRENT_VDI_ID; /* search current vdi */
+ *p++ = '#';
}
- if (s->addr == NULL) {
- g_free(q);
- }
+ uri = g_strdup_printf("sheepdog://%s/%s", host_spec, vdi_spec);
- return 0;
+ ret = sd_parse_uri(s, uri, vdi, snapid, tag);
+
+ g_free(q);
+ g_free(uri);
+
+ return ret;
}
-static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
- char *tag, uint32_t *vid, int for_snapshot)
+static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
+ uint32_t snapid, const char *tag, uint32_t *vid,
+ bool lock)
{
int ret, fd;
SheepdogVdiReq hdr;
@@ -895,7 +965,7 @@ static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
unsigned int wlen, rlen = 0;
char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
- fd = connect_to_sdog(s->addr, s->port);
+ fd = connect_to_sdog(s);
if (fd < 0) {
return fd;
}
@@ -908,10 +978,10 @@ static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
memset(&hdr, 0, sizeof(hdr));
- if (for_snapshot) {
- hdr.opcode = SD_OP_GET_VDI_INFO;
- } else {
+ if (lock) {
hdr.opcode = SD_OP_LOCK_VDI;
+ } else {
+ hdr.opcode = SD_OP_GET_VDI_INFO;
}
wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
hdr.proto_ver = SD_PROTO_VER;
@@ -948,7 +1018,7 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
{
int nr_copies = s->inode.nr_copies;
SheepdogObjReq hdr;
- unsigned int wlen;
+ unsigned int wlen = 0;
int ret;
uint64_t oid = aio_req->oid;
unsigned int datalen = aio_req->data_len;
@@ -962,22 +1032,30 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
memset(&hdr, 0, sizeof(hdr));
- if (aiocb_type == AIOCB_READ_UDATA) {
- wlen = 0;
+ switch (aiocb_type) {
+ case AIOCB_FLUSH_CACHE:
+ hdr.opcode = SD_OP_FLUSH_VDI;
+ break;
+ case AIOCB_READ_UDATA:
hdr.opcode = SD_OP_READ_OBJ;
hdr.flags = flags;
- } else if (create) {
- wlen = datalen;
- hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
- hdr.flags = SD_FLAG_CMD_WRITE | flags;
- } else {
+ break;
+ case AIOCB_WRITE_UDATA:
+ if (create) {
+ hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
+ } else {
+ hdr.opcode = SD_OP_WRITE_OBJ;
+ }
wlen = datalen;
- hdr.opcode = SD_OP_WRITE_OBJ;
hdr.flags = SD_FLAG_CMD_WRITE | flags;
+ break;
+ case AIOCB_DISCARD_OBJ:
+ hdr.opcode = SD_OP_DISCARD_OBJ;
+ break;
}
- if (s->cache_enabled) {
- hdr.flags |= SD_FLAG_CMD_CACHE;
+ if (s->cache_flags) {
+ hdr.flags |= s->cache_flags;
}
hdr.oid = oid;
@@ -1022,7 +1100,7 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
unsigned int datalen, uint64_t offset,
- bool write, bool create, bool cache)
+ bool write, bool create, uint32_t cache_flags)
{
SheepdogObjReq hdr;
SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
@@ -1046,9 +1124,7 @@ static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
hdr.opcode = SD_OP_READ_OBJ;
}
- if (cache) {
- hdr.flags |= SD_FLAG_CMD_CACHE;
- }
+ hdr.flags |= cache_flags;
hdr.oid = oid;
hdr.data_length = datalen;
@@ -1071,21 +1147,119 @@ static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
}
static int read_object(int fd, char *buf, uint64_t oid, int copies,
- unsigned int datalen, uint64_t offset, bool cache)
+ unsigned int datalen, uint64_t offset,
+ uint32_t cache_flags)
{
return read_write_object(fd, buf, oid, copies, datalen, offset, false,
- false, cache);
+ false, cache_flags);
}
static int write_object(int fd, char *buf, uint64_t oid, int copies,
unsigned int datalen, uint64_t offset, bool create,
- bool cache)
+ uint32_t cache_flags)
{
return read_write_object(fd, buf, oid, copies, datalen, offset, true,
- create, cache);
+ create, cache_flags);
}
-static int sd_open(BlockDriverState *bs, const char *filename, int flags)
+/* update inode with the latest state */
+static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag)
+{
+ SheepdogInode *inode;
+ int ret = 0, fd;
+ uint32_t vid = 0;
+
+ fd = connect_to_sdog(s);
+ if (fd < 0) {
+ return -EIO;
+ }
+
+ inode = g_malloc(sizeof(s->inode));
+
+ ret = find_vdi_name(s, s->name, snapid, tag, &vid, false);
+ if (ret) {
+ goto out;
+ }
+
+ ret = read_object(fd, (char *)inode, vid_to_vdi_oid(vid),
+ s->inode.nr_copies, sizeof(*inode), 0, s->cache_flags);
+ if (ret < 0) {
+ goto out;
+ }
+
+ if (inode->vdi_id != s->inode.vdi_id) {
+ memcpy(&s->inode, inode, sizeof(s->inode));
+ }
+
+out:
+ g_free(inode);
+ closesocket(fd);
+
+ return ret;
+}
+
+static int coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
+{
+ SheepdogAIOCB *acb = aio_req->aiocb;
+ bool create = false;
+ int ret;
+
+ ret = reload_inode(s, 0, "");
+ if (ret < 0) {
+ return ret;
+ }
+
+ aio_req->oid = vid_to_data_oid(s->inode.vdi_id,
+ data_oid_to_idx(aio_req->oid));
+
+ /* check whether this request becomes a CoW one */
+ if (acb->aiocb_type == AIOCB_WRITE_UDATA) {
+ int idx = data_oid_to_idx(aio_req->oid);
+ AIOReq *areq;
+
+ if (s->inode.data_vdi_id[idx] == 0) {
+ create = true;
+ goto out;
+ }
+ if (is_data_obj_writable(&s->inode, idx)) {
+ goto out;
+ }
+
+ /* link to the pending list if there is another CoW request to
+ * the same object */
+ QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) {
+ if (areq != aio_req && areq->oid == aio_req->oid) {
+ DPRINTF("simultaneous CoW to %" PRIx64 "\n", aio_req->oid);
+ QLIST_REMOVE(aio_req, aio_siblings);
+ QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings);
+ return SD_RES_SUCCESS;
+ }
+ }
+
+ aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx);
+ aio_req->flags |= SD_FLAG_CMD_COW;
+ create = true;
+ }
+out:
+ return add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
+ create, acb->aiocb_type);
+}
+
+/* TODO Convert to fine grained options */
+static QemuOptsList runtime_opts = {
+ .name = "sheepdog",
+ .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+ .desc = {
+ {
+ .name = "filename",
+ .type = QEMU_OPT_STRING,
+ .help = "URL to the sheepdog image",
+ },
+ { /* end of list */ }
+ },
+};
+
+static int sd_open(BlockDriverState *bs, QDict *options, int flags)
{
int ret, fd;
uint32_t vid = 0;
@@ -1093,8 +1267,20 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
uint32_t snapid;
char *buf = NULL;
+ QemuOpts *opts;
+ Error *local_err = NULL;
+ const char *filename;
+
+ opts = qemu_opts_create_nofail(&runtime_opts);
+ qemu_opts_absorb_qdict(opts, options, &local_err);
+ if (error_is_set(&local_err)) {
+ qerror_report_err(local_err);
+ error_free(local_err);
+ ret = -EINVAL;
+ goto out;
+ }
- strstart(filename, "sheepdog:", (const char **)&filename);
+ filename = qemu_opt_get(opts, "filename");
QLIST_INIT(&s->inflight_aio_head);
QLIST_INIT(&s->pending_aio_head);
@@ -1102,8 +1288,13 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
memset(vdi, 0, sizeof(vdi));
memset(tag, 0, sizeof(tag));
- if (parse_vdiname(s, filename, vdi, &snapid, tag) < 0) {
- ret = -EINVAL;
+
+ if (strstr(filename, "://")) {
+ ret = sd_parse_uri(s, filename, vdi, &snapid, tag);
+ } else {
+ ret = parse_vdiname(s, filename, vdi, &snapid, tag);
+ }
+ if (ret < 0) {
goto out;
}
s->fd = get_sheep_fd(s);
@@ -1112,34 +1303,35 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
goto out;
}
- ret = find_vdi_name(s, vdi, snapid, tag, &vid, 0);
+ ret = find_vdi_name(s, vdi, snapid, tag, &vid, true);
if (ret) {
goto out;
}
- s->cache_enabled = true;
- s->flush_fd = connect_to_sdog(s->addr, s->port);
- if (s->flush_fd < 0) {
- error_report("failed to connect");
- ret = s->flush_fd;
- goto out;
+ /*
+ * QEMU block layer emulates writethrough cache as 'writeback + flush', so
+ * we always set SD_FLAG_CMD_CACHE (writeback cache) as default.
+ */
+ s->cache_flags = SD_FLAG_CMD_CACHE;
+ if (flags & BDRV_O_NOCACHE) {
+ s->cache_flags = SD_FLAG_CMD_DIRECT;
}
+ s->discard_supported = true;
if (snapid || tag[0] != '\0') {
- dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
+ DPRINTF("%" PRIx32 " snapshot inode was open.\n", vid);
s->is_snapshot = true;
}
- fd = connect_to_sdog(s->addr, s->port);
+ fd = connect_to_sdog(s);
if (fd < 0) {
- error_report("failed to connect");
ret = fd;
goto out;
}
buf = g_malloc(SD_INODE_SIZE);
ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0,
- s->cache_enabled);
+ s->cache_flags);
closesocket(fd);
@@ -1151,9 +1343,10 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
s->min_dirty_data_idx = UINT32_MAX;
s->max_dirty_data_idx = 0;
- bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
+ bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE;
pstrcpy(s->name, sizeof(s->name), vdi);
qemu_co_mutex_init(&s->lock);
+ qemu_opts_del(opts);
g_free(buf);
return 0;
out:
@@ -1161,13 +1354,13 @@ out:
if (s->fd >= 0) {
closesocket(s->fd);
}
+ qemu_opts_del(opts);
g_free(buf);
return ret;
}
-static int do_sd_create(char *filename, int64_t vdi_size,
- uint32_t base_vid, uint32_t *vdi_id, int snapshot,
- const char *addr, const char *port)
+static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size,
+ uint32_t base_vid, uint32_t *vdi_id, int snapshot)
{
SheepdogVdiReq hdr;
SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
@@ -1175,7 +1368,7 @@ static int do_sd_create(char *filename, int64_t vdi_size,
unsigned int wlen, rlen = 0;
char buf[SD_MAX_VDI_LEN];
- fd = connect_to_sdog(addr, port);
+ fd = connect_to_sdog(s);
if (fd < 0) {
return fd;
}
@@ -1188,7 +1381,7 @@ static int do_sd_create(char *filename, int64_t vdi_size,
memset(&hdr, 0, sizeof(hdr));
hdr.opcode = SD_OP_NEW_VDI;
- hdr.base_vdi_id = base_vid;
+ hdr.vdi_id = base_vid;
wlen = SD_MAX_VDI_LEN;
@@ -1226,7 +1419,7 @@ static int sd_prealloc(const char *filename)
void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
int ret;
- ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
+ ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR);
if (ret < 0) {
goto out;
}
@@ -1271,17 +1464,17 @@ static int sd_create(const char *filename, QEMUOptionParameter *options)
char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
uint32_t snapid;
bool prealloc = false;
- const char *vdiname;
s = g_malloc0(sizeof(BDRVSheepdogState));
- strstart(filename, "sheepdog:", &vdiname);
-
memset(vdi, 0, sizeof(vdi));
memset(tag, 0, sizeof(tag));
- if (parse_vdiname(s, vdiname, vdi, &snapid, tag) < 0) {
- error_report("invalid filename");
- ret = -EINVAL;
+ if (strstr(filename, "://")) {
+ ret = sd_parse_uri(s, filename, vdi, &snapid, tag);
+ } else {
+ ret = parse_vdiname(s, filename, vdi, &snapid, tag);
+ }
+ if (ret < 0) {
goto out;
}
@@ -1317,14 +1510,14 @@ static int sd_create(const char *filename, QEMUOptionParameter *options)
BlockDriver *drv;
/* Currently, only Sheepdog backing image is supported. */
- drv = bdrv_find_protocol(backing_file);
+ drv = bdrv_find_protocol(backing_file, true);
if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
error_report("backing_file must be a sheepdog image");
ret = -EINVAL;
goto out;
}
- ret = bdrv_file_open(&bs, backing_file, 0);
+ ret = bdrv_file_open(&bs, backing_file, NULL, 0);
if (ret < 0) {
goto out;
}
@@ -1342,7 +1535,7 @@ static int sd_create(const char *filename, QEMUOptionParameter *options)
bdrv_delete(bs);
}
- ret = do_sd_create(vdi, vdi_size, base_vid, &vid, 0, s->addr, s->port);
+ ret = do_sd_create(s, vdi, vdi_size, base_vid, &vid, 0);
if (!prealloc || ret) {
goto out;
}
@@ -1361,9 +1554,9 @@ static void sd_close(BlockDriverState *bs)
unsigned int wlen, rlen = 0;
int fd, ret;
- dprintf("%s\n", s->name);
+ DPRINTF("%s\n", s->name);
- fd = connect_to_sdog(s->addr, s->port);
+ fd = connect_to_sdog(s);
if (fd < 0) {
return;
}
@@ -1371,6 +1564,7 @@ static void sd_close(BlockDriverState *bs)
memset(&hdr, 0, sizeof(hdr));
hdr.opcode = SD_OP_RELEASE_VDI;
+ hdr.vdi_id = s->inode.vdi_id;
wlen = strlen(s->name) + 1;
hdr.data_length = wlen;
hdr.flags = SD_FLAG_CMD_WRITE;
@@ -1386,10 +1580,7 @@ static void sd_close(BlockDriverState *bs)
qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL);
closesocket(s->fd);
- if (s->cache_enabled) {
- closesocket(s->flush_fd);
- }
- g_free(s->addr);
+ g_free(s->host_spec);
}
static int64_t sd_getlength(BlockDriverState *bs)
@@ -1413,7 +1604,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
return -EINVAL;
}
- fd = connect_to_sdog(s->addr, s->port);
+ fd = connect_to_sdog(s);
if (fd < 0) {
return fd;
}
@@ -1422,7 +1613,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
s->inode.vdi_size = offset;
ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
- s->inode.nr_copies, datalen, 0, false, s->cache_enabled);
+ s->inode.nr_copies, datalen, 0, false, s->cache_flags);
close(fd);
if (ret < 0) {
@@ -1476,6 +1667,43 @@ out:
sd_finish_aiocb(acb);
}
+/* Delete current working VDI on the snapshot chain */
+static bool sd_delete(BDRVSheepdogState *s)
+{
+ unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0;
+ SheepdogVdiReq hdr = {
+ .opcode = SD_OP_DEL_VDI,
+ .vdi_id = s->inode.vdi_id,
+ .data_length = wlen,
+ .flags = SD_FLAG_CMD_WRITE,
+ };
+ SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
+ int fd, ret;
+
+ fd = connect_to_sdog(s);
+ if (fd < 0) {
+ return false;
+ }
+
+ ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen);
+ closesocket(fd);
+ if (ret) {
+ return false;
+ }
+ switch (rsp->result) {
+ case SD_RES_NO_VDI:
+ error_report("%s was already deleted", s->name);
+ /* fall through */
+ case SD_RES_SUCCESS:
+ break;
+ default:
+ error_report("%s, %s", sd_strerror(rsp->result), s->name);
+ return false;
+ }
+
+ return true;
+}
+
/*
* Create a writable VDI from a snapshot
*/
@@ -1484,28 +1712,34 @@ static int sd_create_branch(BDRVSheepdogState *s)
int ret, fd;
uint32_t vid;
char *buf;
+ bool deleted;
- dprintf("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);
+ DPRINTF("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);
buf = g_malloc(SD_INODE_SIZE);
- ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, 1,
- s->addr, s->port);
+ /*
+ * Even If deletion fails, we will just create extra snapshot based on
+ * the workding VDI which was supposed to be deleted. So no need to
+ * false bail out.
+ */
+ deleted = sd_delete(s);
+ ret = do_sd_create(s, s->name, s->inode.vdi_size, s->inode.vdi_id, &vid,
+ !deleted);
if (ret) {
goto out;
}
- dprintf("%" PRIx32 " is created.\n", vid);
+ DPRINTF("%" PRIx32 " is created.\n", vid);
- fd = connect_to_sdog(s->addr, s->port);
+ fd = connect_to_sdog(s);
if (fd < 0) {
- error_report("failed to connect");
ret = fd;
goto out;
}
ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
- SD_INODE_SIZE, 0, s->cache_enabled);
+ SD_INODE_SIZE, 0, s->cache_flags);
closesocket(fd);
@@ -1517,7 +1751,7 @@ static int sd_create_branch(BDRVSheepdogState *s)
s->is_snapshot = false;
ret = 0;
- dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
+ DPRINTF("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
out:
g_free(buf);
@@ -1541,10 +1775,10 @@ static int coroutine_fn sd_co_rw_vector(void *p)
{
SheepdogAIOCB *acb = p;
int ret = 0;
- unsigned long len, done = 0, total = acb->nb_sectors * SECTOR_SIZE;
- unsigned long idx = acb->sector_num * SECTOR_SIZE / SD_DATA_OBJ_SIZE;
+ unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
+ unsigned long idx = acb->sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE;
uint64_t oid;
- uint64_t offset = (acb->sector_num * SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
+ uint64_t offset = (acb->sector_num * BDRV_SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
BDRVSheepdogState *s = acb->common.bs->opaque;
SheepdogInode *inode = &s->inode;
AIOReq *aio_req;
@@ -1593,16 +1827,25 @@ static int coroutine_fn sd_co_rw_vector(void *p)
flags = SD_FLAG_CMD_COW;
}
break;
+ case AIOCB_DISCARD_OBJ:
+ /*
+ * We discard the object only when the whole object is
+ * 1) allocated 2) trimmed. Otherwise, simply skip it.
+ */
+ if (len != SD_DATA_OBJ_SIZE || inode->data_vdi_id[idx] == 0) {
+ goto done;
+ }
+ break;
default:
break;
}
if (create) {
- dprintf("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n",
+ DPRINTF("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n",
inode->vdi_id, oid,
vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
oid = vid_to_data_oid(inode->vdi_id, idx);
- dprintf("new oid %" PRIx64 "\n", oid);
+ DPRINTF("new oid %" PRIx64 "\n", oid);
}
aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done);
@@ -1654,14 +1897,14 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
int ret;
if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
- ret = sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE);
+ ret = sd_truncate(bs, (sector_num + nb_sectors) * BDRV_SECTOR_SIZE);
if (ret < 0) {
return ret;
}
bs->total_sectors = sector_num + nb_sectors;
}
- acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL);
+ acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
acb->aio_done_func = sd_write_done;
acb->aiocb_type = AIOCB_WRITE_UDATA;
@@ -1682,7 +1925,7 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
SheepdogAIOCB *acb;
int ret;
- acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL);
+ acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
acb->aiocb_type = AIOCB_READ_UDATA;
acb->aio_done_func = sd_finish_aiocb;
@@ -1700,39 +1943,31 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
{
BDRVSheepdogState *s = bs->opaque;
- SheepdogObjReq hdr = { 0 };
- SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
- SheepdogInode *inode = &s->inode;
+ SheepdogAIOCB *acb;
+ AIOReq *aio_req;
int ret;
- unsigned int wlen = 0, rlen = 0;
- if (!s->cache_enabled) {
+ if (s->cache_flags != SD_FLAG_CMD_CACHE) {
return 0;
}
- hdr.opcode = SD_OP_FLUSH_VDI;
- hdr.oid = vid_to_vdi_oid(inode->vdi_id);
+ acb = sd_aio_setup(bs, NULL, 0, 0);
+ acb->aiocb_type = AIOCB_FLUSH_CACHE;
+ acb->aio_done_func = sd_finish_aiocb;
- ret = do_req(s->flush_fd, (SheepdogReq *)&hdr, NULL, &wlen, &rlen);
- if (ret) {
- error_report("failed to send a request to the sheep");
+ aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
+ 0, 0, 0, 0, 0);
+ QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
+ ret = add_aio_request(s, aio_req, NULL, 0, false, acb->aiocb_type);
+ if (ret < 0) {
+ error_report("add_aio_request is failed");
+ free_aio_req(s, aio_req);
+ qemu_aio_release(acb);
return ret;
}
- if (rsp->result == SD_RES_INVALID_PARMS) {
- dprintf("disable write cache since the server doesn't support it\n");
-
- s->cache_enabled = false;
- closesocket(s->flush_fd);
- return 0;
- }
-
- if (rsp->result != SD_RES_SUCCESS) {
- error_report("%s", sd_strerror(rsp->result));
- return -EIO;
- }
-
- return 0;
+ qemu_coroutine_yield();
+ return acb->ret;
}
static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
@@ -1743,7 +1978,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
SheepdogInode *inode;
unsigned int datalen;
- dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " "
+ DPRINTF("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " "
"is_snapshot %d\n", sn_info->name, sn_info->id_str,
s->name, sn_info->vm_state_size, s->is_snapshot);
@@ -1754,7 +1989,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
return -EINVAL;
}
- dprintf("%s %s\n", sn_info->name, sn_info->id_str);
+ DPRINTF("%s %s\n", sn_info->name, sn_info->id_str);
s->inode.vm_state_size = sn_info->vm_state_size;
s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
@@ -1766,21 +2001,21 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
/* refresh inode. */
- fd = connect_to_sdog(s->addr, s->port);
+ fd = connect_to_sdog(s);
if (fd < 0) {
ret = fd;
goto cleanup;
}
ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
- s->inode.nr_copies, datalen, 0, false, s->cache_enabled);
+ s->inode.nr_copies, datalen, 0, false, s->cache_flags);
if (ret < 0) {
error_report("failed to write snapshot's inode.");
goto cleanup;
}
- ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, 1,
- s->addr, s->port);
+ ret = do_sd_create(s, s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid,
+ 1);
if (ret < 0) {
error_report("failed to create inode for snapshot. %s",
strerror(errno));
@@ -1790,7 +2025,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
inode = (SheepdogInode *)g_malloc(datalen);
ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid),
- s->inode.nr_copies, datalen, 0, s->cache_enabled);
+ s->inode.nr_copies, datalen, 0, s->cache_flags);
if (ret < 0) {
error_report("failed to read new inode info. %s", strerror(errno));
@@ -1798,7 +2033,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
}
memcpy(&s->inode, inode, datalen);
- dprintf("s->inode: name %s snap_id %x oid %x\n",
+ DPRINTF("s->inode: name %s snap_id %x oid %x\n",
s->inode.name, s->inode.snap_id, s->inode.vdi_id);
cleanup:
@@ -1806,70 +2041,47 @@ cleanup:
return ret;
}
+/*
+ * We implement rollback(loadvm) operation to the specified snapshot by
+ * 1) switch to the snapshot
+ * 2) rely on sd_create_branch to delete working VDI and
+ * 3) create a new working VDI based on the speicified snapshot
+ */
static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
{
BDRVSheepdogState *s = bs->opaque;
BDRVSheepdogState *old_s;
- char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
- char *buf = NULL;
- uint32_t vid;
+ char tag[SD_MAX_VDI_TAG_LEN];
uint32_t snapid = 0;
- int ret = 0, fd;
+ int ret = 0;
old_s = g_malloc(sizeof(BDRVSheepdogState));
memcpy(old_s, s, sizeof(BDRVSheepdogState));
- pstrcpy(vdi, sizeof(vdi), s->name);
-
snapid = strtoul(snapshot_id, NULL, 10);
if (snapid) {
tag[0] = 0;
} else {
- pstrcpy(tag, sizeof(tag), s->name);
+ pstrcpy(tag, sizeof(tag), snapshot_id);
}
- ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1);
+ ret = reload_inode(s, snapid, tag);
if (ret) {
- error_report("Failed to find_vdi_name");
goto out;
}
- fd = connect_to_sdog(s->addr, s->port);
- if (fd < 0) {
- error_report("failed to connect");
- ret = fd;
- goto out;
- }
-
- buf = g_malloc(SD_INODE_SIZE);
- ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
- SD_INODE_SIZE, 0, s->cache_enabled);
-
- closesocket(fd);
-
+ ret = sd_create_branch(s);
if (ret) {
goto out;
}
- memcpy(&s->inode, buf, sizeof(s->inode));
-
- if (!s->inode.vm_state_size) {
- error_report("Invalid snapshot");
- ret = -ENOENT;
- goto out;
- }
-
- s->is_snapshot = true;
-
- g_free(buf);
g_free(old_s);
return 0;
out:
/* recover bdrv_sd_state */
memcpy(s, old_s, sizeof(BDRVSheepdogState));
- g_free(buf);
g_free(old_s);
error_report("failed to open. recover old bdrv_sd_state.");
@@ -1899,7 +2111,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
vdi_inuse = g_malloc(max);
- fd = connect_to_sdog(s->addr, s->port);
+ fd = connect_to_sdog(s);
if (fd < 0) {
ret = fd;
goto out;
@@ -1926,9 +2138,8 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
start_nr = hval & (SD_NR_VDIS - 1);
- fd = connect_to_sdog(s->addr, s->port);
+ fd = connect_to_sdog(s);
if (fd < 0) {
- error_report("failed to connect");
ret = fd;
goto out;
}
@@ -1941,7 +2152,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
/* we don't need to read entire object */
ret = read_object(fd, (char *)&inode, vid_to_vdi_oid(vid),
0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0,
- s->cache_enabled);
+ s->cache_flags);
if (ret) {
continue;
@@ -1982,10 +2193,11 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
int fd, ret = 0, remaining = size;
unsigned int data_len;
uint64_t vmstate_oid;
- uint32_t vdi_index;
uint64_t offset;
+ uint32_t vdi_index;
+ uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id;
- fd = connect_to_sdog(s->addr, s->port);
+ fd = connect_to_sdog(s);
if (fd < 0) {
return fd;
}
@@ -1996,17 +2208,17 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset);
- vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index);
+ vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index);
create = (offset == 0);
if (load) {
ret = read_object(fd, (char *)data, vmstate_oid,
s->inode.nr_copies, data_len, offset,
- s->cache_enabled);
+ s->cache_flags);
} else {
ret = write_object(fd, (char *)data, vmstate_oid,
s->inode.nr_copies, data_len, offset, create,
- s->cache_enabled);
+ s->cache_flags);
}
if (ret < 0) {
@@ -2024,12 +2236,19 @@ cleanup:
return ret;
}
-static int sd_save_vmstate(BlockDriverState *bs, const uint8_t *data,
- int64_t pos, int size)
+static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
+ int64_t pos)
{
BDRVSheepdogState *s = bs->opaque;
+ void *buf;
+ int ret;
+
+ buf = qemu_blockalign(bs, qiov->size);
+ qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
+ ret = do_load_save_vmstate(s, (uint8_t *) buf, pos, qiov->size, 0);
+ qemu_vfree(buf);
- return do_load_save_vmstate(s, (uint8_t *)data, pos, size, 0);
+ return ret;
}
static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
@@ -2041,6 +2260,67 @@ static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
}
+static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors)
+{
+ SheepdogAIOCB *acb;
+ QEMUIOVector dummy;
+ BDRVSheepdogState *s = bs->opaque;
+ int ret;
+
+ if (!s->discard_supported) {
+ return 0;
+ }
+
+ acb = sd_aio_setup(bs, &dummy, sector_num, nb_sectors);
+ acb->aiocb_type = AIOCB_DISCARD_OBJ;
+ acb->aio_done_func = sd_finish_aiocb;
+
+ ret = sd_co_rw_vector(acb);
+ if (ret <= 0) {
+ qemu_aio_release(acb);
+ return ret;
+ }
+
+ qemu_coroutine_yield();
+
+ return acb->ret;
+}
+
+static coroutine_fn int
+sd_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
+ int *pnum)
+{
+ BDRVSheepdogState *s = bs->opaque;
+ SheepdogInode *inode = &s->inode;
+ unsigned long start = sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE,
+ end = DIV_ROUND_UP((sector_num + nb_sectors) *
+ BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE);
+ unsigned long idx;
+ int ret = 1;
+
+ for (idx = start; idx < end; idx++) {
+ if (inode->data_vdi_id[idx] == 0) {
+ break;
+ }
+ }
+ if (idx == start) {
+ /* Get the longest length of unallocated sectors */
+ ret = 0;
+ for (idx = start + 1; idx < end; idx++) {
+ if (inode->data_vdi_id[idx] != 0) {
+ break;
+ }
+ }
+ }
+
+ *pnum = (idx - start) * SD_DATA_OBJ_SIZE / BDRV_SECTOR_SIZE;
+ if (*pnum > nb_sectors) {
+ *pnum = nb_sectors;
+ }
+ return ret;
+}
+
static QEMUOptionParameter sd_create_options[] = {
{
.name = BLOCK_OPT_SIZE,
@@ -2060,19 +2340,78 @@ static QEMUOptionParameter sd_create_options[] = {
{ NULL }
};
-BlockDriver bdrv_sheepdog = {
+static BlockDriver bdrv_sheepdog = {
.format_name = "sheepdog",
.protocol_name = "sheepdog",
.instance_size = sizeof(BDRVSheepdogState),
.bdrv_file_open = sd_open,
.bdrv_close = sd_close,
.bdrv_create = sd_create,
+ .bdrv_has_zero_init = bdrv_has_zero_init_1,
+ .bdrv_getlength = sd_getlength,
+ .bdrv_truncate = sd_truncate,
+
+ .bdrv_co_readv = sd_co_readv,
+ .bdrv_co_writev = sd_co_writev,
+ .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
+ .bdrv_co_discard = sd_co_discard,
+ .bdrv_co_is_allocated = sd_co_is_allocated,
+
+ .bdrv_snapshot_create = sd_snapshot_create,
+ .bdrv_snapshot_goto = sd_snapshot_goto,
+ .bdrv_snapshot_delete = sd_snapshot_delete,
+ .bdrv_snapshot_list = sd_snapshot_list,
+
+ .bdrv_save_vmstate = sd_save_vmstate,
+ .bdrv_load_vmstate = sd_load_vmstate,
+
+ .create_options = sd_create_options,
+};
+
+static BlockDriver bdrv_sheepdog_tcp = {
+ .format_name = "sheepdog",
+ .protocol_name = "sheepdog+tcp",
+ .instance_size = sizeof(BDRVSheepdogState),
+ .bdrv_file_open = sd_open,
+ .bdrv_close = sd_close,
+ .bdrv_create = sd_create,
+ .bdrv_has_zero_init = bdrv_has_zero_init_1,
+ .bdrv_getlength = sd_getlength,
+ .bdrv_truncate = sd_truncate,
+
+ .bdrv_co_readv = sd_co_readv,
+ .bdrv_co_writev = sd_co_writev,
+ .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
+ .bdrv_co_discard = sd_co_discard,
+ .bdrv_co_is_allocated = sd_co_is_allocated,
+
+ .bdrv_snapshot_create = sd_snapshot_create,
+ .bdrv_snapshot_goto = sd_snapshot_goto,
+ .bdrv_snapshot_delete = sd_snapshot_delete,
+ .bdrv_snapshot_list = sd_snapshot_list,
+
+ .bdrv_save_vmstate = sd_save_vmstate,
+ .bdrv_load_vmstate = sd_load_vmstate,
+
+ .create_options = sd_create_options,
+};
+
+static BlockDriver bdrv_sheepdog_unix = {
+ .format_name = "sheepdog",
+ .protocol_name = "sheepdog+unix",
+ .instance_size = sizeof(BDRVSheepdogState),
+ .bdrv_file_open = sd_open,
+ .bdrv_close = sd_close,
+ .bdrv_create = sd_create,
+ .bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_getlength = sd_getlength,
.bdrv_truncate = sd_truncate,
.bdrv_co_readv = sd_co_readv,
.bdrv_co_writev = sd_co_writev,
.bdrv_co_flush_to_disk = sd_co_flush_to_disk,
+ .bdrv_co_discard = sd_co_discard,
+ .bdrv_co_is_allocated = sd_co_is_allocated,
.bdrv_snapshot_create = sd_snapshot_create,
.bdrv_snapshot_goto = sd_snapshot_goto,
@@ -2088,5 +2427,7 @@ BlockDriver bdrv_sheepdog = {
static void bdrv_sheepdog_init(void)
{
bdrv_register(&bdrv_sheepdog);
+ bdrv_register(&bdrv_sheepdog_tcp);
+ bdrv_register(&bdrv_sheepdog_unix);
}
block_init(bdrv_sheepdog_init);
diff --git a/block/snapshot.c b/block/snapshot.c
new file mode 100644
index 000000000..6c6d9deea
--- /dev/null
+++ b/block/snapshot.c
@@ -0,0 +1,157 @@
+/*
+ * Block layer snapshot related functions
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "block/snapshot.h"
+#include "block/block_int.h"
+
+int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info,
+ const char *name)
+{
+ QEMUSnapshotInfo *sn_tab, *sn;
+ int nb_sns, i, ret;
+
+ ret = -ENOENT;
+ nb_sns = bdrv_snapshot_list(bs, &sn_tab);
+ if (nb_sns < 0) {
+ return ret;
+ }
+ for (i = 0; i < nb_sns; i++) {
+ sn = &sn_tab[i];
+ if (!strcmp(sn->id_str, name) || !strcmp(sn->name, name)) {
+ *sn_info = *sn;
+ ret = 0;
+ break;
+ }
+ }
+ g_free(sn_tab);
+ return ret;
+}
+
+int bdrv_can_snapshot(BlockDriverState *bs)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
+ return 0;
+ }
+
+ if (!drv->bdrv_snapshot_create) {
+ if (bs->file != NULL) {
+ return bdrv_can_snapshot(bs->file);
+ }
+ return 0;
+ }
+
+ return 1;
+}
+
+int bdrv_snapshot_create(BlockDriverState *bs,
+ QEMUSnapshotInfo *sn_info)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+ if (drv->bdrv_snapshot_create) {
+ return drv->bdrv_snapshot_create(bs, sn_info);
+ }
+ if (bs->file) {
+ return bdrv_snapshot_create(bs->file, sn_info);
+ }
+ return -ENOTSUP;
+}
+
+int bdrv_snapshot_goto(BlockDriverState *bs,
+ const char *snapshot_id)
+{
+ BlockDriver *drv = bs->drv;
+ int ret, open_ret;
+
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+ if (drv->bdrv_snapshot_goto) {
+ return drv->bdrv_snapshot_goto(bs, snapshot_id);
+ }
+
+ if (bs->file) {
+ drv->bdrv_close(bs);
+ ret = bdrv_snapshot_goto(bs->file, snapshot_id);
+ open_ret = drv->bdrv_open(bs, NULL, bs->open_flags);
+ if (open_ret < 0) {
+ bdrv_delete(bs->file);
+ bs->drv = NULL;
+ return open_ret;
+ }
+ return ret;
+ }
+
+ return -ENOTSUP;
+}
+
+int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+ if (drv->bdrv_snapshot_delete) {
+ return drv->bdrv_snapshot_delete(bs, snapshot_id);
+ }
+ if (bs->file) {
+ return bdrv_snapshot_delete(bs->file, snapshot_id);
+ }
+ return -ENOTSUP;
+}
+
+int bdrv_snapshot_list(BlockDriverState *bs,
+ QEMUSnapshotInfo **psn_info)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+ if (drv->bdrv_snapshot_list) {
+ return drv->bdrv_snapshot_list(bs, psn_info);
+ }
+ if (bs->file) {
+ return bdrv_snapshot_list(bs->file, psn_info);
+ }
+ return -ENOTSUP;
+}
+
+int bdrv_snapshot_load_tmp(BlockDriverState *bs,
+ const char *snapshot_name)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+ if (!bs->read_only) {
+ return -EINVAL;
+ }
+ if (drv->bdrv_snapshot_load_tmp) {
+ return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
+ }
+ return -ENOTSUP;
+}
diff --git a/block/ssh.c b/block/ssh.c
new file mode 100644
index 000000000..d7e7bf8dd
--- /dev/null
+++ b/block/ssh.c
@@ -0,0 +1,1076 @@
+/*
+ * Secure Shell (ssh) backend for QEMU.
+ *
+ * Copyright (C) 2013 Red Hat Inc., Richard W.M. Jones <rjones@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+#include <libssh2.h>
+#include <libssh2_sftp.h>
+
+#include "block/block_int.h"
+#include "qemu/sockets.h"
+#include "qemu/uri.h"
+#include "qapi/qmp/qint.h"
+
+/* DEBUG_SSH=1 enables the DPRINTF (debugging printf) statements in
+ * this block driver code.
+ *
+ * TRACE_LIBSSH2=<bitmask> enables tracing in libssh2 itself. Note
+ * that this requires that libssh2 was specially compiled with the
+ * `./configure --enable-debug' option, so most likely you will have
+ * to compile it yourself. The meaning of <bitmask> is described
+ * here: http://www.libssh2.org/libssh2_trace.html
+ */
+#define DEBUG_SSH 0
+#define TRACE_LIBSSH2 0 /* or try: LIBSSH2_TRACE_SFTP */
+
+#define DPRINTF(fmt, ...) \
+ do { \
+ if (DEBUG_SSH) { \
+ fprintf(stderr, "ssh: %-15s " fmt "\n", \
+ __func__, ##__VA_ARGS__); \
+ } \
+ } while (0)
+
+typedef struct BDRVSSHState {
+ /* Coroutine. */
+ CoMutex lock;
+
+ /* SSH connection. */
+ int sock; /* socket */
+ LIBSSH2_SESSION *session; /* ssh session */
+ LIBSSH2_SFTP *sftp; /* sftp session */
+ LIBSSH2_SFTP_HANDLE *sftp_handle; /* sftp remote file handle */
+
+ /* See ssh_seek() function below. */
+ int64_t offset;
+ bool offset_op_read;
+
+ /* File attributes at open. We try to keep the .filesize field
+ * updated if it changes (eg by writing at the end of the file).
+ */
+ LIBSSH2_SFTP_ATTRIBUTES attrs;
+
+ /* Used to warn if 'flush' is not supported. */
+ char *hostport;
+ bool unsafe_flush_warning;
+} BDRVSSHState;
+
+static void ssh_state_init(BDRVSSHState *s)
+{
+ memset(s, 0, sizeof *s);
+ s->sock = -1;
+ s->offset = -1;
+ qemu_co_mutex_init(&s->lock);
+}
+
+static void ssh_state_free(BDRVSSHState *s)
+{
+ g_free(s->hostport);
+ if (s->sftp_handle) {
+ libssh2_sftp_close(s->sftp_handle);
+ }
+ if (s->sftp) {
+ libssh2_sftp_shutdown(s->sftp);
+ }
+ if (s->session) {
+ libssh2_session_disconnect(s->session,
+ "from qemu ssh client: "
+ "user closed the connection");
+ libssh2_session_free(s->session);
+ }
+ if (s->sock >= 0) {
+ close(s->sock);
+ }
+}
+
+/* Wrappers around error_report which make sure to dump as much
+ * information from libssh2 as possible.
+ */
+static void GCC_FMT_ATTR(2, 3)
+session_error_report(BDRVSSHState *s, const char *fs, ...)
+{
+ va_list args;
+
+ va_start(args, fs);
+ error_vprintf(fs, args);
+
+ if ((s)->session) {
+ char *ssh_err;
+ int ssh_err_code;
+
+ libssh2_session_last_error((s)->session, &ssh_err, NULL, 0);
+ /* This is not an errno. See <libssh2.h>. */
+ ssh_err_code = libssh2_session_last_errno((s)->session);
+
+ error_printf(": %s (libssh2 error code: %d)", ssh_err, ssh_err_code);
+ }
+
+ va_end(args);
+ error_printf("\n");
+}
+
+static void GCC_FMT_ATTR(2, 3)
+sftp_error_report(BDRVSSHState *s, const char *fs, ...)
+{
+ va_list args;
+
+ va_start(args, fs);
+ error_vprintf(fs, args);
+
+ if ((s)->sftp) {
+ char *ssh_err;
+ int ssh_err_code;
+ unsigned long sftp_err_code;
+
+ libssh2_session_last_error((s)->session, &ssh_err, NULL, 0);
+ /* This is not an errno. See <libssh2.h>. */
+ ssh_err_code = libssh2_session_last_errno((s)->session);
+ /* See <libssh2_sftp.h>. */
+ sftp_err_code = libssh2_sftp_last_error((s)->sftp);
+
+ error_printf(": %s (libssh2 error code: %d, sftp error code: %lu)",
+ ssh_err, ssh_err_code, sftp_err_code);
+ }
+
+ va_end(args);
+ error_printf("\n");
+}
+
+static int parse_uri(const char *filename, QDict *options, Error **errp)
+{
+ URI *uri = NULL;
+ QueryParams *qp = NULL;
+ int i;
+
+ uri = uri_parse(filename);
+ if (!uri) {
+ return -EINVAL;
+ }
+
+ if (strcmp(uri->scheme, "ssh") != 0) {
+ error_setg(errp, "URI scheme must be 'ssh'");
+ goto err;
+ }
+
+ if (!uri->server || strcmp(uri->server, "") == 0) {
+ error_setg(errp, "missing hostname in URI");
+ goto err;
+ }
+
+ if (!uri->path || strcmp(uri->path, "") == 0) {
+ error_setg(errp, "missing remote path in URI");
+ goto err;
+ }
+
+ qp = query_params_parse(uri->query);
+ if (!qp) {
+ error_setg(errp, "could not parse query parameters");
+ goto err;
+ }
+
+ if(uri->user && strcmp(uri->user, "") != 0) {
+ qdict_put(options, "user", qstring_from_str(uri->user));
+ }
+
+ qdict_put(options, "host", qstring_from_str(uri->server));
+
+ if (uri->port) {
+ qdict_put(options, "port", qint_from_int(uri->port));
+ }
+
+ qdict_put(options, "path", qstring_from_str(uri->path));
+
+ /* Pick out any query parameters that we understand, and ignore
+ * the rest.
+ */
+ for (i = 0; i < qp->n; ++i) {
+ if (strcmp(qp->p[i].name, "host_key_check") == 0) {
+ qdict_put(options, "host_key_check",
+ qstring_from_str(qp->p[i].value));
+ }
+ }
+
+ query_params_free(qp);
+ uri_free(uri);
+ return 0;
+
+ err:
+ if (qp) {
+ query_params_free(qp);
+ }
+ if (uri) {
+ uri_free(uri);
+ }
+ return -EINVAL;
+}
+
+static void ssh_parse_filename(const char *filename, QDict *options,
+ Error **errp)
+{
+ if (qdict_haskey(options, "user") ||
+ qdict_haskey(options, "host") ||
+ qdict_haskey(options, "port") ||
+ qdict_haskey(options, "path") ||
+ qdict_haskey(options, "host_key_check")) {
+ error_setg(errp, "user, host, port, path, host_key_check cannot be used at the same time as a file option");
+ return;
+ }
+
+ parse_uri(filename, options, errp);
+}
+
+static int check_host_key_knownhosts(BDRVSSHState *s,
+ const char *host, int port)
+{
+ const char *home;
+ char *knh_file = NULL;
+ LIBSSH2_KNOWNHOSTS *knh = NULL;
+ struct libssh2_knownhost *found;
+ int ret, r;
+ const char *hostkey;
+ size_t len;
+ int type;
+
+ hostkey = libssh2_session_hostkey(s->session, &len, &type);
+ if (!hostkey) {
+ ret = -EINVAL;
+ session_error_report(s, "failed to read remote host key");
+ goto out;
+ }
+
+ knh = libssh2_knownhost_init(s->session);
+ if (!knh) {
+ ret = -EINVAL;
+ session_error_report(s, "failed to initialize known hosts support");
+ goto out;
+ }
+
+ home = getenv("HOME");
+ if (home) {
+ knh_file = g_strdup_printf("%s/.ssh/known_hosts", home);
+ } else {
+ knh_file = g_strdup_printf("/root/.ssh/known_hosts");
+ }
+
+ /* Read all known hosts from OpenSSH-style known_hosts file. */
+ libssh2_knownhost_readfile(knh, knh_file, LIBSSH2_KNOWNHOST_FILE_OPENSSH);
+
+ r = libssh2_knownhost_checkp(knh, host, port, hostkey, len,
+ LIBSSH2_KNOWNHOST_TYPE_PLAIN|
+ LIBSSH2_KNOWNHOST_KEYENC_RAW,
+ &found);
+ switch (r) {
+ case LIBSSH2_KNOWNHOST_CHECK_MATCH:
+ /* OK */
+ DPRINTF("host key OK: %s", found->key);
+ break;
+ case LIBSSH2_KNOWNHOST_CHECK_MISMATCH:
+ ret = -EINVAL;
+ session_error_report(s, "host key does not match the one in known_hosts (found key %s)",
+ found->key);
+ goto out;
+ case LIBSSH2_KNOWNHOST_CHECK_NOTFOUND:
+ ret = -EINVAL;
+ session_error_report(s, "no host key was found in known_hosts");
+ goto out;
+ case LIBSSH2_KNOWNHOST_CHECK_FAILURE:
+ ret = -EINVAL;
+ session_error_report(s, "failure matching the host key with known_hosts");
+ goto out;
+ default:
+ ret = -EINVAL;
+ session_error_report(s, "unknown error matching the host key with known_hosts (%d)",
+ r);
+ goto out;
+ }
+
+ /* known_hosts checking successful. */
+ ret = 0;
+
+ out:
+ if (knh != NULL) {
+ libssh2_knownhost_free(knh);
+ }
+ g_free(knh_file);
+ return ret;
+}
+
+static unsigned hex2decimal(char ch)
+{
+ if (ch >= '0' && ch <= '9') {
+ return (ch - '0');
+ } else if (ch >= 'a' && ch <= 'f') {
+ return 10 + (ch - 'a');
+ } else if (ch >= 'A' && ch <= 'F') {
+ return 10 + (ch - 'A');
+ }
+
+ return -1;
+}
+
+/* Compare the binary fingerprint (hash of host key) with the
+ * host_key_check parameter.
+ */
+static int compare_fingerprint(const unsigned char *fingerprint, size_t len,
+ const char *host_key_check)
+{
+ unsigned c;
+
+ while (len > 0) {
+ while (*host_key_check == ':')
+ host_key_check++;
+ if (!qemu_isxdigit(host_key_check[0]) ||
+ !qemu_isxdigit(host_key_check[1]))
+ return 1;
+ c = hex2decimal(host_key_check[0]) * 16 +
+ hex2decimal(host_key_check[1]);
+ if (c - *fingerprint != 0)
+ return c - *fingerprint;
+ fingerprint++;
+ len--;
+ host_key_check += 2;
+ }
+ return *host_key_check - '\0';
+}
+
+static int
+check_host_key_hash(BDRVSSHState *s, const char *hash,
+ int hash_type, size_t fingerprint_len)
+{
+ const char *fingerprint;
+
+ fingerprint = libssh2_hostkey_hash(s->session, hash_type);
+ if (!fingerprint) {
+ session_error_report(s, "failed to read remote host key");
+ return -EINVAL;
+ }
+
+ if(compare_fingerprint((unsigned char *) fingerprint, fingerprint_len,
+ hash) != 0) {
+ error_report("remote host key does not match host_key_check '%s'",
+ hash);
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+static int check_host_key(BDRVSSHState *s, const char *host, int port,
+ const char *host_key_check)
+{
+ /* host_key_check=no */
+ if (strcmp(host_key_check, "no") == 0) {
+ return 0;
+ }
+
+ /* host_key_check=md5:xx:yy:zz:... */
+ if (strncmp(host_key_check, "md5:", 4) == 0) {
+ return check_host_key_hash(s, &host_key_check[4],
+ LIBSSH2_HOSTKEY_HASH_MD5, 16);
+ }
+
+ /* host_key_check=sha1:xx:yy:zz:... */
+ if (strncmp(host_key_check, "sha1:", 5) == 0) {
+ return check_host_key_hash(s, &host_key_check[5],
+ LIBSSH2_HOSTKEY_HASH_SHA1, 20);
+ }
+
+ /* host_key_check=yes */
+ if (strcmp(host_key_check, "yes") == 0) {
+ return check_host_key_knownhosts(s, host, port);
+ }
+
+ error_report("unknown host_key_check setting (%s)", host_key_check);
+ return -EINVAL;
+}
+
+static int authenticate(BDRVSSHState *s, const char *user)
+{
+ int r, ret;
+ const char *userauthlist;
+ LIBSSH2_AGENT *agent = NULL;
+ struct libssh2_agent_publickey *identity;
+ struct libssh2_agent_publickey *prev_identity = NULL;
+
+ userauthlist = libssh2_userauth_list(s->session, user, strlen(user));
+ if (strstr(userauthlist, "publickey") == NULL) {
+ ret = -EPERM;
+ error_report("remote server does not support \"publickey\" authentication");
+ goto out;
+ }
+
+ /* Connect to ssh-agent and try each identity in turn. */
+ agent = libssh2_agent_init(s->session);
+ if (!agent) {
+ ret = -EINVAL;
+ session_error_report(s, "failed to initialize ssh-agent support");
+ goto out;
+ }
+ if (libssh2_agent_connect(agent)) {
+ ret = -ECONNREFUSED;
+ session_error_report(s, "failed to connect to ssh-agent");
+ goto out;
+ }
+ if (libssh2_agent_list_identities(agent)) {
+ ret = -EINVAL;
+ session_error_report(s, "failed requesting identities from ssh-agent");
+ goto out;
+ }
+
+ for(;;) {
+ r = libssh2_agent_get_identity(agent, &identity, prev_identity);
+ if (r == 1) { /* end of list */
+ break;
+ }
+ if (r < 0) {
+ ret = -EINVAL;
+ session_error_report(s, "failed to obtain identity from ssh-agent");
+ goto out;
+ }
+ r = libssh2_agent_userauth(agent, user, identity);
+ if (r == 0) {
+ /* Authenticated! */
+ ret = 0;
+ goto out;
+ }
+ /* Failed to authenticate with this identity, try the next one. */
+ prev_identity = identity;
+ }
+
+ ret = -EPERM;
+ error_report("failed to authenticate using publickey authentication "
+ "and the identities held by your ssh-agent");
+
+ out:
+ if (agent != NULL) {
+ /* Note: libssh2 implementation implicitly calls
+ * libssh2_agent_disconnect if necessary.
+ */
+ libssh2_agent_free(agent);
+ }
+
+ return ret;
+}
+
+static int connect_to_ssh(BDRVSSHState *s, QDict *options,
+ int ssh_flags, int creat_mode)
+{
+ int r, ret;
+ Error *err = NULL;
+ const char *host, *user, *path, *host_key_check;
+ int port;
+
+ host = qdict_get_str(options, "host");
+
+ if (qdict_haskey(options, "port")) {
+ port = qdict_get_int(options, "port");
+ } else {
+ port = 22;
+ }
+
+ path = qdict_get_str(options, "path");
+
+ if (qdict_haskey(options, "user")) {
+ user = qdict_get_str(options, "user");
+ } else {
+ user = g_get_user_name();
+ if (!user) {
+ ret = -errno;
+ goto err;
+ }
+ }
+
+ if (qdict_haskey(options, "host_key_check")) {
+ host_key_check = qdict_get_str(options, "host_key_check");
+ } else {
+ host_key_check = "yes";
+ }
+
+ /* Construct the host:port name for inet_connect. */
+ g_free(s->hostport);
+ s->hostport = g_strdup_printf("%s:%d", host, port);
+
+ /* Open the socket and connect. */
+ s->sock = inet_connect(s->hostport, &err);
+ if (err != NULL) {
+ ret = -errno;
+ qerror_report_err(err);
+ error_free(err);
+ goto err;
+ }
+
+ /* Create SSH session. */
+ s->session = libssh2_session_init();
+ if (!s->session) {
+ ret = -EINVAL;
+ session_error_report(s, "failed to initialize libssh2 session");
+ goto err;
+ }
+
+#if TRACE_LIBSSH2 != 0
+ libssh2_trace(s->session, TRACE_LIBSSH2);
+#endif
+
+ r = libssh2_session_handshake(s->session, s->sock);
+ if (r != 0) {
+ ret = -EINVAL;
+ session_error_report(s, "failed to establish SSH session");
+ goto err;
+ }
+
+ /* Check the remote host's key against known_hosts. */
+ ret = check_host_key(s, host, port, host_key_check);
+ if (ret < 0) {
+ goto err;
+ }
+
+ /* Authenticate. */
+ ret = authenticate(s, user);
+ if (ret < 0) {
+ goto err;
+ }
+
+ /* Start SFTP. */
+ s->sftp = libssh2_sftp_init(s->session);
+ if (!s->sftp) {
+ session_error_report(s, "failed to initialize sftp handle");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /* Open the remote file. */
+ DPRINTF("opening file %s flags=0x%x creat_mode=0%o",
+ path, ssh_flags, creat_mode);
+ s->sftp_handle = libssh2_sftp_open(s->sftp, path, ssh_flags, creat_mode);
+ if (!s->sftp_handle) {
+ session_error_report(s, "failed to open remote file '%s'", path);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ r = libssh2_sftp_fstat(s->sftp_handle, &s->attrs);
+ if (r < 0) {
+ sftp_error_report(s, "failed to read file attributes");
+ return -EINVAL;
+ }
+
+ /* Delete the options we've used; any not deleted will cause the
+ * block layer to give an error about unused options.
+ */
+ qdict_del(options, "host");
+ qdict_del(options, "port");
+ qdict_del(options, "user");
+ qdict_del(options, "path");
+ qdict_del(options, "host_key_check");
+
+ return 0;
+
+ err:
+ if (s->sftp_handle) {
+ libssh2_sftp_close(s->sftp_handle);
+ }
+ s->sftp_handle = NULL;
+ if (s->sftp) {
+ libssh2_sftp_shutdown(s->sftp);
+ }
+ s->sftp = NULL;
+ if (s->session) {
+ libssh2_session_disconnect(s->session,
+ "from qemu ssh client: "
+ "error opening connection");
+ libssh2_session_free(s->session);
+ }
+ s->session = NULL;
+
+ return ret;
+}
+
+static int ssh_file_open(BlockDriverState *bs, QDict *options, int bdrv_flags)
+{
+ BDRVSSHState *s = bs->opaque;
+ int ret;
+ int ssh_flags;
+
+ ssh_state_init(s);
+
+ ssh_flags = LIBSSH2_FXF_READ;
+ if (bdrv_flags & BDRV_O_RDWR) {
+ ssh_flags |= LIBSSH2_FXF_WRITE;
+ }
+
+ /* Start up SSH. */
+ ret = connect_to_ssh(s, options, ssh_flags, 0);
+ if (ret < 0) {
+ goto err;
+ }
+
+ /* Go non-blocking. */
+ libssh2_session_set_blocking(s->session, 0);
+
+ return 0;
+
+ err:
+ if (s->sock >= 0) {
+ close(s->sock);
+ }
+ s->sock = -1;
+
+ return ret;
+}
+
+static QEMUOptionParameter ssh_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ { NULL }
+};
+
+static int ssh_create(const char *filename, QEMUOptionParameter *options)
+{
+ int r, ret;
+ Error *local_err = NULL;
+ int64_t total_size = 0;
+ QDict *uri_options = NULL;
+ BDRVSSHState s;
+ ssize_t r2;
+ char c[1] = { '\0' };
+
+ ssh_state_init(&s);
+
+ /* Get desired file size. */
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ total_size = options->value.n;
+ }
+ options++;
+ }
+ DPRINTF("total_size=%" PRIi64, total_size);
+
+ uri_options = qdict_new();
+ r = parse_uri(filename, uri_options, &local_err);
+ if (r < 0) {
+ qerror_report_err(local_err);
+ error_free(local_err);
+ ret = r;
+ goto out;
+ }
+
+ r = connect_to_ssh(&s, uri_options,
+ LIBSSH2_FXF_READ|LIBSSH2_FXF_WRITE|
+ LIBSSH2_FXF_CREAT|LIBSSH2_FXF_TRUNC, 0644);
+ if (r < 0) {
+ ret = r;
+ goto out;
+ }
+
+ if (total_size > 0) {
+ libssh2_sftp_seek64(s.sftp_handle, total_size-1);
+ r2 = libssh2_sftp_write(s.sftp_handle, c, 1);
+ if (r2 < 0) {
+ sftp_error_report(&s, "truncate failed");
+ ret = -EINVAL;
+ goto out;
+ }
+ s.attrs.filesize = total_size;
+ }
+
+ ret = 0;
+
+ out:
+ ssh_state_free(&s);
+ if (uri_options != NULL) {
+ QDECREF(uri_options);
+ }
+ return ret;
+}
+
+static void ssh_close(BlockDriverState *bs)
+{
+ BDRVSSHState *s = bs->opaque;
+
+ ssh_state_free(s);
+}
+
+static int ssh_has_zero_init(BlockDriverState *bs)
+{
+ BDRVSSHState *s = bs->opaque;
+ /* Assume false, unless we can positively prove it's true. */
+ int has_zero_init = 0;
+
+ if (s->attrs.flags & LIBSSH2_SFTP_ATTR_PERMISSIONS) {
+ if (s->attrs.permissions & LIBSSH2_SFTP_S_IFREG) {
+ has_zero_init = 1;
+ }
+ }
+
+ return has_zero_init;
+}
+
+static void restart_coroutine(void *opaque)
+{
+ Coroutine *co = opaque;
+
+ DPRINTF("co=%p", co);
+
+ qemu_coroutine_enter(co, NULL);
+}
+
+/* Always true because when we have called set_fd_handler there is
+ * always a request being processed.
+ */
+static int return_true(void *opaque)
+{
+ return 1;
+}
+
+static coroutine_fn void set_fd_handler(BDRVSSHState *s)
+{
+ int r;
+ IOHandler *rd_handler = NULL, *wr_handler = NULL;
+ Coroutine *co = qemu_coroutine_self();
+
+ r = libssh2_session_block_directions(s->session);
+
+ if (r & LIBSSH2_SESSION_BLOCK_INBOUND) {
+ rd_handler = restart_coroutine;
+ }
+ if (r & LIBSSH2_SESSION_BLOCK_OUTBOUND) {
+ wr_handler = restart_coroutine;
+ }
+
+ DPRINTF("s->sock=%d rd_handler=%p wr_handler=%p", s->sock,
+ rd_handler, wr_handler);
+
+ qemu_aio_set_fd_handler(s->sock, rd_handler, wr_handler, return_true, co);
+}
+
+static coroutine_fn void clear_fd_handler(BDRVSSHState *s)
+{
+ DPRINTF("s->sock=%d", s->sock);
+ qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL, NULL);
+}
+
+/* A non-blocking call returned EAGAIN, so yield, ensuring the
+ * handlers are set up so that we'll be rescheduled when there is an
+ * interesting event on the socket.
+ */
+static coroutine_fn void co_yield(BDRVSSHState *s)
+{
+ set_fd_handler(s);
+ qemu_coroutine_yield();
+ clear_fd_handler(s);
+}
+
+/* SFTP has a function `libssh2_sftp_seek64' which seeks to a position
+ * in the remote file. Notice that it just updates a field in the
+ * sftp_handle structure, so there is no network traffic and it cannot
+ * fail.
+ *
+ * However, `libssh2_sftp_seek64' does have a catastrophic effect on
+ * performance since it causes the handle to throw away all in-flight
+ * reads and buffered readahead data. Therefore this function tries
+ * to be intelligent about when to call the underlying libssh2 function.
+ */
+#define SSH_SEEK_WRITE 0
+#define SSH_SEEK_READ 1
+#define SSH_SEEK_FORCE 2
+
+static void ssh_seek(BDRVSSHState *s, int64_t offset, int flags)
+{
+ bool op_read = (flags & SSH_SEEK_READ) != 0;
+ bool force = (flags & SSH_SEEK_FORCE) != 0;
+
+ if (force || op_read != s->offset_op_read || offset != s->offset) {
+ DPRINTF("seeking to offset=%" PRIi64, offset);
+ libssh2_sftp_seek64(s->sftp_handle, offset);
+ s->offset = offset;
+ s->offset_op_read = op_read;
+ }
+}
+
+static coroutine_fn int ssh_read(BDRVSSHState *s,
+ int64_t offset, size_t size,
+ QEMUIOVector *qiov)
+{
+ ssize_t r;
+ size_t got;
+ char *buf, *end_of_vec;
+ struct iovec *i;
+
+ DPRINTF("offset=%" PRIi64 " size=%zu", offset, size);
+
+ ssh_seek(s, offset, SSH_SEEK_READ);
+
+ /* This keeps track of the current iovec element ('i'), where we
+ * will write to next ('buf'), and the end of the current iovec
+ * ('end_of_vec').
+ */
+ i = &qiov->iov[0];
+ buf = i->iov_base;
+ end_of_vec = i->iov_base + i->iov_len;
+
+ /* libssh2 has a hard-coded limit of 2000 bytes per request,
+ * although it will also do readahead behind our backs. Therefore
+ * we may have to do repeated reads here until we have read 'size'
+ * bytes.
+ */
+ for (got = 0; got < size; ) {
+ again:
+ DPRINTF("sftp_read buf=%p size=%zu", buf, end_of_vec - buf);
+ r = libssh2_sftp_read(s->sftp_handle, buf, end_of_vec - buf);
+ DPRINTF("sftp_read returned %zd", r);
+
+ if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) {
+ co_yield(s);
+ goto again;
+ }
+ if (r < 0) {
+ sftp_error_report(s, "read failed");
+ s->offset = -1;
+ return -EIO;
+ }
+ if (r == 0) {
+ /* EOF: Short read so pad the buffer with zeroes and return it. */
+ qemu_iovec_memset(qiov, got, 0, size - got);
+ return 0;
+ }
+
+ got += r;
+ buf += r;
+ s->offset += r;
+ if (buf >= end_of_vec && got < size) {
+ i++;
+ buf = i->iov_base;
+ end_of_vec = i->iov_base + i->iov_len;
+ }
+ }
+
+ return 0;
+}
+
+static coroutine_fn int ssh_co_readv(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ BDRVSSHState *s = bs->opaque;
+ int ret;
+
+ qemu_co_mutex_lock(&s->lock);
+ ret = ssh_read(s, sector_num * BDRV_SECTOR_SIZE,
+ nb_sectors * BDRV_SECTOR_SIZE, qiov);
+ qemu_co_mutex_unlock(&s->lock);
+
+ return ret;
+}
+
+static int ssh_write(BDRVSSHState *s,
+ int64_t offset, size_t size,
+ QEMUIOVector *qiov)
+{
+ ssize_t r;
+ size_t written;
+ char *buf, *end_of_vec;
+ struct iovec *i;
+
+ DPRINTF("offset=%" PRIi64 " size=%zu", offset, size);
+
+ ssh_seek(s, offset, SSH_SEEK_WRITE);
+
+ /* This keeps track of the current iovec element ('i'), where we
+ * will read from next ('buf'), and the end of the current iovec
+ * ('end_of_vec').
+ */
+ i = &qiov->iov[0];
+ buf = i->iov_base;
+ end_of_vec = i->iov_base + i->iov_len;
+
+ for (written = 0; written < size; ) {
+ again:
+ DPRINTF("sftp_write buf=%p size=%zu", buf, end_of_vec - buf);
+ r = libssh2_sftp_write(s->sftp_handle, buf, end_of_vec - buf);
+ DPRINTF("sftp_write returned %zd", r);
+
+ if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) {
+ co_yield(s);
+ goto again;
+ }
+ if (r < 0) {
+ sftp_error_report(s, "write failed");
+ s->offset = -1;
+ return -EIO;
+ }
+ /* The libssh2 API is very unclear about this. A comment in
+ * the code says "nothing was acked, and no EAGAIN was
+ * received!" which apparently means that no data got sent
+ * out, and the underlying channel didn't return any EAGAIN
+ * indication. I think this is a bug in either libssh2 or
+ * OpenSSH (server-side). In any case, forcing a seek (to
+ * discard libssh2 internal buffers), and then trying again
+ * works for me.
+ */
+ if (r == 0) {
+ ssh_seek(s, offset + written, SSH_SEEK_WRITE|SSH_SEEK_FORCE);
+ co_yield(s);
+ goto again;
+ }
+
+ written += r;
+ buf += r;
+ s->offset += r;
+ if (buf >= end_of_vec && written < size) {
+ i++;
+ buf = i->iov_base;
+ end_of_vec = i->iov_base + i->iov_len;
+ }
+
+ if (offset + written > s->attrs.filesize)
+ s->attrs.filesize = offset + written;
+ }
+
+ return 0;
+}
+
+static coroutine_fn int ssh_co_writev(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ BDRVSSHState *s = bs->opaque;
+ int ret;
+
+ qemu_co_mutex_lock(&s->lock);
+ ret = ssh_write(s, sector_num * BDRV_SECTOR_SIZE,
+ nb_sectors * BDRV_SECTOR_SIZE, qiov);
+ qemu_co_mutex_unlock(&s->lock);
+
+ return ret;
+}
+
+static void unsafe_flush_warning(BDRVSSHState *s, const char *what)
+{
+ if (!s->unsafe_flush_warning) {
+ error_report("warning: ssh server %s does not support fsync",
+ s->hostport);
+ if (what) {
+ error_report("to support fsync, you need %s", what);
+ }
+ s->unsafe_flush_warning = true;
+ }
+}
+
+#ifdef HAS_LIBSSH2_SFTP_FSYNC
+
+static coroutine_fn int ssh_flush(BDRVSSHState *s)
+{
+ int r;
+
+ DPRINTF("fsync");
+ again:
+ r = libssh2_sftp_fsync(s->sftp_handle);
+ if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) {
+ co_yield(s);
+ goto again;
+ }
+ if (r == LIBSSH2_ERROR_SFTP_PROTOCOL &&
+ libssh2_sftp_last_error(s->sftp) == LIBSSH2_FX_OP_UNSUPPORTED) {
+ unsafe_flush_warning(s, "OpenSSH >= 6.3");
+ return 0;
+ }
+ if (r < 0) {
+ sftp_error_report(s, "fsync failed");
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static coroutine_fn int ssh_co_flush(BlockDriverState *bs)
+{
+ BDRVSSHState *s = bs->opaque;
+ int ret;
+
+ qemu_co_mutex_lock(&s->lock);
+ ret = ssh_flush(s);
+ qemu_co_mutex_unlock(&s->lock);
+
+ return ret;
+}
+
+#else /* !HAS_LIBSSH2_SFTP_FSYNC */
+
+static coroutine_fn int ssh_co_flush(BlockDriverState *bs)
+{
+ BDRVSSHState *s = bs->opaque;
+
+ unsafe_flush_warning(s, "libssh2 >= 1.4.4");
+ return 0;
+}
+
+#endif /* !HAS_LIBSSH2_SFTP_FSYNC */
+
+static int64_t ssh_getlength(BlockDriverState *bs)
+{
+ BDRVSSHState *s = bs->opaque;
+ int64_t length;
+
+ /* Note we cannot make a libssh2 call here. */
+ length = (int64_t) s->attrs.filesize;
+ DPRINTF("length=%" PRIi64, length);
+
+ return length;
+}
+
+static BlockDriver bdrv_ssh = {
+ .format_name = "ssh",
+ .protocol_name = "ssh",
+ .instance_size = sizeof(BDRVSSHState),
+ .bdrv_parse_filename = ssh_parse_filename,
+ .bdrv_file_open = ssh_file_open,
+ .bdrv_create = ssh_create,
+ .bdrv_close = ssh_close,
+ .bdrv_has_zero_init = ssh_has_zero_init,
+ .bdrv_co_readv = ssh_co_readv,
+ .bdrv_co_writev = ssh_co_writev,
+ .bdrv_getlength = ssh_getlength,
+ .bdrv_co_flush_to_disk = ssh_co_flush,
+ .create_options = ssh_create_options,
+};
+
+static void bdrv_ssh_init(void)
+{
+ int r;
+
+ r = libssh2_init(0);
+ if (r != 0) {
+ fprintf(stderr, "libssh2 initialization failed, %d\n", r);
+ exit(EXIT_FAILURE);
+ }
+
+ bdrv_register(&bdrv_ssh);
+}
+
+block_init(bdrv_ssh_init);
diff --git a/block/stream.c b/block/stream.c
index 0c0fc7a13..7fe9e486b 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -12,8 +12,8 @@
*/
#include "trace.h"
-#include "block_int.h"
-#include "blockjob.h"
+#include "block/block_int.h"
+#include "block/blockjob.h"
#include "qemu/ratelimit.h"
enum {
@@ -108,7 +108,7 @@ static void coroutine_fn stream_run(void *opaque)
wait:
/* Note that even when no rate limit is applied we need to yield
- * with no pending I/O here so that qemu_aio_flush() returns.
+ * with no pending I/O here so that bdrv_drain_all() returns.
*/
block_job_sleep_ns(&s->common, rt_clock, delay_ns);
if (block_job_is_cancelled(&s->common)) {
@@ -198,7 +198,7 @@ static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp)
ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}
-static BlockJobType stream_job_type = {
+static const BlockJobType stream_job_type = {
.instance_size = sizeof(StreamBlockJob),
.job_type = "stream",
.set_speed = stream_set_speed,
diff --git a/block/vdi.c b/block/vdi.c
index c8330b7ea..8a915257e 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -50,15 +50,15 @@
*/
#include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
-#include "migration.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "migration/migration.h"
#if defined(CONFIG_UUID)
#include <uuid/uuid.h>
#else
/* TODO: move uuid emulation to some central place in QEMU. */
-#include "sysemu.h" /* UUID_FMT */
+#include "sysemu/sysemu.h" /* UUID_FMT */
typedef unsigned char uuid_t[16];
#endif
@@ -246,7 +246,7 @@ static void vdi_header_print(VdiHeader *header)
{
char uuid[37];
logout("text %s", header->text);
- logout("signature 0x%04x\n", header->signature);
+ logout("signature 0x%08x\n", header->signature);
logout("header size 0x%04x\n", header->header_size);
logout("image type 0x%04x\n", header->image_type);
logout("image flags 0x%04x\n", header->image_flags);
@@ -364,15 +364,17 @@ static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename)
return result;
}
-static int vdi_open(BlockDriverState *bs, int flags)
+static int vdi_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVVdiState *s = bs->opaque;
VdiHeader header;
size_t bmap_size;
+ int ret;
logout("\n");
- if (bdrv_read(bs->file, 0, (uint8_t *)&header, 1) < 0) {
+ ret = bdrv_read(bs->file, 0, (uint8_t *)&header, 1);
+ if (ret < 0) {
goto fail;
}
@@ -390,33 +392,45 @@ static int vdi_open(BlockDriverState *bs, int flags)
header.disk_size &= ~(SECTOR_SIZE - 1);
}
- if (header.version != VDI_VERSION_1_1) {
+ if (header.signature != VDI_SIGNATURE) {
+ logout("bad vdi signature %08x\n", header.signature);
+ ret = -EMEDIUMTYPE;
+ goto fail;
+ } else if (header.version != VDI_VERSION_1_1) {
logout("unsupported version %u.%u\n",
header.version >> 16, header.version & 0xffff);
+ ret = -ENOTSUP;
goto fail;
} else if (header.offset_bmap % SECTOR_SIZE != 0) {
/* We only support block maps which start on a sector boundary. */
logout("unsupported block map offset 0x%x B\n", header.offset_bmap);
+ ret = -ENOTSUP;
goto fail;
} else if (header.offset_data % SECTOR_SIZE != 0) {
/* We only support data blocks which start on a sector boundary. */
logout("unsupported data offset 0x%x B\n", header.offset_data);
+ ret = -ENOTSUP;
goto fail;
} else if (header.sector_size != SECTOR_SIZE) {
logout("unsupported sector size %u B\n", header.sector_size);
+ ret = -ENOTSUP;
goto fail;
} else if (header.block_size != 1 * MiB) {
logout("unsupported block size %u B\n", header.block_size);
+ ret = -ENOTSUP;
goto fail;
} else if (header.disk_size >
(uint64_t)header.blocks_in_image * header.block_size) {
logout("unsupported disk size %" PRIu64 " B\n", header.disk_size);
+ ret = -ENOTSUP;
goto fail;
} else if (!uuid_is_null(header.uuid_link)) {
logout("link uuid != 0, unsupported\n");
+ ret = -ENOTSUP;
goto fail;
} else if (!uuid_is_null(header.uuid_parent)) {
logout("parent uuid != 0, unsupported\n");
+ ret = -ENOTSUP;
goto fail;
}
@@ -429,10 +443,9 @@ static int vdi_open(BlockDriverState *bs, int flags)
bmap_size = header.blocks_in_image * sizeof(uint32_t);
bmap_size = (bmap_size + SECTOR_SIZE - 1) / SECTOR_SIZE;
- if (bmap_size > 0) {
- s->bmap = g_malloc(bmap_size * SECTOR_SIZE);
- }
- if (bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size) < 0) {
+ s->bmap = g_malloc(bmap_size * SECTOR_SIZE);
+ ret = bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size);
+ if (ret < 0) {
goto fail_free_bmap;
}
@@ -448,7 +461,7 @@ static int vdi_open(BlockDriverState *bs, int flags)
g_free(s->bmap);
fail:
- return -1;
+ return ret;
}
static int vdi_reopen_prepare(BDRVReopenState *state,
@@ -766,6 +779,7 @@ static BlockDriver bdrv_vdi = {
.bdrv_close = vdi_close,
.bdrv_reopen_prepare = vdi_reopen_prepare,
.bdrv_create = vdi_create,
+ .bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_co_is_allocated = vdi_co_is_allocated,
.bdrv_make_empty = vdi_make_empty,
diff --git a/block/vhdx.c b/block/vhdx.c
new file mode 100644
index 000000000..e9704b1fd
--- /dev/null
+++ b/block/vhdx.c
@@ -0,0 +1,972 @@
+/*
+ * Block driver for Hyper-V VHDX Images
+ *
+ * Copyright (c) 2013 Red Hat, Inc.,
+ *
+ * Authors:
+ * Jeff Cody <jcody@redhat.com>
+ *
+ * This is based on the "VHDX Format Specification v0.95", published 4/12/2012
+ * by Microsoft:
+ * https://www.microsoft.com/en-us/download/details.aspx?id=29681
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "qemu/crc32c.h"
+#include "block/vhdx.h"
+
+
+/* Several metadata and region table data entries are identified by
+ * guids in a MS-specific GUID format. */
+
+
+/* ------- Known Region Table GUIDs ---------------------- */
+static const MSGUID bat_guid = { .data1 = 0x2dc27766,
+ .data2 = 0xf623,
+ .data3 = 0x4200,
+ .data4 = { 0x9d, 0x64, 0x11, 0x5e,
+ 0x9b, 0xfd, 0x4a, 0x08} };
+
+static const MSGUID metadata_guid = { .data1 = 0x8b7ca206,
+ .data2 = 0x4790,
+ .data3 = 0x4b9a,
+ .data4 = { 0xb8, 0xfe, 0x57, 0x5f,
+ 0x05, 0x0f, 0x88, 0x6e} };
+
+
+
+/* ------- Known Metadata Entry GUIDs ---------------------- */
+static const MSGUID file_param_guid = { .data1 = 0xcaa16737,
+ .data2 = 0xfa36,
+ .data3 = 0x4d43,
+ .data4 = { 0xb3, 0xb6, 0x33, 0xf0,
+ 0xaa, 0x44, 0xe7, 0x6b} };
+
+static const MSGUID virtual_size_guid = { .data1 = 0x2FA54224,
+ .data2 = 0xcd1b,
+ .data3 = 0x4876,
+ .data4 = { 0xb2, 0x11, 0x5d, 0xbe,
+ 0xd8, 0x3b, 0xf4, 0xb8} };
+
+static const MSGUID page83_guid = { .data1 = 0xbeca12ab,
+ .data2 = 0xb2e6,
+ .data3 = 0x4523,
+ .data4 = { 0x93, 0xef, 0xc3, 0x09,
+ 0xe0, 0x00, 0xc7, 0x46} };
+
+
+static const MSGUID phys_sector_guid = { .data1 = 0xcda348c7,
+ .data2 = 0x445d,
+ .data3 = 0x4471,
+ .data4 = { 0x9c, 0xc9, 0xe9, 0x88,
+ 0x52, 0x51, 0xc5, 0x56} };
+
+static const MSGUID parent_locator_guid = { .data1 = 0xa8d35f2d,
+ .data2 = 0xb30b,
+ .data3 = 0x454d,
+ .data4 = { 0xab, 0xf7, 0xd3,
+ 0xd8, 0x48, 0x34,
+ 0xab, 0x0c} };
+
+static const MSGUID logical_sector_guid = { .data1 = 0x8141bf1d,
+ .data2 = 0xa96f,
+ .data3 = 0x4709,
+ .data4 = { 0xba, 0x47, 0xf2,
+ 0x33, 0xa8, 0xfa,
+ 0xab, 0x5f} };
+
+/* Each parent type must have a valid GUID; this is for parent images
+ * of type 'VHDX'. If we were to allow e.g. a QCOW2 parent, we would
+ * need to make up our own QCOW2 GUID type */
+static const MSGUID parent_vhdx_guid = { .data1 = 0xb04aefb7,
+ .data2 = 0xd19e,
+ .data3 = 0x4a81,
+ .data4 = { 0xb7, 0x89, 0x25, 0xb8,
+ 0xe9, 0x44, 0x59, 0x13} };
+
+
+#define META_FILE_PARAMETER_PRESENT 0x01
+#define META_VIRTUAL_DISK_SIZE_PRESENT 0x02
+#define META_PAGE_83_PRESENT 0x04
+#define META_LOGICAL_SECTOR_SIZE_PRESENT 0x08
+#define META_PHYS_SECTOR_SIZE_PRESENT 0x10
+#define META_PARENT_LOCATOR_PRESENT 0x20
+
+#define META_ALL_PRESENT \
+ (META_FILE_PARAMETER_PRESENT | META_VIRTUAL_DISK_SIZE_PRESENT | \
+ META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \
+ META_PHYS_SECTOR_SIZE_PRESENT)
+
+typedef struct VHDXMetadataEntries {
+ VHDXMetadataTableEntry file_parameters_entry;
+ VHDXMetadataTableEntry virtual_disk_size_entry;
+ VHDXMetadataTableEntry page83_data_entry;
+ VHDXMetadataTableEntry logical_sector_size_entry;
+ VHDXMetadataTableEntry phys_sector_size_entry;
+ VHDXMetadataTableEntry parent_locator_entry;
+ uint16_t present;
+} VHDXMetadataEntries;
+
+
+typedef struct VHDXSectorInfo {
+ uint32_t bat_idx; /* BAT entry index */
+ uint32_t sectors_avail; /* sectors available in payload block */
+ uint32_t bytes_left; /* bytes left in the block after data to r/w */
+ uint32_t bytes_avail; /* bytes available in payload block */
+ uint64_t file_offset; /* absolute offset in bytes, in file */
+ uint64_t block_offset; /* block offset, in bytes */
+} VHDXSectorInfo;
+
+
+
+typedef struct BDRVVHDXState {
+ CoMutex lock;
+
+ int curr_header;
+ VHDXHeader *headers[2];
+
+ VHDXRegionTableHeader rt;
+ VHDXRegionTableEntry bat_rt; /* region table for the BAT */
+ VHDXRegionTableEntry metadata_rt; /* region table for the metadata */
+
+ VHDXMetadataTableHeader metadata_hdr;
+ VHDXMetadataEntries metadata_entries;
+
+ VHDXFileParameters params;
+ uint32_t block_size;
+ uint32_t block_size_bits;
+ uint32_t sectors_per_block;
+ uint32_t sectors_per_block_bits;
+
+ uint64_t virtual_disk_size;
+ uint32_t logical_sector_size;
+ uint32_t physical_sector_size;
+
+ uint64_t chunk_ratio;
+ uint32_t chunk_ratio_bits;
+ uint32_t logical_sector_size_bits;
+
+ uint32_t bat_entries;
+ VHDXBatEntry *bat;
+ uint64_t bat_offset;
+
+ VHDXParentLocatorHeader parent_header;
+ VHDXParentLocatorEntry *parent_entries;
+
+} BDRVVHDXState;
+
+uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size,
+ int crc_offset)
+{
+ uint32_t crc_new;
+ uint32_t crc_orig;
+ assert(buf != NULL);
+
+ if (crc_offset > 0) {
+ memcpy(&crc_orig, buf + crc_offset, sizeof(crc_orig));
+ memset(buf + crc_offset, 0, sizeof(crc_orig));
+ }
+
+ crc_new = crc32c(crc, buf, size);
+ if (crc_offset > 0) {
+ memcpy(buf + crc_offset, &crc_orig, sizeof(crc_orig));
+ }
+
+ return crc_new;
+}
+
+/* Validates the checksum of the buffer, with an in-place CRC.
+ *
+ * Zero is substituted during crc calculation for the original crc field,
+ * and the crc field is restored afterwards. But the buffer will be modifed
+ * during the calculation, so this may not be not suitable for multi-threaded
+ * use.
+ *
+ * crc_offset: byte offset in buf of the buffer crc
+ * buf: buffer pointer
+ * size: size of buffer (must be > crc_offset+4)
+ *
+ * returns true if checksum is valid, false otherwise
+ */
+bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset)
+{
+ uint32_t crc_orig;
+ uint32_t crc;
+
+ assert(buf != NULL);
+ assert(size > (crc_offset + 4));
+
+ memcpy(&crc_orig, buf + crc_offset, sizeof(crc_orig));
+ crc_orig = le32_to_cpu(crc_orig);
+
+ crc = vhdx_checksum_calc(0xffffffff, buf, size, crc_offset);
+
+ return crc == crc_orig;
+}
+
+
+/*
+ * Per the MS VHDX Specification, for every VHDX file:
+ * - The header section is fixed size - 1 MB
+ * - The header section is always the first "object"
+ * - The first 64KB of the header is the File Identifier
+ * - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile")
+ * - The following 512 bytes constitute a UTF-16 string identifiying the
+ * software that created the file, and is optional and diagnostic only.
+ *
+ * Therefore, we probe by looking for the vhdxfile signature "vhdxfile"
+ */
+static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+ if (buf_size >= 8 && !memcmp(buf, "vhdxfile", 8)) {
+ return 100;
+ }
+ return 0;
+}
+
+/* All VHDX structures on disk are little endian */
+static void vhdx_header_le_import(VHDXHeader *h)
+{
+ assert(h != NULL);
+
+ le32_to_cpus(&h->signature);
+ le32_to_cpus(&h->checksum);
+ le64_to_cpus(&h->sequence_number);
+
+ leguid_to_cpus(&h->file_write_guid);
+ leguid_to_cpus(&h->data_write_guid);
+ leguid_to_cpus(&h->log_guid);
+
+ le16_to_cpus(&h->log_version);
+ le16_to_cpus(&h->version);
+ le32_to_cpus(&h->log_length);
+ le64_to_cpus(&h->log_offset);
+}
+
+
+/* opens the specified header block from the VHDX file header section */
+static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s)
+{
+ int ret = 0;
+ VHDXHeader *header1;
+ VHDXHeader *header2;
+ bool h1_valid = false;
+ bool h2_valid = false;
+ uint64_t h1_seq = 0;
+ uint64_t h2_seq = 0;
+ uint8_t *buffer;
+
+ header1 = qemu_blockalign(bs, sizeof(VHDXHeader));
+ header2 = qemu_blockalign(bs, sizeof(VHDXHeader));
+
+ buffer = qemu_blockalign(bs, VHDX_HEADER_SIZE);
+
+ s->headers[0] = header1;
+ s->headers[1] = header2;
+
+ /* We have to read the whole VHDX_HEADER_SIZE instead of
+ * sizeof(VHDXHeader), because the checksum is over the whole
+ * region */
+ ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE);
+ if (ret < 0) {
+ goto fail;
+ }
+ /* copy over just the relevant portion that we need */
+ memcpy(header1, buffer, sizeof(VHDXHeader));
+ vhdx_header_le_import(header1);
+
+ if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
+ !memcmp(&header1->signature, "head", 4) &&
+ header1->version == 1) {
+ h1_seq = header1->sequence_number;
+ h1_valid = true;
+ }
+
+ ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
+ if (ret < 0) {
+ goto fail;
+ }
+ /* copy over just the relevant portion that we need */
+ memcpy(header2, buffer, sizeof(VHDXHeader));
+ vhdx_header_le_import(header2);
+
+ if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
+ !memcmp(&header2->signature, "head", 4) &&
+ header2->version == 1) {
+ h2_seq = header2->sequence_number;
+ h2_valid = true;
+ }
+
+ /* If there is only 1 valid header (or no valid headers), we
+ * don't care what the sequence numbers are */
+ if (h1_valid && !h2_valid) {
+ s->curr_header = 0;
+ } else if (!h1_valid && h2_valid) {
+ s->curr_header = 1;
+ } else if (!h1_valid && !h2_valid) {
+ ret = -EINVAL;
+ goto fail;
+ } else {
+ /* If both headers are valid, then we choose the active one by the
+ * highest sequence number. If the sequence numbers are equal, that is
+ * invalid */
+ if (h1_seq > h2_seq) {
+ s->curr_header = 0;
+ } else if (h2_seq > h1_seq) {
+ s->curr_header = 1;
+ } else {
+ ret = -EINVAL;
+ goto fail;
+ }
+ }
+
+ ret = 0;
+
+ goto exit;
+
+fail:
+ qerror_report(ERROR_CLASS_GENERIC_ERROR, "No valid VHDX header found");
+ qemu_vfree(header1);
+ qemu_vfree(header2);
+ s->headers[0] = NULL;
+ s->headers[1] = NULL;
+exit:
+ qemu_vfree(buffer);
+ return ret;
+}
+
+
+static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
+{
+ int ret = 0;
+ uint8_t *buffer;
+ int offset = 0;
+ VHDXRegionTableEntry rt_entry;
+ uint32_t i;
+ bool bat_rt_found = false;
+ bool metadata_rt_found = false;
+
+ /* We have to read the whole 64KB block, because the crc32 is over the
+ * whole block */
+ buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE);
+
+ ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer,
+ VHDX_HEADER_BLOCK_SIZE);
+ if (ret < 0) {
+ goto fail;
+ }
+ memcpy(&s->rt, buffer, sizeof(s->rt));
+ le32_to_cpus(&s->rt.signature);
+ le32_to_cpus(&s->rt.checksum);
+ le32_to_cpus(&s->rt.entry_count);
+ le32_to_cpus(&s->rt.reserved);
+ offset += sizeof(s->rt);
+
+ if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
+ memcmp(&s->rt.signature, "regi", 4)) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ /* Per spec, maximum region table entry count is 2047 */
+ if (s->rt.entry_count > 2047) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ for (i = 0; i < s->rt.entry_count; i++) {
+ memcpy(&rt_entry, buffer + offset, sizeof(rt_entry));
+ offset += sizeof(rt_entry);
+
+ leguid_to_cpus(&rt_entry.guid);
+ le64_to_cpus(&rt_entry.file_offset);
+ le32_to_cpus(&rt_entry.length);
+ le32_to_cpus(&rt_entry.data_bits);
+
+ /* see if we recognize the entry */
+ if (guid_eq(rt_entry.guid, bat_guid)) {
+ /* must be unique; if we have already found it this is invalid */
+ if (bat_rt_found) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ bat_rt_found = true;
+ s->bat_rt = rt_entry;
+ continue;
+ }
+
+ if (guid_eq(rt_entry.guid, metadata_guid)) {
+ /* must be unique; if we have already found it this is invalid */
+ if (metadata_rt_found) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ metadata_rt_found = true;
+ s->metadata_rt = rt_entry;
+ continue;
+ }
+
+ if (rt_entry.data_bits & VHDX_REGION_ENTRY_REQUIRED) {
+ /* cannot read vhdx file - required region table entry that
+ * we do not understand. per spec, we must fail to open */
+ ret = -ENOTSUP;
+ goto fail;
+ }
+ }
+ ret = 0;
+
+fail:
+ qemu_vfree(buffer);
+ return ret;
+}
+
+
+
+/* Metadata initial parser
+ *
+ * This loads all the metadata entry fields. This may cause additional
+ * fields to be processed (e.g. parent locator, etc..).
+ *
+ * There are 5 Metadata items that are always required:
+ * - File Parameters (block size, has a parent)
+ * - Virtual Disk Size (size, in bytes, of the virtual drive)
+ * - Page 83 Data (scsi page 83 guid)
+ * - Logical Sector Size (logical sector size in bytes, either 512 or
+ * 4096. We only support 512 currently)
+ * - Physical Sector Size (512 or 4096)
+ *
+ * Also, if the File Parameters indicate this is a differencing file,
+ * we must also look for the Parent Locator metadata item.
+ */
+static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
+{
+ int ret = 0;
+ uint8_t *buffer;
+ int offset = 0;
+ uint32_t i = 0;
+ VHDXMetadataTableEntry md_entry;
+
+ buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE);
+
+ ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer,
+ VHDX_METADATA_TABLE_MAX_SIZE);
+ if (ret < 0) {
+ goto exit;
+ }
+ memcpy(&s->metadata_hdr, buffer, sizeof(s->metadata_hdr));
+ offset += sizeof(s->metadata_hdr);
+
+ le64_to_cpus(&s->metadata_hdr.signature);
+ le16_to_cpus(&s->metadata_hdr.reserved);
+ le16_to_cpus(&s->metadata_hdr.entry_count);
+
+ if (memcmp(&s->metadata_hdr.signature, "metadata", 8)) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ s->metadata_entries.present = 0;
+
+ if ((s->metadata_hdr.entry_count * sizeof(md_entry)) >
+ (VHDX_METADATA_TABLE_MAX_SIZE - offset)) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ for (i = 0; i < s->metadata_hdr.entry_count; i++) {
+ memcpy(&md_entry, buffer + offset, sizeof(md_entry));
+ offset += sizeof(md_entry);
+
+ leguid_to_cpus(&md_entry.item_id);
+ le32_to_cpus(&md_entry.offset);
+ le32_to_cpus(&md_entry.length);
+ le32_to_cpus(&md_entry.data_bits);
+ le32_to_cpus(&md_entry.reserved2);
+
+ if (guid_eq(md_entry.item_id, file_param_guid)) {
+ if (s->metadata_entries.present & META_FILE_PARAMETER_PRESENT) {
+ ret = -EINVAL;
+ goto exit;
+ }
+ s->metadata_entries.file_parameters_entry = md_entry;
+ s->metadata_entries.present |= META_FILE_PARAMETER_PRESENT;
+ continue;
+ }
+
+ if (guid_eq(md_entry.item_id, virtual_size_guid)) {
+ if (s->metadata_entries.present & META_VIRTUAL_DISK_SIZE_PRESENT) {
+ ret = -EINVAL;
+ goto exit;
+ }
+ s->metadata_entries.virtual_disk_size_entry = md_entry;
+ s->metadata_entries.present |= META_VIRTUAL_DISK_SIZE_PRESENT;
+ continue;
+ }
+
+ if (guid_eq(md_entry.item_id, page83_guid)) {
+ if (s->metadata_entries.present & META_PAGE_83_PRESENT) {
+ ret = -EINVAL;
+ goto exit;
+ }
+ s->metadata_entries.page83_data_entry = md_entry;
+ s->metadata_entries.present |= META_PAGE_83_PRESENT;
+ continue;
+ }
+
+ if (guid_eq(md_entry.item_id, logical_sector_guid)) {
+ if (s->metadata_entries.present &
+ META_LOGICAL_SECTOR_SIZE_PRESENT) {
+ ret = -EINVAL;
+ goto exit;
+ }
+ s->metadata_entries.logical_sector_size_entry = md_entry;
+ s->metadata_entries.present |= META_LOGICAL_SECTOR_SIZE_PRESENT;
+ continue;
+ }
+
+ if (guid_eq(md_entry.item_id, phys_sector_guid)) {
+ if (s->metadata_entries.present & META_PHYS_SECTOR_SIZE_PRESENT) {
+ ret = -EINVAL;
+ goto exit;
+ }
+ s->metadata_entries.phys_sector_size_entry = md_entry;
+ s->metadata_entries.present |= META_PHYS_SECTOR_SIZE_PRESENT;
+ continue;
+ }
+
+ if (guid_eq(md_entry.item_id, parent_locator_guid)) {
+ if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) {
+ ret = -EINVAL;
+ goto exit;
+ }
+ s->metadata_entries.parent_locator_entry = md_entry;
+ s->metadata_entries.present |= META_PARENT_LOCATOR_PRESENT;
+ continue;
+ }
+
+ if (md_entry.data_bits & VHDX_META_FLAGS_IS_REQUIRED) {
+ /* cannot read vhdx file - required region table entry that
+ * we do not understand. per spec, we must fail to open */
+ ret = -ENOTSUP;
+ goto exit;
+ }
+ }
+
+ if (s->metadata_entries.present != META_ALL_PRESENT) {
+ ret = -ENOTSUP;
+ goto exit;
+ }
+
+ ret = bdrv_pread(bs->file,
+ s->metadata_entries.file_parameters_entry.offset
+ + s->metadata_rt.file_offset,
+ &s->params,
+ sizeof(s->params));
+
+ if (ret < 0) {
+ goto exit;
+ }
+
+ le32_to_cpus(&s->params.block_size);
+ le32_to_cpus(&s->params.data_bits);
+
+
+ /* We now have the file parameters, so we can tell if this is a
+ * differencing file (i.e.. has_parent), is dynamic or fixed
+ * sized (leave_blocks_allocated), and the block size */
+
+ /* The parent locator required iff the file parameters has_parent set */
+ if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
+ if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) {
+ /* TODO: parse parent locator fields */
+ ret = -ENOTSUP; /* temp, until differencing files are supported */
+ goto exit;
+ } else {
+ /* if has_parent is set, but there is not parent locator present,
+ * then that is an invalid combination */
+ ret = -EINVAL;
+ goto exit;
+ }
+ }
+
+ /* determine virtual disk size, logical sector size,
+ * and phys sector size */
+
+ ret = bdrv_pread(bs->file,
+ s->metadata_entries.virtual_disk_size_entry.offset
+ + s->metadata_rt.file_offset,
+ &s->virtual_disk_size,
+ sizeof(uint64_t));
+ if (ret < 0) {
+ goto exit;
+ }
+ ret = bdrv_pread(bs->file,
+ s->metadata_entries.logical_sector_size_entry.offset
+ + s->metadata_rt.file_offset,
+ &s->logical_sector_size,
+ sizeof(uint32_t));
+ if (ret < 0) {
+ goto exit;
+ }
+ ret = bdrv_pread(bs->file,
+ s->metadata_entries.phys_sector_size_entry.offset
+ + s->metadata_rt.file_offset,
+ &s->physical_sector_size,
+ sizeof(uint32_t));
+ if (ret < 0) {
+ goto exit;
+ }
+
+ le64_to_cpus(&s->virtual_disk_size);
+ le32_to_cpus(&s->logical_sector_size);
+ le32_to_cpus(&s->physical_sector_size);
+
+ if (s->logical_sector_size == 0 || s->params.block_size == 0) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ /* both block_size and sector_size are guaranteed powers of 2 */
+ s->sectors_per_block = s->params.block_size / s->logical_sector_size;
+ s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
+ (uint64_t)s->logical_sector_size /
+ (uint64_t)s->params.block_size;
+
+ /* These values are ones we will want to use for division / multiplication
+ * later on, and they are all guaranteed (per the spec) to be powers of 2,
+ * so we can take advantage of that for shift operations during
+ * reads/writes */
+ if (s->logical_sector_size & (s->logical_sector_size - 1)) {
+ ret = -EINVAL;
+ goto exit;
+ }
+ if (s->sectors_per_block & (s->sectors_per_block - 1)) {
+ ret = -EINVAL;
+ goto exit;
+ }
+ if (s->chunk_ratio & (s->chunk_ratio - 1)) {
+ ret = -EINVAL;
+ goto exit;
+ }
+ s->block_size = s->params.block_size;
+ if (s->block_size & (s->block_size - 1)) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ s->logical_sector_size_bits = 31 - clz32(s->logical_sector_size);
+ s->sectors_per_block_bits = 31 - clz32(s->sectors_per_block);
+ s->chunk_ratio_bits = 63 - clz64(s->chunk_ratio);
+ s->block_size_bits = 31 - clz32(s->block_size);
+
+ ret = 0;
+
+exit:
+ qemu_vfree(buffer);
+ return ret;
+}
+
+/* Parse the replay log. Per the VHDX spec, if the log is present
+ * it must be replayed prior to opening the file, even read-only.
+ *
+ * If read-only, we must replay the log in RAM (or refuse to open
+ * a dirty VHDX file read-only */
+static int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s)
+{
+ int ret = 0;
+ int i;
+ VHDXHeader *hdr;
+
+ hdr = s->headers[s->curr_header];
+
+ /* either the log guid, or log length is zero,
+ * then a replay log is present */
+ for (i = 0; i < sizeof(hdr->log_guid.data4); i++) {
+ ret |= hdr->log_guid.data4[i];
+ }
+ if (hdr->log_guid.data1 == 0 &&
+ hdr->log_guid.data2 == 0 &&
+ hdr->log_guid.data3 == 0 &&
+ ret == 0) {
+ goto exit;
+ }
+
+ /* per spec, only log version of 0 is supported */
+ if (hdr->log_version != 0) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ if (hdr->log_length == 0) {
+ goto exit;
+ }
+
+ /* We currently do not support images with logs to replay */
+ ret = -ENOTSUP;
+
+exit:
+ return ret;
+}
+
+
+static int vhdx_open(BlockDriverState *bs, QDict *options, int flags)
+{
+ BDRVVHDXState *s = bs->opaque;
+ int ret = 0;
+ uint32_t i;
+ uint64_t signature;
+ uint32_t data_blocks_cnt, bitmap_blocks_cnt;
+
+
+ s->bat = NULL;
+
+ qemu_co_mutex_init(&s->lock);
+
+ /* validate the file signature */
+ ret = bdrv_pread(bs->file, 0, &signature, sizeof(uint64_t));
+ if (ret < 0) {
+ goto fail;
+ }
+ if (memcmp(&signature, "vhdxfile", 8)) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ ret = vhdx_parse_header(bs, s);
+ if (ret) {
+ goto fail;
+ }
+
+ ret = vhdx_parse_log(bs, s);
+ if (ret) {
+ goto fail;
+ }
+
+ ret = vhdx_open_region_tables(bs, s);
+ if (ret) {
+ goto fail;
+ }
+
+ ret = vhdx_parse_metadata(bs, s);
+ if (ret) {
+ goto fail;
+ }
+ s->block_size = s->params.block_size;
+
+ /* the VHDX spec dictates that virtual_disk_size is always a multiple of
+ * logical_sector_size */
+ bs->total_sectors = s->virtual_disk_size >> s->logical_sector_size_bits;
+
+ data_blocks_cnt = s->virtual_disk_size >> s->block_size_bits;
+ if (s->virtual_disk_size - (data_blocks_cnt << s->block_size_bits)) {
+ data_blocks_cnt++;
+ }
+ bitmap_blocks_cnt = data_blocks_cnt >> s->chunk_ratio_bits;
+ if (data_blocks_cnt - (bitmap_blocks_cnt << s->chunk_ratio_bits)) {
+ bitmap_blocks_cnt++;
+ }
+
+ if (s->parent_entries) {
+ s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1);
+ } else {
+ s->bat_entries = data_blocks_cnt +
+ ((data_blocks_cnt - 1) >> s->chunk_ratio_bits);
+ }
+
+ s->bat_offset = s->bat_rt.file_offset;
+
+ if (s->bat_entries > s->bat_rt.length / sizeof(VHDXBatEntry)) {
+ /* BAT allocation is not large enough for all entries */
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ s->bat = qemu_blockalign(bs, s->bat_rt.length);
+
+ ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ for (i = 0; i < s->bat_entries; i++) {
+ le64_to_cpus(&s->bat[i]);
+ }
+
+ if (flags & BDRV_O_RDWR) {
+ ret = -ENOTSUP;
+ goto fail;
+ }
+
+ /* TODO: differencing files, write */
+
+ return 0;
+fail:
+ qemu_vfree(s->headers[0]);
+ qemu_vfree(s->headers[1]);
+ qemu_vfree(s->bat);
+ qemu_vfree(s->parent_entries);
+ return ret;
+}
+
+static int vhdx_reopen_prepare(BDRVReopenState *state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ return 0;
+}
+
+
+/*
+ * Perform sector to block offset translations, to get various
+ * sector and file offsets into the image. See VHDXSectorInfo
+ */
+static void vhdx_block_translate(BDRVVHDXState *s, int64_t sector_num,
+ int nb_sectors, VHDXSectorInfo *sinfo)
+{
+ uint32_t block_offset;
+
+ sinfo->bat_idx = sector_num >> s->sectors_per_block_bits;
+ /* effectively a modulo - this gives us the offset into the block
+ * (in sector sizes) for our sector number */
+ block_offset = sector_num - (sinfo->bat_idx << s->sectors_per_block_bits);
+ /* the chunk ratio gives us the interleaving of the sector
+ * bitmaps, so we need to advance our page block index by the
+ * sector bitmaps entry number */
+ sinfo->bat_idx += sinfo->bat_idx >> s->chunk_ratio_bits;
+
+ /* the number of sectors we can read/write in this cycle */
+ sinfo->sectors_avail = s->sectors_per_block - block_offset;
+
+ sinfo->bytes_left = sinfo->sectors_avail << s->logical_sector_size_bits;
+
+ if (sinfo->sectors_avail > nb_sectors) {
+ sinfo->sectors_avail = nb_sectors;
+ }
+
+ sinfo->bytes_avail = sinfo->sectors_avail << s->logical_sector_size_bits;
+
+ sinfo->file_offset = s->bat[sinfo->bat_idx] >> VHDX_BAT_FILE_OFF_BITS;
+
+ sinfo->block_offset = block_offset << s->logical_sector_size_bits;
+
+ /* The file offset must be past the header section, so must be > 0 */
+ if (sinfo->file_offset == 0) {
+ return;
+ }
+
+ /* block offset is the offset in vhdx logical sectors, in
+ * the payload data block. Convert that to a byte offset
+ * in the block, and add in the payload data block offset
+ * in the file, in bytes, to get the final read address */
+
+ sinfo->file_offset <<= 20; /* now in bytes, rather than 1MB units */
+ sinfo->file_offset += sinfo->block_offset;
+}
+
+
+
+static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ BDRVVHDXState *s = bs->opaque;
+ int ret = 0;
+ VHDXSectorInfo sinfo;
+ uint64_t bytes_done = 0;
+ QEMUIOVector hd_qiov;
+
+ qemu_iovec_init(&hd_qiov, qiov->niov);
+
+ qemu_co_mutex_lock(&s->lock);
+
+ while (nb_sectors > 0) {
+ /* We are a differencing file, so we need to inspect the sector bitmap
+ * to see if we have the data or not */
+ if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
+ /* not supported yet */
+ ret = -ENOTSUP;
+ goto exit;
+ } else {
+ vhdx_block_translate(s, sector_num, nb_sectors, &sinfo);
+
+ qemu_iovec_reset(&hd_qiov);
+ qemu_iovec_concat(&hd_qiov, qiov, bytes_done, sinfo.bytes_avail);
+
+ /* check the payload block state */
+ switch (s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK) {
+ case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */
+ case PAYLOAD_BLOCK_UNDEFINED: /* fall through */
+ case PAYLOAD_BLOCK_UNMAPPED: /* fall through */
+ case PAYLOAD_BLOCK_ZERO:
+ /* return zero */
+ qemu_iovec_memset(&hd_qiov, 0, 0, sinfo.bytes_avail);
+ break;
+ case PAYLOAD_BLOCK_FULL_PRESENT:
+ qemu_co_mutex_unlock(&s->lock);
+ ret = bdrv_co_readv(bs->file,
+ sinfo.file_offset >> BDRV_SECTOR_BITS,
+ sinfo.sectors_avail, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ goto exit;
+ }
+ break;
+ case PAYLOAD_BLOCK_PARTIALLY_PRESENT:
+ /* we don't yet support difference files, fall through
+ * to error */
+ default:
+ ret = -EIO;
+ goto exit;
+ break;
+ }
+ nb_sectors -= sinfo.sectors_avail;
+ sector_num += sinfo.sectors_avail;
+ bytes_done += sinfo.bytes_avail;
+ }
+ }
+ ret = 0;
+exit:
+ qemu_co_mutex_unlock(&s->lock);
+ qemu_iovec_destroy(&hd_qiov);
+ return ret;
+}
+
+
+
+static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ return -ENOTSUP;
+}
+
+
+static void vhdx_close(BlockDriverState *bs)
+{
+ BDRVVHDXState *s = bs->opaque;
+ qemu_vfree(s->headers[0]);
+ qemu_vfree(s->headers[1]);
+ qemu_vfree(s->bat);
+ qemu_vfree(s->parent_entries);
+}
+
+static BlockDriver bdrv_vhdx = {
+ .format_name = "vhdx",
+ .instance_size = sizeof(BDRVVHDXState),
+ .bdrv_probe = vhdx_probe,
+ .bdrv_open = vhdx_open,
+ .bdrv_close = vhdx_close,
+ .bdrv_reopen_prepare = vhdx_reopen_prepare,
+ .bdrv_co_readv = vhdx_co_readv,
+ .bdrv_co_writev = vhdx_co_writev,
+};
+
+static void bdrv_vhdx_init(void)
+{
+ bdrv_register(&bdrv_vhdx);
+}
+
+block_init(bdrv_vhdx_init);
diff --git a/block/vhdx.h b/block/vhdx.h
new file mode 100644
index 000000000..fb687ed2d
--- /dev/null
+++ b/block/vhdx.h
@@ -0,0 +1,325 @@
+/*
+ * Block driver for Hyper-V VHDX Images
+ *
+ * Copyright (c) 2013 Red Hat, Inc.,
+ *
+ * Authors:
+ * Jeff Cody <jcody@redhat.com>
+ *
+ * This is based on the "VHDX Format Specification v0.95", published 4/12/2012
+ * by Microsoft:
+ * https://www.microsoft.com/en-us/download/details.aspx?id=29681
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#ifndef BLOCK_VHDX_H
+#define BLOCK_VHDX_H
+
+/* Structures and fields present in the VHDX file */
+
+/* The header section has the following blocks,
+ * each block is 64KB:
+ *
+ * _____________________________________________________________________________
+ * | File Id. | Header 1 | Header 2 | Region Table | Reserved (768KB) |
+ * |----------|---------------|------------|--------------|--------------------|
+ * | | | | | |
+ * 0.........64KB...........128KB........192KB..........256KB................1MB
+ */
+
+#define VHDX_HEADER_BLOCK_SIZE (64*1024)
+
+#define VHDX_FILE_ID_OFFSET 0
+#define VHDX_HEADER1_OFFSET (VHDX_HEADER_BLOCK_SIZE*1)
+#define VHDX_HEADER2_OFFSET (VHDX_HEADER_BLOCK_SIZE*2)
+#define VHDX_REGION_TABLE_OFFSET (VHDX_HEADER_BLOCK_SIZE*3)
+
+
+/*
+ * A note on the use of MS-GUID fields. For more details on the GUID,
+ * please see: https://en.wikipedia.org/wiki/Globally_unique_identifier.
+ *
+ * The VHDX specification only states that these are MS GUIDs, and which
+ * bytes are data1-data4. It makes no mention of what algorithm should be used
+ * to generate the GUID, nor what standard. However, looking at the specified
+ * known GUID fields, it appears the GUIDs are:
+ * Standard/DCE GUID type (noted by 10b in the MSB of byte 0 of .data4)
+ * Random algorithm (noted by 0x4XXX for .data3)
+ */
+
+/* ---- HEADER SECTION STRUCTURES ---- */
+
+/* These structures are ones that are defined in the VHDX specification
+ * document */
+
+typedef struct VHDXFileIdentifier {
+ uint64_t signature; /* "vhdxfile" in ASCII */
+ uint16_t creator[256]; /* optional; utf-16 string to identify
+ the vhdx file creator. Diagnotistic
+ only */
+} VHDXFileIdentifier;
+
+
+/* the guid is a 16 byte unique ID - the definition for this used by
+ * Microsoft is not just 16 bytes though - it is a structure that is defined,
+ * so we need to follow it here so that endianness does not trip us up */
+
+typedef struct MSGUID {
+ uint32_t data1;
+ uint16_t data2;
+ uint16_t data3;
+ uint8_t data4[8];
+} MSGUID;
+
+#define guid_eq(a, b) \
+ (memcmp(&(a), &(b), sizeof(MSGUID)) == 0)
+
+#define VHDX_HEADER_SIZE (4*1024) /* although the vhdx_header struct in disk
+ is only 582 bytes, for purposes of crc
+ the header is the first 4KB of the 64KB
+ block */
+
+/* The full header is 4KB, although the actual header data is much smaller.
+ * But for the checksum calculation, it is over the entire 4KB structure,
+ * not just the defined portion of it */
+typedef struct QEMU_PACKED VHDXHeader {
+ uint32_t signature; /* "head" in ASCII */
+ uint32_t checksum; /* CRC-32C hash of the whole header */
+ uint64_t sequence_number; /* Seq number of this header. Each
+ VHDX file has 2 of these headers,
+ and only the header with the highest
+ sequence number is valid */
+ MSGUID file_write_guid; /* 128 bit unique identifier. Must be
+ updated to new, unique value before
+ the first modification is made to
+ file */
+ MSGUID data_write_guid; /* 128 bit unique identifier. Must be
+ updated to new, unique value before
+ the first modification is made to
+ visible data. Visbile data is
+ defined as:
+ - system & user metadata
+ - raw block data
+ - disk size
+ - any change that will
+ cause the virtual disk
+ sector read to differ
+
+ This does not need to change if
+ blocks are re-arranged */
+ MSGUID log_guid; /* 128 bit unique identifier. If zero,
+ there is no valid log. If non-zero,
+ log entries with this guid are
+ valid. */
+ uint16_t log_version; /* version of the log format. Mustn't be
+ zero, unless log_guid is also zero */
+ uint16_t version; /* version of th evhdx file. Currently,
+ only supported version is "1" */
+ uint32_t log_length; /* length of the log. Must be multiple
+ of 1MB */
+ uint64_t log_offset; /* byte offset in the file of the log.
+ Must also be a multiple of 1MB */
+} VHDXHeader;
+
+/* Header for the region table block */
+typedef struct QEMU_PACKED VHDXRegionTableHeader {
+ uint32_t signature; /* "regi" in ASCII */
+ uint32_t checksum; /* CRC-32C hash of the 64KB table */
+ uint32_t entry_count; /* number of valid entries */
+ uint32_t reserved;
+} VHDXRegionTableHeader;
+
+/* Individual region table entry. There may be a maximum of 2047 of these
+ *
+ * There are two known region table properties. Both are required.
+ * BAT (block allocation table): 2DC27766F62342009D64115E9BFD4A08
+ * Metadata: 8B7CA20647904B9AB8FE575F050F886E
+ */
+#define VHDX_REGION_ENTRY_REQUIRED 0x01 /* if set, parser must understand
+ this entry in order to open
+ file */
+typedef struct QEMU_PACKED VHDXRegionTableEntry {
+ MSGUID guid; /* 128-bit unique identifier */
+ uint64_t file_offset; /* offset of the object in the file.
+ Must be multiple of 1MB */
+ uint32_t length; /* length, in bytes, of the object */
+ uint32_t data_bits;
+} VHDXRegionTableEntry;
+
+
+/* ---- LOG ENTRY STRUCTURES ---- */
+#define VHDX_LOG_HDR_SIZE 64
+typedef struct QEMU_PACKED VHDXLogEntryHeader {
+ uint32_t signature; /* "loge" in ASCII */
+ uint32_t checksum; /* CRC-32C hash of the 64KB table */
+ uint32_t entry_length; /* length in bytes, multiple of 1MB */
+ uint32_t tail; /* byte offset of first log entry of a
+ seq, where this entry is the last
+ entry */
+ uint64_t sequence_number; /* incremented with each log entry.
+ May not be zero. */
+ uint32_t descriptor_count; /* number of descriptors in this log
+ entry, must be >= 0 */
+ uint32_t reserved;
+ MSGUID log_guid; /* value of the log_guid from
+ vhdx_header. If not found in
+ vhdx_header, it is invalid */
+ uint64_t flushed_file_offset; /* see spec for full details - this
+ should be vhdx file size in bytes */
+ uint64_t last_file_offset; /* size in bytes that all allocated
+ file structures fit into */
+} VHDXLogEntryHeader;
+
+#define VHDX_LOG_DESC_SIZE 32
+
+typedef struct QEMU_PACKED VHDXLogDescriptor {
+ uint32_t signature; /* "zero" or "desc" in ASCII */
+ union {
+ uint32_t reserved; /* zero desc */
+ uint32_t trailing_bytes; /* data desc: bytes 4092-4096 of the
+ data sector */
+ };
+ union {
+ uint64_t zero_length; /* zero desc: length of the section to
+ zero */
+ uint64_t leading_bytes; /* data desc: bytes 0-7 of the data
+ sector */
+ };
+ uint64_t file_offset; /* file offset to write zeros - multiple
+ of 4kB */
+ uint64_t sequence_number; /* must match same field in
+ vhdx_log_entry_header */
+} VHDXLogDescriptor;
+
+typedef struct QEMU_PACKED VHDXLogDataSector {
+ uint32_t data_signature; /* "data" in ASCII */
+ uint32_t sequence_high; /* 4 MSB of 8 byte sequence_number */
+ uint8_t data[4084]; /* raw data, bytes 8-4091 (inclusive).
+ see the data descriptor field for the
+ other mising bytes */
+ uint32_t sequence_low; /* 4 LSB of 8 byte sequence_number */
+} VHDXLogDataSector;
+
+
+
+/* block states - different state values depending on whether it is a
+ * payload block, or a sector block. */
+
+#define PAYLOAD_BLOCK_NOT_PRESENT 0
+#define PAYLOAD_BLOCK_UNDEFINED 1
+#define PAYLOAD_BLOCK_ZERO 2
+#define PAYLOAD_BLOCK_UNMAPPED 5
+#define PAYLOAD_BLOCK_FULL_PRESENT 6
+#define PAYLOAD_BLOCK_PARTIALLY_PRESENT 7
+
+#define SB_BLOCK_NOT_PRESENT 0
+#define SB_BLOCK_PRESENT 6
+
+/* per the spec */
+#define VHDX_MAX_SECTORS_PER_BLOCK (1<<23)
+
+/* upper 44 bits are the file offset in 1MB units lower 3 bits are the state
+ other bits are reserved */
+#define VHDX_BAT_STATE_BIT_MASK 0x07
+#define VHDX_BAT_FILE_OFF_BITS (64-44)
+typedef uint64_t VHDXBatEntry;
+
+/* ---- METADATA REGION STRUCTURES ---- */
+
+#define VHDX_METADATA_ENTRY_SIZE 32
+#define VHDX_METADATA_MAX_ENTRIES 2047 /* not including the header */
+#define VHDX_METADATA_TABLE_MAX_SIZE \
+ (VHDX_METADATA_ENTRY_SIZE * (VHDX_METADATA_MAX_ENTRIES+1))
+typedef struct QEMU_PACKED VHDXMetadataTableHeader {
+ uint64_t signature; /* "metadata" in ASCII */
+ uint16_t reserved;
+ uint16_t entry_count; /* number table entries. <= 2047 */
+ uint32_t reserved2[5];
+} VHDXMetadataTableHeader;
+
+#define VHDX_META_FLAGS_IS_USER 0x01 /* max 1024 entries */
+#define VHDX_META_FLAGS_IS_VIRTUAL_DISK 0x02 /* virtual disk metadata if set,
+ otherwise file metdata */
+#define VHDX_META_FLAGS_IS_REQUIRED 0x04 /* parse must understand this
+ entry to open the file */
+typedef struct QEMU_PACKED VHDXMetadataTableEntry {
+ MSGUID item_id; /* 128-bit identifier for metadata */
+ uint32_t offset; /* byte offset of the metadata. At
+ least 64kB. Relative to start of
+ metadata region */
+ /* note: if length = 0, so is offset */
+ uint32_t length; /* length of metadata. <= 1MB. */
+ uint32_t data_bits; /* least-significant 3 bits are flags, the
+ rest are reserved (see above) */
+ uint32_t reserved2;
+} VHDXMetadataTableEntry;
+
+#define VHDX_PARAMS_LEAVE_BLOCKS_ALLOCED 0x01 /* Do not change any blocks to
+ be BLOCK_NOT_PRESENT.
+ If set indicates a fixed
+ size VHDX file */
+#define VHDX_PARAMS_HAS_PARENT 0x02 /* has parent / backing file */
+typedef struct QEMU_PACKED VHDXFileParameters {
+ uint32_t block_size; /* size of each payload block, always
+ power of 2, <= 256MB and >= 1MB. */
+ uint32_t data_bits; /* least-significant 2 bits are flags, the rest
+ are reserved (see above) */
+} VHDXFileParameters;
+
+typedef struct QEMU_PACKED VHDXVirtualDiskSize {
+ uint64_t virtual_disk_size; /* Size of the virtual disk, in bytes.
+ Must be multiple of the sector size,
+ max of 64TB */
+} VHDXVirtualDiskSize;
+
+typedef struct QEMU_PACKED VHDXPage83Data {
+ MSGUID page_83_data[16]; /* unique id for scsi devices that
+ support page 0x83 */
+} VHDXPage83Data;
+
+typedef struct QEMU_PACKED VHDXVirtualDiskLogicalSectorSize {
+ uint32_t logical_sector_size; /* virtual disk sector size (in bytes).
+ Can only be 512 or 4096 bytes */
+} VHDXVirtualDiskLogicalSectorSize;
+
+typedef struct QEMU_PACKED VHDXVirtualDiskPhysicalSectorSize {
+ uint32_t physical_sector_size; /* physical sector size (in bytes).
+ Can only be 512 or 4096 bytes */
+} VHDXVirtualDiskPhysicalSectorSize;
+
+typedef struct QEMU_PACKED VHDXParentLocatorHeader {
+ MSGUID locator_type[16]; /* type of the parent virtual disk. */
+ uint16_t reserved;
+ uint16_t key_value_count; /* number of key/value pairs for this
+ locator */
+} VHDXParentLocatorHeader;
+
+/* key and value strings are UNICODE strings, UTF-16 LE encoding, no NULs */
+typedef struct QEMU_PACKED VHDXParentLocatorEntry {
+ uint32_t key_offset; /* offset in metadata for key, > 0 */
+ uint32_t value_offset; /* offset in metadata for value, >0 */
+ uint16_t key_length; /* length of entry key, > 0 */
+ uint16_t value_length; /* length of entry value, > 0 */
+} VHDXParentLocatorEntry;
+
+
+/* ----- END VHDX SPECIFICATION STRUCTURES ---- */
+
+
+uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size,
+ int crc_offset);
+
+bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset);
+
+
+static void leguid_to_cpus(MSGUID *guid)
+{
+ le32_to_cpus(&guid->data1);
+ le16_to_cpus(&guid->data2);
+ le16_to_cpus(&guid->data3);
+}
+
+#endif
diff --git a/block/vmdk.c b/block/vmdk.c
index 51398c0c0..346bb5cad 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -24,19 +24,33 @@
*/
#include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
-#include "migration.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "migration/migration.h"
#include <zlib.h>
#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
#define VMDK4_COMPRESSION_DEFLATE 1
+#define VMDK4_FLAG_NL_DETECT (1 << 0)
#define VMDK4_FLAG_RGD (1 << 1)
+/* Zeroed-grain enable bit */
+#define VMDK4_FLAG_ZERO_GRAIN (1 << 2)
#define VMDK4_FLAG_COMPRESS (1 << 16)
#define VMDK4_FLAG_MARKER (1 << 17)
#define VMDK4_GD_AT_END 0xffffffffffffffffULL
+#define VMDK_GTE_ZEROED 0x1
+
+/* VMDK internal error codes */
+#define VMDK_OK 0
+#define VMDK_ERROR (-1)
+/* Cluster not allocated */
+#define VMDK_UNALLOC (-2)
+#define VMDK_ZEROED (-3)
+
+#define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain"
+
typedef struct {
uint32_t version;
uint32_t flags;
@@ -48,19 +62,20 @@ typedef struct {
uint32_t cylinders;
uint32_t heads;
uint32_t sectors_per_track;
-} VMDK3Header;
+} QEMU_PACKED VMDK3Header;
typedef struct {
uint32_t version;
uint32_t flags;
- int64_t capacity;
- int64_t granularity;
- int64_t desc_offset;
- int64_t desc_size;
- int32_t num_gtes_per_gte;
- int64_t rgd_offset;
- int64_t gd_offset;
- int64_t grain_offset;
+ uint64_t capacity;
+ uint64_t granularity;
+ uint64_t desc_offset;
+ uint64_t desc_size;
+ /* Number of GrainTableEntries per GrainTable */
+ uint32_t num_gtes_per_gt;
+ uint64_t rgd_offset;
+ uint64_t gd_offset;
+ uint64_t grain_offset;
char filler[1];
char check_bytes[4];
uint16_t compressAlgorithm;
@@ -73,6 +88,8 @@ typedef struct VmdkExtent {
bool flat;
bool compressed;
bool has_marker;
+ bool has_zero_grain;
+ int version;
int64_t sectors;
int64_t end_sector;
int64_t flat_start_offset;
@@ -93,7 +110,7 @@ typedef struct VmdkExtent {
typedef struct BDRVVmdkState {
CoMutex lock;
- int desc_offset;
+ uint64_t desc_offset;
bool cid_updated;
uint32_t parent_cid;
int num_extents;
@@ -108,13 +125,14 @@ typedef struct VmdkMetaData {
unsigned int l2_index;
unsigned int l2_offset;
int valid;
+ uint32_t *l2_cache_entry;
} VmdkMetaData;
typedef struct VmdkGrainMarker {
uint64_t lba;
uint32_t size;
uint8_t data[0];
-} VmdkGrainMarker;
+} QEMU_PACKED VmdkGrainMarker;
enum {
MARKER_END_OF_STREAM = 0,
@@ -368,15 +386,22 @@ static int vmdk_parent_open(BlockDriverState *bs)
/* Create and append extent to the extent array. Return the added VmdkExtent
* address. return NULL if allocation failed. */
-static VmdkExtent *vmdk_add_extent(BlockDriverState *bs,
+static int vmdk_add_extent(BlockDriverState *bs,
BlockDriverState *file, bool flat, int64_t sectors,
int64_t l1_offset, int64_t l1_backup_offset,
uint32_t l1_size,
- int l2_size, unsigned int cluster_sectors)
+ int l2_size, uint64_t cluster_sectors,
+ VmdkExtent **new_extent)
{
VmdkExtent *extent;
BDRVVmdkState *s = bs->opaque;
+ if (cluster_sectors > 0x200000) {
+ /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
+ error_report("invalid granularity, image may be corrupt");
+ return -EINVAL;
+ }
+
s->extents = g_realloc(s->extents,
(s->num_extents + 1) * sizeof(VmdkExtent));
extent = &s->extents[s->num_extents];
@@ -399,7 +424,10 @@ static VmdkExtent *vmdk_add_extent(BlockDriverState *bs,
extent->end_sector = extent->sectors;
}
bs->total_sectors = extent->end_sector;
- return extent;
+ if (new_extent) {
+ *new_extent = extent;
+ }
+ return 0;
}
static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent)
@@ -458,12 +486,17 @@ static int vmdk_open_vmdk3(BlockDriverState *bs,
if (ret < 0) {
return ret;
}
- extent = vmdk_add_extent(bs,
+
+ ret = vmdk_add_extent(bs,
bs->file, false,
le32_to_cpu(header.disk_sectors),
le32_to_cpu(header.l1dir_offset) << 9,
0, 1 << 6, 1 << 9,
- le32_to_cpu(header.granularity));
+ le32_to_cpu(header.granularity),
+ &extent);
+ if (ret < 0) {
+ return ret;
+ }
ret = vmdk_init_tables(bs, extent);
if (ret) {
/* free extent allocated by vmdk_add_extent */
@@ -473,7 +506,7 @@ static int vmdk_open_vmdk3(BlockDriverState *bs,
}
static int vmdk_open_desc_file(BlockDriverState *bs, int flags,
- int64_t desc_offset);
+ uint64_t desc_offset);
static int vmdk_open_vmdk4(BlockDriverState *bs,
BlockDriverState *file,
@@ -490,8 +523,11 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
if (ret < 0) {
return ret;
}
- if (header.capacity == 0 && header.desc_offset) {
- return vmdk_open_desc_file(bs, flags, header.desc_offset << 9);
+ if (header.capacity == 0) {
+ uint64_t desc_offset = le64_to_cpu(header.desc_offset);
+ if (desc_offset) {
+ return vmdk_open_desc_file(bs, flags, desc_offset << 9);
+ }
}
if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
@@ -541,26 +577,54 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
header = footer.header;
}
- l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gte)
+ if (le32_to_cpu(header.version) >= 3) {
+ char buf[64];
+ snprintf(buf, sizeof(buf), "VMDK version %d",
+ le32_to_cpu(header.version));
+ qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+ bs->device_name, "vmdk", buf);
+ return -ENOTSUP;
+ }
+
+ if (le32_to_cpu(header.num_gtes_per_gt) > 512) {
+ error_report("L2 table size too big");
+ return -EINVAL;
+ }
+
+ l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gt)
* le64_to_cpu(header.granularity);
if (l1_entry_sectors == 0) {
return -EINVAL;
}
l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
/ l1_entry_sectors;
+ if (l1_size > 512 * 1024 * 1024) {
+ /* although with big capacity and small l1_entry_sectors, we can get a
+ * big l1_size, we don't want unbounded value to allocate the table.
+ * Limit it to 512M, which is 16PB for default cluster and L2 table
+ * size */
+ error_report("L1 size too big");
+ return -EFBIG;
+ }
if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
}
- extent = vmdk_add_extent(bs, file, false,
+ ret = vmdk_add_extent(bs, file, false,
le64_to_cpu(header.capacity),
le64_to_cpu(header.gd_offset) << 9,
l1_backup_offset,
l1_size,
- le32_to_cpu(header.num_gtes_per_gte),
- le64_to_cpu(header.granularity));
+ le32_to_cpu(header.num_gtes_per_gt),
+ le64_to_cpu(header.granularity),
+ &extent);
+ if (ret < 0) {
+ return ret;
+ }
extent->compressed =
le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
+ extent->version = le32_to_cpu(header.version);
+ extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN;
ret = vmdk_init_tables(bs, extent);
if (ret) {
/* free extent allocated by vmdk_add_extent */
@@ -578,22 +642,22 @@ static int vmdk_parse_description(const char *desc, const char *opt_name,
opt_pos = strstr(desc, opt_name);
if (!opt_pos) {
- return -1;
+ return VMDK_ERROR;
}
/* Skip "=\"" following opt_name */
opt_pos += strlen(opt_name) + 2;
if (opt_pos >= end) {
- return -1;
+ return VMDK_ERROR;
}
opt_end = opt_pos;
while (opt_end < end && *opt_end != '"') {
opt_end++;
}
if (opt_end == end || buf_size < opt_end - opt_pos + 1) {
- return -1;
+ return VMDK_ERROR;
}
pstrcpy(buf, opt_end - opt_pos + 1, opt_pos);
- return 0;
+ return VMDK_OK;
}
/* Open an extent file and append to bs array */
@@ -616,7 +680,7 @@ static int vmdk_open_sparse(BlockDriverState *bs,
return vmdk_open_vmdk4(bs, file, flags);
break;
default:
- return -EINVAL;
+ return -EMEDIUMTYPE;
break;
}
}
@@ -641,7 +705,7 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
* RW [size in sectors] SPARSE "file-name.vmdk"
*/
flat_offset = -1;
- ret = sscanf(p, "%10s %" SCNd64 " %10s %511s %" SCNd64,
+ ret = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64,
access, &sectors, type, fname, &flat_offset);
if (ret < 4 || strcmp(access, "RW")) {
goto next_line;
@@ -653,14 +717,6 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
return -EINVAL;
}
- /* trim the quotation marks around */
- if (fname[0] == '"') {
- memmove(fname, fname + 1, strlen(fname));
- if (strlen(fname) <= 1 || fname[strlen(fname) - 1] != '"') {
- return -EINVAL;
- }
- fname[strlen(fname) - 1] = '\0';
- }
if (sectors <= 0 ||
(strcmp(type, "FLAT") && strcmp(type, "SPARSE")) ||
(strcmp(access, "RW"))) {
@@ -669,7 +725,7 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
path_combine(extent_path, sizeof(extent_path),
desc_file_path, fname);
- ret = bdrv_file_open(&extent_file, extent_path, bs->open_flags);
+ ret = bdrv_file_open(&extent_file, extent_path, NULL, bs->open_flags);
if (ret) {
return ret;
}
@@ -679,8 +735,11 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
/* FLAT extent */
VmdkExtent *extent;
- extent = vmdk_add_extent(bs, extent_file, true, sectors,
- 0, 0, 0, 0, sectors);
+ ret = vmdk_add_extent(bs, extent_file, true, sectors,
+ 0, 0, 0, 0, sectors, &extent);
+ if (ret < 0) {
+ return ret;
+ }
extent->flat_start_offset = flat_offset << 9;
} else if (!strcmp(type, "SPARSE")) {
/* SPARSE extent */
@@ -705,33 +764,46 @@ next_line:
}
static int vmdk_open_desc_file(BlockDriverState *bs, int flags,
- int64_t desc_offset)
+ uint64_t desc_offset)
{
int ret;
- char buf[2048];
+ char *buf = NULL;
char ct[128];
BDRVVmdkState *s = bs->opaque;
+ int64_t size;
- ret = bdrv_pread(bs->file, desc_offset, buf, sizeof(buf));
+ size = bdrv_getlength(bs->file);
+ if (size < 0) {
+ return -EINVAL;
+ }
+
+ size = MIN(size, 1 << 20); /* avoid unbounded allocation */
+ buf = g_malloc0(size + 1);
+
+ ret = bdrv_pread(bs->file, desc_offset, buf, size);
if (ret < 0) {
- return ret;
+ goto exit;
}
- buf[2047] = '\0';
if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) {
- return -EINVAL;
+ ret = -EMEDIUMTYPE;
+ goto exit;
}
if (strcmp(ct, "monolithicFlat") &&
strcmp(ct, "twoGbMaxExtentSparse") &&
strcmp(ct, "twoGbMaxExtentFlat")) {
fprintf(stderr,
"VMDK: Not supported image type \"%s\""".\n", ct);
- return -ENOTSUP;
+ ret = -ENOTSUP;
+ goto exit;
}
s->desc_offset = 0;
- return vmdk_parse_extents(buf, bs, bs->file->filename);
+ ret = vmdk_parse_extents(buf, bs, bs->file->filename);
+exit:
+ g_free(buf);
+ return ret;
}
-static int vmdk_open(BlockDriverState *bs, int flags)
+static int vmdk_open(BlockDriverState *bs, QDict *options, int flags)
{
int ret;
BDRVVmdkState *s = bs->opaque;
@@ -771,16 +843,17 @@ static int get_whole_cluster(BlockDriverState *bs,
uint64_t offset,
bool allocate)
{
- /* 128 sectors * 512 bytes each = grain size 64KB */
- uint8_t whole_grain[extent->cluster_sectors * 512];
+ int ret = VMDK_OK;
+ uint8_t *whole_grain = NULL;
/* we will be here if it's first write on non-exist grain(cluster).
* try to read from parent image, if exist */
if (bs->backing_hd) {
- int ret;
-
+ whole_grain =
+ qemu_blockalign(bs, extent->cluster_sectors << BDRV_SECTOR_BITS);
if (!vmdk_is_cid_valid(bs)) {
- return -1;
+ ret = VMDK_ERROR;
+ goto exit;
}
/* floor offset to cluster */
@@ -788,30 +861,35 @@ static int get_whole_cluster(BlockDriverState *bs,
ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
extent->cluster_sectors);
if (ret < 0) {
- return -1;
+ ret = VMDK_ERROR;
+ goto exit;
}
/* Write grain only into the active image */
ret = bdrv_write(extent->file, cluster_offset, whole_grain,
extent->cluster_sectors);
if (ret < 0) {
- return -1;
+ ret = VMDK_ERROR;
+ goto exit;
}
}
- return 0;
+exit:
+ qemu_vfree(whole_grain);
+ return ret;
}
static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
{
+ uint32_t offset;
+ QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
+ offset = cpu_to_le32(m_data->offset);
/* update L2 table */
if (bdrv_pwrite_sync(
extent->file,
((int64_t)m_data->l2_offset * 512)
+ (m_data->l2_index * sizeof(m_data->offset)),
- &(m_data->offset),
- sizeof(m_data->offset)
- ) < 0) {
- return -1;
+ &offset, sizeof(offset)) < 0) {
+ return VMDK_ERROR;
}
/* update backup L2 table */
if (extent->l1_backup_table_offset != 0) {
@@ -820,13 +898,15 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
extent->file,
((int64_t)m_data->l2_offset * 512)
+ (m_data->l2_index * sizeof(m_data->offset)),
- &(m_data->offset), sizeof(m_data->offset)
- ) < 0) {
- return -1;
+ &offset, sizeof(offset)) < 0) {
+ return VMDK_ERROR;
}
}
+ if (m_data->l2_cache_entry) {
+ *m_data->l2_cache_entry = offset;
+ }
- return 0;
+ return VMDK_OK;
}
static int get_cluster_offset(BlockDriverState *bs,
@@ -838,24 +918,25 @@ static int get_cluster_offset(BlockDriverState *bs,
{
unsigned int l1_index, l2_offset, l2_index;
int min_index, i, j;
- uint32_t min_count, *l2_table, tmp = 0;
+ uint32_t min_count, *l2_table;
+ bool zeroed = false;
if (m_data) {
m_data->valid = 0;
}
if (extent->flat) {
*cluster_offset = extent->flat_start_offset;
- return 0;
+ return VMDK_OK;
}
offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
l1_index = (offset >> 9) / extent->l1_entry_sectors;
if (l1_index >= extent->l1_size) {
- return -1;
+ return VMDK_ERROR;
}
l2_offset = extent->l1_table[l1_index];
if (!l2_offset) {
- return -1;
+ return VMDK_UNALLOC;
}
for (i = 0; i < L2_CACHE_SIZE; i++) {
if (l2_offset == extent->l2_cache_offsets[i]) {
@@ -885,7 +966,7 @@ static int get_cluster_offset(BlockDriverState *bs,
l2_table,
extent->l2_size * sizeof(uint32_t)
) != extent->l2_size * sizeof(uint32_t)) {
- return -1;
+ return VMDK_ERROR;
}
extent->l2_cache_offsets[min_index] = l2_offset;
@@ -894,9 +975,21 @@ static int get_cluster_offset(BlockDriverState *bs,
l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
*cluster_offset = le32_to_cpu(l2_table[l2_index]);
- if (!*cluster_offset) {
+ if (m_data) {
+ m_data->valid = 1;
+ m_data->l1_index = l1_index;
+ m_data->l2_index = l2_index;
+ m_data->offset = *cluster_offset;
+ m_data->l2_offset = l2_offset;
+ m_data->l2_cache_entry = &l2_table[l2_index];
+ }
+ if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) {
+ zeroed = true;
+ }
+
+ if (!*cluster_offset || zeroed) {
if (!allocate) {
- return -1;
+ return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
}
/* Avoid the L2 tables update for the images that have snapshots. */
@@ -909,8 +1002,7 @@ static int get_cluster_offset(BlockDriverState *bs,
}
*cluster_offset >>= 9;
- tmp = cpu_to_le32(*cluster_offset);
- l2_table[l2_index] = tmp;
+ l2_table[l2_index] = cpu_to_le32(*cluster_offset);
/* First of all we write grain itself, to avoid race condition
* that may to corrupt the image.
@@ -919,19 +1011,15 @@ static int get_cluster_offset(BlockDriverState *bs,
*/
if (get_whole_cluster(
bs, extent, *cluster_offset, offset, allocate) == -1) {
- return -1;
+ return VMDK_ERROR;
}
if (m_data) {
- m_data->offset = tmp;
- m_data->l1_index = l1_index;
- m_data->l2_index = l2_index;
- m_data->l2_offset = l2_offset;
- m_data->valid = 1;
+ m_data->offset = *cluster_offset;
}
}
*cluster_offset <<= 9;
- return 0;
+ return VMDK_OK;
}
static VmdkExtent *find_extent(BDRVVmdkState *s,
@@ -967,8 +1055,8 @@ static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs,
ret = get_cluster_offset(bs, extent, NULL,
sector_num * 512, 0, &offset);
qemu_co_mutex_unlock(&s->lock);
- /* get_cluster_offset returning 0 means success */
- ret = !ret;
+
+ ret = (ret == VMDK_OK || ret == VMDK_ZEROED);
index_in_cluster = sector_num % extent->cluster_sectors;
n = extent->cluster_sectors - index_in_cluster;
@@ -1111,9 +1199,9 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
if (n > nb_sectors) {
n = nb_sectors;
}
- if (ret) {
+ if (ret != VMDK_OK) {
/* if not allocated, try to read from parent image, if exist */
- if (bs->backing_hd) {
+ if (bs->backing_hd && ret != VMDK_ZEROED) {
if (!vmdk_is_cid_valid(bs)) {
return -EINVAL;
}
@@ -1150,8 +1238,19 @@ static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num,
return ret;
}
+/**
+ * vmdk_write:
+ * @zeroed: buf is ignored (data is zero), use zeroed_grain GTE feature
+ * if possible, otherwise return -ENOTSUP.
+ * @zero_dry_run: used for zeroed == true only, don't update L2 table, just try
+ * with each cluster. By dry run we can find if the zero write
+ * is possible without modifying image data.
+ *
+ * Returns: error code with 0 for success.
+ */
static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
- const uint8_t *buf, int nb_sectors)
+ const uint8_t *buf, int nb_sectors,
+ bool zeroed, bool zero_dry_run)
{
BDRVVmdkState *s = bs->opaque;
VmdkExtent *extent = NULL;
@@ -1181,7 +1280,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
sector_num << 9, !extent->compressed,
&cluster_offset);
if (extent->compressed) {
- if (ret == 0) {
+ if (ret == VMDK_OK) {
/* Refuse write to allocated cluster for streamOptimized */
fprintf(stderr,
"VMDK: can't write to allocated cluster"
@@ -1197,7 +1296,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
&cluster_offset);
}
}
- if (ret) {
+ if (ret == VMDK_ERROR) {
return -EINVAL;
}
extent_begin_sector = extent->end_sector - extent->sectors;
@@ -1207,17 +1306,34 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
if (n > nb_sectors) {
n = nb_sectors;
}
-
- ret = vmdk_write_extent(extent,
- cluster_offset, index_in_cluster * 512,
- buf, n, sector_num);
- if (ret) {
- return ret;
- }
- if (m_data.valid) {
- /* update L2 tables */
- if (vmdk_L2update(extent, &m_data) == -1) {
- return -EIO;
+ if (zeroed) {
+ /* Do zeroed write, buf is ignored */
+ if (extent->has_zero_grain &&
+ index_in_cluster == 0 &&
+ n >= extent->cluster_sectors) {
+ n = extent->cluster_sectors;
+ if (!zero_dry_run) {
+ m_data.offset = VMDK_GTE_ZEROED;
+ /* update L2 tables */
+ if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
+ return -EIO;
+ }
+ }
+ } else {
+ return -ENOTSUP;
+ }
+ } else {
+ ret = vmdk_write_extent(extent,
+ cluster_offset, index_in_cluster * 512,
+ buf, n, sector_num);
+ if (ret) {
+ return ret;
+ }
+ if (m_data.valid) {
+ /* update L2 tables */
+ if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
+ return -EIO;
+ }
}
}
nb_sectors -= n;
@@ -1243,14 +1359,31 @@ static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num,
int ret;
BDRVVmdkState *s = bs->opaque;
qemu_co_mutex_lock(&s->lock);
- ret = vmdk_write(bs, sector_num, buf, nb_sectors);
+ ret = vmdk_write(bs, sector_num, buf, nb_sectors, false, false);
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+}
+
+static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors)
+{
+ int ret;
+ BDRVVmdkState *s = bs->opaque;
+ qemu_co_mutex_lock(&s->lock);
+ /* write zeroes could fail if sectors not aligned to cluster, test it with
+ * dry_run == true before really updating image */
+ ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true);
+ if (!ret) {
+ ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false);
+ }
qemu_co_mutex_unlock(&s->lock);
return ret;
}
static int vmdk_create_extent(const char *filename, int64_t filesize,
- bool flat, bool compress)
+ bool flat, bool compress, bool zeroed_grain)
{
int ret, i;
int fd = 0;
@@ -1272,18 +1405,19 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
}
magic = cpu_to_be32(VMDK4_MAGIC);
memset(&header, 0, sizeof(header));
- header.version = 1;
- header.flags =
- 3 | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0);
+ header.version = zeroed_grain ? 2 : 1;
+ header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
+ | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
+ | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
header.capacity = filesize / 512;
header.granularity = 128;
- header.num_gtes_per_gte = 512;
+ header.num_gtes_per_gt = 512;
grains = (filesize / 512 + header.granularity - 1) / header.granularity;
- gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
+ gt_size = ((header.num_gtes_per_gt * sizeof(uint32_t)) + 511) >> 9;
gt_count =
- (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte;
+ (grains + header.num_gtes_per_gt - 1) / header.num_gtes_per_gt;
gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
header.desc_offset = 1;
@@ -1299,7 +1433,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
header.flags = cpu_to_le32(header.flags);
header.capacity = cpu_to_le64(header.capacity);
header.granularity = cpu_to_le64(header.granularity);
- header.num_gtes_per_gte = cpu_to_le32(header.num_gtes_per_gte);
+ header.num_gtes_per_gt = cpu_to_le32(header.num_gtes_per_gt);
header.desc_offset = cpu_to_le64(header.desc_offset);
header.desc_size = cpu_to_le64(header.desc_size);
header.rgd_offset = cpu_to_le64(header.rgd_offset);
@@ -1365,7 +1499,7 @@ static int filename_decompose(const char *filename, char *path, char *prefix,
if (filename == NULL || !strlen(filename)) {
fprintf(stderr, "Vmdk: no filename provided.\n");
- return -1;
+ return VMDK_ERROR;
}
p = strrchr(filename, '/');
if (p == NULL) {
@@ -1377,7 +1511,7 @@ static int filename_decompose(const char *filename, char *path, char *prefix,
if (p != NULL) {
p++;
if (p - filename >= buf_len) {
- return -1;
+ return VMDK_ERROR;
}
pstrcpy(path, p - filename + 1, filename);
} else {
@@ -1390,51 +1524,12 @@ static int filename_decompose(const char *filename, char *path, char *prefix,
postfix[0] = '\0';
} else {
if (q - p >= buf_len) {
- return -1;
+ return VMDK_ERROR;
}
pstrcpy(prefix, q - p + 1, p);
pstrcpy(postfix, buf_len, q);
}
- return 0;
-}
-
-static int relative_path(char *dest, int dest_size,
- const char *base, const char *target)
-{
- int i = 0;
- int n = 0;
- const char *p, *q;
-#ifdef _WIN32
- const char *sep = "\\";
-#else
- const char *sep = "/";
-#endif
-
- if (!(dest && base && target)) {
- return -1;
- }
- if (path_is_absolute(target)) {
- pstrcpy(dest, dest_size, target);
- return 0;
- }
- while (base[i] == target[i]) {
- i++;
- }
- p = &base[i];
- q = &target[i];
- while (*p) {
- if (*p == *sep) {
- n++;
- }
- p++;
- }
- dest[0] = '\0';
- for (; n; n--) {
- pstrcat(dest, dest_size, "..");
- pstrcat(dest, dest_size, sep);
- }
- pstrcat(dest, dest_size, q);
- return 0;
+ return VMDK_OK;
}
static int vmdk_create(const char *filename, QEMUOptionParameter *options)
@@ -1442,6 +1537,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
int fd, idx = 0;
char desc[BUF_SIZE];
int64_t total_size = 0, filesize;
+ const char *adapter_type = NULL;
const char *backing_file = NULL;
const char *fmt = NULL;
int flags = 0;
@@ -1453,6 +1549,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
const char *desc_extent_line;
char parent_desc_line[BUF_SIZE] = "";
uint32_t parent_cid = 0xffffffff;
+ uint32_t number_heads = 16;
+ bool zeroed_grain = false;
const char desc_template[] =
"# Disk DescriptorFile\n"
"version=1\n"
@@ -1469,9 +1567,9 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
"\n"
"ddb.virtualHWVersion = \"%d\"\n"
"ddb.geometry.cylinders = \"%" PRId64 "\"\n"
- "ddb.geometry.heads = \"16\"\n"
+ "ddb.geometry.heads = \"%d\"\n"
"ddb.geometry.sectors = \"63\"\n"
- "ddb.adapterType = \"ide\"\n";
+ "ddb.adapterType = \"%s\"\n";
if (filename_decompose(filename, path, prefix, postfix, PATH_MAX)) {
return -EINVAL;
@@ -1480,15 +1578,33 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
while (options && options->name) {
if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
total_size = options->value.n;
+ } else if (!strcmp(options->name, BLOCK_OPT_ADAPTER_TYPE)) {
+ adapter_type = options->value.s;
} else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
backing_file = options->value.s;
} else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
flags |= options->value.n ? BLOCK_FLAG_COMPAT6 : 0;
} else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) {
fmt = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_ZEROED_GRAIN)) {
+ zeroed_grain |= options->value.n;
}
options++;
}
+ if (!adapter_type) {
+ adapter_type = "ide";
+ } else if (strcmp(adapter_type, "ide") &&
+ strcmp(adapter_type, "buslogic") &&
+ strcmp(adapter_type, "lsilogic") &&
+ strcmp(adapter_type, "legacyESX")) {
+ fprintf(stderr, "VMDK: Unknown adapter type: '%s'.\n", adapter_type);
+ return -EINVAL;
+ }
+ if (strcmp(adapter_type, "ide") != 0) {
+ /* that's the number of heads with which vmware operates when
+ creating, exporting, etc. vmdk files with a non-ide adapter type */
+ number_heads = 255;
+ }
if (!fmt) {
/* Default format to monolithicSparse */
fmt = "monolithicSparse";
@@ -1515,9 +1631,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
return -ENOTSUP;
}
if (backing_file) {
- char parent_filename[PATH_MAX];
BlockDriverState *bs = bdrv_new("");
- ret = bdrv_open(bs, backing_file, 0, NULL);
+ ret = bdrv_open(bs, backing_file, NULL, 0, NULL);
if (ret != 0) {
bdrv_delete(bs);
return ret;
@@ -1528,10 +1643,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
}
parent_cid = vmdk_read_cid(bs, 0);
bdrv_delete(bs);
- relative_path(parent_filename, sizeof(parent_filename),
- filename, backing_file);
snprintf(parent_desc_line, sizeof(parent_desc_line),
- "parentFileNameHint=\"%s\"", parent_filename);
+ "parentFileNameHint=\"%s\"", backing_file);
}
/* Create extents */
@@ -1558,7 +1671,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
snprintf(ext_filename, sizeof(ext_filename), "%s%s",
path, desc_filename);
- if (vmdk_create_extent(ext_filename, size, flat, compress)) {
+ if (vmdk_create_extent(ext_filename, size,
+ flat, compress, zeroed_grain)) {
return -EINVAL;
}
filesize -= size;
@@ -1576,7 +1690,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
parent_desc_line,
ext_desc_lines,
(flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
- total_size / (int64_t)(63 * 16 * 512));
+ total_size / (int64_t)(63 * number_heads * 512), number_heads,
+ adapter_type);
if (split || flat) {
fd = qemu_open(filename,
O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
@@ -1654,6 +1769,23 @@ static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs)
return ret;
}
+static int vmdk_has_zero_init(BlockDriverState *bs)
+{
+ int i;
+ BDRVVmdkState *s = bs->opaque;
+
+ /* If has a flat extent and its underlying storage doesn't have zero init,
+ * return 0. */
+ for (i = 0; i < s->num_extents; i++) {
+ if (s->extents[i].flat) {
+ if (!bdrv_has_zero_init(s->extents[i].file)) {
+ return 0;
+ }
+ }
+ }
+ return 1;
+}
+
static QEMUOptionParameter vmdk_create_options[] = {
{
.name = BLOCK_OPT_SIZE,
@@ -1661,6 +1793,12 @@ static QEMUOptionParameter vmdk_create_options[] = {
.help = "Virtual disk size"
},
{
+ .name = BLOCK_OPT_ADAPTER_TYPE,
+ .type = OPT_STRING,
+ .help = "Virtual adapter type, can be one of "
+ "ide (default), lsilogic, buslogic or legacyESX"
+ },
+ {
.name = BLOCK_OPT_BACKING_FILE,
.type = OPT_STRING,
.help = "File name of a base image"
@@ -1677,24 +1815,31 @@ static QEMUOptionParameter vmdk_create_options[] = {
"VMDK flat extent format, can be one of "
"{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} "
},
+ {
+ .name = BLOCK_OPT_ZEROED_GRAIN,
+ .type = OPT_FLAG,
+ .help = "Enable efficient zero writes using the zeroed-grain GTE feature"
+ },
{ NULL }
};
static BlockDriver bdrv_vmdk = {
- .format_name = "vmdk",
- .instance_size = sizeof(BDRVVmdkState),
- .bdrv_probe = vmdk_probe,
- .bdrv_open = vmdk_open,
- .bdrv_reopen_prepare = vmdk_reopen_prepare,
- .bdrv_read = vmdk_co_read,
- .bdrv_write = vmdk_co_write,
- .bdrv_close = vmdk_close,
- .bdrv_create = vmdk_create,
- .bdrv_co_flush_to_disk = vmdk_co_flush,
- .bdrv_co_is_allocated = vmdk_co_is_allocated,
- .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
-
- .create_options = vmdk_create_options,
+ .format_name = "vmdk",
+ .instance_size = sizeof(BDRVVmdkState),
+ .bdrv_probe = vmdk_probe,
+ .bdrv_open = vmdk_open,
+ .bdrv_reopen_prepare = vmdk_reopen_prepare,
+ .bdrv_read = vmdk_co_read,
+ .bdrv_write = vmdk_co_write,
+ .bdrv_co_write_zeroes = vmdk_co_write_zeroes,
+ .bdrv_close = vmdk_close,
+ .bdrv_create = vmdk_create,
+ .bdrv_co_flush_to_disk = vmdk_co_flush,
+ .bdrv_co_is_allocated = vmdk_co_is_allocated,
+ .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
+ .bdrv_has_zero_init = vmdk_has_zero_init,
+
+ .create_options = vmdk_create_options,
};
static void bdrv_vmdk_init(void)
diff --git a/block/vpc.c b/block/vpc.c
index b6bf52f14..fe4f311d5 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -23,9 +23,12 @@
* THE SOFTWARE.
*/
#include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
-#include "migration.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "migration/migration.h"
+#if defined(CONFIG_UUID)
+#include <uuid/uuid.h>
+#endif
/**************************************************************/
@@ -152,7 +155,7 @@ static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
return 0;
}
-static int vpc_open(BlockDriverState *bs, int flags)
+static int vpc_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVVPCState *s = bs->opaque;
int i;
@@ -160,24 +163,33 @@ static int vpc_open(BlockDriverState *bs, int flags)
struct vhd_dyndisk_header* dyndisk_header;
uint8_t buf[HEADER_SIZE];
uint32_t checksum;
- int err = -1;
int disk_type = VHD_DYNAMIC;
+ int ret;
- if (bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE) != HEADER_SIZE)
+ ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE);
+ if (ret < 0) {
goto fail;
+ }
footer = (struct vhd_footer*) s->footer_buf;
if (strncmp(footer->creator, "conectix", 8)) {
int64_t offset = bdrv_getlength(bs->file);
- if (offset < HEADER_SIZE) {
+ if (offset < 0) {
+ ret = offset;
+ goto fail;
+ } else if (offset < HEADER_SIZE) {
+ ret = -EINVAL;
goto fail;
}
+
/* If a fixed disk, the footer is found only at the end of the file */
- if (bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf, HEADER_SIZE)
- != HEADER_SIZE) {
+ ret = bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf,
+ HEADER_SIZE);
+ if (ret < 0) {
goto fail;
}
if (strncmp(footer->creator, "conectix", 8)) {
+ ret = -EMEDIUMTYPE;
goto fail;
}
disk_type = VHD_FIXED;
@@ -198,20 +210,23 @@ static int vpc_open(BlockDriverState *bs, int flags)
bs->total_sectors = (int64_t)
be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
- if (bs->total_sectors >= 65535 * 16 * 255) {
- err = -EFBIG;
+ /* Allow a maximum disk size of approximately 2 TB */
+ if (bs->total_sectors >= 65535LL * 255 * 255) {
+ ret = -EFBIG;
goto fail;
}
if (disk_type == VHD_DYNAMIC) {
- if (bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf,
- HEADER_SIZE) != HEADER_SIZE) {
+ ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf,
+ HEADER_SIZE);
+ if (ret < 0) {
goto fail;
}
dyndisk_header = (struct vhd_dyndisk_header *) buf;
if (strncmp(dyndisk_header->magic, "cxsparse", 8)) {
+ ret = -EINVAL;
goto fail;
}
@@ -222,8 +237,10 @@ static int vpc_open(BlockDriverState *bs, int flags)
s->pagetable = g_malloc(s->max_table_entries * 4);
s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
- if (bdrv_pread(bs->file, s->bat_offset, s->pagetable,
- s->max_table_entries * 4) != s->max_table_entries * 4) {
+
+ ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable,
+ s->max_table_entries * 4);
+ if (ret < 0) {
goto fail;
}
@@ -261,8 +278,13 @@ static int vpc_open(BlockDriverState *bs, int flags)
migrate_add_blocker(s->migration_blocker);
return 0;
- fail:
- return err;
+
+fail:
+ g_free(s->pagetable);
+#ifdef CACHE
+ g_free(s->pageentry_u8);
+#endif
+ return ret;
}
static int vpc_reopen_prepare(BDRVReopenState *state,
@@ -524,19 +546,27 @@ static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num,
* Note that the geometry doesn't always exactly match total_sectors but
* may round it down.
*
- * Returns 0 on success, -EFBIG if the size is larger than 127 GB
+ * Returns 0 on success, -EFBIG if the size is larger than ~2 TB. Override
+ * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
+ * and instead allow up to 255 heads.
*/
static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
uint8_t* heads, uint8_t* secs_per_cyl)
{
uint32_t cyls_times_heads;
- if (total_sectors > 65535 * 16 * 255)
+ /* Allow a maximum disk size of approximately 2 TB */
+ if (total_sectors > 65535LL * 255 * 255) {
return -EFBIG;
+ }
if (total_sectors > 65535 * 16 * 63) {
*secs_per_cyl = 255;
- *heads = 16;
+ if (total_sectors > 65535 * 16 * 255) {
+ *heads = 255;
+ } else {
+ *heads = 16;
+ }
cyls_times_heads = total_sectors / *secs_per_cyl;
} else {
*secs_per_cyl = 17;
@@ -739,7 +769,9 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options)
footer->type = be32_to_cpu(disk_type);
- /* TODO uuid is missing */
+#if defined(CONFIG_UUID)
+ uuid_generate(footer->uuid);
+#endif
footer->checksum = be32_to_cpu(vpc_checksum(buf, HEADER_SIZE));
@@ -754,6 +786,18 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options)
return ret;
}
+static int vpc_has_zero_init(BlockDriverState *bs)
+{
+ BDRVVPCState *s = bs->opaque;
+ struct vhd_footer *footer = (struct vhd_footer *) s->footer_buf;
+
+ if (cpu_to_be32(footer->type) == VHD_FIXED) {
+ return bdrv_has_zero_init(bs->file);
+ } else {
+ return 1;
+ }
+}
+
static void vpc_close(BlockDriverState *bs)
{
BDRVVPCState *s = bs->opaque;
@@ -786,16 +830,17 @@ static BlockDriver bdrv_vpc = {
.format_name = "vpc",
.instance_size = sizeof(BDRVVPCState),
- .bdrv_probe = vpc_probe,
- .bdrv_open = vpc_open,
- .bdrv_close = vpc_close,
- .bdrv_reopen_prepare = vpc_reopen_prepare,
- .bdrv_create = vpc_create,
+ .bdrv_probe = vpc_probe,
+ .bdrv_open = vpc_open,
+ .bdrv_close = vpc_close,
+ .bdrv_reopen_prepare = vpc_reopen_prepare,
+ .bdrv_create = vpc_create,
.bdrv_read = vpc_co_read,
.bdrv_write = vpc_co_write,
- .create_options = vpc_create_options,
+ .create_options = vpc_create_options,
+ .bdrv_has_zero_init = vpc_has_zero_init,
};
static void bdrv_vpc_init(void)
diff --git a/block/vvfat.c b/block/vvfat.c
index 59d3c5b8a..cd3b8edd9 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -1,4 +1,4 @@
-/* vim:set shiftwidth=4 ts=8: */
+/* vim:set shiftwidth=4 ts=4: */
/*
* QEMU Block driver for virtual VFAT (shadows a local directory)
*
@@ -25,9 +25,11 @@
#include <sys/stat.h>
#include <dirent.h>
#include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
-#include "migration.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "migration/migration.h"
+#include "qapi/qmp/qint.h"
+#include "qapi/qmp/qbool.h"
#ifndef S_IWGRP
#define S_IWGRP 0
@@ -529,13 +531,9 @@ static inline uint8_t fat_chksum(const direntry_t* entry)
/* if return_time==0, this returns the fat_date, else the fat_time */
static uint16_t fat_datetime(time_t time,int return_time) {
struct tm* t;
-#ifdef _WIN32
- t=localtime(&time); /* this is not thread safe */
-#else
struct tm t1;
t = &t1;
localtime_r(&time,t);
-#endif
if(return_time)
return cpu_to_le16((t->tm_sec/2)|(t->tm_min<<5)|(t->tm_hour<<11));
return cpu_to_le16((t->tm_mday)|((t->tm_mon+1)<<5)|((t->tm_year-80)<<9));
@@ -992,10 +990,90 @@ static void vvfat_rebind(BlockDriverState *bs)
s->bs = bs;
}
-static int vvfat_open(BlockDriverState *bs, const char* dirname, int flags)
+static QemuOptsList runtime_opts = {
+ .name = "vvfat",
+ .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+ .desc = {
+ {
+ .name = "dir",
+ .type = QEMU_OPT_STRING,
+ .help = "Host directory to map to the vvfat device",
+ },
+ {
+ .name = "fat-type",
+ .type = QEMU_OPT_NUMBER,
+ .help = "FAT type (12, 16 or 32)",
+ },
+ {
+ .name = "floppy",
+ .type = QEMU_OPT_BOOL,
+ .help = "Create a floppy rather than a hard disk image",
+ },
+ {
+ .name = "rw",
+ .type = QEMU_OPT_BOOL,
+ .help = "Make the image writable",
+ },
+ { /* end of list */ }
+ },
+};
+
+static void vvfat_parse_filename(const char *filename, QDict *options,
+ Error **errp)
+{
+ int fat_type = 0;
+ bool floppy = false;
+ bool rw = false;
+ int i;
+
+ if (!strstart(filename, "fat:", NULL)) {
+ error_setg(errp, "File name string must start with 'fat:'");
+ return;
+ }
+
+ /* Parse options */
+ if (strstr(filename, ":32:")) {
+ fat_type = 32;
+ } else if (strstr(filename, ":16:")) {
+ fat_type = 16;
+ } else if (strstr(filename, ":12:")) {
+ fat_type = 12;
+ }
+
+ if (strstr(filename, ":floppy:")) {
+ floppy = true;
+ }
+
+ if (strstr(filename, ":rw:")) {
+ rw = true;
+ }
+
+ /* Get the directory name without options */
+ i = strrchr(filename, ':') - filename;
+ assert(i >= 3);
+ if (filename[i - 2] == ':' && qemu_isalpha(filename[i - 1])) {
+ /* workaround for DOS drive names */
+ filename += i - 1;
+ } else {
+ filename += i + 1;
+ }
+
+ /* Fill in the options QDict */
+ qdict_put(options, "dir", qstring_from_str(filename));
+ qdict_put(options, "fat-type", qint_from_int(fat_type));
+ qdict_put(options, "floppy", qbool_from_int(floppy));
+ qdict_put(options, "rw", qbool_from_int(rw));
+}
+
+static int vvfat_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVVVFATState *s = bs->opaque;
- int i, cyls, heads, secs;
+ int cyls, heads, secs;
+ bool floppy;
+ const char *dirname;
+ QemuOpts *opts;
+ Error *local_err = NULL;
+ int ret;
#ifdef DEBUG
vvv = s;
@@ -1006,6 +1084,65 @@ DLOG(if (stderr == NULL) {
setbuf(stderr, NULL);
})
+ opts = qemu_opts_create_nofail(&runtime_opts);
+ qemu_opts_absorb_qdict(opts, options, &local_err);
+ if (error_is_set(&local_err)) {
+ qerror_report_err(local_err);
+ error_free(local_err);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ dirname = qemu_opt_get(opts, "dir");
+ if (!dirname) {
+ qerror_report(ERROR_CLASS_GENERIC_ERROR, "vvfat block driver requires "
+ "a 'dir' option");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ s->fat_type = qemu_opt_get_number(opts, "fat-type", 0);
+ floppy = qemu_opt_get_bool(opts, "floppy", false);
+
+ if (floppy) {
+ /* 1.44MB or 2.88MB floppy. 2.88MB can be FAT12 (default) or FAT16. */
+ if (!s->fat_type) {
+ s->fat_type = 12;
+ secs = 36;
+ s->sectors_per_cluster = 2;
+ } else {
+ secs = s->fat_type == 12 ? 18 : 36;
+ s->sectors_per_cluster = 1;
+ }
+ s->first_sectors_number = 1;
+ cyls = 80;
+ heads = 2;
+ } else {
+ /* 32MB or 504MB disk*/
+ if (!s->fat_type) {
+ s->fat_type = 16;
+ }
+ cyls = s->fat_type == 12 ? 64 : 1024;
+ heads = 16;
+ secs = 63;
+ }
+
+ switch (s->fat_type) {
+ case 32:
+ fprintf(stderr, "Big fat greek warning: FAT32 has not been tested. "
+ "You are welcome to do so!\n");
+ break;
+ case 16:
+ case 12:
+ break;
+ default:
+ qerror_report(ERROR_CLASS_GENERIC_ERROR, "Valid FAT types are only "
+ "12, 16 and 32");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+
s->bs = bs;
/* LATER TODO: if FAT32, adjust */
@@ -1021,63 +1158,24 @@ DLOG(if (stderr == NULL) {
s->fat2 = NULL;
s->downcase_short_names = 1;
- if (!strstart(dirname, "fat:", NULL))
- return -1;
-
- if (strstr(dirname, ":32:")) {
- fprintf(stderr, "Big fat greek warning: FAT32 has not been tested. You are welcome to do so!\n");
- s->fat_type = 32;
- } else if (strstr(dirname, ":16:")) {
- s->fat_type = 16;
- } else if (strstr(dirname, ":12:")) {
- s->fat_type = 12;
- }
-
- if (strstr(dirname, ":floppy:")) {
- /* 1.44MB or 2.88MB floppy. 2.88MB can be FAT12 (default) or FAT16. */
- if (!s->fat_type) {
- s->fat_type = 12;
- secs = 36;
- s->sectors_per_cluster=2;
- } else {
- secs = s->fat_type == 12 ? 18 : 36;
- s->sectors_per_cluster=1;
- }
- s->first_sectors_number = 1;
- cyls = 80;
- heads = 2;
- } else {
- /* 32MB or 504MB disk*/
- if (!s->fat_type) {
- s->fat_type = 16;
- }
- cyls = s->fat_type == 12 ? 64 : 1024;
- heads = 16;
- secs = 63;
- }
fprintf(stderr, "vvfat %s chs %d,%d,%d\n",
dirname, cyls, heads, secs);
s->sector_count = cyls * heads * secs - (s->first_sectors_number - 1);
- if (strstr(dirname, ":rw:")) {
- if (enable_write_target(s))
- return -1;
- bs->read_only = 0;
+ if (qemu_opt_get_bool(opts, "rw", false)) {
+ ret = enable_write_target(s);
+ if (ret < 0) {
+ goto fail;
+ }
+ bs->read_only = 0;
}
- i = strrchr(dirname, ':') - dirname;
- assert(i >= 3);
- if (dirname[i-2] == ':' && qemu_isalpha(dirname[i-1]))
- /* workaround for DOS drive names */
- dirname += i-1;
- else
- dirname += i+1;
-
bs->total_sectors = cyls * heads * secs;
if (init_directories(s, dirname, heads, secs)) {
- return -1;
+ ret = -EIO;
+ goto fail;
}
s->sector_count = s->faked_sectors + s->sectors_per_cluster*s->cluster_count;
@@ -1097,7 +1195,10 @@ DLOG(if (stderr == NULL) {
migrate_add_blocker(s->migration_blocker);
}
- return 0;
+ ret = 0;
+fail:
+ qemu_opts_del(opts);
+ return ret;
}
static inline void vvfat_close_current_file(BDRVVVFATState *s)
@@ -2816,9 +2917,7 @@ static int enable_write_target(BDRVVVFATState *s)
s->qcow_filename = g_malloc(1024);
ret = get_tmp_filename(s->qcow_filename, 1024);
if (ret < 0) {
- g_free(s->qcow_filename);
- s->qcow_filename = NULL;
- return ret;
+ goto err;
}
bdrv_qcow = bdrv_find_format("qcow");
@@ -2826,18 +2925,18 @@ static int enable_write_target(BDRVVVFATState *s)
set_option_parameter_int(options, BLOCK_OPT_SIZE, s->sector_count * 512);
set_option_parameter(options, BLOCK_OPT_BACKING_FILE, "fat:");
- if (bdrv_create(bdrv_qcow, s->qcow_filename, options) < 0)
- return -1;
+ ret = bdrv_create(bdrv_qcow, s->qcow_filename, options);
+ if (ret < 0) {
+ goto err;
+ }
s->qcow = bdrv_new("");
- if (s->qcow == NULL) {
- return -1;
- }
- ret = bdrv_open(s->qcow, s->qcow_filename,
+ ret = bdrv_open(s->qcow, s->qcow_filename, NULL,
BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow);
if (ret < 0) {
- return ret;
+ bdrv_delete(s->qcow);
+ goto err;
}
#ifndef _WIN32
@@ -2850,6 +2949,11 @@ static int enable_write_target(BDRVVVFATState *s)
*(void**)s->bs->backing_hd->opaque = s;
return 0;
+
+err:
+ g_free(s->qcow_filename);
+ s->qcow_filename = NULL;
+ return ret;
}
static void vvfat_close(BlockDriverState *bs)
@@ -2869,15 +2973,18 @@ static void vvfat_close(BlockDriverState *bs)
}
static BlockDriver bdrv_vvfat = {
- .format_name = "vvfat",
- .instance_size = sizeof(BDRVVVFATState),
- .bdrv_file_open = vvfat_open,
- .bdrv_rebind = vvfat_rebind,
- .bdrv_read = vvfat_co_read,
- .bdrv_write = vvfat_co_write,
- .bdrv_close = vvfat_close,
- .bdrv_co_is_allocated = vvfat_co_is_allocated,
- .protocol_name = "fat",
+ .format_name = "vvfat",
+ .protocol_name = "fat",
+ .instance_size = sizeof(BDRVVVFATState),
+
+ .bdrv_parse_filename = vvfat_parse_filename,
+ .bdrv_file_open = vvfat_open,
+ .bdrv_close = vvfat_close,
+ .bdrv_rebind = vvfat_rebind,
+
+ .bdrv_read = vvfat_co_read,
+ .bdrv_write = vvfat_co_write,
+ .bdrv_co_is_allocated = vvfat_co_is_allocated,
};
static void bdrv_vvfat_init(void)
diff --git a/block/win32-aio.c b/block/win32-aio.c
index 4704ee06c..fcb7c754d 100644
--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@@ -22,13 +22,13 @@
* THE SOFTWARE.
*/
#include "qemu-common.h"
-#include "qemu-timer.h"
-#include "block_int.h"
-#include "module.h"
-#include "qemu-common.h"
-#include "qemu-aio.h"
+#include "qemu/timer.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "block/aio.h"
#include "raw-aio.h"
-#include "event_notifier.h"
+#include "qemu/event_notifier.h"
+#include "qemu/iov.h"
#include <windows.h>
#include <winioctl.h>
@@ -80,15 +80,9 @@ static void win32_aio_process_completion(QEMUWin32AIOState *s,
if (!waiocb->is_linear) {
if (ret == 0 && waiocb->is_read) {
QEMUIOVector *qiov = waiocb->qiov;
- char *p = waiocb->buf;
- int i;
-
- for (i = 0; i < qiov->niov; ++i) {
- memcpy(p, qiov->iov[i].iov_base, qiov->iov[i].iov_len);
- p += qiov->iov[i].iov_len;
- }
- g_free(waiocb->buf);
+ iov_from_buf(qiov->iov, qiov->niov, 0, waiocb->buf, qiov->size);
}
+ qemu_vfree(waiocb->buf);
}
@@ -153,13 +147,7 @@ BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,
if (qiov->niov > 1) {
waiocb->buf = qemu_blockalign(bs, qiov->size);
if (type & QEMU_AIO_WRITE) {
- char *p = waiocb->buf;
- int i;
-
- for (i = 0; i < qiov->niov; ++i) {
- memcpy(p, qiov->iov[i].iov_base, qiov->iov[i].iov_len);
- p += qiov->iov[i].iov_len;
- }
+ iov_to_buf(qiov->iov, qiov->niov, 0, waiocb->buf, qiov->size);
}
waiocb->is_linear = false;
} else {