43 files changed, 7821 insertions, 1629 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs
index 7f015105b..4cf9aa499 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -2,7 +2,9 @@ block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat
 block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
 block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-obj-y += qed-check.o
+block-obj-y += vhdx.o
 block-obj-y += parallels.o blkdebug.o blkverify.o
+block-obj-y += snapshot.o qapi.o
 block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
 block-obj-$(CONFIG_POSIX) += raw-posix.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
@@ -13,8 +15,12 @@ block-obj-$(CONFIG_LIBISCSI) += iscsi.o
 block-obj-$(CONFIG_CURL) += curl.o
 block-obj-$(CONFIG_RBD) += rbd.o
 block-obj-$(CONFIG_GLUSTERFS) += gluster.o
+block-obj-$(CONFIG_LIBSSH2) += ssh.o
 endif
 
 common-obj-y += stream.o
 common-obj-y += commit.o
 common-obj-y += mirror.o
+common-obj-y += backup.o
+
+$(obj)/curl.o: QEMU_CFLAGS+=$(CURL_CFLAGS)
diff --git a/block/backup.c b/block/backup.c
new file mode 100644
index 000000000..6ae8a05a3
--- /dev/null
+++ b/block/backup.c
@@ -0,0 +1,386 @@
+/*
+ * QEMU backup
+ *
+ * Copyright (C) 2013 Proxmox Server Solutions
+ *
+ * Authors:
+ *  Dietmar Maurer (dietmar@proxmox.com)
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include "trace.h"
+#include "block/block.h"
+#include "block/block_int.h"
+#include "block/blockjob.h"
+#include "qemu/ratelimit.h"
+
+#define BACKUP_CLUSTER_BITS 16
+#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS)
+#define BACKUP_SECTORS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE)
+
+#define SLICE_TIME 100000000ULL /* ns */
+
+typedef struct CowRequest {
+    int64_t start;
+    int64_t end;
+    QLIST_ENTRY(CowRequest) list;
+    CoQueue wait_queue; /* coroutines blocked on this request */
+} CowRequest;
+
+typedef struct BackupBlockJob {
+    BlockJob common;
+    BlockDriverState *target;
+    MirrorSyncMode sync_mode;
+    RateLimit limit;
+    BlockdevOnError on_source_error;
+    BlockdevOnError on_target_error;
+    CoRwlock flush_rwlock;
+    uint64_t sectors_read;
+    HBitmap *bitmap;
+    QLIST_HEAD(, CowRequest) inflight_reqs;
+} BackupBlockJob;
+
+/* See if in-flight requests overlap and wait for them to complete */
+static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
+                                                       int64_t start,
+                                                       int64_t end)
+{
+    CowRequest *req;
+    bool retry;
+
+    do {
+        retry = false;
+        QLIST_FOREACH(req, &job->inflight_reqs, list) {
+            if (end > req->start && start < req->end) {
+                qemu_co_queue_wait(&req->wait_queue);
+                retry = true;
+                break;
+            }
+        }
+    } while (retry);
+}
+
+/* Keep track of an in-flight request */
+static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
+                                     int64_t start, int64_t end)
+{
+    req->start = start;
+    req->end = end;
+    qemu_co_queue_init(&req->wait_queue);
+    QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
+}
+
+/* Forget about a completed request */
+static void cow_request_end(CowRequest *req)
+{
+    QLIST_REMOVE(req, list);
+    qemu_co_queue_restart_all(&req->wait_queue);
+}
+
+static int coroutine_fn backup_do_cow(BlockDriverState *bs,
+                                      int64_t sector_num, int nb_sectors,
+                                      bool *error_is_read)
+{
+    BackupBlockJob *job = (BackupBlockJob *)bs->job;
+    CowRequest cow_request;
+    struct iovec iov;
+    QEMUIOVector bounce_qiov;
+    void *bounce_buffer = NULL;
+    int ret = 0;
+    int64_t start, end;
+    int n;
+
+    qemu_co_rwlock_rdlock(&job->flush_rwlock);
+
+    start = sector_num / BACKUP_SECTORS_PER_CLUSTER;
+    end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_SECTORS_PER_CLUSTER);
+
+    trace_backup_do_cow_enter(job, start, sector_num, nb_sectors);
+
+    wait_for_overlapping_requests(job, start, end);
+    cow_request_begin(&cow_request, job, start, end);
+
+    for (; start < end; start++) {
+        if (hbitmap_get(job->bitmap, start)) {
+            trace_backup_do_cow_skip(job, start);
+            continue; /* already copied */
+        }
+
+        trace_backup_do_cow_process(job, start);
+
+        n = MIN(BACKUP_SECTORS_PER_CLUSTER,
+                job->common.len / BDRV_SECTOR_SIZE -
+                start * BACKUP_SECTORS_PER_CLUSTER);
+
+        if (!bounce_buffer) {
+            bounce_buffer = qemu_blockalign(bs, BACKUP_CLUSTER_SIZE);
+        }
+        iov.iov_base = bounce_buffer;
+        iov.iov_len = n * BDRV_SECTOR_SIZE;
+        qemu_iovec_init_external(&bounce_qiov, &iov, 1);
+
+        ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
+                            &bounce_qiov);
+        if (ret < 0) {
+            trace_backup_do_cow_read_fail(job, start, ret);
+            if (error_is_read) {
+                *error_is_read = true;
+            }
+            goto out;
+        }
+
+        if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
+            ret = bdrv_co_write_zeroes(job->target,
+                                       start * BACKUP_SECTORS_PER_CLUSTER, n);
+        } else {
+            ret = bdrv_co_writev(job->target,
+                                 start * BACKUP_SECTORS_PER_CLUSTER, n,
+                                 &bounce_qiov);
+        }
+        if (ret < 0) {
+            trace_backup_do_cow_write_fail(job, start, ret);
+            if (error_is_read) {
+                *error_is_read = false;
+            }
+            goto out;
+        }
+
+        hbitmap_set(job->bitmap, start, 1);
+
+        /* Publish progress, guest I/O counts as progress too.  Note that the
+         * offset field is an opaque progress value, it is not a disk offset.
+         */
+        job->sectors_read += n;
+        job->common.offset += n * BDRV_SECTOR_SIZE;
+    }
+
+out:
+    if (bounce_buffer) {
+        qemu_vfree(bounce_buffer);
+    }
+
+    cow_request_end(&cow_request);
+
+    trace_backup_do_cow_return(job, sector_num, nb_sectors, ret);
+
+    qemu_co_rwlock_unlock(&job->flush_rwlock);
+
+    return ret;
+}
+
+static int coroutine_fn backup_before_write_notify(
+        NotifierWithReturn *notifier,
+        void *opaque)
+{
+    BdrvTrackedRequest *req = opaque;
+
+    return backup_do_cow(req->bs, req->sector_num, req->nb_sectors, NULL);
+}
+
+static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+
+    if (speed < 0) {
+        error_set(errp, QERR_INVALID_PARAMETER, "speed");
+        return;
+    }
+    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
+}
+
+static void backup_iostatus_reset(BlockJob *job)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+
+    bdrv_iostatus_reset(s->target);
+}
+
+static const BlockJobType backup_job_type = {
+    .instance_size  = sizeof(BackupBlockJob),
+    .job_type       = "backup",
+    .set_speed      = backup_set_speed,
+    .iostatus_reset = backup_iostatus_reset,
+};
+
+static BlockErrorAction backup_error_action(BackupBlockJob *job,
+                                            bool read, int error)
+{
+    if (read) {
+        return block_job_error_action(&job->common, job->common.bs,
+                                      job->on_source_error, true, error);
+    } else {
+        return block_job_error_action(&job->common, job->target,
+                                      job->on_target_error, false, error);
+    }
+}
+
+static void coroutine_fn backup_run(void *opaque)
+{
+    BackupBlockJob *job = opaque;
+    BlockDriverState *bs = job->common.bs;
+    BlockDriverState *target = job->target;
+    BlockdevOnError on_target_error = job->on_target_error;
+    NotifierWithReturn before_write = {
+        .notify = backup_before_write_notify,
+    };
+    int64_t start, end;
+    int ret = 0;
+
+    QLIST_INIT(&job->inflight_reqs);
+    qemu_co_rwlock_init(&job->flush_rwlock);
+
+    start = 0;
+    end = DIV_ROUND_UP(job->common.len / BDRV_SECTOR_SIZE,
+                       BACKUP_SECTORS_PER_CLUSTER);
+
+    job->bitmap = hbitmap_alloc(end, 0);
+
+    bdrv_set_enable_write_cache(target, true);
+    bdrv_set_on_error(target, on_target_error, on_target_error);
+    bdrv_iostatus_enable(target);
+
+    bdrv_add_before_write_notifier(bs, &before_write);
+
+    if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
+        while (!block_job_is_cancelled(&job->common)) {
+            /* Yield until the job is cancelled.  We just let our before_write
+             * notify callback service CoW requests. */
+            job->common.busy = false;
+            qemu_coroutine_yield();
+            job->common.busy = true;
+        }
+    } else {
+        /* Both FULL and TOP SYNC_MODE's require copying.. */
+        for (; start < end; start++) {
+            bool error_is_read;
+
+            if (block_job_is_cancelled(&job->common)) {
+                break;
+            }
+
+            /* we need to yield so that qemu_aio_flush() returns.
+             * (without, VM does not reboot)
+             */
+            if (job->common.speed) {
+                uint64_t delay_ns = ratelimit_calculate_delay(
+                        &job->limit, job->sectors_read);
+                job->sectors_read = 0;
+                block_job_sleep_ns(&job->common, rt_clock, delay_ns);
+            } else {
+                block_job_sleep_ns(&job->common, rt_clock, 0);
+            }
+
+            if (block_job_is_cancelled(&job->common)) {
+                break;
+            }
+
+            if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
+                int i, n;
+                int alloced = 0;
+
+                /* Check to see if these blocks are already in the
+                 * backing file. */
+
+                for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) {
+                    /* bdrv_co_is_allocated() only returns true/false based
+                     * on the first set of sectors it comes accross that
+                     * are are all in the same state.
+                     * For that reason we must verify each sector in the
+                     * backup cluster length.  We end up copying more than
+                     * needed but at some point that is always the case. */
+                    alloced =
+                        bdrv_co_is_allocated(bs,
+                                start * BACKUP_SECTORS_PER_CLUSTER + i,
+                                BACKUP_SECTORS_PER_CLUSTER - i, &n);
+                    i += n;
+
+                    if (alloced == 1) {
+                        break;
+                    }
+                }
+
+                /* If the above loop never found any sectors that are in
+                 * the topmost image, skip this backup. */
+                if (alloced == 0) {
+                    continue;
+                }
+            }
+            /* FULL sync mode we copy the whole drive. */
+            ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER,
+                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
+            if (ret < 0) {
+                /* Depending on error action, fail now or retry cluster */
+                BlockErrorAction action =
+                    backup_error_action(job, error_is_read, -ret);
+                if (action == BDRV_ACTION_REPORT) {
+                    break;
+                } else {
+                    start--;
+                    continue;
+                }
+            }
+        }
+    }
+
+    notifier_with_return_remove(&before_write);
+
+    /* wait until pending backup_do_cow() calls have completed */
+    qemu_co_rwlock_wrlock(&job->flush_rwlock);
+    qemu_co_rwlock_unlock(&job->flush_rwlock);
+
+    hbitmap_free(job->bitmap);
+
+    bdrv_iostatus_disable(target);
+    bdrv_delete(target);
+
+    block_job_completed(&job->common, ret);
+}
+
+void backup_start(BlockDriverState *bs, BlockDriverState *target,
+                  int64_t speed, MirrorSyncMode sync_mode,
+                  BlockdevOnError on_source_error,
+                  BlockdevOnError on_target_error,
+                  BlockDriverCompletionFunc *cb, void *opaque,
+                  Error **errp)
+{
+    int64_t len;
+
+    assert(bs);
+    assert(target);
+    assert(cb);
+
+    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
+         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
+        !bdrv_iostatus_is_enabled(bs)) {
+        error_set(errp, QERR_INVALID_PARAMETER, "on-source-error");
+        return;
+    }
+
+    len = bdrv_getlength(bs);
+    if (len < 0) {
+        error_setg_errno(errp, -len, "unable to get length for '%s'",
+                         bdrv_get_device_name(bs));
+        return;
+    }
+
+    BackupBlockJob *job = block_job_create(&backup_job_type, bs, speed,
+                                           cb, opaque, errp);
+    if (!job) {
+        return;
+    }
+
+    job->on_source_error = on_source_error;
+    job->on_target_error = on_target_error;
+    job->target = target;
+    job->sync_mode = sync_mode;
+    job->common.len = len;
+    job->common.co = qemu_coroutine_create(backup_run);
+    qemu_coroutine_enter(job->common.co, job);
+}
diff --git a/block/blkdebug.c b/block/blkdebug.c
index d61ece86a..ccb627ad9 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -23,14 +23,17 @@
  */
 
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "qemu/config-file.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 
 typedef struct BDRVBlkdebugState {
     int state;
     int new_state;
+
     QLIST_HEAD(, BlkdebugRule) rules[BLKDBG_EVENT_MAX];
     QSIMPLEQ_HEAD(, BlkdebugRule) active_rules;
+    QLIST_HEAD(, BlkdebugSuspendedReq) suspended_reqs;
 } BDRVBlkdebugState;
 
 typedef struct BlkdebugAIOCB {
@@ -39,6 +42,12 @@ typedef struct BlkdebugAIOCB {
     int ret;
 } BlkdebugAIOCB;
 
+typedef struct BlkdebugSuspendedReq {
+    Coroutine *co;
+    char *tag;
+    QLIST_ENTRY(BlkdebugSuspendedReq) next;
+} BlkdebugSuspendedReq;
+
 static void blkdebug_aio_cancel(BlockDriverAIOCB *blockacb);
 
 static const AIOCBInfo blkdebug_aiocb_info = {
@@ -49,6 +58,7 @@ static const AIOCBInfo blkdebug_aiocb_info = {
 enum {
     ACTION_INJECT_ERROR,
     ACTION_SET_STATE,
+    ACTION_SUSPEND,
 };
 
 typedef struct BlkdebugRule {
@@ -65,6 +75,9 @@ typedef struct BlkdebugRule {
         struct {
             int new_state;
         } set_state;
+        struct {
+            char *tag;
+        } suspend;
     } options;
     QLIST_ENTRY(BlkdebugRule) next;
     QSIMPLEQ_ENTRY(BlkdebugRule) active_next;
@@ -169,6 +182,9 @@ static const char *event_names[BLKDBG_EVENT_MAX] = {
     [BLKDBG_CLUSTER_ALLOC]                  = "cluster_alloc",
     [BLKDBG_CLUSTER_ALLOC_BYTES]            = "cluster_alloc_bytes",
     [BLKDBG_CLUSTER_FREE]                   = "cluster_free",
+
+    [BLKDBG_FLUSH_TO_OS]                    = "flush_to_os",
+    [BLKDBG_FLUSH_TO_DISK]                  = "flush_to_disk",
 };
 
 static int get_event_by_name(const char *name, BlkDebugEvent *event)
@@ -226,6 +242,11 @@ static int add_rule(QemuOpts *opts, void *opaque)
         rule->options.set_state.new_state =
             qemu_opt_get_number(opts, "new_state", 0);
         break;
+
+    case ACTION_SUSPEND:
+        rule->options.suspend.tag =
+            g_strdup(qemu_opt_get(opts, "tag"));
+        break;
     };
 
     /* Add the rule */
@@ -234,6 +255,21 @@ static int add_rule(QemuOpts *opts, void *opaque)
     return 0;
 }
 
+static void remove_rule(BlkdebugRule *rule)
+{
+    switch (rule->action) {
+    case ACTION_INJECT_ERROR:
+    case ACTION_SET_STATE:
+        break;
+    case ACTION_SUSPEND:
+        g_free(rule->options.suspend.tag);
+        break;
+    }
+
+    QLIST_REMOVE(rule, next);
+    g_free(rule);
+}
+
 static int read_config(BDRVBlkdebugState *s, const char *filename)
 {
     FILE *f;
@@ -266,43 +302,98 @@ fail:
 }
 
 /* Valid blkdebug filenames look like blkdebug:path/to/config:path/to/image */
-static int blkdebug_open(BlockDriverState *bs, const char *filename, int flags)
+static void blkdebug_parse_filename(const char *filename, QDict *options,
+                                    Error **errp)
 {
-    BDRVBlkdebugState *s = bs->opaque;
-    int ret;
-    char *config, *c;
+    const char *c;
 
     /* Parse the blkdebug: prefix */
-    if (strncmp(filename, "blkdebug:", strlen("blkdebug:"))) {
-        return -EINVAL;
+    if (!strstart(filename, "blkdebug:", &filename)) {
+        error_setg(errp, "File name string must start with 'blkdebug:'");
+        return;
     }
-    filename += strlen("blkdebug:");
 
-    /* Read rules from config file */
+    /* Parse config file path */
     c = strchr(filename, ':');
     if (c == NULL) {
-        return -EINVAL;
+        error_setg(errp, "blkdebug requires both config file and image path");
+        return;
     }
 
-    config = g_strdup(filename);
-    config[c - filename] = '\0';
-    ret = read_config(s, config);
-    g_free(config);
-    if (ret < 0) {
-        return ret;
+    if (c != filename) {
+        QString *config_path;
+        config_path = qstring_from_substr(filename, 0, c - filename - 1);
+        qdict_put(options, "config", config_path);
     }
+
+    /* TODO Allow multi-level nesting and set file.filename here */
     filename = c + 1;
+    qdict_put(options, "x-image", qstring_from_str(filename));
+}
+
+static QemuOptsList runtime_opts = {
+    .name = "blkdebug",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+    .desc = {
+        {
+            .name = "config",
+            .type = QEMU_OPT_STRING,
+            .help = "Path to the configuration file",
+        },
+        {
+            .name = "x-image",
+            .type = QEMU_OPT_STRING,
+            .help = "[internal use only, will be removed]",
+        },
+        { /* end of list */ }
+    },
+};
+
+static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags)
+{
+    BDRVBlkdebugState *s = bs->opaque;
+    QemuOpts *opts;
+    Error *local_err = NULL;
+    const char *filename, *config;
+    int ret;
+
+    opts = qemu_opts_create_nofail(&runtime_opts);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (error_is_set(&local_err)) {
+        qerror_report_err(local_err);
+        error_free(local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    /* Read rules from config file */
+    config = qemu_opt_get(opts, "config");
+    if (config) {
+        ret = read_config(s, config);
+        if (ret < 0) {
+            goto fail;
+        }
+    }
 
     /* Set initial state */
     s->state = 1;
 
     /* Open the backing file */
-    ret = bdrv_file_open(&bs->file, filename, flags);
+    filename = qemu_opt_get(opts, "x-image");
+    if (filename == NULL) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    ret = bdrv_file_open(&bs->file, filename, NULL, flags);
     if (ret < 0) {
-        return ret;
+        goto fail;
     }
 
-    return 0;
+    ret = 0;
+fail:
+    qemu_opts_del(opts);
+    return ret;
 }
 
 static void error_callback_bh(void *opaque)
@@ -389,6 +480,7 @@ static BlockDriverAIOCB *blkdebug_aio_writev(BlockDriverState *bs,
     return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
 }
 
+
 static void blkdebug_close(BlockDriverState *bs)
 {
     BDRVBlkdebugState *s = bs->opaque;
@@ -397,12 +489,32 @@ static void blkdebug_close(BlockDriverState *bs)
 
     for (i = 0; i < BLKDBG_EVENT_MAX; i++) {
         QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) {
-            QLIST_REMOVE(rule, next);
-            g_free(rule);
+            remove_rule(rule);
         }
     }
 }
 
+static void suspend_request(BlockDriverState *bs, BlkdebugRule *rule)
+{
+    BDRVBlkdebugState *s = bs->opaque;
+    BlkdebugSuspendedReq r;
+
+    r = (BlkdebugSuspendedReq) {
+        .co         = qemu_coroutine_self(),
+        .tag        = g_strdup(rule->options.suspend.tag),
+    };
+
+    remove_rule(rule);
+    QLIST_INSERT_HEAD(&s->suspended_reqs, &r, next);
+
+    printf("blkdebug: Suspended request '%s'\n", r.tag);
+    qemu_coroutine_yield();
+    printf("blkdebug: Resuming request '%s'\n", r.tag);
+
+    QLIST_REMOVE(&r, next);
+    g_free(r.tag);
+}
+
 static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
     bool injected)
 {
@@ -426,6 +538,10 @@ static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
     case ACTION_SET_STATE:
         s->new_state = rule->options.set_state.new_state;
         break;
+
+    case ACTION_SUSPEND:
+        suspend_request(bs, rule);
+        break;
     }
     return injected;
 }
@@ -433,38 +549,94 @@ static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
 static void blkdebug_debug_event(BlockDriverState *bs, BlkDebugEvent event)
 {
     BDRVBlkdebugState *s = bs->opaque;
-    struct BlkdebugRule *rule;
+    struct BlkdebugRule *rule, *next;
     bool injected;
 
     assert((int)event >= 0 && event < BLKDBG_EVENT_MAX);
 
     injected = false;
     s->new_state = s->state;
-    QLIST_FOREACH(rule, &s->rules[event], next) {
+    QLIST_FOREACH_SAFE(rule, &s->rules[event], next, next) {
         injected = process_rule(bs, rule, injected);
     }
     s->state = s->new_state;
 }
 
-static int64_t blkdebug_getlength(BlockDriverState *bs)
+static int blkdebug_debug_breakpoint(BlockDriverState *bs, const char *event,
+                                     const char *tag)
 {
-    return bdrv_getlength(bs->file);
+    BDRVBlkdebugState *s = bs->opaque;
+    struct BlkdebugRule *rule;
+    BlkDebugEvent blkdebug_event;
+
+    if (get_event_by_name(event, &blkdebug_event) < 0) {
+        return -ENOENT;
+    }
+
+
+    rule = g_malloc(sizeof(*rule));
+    *rule = (struct BlkdebugRule) {
+        .event  = blkdebug_event,
+        .action = ACTION_SUSPEND,
+        .state  = 0,
+        .options.suspend.tag = g_strdup(tag),
+    };
+
+    QLIST_INSERT_HEAD(&s->rules[blkdebug_event], rule, next);
+
+    return 0;
+}
+
+static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag)
+{
+    BDRVBlkdebugState *s = bs->opaque;
+    BlkdebugSuspendedReq *r;
+
+    QLIST_FOREACH(r, &s->suspended_reqs, next) {
+        if (!strcmp(r->tag, tag)) {
+            qemu_coroutine_enter(r->co, NULL);
+            return 0;
+        }
+    }
+    return -ENOENT;
 }
 
-static BlockDriver bdrv_blkdebug = {
-    .format_name        = "blkdebug",
-    .protocol_name      = "blkdebug",
 
-    .instance_size      = sizeof(BDRVBlkdebugState),
+static bool blkdebug_debug_is_suspended(BlockDriverState *bs, const char *tag)
+{
+    BDRVBlkdebugState *s = bs->opaque;
+    BlkdebugSuspendedReq *r;
 
-    .bdrv_file_open     = blkdebug_open,
-    .bdrv_close         = blkdebug_close,
-    .bdrv_getlength     = blkdebug_getlength,
+    QLIST_FOREACH(r, &s->suspended_reqs, next) {
+        if (!strcmp(r->tag, tag)) {
+            return true;
+        }
+    }
+    return false;
+}
 
-    .bdrv_aio_readv     = blkdebug_aio_readv,
-    .bdrv_aio_writev    = blkdebug_aio_writev,
+static int64_t blkdebug_getlength(BlockDriverState *bs)
+{
+    return bdrv_getlength(bs->file);
+}
 
-    .bdrv_debug_event   = blkdebug_debug_event,
+static BlockDriver bdrv_blkdebug = {
+    .format_name            = "blkdebug",
+    .protocol_name          = "blkdebug",
+    .instance_size          = sizeof(BDRVBlkdebugState),
+
+    .bdrv_parse_filename    = blkdebug_parse_filename,
+    .bdrv_file_open         = blkdebug_open,
+    .bdrv_close             = blkdebug_close,
+    .bdrv_getlength         = blkdebug_getlength,
+
+    .bdrv_aio_readv         = blkdebug_aio_readv,
+    .bdrv_aio_writev        = blkdebug_aio_writev,
+
+    .bdrv_debug_event           = blkdebug_debug_event,
+    .bdrv_debug_breakpoint      = blkdebug_debug_breakpoint,
+    .bdrv_debug_resume          = blkdebug_debug_resume,
+    .bdrv_debug_is_suspended    = blkdebug_debug_is_suspended,
 };
 
 static void bdrv_blkdebug_init(void)
diff --git a/block/blkverify.c b/block/blkverify.c
index 4beede77a..1d58cc393 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -8,8 +8,8 @@
  */
 
 #include <stdarg.h>
-#include "qemu_socket.h" /* for EINPROGRESS on Windows */
-#include "block_int.h"
+#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
+#include "block/block_int.h"
 
 typedef struct {
     BlockDriverState *test_file;
@@ -69,43 +69,100 @@ static void GCC_FMT_ATTR(2, 3) blkverify_err(BlkverifyAIOCB *acb,
 }
 
 /* Valid blkverify filenames look like blkverify:path/to/raw_image:path/to/image */
-static int blkverify_open(BlockDriverState *bs, const char *filename, int flags)
+static void blkverify_parse_filename(const char *filename, QDict *options,
+                                     Error **errp)
 {
-    BDRVBlkverifyState *s = bs->opaque;
-    int ret;
-    char *raw, *c;
+    const char *c;
+    QString *raw_path;
+
 
     /* Parse the blkverify: prefix */
-    if (strncmp(filename, "blkverify:", strlen("blkverify:"))) {
-        return -EINVAL;
+    if (!strstart(filename, "blkverify:", &filename)) {
+        error_setg(errp, "File name string must start with 'blkverify:'");
+        return;
     }
-    filename += strlen("blkverify:");
 
     /* Parse the raw image filename */
     c = strchr(filename, ':');
     if (c == NULL) {
-        return -EINVAL;
+        error_setg(errp, "blkverify requires raw copy and original image path");
+        return;
     }
 
-    raw = g_strdup(filename);
-    raw[c - filename] = '\0';
-    ret = bdrv_file_open(&bs->file, raw, flags);
-    g_free(raw);
+    /* TODO Implement option pass-through and set raw.filename here */
+    raw_path = qstring_from_substr(filename, 0, c - filename - 1);
+    qdict_put(options, "x-raw", raw_path);
+
+    /* TODO Allow multi-level nesting and set file.filename here */
+    filename = c + 1;
+    qdict_put(options, "x-image", qstring_from_str(filename));
+}
+
+static QemuOptsList runtime_opts = {
+    .name = "blkverify",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+    .desc = {
+        {
+            .name = "x-raw",
+            .type = QEMU_OPT_STRING,
+            .help = "[internal use only, will be removed]",
+        },
+        {
+            .name = "x-image",
+            .type = QEMU_OPT_STRING,
+            .help = "[internal use only, will be removed]",
+        },
+        { /* end of list */ }
+    },
+};
+
+static int blkverify_open(BlockDriverState *bs, QDict *options, int flags)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+    QemuOpts *opts;
+    Error *local_err = NULL;
+    const char *filename, *raw;
+    int ret;
+
+    opts = qemu_opts_create_nofail(&runtime_opts);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (error_is_set(&local_err)) {
+        qerror_report_err(local_err);
+        error_free(local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    /* Parse the raw image filename */
+    raw = qemu_opt_get(opts, "x-raw");
+    if (raw == NULL) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    ret = bdrv_file_open(&bs->file, raw, NULL, flags);
     if (ret < 0) {
-        return ret;
+        goto fail;
     }
-    filename = c + 1;
 
     /* Open the test file */
+    filename = qemu_opt_get(opts, "x-image");
+    if (filename == NULL) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
     s->test_file = bdrv_new("");
-    ret = bdrv_open(s->test_file, filename, flags, NULL);
+    ret = bdrv_open(s->test_file, filename, NULL, flags, NULL);
     if (ret < 0) {
         bdrv_delete(s->test_file);
         s->test_file = NULL;
-        return ret;
+        goto fail;
     }
 
-    return 0;
+    ret = 0;
+fail:
+    return ret;
 }
 
 static void blkverify_close(BlockDriverState *bs)
@@ -343,19 +400,18 @@ static BlockDriverAIOCB *blkverify_aio_flush(BlockDriverState *bs,
 }
 
 static BlockDriver bdrv_blkverify = {
-    .format_name        = "blkverify",
-    .protocol_name      = "blkverify",
-
-    .instance_size      = sizeof(BDRVBlkverifyState),
-
-    .bdrv_getlength     = blkverify_getlength,
-
-    .bdrv_file_open     = blkverify_open,
-    .bdrv_close         = blkverify_close,
-
-    .bdrv_aio_readv     = blkverify_aio_readv,
-    .bdrv_aio_writev    = blkverify_aio_writev,
-    .bdrv_aio_flush     = blkverify_aio_flush,
+    .format_name            = "blkverify",
+    .protocol_name          = "blkverify",
+    .instance_size          = sizeof(BDRVBlkverifyState),
+
+    .bdrv_parse_filename    = blkverify_parse_filename,
+    .bdrv_file_open         = blkverify_open,
+    .bdrv_close             = blkverify_close,
+    .bdrv_getlength         = blkverify_getlength,
+
+    .bdrv_aio_readv         = blkverify_aio_readv,
+    .bdrv_aio_writev        = blkverify_aio_writev,
+    .bdrv_aio_flush         = blkverify_aio_flush,
 };
 
 static void bdrv_blkverify_init(void)
diff --git a/block/bochs.c b/block/bochs.c
index ab7944dc4..d7078c077 100644
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -23,8 +23,8 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 
 /**************************************************************/
 
@@ -108,17 +108,19 @@ static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename)
     return 0;
 }
 
-static int bochs_open(BlockDriverState *bs, int flags)
+static int bochs_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVBochsState *s = bs->opaque;
     int i;
     struct bochs_header bochs;
     struct bochs_header_v1 header_v1;
+    int ret;
 
     bs->read_only = 1; // no write support yet
 
-    if (bdrv_pread(bs->file, 0, &bochs, sizeof(bochs)) != sizeof(bochs)) {
-        goto fail;
+    ret = bdrv_pread(bs->file, 0, &bochs, sizeof(bochs));
+    if (ret < 0) {
+        return ret;
     }
 
     if (strcmp(bochs.magic, HEADER_MAGIC) ||
@@ -126,7 +128,7 @@ static int bochs_open(BlockDriverState *bs, int flags)
         strcmp(bochs.subtype, GROWING_TYPE) ||
 	((le32_to_cpu(bochs.version) != HEADER_VERSION) &&
 	(le32_to_cpu(bochs.version) != HEADER_V1))) {
-        goto fail;
+        return -EMEDIUMTYPE;
     }
 
     if (le32_to_cpu(bochs.version) == HEADER_V1) {
@@ -138,9 +140,13 @@ static int bochs_open(BlockDriverState *bs, int flags)
 
     s->catalog_size = le32_to_cpu(bochs.extra.redolog.catalog);
     s->catalog_bitmap = g_malloc(s->catalog_size * 4);
-    if (bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap,
-                   s->catalog_size * 4) != s->catalog_size * 4)
-	goto fail;
+
+    ret = bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap,
+                     s->catalog_size * 4);
+    if (ret < 0) {
+        goto fail;
+    }
+
     for (i = 0; i < s->catalog_size; i++)
 	le32_to_cpus(&s->catalog_bitmap[i]);
 
@@ -153,8 +159,10 @@ static int bochs_open(BlockDriverState *bs, int flags)
 
     qemu_co_mutex_init(&s->lock);
     return 0;
- fail:
-    return -1;
+
+fail:
+    g_free(s->catalog_bitmap);
+    return ret;
 }
 
 static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
diff --git a/block/cloop.c b/block/cloop.c
index 7570eb8e7..6ea7cf404 100644
--- a/block/cloop.c
+++ b/block/cloop.c
@@ -22,8 +22,8 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 #include <zlib.h>
 
 typedef struct BDRVCloopState {
@@ -53,31 +53,36 @@ static int cloop_probe(const uint8_t *buf, int buf_size, const char *filename)
     return 0;
 }
 
-static int cloop_open(BlockDriverState *bs, int flags)
+static int cloop_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVCloopState *s = bs->opaque;
     uint32_t offsets_size, max_compressed_block_size = 1, i;
+    int ret;
 
     bs->read_only = 1;
 
     /* read header */
-    if (bdrv_pread(bs->file, 128, &s->block_size, 4) < 4) {
-        goto cloop_close;
+    ret = bdrv_pread(bs->file, 128, &s->block_size, 4);
+    if (ret < 0) {
+        return ret;
     }
     s->block_size = be32_to_cpu(s->block_size);
 
-    if (bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4) < 4) {
-        goto cloop_close;
+    ret = bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4);
+    if (ret < 0) {
+        return ret;
     }
     s->n_blocks = be32_to_cpu(s->n_blocks);
 
     /* read offsets */
     offsets_size = s->n_blocks * sizeof(uint64_t);
     s->offsets = g_malloc(offsets_size);
-    if (bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size) <
-            offsets_size) {
-        goto cloop_close;
+
+    ret = bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size);
+    if (ret < 0) {
+        goto fail;
     }
+
     for(i=0;i<s->n_blocks;i++) {
         s->offsets[i] = be64_to_cpu(s->offsets[i]);
         if (i > 0) {
@@ -92,7 +97,8 @@ static int cloop_open(BlockDriverState *bs, int flags)
     s->compressed_block = g_malloc(max_compressed_block_size + 1);
     s->uncompressed_block = g_malloc(s->block_size);
     if (inflateInit(&s->zstream) != Z_OK) {
-        goto cloop_close;
+        ret = -EINVAL;
+        goto fail;
     }
     s->current_block = s->n_blocks;
 
@@ -101,8 +107,11 @@ static int cloop_open(BlockDriverState *bs, int flags)
     qemu_co_mutex_init(&s->lock);
     return 0;
 
-cloop_close:
-    return -1;
+fail:
+    g_free(s->offsets);
+    g_free(s->compressed_block);
+    g_free(s->uncompressed_block);
+    return ret;
 }
 
 static inline int cloop_read_block(BlockDriverState *bs, int block_num)
diff --git a/block/commit.c b/block/commit.c
index fae79582d..2227fc2e6 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -13,8 +13,8 @@
  */
 
 #include "trace.h"
-#include "block_int.h"
-#include "blockjob.h"
+#include "block/block_int.h"
+#include "block/blockjob.h"
 #include "qemu/ratelimit.h"
 
 enum {
@@ -65,7 +65,7 @@ static void coroutine_fn commit_run(void *opaque)
     BlockDriverState *active = s->active;
     BlockDriverState *top = s->top;
     BlockDriverState *base = s->base;
-    BlockDriverState *overlay_bs = NULL;
+    BlockDriverState *overlay_bs;
     int64_t sector_num, end;
     int ret = 0;
     int n = 0;
@@ -92,8 +92,6 @@ static void coroutine_fn commit_run(void *opaque)
         }
     }
 
-    overlay_bs = bdrv_find_overlay(active, top);
-
     end = s->common.len >> BDRV_SECTOR_BITS;
     buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE);
 
@@ -103,7 +101,7 @@ static void coroutine_fn commit_run(void *opaque)
 
 wait:
         /* Note that even when no rate limit is applied we need to yield
-         * with no pending I/O here so that qemu_aio_flush() returns.
+         * with no pending I/O here so that bdrv_drain_all() returns.
          */
         block_job_sleep_ns(&s->common, rt_clock, delay_ns);
         if (block_job_is_cancelled(&s->common)) {
@@ -156,7 +154,8 @@ exit_restore_reopen:
     if (s->base_flags != bdrv_get_flags(base)) {
         bdrv_reopen(base, s->base_flags, NULL);
     }
-    if (s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
+    overlay_bs = bdrv_find_overlay(active, top);
+    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
         bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
     }
 
@@ -174,7 +173,7 @@ static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
     ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
 }
 
-static BlockJobType commit_job_type = {
+static const BlockJobType commit_job_type = {
     .instance_size = sizeof(CommitBlockJob),
     .job_type      = "commit",
     .set_speed     = commit_set_speed,
diff --git a/block/cow.c b/block/cow.c
index a5a00eb9c..1cc2e89c7 100644
--- a/block/cow.c
+++ b/block/cow.c
@@ -22,8 +22,8 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 
 /**************************************************************/
 /* COW block driver using file system holes */
@@ -58,7 +58,7 @@ static int cow_probe(const uint8_t *buf, int buf_size, const char *filename)
         return 0;
 }
 
-static int cow_open(BlockDriverState *bs, int flags)
+static int cow_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVCowState *s = bs->opaque;
     struct cow_header_v2 cow_header;
@@ -73,7 +73,7 @@ static int cow_open(BlockDriverState *bs, int flags)
     }
 
     if (be32_to_cpu(cow_header.magic) != COW_MAGIC) {
-        ret = -EINVAL;
+        ret = -EMEDIUMTYPE;
         goto fail;
     }
 
@@ -279,7 +279,7 @@ static int cow_create(const char *filename, QEMUOptionParameter *options)
         return ret;
     }
 
-    ret = bdrv_file_open(&cow_bs, filename, BDRV_O_RDWR);
+    ret = bdrv_file_open(&cow_bs, filename, NULL, BDRV_O_RDWR);
     if (ret < 0) {
         return ret;
     }
@@ -340,6 +340,7 @@ static BlockDriver bdrv_cow = {
     .bdrv_open      = cow_open,
     .bdrv_close     = cow_close,
     .bdrv_create    = cow_create,
+    .bdrv_has_zero_init     = bdrv_has_zero_init_1,
 
     .bdrv_read              = cow_co_read,
     .bdrv_write             = cow_co_write,
diff --git a/block/curl.c b/block/curl.c
index 1179484de..82d39ff53 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -22,7 +22,7 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
 #include <curl/curl.h>
 
 // #define DEBUG
@@ -34,6 +34,10 @@
 #define DPRINTF(fmt, ...) do { } while (0)
 #endif
 
+#define PROTOCOLS (CURLPROTO_HTTP | CURLPROTO_HTTPS | \
+                   CURLPROTO_FTP | CURLPROTO_FTPS | \
+                   CURLPROTO_TFTP)
+
 #define CURL_NUM_STATES 8
 #define CURL_NUM_ACB    8
 #define SECTOR_SIZE     512
@@ -77,6 +81,7 @@ typedef struct BDRVCURLState {
     CURLState states[CURL_NUM_STATES];
     char *url;
     size_t readahead_size;
+    bool accept_range;
 } BDRVCURLState;
 
 static void curl_clean_state(CURLState *s);
@@ -106,14 +111,15 @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
     return 0;
 }
 
-static size_t curl_size_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
+static size_t curl_header_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
 {
-    CURLState *s = ((CURLState*)opaque);
+    BDRVCURLState *s = opaque;
     size_t realsize = size * nmemb;
-    size_t fsize;
+    const char *accept_line = "Accept-Ranges: bytes";
 
-    if(sscanf(ptr, "Content-Length: %zd", &fsize) == 1) {
-        s->s->len = fsize;
+    if (realsize >= strlen(accept_line)
+        && strncmp((char *)ptr, accept_line, strlen(accept_line)) == 0) {
+        s->accept_range = true;
     }
 
     return realsize;
@@ -302,6 +308,17 @@ static CURLState *curl_init_state(BDRVCURLState *s)
     curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg);
     curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1);
 
+    /* Restrict supported protocols to avoid security issues in the more
+     * obscure protocols.  For example, do not allow POP3/SMTP/IMAP see
+     * CVE-2013-0249.
+     *
+     * Restricting protocols is only supported from 7.19.4 upwards.
+     */
+#if LIBCURL_VERSION_NUM >= 0x071304
+    curl_easy_setopt(state->curl, CURLOPT_PROTOCOLS, PROTOCOLS);
+    curl_easy_setopt(state->curl, CURLOPT_REDIR_PROTOCOLS, PROTOCOLS);
+#endif
+
 #ifdef DEBUG_VERBOSE
     curl_easy_setopt(state->curl, CURLOPT_VERBOSE, 1);
 #endif
@@ -320,11 +337,9 @@ static void curl_clean_state(CURLState *s)
     s->in_use = 0;
 }
 
-static int curl_open(BlockDriverState *bs, const char *filename, int flags)
+static void curl_parse_filename(const char *filename, QDict *options,
+                                Error **errp)
 {
-    BDRVCURLState *s = bs->opaque;
-    CURLState *state = NULL;
-    double d;
 
     #define RA_OPTSTR ":readahead="
     char *file;
@@ -332,19 +347,17 @@ static int curl_open(BlockDriverState *bs, const char *filename, int flags)
     const char *ra_val;
     int parse_state = 0;
 
-    static int inited = 0;
-
     file = g_strdup(filename);
-    s->readahead_size = READ_AHEAD_SIZE;
 
     /* Parse a trailing ":readahead=#:" param, if present. */
     ra = file + strlen(file) - 1;
     while (ra >= file) {
         if (parse_state == 0) {
-            if (*ra == ':')
+            if (*ra == ':') {
                 parse_state++;
-            else
+            } else {
                 break;
+            }
         } else if (parse_state == 1) {
             if (*ra > '9' || *ra < '0') {
                 char *opt_start = ra - strlen(RA_OPTSTR) + 1;
@@ -353,46 +366,108 @@ static int curl_open(BlockDriverState *bs, const char *filename, int flags)
                     ra_val = ra + 1;
                     ra -= strlen(RA_OPTSTR) - 1;
                     *ra = '\0';
-                    s->readahead_size = atoi(ra_val);
-                    break;
-                } else {
-                    break;
+                    qdict_put(options, "readahead", qstring_from_str(ra_val));
                 }
+                break;
             }
         }
         ra--;
     }
 
+    qdict_put(options, "url", qstring_from_str(file));
+
+    g_free(file);
+}
+
+static QemuOptsList runtime_opts = {
+    .name = "curl",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+    .desc = {
+        {
+            .name = "url",
+            .type = QEMU_OPT_STRING,
+            .help = "URL to open",
+        },
+        {
+            .name = "readahead",
+            .type = QEMU_OPT_SIZE,
+            .help = "Readahead size",
+        },
+        { /* end of list */ }
+    },
+};
+
+static int curl_open(BlockDriverState *bs, QDict *options, int flags)
+{
+    BDRVCURLState *s = bs->opaque;
+    CURLState *state = NULL;
+    QemuOpts *opts;
+    Error *local_err = NULL;
+    const char *file;
+    double d;
+
+    static int inited = 0;
+
+    if (flags & BDRV_O_RDWR) {
+        qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                      "curl block device does not support writes");
+        return -EROFS;
+    }
+
+    opts = qemu_opts_create_nofail(&runtime_opts);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (error_is_set(&local_err)) {
+        qerror_report_err(local_err);
+        error_free(local_err);
+        goto out_noclean;
+    }
+
+    s->readahead_size = qemu_opt_get_size(opts, "readahead", READ_AHEAD_SIZE);
     if ((s->readahead_size & 0x1ff) != 0) {
         fprintf(stderr, "HTTP_READAHEAD_SIZE %zd is not a multiple of 512\n",
                 s->readahead_size);
         goto out_noclean;
     }
 
+    file = qemu_opt_get(opts, "url");
+    if (file == NULL) {
+        qerror_report(ERROR_CLASS_GENERIC_ERROR, "curl block driver requires "
+                      "an 'url' option");
+        goto out_noclean;
+    }
+
     if (!inited) {
         curl_global_init(CURL_GLOBAL_ALL);
         inited = 1;
     }
 
     DPRINTF("CURL: Opening %s\n", file);
-    s->url = file;
+    s->url = g_strdup(file);
     state = curl_init_state(s);
     if (!state)
         goto out_noclean;
 
     // Get file size
 
+    s->accept_range = false;
     curl_easy_setopt(state->curl, CURLOPT_NOBODY, 1);
-    curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, (void *)curl_size_cb);
+    curl_easy_setopt(state->curl, CURLOPT_HEADERFUNCTION,
+                     curl_header_cb);
+    curl_easy_setopt(state->curl, CURLOPT_HEADERDATA, s);
     if (curl_easy_perform(state->curl))
         goto out;
     curl_easy_getinfo(state->curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &d);
-    curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, (void *)curl_read_cb);
-    curl_easy_setopt(state->curl, CURLOPT_NOBODY, 0);
     if (d)
         s->len = (size_t)d;
     else if(!s->len)
         goto out;
+    if ((!strncasecmp(s->url, "http://", strlen("http://"))
+        || !strncasecmp(s->url, "https://", strlen("https://")))
+        && !s->accept_range) {
+        pstrcpy(state->errmsg, CURL_ERROR_SIZE,
+                "Server does not support 'range' (byte ranges).");
+        goto out;
+    }
     DPRINTF("CURL: Size = %zd\n", s->len);
 
     curl_clean_state(state);
@@ -403,10 +478,11 @@ static int curl_open(BlockDriverState *bs, const char *filename, int flags)
     // initialize the multi interface!
 
     s->multi = curl_multi_init();
-    curl_multi_setopt( s->multi, CURLMOPT_SOCKETDATA, s); 
-    curl_multi_setopt( s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb ); 
+    curl_multi_setopt(s->multi, CURLMOPT_SOCKETDATA, s);
+    curl_multi_setopt(s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb);
     curl_multi_do(s);
 
+    qemu_opts_del(opts);
     return 0;
 
 out:
@@ -414,7 +490,8 @@ out:
     curl_easy_cleanup(state->curl);
     state->curl = NULL;
 out_noclean:
-    g_free(file);
+    g_free(s->url);
+    qemu_opts_del(opts);
     return -EINVAL;
 }
 
@@ -552,63 +629,68 @@ static int64_t curl_getlength(BlockDriverState *bs)
 }
 
 static BlockDriver bdrv_http = {
-    .format_name     = "http",
-    .protocol_name   = "http",
+    .format_name            = "http",
+    .protocol_name          = "http",
 
-    .instance_size   = sizeof(BDRVCURLState),
-    .bdrv_file_open  = curl_open,
-    .bdrv_close      = curl_close,
-    .bdrv_getlength  = curl_getlength,
+    .instance_size          = sizeof(BDRVCURLState),
+    .bdrv_parse_filename    = curl_parse_filename,
+    .bdrv_file_open         = curl_open,
+    .bdrv_close             = curl_close,
+    .bdrv_getlength         = curl_getlength,
 
-    .bdrv_aio_readv  = curl_aio_readv,
+    .bdrv_aio_readv         = curl_aio_readv,
 };
 
 static BlockDriver bdrv_https = {
-    .format_name     = "https",
-    .protocol_name   = "https",
+    .format_name            = "https",
+    .protocol_name          = "https",
 
-    .instance_size   = sizeof(BDRVCURLState),
-    .bdrv_file_open  = curl_open,
-    .bdrv_close      = curl_close,
-    .bdrv_getlength  = curl_getlength,
+    .instance_size          = sizeof(BDRVCURLState),
+    .bdrv_parse_filename    = curl_parse_filename,
+    .bdrv_file_open         = curl_open,
+    .bdrv_close             = curl_close,
+    .bdrv_getlength         = curl_getlength,
 
-    .bdrv_aio_readv  = curl_aio_readv,
+    .bdrv_aio_readv         = curl_aio_readv,
 };
 
 static BlockDriver bdrv_ftp = {
-    .format_name     = "ftp",
-    .protocol_name   = "ftp",
+    .format_name            = "ftp",
+    .protocol_name          = "ftp",
 
-    .instance_size   = sizeof(BDRVCURLState),
-    .bdrv_file_open  = curl_open,
-    .bdrv_close      = curl_close,
-    .bdrv_getlength  = curl_getlength,
+    .instance_size          = sizeof(BDRVCURLState),
+    .bdrv_parse_filename    = curl_parse_filename,
+    .bdrv_file_open         = curl_open,
+    .bdrv_close             = curl_close,
+    .bdrv_getlength         = curl_getlength,
 
-    .bdrv_aio_readv  = curl_aio_readv,
+    .bdrv_aio_readv         = curl_aio_readv,
 };
 
 static BlockDriver bdrv_ftps = {
-    .format_name     = "ftps",
-    .protocol_name   = "ftps",
+    .format_name            = "ftps",
+    .protocol_name          = "ftps",
 
-    .instance_size   = sizeof(BDRVCURLState),
-    .bdrv_file_open  = curl_open,
-    .bdrv_close      = curl_close,
-    .bdrv_getlength  = curl_getlength,
+    .instance_size          = sizeof(BDRVCURLState),
+    .bdrv_parse_filename    = curl_parse_filename,
+    .bdrv_file_open         = curl_open,
+    .bdrv_close             = curl_close,
+    .bdrv_getlength         = curl_getlength,
 
-    .bdrv_aio_readv  = curl_aio_readv,
+    .bdrv_aio_readv         = curl_aio_readv,
 };
 
 static BlockDriver bdrv_tftp = {
-    .format_name     = "tftp",
-    .protocol_name   = "tftp",
+    .format_name            = "tftp",
+    .protocol_name          = "tftp",
 
-    .instance_size   = sizeof(BDRVCURLState),
-    .bdrv_file_open  = curl_open,
-    .bdrv_close      = curl_close,
-    .bdrv_getlength  = curl_getlength,
+    .instance_size          = sizeof(BDRVCURLState),
+    .bdrv_parse_filename    = curl_parse_filename,
+    .bdrv_file_open         = curl_open,
+    .bdrv_close             = curl_close,
+    .bdrv_getlength         = curl_getlength,
 
-    .bdrv_aio_readv  = curl_aio_readv,
+    .bdrv_aio_readv         = curl_aio_readv,
 };
 
 static void curl_block_init(void)
diff --git a/block/dmg.c b/block/dmg.c
index 37902a434..3141cb5b8 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -22,9 +22,9 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "bswap.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/bswap.h"
+#include "qemu/module.h"
 #include <zlib.h>
 
 typedef struct BDRVDMGState {
@@ -51,35 +51,55 @@ typedef struct BDRVDMGState {
 
 static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename)
 {
-    int len=strlen(filename);
-    if(len>4 && !strcmp(filename+len-4,".dmg"))
-	return 2;
+    int len;
+
+    if (!filename) {
+        return 0;
+    }
+
+    len = strlen(filename);
+    if (len > 4 && !strcmp(filename + len - 4, ".dmg")) {
+        return 2;
+    }
     return 0;
 }
 
-static off_t read_off(BlockDriverState *bs, int64_t offset)
+static int read_uint64(BlockDriverState *bs, int64_t offset, uint64_t *result)
 {
-	uint64_t buffer;
-	if (bdrv_pread(bs->file, offset, &buffer, 8) < 8)
-		return 0;
-	return be64_to_cpu(buffer);
+    uint64_t buffer;
+    int ret;
+
+    ret = bdrv_pread(bs->file, offset, &buffer, 8);
+    if (ret < 0) {
+        return ret;
+    }
+
+    *result = be64_to_cpu(buffer);
+    return 0;
 }
 
-static off_t read_uint32(BlockDriverState *bs, int64_t offset)
+static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result)
 {
-	uint32_t buffer;
-	if (bdrv_pread(bs->file, offset, &buffer, 4) < 4)
-		return 0;
-	return be32_to_cpu(buffer);
+    uint32_t buffer;
+    int ret;
+
+    ret = bdrv_pread(bs->file, offset, &buffer, 4);
+    if (ret < 0) {
+        return ret;
+    }
+
+    *result = be32_to_cpu(buffer);
+    return 0;
 }
 
-static int dmg_open(BlockDriverState *bs, int flags)
+static int dmg_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVDMGState *s = bs->opaque;
-    off_t info_begin,info_end,last_in_offset,last_out_offset;
-    uint32_t count;
+    uint64_t info_begin,info_end,last_in_offset,last_out_offset;
+    uint32_t count, tmp;
     uint32_t max_compressed_size=1,max_sectors_per_chunk=1,i;
     int64_t offset;
+    int ret;
 
     bs->read_only = 1;
     s->n_chunks = 0;
@@ -88,21 +108,32 @@ static int dmg_open(BlockDriverState *bs, int flags)
     /* read offset of info blocks */
     offset = bdrv_getlength(bs->file);
     if (offset < 0) {
+        ret = offset;
         goto fail;
     }
     offset -= 0x1d8;
 
-    info_begin = read_off(bs, offset);
-    if (info_begin == 0) {
-	goto fail;
+    ret = read_uint64(bs, offset, &info_begin);
+    if (ret < 0) {
+        goto fail;
+    } else if (info_begin == 0) {
+        ret = -EINVAL;
+        goto fail;
     }
 
-    if (read_uint32(bs, info_begin) != 0x100) {
+    ret = read_uint32(bs, info_begin, &tmp);
+    if (ret < 0) {
+        goto fail;
+    } else if (tmp != 0x100) {
+        ret = -EINVAL;
         goto fail;
     }
 
-    count = read_uint32(bs, info_begin + 4);
-    if (count == 0) {
+    ret = read_uint32(bs, info_begin + 4, &count);
+    if (ret < 0) {
+        goto fail;
+    } else if (count == 0) {
+        ret = -EINVAL;
         goto fail;
     }
     info_end = info_begin + count;
@@ -114,12 +145,20 @@ static int dmg_open(BlockDriverState *bs, int flags)
     while (offset < info_end) {
         uint32_t type;
 
-	count = read_uint32(bs, offset);
-	if(count==0)
-	    goto fail;
+        ret = read_uint32(bs, offset, &count);
+        if (ret < 0) {
+            goto fail;
+        } else if (count == 0) {
+            ret = -EINVAL;
+            goto fail;
+        }
         offset += 4;
 
-	type = read_uint32(bs, offset);
+        ret = read_uint32(bs, offset, &type);
+        if (ret < 0) {
+            goto fail;
+        }
+
 	if (type == 0x6d697368 && count >= 244) {
 	    int new_size, chunk_count;
 
@@ -134,8 +173,11 @@ static int dmg_open(BlockDriverState *bs, int flags)
 	    s->sectors = g_realloc(s->sectors, new_size);
 	    s->sectorcounts = g_realloc(s->sectorcounts, new_size);
 
-	    for(i=s->n_chunks;i<s->n_chunks+chunk_count;i++) {
-		s->types[i] = read_uint32(bs, offset);
+            for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) {
+                ret = read_uint32(bs, offset, &s->types[i]);
+                if (ret < 0) {
+                    goto fail;
+                }
 		offset += 4;
 		if(s->types[i]!=0x80000005 && s->types[i]!=1 && s->types[i]!=2) {
 		    if(s->types[i]==0xffffffff) {
@@ -149,17 +191,31 @@ static int dmg_open(BlockDriverState *bs, int flags)
 		}
 		offset += 4;
 
-		s->sectors[i] = last_out_offset+read_off(bs, offset);
-		offset += 8;
-
-		s->sectorcounts[i] = read_off(bs, offset);
-		offset += 8;
-
-		s->offsets[i] = last_in_offset+read_off(bs, offset);
-		offset += 8;
-
-		s->lengths[i] = read_off(bs, offset);
-		offset += 8;
+                ret = read_uint64(bs, offset, &s->sectors[i]);
+                if (ret < 0) {
+                    goto fail;
+                }
+                s->sectors[i] += last_out_offset;
+                offset += 8;
+
+                ret = read_uint64(bs, offset, &s->sectorcounts[i]);
+                if (ret < 0) {
+                    goto fail;
+                }
+                offset += 8;
+
+                ret = read_uint64(bs, offset, &s->offsets[i]);
+                if (ret < 0) {
+                    goto fail;
+                }
+                s->offsets[i] += last_in_offset;
+                offset += 8;
+
+                ret = read_uint64(bs, offset, &s->lengths[i]);
+                if (ret < 0) {
+                    goto fail;
+                }
+                offset += 8;
 
 		if(s->lengths[i]>max_compressed_size)
 		    max_compressed_size = s->lengths[i];
@@ -173,15 +229,25 @@ static int dmg_open(BlockDriverState *bs, int flags)
     /* initialize zlib engine */
     s->compressed_chunk = g_malloc(max_compressed_size+1);
     s->uncompressed_chunk = g_malloc(512*max_sectors_per_chunk);
-    if(inflateInit(&s->zstream) != Z_OK)
-	goto fail;
+    if(inflateInit(&s->zstream) != Z_OK) {
+        ret = -EINVAL;
+        goto fail;
+    }
 
     s->current_chunk = s->n_chunks;
 
     qemu_co_mutex_init(&s->lock);
     return 0;
+
 fail:
-    return -1;
+    g_free(s->types);
+    g_free(s->offsets);
+    g_free(s->lengths);
+    g_free(s->sectors);
+    g_free(s->sectorcounts);
+    g_free(s->compressed_chunk);
+    g_free(s->uncompressed_chunk);
+    return ret;
 }
 
 static inline int is_sector_in_chunk(BDRVDMGState* s,
@@ -296,15 +362,15 @@ static coroutine_fn int dmg_co_read(BlockDriverState *bs, int64_t sector_num,
 static void dmg_close(BlockDriverState *bs)
 {
     BDRVDMGState *s = bs->opaque;
-    if(s->n_chunks>0) {
-	free(s->types);
-	free(s->offsets);
-	free(s->lengths);
-	free(s->sectors);
-	free(s->sectorcounts);
-    }
-    free(s->compressed_chunk);
-    free(s->uncompressed_chunk);
+
+    g_free(s->types);
+    g_free(s->offsets);
+    g_free(s->lengths);
+    g_free(s->sectors);
+    g_free(s->sectorcounts);
+    g_free(s->compressed_chunk);
+    g_free(s->uncompressed_chunk);
+
     inflateEnd(&s->zstream);
 }
 
diff --git a/block/gluster.c b/block/gluster.c
index 1c90174b1..645b7f12a 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -16,9 +16,9 @@
  * GNU GPL, version 2 or (at your option) any later version.
  */
 #include <glusterfs/api/glfs.h>
-#include "block_int.h"
-#include "qemu_socket.h"
-#include "uri.h"
+#include "block/block_int.h"
+#include "qemu/sockets.h"
+#include "qemu/uri.h"
 
 typedef struct GlusterAIOCB {
     BlockDriverAIOCB common;
@@ -217,7 +217,7 @@ static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename)
     ret = glfs_init(glfs);
     if (ret) {
         error_report("Gluster connection failed for server=%s port=%d "
-             "volume=%s image=%s transport=%s\n", gconf->server, gconf->port,
+             "volume=%s image=%s transport=%s", gconf->server, gconf->port,
              gconf->volname, gconf->image, gconf->transport);
         goto out;
     }
@@ -282,13 +282,42 @@ static int qemu_gluster_aio_flush_cb(void *opaque)
     return (s->qemu_aio_count > 0);
 }
 
-static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
-    int bdrv_flags)
+/* TODO Convert to fine grained options */
+static QemuOptsList runtime_opts = {
+    .name = "gluster",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+    .desc = {
+        {
+            .name = "filename",
+            .type = QEMU_OPT_STRING,
+            .help = "URL to the gluster image",
+        },
+        { /* end of list */ }
+    },
+};
+
+static int qemu_gluster_open(BlockDriverState *bs,  QDict *options,
+                             int bdrv_flags)
 {
     BDRVGlusterState *s = bs->opaque;
     int open_flags = O_BINARY;
     int ret = 0;
     GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));
+    QemuOpts *opts;
+    Error *local_err = NULL;
+    const char *filename;
+
+    opts = qemu_opts_create_nofail(&runtime_opts);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (error_is_set(&local_err)) {
+        qerror_report_err(local_err);
+        error_free(local_err);
+        ret = -EINVAL;
+        goto out;
+    }
+
+    filename = qemu_opt_get(opts, "filename");
+
 
     s->glfs = qemu_gluster_init(gconf, filename);
     if (!s->glfs) {
@@ -322,6 +351,7 @@ static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
         qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s);
 
 out:
+    qemu_opts_del(opts);
     qemu_gluster_gconf_free(gconf);
     if (!ret) {
         return ret;
@@ -463,6 +493,19 @@ out:
     return NULL;
 }
 
+static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset)
+{
+    int ret;
+    BDRVGlusterState *s = bs->opaque;
+
+    ret = glfs_ftruncate(s->fd, offset);
+    if (ret < 0) {
+        return -errno;
+    }
+
+    return 0;
+}
+
 static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque)
@@ -502,6 +545,39 @@ out:
     return NULL;
 }
 
+#ifdef CONFIG_GLUSTERFS_DISCARD
+static BlockDriverAIOCB *qemu_gluster_aio_discard(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, BlockDriverCompletionFunc *cb,
+        void *opaque)
+{
+    int ret;
+    GlusterAIOCB *acb;
+    BDRVGlusterState *s = bs->opaque;
+    size_t size;
+    off_t offset;
+
+    offset = sector_num * BDRV_SECTOR_SIZE;
+    size = nb_sectors * BDRV_SECTOR_SIZE;
+
+    acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque);
+    acb->size = 0;
+    acb->ret = 0;
+    acb->finished = NULL;
+    s->qemu_aio_count++;
+
+    ret = glfs_discard_async(s->fd, offset, size, &gluster_finish_aiocb, acb);
+    if (ret < 0) {
+        goto out;
+    }
+    return &acb->common;
+
+out:
+    s->qemu_aio_count--;
+    qemu_aio_release(acb);
+    return NULL;
+}
+#endif
+
 static int64_t qemu_gluster_getlength(BlockDriverState *bs)
 {
     BDRVGlusterState *s = bs->opaque;
@@ -544,6 +620,12 @@ static void qemu_gluster_close(BlockDriverState *bs)
     glfs_fini(s->glfs);
 }
 
+static int qemu_gluster_has_zero_init(BlockDriverState *bs)
+{
+    /* GlusterFS volume could be backed by a block device */
+    return 0;
+}
+
 static QEMUOptionParameter qemu_gluster_create_options[] = {
     {
         .name = BLOCK_OPT_SIZE,
@@ -562,9 +644,14 @@ static BlockDriver bdrv_gluster = {
     .bdrv_create                  = qemu_gluster_create,
     .bdrv_getlength               = qemu_gluster_getlength,
     .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+    .bdrv_truncate                = qemu_gluster_truncate,
     .bdrv_aio_readv               = qemu_gluster_aio_readv,
     .bdrv_aio_writev              = qemu_gluster_aio_writev,
     .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
+#ifdef CONFIG_GLUSTERFS_DISCARD
+    .bdrv_aio_discard             = qemu_gluster_aio_discard,
+#endif
     .create_options               = qemu_gluster_create_options,
 };
 
@@ -577,9 +664,14 @@ static BlockDriver bdrv_gluster_tcp = {
     .bdrv_create                  = qemu_gluster_create,
     .bdrv_getlength               = qemu_gluster_getlength,
     .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+    .bdrv_truncate                = qemu_gluster_truncate,
     .bdrv_aio_readv               = qemu_gluster_aio_readv,
     .bdrv_aio_writev              = qemu_gluster_aio_writev,
     .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
+#ifdef CONFIG_GLUSTERFS_DISCARD
+    .bdrv_aio_discard             = qemu_gluster_aio_discard,
+#endif
     .create_options               = qemu_gluster_create_options,
 };
 
@@ -592,9 +684,14 @@ static BlockDriver bdrv_gluster_unix = {
     .bdrv_create                  = qemu_gluster_create,
     .bdrv_getlength               = qemu_gluster_getlength,
     .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+    .bdrv_truncate                = qemu_gluster_truncate,
     .bdrv_aio_readv               = qemu_gluster_aio_readv,
     .bdrv_aio_writev              = qemu_gluster_aio_writev,
     .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
+#ifdef CONFIG_GLUSTERFS_DISCARD
+    .bdrv_aio_discard             = qemu_gluster_aio_discard,
+#endif
     .create_options               = qemu_gluster_create_options,
 };
 
@@ -607,9 +704,14 @@ static BlockDriver bdrv_gluster_rdma = {
     .bdrv_create                  = qemu_gluster_create,
     .bdrv_getlength               = qemu_gluster_getlength,
     .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+    .bdrv_truncate                = qemu_gluster_truncate,
     .bdrv_aio_readv               = qemu_gluster_aio_readv,
     .bdrv_aio_writev              = qemu_gluster_aio_writev,
     .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
+#ifdef CONFIG_GLUSTERFS_DISCARD
+    .bdrv_aio_discard             = qemu_gluster_aio_discard,
+#endif
     .create_options               = qemu_gluster_create_options,
 };
 
diff --git a/block/iscsi.c b/block/iscsi.c
index c0b70b3d3..e7c1c2b53 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -27,17 +27,19 @@
 #include <poll.h>
 #include <arpa/inet.h>
 #include "qemu-common.h"
-#include "qemu-error.h"
-#include "block_int.h"
+#include "qemu/config-file.h"
+#include "qemu/error-report.h"
+#include "block/block_int.h"
 #include "trace.h"
-#include "hw/scsi-defs.h"
+#include "block/scsi.h"
+#include "qemu/iov.h"
 
 #include <iscsi/iscsi.h>
 #include <iscsi/scsi-lowlevel.h>
 
 #ifdef __linux__
 #include <scsi/sg.h>
-#include <hw/scsi-defs.h>
+#include <block/scsi.h>
 #endif
 
 typedef struct IscsiLun {
@@ -47,6 +49,7 @@ typedef struct IscsiLun {
     int block_size;
     uint64_t num_blocks;
     int events;
+    QEMUTimer *nop_timer;
 } IscsiLun;
 
 typedef struct IscsiAIOCB {
@@ -58,13 +61,18 @@ typedef struct IscsiAIOCB {
     uint8_t *buf;
     int status;
     int canceled;
-    size_t read_size;
-    size_t read_offset;
+    int retries;
+    int64_t sector_num;
+    int nb_sectors;
 #ifdef __linux__
     sg_io_hdr_t *ioh;
 #endif
 } IscsiAIOCB;
 
+#define NOP_INTERVAL 5000
+#define MAX_NOP_FAILURES 3
+#define ISCSI_CMD_RETRIES 5
+
 static void
 iscsi_bh_cb(void *p)
 {
@@ -72,6 +80,9 @@ iscsi_bh_cb(void *p)
 
     qemu_bh_delete(acb->bh);
 
+    g_free(acb->buf);
+    acb->buf = NULL;
+
     if (acb->canceled == 0) {
         acb->common.cb(acb->common.opaque, acb->status);
     }
@@ -183,6 +194,8 @@ iscsi_process_write(void *arg)
     iscsi_set_events(iscsilun);
 }
 
+static int
+iscsi_aio_writev_acb(IscsiAIOCB *acb);
 
 static void
 iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status,
@@ -193,13 +206,24 @@ iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status,
     trace_iscsi_aio_write16_cb(iscsi, status, acb, acb->canceled);
 
     g_free(acb->buf);
+    acb->buf = NULL;
 
     if (acb->canceled != 0) {
         return;
     }
 
     acb->status = 0;
-    if (status < 0) {
+    if (status != 0) {
+        if (status == SCSI_STATUS_CHECK_CONDITION
+            && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION
+            && acb->retries-- > 0) {
+            scsi_free_scsi_task(acb->task);
+            acb->task = NULL;
+            if (iscsi_aio_writev_acb(acb) == 0) {
+                iscsi_set_events(acb->iscsilun);
+                return;
+            }
+        }
         error_report("Failed to write16 data to iSCSI lun. %s",
                      iscsi_get_error(iscsi));
         acb->status = -EIO;
@@ -208,78 +232,139 @@ iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status,
     iscsi_schedule_bh(acb);
 }
 
+static int64_t sector_lun2qemu(int64_t sector, IscsiLun *iscsilun)
+{
+    return sector * iscsilun->block_size / BDRV_SECTOR_SIZE;
+}
+
 static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun)
 {
     return sector * BDRV_SECTOR_SIZE / iscsilun->block_size;
 }
 
-static BlockDriverAIOCB *
-iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num,
-                 QEMUIOVector *qiov, int nb_sectors,
-                 BlockDriverCompletionFunc *cb,
-                 void *opaque)
+static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors,
+                                      IscsiLun *iscsilun)
 {
-    IscsiLun *iscsilun = bs->opaque;
-    struct iscsi_context *iscsi = iscsilun->iscsi;
-    IscsiAIOCB *acb;
+    if ((sector_num * BDRV_SECTOR_SIZE) % iscsilun->block_size ||
+        (nb_sectors * BDRV_SECTOR_SIZE) % iscsilun->block_size) {
+            error_report("iSCSI misaligned request: "
+                         "iscsilun->block_size %u, sector_num %" PRIi64
+                         ", nb_sectors %d",
+                         iscsilun->block_size, sector_num, nb_sectors);
+            return 0;
+    }
+    return 1;
+}
+
+static int
+iscsi_aio_writev_acb(IscsiAIOCB *acb)
+{
+    struct iscsi_context *iscsi = acb->iscsilun->iscsi;
     size_t size;
     uint32_t num_sectors;
     uint64_t lba;
+#if !defined(LIBISCSI_FEATURE_IOVECTOR)
     struct iscsi_data data;
-
-    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
-    trace_iscsi_aio_writev(iscsi, sector_num, nb_sectors, opaque, acb);
-
-    acb->iscsilun = iscsilun;
-    acb->qiov     = qiov;
+#endif
+    int ret;
 
     acb->canceled   = 0;
     acb->bh         = NULL;
     acb->status     = -EINPROGRESS;
+    acb->buf        = NULL;
 
-    /* XXX we should pass the iovec to write16 to avoid the extra copy */
     /* this will allow us to get rid of 'buf' completely */
-    size = nb_sectors * BDRV_SECTOR_SIZE;
-    acb->buf = g_malloc(size);
-    qemu_iovec_to_buf(acb->qiov, 0, acb->buf, size);
+    size = acb->nb_sectors * BDRV_SECTOR_SIZE;
+
+#if !defined(LIBISCSI_FEATURE_IOVECTOR)
+    data.size = MIN(size, acb->qiov->size);
+
+    /* if the iovec only contains one buffer we can pass it directly */
+    if (acb->qiov->niov == 1) {
+        data.data = acb->qiov->iov[0].iov_base;
+    } else {
+        acb->buf = g_malloc(data.size);
+        qemu_iovec_to_buf(acb->qiov, 0, acb->buf, data.size);
+        data.data = acb->buf;
+    }
+#endif
 
     acb->task = malloc(sizeof(struct scsi_task));
     if (acb->task == NULL) {
         error_report("iSCSI: Failed to allocate task for scsi WRITE16 "
                      "command. %s", iscsi_get_error(iscsi));
-        qemu_aio_release(acb);
-        return NULL;
+        return -1;
     }
     memset(acb->task, 0, sizeof(struct scsi_task));
 
     acb->task->xfer_dir = SCSI_XFER_WRITE;
     acb->task->cdb_size = 16;
     acb->task->cdb[0] = 0x8a;
-    lba = sector_qemu2lun(sector_num, iscsilun);
+    lba = sector_qemu2lun(acb->sector_num, acb->iscsilun);
     *(uint32_t *)&acb->task->cdb[2]  = htonl(lba >> 32);
     *(uint32_t *)&acb->task->cdb[6]  = htonl(lba & 0xffffffff);
-    num_sectors = size / iscsilun->block_size;
+    num_sectors = sector_qemu2lun(acb->nb_sectors, acb->iscsilun);
     *(uint32_t *)&acb->task->cdb[10] = htonl(num_sectors);
     acb->task->expxferlen = size;
 
-    data.data = acb->buf;
-    data.size = size;
-
-    if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task,
-                                 iscsi_aio_write16_cb,
-                                 &data,
-                                 acb) != 0) {
+#if defined(LIBISCSI_FEATURE_IOVECTOR)
+    ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task,
+                                   iscsi_aio_write16_cb,
+                                   NULL,
+                                   acb);
+#else
+    ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task,
+                                   iscsi_aio_write16_cb,
+                                   &data,
+                                   acb);
+#endif
+    if (ret != 0) {
         scsi_free_scsi_task(acb->task);
         g_free(acb->buf);
+        return -1;
+    }
+
+#if defined(LIBISCSI_FEATURE_IOVECTOR)
+    scsi_task_set_iov_out(acb->task, (struct scsi_iovec*) acb->qiov->iov, acb->qiov->niov);
+#endif
+
+    return 0;
+}
+
+static BlockDriverAIOCB *
+iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num,
+                 QEMUIOVector *qiov, int nb_sectors,
+                 BlockDriverCompletionFunc *cb,
+                 void *opaque)
+{
+    IscsiLun *iscsilun = bs->opaque;
+    IscsiAIOCB *acb;
+
+    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+        return NULL;
+    }
+
+    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
+    trace_iscsi_aio_writev(iscsilun->iscsi, sector_num, nb_sectors, opaque, acb);
+
+    acb->iscsilun    = iscsilun;
+    acb->qiov        = qiov;
+    acb->nb_sectors  = nb_sectors;
+    acb->sector_num  = sector_num;
+    acb->retries     = ISCSI_CMD_RETRIES;
+
+    if (iscsi_aio_writev_acb(acb) != 0) {
         qemu_aio_release(acb);
         return NULL;
     }
 
     iscsi_set_events(iscsilun);
-
     return &acb->common;
 }
 
+static int
+iscsi_aio_readv_acb(IscsiAIOCB *acb);
+
 static void
 iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status,
                     void *command_data, void *opaque)
@@ -294,6 +379,16 @@ iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status,
 
     acb->status = 0;
     if (status != 0) {
+        if (status == SCSI_STATUS_CHECK_CONDITION
+            && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION
+            && acb->retries-- > 0) {
+            scsi_free_scsi_task(acb->task);
+            acb->task = NULL;
+            if (iscsi_aio_readv_acb(acb) == 0) {
+                iscsi_set_events(acb->iscsilun);
+                return;
+            }
+        }
         error_report("Failed to read16 data from iSCSI lun. %s",
                      iscsi_get_error(iscsi));
         acb->status = -EIO;
@@ -302,63 +397,39 @@ iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status,
     iscsi_schedule_bh(acb);
 }
 
-static BlockDriverAIOCB *
-iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num,
-                QEMUIOVector *qiov, int nb_sectors,
-                BlockDriverCompletionFunc *cb,
-                void *opaque)
+static int
+iscsi_aio_readv_acb(IscsiAIOCB *acb)
 {
-    IscsiLun *iscsilun = bs->opaque;
-    struct iscsi_context *iscsi = iscsilun->iscsi;
-    IscsiAIOCB *acb;
-    size_t qemu_read_size;
-    int i;
+    struct iscsi_context *iscsi = acb->iscsilun->iscsi;
+    size_t size;
     uint64_t lba;
     uint32_t num_sectors;
-
-    qemu_read_size = BDRV_SECTOR_SIZE * (size_t)nb_sectors;
-
-    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
-    trace_iscsi_aio_readv(iscsi, sector_num, nb_sectors, opaque, acb);
-
-    acb->iscsilun = iscsilun;
-    acb->qiov     = qiov;
+    int ret;
+#if !defined(LIBISCSI_FEATURE_IOVECTOR)
+    int i;
+#endif
 
     acb->canceled    = 0;
     acb->bh          = NULL;
     acb->status      = -EINPROGRESS;
-    acb->read_size   = qemu_read_size;
     acb->buf         = NULL;
 
-    /* If LUN blocksize is bigger than BDRV_BLOCK_SIZE a read from QEMU
-     * may be misaligned to the LUN, so we may need to read some extra
-     * data.
-     */
-    acb->read_offset = 0;
-    if (iscsilun->block_size > BDRV_SECTOR_SIZE) {
-        uint64_t bdrv_offset = BDRV_SECTOR_SIZE * sector_num;
-
-        acb->read_offset  = bdrv_offset % iscsilun->block_size;
-    }
-
-    num_sectors  = (qemu_read_size + iscsilun->block_size
-                    + acb->read_offset - 1)
-                    / iscsilun->block_size;
+    size = acb->nb_sectors * BDRV_SECTOR_SIZE;
 
     acb->task = malloc(sizeof(struct scsi_task));
     if (acb->task == NULL) {
         error_report("iSCSI: Failed to allocate task for scsi READ16 "
                      "command. %s", iscsi_get_error(iscsi));
-        qemu_aio_release(acb);
-        return NULL;
+        return -1;
     }
     memset(acb->task, 0, sizeof(struct scsi_task));
 
     acb->task->xfer_dir = SCSI_XFER_READ;
-    lba = sector_qemu2lun(sector_num, iscsilun);
-    acb->task->expxferlen = qemu_read_size;
+    acb->task->expxferlen = size;
+    lba = sector_qemu2lun(acb->sector_num, acb->iscsilun);
+    num_sectors = sector_qemu2lun(acb->nb_sectors, acb->iscsilun);
 
-    switch (iscsilun->type) {
+    switch (acb->iscsilun->type) {
     case TYPE_DISK:
         acb->task->cdb_size = 16;
         acb->task->cdb[0]  = 0x88;
@@ -374,26 +445,60 @@ iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num,
         break;
     }
 
-    if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task,
-                                 iscsi_aio_read16_cb,
-                                 NULL,
-                                 acb) != 0) {
+    ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task,
+                                   iscsi_aio_read16_cb,
+                                   NULL,
+                                   acb);
+    if (ret != 0) {
         scsi_free_scsi_task(acb->task);
-        qemu_aio_release(acb);
-        return NULL;
+        return -1;
     }
 
+#if defined(LIBISCSI_FEATURE_IOVECTOR)
+    scsi_task_set_iov_in(acb->task, (struct scsi_iovec*) acb->qiov->iov, acb->qiov->niov);
+#else
     for (i = 0; i < acb->qiov->niov; i++) {
         scsi_task_add_data_in_buffer(acb->task,
                 acb->qiov->iov[i].iov_len,
                 acb->qiov->iov[i].iov_base);
     }
+#endif
+    return 0;
+}
 
-    iscsi_set_events(iscsilun);
+static BlockDriverAIOCB *
+iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num,
+                QEMUIOVector *qiov, int nb_sectors,
+                BlockDriverCompletionFunc *cb,
+                void *opaque)
+{
+    IscsiLun *iscsilun = bs->opaque;
+    IscsiAIOCB *acb;
 
+    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+        return NULL;
+    }
+
+    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
+    trace_iscsi_aio_readv(iscsilun->iscsi, sector_num, nb_sectors, opaque, acb);
+
+    acb->nb_sectors  = nb_sectors;
+    acb->sector_num  = sector_num;
+    acb->iscsilun    = iscsilun;
+    acb->qiov        = qiov;
+    acb->retries     = ISCSI_CMD_RETRIES;
+
+    if (iscsi_aio_readv_acb(acb) != 0) {
+        qemu_aio_release(acb);
+        return NULL;
+    }
+
+    iscsi_set_events(iscsilun);
     return &acb->common;
 }
 
+static int
+iscsi_aio_flush_acb(IscsiAIOCB *acb);
 
 static void
 iscsi_synccache10_cb(struct iscsi_context *iscsi, int status,
@@ -406,7 +511,17 @@ iscsi_synccache10_cb(struct iscsi_context *iscsi, int status,
     }
 
     acb->status = 0;
-    if (status < 0) {
+    if (status != 0) {
+        if (status == SCSI_STATUS_CHECK_CONDITION
+            && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION
+            && acb->retries-- > 0) {
+            scsi_free_scsi_task(acb->task);
+            acb->task = NULL;
+            if (iscsi_aio_flush_acb(acb) == 0) {
+                iscsi_set_events(acb->iscsilun);
+                return;
+            }
+        }
         error_report("Failed to sync10 data on iSCSI lun. %s",
                      iscsi_get_error(iscsi));
         acb->status = -EIO;
@@ -415,28 +530,43 @@ iscsi_synccache10_cb(struct iscsi_context *iscsi, int status,
     iscsi_schedule_bh(acb);
 }
 
-static BlockDriverAIOCB *
-iscsi_aio_flush(BlockDriverState *bs,
-                BlockDriverCompletionFunc *cb, void *opaque)
+static int
+iscsi_aio_flush_acb(IscsiAIOCB *acb)
 {
-    IscsiLun *iscsilun = bs->opaque;
-    struct iscsi_context *iscsi = iscsilun->iscsi;
-    IscsiAIOCB *acb;
+    struct iscsi_context *iscsi = acb->iscsilun->iscsi;
 
-    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
-
-    acb->iscsilun = iscsilun;
     acb->canceled   = 0;
     acb->bh         = NULL;
     acb->status     = -EINPROGRESS;
+    acb->buf        = NULL;
 
-    acb->task = iscsi_synchronizecache10_task(iscsi, iscsilun->lun,
+    acb->task = iscsi_synchronizecache10_task(iscsi, acb->iscsilun->lun,
                                          0, 0, 0, 0,
                                          iscsi_synccache10_cb,
                                          acb);
     if (acb->task == NULL) {
         error_report("iSCSI: Failed to send synchronizecache10 command. %s",
                      iscsi_get_error(iscsi));
+        return -1;
+    }
+
+    return 0;
+}
+
+static BlockDriverAIOCB *
+iscsi_aio_flush(BlockDriverState *bs,
+                BlockDriverCompletionFunc *cb, void *opaque)
+{
+    IscsiLun *iscsilun = bs->opaque;
+
+    IscsiAIOCB *acb;
+
+    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
+
+    acb->iscsilun    = iscsilun;
+    acb->retries     = ISCSI_CMD_RETRIES;
+
+    if (iscsi_aio_flush_acb(acb) != 0) {
         qemu_aio_release(acb);
         return NULL;
     }
@@ -446,6 +576,8 @@ iscsi_aio_flush(BlockDriverState *bs,
     return &acb->common;
 }
 
+static int iscsi_aio_discard_acb(IscsiAIOCB *acb);
+
 static void
 iscsi_unmap_cb(struct iscsi_context *iscsi, int status,
                      void *command_data, void *opaque)
@@ -457,7 +589,17 @@ iscsi_unmap_cb(struct iscsi_context *iscsi, int status,
     }
 
     acb->status = 0;
-    if (status < 0) {
+    if (status != 0) {
+        if (status == SCSI_STATUS_CHECK_CONDITION
+            && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION
+            && acb->retries-- > 0) {
+            scsi_free_scsi_task(acb->task);
+            acb->task = NULL;
+            if (iscsi_aio_discard_acb(acb) == 0) {
+                iscsi_set_events(acb->iscsilun);
+                return;
+            }
+        }
         error_report("Failed to unmap data on iSCSI lun. %s",
                      iscsi_get_error(iscsi));
         acb->status = -EIO;
@@ -466,33 +608,47 @@ iscsi_unmap_cb(struct iscsi_context *iscsi, int status,
     iscsi_schedule_bh(acb);
 }
 
-static BlockDriverAIOCB *
-iscsi_aio_discard(BlockDriverState *bs,
-                  int64_t sector_num, int nb_sectors,
-                  BlockDriverCompletionFunc *cb, void *opaque)
-{
-    IscsiLun *iscsilun = bs->opaque;
-    struct iscsi_context *iscsi = iscsilun->iscsi;
-    IscsiAIOCB *acb;
+static int iscsi_aio_discard_acb(IscsiAIOCB *acb) {
+    struct iscsi_context *iscsi = acb->iscsilun->iscsi;
     struct unmap_list list[1];
 
-    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
-
-    acb->iscsilun = iscsilun;
     acb->canceled   = 0;
     acb->bh         = NULL;
     acb->status     = -EINPROGRESS;
+    acb->buf        = NULL;
 
-    list[0].lba = sector_qemu2lun(sector_num, iscsilun);
-    list[0].num = nb_sectors * BDRV_SECTOR_SIZE / iscsilun->block_size;
+    list[0].lba = sector_qemu2lun(acb->sector_num, acb->iscsilun);
+    list[0].num = acb->nb_sectors * BDRV_SECTOR_SIZE / acb->iscsilun->block_size;
 
-    acb->task = iscsi_unmap_task(iscsi, iscsilun->lun,
+    acb->task = iscsi_unmap_task(iscsi, acb->iscsilun->lun,
                                  0, 0, &list[0], 1,
                                  iscsi_unmap_cb,
                                  acb);
     if (acb->task == NULL) {
         error_report("iSCSI: Failed to send unmap command. %s",
                      iscsi_get_error(iscsi));
+        return -1;
+    }
+
+    return 0;
+}
+
+static BlockDriverAIOCB *
+iscsi_aio_discard(BlockDriverState *bs,
+                  int64_t sector_num, int nb_sectors,
+                  BlockDriverCompletionFunc *cb, void *opaque)
+{
+    IscsiLun *iscsilun = bs->opaque;
+    IscsiAIOCB *acb;
+
+    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
+
+    acb->iscsilun    = iscsilun;
+    acb->nb_sectors  = nb_sectors;
+    acb->sector_num  = sector_num;
+    acb->retries     = ISCSI_CMD_RETRIES;
+
+    if (iscsi_aio_discard_acb(acb) != 0) {
         qemu_aio_release(acb);
         return NULL;
     }
@@ -509,6 +665,9 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
 {
     IscsiAIOCB *acb = opaque;
 
+    g_free(acb->buf);
+    acb->buf = NULL;
+
     if (acb->canceled != 0) {
         return;
     }
@@ -585,14 +744,30 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
     memcpy(&acb->task->cdb[0], acb->ioh->cmdp, acb->ioh->cmd_len);
     acb->task->expxferlen = acb->ioh->dxfer_len;
 
+    data.size = 0;
     if (acb->task->xfer_dir == SCSI_XFER_WRITE) {
-        data.data = acb->ioh->dxferp;
-        data.size = acb->ioh->dxfer_len;
+        if (acb->ioh->iovec_count == 0) {
+            data.data = acb->ioh->dxferp;
+            data.size = acb->ioh->dxfer_len;
+        } else {
+#if defined(LIBISCSI_FEATURE_IOVECTOR)
+            scsi_task_set_iov_out(acb->task,
+                                 (struct scsi_iovec *) acb->ioh->dxferp,
+                                 acb->ioh->iovec_count);
+#else
+            struct iovec *iov = (struct iovec *)acb->ioh->dxferp;
+
+            acb->buf = g_malloc(acb->ioh->dxfer_len);
+            data.data = acb->buf;
+            data.size = iov_to_buf(iov, acb->ioh->iovec_count, 0,
+                                   acb->buf, acb->ioh->dxfer_len);
+#endif
+        }
     }
+
     if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task,
                                  iscsi_aio_ioctl_cb,
-                                 (acb->task->xfer_dir == SCSI_XFER_WRITE) ?
-                                     &data : NULL,
+                                 (data.size > 0) ? &data : NULL,
                                  acb) != 0) {
         scsi_free_scsi_task(acb->task);
         qemu_aio_release(acb);
@@ -601,9 +776,26 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
 
     /* tell libiscsi to read straight into the buffer we got from ioctl */
     if (acb->task->xfer_dir == SCSI_XFER_READ) {
-        scsi_task_add_data_in_buffer(acb->task,
-                                     acb->ioh->dxfer_len,
-                                     acb->ioh->dxferp);
+        if (acb->ioh->iovec_count == 0) {
+            scsi_task_add_data_in_buffer(acb->task,
+                                         acb->ioh->dxfer_len,
+                                         acb->ioh->dxferp);
+        } else {
+#if defined(LIBISCSI_FEATURE_IOVECTOR)
+            scsi_task_set_iov_in(acb->task,
+                                 (struct scsi_iovec *) acb->ioh->dxferp,
+                                 acb->ioh->iovec_count);
+#else
+            int i;
+            for (i = 0; i < acb->ioh->iovec_count; i++) {
+                struct iovec *iov = (struct iovec *)acb->ioh->dxferp;
+
+                scsi_task_add_data_in_buffer(acb->task,
+                    iov[i].iov_len,
+                    iov[i].iov_base);
+            }
+#endif
+        }
     }
 
     iscsi_set_events(iscsilun);
@@ -761,20 +953,118 @@ static char *parse_initiator_name(const char *target)
     }
 }
 
+#if defined(LIBISCSI_FEATURE_NOP_COUNTER)
+static void iscsi_nop_timed_event(void *opaque)
+{
+    IscsiLun *iscsilun = opaque;
+
+    if (iscsi_get_nops_in_flight(iscsilun->iscsi) > MAX_NOP_FAILURES) {
+        error_report("iSCSI: NOP timeout. Reconnecting...");
+        iscsi_reconnect(iscsilun->iscsi);
+    }
+
+    if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) {
+        error_report("iSCSI: failed to sent NOP-Out. Disabling NOP messages.");
+        return;
+    }
+
+    qemu_mod_timer(iscsilun->nop_timer, qemu_get_clock_ms(rt_clock) + NOP_INTERVAL);
+    iscsi_set_events(iscsilun);
+}
+#endif
+
+static int iscsi_readcapacity_sync(IscsiLun *iscsilun)
+{
+    struct scsi_task *task = NULL;
+    struct scsi_readcapacity10 *rc10 = NULL;
+    struct scsi_readcapacity16 *rc16 = NULL;
+    int ret = 0;
+    int retries = ISCSI_CMD_RETRIES; 
+
+    do {
+        if (task != NULL) {
+            scsi_free_scsi_task(task);
+            task = NULL;
+        }
+
+        switch (iscsilun->type) {
+        case TYPE_DISK:
+            task = iscsi_readcapacity16_sync(iscsilun->iscsi, iscsilun->lun);
+            if (task != NULL && task->status == SCSI_STATUS_GOOD) {
+                rc16 = scsi_datain_unmarshall(task);
+                if (rc16 == NULL) {
+                    error_report("iSCSI: Failed to unmarshall readcapacity16 data.");
+                    ret = -EINVAL;
+                } else {
+                    iscsilun->block_size = rc16->block_length;
+                    iscsilun->num_blocks = rc16->returned_lba + 1;
+                }
+            }
+            break;
+        case TYPE_ROM:
+            task = iscsi_readcapacity10_sync(iscsilun->iscsi, iscsilun->lun, 0, 0);
+            if (task != NULL && task->status == SCSI_STATUS_GOOD) {
+                rc10 = scsi_datain_unmarshall(task);
+                if (rc10 == NULL) {
+                    error_report("iSCSI: Failed to unmarshall readcapacity10 data.");
+                    ret = -EINVAL;
+                } else {
+                    iscsilun->block_size = rc10->block_size;
+                    if (rc10->lba == 0) {
+                        /* blank disk loaded */
+                        iscsilun->num_blocks = 0;
+                    } else {
+                        iscsilun->num_blocks = rc10->lba + 1;
+                    }
+                }
+            }
+            break;
+        default:
+            return 0;
+        }
+    } while (task != NULL && task->status == SCSI_STATUS_CHECK_CONDITION
+             && task->sense.key == SCSI_SENSE_UNIT_ATTENTION
+             && retries-- > 0);
+
+    if (task == NULL || task->status != SCSI_STATUS_GOOD) {
+        error_report("iSCSI: failed to send readcapacity10 command.");
+        ret = -EINVAL;
+    }
+    if (task) {
+        scsi_free_scsi_task(task);
+    }
+    return ret;
+}
+
+/* TODO Convert to fine grained options */
+static QemuOptsList runtime_opts = {
+    .name = "iscsi",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+    .desc = {
+        {
+            .name = "filename",
+            .type = QEMU_OPT_STRING,
+            .help = "URL to the iscsi image",
+        },
+        { /* end of list */ }
+    },
+};
+
 /*
  * We support iscsi url's on the form
  * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun>
  */
-static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
+static int iscsi_open(BlockDriverState *bs, QDict *options, int flags)
 {
     IscsiLun *iscsilun = bs->opaque;
     struct iscsi_context *iscsi = NULL;
     struct iscsi_url *iscsi_url = NULL;
     struct scsi_task *task = NULL;
     struct scsi_inquiry_standard *inq = NULL;
-    struct scsi_readcapacity10 *rc10 = NULL;
-    struct scsi_readcapacity16 *rc16 = NULL;
     char *initiator_name = NULL;
+    QemuOpts *opts;
+    Error *local_err = NULL;
+    const char *filename;
     int ret;
 
     if ((BDRV_SECTOR_SIZE % 512) != 0) {
@@ -784,6 +1074,18 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
         return -EINVAL;
     }
 
+    opts = qemu_opts_create_nofail(&runtime_opts);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (error_is_set(&local_err)) {
+        qerror_report_err(local_err);
+        error_free(local_err);
+        ret = -EINVAL;
+        goto out;
+    }
+
+    filename = qemu_opt_get(opts, "filename");
+
+
     iscsi_url = iscsi_parse_full_url(iscsi, filename);
     if (iscsi_url == NULL) {
         error_report("Failed to parse URL : %s", filename);
@@ -863,52 +1165,10 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
 
     iscsilun->type = inq->periperal_device_type;
 
-    scsi_free_scsi_task(task);
-
-    switch (iscsilun->type) {
-    case TYPE_DISK:
-        task = iscsi_readcapacity16_sync(iscsi, iscsilun->lun);
-        if (task == NULL || task->status != SCSI_STATUS_GOOD) {
-            error_report("iSCSI: failed to send readcapacity16 command.");
-            ret = -EINVAL;
-            goto out;
-        }
-        rc16 = scsi_datain_unmarshall(task);
-        if (rc16 == NULL) {
-            error_report("iSCSI: Failed to unmarshall readcapacity16 data.");
-            ret = -EINVAL;
-            goto out;
-        }
-        iscsilun->block_size = rc16->block_length;
-        iscsilun->num_blocks = rc16->returned_lba + 1;
-        break;
-    case TYPE_ROM:
-        task = iscsi_readcapacity10_sync(iscsi, iscsilun->lun, 0, 0);
-        if (task == NULL || task->status != SCSI_STATUS_GOOD) {
-            error_report("iSCSI: failed to send readcapacity10 command.");
-            ret = -EINVAL;
-            goto out;
-        }
-        rc10 = scsi_datain_unmarshall(task);
-        if (rc10 == NULL) {
-            error_report("iSCSI: Failed to unmarshall readcapacity10 data.");
-            ret = -EINVAL;
-            goto out;
-        }
-        iscsilun->block_size = rc10->block_size;
-        if (rc10->lba == 0) {
-            /* blank disk loaded */
-            iscsilun->num_blocks = 0;
-        } else {
-            iscsilun->num_blocks = rc10->lba + 1;
-        }
-        break;
-    default:
-        break;
+    if ((ret = iscsi_readcapacity_sync(iscsilun)) != 0) {
+        goto out;
     }
-
-    bs->total_sectors    = iscsilun->num_blocks *
-                           iscsilun->block_size / BDRV_SECTOR_SIZE ;
+    bs->total_sectors = sector_lun2qemu(iscsilun->num_blocks, iscsilun);
 
     /* Medium changer or tape. We dont have any emulation for this so this must
      * be sg ioctl compatible. We force it to be sg, otherwise qemu will try
@@ -919,9 +1179,14 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
         bs->sg = 1;
     }
 
-    ret = 0;
+#if defined(LIBISCSI_FEATURE_NOP_COUNTER)
+    /* Set up a timer for sending out iSCSI NOPs */
+    iscsilun->nop_timer = qemu_new_timer_ms(rt_clock, iscsi_nop_timed_event, iscsilun);
+    qemu_mod_timer(iscsilun->nop_timer, qemu_get_clock_ms(rt_clock) + NOP_INTERVAL);
+#endif
 
 out:
+    qemu_opts_del(opts);
     if (initiator_name != NULL) {
         g_free(initiator_name);
     }
@@ -946,16 +1211,100 @@ static void iscsi_close(BlockDriverState *bs)
     IscsiLun *iscsilun = bs->opaque;
     struct iscsi_context *iscsi = iscsilun->iscsi;
 
+    if (iscsilun->nop_timer) {
+        qemu_del_timer(iscsilun->nop_timer);
+        qemu_free_timer(iscsilun->nop_timer);
+    }
     qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL, NULL);
     iscsi_destroy_context(iscsi);
     memset(iscsilun, 0, sizeof(IscsiLun));
 }
 
+static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
+{
+    IscsiLun *iscsilun = bs->opaque;
+    int ret = 0;
+
+    if (iscsilun->type != TYPE_DISK) {
+        return -ENOTSUP;
+    }
+
+    if ((ret = iscsi_readcapacity_sync(iscsilun)) != 0) {
+        return ret;
+    }
+
+    if (offset > iscsi_getlength(bs)) {
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
 static int iscsi_has_zero_init(BlockDriverState *bs)
 {
     return 0;
 }
 
+static int iscsi_create(const char *filename, QEMUOptionParameter *options)
+{
+    int ret = 0;
+    int64_t total_size = 0;
+    BlockDriverState bs;
+    IscsiLun *iscsilun = NULL;
+    QDict *bs_options;
+
+    memset(&bs, 0, sizeof(BlockDriverState));
+
+    /* Read out options */
+    while (options && options->name) {
+        if (!strcmp(options->name, "size")) {
+            total_size = options->value.n / BDRV_SECTOR_SIZE;
+        }
+        options++;
+    }
+
+    bs.opaque = g_malloc0(sizeof(struct IscsiLun));
+    iscsilun = bs.opaque;
+
+    bs_options = qdict_new();
+    qdict_put(bs_options, "filename", qstring_from_str(filename));
+    ret = iscsi_open(&bs, bs_options, 0);
+    QDECREF(bs_options);
+
+    if (ret != 0) {
+        goto out;
+    }
+    if (iscsilun->nop_timer) {
+        qemu_del_timer(iscsilun->nop_timer);
+        qemu_free_timer(iscsilun->nop_timer);
+    }
+    if (iscsilun->type != TYPE_DISK) {
+        ret = -ENODEV;
+        goto out;
+    }
+    if (bs.total_sectors < total_size) {
+        ret = -ENOSPC;
+        goto out;
+    }
+
+    ret = 0;
+out:
+    if (iscsilun->iscsi != NULL) {
+        iscsi_destroy_context(iscsilun->iscsi);
+    }
+    g_free(bs.opaque);
+    return ret;
+}
+
+static QEMUOptionParameter iscsi_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    { NULL }
+};
+
 static BlockDriver bdrv_iscsi = {
     .format_name     = "iscsi",
     .protocol_name   = "iscsi",
@@ -963,8 +1312,11 @@ static BlockDriver bdrv_iscsi = {
     .instance_size   = sizeof(IscsiLun),
     .bdrv_file_open  = iscsi_open,
     .bdrv_close      = iscsi_close,
+    .bdrv_create     = iscsi_create,
+    .create_options  = iscsi_create_options,
 
     .bdrv_getlength  = iscsi_getlength,
+    .bdrv_truncate   = iscsi_truncate,
 
     .bdrv_aio_readv  = iscsi_aio_readv,
     .bdrv_aio_writev = iscsi_aio_writev,
@@ -979,9 +1331,36 @@ static BlockDriver bdrv_iscsi = {
 #endif
 };
 
+static QemuOptsList qemu_iscsi_opts = {
+    .name = "iscsi",
+    .head = QTAILQ_HEAD_INITIALIZER(qemu_iscsi_opts.head),
+    .desc = {
+        {
+            .name = "user",
+            .type = QEMU_OPT_STRING,
+            .help = "username for CHAP authentication to target",
+        },{
+            .name = "password",
+            .type = QEMU_OPT_STRING,
+            .help = "password for CHAP authentication to target",
+        },{
+            .name = "header-digest",
+            .type = QEMU_OPT_STRING,
+            .help = "HeaderDigest setting. "
+                    "{CRC32C|CRC32C-NONE|NONE-CRC32C|NONE}",
+        },{
+            .name = "initiator-name",
+            .type = QEMU_OPT_STRING,
+            .help = "Initiator iqn name to use when connecting",
+        },
+        { /* end of list */ }
+    },
+};
+
 static void iscsi_block_init(void)
 {
     bdrv_register(&bdrv_iscsi);
+    qemu_add_opts(&qemu_iscsi_opts);
 }
 
 block_init(iscsi_block_init);
diff --git a/block/linux-aio.c b/block/linux-aio.c
index 91ef86324..ee0f8d10c 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -8,10 +8,10 @@
  * See the COPYING file in the top-level directory.
  */
 #include "qemu-common.h"
-#include "qemu-aio.h"
-#include "qemu-queue.h"
+#include "block/aio.h"
+#include "qemu/queue.h"
 #include "block/raw-aio.h"
-#include "event_notifier.h"
+#include "qemu/event_notifier.h"
 
 #include <libaio.h>
 
diff --git a/block/mirror.c b/block/mirror.c
index d6618a4b3..bed4a7ead 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -12,20 +12,20 @@
  */
 
 #include "trace.h"
-#include "blockjob.h"
-#include "block_int.h"
+#include "block/blockjob.h"
+#include "block/block_int.h"
 #include "qemu/ratelimit.h"
+#include "qemu/bitmap.h"
 
-enum {
-    /*
-     * Size of data buffer for populating the image file.  This should be large
-     * enough to process multiple clusters in a single call, so that populating
-     * contiguous regions of the image is efficient.
-     */
-    BLOCK_SIZE = 512 * BDRV_SECTORS_PER_DIRTY_CHUNK, /* in bytes */
-};
+#define SLICE_TIME    100000000ULL /* ns */
+#define MAX_IN_FLIGHT 16
 
-#define SLICE_TIME 100000000ULL /* ns */
+/* The mirroring buffer is a list of granularity-sized chunks.
+ * Free chunks are organized in a list.
+ */
+typedef struct MirrorBuffer {
+    QSIMPLEQ_ENTRY(MirrorBuffer) next;
+} MirrorBuffer;
 
 typedef struct MirrorBlockJob {
     BlockJob common;
@@ -36,9 +36,26 @@ typedef struct MirrorBlockJob {
     bool synced;
     bool should_complete;
     int64_t sector_num;
+    int64_t granularity;
+    size_t buf_size;
+    unsigned long *cow_bitmap;
+    HBitmapIter hbi;
     uint8_t *buf;
+    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
+    int buf_free_count;
+
+    unsigned long *in_flight_bitmap;
+    int in_flight;
+    int ret;
 } MirrorBlockJob;
 
+typedef struct MirrorOp {
+    MirrorBlockJob *s;
+    QEMUIOVector qiov;
+    int64_t sector_num;
+    int nb_sectors;
+} MirrorOp;
+
 static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                             int error)
 {
@@ -52,51 +69,234 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
     }
 }
 
-static int coroutine_fn mirror_iteration(MirrorBlockJob *s,
-                                         BlockErrorAction *p_action)
+static void mirror_iteration_done(MirrorOp *op, int ret)
 {
-    BlockDriverState *source = s->common.bs;
-    BlockDriverState *target = s->target;
-    QEMUIOVector qiov;
-    int ret, nb_sectors;
-    int64_t end;
-    struct iovec iov;
+    MirrorBlockJob *s = op->s;
+    struct iovec *iov;
+    int64_t chunk_num;
+    int i, nb_chunks, sectors_per_chunk;
+
+    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);
+
+    s->in_flight--;
+    iov = op->qiov.iov;
+    for (i = 0; i < op->qiov.niov; i++) {
+        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
+        QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
+        s->buf_free_count++;
+    }
 
-    end = s->common.len >> BDRV_SECTOR_BITS;
-    s->sector_num = bdrv_get_next_dirty(source, s->sector_num);
-    nb_sectors = MIN(BDRV_SECTORS_PER_DIRTY_CHUNK, end - s->sector_num);
-    bdrv_reset_dirty(source, s->sector_num, nb_sectors);
+    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
+    chunk_num = op->sector_num / sectors_per_chunk;
+    nb_chunks = op->nb_sectors / sectors_per_chunk;
+    bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
+    if (s->cow_bitmap && ret >= 0) {
+        bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
+    }
 
-    /* Copy the dirty cluster.  */
-    iov.iov_base = s->buf;
-    iov.iov_len  = nb_sectors * 512;
-    qemu_iovec_init_external(&qiov, &iov, 1);
+    g_slice_free(MirrorOp, op);
+    qemu_coroutine_enter(s->common.co, NULL);
+}
 
-    trace_mirror_one_iteration(s, s->sector_num, nb_sectors);
-    ret = bdrv_co_readv(source, s->sector_num, nb_sectors, &qiov);
+static void mirror_write_complete(void *opaque, int ret)
+{
+    MirrorOp *op = opaque;
+    MirrorBlockJob *s = op->s;
     if (ret < 0) {
-        *p_action = mirror_error_action(s, true, -ret);
-        goto fail;
+        BlockDriverState *source = s->common.bs;
+        BlockErrorAction action;
+
+        bdrv_set_dirty(source, op->sector_num, op->nb_sectors);
+        action = mirror_error_action(s, false, -ret);
+        if (action == BDRV_ACTION_REPORT && s->ret >= 0) {
+            s->ret = ret;
+        }
     }
-    ret = bdrv_co_writev(target, s->sector_num, nb_sectors, &qiov);
+    mirror_iteration_done(op, ret);
+}
+
+static void mirror_read_complete(void *opaque, int ret)
+{
+    MirrorOp *op = opaque;
+    MirrorBlockJob *s = op->s;
     if (ret < 0) {
-        *p_action = mirror_error_action(s, false, -ret);
-        s->synced = false;
-        goto fail;
+        BlockDriverState *source = s->common.bs;
+        BlockErrorAction action;
+
+        bdrv_set_dirty(source, op->sector_num, op->nb_sectors);
+        action = mirror_error_action(s, true, -ret);
+        if (action == BDRV_ACTION_REPORT && s->ret >= 0) {
+            s->ret = ret;
+        }
+
+        mirror_iteration_done(op, ret);
+        return;
+    }
+    bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors,
+                    mirror_write_complete, op);
+}
+
+static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
+{
+    BlockDriverState *source = s->common.bs;
+    int nb_sectors, sectors_per_chunk, nb_chunks;
+    int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
+    MirrorOp *op;
+
+    s->sector_num = hbitmap_iter_next(&s->hbi);
+    if (s->sector_num < 0) {
+        bdrv_dirty_iter_init(source, &s->hbi);
+        s->sector_num = hbitmap_iter_next(&s->hbi);
+        trace_mirror_restart_iter(s, bdrv_get_dirty_count(source));
+        assert(s->sector_num >= 0);
+    }
+
+    hbitmap_next_sector = s->sector_num;
+    sector_num = s->sector_num;
+    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
+    end = s->common.len >> BDRV_SECTOR_BITS;
+
+    /* Extend the QEMUIOVector to include all adjacent blocks that will
+     * be copied in this operation.
+     *
+     * We have to do this if we have no backing file yet in the destination,
+     * and the cluster size is very large.  Then we need to do COW ourselves.
+     * The first time a cluster is copied, copy it entirely.  Note that,
+     * because both the granularity and the cluster size are powers of two,
+     * the number of sectors to copy cannot exceed one cluster.
+     *
+     * We also want to extend the QEMUIOVector to include more adjacent
+     * dirty blocks if possible, to limit the number of I/O operations and
+     * run efficiently even with a small granularity.
+     */
+    nb_chunks = 0;
+    nb_sectors = 0;
+    next_sector = sector_num;
+    next_chunk = sector_num / sectors_per_chunk;
+
+    /* Wait for I/O to this cluster (from a previous iteration) to be done.  */
+    while (test_bit(next_chunk, s->in_flight_bitmap)) {
+        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
+        qemu_coroutine_yield();
+    }
+
+    do {
+        int added_sectors, added_chunks;
+
+        if (!bdrv_get_dirty(source, next_sector) ||
+            test_bit(next_chunk, s->in_flight_bitmap)) {
+            assert(nb_sectors > 0);
+            break;
+        }
+
+        added_sectors = sectors_per_chunk;
+        if (s->cow_bitmap && !test_bit(next_chunk, s->cow_bitmap)) {
+            bdrv_round_to_clusters(s->target,
+                                   next_sector, added_sectors,
+                                   &next_sector, &added_sectors);
+
+            /* On the first iteration, the rounding may make us copy
+             * sectors before the first dirty one.
+             */
+            if (next_sector < sector_num) {
+                assert(nb_sectors == 0);
+                sector_num = next_sector;
+                next_chunk = next_sector / sectors_per_chunk;
+            }
+        }
+
+        added_sectors = MIN(added_sectors, end - (sector_num + nb_sectors));
+        added_chunks = (added_sectors + sectors_per_chunk - 1) / sectors_per_chunk;
+
+        /* When doing COW, it may happen that there is not enough space for
+         * a full cluster.  Wait if that is the case.
+         */
+        while (nb_chunks == 0 && s->buf_free_count < added_chunks) {
+            trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight);
+            qemu_coroutine_yield();
+        }
+        if (s->buf_free_count < nb_chunks + added_chunks) {
+            trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight);
+            break;
+        }
+
+        /* We have enough free space to copy these sectors.  */
+        bitmap_set(s->in_flight_bitmap, next_chunk, added_chunks);
+
+        nb_sectors += added_sectors;
+        nb_chunks += added_chunks;
+        next_sector += added_sectors;
+        next_chunk += added_chunks;
+    } while (next_sector < end);
+
+    /* Allocate a MirrorOp that is used as an AIO callback.  */
+    op = g_slice_new(MirrorOp);
+    op->s = s;
+    op->sector_num = sector_num;
+    op->nb_sectors = nb_sectors;
+
+    /* Now make a QEMUIOVector taking enough granularity-sized chunks
+     * from s->buf_free.
+     */
+    qemu_iovec_init(&op->qiov, nb_chunks);
+    next_sector = sector_num;
+    while (nb_chunks-- > 0) {
+        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
+        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
+        s->buf_free_count--;
+        qemu_iovec_add(&op->qiov, buf, s->granularity);
+
+        /* Advance the HBitmapIter in parallel, so that we do not examine
+         * the same sector twice.
+         */
+        if (next_sector > hbitmap_next_sector && bdrv_get_dirty(source, next_sector)) {
+            hbitmap_next_sector = hbitmap_iter_next(&s->hbi);
+        }
+
+        next_sector += sectors_per_chunk;
     }
-    return 0;
 
-fail:
-    /* Try again later.  */
-    bdrv_set_dirty(source, s->sector_num, nb_sectors);
-    return ret;
+    bdrv_reset_dirty(source, sector_num, nb_sectors);
+
+    /* Copy the dirty cluster.  */
+    s->in_flight++;
+    trace_mirror_one_iteration(s, sector_num, nb_sectors);
+    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
+                   mirror_read_complete, op);
+}
+
+static void mirror_free_init(MirrorBlockJob *s)
+{
+    int granularity = s->granularity;
+    size_t buf_size = s->buf_size;
+    uint8_t *buf = s->buf;
+
+    assert(s->buf_free_count == 0);
+    QSIMPLEQ_INIT(&s->buf_free);
+    while (buf_size != 0) {
+        MirrorBuffer *cur = (MirrorBuffer *)buf;
+        QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next);
+        s->buf_free_count++;
+        buf_size -= granularity;
+        buf += granularity;
+    }
+}
+
+static void mirror_drain(MirrorBlockJob *s)
+{
+    while (s->in_flight > 0) {
+        qemu_coroutine_yield();
+    }
 }
 
 static void coroutine_fn mirror_run(void *opaque)
 {
     MirrorBlockJob *s = opaque;
     BlockDriverState *bs = s->common.bs;
-    int64_t sector_num, end;
+    int64_t sector_num, end, sectors_per_chunk, length;
+    uint64_t last_pause_ns;
+    BlockDriverInfo bdi;
+    char backing_filename[1024];
     int ret = 0;
     int n;
 
@@ -105,20 +305,39 @@ static void coroutine_fn mirror_run(void *opaque)
     }
 
     s->common.len = bdrv_getlength(bs);
-    if (s->common.len < 0) {
+    if (s->common.len <= 0) {
         block_job_completed(&s->common, s->common.len);
         return;
     }
 
+    length = (bdrv_getlength(bs) + s->granularity - 1) / s->granularity;
+    s->in_flight_bitmap = bitmap_new(length);
+
+    /* If we have no backing file yet in the destination, we cannot let
+     * the destination do COW.  Instead, we copy sectors around the
+     * dirty data if needed.  We need a bitmap to do that.
+     */
+    bdrv_get_backing_filename(s->target, backing_filename,
+                              sizeof(backing_filename));
+    if (backing_filename[0] && !s->target->backing_hd) {
+        bdrv_get_info(s->target, &bdi);
+        if (s->granularity < bdi.cluster_size) {
+            s->buf_size = MAX(s->buf_size, bdi.cluster_size);
+            s->cow_bitmap = bitmap_new(length);
+        }
+    }
+
     end = s->common.len >> BDRV_SECTOR_BITS;
-    s->buf = qemu_blockalign(bs, BLOCK_SIZE);
+    s->buf = qemu_blockalign(bs, s->buf_size);
+    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
+    mirror_free_init(s);
 
     if (s->mode != MIRROR_SYNC_MODE_NONE) {
         /* First part, loop on the sectors and initialize the dirty bitmap.  */
         BlockDriverState *base;
         base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd;
         for (sector_num = 0; sector_num < end; ) {
-            int64_t next = (sector_num | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1;
+            int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1;
             ret = bdrv_co_is_allocated_above(bs, base,
                                              sector_num, next - sector_num, &n);
 
@@ -136,24 +355,40 @@ static void coroutine_fn mirror_run(void *opaque)
         }
     }
 
-    s->sector_num = -1;
+    bdrv_dirty_iter_init(bs, &s->hbi);
+    last_pause_ns = qemu_get_clock_ns(rt_clock);
     for (;;) {
         uint64_t delay_ns;
         int64_t cnt;
         bool should_complete;
 
+        if (s->ret < 0) {
+            ret = s->ret;
+            goto immediate_exit;
+        }
+
         cnt = bdrv_get_dirty_count(bs);
-        if (cnt != 0) {
-            BlockErrorAction action = BDRV_ACTION_REPORT;
-            ret = mirror_iteration(s, &action);
-            if (ret < 0 && action == BDRV_ACTION_REPORT) {
-                goto immediate_exit;
+
+        /* Note that even when no rate limit is applied we need to yield
+         * periodically with no pending I/O so that qemu_aio_flush() returns.
+         * We do so every SLICE_TIME nanoseconds, or when there is an error,
+         * or when the source is clean, whichever comes first.
+         */
+        if (qemu_get_clock_ns(rt_clock) - last_pause_ns < SLICE_TIME &&
+            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
+            if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
+                (cnt == 0 && s->in_flight > 0)) {
+                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
+                qemu_coroutine_yield();
+                continue;
+            } else if (cnt != 0) {
+                mirror_iteration(s);
+                continue;
             }
-            cnt = bdrv_get_dirty_count(bs);
         }
 
         should_complete = false;
-        if (cnt == 0) {
+        if (s->in_flight == 0 && cnt == 0) {
             trace_mirror_before_flush(s);
             ret = bdrv_flush(s->target);
             if (ret < 0) {
@@ -196,23 +431,20 @@ static void coroutine_fn mirror_run(void *opaque)
         trace_mirror_before_sleep(s, cnt, s->synced);
         if (!s->synced) {
             /* Publish progress */
-            s->common.offset = end * BDRV_SECTOR_SIZE - cnt * BLOCK_SIZE;
+            s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE;
 
             if (s->common.speed) {
-                delay_ns = ratelimit_calculate_delay(&s->limit, BDRV_SECTORS_PER_DIRTY_CHUNK);
+                delay_ns = ratelimit_calculate_delay(&s->limit, sectors_per_chunk);
             } else {
                 delay_ns = 0;
             }
 
-            /* Note that even when no rate limit is applied we need to yield
-             * with no pending I/O here so that qemu_aio_flush() returns.
-             */
             block_job_sleep_ns(&s->common, rt_clock, delay_ns);
             if (block_job_is_cancelled(&s->common)) {
                 break;
             }
         } else if (!should_complete) {
-            delay_ns = (cnt == 0 ? SLICE_TIME : 0);
+            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
             block_job_sleep_ns(&s->common, rt_clock, delay_ns);
         } else if (cnt == 0) {
             /* The two disks are in sync.  Exit and report successful
@@ -222,11 +454,24 @@ static void coroutine_fn mirror_run(void *opaque)
             s->common.cancelled = false;
             break;
         }
+        last_pause_ns = qemu_get_clock_ns(rt_clock);
     }
 
 immediate_exit:
-    g_free(s->buf);
-    bdrv_set_dirty_tracking(bs, false);
+    if (s->in_flight > 0) {
+        /* We get here only if something went wrong.  Either the job failed,
+         * or it was cancelled prematurely so that we do not guarantee that
+         * the target is a copy of the source.
+         */
+        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
+        mirror_drain(s);
+    }
+
+    assert(s->in_flight == 0);
+    qemu_vfree(s->buf);
+    g_free(s->cow_bitmap);
+    g_free(s->in_flight_bitmap);
+    bdrv_set_dirty_tracking(bs, 0);
     bdrv_iostatus_disable(s->target);
     if (s->should_complete && ret == 0) {
         if (bdrv_get_flags(s->target) != bdrv_get_flags(s->common.bs)) {
@@ -262,12 +507,12 @@ static void mirror_complete(BlockJob *job, Error **errp)
     MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
     int ret;
 
-    ret = bdrv_open_backing_file(s->target);
+    ret = bdrv_open_backing_file(s->target, NULL);
     if (ret < 0) {
         char backing_filename[PATH_MAX];
         bdrv_get_full_backing_filename(s->target, backing_filename,
                                        sizeof(backing_filename));
-        error_set(errp, QERR_OPEN_FILE_FAILED, backing_filename);
+        error_setg_file_open(errp, -ret, backing_filename);
         return;
     }
     if (!s->synced) {
@@ -279,7 +524,7 @@ static void mirror_complete(BlockJob *job, Error **errp)
     block_job_resume(job);
 }
 
-static BlockJobType mirror_job_type = {
+static const BlockJobType mirror_job_type = {
     .instance_size = sizeof(MirrorBlockJob),
     .job_type      = "mirror",
     .set_speed     = mirror_set_speed,
@@ -288,14 +533,28 @@ static BlockJobType mirror_job_type = {
 };
 
 void mirror_start(BlockDriverState *bs, BlockDriverState *target,
-                  int64_t speed, MirrorSyncMode mode,
-                  BlockdevOnError on_source_error,
+                  int64_t speed, int64_t granularity, int64_t buf_size,
+                  MirrorSyncMode mode, BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
                   BlockDriverCompletionFunc *cb,
                   void *opaque, Error **errp)
 {
     MirrorBlockJob *s;
 
+    if (granularity == 0) {
+        /* Choose the default granularity based on the target file's cluster
+         * size, clamped between 4k and 64k.  */
+        BlockDriverInfo bdi;
+        if (bdrv_get_info(target, &bdi) >= 0 && bdi.cluster_size != 0) {
+            granularity = MAX(4096, bdi.cluster_size);
+            granularity = MIN(65536, granularity);
+        } else {
+            granularity = 65536;
+        }
+    }
+
+    assert ((granularity & (granularity - 1)) == 0);
+
     if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
          on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
         !bdrv_iostatus_is_enabled(bs)) {
@@ -312,7 +571,10 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
     s->on_target_error = on_target_error;
     s->target = target;
     s->mode = mode;
-    bdrv_set_dirty_tracking(bs, true);
+    s->granularity = granularity;
+    s->buf_size = MAX(buf_size, granularity);
+
+    bdrv_set_dirty_tracking(bs, granularity);
     bdrv_set_enable_write_cache(s->target, true);
     bdrv_set_on_error(s->target, on_target_error, on_target_error);
     bdrv_iostatus_enable(s->target);
diff --git a/block/nbd.c b/block/nbd.c
index e87c24817..9c480b8f2 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -27,11 +27,13 @@
  */
 
 #include "qemu-common.h"
-#include "nbd.h"
-#include "uri.h"
-#include "block_int.h"
-#include "module.h"
-#include "qemu_socket.h"
+#include "block/nbd.h"
+#include "qemu/uri.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "qemu/sockets.h"
+#include "qapi/qmp/qjson.h"
+#include "qapi/qmp/qint.h"
 
 #include <sys/types.h>
 #include <unistd.h>
@@ -65,17 +67,19 @@ typedef struct BDRVNBDState {
     Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
     struct nbd_reply reply;
 
-    int is_unix;
-    char *host_spec;
+    bool is_unix;
+    QemuOpts *socket_opts;
+
     char *export_name; /* An NBD server may export several devices */
 } BDRVNBDState;
 
-static int nbd_parse_uri(BDRVNBDState *s, const char *filename)
+static int nbd_parse_uri(const char *filename, QDict *options)
 {
     URI *uri;
     const char *p;
     QueryParams *qp = NULL;
     int ret = 0;
+    bool is_unix;
 
     uri = uri_parse(filename);
     if (!uri) {
@@ -84,11 +88,11 @@ static int nbd_parse_uri(BDRVNBDState *s, const char *filename)
 
     /* transport */
     if (!strcmp(uri->scheme, "nbd")) {
-        s->is_unix = false;
+        is_unix = false;
     } else if (!strcmp(uri->scheme, "nbd+tcp")) {
-        s->is_unix = false;
+        is_unix = false;
     } else if (!strcmp(uri->scheme, "nbd+unix")) {
-        s->is_unix = true;
+        is_unix = true;
     } else {
         ret = -EINVAL;
         goto out;
@@ -97,32 +101,44 @@ static int nbd_parse_uri(BDRVNBDState *s, const char *filename)
     p = uri->path ? uri->path : "/";
     p += strspn(p, "/");
     if (p[0]) {
-        s->export_name = g_strdup(p);
+        qdict_put(options, "export", qstring_from_str(p));
     }
 
     qp = query_params_parse(uri->query);
-    if (qp->n > 1 || (s->is_unix && !qp->n) || (!s->is_unix && qp->n)) {
+    if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) {
         ret = -EINVAL;
         goto out;
     }
 
-    if (s->is_unix) {
+    if (is_unix) {
         /* nbd+unix:///export?socket=path */
         if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) {
             ret = -EINVAL;
             goto out;
         }
-        s->host_spec = g_strdup(qp->p[0].value);
+        qdict_put(options, "path", qstring_from_str(qp->p[0].value));
     } else {
-        /* nbd[+tcp]://host:port/export */
+        QString *host;
+        /* nbd[+tcp]://host[:port]/export */
         if (!uri->server) {
             ret = -EINVAL;
             goto out;
         }
-        if (!uri->port) {
-            uri->port = NBD_DEFAULT_PORT;
+
+        /* strip braces from literal IPv6 address */
+        if (uri->server[0] == '[') {
+            host = qstring_from_substr(uri->server, 1,
+                                       strlen(uri->server) - 2);
+        } else {
+            host = qstring_from_str(uri->server);
+        }
+
+        qdict_put(options, "host", host);
+        if (uri->port) {
+            char* port_str = g_strdup_printf("%d", uri->port);
+            qdict_put(options, "port", qstring_from_str(port_str));
+            g_free(port_str);
         }
-        s->host_spec = g_strdup_printf("%s:%d", uri->server, uri->port);
     }
 
 out:
@@ -133,16 +149,29 @@ out:
     return ret;
 }
 
-static int nbd_config(BDRVNBDState *s, const char *filename)
+static void nbd_parse_filename(const char *filename, QDict *options,
+                               Error **errp)
 {
     char *file;
     char *export_name;
     const char *host_spec;
     const char *unixpath;
-    int err = -EINVAL;
+
+    if (qdict_haskey(options, "host")
+        || qdict_haskey(options, "port")
+        || qdict_haskey(options, "path"))
+    {
+        error_setg(errp, "host/port/path and a file name may not be specified "
+                         "at the same time");
+        return;
+    }
 
     if (strstr(filename, "://")) {
-        return nbd_parse_uri(s, filename);
+        int ret = nbd_parse_uri(filename, options);
+        if (ret < 0) {
+            error_setg(errp, "No valid URL specified");
+        }
+        return;
     }
 
     file = g_strdup(filename);
@@ -154,34 +183,79 @@ static int nbd_config(BDRVNBDState *s, const char *filename)
         }
         export_name[0] = 0; /* truncate 'file' */
         export_name += strlen(EN_OPTSTR);
-        s->export_name = g_strdup(export_name);
+
+        qdict_put(options, "export", qstring_from_str(export_name));
     }
 
     /* extract the host_spec - fail if it's not nbd:... */
     if (!strstart(file, "nbd:", &host_spec)) {
+        error_setg(errp, "File name string for NBD must start with 'nbd:'");
+        goto out;
+    }
+
+    if (!*host_spec) {
         goto out;
     }
 
     /* are we a UNIX or TCP socket? */
     if (strstart(host_spec, "unix:", &unixpath)) {
-        s->is_unix = true;
-        s->host_spec = g_strdup(unixpath);
+        qdict_put(options, "path", qstring_from_str(unixpath));
     } else {
-        s->is_unix = false;
-        s->host_spec = g_strdup(host_spec);
-    }
+        InetSocketAddress *addr = NULL;
 
-    err = 0;
+        addr = inet_parse(host_spec, errp);
+        if (error_is_set(errp)) {
+            goto out;
+        }
+
+        qdict_put(options, "host", qstring_from_str(addr->host));
+        qdict_put(options, "port", qstring_from_str(addr->port));
+        qapi_free_InetSocketAddress(addr);
+    }
 
 out:
     g_free(file);
-    if (err != 0) {
-        g_free(s->export_name);
-        g_free(s->host_spec);
+}
+
+static int nbd_config(BDRVNBDState *s, QDict *options)
+{
+    Error *local_err = NULL;
+
+    if (qdict_haskey(options, "path")) {
+        if (qdict_haskey(options, "host")) {
+            qerror_report(ERROR_CLASS_GENERIC_ERROR, "path and host may not "
+                          "be used at the same time.");
+            return -EINVAL;
+        }
+        s->is_unix = true;
+    } else if (qdict_haskey(options, "host")) {
+        s->is_unix = false;
+    } else {
+        return -EINVAL;
+    }
+
+    s->socket_opts = qemu_opts_create_nofail(&socket_optslist);
+
+    qemu_opts_absorb_qdict(s->socket_opts, options, &local_err);
+    if (error_is_set(&local_err)) {
+        qerror_report_err(local_err);
+        error_free(local_err);
+        return -EINVAL;
+    }
+
+    if (!qemu_opt_get(s->socket_opts, "port")) {
+        qemu_opt_set_number(s->socket_opts, "port", NBD_DEFAULT_PORT);
     }
-    return err;
+
+    s->export_name = g_strdup(qdict_get_try_str(options, "export"));
+    if (s->export_name) {
+        qdict_del(options, "export");
+    }
+
+    return 0;
 }
 
+
 static void nbd_coroutine_start(BDRVNBDState *s, struct nbd_request *request)
 {
     int i;
@@ -269,13 +343,23 @@ static int nbd_co_send_request(BDRVNBDState *s, struct nbd_request *request,
     s->send_coroutine = qemu_coroutine_self();
     qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write,
                             nbd_have_request, s);
-    rc = nbd_send_request(s->sock, request);
-    if (rc >= 0 && qiov) {
-        ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov,
-                            offset, request->len);
-        if (ret != request->len) {
-            return -EIO;
+    if (qiov) {
+        if (!s->is_unix) {
+            socket_set_cork(s->sock, 1);
         }
+        rc = nbd_send_request(s->sock, request);
+        if (rc >= 0) {
+            ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov,
+                                offset, request->len);
+            if (ret != request->len) {
+                rc = -EIO;
+            }
+        }
+        if (!s->is_unix) {
+            socket_set_cork(s->sock, 0);
+        }
+    } else {
+        rc = nbd_send_request(s->sock, request);
     }
     qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL,
                             nbd_have_request, s);
@@ -328,9 +412,12 @@ static int nbd_establish_connection(BlockDriverState *bs)
     size_t blocksize;
 
     if (s->is_unix) {
-        sock = unix_socket_outgoing(s->host_spec);
+        sock = unix_socket_outgoing(qemu_opt_get(s->socket_opts, "path"));
     } else {
-        sock = tcp_socket_outgoing_spec(s->host_spec);
+        sock = tcp_socket_outgoing_opts(s->socket_opts);
+        if (sock >= 0) {
+            socket_set_nodelay(sock);
+        }
     }
 
     /* Failed to establish connection */
@@ -350,7 +437,7 @@ static int nbd_establish_connection(BlockDriverState *bs)
 
     /* Now that we're connected, set the socket to be non-blocking and
      * kick the reply mechanism.  */
-    socket_set_nonblock(sock);
+    qemu_set_nonblock(sock);
     qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL,
                             nbd_have_request, s);
 
@@ -376,7 +463,7 @@ static void nbd_teardown_connection(BlockDriverState *bs)
     closesocket(s->sock);
 }
 
-static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
+static int nbd_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVNBDState *s = bs->opaque;
     int result;
@@ -385,7 +472,7 @@ static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
     qemu_co_mutex_init(&s->free_sema);
 
     /* Pop the config into our state object. Exit if invalid. */
-    result = nbd_config(s, filename);
+    result = nbd_config(s, options);
     if (result != 0) {
         return result;
     }
@@ -531,7 +618,7 @@ static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
         return 0;
     }
     request.type = NBD_CMD_TRIM;
-    request.from = sector_num * 512;;
+    request.from = sector_num * 512;
     request.len = nb_sectors * 512;
 
     nbd_coroutine_start(s, &request);
@@ -549,7 +636,7 @@ static void nbd_close(BlockDriverState *bs)
 {
     BDRVNBDState *s = bs->opaque;
     g_free(s->export_name);
-    g_free(s->host_spec);
+    qemu_opts_del(s->socket_opts);
 
     nbd_teardown_connection(bs);
 }
@@ -565,6 +652,7 @@ static BlockDriver bdrv_nbd = {
     .format_name         = "nbd",
     .protocol_name       = "nbd",
     .instance_size       = sizeof(BDRVNBDState),
+    .bdrv_parse_filename = nbd_parse_filename,
     .bdrv_file_open      = nbd_open,
     .bdrv_co_readv       = nbd_co_readv,
     .bdrv_co_writev      = nbd_co_writev,
@@ -578,6 +666,7 @@ static BlockDriver bdrv_nbd_tcp = {
     .format_name         = "nbd",
     .protocol_name       = "nbd+tcp",
     .instance_size       = sizeof(BDRVNBDState),
+    .bdrv_parse_filename = nbd_parse_filename,
     .bdrv_file_open      = nbd_open,
     .bdrv_co_readv       = nbd_co_readv,
     .bdrv_co_writev      = nbd_co_writev,
@@ -591,6 +680,7 @@ static BlockDriver bdrv_nbd_unix = {
     .format_name         = "nbd",
     .protocol_name       = "nbd+unix",
     .instance_size       = sizeof(BDRVNBDState),
+    .bdrv_parse_filename = nbd_parse_filename,
     .bdrv_file_open      = nbd_open,
     .bdrv_co_readv       = nbd_co_readv,
     .bdrv_co_writev      = nbd_co_writev,
diff --git a/block/parallels.c b/block/parallels.c
index d30f0ecf7..18b3ac0b2 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -24,8 +24,8 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 
 /**************************************************************/
 
@@ -68,19 +68,23 @@ static int parallels_probe(const uint8_t *buf, int buf_size, const char *filenam
     return 0;
 }
 
-static int parallels_open(BlockDriverState *bs, int flags)
+static int parallels_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVParallelsState *s = bs->opaque;
     int i;
     struct parallels_header ph;
+    int ret;
 
     bs->read_only = 1; // no write support yet
 
-    if (bdrv_pread(bs->file, 0, &ph, sizeof(ph)) != sizeof(ph))
+    ret = bdrv_pread(bs->file, 0, &ph, sizeof(ph));
+    if (ret < 0) {
         goto fail;
+    }
 
     if (memcmp(ph.magic, HEADER_MAGIC, 16) ||
-	(le32_to_cpu(ph.version) != HEADER_VERSION)) {
+        (le32_to_cpu(ph.version) != HEADER_VERSION)) {
+        ret = -EMEDIUMTYPE;
         goto fail;
     }
 
@@ -90,18 +94,21 @@ static int parallels_open(BlockDriverState *bs, int flags)
 
     s->catalog_size = le32_to_cpu(ph.catalog_entries);
     s->catalog_bitmap = g_malloc(s->catalog_size * 4);
-    if (bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4) !=
-	s->catalog_size * 4)
-	goto fail;
+
+    ret = bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4);
+    if (ret < 0) {
+        goto fail;
+    }
+
     for (i = 0; i < s->catalog_size; i++)
 	le32_to_cpus(&s->catalog_bitmap[i]);
 
     qemu_co_mutex_init(&s->lock);
     return 0;
+
 fail:
-    if (s->catalog_bitmap)
-	g_free(s->catalog_bitmap);
-    return -1;
+    g_free(s->catalog_bitmap);
+    return ret;
 }
 
 static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
diff --git a/block/qapi.c b/block/qapi.c
new file mode 100644
index 000000000..a4bc4113b
--- /dev/null
+++ b/block/qapi.c
@@ -0,0 +1,470 @@
+/*
+ * Block layer qmp and info dump related functions
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "block/qapi.h"
+#include "block/block_int.h"
+#include "qmp-commands.h"
+
+/*
+ * Returns 0 on success, with *p_list either set to describe snapshot
+ * information, or NULL because there are no snapshots.  Returns -errno on
+ * error, with *p_list untouched.
+ */
+int bdrv_query_snapshot_info_list(BlockDriverState *bs,
+                                  SnapshotInfoList **p_list,
+                                  Error **errp)
+{
+    int i, sn_count;
+    QEMUSnapshotInfo *sn_tab = NULL;
+    SnapshotInfoList *info_list, *cur_item = NULL, *head = NULL;
+    SnapshotInfo *info;
+
+    sn_count = bdrv_snapshot_list(bs, &sn_tab);
+    if (sn_count < 0) {
+        const char *dev = bdrv_get_device_name(bs);
+        switch (sn_count) {
+        case -ENOMEDIUM:
+            error_setg(errp, "Device '%s' is not inserted", dev);
+            break;
+        case -ENOTSUP:
+            error_setg(errp,
+                       "Device '%s' does not support internal snapshots",
+                       dev);
+            break;
+        default:
+            error_setg_errno(errp, -sn_count,
+                             "Can't list snapshots of device '%s'", dev);
+            break;
+        }
+        return sn_count;
+    }
+
+    for (i = 0; i < sn_count; i++) {
+        info = g_new0(SnapshotInfo, 1);
+        info->id            = g_strdup(sn_tab[i].id_str);
+        info->name          = g_strdup(sn_tab[i].name);
+        info->vm_state_size = sn_tab[i].vm_state_size;
+        info->date_sec      = sn_tab[i].date_sec;
+        info->date_nsec     = sn_tab[i].date_nsec;
+        info->vm_clock_sec  = sn_tab[i].vm_clock_nsec / 1000000000;
+        info->vm_clock_nsec = sn_tab[i].vm_clock_nsec % 1000000000;
+
+        info_list = g_new0(SnapshotInfoList, 1);
+        info_list->value = info;
+
+        /* XXX: waiting for the qapi to support qemu-queue.h types */
+        if (!cur_item) {
+            head = cur_item = info_list;
+        } else {
+            cur_item->next = info_list;
+            cur_item = info_list;
+        }
+
+    }
+
+    g_free(sn_tab);
+    *p_list = head;
+    return 0;
+}
+
+/**
+ * bdrv_query_image_info:
+ * @bs: block device to examine
+ * @p_info: location to store image information
+ * @errp: location to store error information
+ *
+ * Store "flat" image information in @p_info.
+ *
+ * "Flat" means it does *not* query backing image information,
+ * i.e. (*pinfo)->has_backing_image will be set to false and
+ * (*pinfo)->backing_image to NULL even when the image does in fact have
+ * a backing image.
+ *
+ * @p_info will be set only on success. On error, store error in @errp.
+ */
+void bdrv_query_image_info(BlockDriverState *bs,
+                           ImageInfo **p_info,
+                           Error **errp)
+{
+    uint64_t total_sectors;
+    const char *backing_filename;
+    char backing_filename2[1024];
+    BlockDriverInfo bdi;
+    int ret;
+    Error *err = NULL;
+    ImageInfo *info = g_new0(ImageInfo, 1);
+
+    bdrv_get_geometry(bs, &total_sectors);
+
+    info->filename        = g_strdup(bs->filename);
+    info->format          = g_strdup(bdrv_get_format_name(bs));
+    info->virtual_size    = total_sectors * 512;
+    info->actual_size     = bdrv_get_allocated_file_size(bs);
+    info->has_actual_size = info->actual_size >= 0;
+    if (bdrv_is_encrypted(bs)) {
+        info->encrypted = true;
+        info->has_encrypted = true;
+    }
+    if (bdrv_get_info(bs, &bdi) >= 0) {
+        if (bdi.cluster_size != 0) {
+            info->cluster_size = bdi.cluster_size;
+            info->has_cluster_size = true;
+        }
+        info->dirty_flag = bdi.is_dirty;
+        info->has_dirty_flag = true;
+    }
+    backing_filename = bs->backing_file;
+    if (backing_filename[0] != '\0') {
+        info->backing_filename = g_strdup(backing_filename);
+        info->has_backing_filename = true;
+        bdrv_get_full_backing_filename(bs, backing_filename2,
+                                       sizeof(backing_filename2));
+
+        if (strcmp(backing_filename, backing_filename2) != 0) {
+            info->full_backing_filename =
+                        g_strdup(backing_filename2);
+            info->has_full_backing_filename = true;
+        }
+
+        if (bs->backing_format[0]) {
+            info->backing_filename_format = g_strdup(bs->backing_format);
+            info->has_backing_filename_format = true;
+        }
+    }
+
+    ret = bdrv_query_snapshot_info_list(bs, &info->snapshots, &err);
+    switch (ret) {
+    case 0:
+        if (info->snapshots) {
+            info->has_snapshots = true;
+        }
+        break;
+    /* recoverable error */
+    case -ENOMEDIUM:
+    case -ENOTSUP:
+        error_free(err);
+        break;
+    default:
+        error_propagate(errp, err);
+        qapi_free_ImageInfo(info);
+        return;
+    }
+
+    *p_info = info;
+}
+
+/* @p_info will be set only on success. */
+void bdrv_query_info(BlockDriverState *bs,
+                     BlockInfo **p_info,
+                     Error **errp)
+{
+    BlockInfo *info = g_malloc0(sizeof(*info));
+    BlockDriverState *bs0;
+    ImageInfo **p_image_info;
+    Error *local_err = NULL;
+    info->device = g_strdup(bs->device_name);
+    info->type = g_strdup("unknown");
+    info->locked = bdrv_dev_is_medium_locked(bs);
+    info->removable = bdrv_dev_has_removable_media(bs);
+
+    if (bdrv_dev_has_removable_media(bs)) {
+        info->has_tray_open = true;
+        info->tray_open = bdrv_dev_is_tray_open(bs);
+    }
+
+    if (bdrv_iostatus_is_enabled(bs)) {
+        info->has_io_status = true;
+        info->io_status = bs->iostatus;
+    }
+
+    if (bs->dirty_bitmap) {
+        info->has_dirty = true;
+        info->dirty = g_malloc0(sizeof(*info->dirty));
+        info->dirty->count = bdrv_get_dirty_count(bs) * BDRV_SECTOR_SIZE;
+        info->dirty->granularity =
+         ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bs->dirty_bitmap));
+    }
+
+    if (bs->drv) {
+        info->has_inserted = true;
+        info->inserted = g_malloc0(sizeof(*info->inserted));
+        info->inserted->file = g_strdup(bs->filename);
+        info->inserted->ro = bs->read_only;
+        info->inserted->drv = g_strdup(bs->drv->format_name);
+        info->inserted->encrypted = bs->encrypted;
+        info->inserted->encryption_key_missing = bdrv_key_required(bs);
+
+        if (bs->backing_file[0]) {
+            info->inserted->has_backing_file = true;
+            info->inserted->backing_file = g_strdup(bs->backing_file);
+        }
+
+        info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs);
+
+        if (bs->io_limits_enabled) {
+            info->inserted->bps =
+                           bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
+            info->inserted->bps_rd =
+                           bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
+            info->inserted->bps_wr =
+                           bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
+            info->inserted->iops =
+                           bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
+            info->inserted->iops_rd =
+                           bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
+            info->inserted->iops_wr =
+                           bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
+        }
+
+        bs0 = bs;
+        p_image_info = &info->inserted->image;
+        while (1) {
+            bdrv_query_image_info(bs0, p_image_info, &local_err);
+            if (error_is_set(&local_err)) {
+                error_propagate(errp, local_err);
+                goto err;
+            }
+            if (bs0->drv && bs0->backing_hd) {
+                bs0 = bs0->backing_hd;
+                (*p_image_info)->has_backing_image = true;
+                p_image_info = &((*p_image_info)->backing_image);
+            } else {
+                break;
+            }
+        }
+    }
+
+    *p_info = info;
+    return;
+
+ err:
+    qapi_free_BlockInfo(info);
+}
+
+BlockStats *bdrv_query_stats(const BlockDriverState *bs)
+{
+    BlockStats *s;
+
+    s = g_malloc0(sizeof(*s));
+
+    if (bs->device_name[0]) {
+        s->has_device = true;
+        s->device = g_strdup(bs->device_name);
+    }
+
+    s->stats = g_malloc0(sizeof(*s->stats));
+    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
+    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
+    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
+    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
+    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
+    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
+    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
+    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
+    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
+
+    if (bs->file) {
+        s->has_parent = true;
+        s->parent = bdrv_query_stats(bs->file);
+    }
+
+    return s;
+}
+
+BlockInfoList *qmp_query_block(Error **errp)
+{
+    BlockInfoList *head = NULL, **p_next = &head;
+    BlockDriverState *bs = NULL;
+    Error *local_err = NULL;
+
+     while ((bs = bdrv_next(bs))) {
+        BlockInfoList *info = g_malloc0(sizeof(*info));
+        bdrv_query_info(bs, &info->value, &local_err);
+        if (error_is_set(&local_err)) {
+            error_propagate(errp, local_err);
+            goto err;
+        }
+
+        *p_next = info;
+        p_next = &info->next;
+    }
+
+    return head;
+
+ err:
+    qapi_free_BlockInfoList(head);
+    return NULL;
+}
+
+BlockStatsList *qmp_query_blockstats(Error **errp)
+{
+    BlockStatsList *head = NULL, **p_next = &head;
+    BlockDriverState *bs = NULL;
+
+     while ((bs = bdrv_next(bs))) {
+        BlockStatsList *info = g_malloc0(sizeof(*info));
+        info->value = bdrv_query_stats(bs);
+
+        *p_next = info;
+        p_next = &info->next;
+    }
+
+    return head;
+}
+
+#define NB_SUFFIXES 4
+
+static char *get_human_readable_size(char *buf, int buf_size, int64_t size)
+{
+    static const char suffixes[NB_SUFFIXES] = "KMGT";
+    int64_t base;
+    int i;
+
+    if (size <= 999) {
+        snprintf(buf, buf_size, "%" PRId64, size);
+    } else {
+        base = 1024;
+        for (i = 0; i < NB_SUFFIXES; i++) {
+            if (size < (10 * base)) {
+                snprintf(buf, buf_size, "%0.1f%c",
+                         (double)size / base,
+                         suffixes[i]);
+                break;
+            } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
+                snprintf(buf, buf_size, "%" PRId64 "%c",
+                         ((size + (base >> 1)) / base),
+                         suffixes[i]);
+                break;
+            }
+            base = base * 1024;
+        }
+    }
+    return buf;
+}
+
+void bdrv_snapshot_dump(fprintf_function func_fprintf, void *f,
+                        QEMUSnapshotInfo *sn)
+{
+    char buf1[128], date_buf[128], clock_buf[128];
+    struct tm tm;
+    time_t ti;
+    int64_t secs;
+
+    if (!sn) {
+        func_fprintf(f,
+                     "%-10s%-20s%7s%20s%15s",
+                     "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
+    } else {
+        ti = sn->date_sec;
+        localtime_r(&ti, &tm);
+        strftime(date_buf, sizeof(date_buf),
+                 "%Y-%m-%d %H:%M:%S", &tm);
+        secs = sn->vm_clock_nsec / 1000000000;
+        snprintf(clock_buf, sizeof(clock_buf),
+                 "%02d:%02d:%02d.%03d",
+                 (int)(secs / 3600),
+                 (int)((secs / 60) % 60),
+                 (int)(secs % 60),
+                 (int)((sn->vm_clock_nsec / 1000000) % 1000));
+        func_fprintf(f,
+                     "%-10s%-20s%7s%20s%15s",
+                     sn->id_str, sn->name,
+                     get_human_readable_size(buf1, sizeof(buf1),
+                                             sn->vm_state_size),
+                     date_buf,
+                     clock_buf);
+    }
+}
+
+void bdrv_image_info_dump(fprintf_function func_fprintf, void *f,
+                          ImageInfo *info)
+{
+    char size_buf[128], dsize_buf[128];
+    if (!info->has_actual_size) {
+        snprintf(dsize_buf, sizeof(dsize_buf), "unavailable");
+    } else {
+        get_human_readable_size(dsize_buf, sizeof(dsize_buf),
+                                info->actual_size);
+    }
+    get_human_readable_size(size_buf, sizeof(size_buf), info->virtual_size);
+    func_fprintf(f,
+                 "image: %s\n"
+                 "file format: %s\n"
+                 "virtual size: %s (%" PRId64 " bytes)\n"
+                 "disk size: %s\n",
+                 info->filename, info->format, size_buf,
+                 info->virtual_size,
+                 dsize_buf);
+
+    if (info->has_encrypted && info->encrypted) {
+        func_fprintf(f, "encrypted: yes\n");
+    }
+
+    if (info->has_cluster_size) {
+        func_fprintf(f, "cluster_size: %" PRId64 "\n",
+                       info->cluster_size);
+    }
+
+    if (info->has_dirty_flag && info->dirty_flag) {
+        func_fprintf(f, "cleanly shut down: no\n");
+    }
+
+    if (info->has_backing_filename) {
+        func_fprintf(f, "backing file: %s", info->backing_filename);
+        if (info->has_full_backing_filename) {
+            func_fprintf(f, " (actual path: %s)", info->full_backing_filename);
+        }
+        func_fprintf(f, "\n");
+        if (info->has_backing_filename_format) {
+            func_fprintf(f, "backing file format: %s\n",
+                         info->backing_filename_format);
+        }
+    }
+
+    if (info->has_snapshots) {
+        SnapshotInfoList *elem;
+
+        func_fprintf(f, "Snapshot list:\n");
+        bdrv_snapshot_dump(func_fprintf, f, NULL);
+        func_fprintf(f, "\n");
+
+        /* Ideally bdrv_snapshot_dump() would operate on SnapshotInfoList but
+         * we convert to the block layer's native QEMUSnapshotInfo for now.
+         */
+        for (elem = info->snapshots; elem; elem = elem->next) {
+            QEMUSnapshotInfo sn = {
+                .vm_state_size = elem->value->vm_state_size,
+                .date_sec = elem->value->date_sec,
+                .date_nsec = elem->value->date_nsec,
+                .vm_clock_nsec = elem->value->vm_clock_sec * 1000000000ULL +
+                                 elem->value->vm_clock_nsec,
+            };
+
+            pstrcpy(sn.id_str, sizeof(sn.id_str), elem->value->id);
+            pstrcpy(sn.name, sizeof(sn.name), elem->value->name);
+            bdrv_snapshot_dump(func_fprintf, f, &sn);
+            func_fprintf(f, "\n");
+        }
+    }
+}
diff --git a/block/qcow.c b/block/qcow.c
index b239c82ae..5239bd68f 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -22,11 +22,11 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 #include <zlib.h>
-#include "aes.h"
-#include "migration.h"
+#include "qemu/aes.h"
+#include "migration/migration.h"
 
 /**************************************************************/
 /* QEMU COW block driver with compression and encryption support */
@@ -92,7 +92,7 @@ static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
         return 0;
 }
 
-static int qcow_open(BlockDriverState *bs, int flags)
+static int qcow_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVQcowState *s = bs->opaque;
     int len, i, shift, ret;
@@ -112,7 +112,7 @@ static int qcow_open(BlockDriverState *bs, int flags)
     be64_to_cpus(&header.l1_table_offset);
 
     if (header.magic != QCOW_MAGIC) {
-        ret = -EINVAL;
+        ret = -EMEDIUMTYPE;
         goto fail;
     }
     if (header.version != QCOW_VERSION) {
@@ -679,7 +679,7 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options)
         return ret;
     }
 
-    ret = bdrv_file_open(&qcow_bs, filename, BDRV_O_RDWR);
+    ret = bdrv_file_open(&qcow_bs, filename, NULL, BDRV_O_RDWR);
     if (ret < 0) {
         return ret;
     }
@@ -787,8 +787,21 @@ static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
     uint8_t *out_buf;
     uint64_t cluster_offset;
 
-    if (nb_sectors != s->cluster_sectors)
-        return -EINVAL;
+    if (nb_sectors != s->cluster_sectors) {
+        ret = -EINVAL;
+
+        /* Zero-pad last write if image size is not cluster aligned */
+        if (sector_num + nb_sectors == bs->total_sectors &&
+            nb_sectors < s->cluster_sectors) {
+            uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size);
+            memset(pad_buf, 0, s->cluster_size);
+            memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE);
+            ret = qcow_write_compressed(bs, sector_num,
+                                        pad_buf, s->cluster_sectors);
+            qemu_vfree(pad_buf);
+        }
+        return ret;
+    }
 
     out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
 
@@ -879,6 +892,7 @@ static BlockDriver bdrv_qcow = {
     .bdrv_close		= qcow_close,
     .bdrv_reopen_prepare = qcow_reopen_prepare,
     .bdrv_create	= qcow_create,
+    .bdrv_has_zero_init     = bdrv_has_zero_init_1,
 
     .bdrv_co_readv          = qcow_co_readv,
     .bdrv_co_writev         = qcow_co_writev,
diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index 2d4322a8d..2f3114ecc 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -22,7 +22,7 @@
  * THE SOFTWARE.
  */
 
-#include "block_int.h"
+#include "block/block_int.h"
 #include "qemu-common.h"
 #include "qcow2.h"
 #include "trace.h"
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index e179211c5..cca76d4fc 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -25,16 +25,17 @@
 #include <zlib.h>
 
 #include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
 #include "block/qcow2.h"
 #include "trace.h"
 
-int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size)
+int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
+                        bool exact_size)
 {
     BDRVQcowState *s = bs->opaque;
-    int new_l1_size, new_l1_size2, ret, i;
+    int new_l1_size2, ret, i;
     uint64_t *new_l1_table;
-    int64_t new_l1_table_offset;
+    int64_t new_l1_table_offset, new_l1_size;
     uint8_t data[12];
 
     if (min_size <= s->l1_size)
@@ -53,8 +54,13 @@ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size)
         }
     }
 
+    if (new_l1_size > INT_MAX) {
+        return -EFBIG;
+    }
+
 #ifdef DEBUG_ALLOC2
-    fprintf(stderr, "grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
+    fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
+            s->l1_size, new_l1_size);
 #endif
 
     new_l1_size2 = sizeof(uint64_t) * new_l1_size;
@@ -92,14 +98,16 @@ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size)
         goto fail;
     }
     g_free(s->l1_table);
-    qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
+    qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t),
+                        QCOW2_DISCARD_OTHER);
     s->l1_table_offset = new_l1_table_offset;
     s->l1_table = new_l1_table;
     s->l1_size = new_l1_size;
     return 0;
  fail:
     g_free(new_l1_table);
-    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2);
+    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
+                        QCOW2_DISCARD_OTHER);
     return ret;
 }
 
@@ -391,8 +399,8 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
     int *num, uint64_t *cluster_offset)
 {
     BDRVQcowState *s = bs->opaque;
-    unsigned int l1_index, l2_index;
-    uint64_t l2_offset, *l2_table;
+    unsigned int l2_index;
+    uint64_t l1_index, l2_offset, *l2_table;
     int l1_bits, c;
     unsigned int index_in_cluster, nb_clusters;
     uint64_t nb_available, nb_needed;
@@ -454,6 +462,9 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
         *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
         break;
     case QCOW2_CLUSTER_ZERO:
+        if (s->qcow_version < 3) {
+            return -EIO;
+        }
         c = count_contiguous_clusters(nb_clusters, s->cluster_size,
                 &l2_table[l2_index], 0,
                 QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO);
@@ -504,8 +515,8 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
                              int *new_l2_index)
 {
     BDRVQcowState *s = bs->opaque;
-    unsigned int l1_index, l2_index;
-    uint64_t l2_offset;
+    unsigned int l2_index;
+    uint64_t l1_index, l2_offset;
     uint64_t *l2_table = NULL;
     int ret;
 
@@ -519,6 +530,7 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
         }
     }
 
+    assert(l1_index < s->l1_size);
     l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
 
     /* seek the l2 table of the given l2 offset */
@@ -538,7 +550,8 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
 
         /* Then decrease the refcount of the old table */
         if (l2_offset) {
-            qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
+            qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
+                                QCOW2_DISCARD_OTHER);
         }
     }
 
@@ -615,57 +628,67 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
     return cluster_offset;
 }
 
-int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
+static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
 {
     BDRVQcowState *s = bs->opaque;
-    int i, j = 0, l2_index, ret;
-    uint64_t *old_cluster, start_sect, *l2_table;
-    uint64_t cluster_offset = m->alloc_offset;
-    bool cow = false;
-
-    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
+    int ret;
 
-    if (m->nb_clusters == 0)
+    if (r->nb_sectors == 0) {
         return 0;
+    }
 
-    old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t));
+    qemu_co_mutex_unlock(&s->lock);
+    ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset,
+                       r->offset / BDRV_SECTOR_SIZE,
+                       r->offset / BDRV_SECTOR_SIZE + r->nb_sectors);
+    qemu_co_mutex_lock(&s->lock);
 
-    /* copy content of unmodified sectors */
-    start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
-    if (m->n_start) {
-        cow = true;
-        qemu_co_mutex_unlock(&s->lock);
-        ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
-        qemu_co_mutex_lock(&s->lock);
-        if (ret < 0)
-            goto err;
-    }
-
-    if (m->nb_available & (s->cluster_sectors - 1)) {
-        cow = true;
-        qemu_co_mutex_unlock(&s->lock);
-        ret = copy_sectors(bs, start_sect, cluster_offset, m->nb_available,
-                           align_offset(m->nb_available, s->cluster_sectors));
-        qemu_co_mutex_lock(&s->lock);
-        if (ret < 0)
-            goto err;
+    if (ret < 0) {
+        return ret;
     }
 
     /*
-     * Update L2 table.
-     *
      * Before we update the L2 table to actually point to the new cluster, we
      * need to be sure that the refcounts have been increased and COW was
      * handled.
      */
-    if (cow) {
-        qcow2_cache_depends_on_flush(s->l2_table_cache);
+    qcow2_cache_depends_on_flush(s->l2_table_cache);
+
+    return 0;
+}
+
+int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
+{
+    BDRVQcowState *s = bs->opaque;
+    int i, j = 0, l2_index, ret;
+    uint64_t *old_cluster, *l2_table;
+    uint64_t cluster_offset = m->alloc_offset;
+
+    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
+    assert(m->nb_clusters > 0);
+
+    old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t));
+
+    /* copy content of unmodified sectors */
+    ret = perform_cow(bs, m, &m->cow_start);
+    if (ret < 0) {
+        goto err;
     }
 
+    ret = perform_cow(bs, m, &m->cow_end);
+    if (ret < 0) {
+        goto err;
+    }
+
+    /* Update L2 table. */
+    if (s->use_lazy_refcounts) {
+        qcow2_mark_dirty(bs);
+    }
     if (qcow2_need_accurate_refcounts(s)) {
         qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                    s->refcount_block_cache);
     }
+
     ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
     if (ret < 0) {
         goto err;
@@ -695,10 +718,14 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
     /*
      * If this was a COW, we need to decrease the refcount of the old cluster.
      * Also flush bs->file to get the right order for L2 and refcount update.
+     *
+     * Don't discard clusters that reach a refcount of 0 (e.g. compressed
+     * clusters), the next write will reuse them anyway.
      */
     if (j != 0) {
         for (i = 0; i < j; i++) {
-            qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1);
+            qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1,
+                                    QCOW2_DISCARD_NEVER);
         }
     }
 
@@ -743,56 +770,53 @@ out:
 }
 
 /*
- * Allocates new clusters for the given guest_offset.
- *
- * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
- * contain the number of clusters that have been allocated and are contiguous
- * in the image file.
+ * Check if there already is an AIO write request in flight which allocates
+ * the same cluster. In this case we need to wait until the previous
+ * request has completed and updated the L2 table accordingly.
  *
- * If *host_offset is non-zero, it specifies the offset in the image file at
- * which the new clusters must start. *nb_clusters can be 0 on return in this
- * case if the cluster at host_offset is already in use. If *host_offset is
- * zero, the clusters can be allocated anywhere in the image file.
+ * Returns:
+ *   0       if there was no dependency. *cur_bytes indicates the number of
+ *           bytes from guest_offset that can be read before the next
+ *           dependency must be processed (or the request is complete)
  *
- * *host_offset is updated to contain the offset into the image file at which
- * the first allocated cluster starts.
- *
- * Return 0 on success and -errno in error cases. -EAGAIN means that the
- * function has been waiting for another request and the allocation must be
- * restarted, but the whole request should not be failed.
+ *   -EAGAIN if we had to wait for another request, previously gathered
+ *           information on cluster allocation may be invalid now. The caller
+ *           must start over anyway, so consider *cur_bytes undefined.
  */
-static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
-    uint64_t *host_offset, unsigned int *nb_clusters)
+static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
+    uint64_t *cur_bytes, QCowL2Meta **m)
 {
     BDRVQcowState *s = bs->opaque;
     QCowL2Meta *old_alloc;
+    uint64_t bytes = *cur_bytes;
 
-    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
-                                         *host_offset, *nb_clusters);
-
-    /*
-     * Check if there already is an AIO write request in flight which allocates
-     * the same cluster. In this case we need to wait until the previous
-     * request has completed and updated the L2 table accordingly.
-     */
     QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
 
-        uint64_t start = guest_offset >> s->cluster_bits;
-        uint64_t end = start + *nb_clusters;
-        uint64_t old_start = old_alloc->offset >> s->cluster_bits;
-        uint64_t old_end = old_start + old_alloc->nb_clusters;
+        uint64_t start = guest_offset;
+        uint64_t end = start + bytes;
+        uint64_t old_start = l2meta_cow_start(old_alloc);
+        uint64_t old_end = l2meta_cow_end(old_alloc);
 
-        if (end < old_start || start > old_end) {
+        if (end <= old_start || start >= old_end) {
             /* No intersection */
         } else {
             if (start < old_start) {
                 /* Stop at the start of a running allocation */
-                *nb_clusters = old_start - start;
+                bytes = old_start - start;
             } else {
-                *nb_clusters = 0;
+                bytes = 0;
             }
 
-            if (*nb_clusters == 0) {
+            /* Stop if already an l2meta exists. After yielding, it wouldn't
+             * be valid any more, so we'd have to clean up the old L2Metas
+             * and deal with requests depending on them before starting to
+             * gather new ones. Not worth the trouble. */
+            if (bytes == 0 && *m) {
+                *cur_bytes = 0;
+                return 0;
+            }
+
+            if (bytes == 0) {
                 /* Wait for the dependency to complete. We need to recheck
                  * the free/allocated clusters when we continue. */
                 qemu_co_mutex_unlock(&s->lock);
@@ -803,10 +827,144 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
         }
     }
 
-    if (!*nb_clusters) {
-        abort();
+    /* Make sure that existing clusters and new allocations are only used up to
+     * the next dependency if we shortened the request above */
+    *cur_bytes = bytes;
+
+    return 0;
+}
+
+/*
+ * Checks how many already allocated clusters that don't require a copy on
+ * write there are at the given guest_offset (up to *bytes). If
+ * *host_offset is not zero, only physically contiguous clusters beginning at
+ * this host offset are counted.
+ *
+ * Note that guest_offset may not be cluster aligned. In this case, the
+ * returned *host_offset points to exact byte referenced by guest_offset and
+ * therefore isn't cluster aligned as well.
+ *
+ * Returns:
+ *   0:     if no allocated clusters are available at the given offset.
+ *          *bytes is normally unchanged. It is set to 0 if the cluster
+ *          is allocated and doesn't need COW, but doesn't have the right
+ *          physical offset.
+ *
+ *   1:     if allocated clusters that don't require a COW are available at
+ *          the requested offset. *bytes may have decreased and describes
+ *          the length of the area that can be written to.
+ *
+ *  -errno: in error cases
+ */
+static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
+    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
+{
+    BDRVQcowState *s = bs->opaque;
+    int l2_index;
+    uint64_t cluster_offset;
+    uint64_t *l2_table;
+    unsigned int nb_clusters;
+    unsigned int keep_clusters;
+    int ret, pret;
+
+    trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
+                              *bytes);
+
+    assert(*host_offset == 0 ||    offset_into_cluster(s, guest_offset)
+                                == offset_into_cluster(s, *host_offset));
+
+    /*
+     * Calculate the number of clusters to look for. We stop at L2 table
+     * boundaries to keep things simple.
+     */
+    nb_clusters =
+        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
+
+    l2_index = offset_to_l2_index(s, guest_offset);
+    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+
+    /* Find L2 entry for the first involved cluster */
+    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
+    if (ret < 0) {
+        return ret;
+    }
+
+    cluster_offset = be64_to_cpu(l2_table[l2_index]);
+
+    /* Check how many clusters are already allocated and don't need COW */
+    if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
+        && (cluster_offset & QCOW_OFLAG_COPIED))
+    {
+        /* If a specific host_offset is required, check it */
+        bool offset_matches =
+            (cluster_offset & L2E_OFFSET_MASK) == *host_offset;
+
+        if (*host_offset != 0 && !offset_matches) {
+            *bytes = 0;
+            ret = 0;
+            goto out;
+        }
+
+        /* We keep all QCOW_OFLAG_COPIED clusters */
+        keep_clusters =
+            count_contiguous_clusters(nb_clusters, s->cluster_size,
+                                      &l2_table[l2_index], 0,
+                                      QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
+        assert(keep_clusters <= nb_clusters);
+
+        *bytes = MIN(*bytes,
+                 keep_clusters * s->cluster_size
+                 - offset_into_cluster(s, guest_offset));
+
+        ret = 1;
+    } else {
+        ret = 0;
+    }
+
+    /* Cleanup */
+out:
+    pret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+    if (pret < 0) {
+        return pret;
     }
 
+    /* Only return a host offset if we actually made progress. Otherwise we
+     * would make requirements for handle_alloc() that it can't fulfill */
+    if (ret) {
+        *host_offset = (cluster_offset & L2E_OFFSET_MASK)
+                     + offset_into_cluster(s, guest_offset);
+    }
+
+    return ret;
+}
+
+/*
+ * Allocates new clusters for the given guest_offset.
+ *
+ * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
+ * contain the number of clusters that have been allocated and are contiguous
+ * in the image file.
+ *
+ * If *host_offset is non-zero, it specifies the offset in the image file at
+ * which the new clusters must start. *nb_clusters can be 0 on return in this
+ * case if the cluster at host_offset is already in use. If *host_offset is
+ * zero, the clusters can be allocated anywhere in the image file.
+ *
+ * *host_offset is updated to contain the offset into the image file at which
+ * the first allocated cluster starts.
+ *
+ * Return 0 on success and -errno in error cases. -EAGAIN means that the
+ * function has been waiting for another request and the allocation must be
+ * restarted, but the whole request should not be failed.
+ */
+static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
+    uint64_t *host_offset, unsigned int *nb_clusters)
+{
+    BDRVQcowState *s = bs->opaque;
+
+    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
+                                         *host_offset, *nb_clusters);
+
     /* Allocate new clusters */
     trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
     if (*host_offset == 0) {
@@ -828,6 +986,151 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
 }
 
 /*
+ * Allocates new clusters for an area that either is yet unallocated or needs a
+ * copy on write. If *host_offset is non-zero, clusters are only allocated if
+ * the new allocation can match the specified host offset.
+ *
+ * Note that guest_offset may not be cluster aligned. In this case, the
+ * returned *host_offset points to exact byte referenced by guest_offset and
+ * therefore isn't cluster aligned as well.
+ *
+ * Returns:
+ *   0:     if no clusters could be allocated. *bytes is set to 0,
+ *          *host_offset is left unchanged.
+ *
+ *   1:     if new clusters were allocated. *bytes may be decreased if the
+ *          new allocation doesn't cover all of the requested area.
+ *          *host_offset is updated to contain the host offset of the first
+ *          newly allocated cluster.
+ *
+ *  -errno: in error cases
+ */
+static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
+    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
+{
+    BDRVQcowState *s = bs->opaque;
+    int l2_index;
+    uint64_t *l2_table;
+    uint64_t entry;
+    unsigned int nb_clusters;
+    int ret;
+
+    uint64_t alloc_cluster_offset;
+
+    trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
+                             *bytes);
+    assert(*bytes > 0);
+
+    /*
+     * Calculate the number of clusters to look for. We stop at L2 table
+     * boundaries to keep things simple.
+     */
+    nb_clusters =
+        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
+
+    l2_index = offset_to_l2_index(s, guest_offset);
+    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+
+    /* Find L2 entry for the first involved cluster */
+    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
+    if (ret < 0) {
+        return ret;
+    }
+
+    entry = be64_to_cpu(l2_table[l2_index]);
+
+    /* For the moment, overwrite compressed clusters one by one */
+    if (entry & QCOW_OFLAG_COMPRESSED) {
+        nb_clusters = 1;
+    } else {
+        nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
+    }
+
+    /* This function is only called when there were no non-COW clusters, so if
+     * we can't find any unallocated or COW clusters either, something is
+     * wrong with our code. */
+    assert(nb_clusters > 0);
+
+    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* Allocate, if necessary at a given offset in the image file */
+    alloc_cluster_offset = start_of_cluster(s, *host_offset);
+    ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
+                                  &nb_clusters);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /* Can't extend contiguous allocation */
+    if (nb_clusters == 0) {
+        *bytes = 0;
+        return 0;
+    }
+
+    /*
+     * Save info needed for meta data update.
+     *
+     * requested_sectors: Number of sectors from the start of the first
+     * newly allocated cluster to the end of the (possibly shortened
+     * before) write request.
+     *
+     * avail_sectors: Number of sectors from the start of the first
+     * newly allocated to the end of the last newly allocated cluster.
+     *
+     * nb_sectors: The number of sectors from the start of the first
+     * newly allocated cluster to the end of the area that the write
+     * request actually writes to (excluding COW at the end)
+     */
+    int requested_sectors =
+        (*bytes + offset_into_cluster(s, guest_offset))
+        >> BDRV_SECTOR_BITS;
+    int avail_sectors = nb_clusters
+                        << (s->cluster_bits - BDRV_SECTOR_BITS);
+    int alloc_n_start = offset_into_cluster(s, guest_offset)
+                        >> BDRV_SECTOR_BITS;
+    int nb_sectors = MIN(requested_sectors, avail_sectors);
+    QCowL2Meta *old_m = *m;
+
+    *m = g_malloc0(sizeof(**m));
+
+    **m = (QCowL2Meta) {
+        .next           = old_m,
+
+        .alloc_offset   = alloc_cluster_offset,
+        .offset         = start_of_cluster(s, guest_offset),
+        .nb_clusters    = nb_clusters,
+        .nb_available   = nb_sectors,
+
+        .cow_start = {
+            .offset     = 0,
+            .nb_sectors = alloc_n_start,
+        },
+        .cow_end = {
+            .offset     = nb_sectors * BDRV_SECTOR_SIZE,
+            .nb_sectors = avail_sectors - nb_sectors,
+        },
+    };
+    qemu_co_queue_init(&(*m)->dependent_requests);
+    QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
+
+    *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
+    *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
+                         - offset_into_cluster(s, guest_offset));
+    assert(*bytes != 0);
+
+    return 1;
+
+fail:
+    if (*m && (*m)->nb_clusters > 0) {
+        QLIST_REMOVE(*m, next_in_flight);
+    }
+    return ret;
+}
+
+/*
  * alloc_cluster_offset
  *
  * For a given offset on the virtual disk, find the cluster offset in qcow2
@@ -847,151 +1150,113 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
  * Return 0 on success and -errno in error cases
  */
 int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
-    int n_start, int n_end, int *num, QCowL2Meta *m)
+    int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m)
 {
     BDRVQcowState *s = bs->opaque;
-    int l2_index, ret, sectors;
-    uint64_t *l2_table;
-    unsigned int nb_clusters, keep_clusters;
+    uint64_t start, remaining;
     uint64_t cluster_offset;
+    uint64_t cur_bytes;
+    int ret;
 
     trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset,
                                       n_start, n_end);
 
-    /* Find L2 entry for the first involved cluster */
-again:
-    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
-    if (ret < 0) {
-        return ret;
-    }
+    assert(n_start * BDRV_SECTOR_SIZE == offset_into_cluster(s, offset));
+    offset = start_of_cluster(s, offset);
 
-    /*
-     * Calculate the number of clusters to look for. We stop at L2 table
-     * boundaries to keep things simple.
-     */
-    nb_clusters = MIN(size_to_clusters(s, n_end << BDRV_SECTOR_BITS),
-                      s->l2_size - l2_index);
+again:
+    start = offset + (n_start << BDRV_SECTOR_BITS);
+    remaining = (n_end - n_start) << BDRV_SECTOR_BITS;
+    cluster_offset = 0;
+    *host_offset = 0;
+    cur_bytes = 0;
+    *m = NULL;
 
-    cluster_offset = be64_to_cpu(l2_table[l2_index]);
+    while (true) {
 
-    /*
-     * Check how many clusters are already allocated and don't need COW, and how
-     * many need a new allocation.
-     */
-    if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
-        && (cluster_offset & QCOW_OFLAG_COPIED))
-    {
-        /* We keep all QCOW_OFLAG_COPIED clusters */
-        keep_clusters =
-            count_contiguous_clusters(nb_clusters, s->cluster_size,
-                                      &l2_table[l2_index], 0,
-                                      QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
-        assert(keep_clusters <= nb_clusters);
-        nb_clusters -= keep_clusters;
-    } else {
-        keep_clusters = 0;
-        cluster_offset = 0;
-    }
-
-    if (nb_clusters > 0) {
-        /* For the moment, overwrite compressed clusters one by one */
-        uint64_t entry = be64_to_cpu(l2_table[l2_index + keep_clusters]);
-        if (entry & QCOW_OFLAG_COMPRESSED) {
-            nb_clusters = 1;
-        } else {
-            nb_clusters = count_cow_clusters(s, nb_clusters, l2_table,
-                                             l2_index + keep_clusters);
+        if (!*host_offset) {
+            *host_offset = start_of_cluster(s, cluster_offset);
         }
-    }
 
-    cluster_offset &= L2E_OFFSET_MASK;
+        assert(remaining >= cur_bytes);
 
-    /*
-     * The L2 table isn't used any more after this. As long as the cache works
-     * synchronously, it's important to release it before calling
-     * do_alloc_cluster_offset, which may yield if we need to wait for another
-     * request to complete. If we still had the reference, we could use up the
-     * whole cache with sleeping requests.
-     */
-    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
-    if (ret < 0) {
-        return ret;
-    }
-
-    /* If there is something left to allocate, do that now */
-    *m = (QCowL2Meta) {
-        .cluster_offset     = cluster_offset,
-        .nb_clusters        = 0,
-    };
-    qemu_co_queue_init(&m->dependent_requests);
+        start           += cur_bytes;
+        remaining       -= cur_bytes;
+        cluster_offset  += cur_bytes;
 
-    if (nb_clusters > 0) {
-        uint64_t alloc_offset;
-        uint64_t alloc_cluster_offset;
-        uint64_t keep_bytes = keep_clusters * s->cluster_size;
-
-        /* Calculate start and size of allocation */
-        alloc_offset = offset + keep_bytes;
-
-        if (keep_clusters == 0) {
-            alloc_cluster_offset = 0;
-        } else {
-            alloc_cluster_offset = cluster_offset + keep_bytes;
+        if (remaining == 0) {
+            break;
         }
 
-        /* Allocate, if necessary at a given offset in the image file */
-        ret = do_alloc_cluster_offset(bs, alloc_offset, &alloc_cluster_offset,
-                                      &nb_clusters);
+        cur_bytes = remaining;
+
+        /*
+         * Now start gathering as many contiguous clusters as possible:
+         *
+         * 1. Check for overlaps with in-flight allocations
+         *
+         *      a) Overlap not in the first cluster -> shorten this request and
+         *         let the caller handle the rest in its next loop iteration.
+         *
+         *      b) Real overlaps of two requests. Yield and restart the search
+         *         for contiguous clusters (the situation could have changed
+         *         while we were sleeping)
+         *
+         *      c) TODO: Request starts in the same cluster as the in-flight
+         *         allocation ends. Shorten the COW of the in-fight allocation,
+         *         set cluster_offset to write to the same cluster and set up
+         *         the right synchronisation between the in-flight request and
+         *         the new one.
+         */
+        ret = handle_dependencies(bs, start, &cur_bytes, m);
         if (ret == -EAGAIN) {
+            /* Currently handle_dependencies() doesn't yield if we already had
+             * an allocation. If it did, we would have to clean up the L2Meta
+             * structs before starting over. */
+            assert(*m == NULL);
             goto again;
         } else if (ret < 0) {
-            goto fail;
+            return ret;
+        } else if (cur_bytes == 0) {
+            break;
+        } else {
+            /* handle_dependencies() may have decreased cur_bytes (shortened
+             * the allocations below) so that the next dependency is processed
+             * correctly during the next loop iteration. */
         }
 
-        /* save info needed for meta data update */
-        if (nb_clusters > 0) {
-            /*
-             * requested_sectors: Number of sectors from the start of the first
-             * newly allocated cluster to the end of the (possibly shortened
-             * before) write request.
-             *
-             * avail_sectors: Number of sectors from the start of the first
-             * newly allocated to the end of the last newly allocated cluster.
-             */
-            int requested_sectors = n_end - keep_clusters * s->cluster_sectors;
-            int avail_sectors = nb_clusters
-                                << (s->cluster_bits - BDRV_SECTOR_BITS);
-
-            *m = (QCowL2Meta) {
-                .cluster_offset = keep_clusters == 0 ?
-                                  alloc_cluster_offset : cluster_offset,
-                .alloc_offset   = alloc_cluster_offset,
-                .offset         = alloc_offset,
-                .n_start        = keep_clusters == 0 ? n_start : 0,
-                .nb_clusters    = nb_clusters,
-                .nb_available   = MIN(requested_sectors, avail_sectors),
-            };
-            qemu_co_queue_init(&m->dependent_requests);
-            QLIST_INSERT_HEAD(&s->cluster_allocs, m, next_in_flight);
+        /*
+         * 2. Count contiguous COPIED clusters.
+         */
+        ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
+        if (ret < 0) {
+            return ret;
+        } else if (ret) {
+            continue;
+        } else if (cur_bytes == 0) {
+            break;
         }
-    }
 
-    /* Some cleanup work */
-    sectors = (keep_clusters + nb_clusters) << (s->cluster_bits - 9);
-    if (sectors > n_end) {
-        sectors = n_end;
+        /*
+         * 3. If the request still hasn't completed, allocate new clusters,
+         *    considering any cluster_offset of steps 1c or 2.
+         */
+        ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
+        if (ret < 0) {
+            return ret;
+        } else if (ret) {
+            continue;
+        } else {
+            assert(cur_bytes == 0);
+            break;
+        }
     }
 
-    assert(sectors > n_start);
-    *num = sectors - n_start;
+    *num = (n_end - n_start) - (remaining >> BDRV_SECTOR_BITS);
+    assert(*num > 0);
+    assert(*host_offset != 0);
 
     return 0;
-
-fail:
-    if (m->nb_clusters > 0) {
-        QLIST_REMOVE(m, next_in_flight);
-    }
-    return ret;
 }
 
 static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
@@ -1081,7 +1346,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
         l2_table[l2_index + i] = cpu_to_be64(0);
 
         /* Then decrease the refcount */
-        qcow2_free_any_clusters(bs, old_offset, 1);
+        qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
     }
 
     ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
@@ -1112,18 +1377,25 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
 
     nb_clusters = size_to_clusters(s, end_offset - offset);
 
+    s->cache_discards = true;
+
     /* Each L2 table is handled by its own loop iteration */
     while (nb_clusters > 0) {
         ret = discard_single_l2(bs, offset, nb_clusters);
         if (ret < 0) {
-            return ret;
+            goto fail;
         }
 
         nb_clusters -= ret;
         offset += (ret * s->cluster_size);
     }
 
-    return 0;
+    ret = 0;
+fail:
+    s->cache_discards = false;
+    qcow2_process_discards(bs, ret);
+
+    return ret;
 }
 
 /*
@@ -1157,7 +1429,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
         qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
         if (old_offset & QCOW_OFLAG_COMPRESSED) {
             l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
-            qcow2_free_any_clusters(bs, old_offset, 1);
+            qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
         } else {
             l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
         }
@@ -1185,15 +1457,22 @@ int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
     /* Each L2 table is handled by its own loop iteration */
     nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS);
 
+    s->cache_discards = true;
+
     while (nb_clusters > 0) {
         ret = zero_single_l2(bs, offset, nb_clusters);
         if (ret < 0) {
-            return ret;
+            goto fail;
         }
 
         nb_clusters -= ret;
         offset += (ret * s->cluster_size);
     }
 
-    return 0;
+    ret = 0;
+fail:
+    s->cache_discards = false;
+    qcow2_process_discards(bs, ret);
+
+    return ret;
 }
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 96224d1af..1244693f3 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -23,13 +23,13 @@
  */
 
 #include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
 #include "block/qcow2.h"
 
 static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                             int64_t offset, int64_t length,
-                            int addend);
+                            int addend, enum qcow2_discard_type type);
 
 
 /*********************************************************/
@@ -201,7 +201,10 @@ static int alloc_refcount_block(BlockDriverState *bs,
     *refcount_block = NULL;
 
     /* We write to the refcount table, so we might depend on L2 tables */
-    qcow2_cache_flush(bs, s->l2_table_cache);
+    ret = qcow2_cache_flush(bs, s->l2_table_cache);
+    if (ret < 0) {
+        return ret;
+    }
 
     /* Allocate the refcount block itself and mark it as used */
     int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
@@ -232,12 +235,16 @@ static int alloc_refcount_block(BlockDriverState *bs,
     } else {
         /* Described somewhere else. This can recurse at most twice before we
          * arrive at a block that describes itself. */
-        ret = update_refcount(bs, new_block, s->cluster_size, 1);
+        ret = update_refcount(bs, new_block, s->cluster_size, 1,
+                              QCOW2_DISCARD_NEVER);
         if (ret < 0) {
             goto fail_block;
         }
 
-        bdrv_flush(bs->file);
+        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+        if (ret < 0) {
+            goto fail_block;
+        }
 
         /* Initialize the new refcount block only after updating its refcount,
          * update_refcount uses the refcount cache itself */
@@ -393,7 +400,8 @@ static int alloc_refcount_block(BlockDriverState *bs,
 
     /* Free old table. Remember, we must not change free_cluster_index */
     uint64_t old_free_cluster_index = s->free_cluster_index;
-    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
+    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
+                        QCOW2_DISCARD_OTHER);
     s->free_cluster_index = old_free_cluster_index;
 
     ret = load_refcount_block(bs, new_block, (void**) refcount_block);
@@ -412,9 +420,77 @@ fail_block:
     return ret;
 }
 
+void qcow2_process_discards(BlockDriverState *bs, int ret)
+{
+    BDRVQcowState *s = bs->opaque;
+    Qcow2DiscardRegion *d, *next;
+
+    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
+        QTAILQ_REMOVE(&s->discards, d, next);
+
+        /* Discard is optional, ignore the return value */
+        if (ret >= 0) {
+            bdrv_discard(bs->file,
+                         d->offset >> BDRV_SECTOR_BITS,
+                         d->bytes >> BDRV_SECTOR_BITS);
+        }
+
+        g_free(d);
+    }
+}
+
+static void update_refcount_discard(BlockDriverState *bs,
+                                    uint64_t offset, uint64_t length)
+{
+    BDRVQcowState *s = bs->opaque;
+    Qcow2DiscardRegion *d, *p, *next;
+
+    QTAILQ_FOREACH(d, &s->discards, next) {
+        uint64_t new_start = MIN(offset, d->offset);
+        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
+
+        if (new_end - new_start <= length + d->bytes) {
+            /* There can't be any overlap, areas ending up here have no
+             * references any more and therefore shouldn't get freed another
+             * time. */
+            assert(d->bytes + length == new_end - new_start);
+            d->offset = new_start;
+            d->bytes = new_end - new_start;
+            goto found;
+        }
+    }
+
+    d = g_malloc(sizeof(*d));
+    *d = (Qcow2DiscardRegion) {
+        .bs     = bs,
+        .offset = offset,
+        .bytes  = length,
+    };
+    QTAILQ_INSERT_TAIL(&s->discards, d, next);
+
+found:
+    /* Merge discard requests if they are adjacent now */
+    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
+        if (p == d
+            || p->offset > d->offset + d->bytes
+            || d->offset > p->offset + p->bytes)
+        {
+            continue;
+        }
+
+        /* Still no overlap possible */
+        assert(p->offset == d->offset + d->bytes
+            || d->offset == p->offset + p->bytes);
+
+        QTAILQ_REMOVE(&s->discards, p, next);
+        d->offset = MIN(d->offset, p->offset);
+        d->bytes += p->bytes;
+    }
+}
+
 /* XXX: cache several refcount block clusters ? */
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
-    int64_t offset, int64_t length, int addend)
+    int64_t offset, int64_t length, int addend, enum qcow2_discard_type type)
 {
     BDRVQcowState *s = bs->opaque;
     int64_t start, last, cluster_offset;
@@ -480,10 +556,18 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
             s->free_cluster_index = cluster_index;
         }
         refcount_block[block_index] = cpu_to_be16(refcount);
+
+        if (refcount == 0 && s->discard_passthrough[type]) {
+            update_refcount_discard(bs, cluster_offset, s->cluster_size);
+        }
     }
 
     ret = 0;
 fail:
+    if (!s->cache_discards) {
+        qcow2_process_discards(bs, ret);
+    }
+
     /* Write last changed block to disk */
     if (refcount_block) {
         int wret;
@@ -500,7 +584,8 @@ fail:
      */
     if (ret < 0) {
         int dummy;
-        dummy = update_refcount(bs, offset, cluster_offset - offset, -addend);
+        dummy = update_refcount(bs, offset, cluster_offset - offset, -addend,
+                                QCOW2_DISCARD_NEVER);
         (void)dummy;
     }
 
@@ -516,18 +601,18 @@ fail:
  */
 static int update_cluster_refcount(BlockDriverState *bs,
                                    int64_t cluster_index,
-                                   int addend)
+                                   int addend,
+                                   enum qcow2_discard_type type)
 {
     BDRVQcowState *s = bs->opaque;
     int ret;
 
-    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend);
+    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
+                          type);
     if (ret < 0) {
         return ret;
     }
 
-    bdrv_flush(bs->file);
-
     return get_refcount(bs, cluster_index);
 }
 
@@ -575,7 +660,7 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
         return offset;
     }
 
-    ret = update_refcount(bs, offset, size, 1);
+    ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
     if (ret < 0) {
         return ret;
     }
@@ -607,7 +692,8 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
     old_free_cluster_index = s->free_cluster_index;
     s->free_cluster_index = cluster_index + i;
 
-    ret = update_refcount(bs, offset, i << s->cluster_bits, 1);
+    ret = update_refcount(bs, offset, i << s->cluster_bits, 1,
+                          QCOW2_DISCARD_NEVER);
     if (ret < 0) {
         return ret;
     }
@@ -645,7 +731,8 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
         if (free_in_cluster == 0)
             s->free_byte_offset = 0;
         if ((offset & (s->cluster_size - 1)) != 0)
-            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
+            update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
+                                    QCOW2_DISCARD_NEVER);
     } else {
         offset = qcow2_alloc_clusters(bs, s->cluster_size);
         if (offset < 0) {
@@ -655,7 +742,8 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
         if ((cluster_offset + s->cluster_size) == offset) {
             /* we are lucky: contiguous data */
             offset = s->free_byte_offset;
-            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
+            update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
+                                    QCOW2_DISCARD_NEVER);
             s->free_byte_offset += size;
         } else {
             s->free_byte_offset = offset;
@@ -663,17 +751,22 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
         }
     }
 
-    bdrv_flush(bs->file);
+    /* The cluster refcount was incremented, either by qcow2_alloc_clusters()
+     * or explicitly by update_cluster_refcount().  Refcount blocks must be
+     * flushed before the caller's L2 table updates.
+     */
+    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
     return offset;
 }
 
 void qcow2_free_clusters(BlockDriverState *bs,
-                          int64_t offset, int64_t size)
+                          int64_t offset, int64_t size,
+                          enum qcow2_discard_type type)
 {
     int ret;
 
     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
-    ret = update_refcount(bs, offset, size, -1);
+    ret = update_refcount(bs, offset, size, -1, type);
     if (ret < 0) {
         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
         /* TODO Remember the clusters to free them later and avoid leaking */
@@ -684,8 +777,8 @@ void qcow2_free_clusters(BlockDriverState *bs,
  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
  * normal cluster, compressed cluster, etc.)
  */
-void qcow2_free_any_clusters(BlockDriverState *bs,
-    uint64_t l2_entry, int nb_clusters)
+void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
+                             int nb_clusters, enum qcow2_discard_type type)
 {
     BDRVQcowState *s = bs->opaque;
 
@@ -697,12 +790,12 @@ void qcow2_free_any_clusters(BlockDriverState *bs,
                            s->csize_mask) + 1;
             qcow2_free_clusters(bs,
                 (l2_entry & s->cluster_offset_mask) & ~511,
-                nb_csectors * 512);
+                nb_csectors * 512, type);
         }
         break;
     case QCOW2_CLUSTER_NORMAL:
         qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
-                            nb_clusters << s->cluster_bits);
+                            nb_clusters << s->cluster_bits, type);
         break;
     case QCOW2_CLUSTER_UNALLOCATED:
     case QCOW2_CLUSTER_ZERO:
@@ -733,20 +826,17 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     l1_table = NULL;
     l1_size2 = l1_size * sizeof(uint64_t);
 
+    s->cache_discards = true;
+
     /* WARNING: qcow2_snapshot_goto relies on this function not using the
      * l1_table_offset when it is the current s->l1_table_offset! Be careful
      * when changing this! */
     if (l1_table_offset != s->l1_table_offset) {
-        if (l1_size2 != 0) {
-            l1_table = g_malloc0(align_offset(l1_size2, 512));
-        } else {
-            l1_table = NULL;
-        }
+        l1_table = g_malloc0(align_offset(l1_size2, 512));
         l1_allocated = 1;
-        if (bdrv_pread(bs->file, l1_table_offset,
-                       l1_table, l1_size2) != l1_size2)
-        {
-            ret = -EIO;
+
+        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
+        if (ret < 0) {
             goto fail;
         }
 
@@ -782,27 +872,25 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                             int ret;
                             ret = update_refcount(bs,
                                 (offset & s->cluster_offset_mask) & ~511,
-                                nb_csectors * 512, addend);
+                                nb_csectors * 512, addend,
+                                QCOW2_DISCARD_SNAPSHOT);
                             if (ret < 0) {
                                 goto fail;
                             }
-
-                            /* TODO Flushing once for the whole function should
-                             * be enough */
-                            bdrv_flush(bs->file);
                         }
                         /* compressed clusters are never modified */
                         refcount = 2;
                     } else {
                         uint64_t cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
                         if (addend != 0) {
-                            refcount = update_cluster_refcount(bs, cluster_index, addend);
+                            refcount = update_cluster_refcount(bs, cluster_index, addend,
+                                                               QCOW2_DISCARD_SNAPSHOT);
                         } else {
                             refcount = get_refcount(bs, cluster_index);
                         }
 
                         if (refcount < 0) {
-                            ret = -EIO;
+                            ret = refcount;
                             goto fail;
                         }
                     }
@@ -828,12 +916,13 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
 
 
             if (addend != 0) {
-                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
+                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend,
+                                                   QCOW2_DISCARD_SNAPSHOT);
             } else {
                 refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
             }
             if (refcount < 0) {
-                ret = -EIO;
+                ret = refcount;
                 goto fail;
             } else if (refcount == 1) {
                 l2_offset |= QCOW_OFLAG_COPIED;
@@ -845,21 +934,26 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
         }
     }
 
-    ret = 0;
+    ret = bdrv_flush(bs);
 fail:
     if (l2_table) {
         qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
     }
 
+    s->cache_discards = false;
+    qcow2_process_discards(bs, ret);
+
     /* Update L1 only if it isn't deleted anyway (addend = -1) */
-    if (addend >= 0 && l1_modified) {
-        for(i = 0; i < l1_size; i++)
+    if (ret == 0 && addend >= 0 && l1_modified) {
+        for (i = 0; i < l1_size; i++) {
             cpu_to_be64s(&l1_table[i]);
-        if (bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table,
-                        l1_size2) < 0)
-            goto fail;
-        for(i = 0; i < l1_size; i++)
+        }
+
+        ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2);
+
+        for (i = 0; i < l1_size; i++) {
             be64_to_cpus(&l1_table[i]);
+        }
     }
     if (l1_allocated)
         g_free(l1_table);
@@ -918,6 +1012,12 @@ static void inc_refcounts(BlockDriverState *bs,
     }
 }
 
+/* Flags for check_refcounts_l1() and check_refcounts_l2() */
+enum {
+    CHECK_OFLAG_COPIED = 0x1,   /* check QCOW_OFLAG_COPIED matches refcount */
+    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
+};
+
 /*
  * Increases the refcount in the given refcount table for the all clusters
  * referenced in the L2 table. While doing so, performs some checks on L2
@@ -928,10 +1028,11 @@ static void inc_refcounts(BlockDriverState *bs,
  */
 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
     uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
-    int check_copied)
+    int flags)
 {
     BDRVQcowState *s = bs->opaque;
     uint64_t *l2_table, l2_entry;
+    uint64_t next_contiguous_offset = 0;
     int i, l2_size, nb_csectors, refcount;
 
     /* Read L2 table from disk */
@@ -962,6 +1063,18 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
             l2_entry &= s->cluster_offset_mask;
             inc_refcounts(bs, res, refcount_table, refcount_table_size,
                 l2_entry & ~511, nb_csectors * 512);
+
+            if (flags & CHECK_FRAG_INFO) {
+                res->bfi.allocated_clusters++;
+                res->bfi.compressed_clusters++;
+
+                /* Compressed clusters are fragmented by nature.  Since they
+                 * take up sub-sector space but we only have sector granularity
+                 * I/O we need to re-read the same sectors even for adjacent
+                 * compressed clusters.
+                 */
+                res->bfi.fragmented_clusters++;
+            }
             break;
 
         case QCOW2_CLUSTER_ZERO:
@@ -975,7 +1088,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
             /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
 
-            if (check_copied) {
+            if (flags & CHECK_OFLAG_COPIED) {
                 refcount = get_refcount(bs, offset >> s->cluster_bits);
                 if (refcount < 0) {
                     fprintf(stderr, "Can't get refcount for offset %"
@@ -989,6 +1102,15 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
                 }
             }
 
+            if (flags & CHECK_FRAG_INFO) {
+                res->bfi.allocated_clusters++;
+                if (next_contiguous_offset &&
+                    offset != next_contiguous_offset) {
+                    res->bfi.fragmented_clusters++;
+                }
+                next_contiguous_offset = offset + s->cluster_size;
+            }
+
             /* Mark cluster as used */
             inc_refcounts(bs, res, refcount_table,refcount_table_size,
                 offset, s->cluster_size);
@@ -1032,7 +1154,7 @@ static int check_refcounts_l1(BlockDriverState *bs,
                               uint16_t *refcount_table,
                               int refcount_table_size,
                               int64_t l1_table_offset, int l1_size,
-                              int check_copied)
+                              int flags)
 {
     BDRVQcowState *s = bs->opaque;
     uint64_t *l1_table, l2_offset, l1_size2;
@@ -1061,7 +1183,7 @@ static int check_refcounts_l1(BlockDriverState *bs,
         l2_offset = l1_table[i];
         if (l2_offset) {
             /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
-            if (check_copied) {
+            if (flags & CHECK_OFLAG_COPIED) {
                 refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED)
                     >> s->cluster_bits);
                 if (refcount < 0) {
@@ -1090,7 +1212,7 @@ static int check_refcounts_l1(BlockDriverState *bs,
 
             /* Process and check L2 entries */
             ret = check_refcounts_l2(bs, res, refcount_table,
-                refcount_table_size, l2_offset, check_copied);
+                                     refcount_table_size, l2_offset, flags);
             if (ret < 0) {
                 goto fail;
             }
@@ -1116,7 +1238,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                           BdrvCheckMode fix)
 {
     BDRVQcowState *s = bs->opaque;
-    int64_t size, i;
+    int64_t size, i, highest_cluster;
     int nb_clusters, refcount1, refcount2;
     QCowSnapshot *sn;
     uint16_t *refcount_table;
@@ -1126,13 +1248,17 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
     nb_clusters = size_to_clusters(s, size);
     refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t));
 
+    res->bfi.total_clusters =
+        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
+
     /* header */
     inc_refcounts(bs, res, refcount_table, nb_clusters,
         0, s->cluster_size);
 
     /* current L1 table */
     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
-                       s->l1_table_offset, s->l1_size, 1);
+                             s->l1_table_offset, s->l1_size,
+                             CHECK_OFLAG_COPIED | CHECK_FRAG_INFO);
     if (ret < 0) {
         goto fail;
     }
@@ -1187,7 +1313,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
     }
 
     /* compare ref counts */
-    for(i = 0; i < nb_clusters; i++) {
+    for (i = 0, highest_cluster = 0; i < nb_clusters; i++) {
         refcount1 = get_refcount(bs, i);
         if (refcount1 < 0) {
             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
@@ -1197,6 +1323,11 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
         }
 
         refcount2 = refcount_table[i];
+
+        if (refcount1 > 0 || refcount2 > 0) {
+            highest_cluster = i;
+        }
+
         if (refcount1 != refcount2) {
 
             /* Check if we're allowed to fix the mismatch */
@@ -1215,7 +1346,8 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
 
             if (num_fixed) {
                 ret = update_refcount(bs, i << s->cluster_bits, 1,
-                                      refcount2 - refcount1);
+                                      refcount2 - refcount1,
+                                      QCOW2_DISCARD_ALWAYS);
                 if (ret >= 0) {
                     (*num_fixed)++;
                     continue;
@@ -1231,6 +1363,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
         }
     }
 
+    res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
     ret = 0;
 
 fail:
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 4e7c93b8b..0caac9055 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -23,7 +23,7 @@
  */
 
 #include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
 #include "block/qcow2.h"
 
 typedef struct QEMU_PACKED QCowSnapshotHeader {
@@ -180,11 +180,14 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
 
     /* Allocate space for the new snapshot list */
     snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size);
-    bdrv_flush(bs->file);
     offset = snapshots_offset;
     if (offset < 0) {
         return offset;
     }
+    ret = bdrv_flush(bs);
+    if (ret < 0) {
+        return ret;
+    }
 
     /* Write all snapshots to the new list */
     for(i = 0; i < s->nb_snapshots; i++) {
@@ -259,7 +262,8 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
     }
 
     /* free the old snapshot table */
-    qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size);
+    qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size,
+                        QCOW2_DISCARD_SNAPSHOT);
     s->snapshots_offset = snapshots_offset;
     s->snapshots_size = snapshots_size;
     return 0;
@@ -378,11 +382,6 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
         goto fail;
     }
 
-    ret = bdrv_flush(bs);
-    if (ret < 0) {
-        goto fail;
-    }
-
     /* Append the new snapshot to the snapshot list */
     new_snapshot_list = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
     if (s->snapshots) {
@@ -571,7 +570,8 @@ int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
     if (ret < 0) {
         return ret;
     }
-    qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t));
+    qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t),
+                        QCOW2_DISCARD_SNAPSHOT);
 
     /* must update the copied flag on the current cluster offsets */
     ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
diff --git a/block/qcow2.c b/block/qcow2.c
index c1ff31f48..3376901bd 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -22,13 +22,14 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 #include <zlib.h>
-#include "aes.h"
+#include "qemu/aes.h"
 #include "block/qcow2.h"
-#include "qemu-error.h"
-#include "qerror.h"
+#include "qemu/error-report.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi/qmp/qbool.h"
 #include "trace.h"
 
 /*
@@ -222,7 +223,7 @@ static void report_unsupported_feature(BlockDriverState *bs,
  * updated successfully.  Therefore it is not required to check the return
  * value of this function.
  */
-static int qcow2_mark_dirty(BlockDriverState *bs)
+int qcow2_mark_dirty(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
     uint64_t val;
@@ -285,12 +286,44 @@ static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
     return ret;
 }
 
-static int qcow2_open(BlockDriverState *bs, int flags)
+static QemuOptsList qcow2_runtime_opts = {
+    .name = "qcow2",
+    .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head),
+    .desc = {
+        {
+            .name = QCOW2_OPT_LAZY_REFCOUNTS,
+            .type = QEMU_OPT_BOOL,
+            .help = "Postpone refcount updates",
+        },
+        {
+            .name = QCOW2_OPT_DISCARD_REQUEST,
+            .type = QEMU_OPT_BOOL,
+            .help = "Pass guest discard requests to the layer below",
+        },
+        {
+            .name = QCOW2_OPT_DISCARD_SNAPSHOT,
+            .type = QEMU_OPT_BOOL,
+            .help = "Generate discard requests when snapshot related space "
+                    "is freed",
+        },
+        {
+            .name = QCOW2_OPT_DISCARD_OTHER,
+            .type = QEMU_OPT_BOOL,
+            .help = "Generate discard requests when other clusters are freed",
+        },
+        { /* end of list */ }
+    },
+};
+
+static int qcow2_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVQcowState *s = bs->opaque;
     int len, i, ret = 0;
     QCowHeader header;
+    QemuOpts *opts;
+    Error *local_err = NULL;
     uint64_t ext_end;
+    uint64_t l1_vm_state_index;
 
     ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
     if (ret < 0) {
@@ -311,7 +344,7 @@ static int qcow2_open(BlockDriverState *bs, int flags)
     be32_to_cpus(&header.nb_snapshots);
 
     if (header.magic != QCOW_MAGIC) {
-        ret = -EINVAL;
+        ret = -EMEDIUMTYPE;
         goto fail;
     }
     if (header.version < 2 || header.version > 3) {
@@ -408,7 +441,14 @@ static int qcow2_open(BlockDriverState *bs, int flags)
 
     /* read the level 1 table */
     s->l1_size = header.l1_size;
-    s->l1_vm_state_index = size_to_l1(s, header.size);
+
+    l1_vm_state_index = size_to_l1(s, header.size);
+    if (l1_vm_state_index > INT_MAX) {
+        ret = -EFBIG;
+        goto fail;
+    }
+    s->l1_vm_state_index = l1_vm_state_index;
+
     /* the L1 table must contain at least enough entries to put
        header.size bytes */
     if (s->l1_size < s->l1_vm_state_index) {
@@ -446,6 +486,7 @@ static int qcow2_open(BlockDriverState *bs, int flags)
     }
 
     QLIST_INIT(&s->cluster_allocs);
+    QTAILQ_INIT(&s->discards);
 
     /* read qcow2 extensions */
     if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) {
@@ -495,6 +536,38 @@ static int qcow2_open(BlockDriverState *bs, int flags)
         }
     }
 
+    /* Enable lazy_refcounts according to image and command line options */
+    opts = qemu_opts_create_nofail(&qcow2_runtime_opts);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (error_is_set(&local_err)) {
+        qerror_report_err(local_err);
+        error_free(local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
+        (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));
+
+    s->discard_passthrough[QCOW2_DISCARD_NEVER] = false;
+    s->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true;
+    s->discard_passthrough[QCOW2_DISCARD_REQUEST] =
+        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST,
+                          flags & BDRV_O_UNMAP);
+    s->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] =
+        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true);
+    s->discard_passthrough[QCOW2_DISCARD_OTHER] =
+        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);
+
+    qemu_opts_del(opts);
+
+    if (s->use_lazy_refcounts && s->qcow_version < 3) {
+        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Lazy refcounts require "
+            "a qcow2 image with at least qemu 1.1 compatibility level");
+        ret = -EINVAL;
+        goto fail;
+    }
+
 #ifdef DEBUG_ALLOC
     {
         BdrvCheckResult result = {0};
@@ -584,7 +657,7 @@ static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
         *pnum = 0;
     }
 
-    return (cluster_offset != 0);
+    return (cluster_offset != 0) || (ret == QCOW2_CLUSTER_ZERO);
 }
 
 /* handle reading after the end of the backing file */
@@ -665,10 +738,6 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
             break;
 
         case QCOW2_CLUSTER_ZERO:
-            if (s->qcow_version < 3) {
-                ret = -EIO;
-                goto fail;
-            }
             qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
             break;
 
@@ -745,21 +814,6 @@ fail:
     return ret;
 }
 
-static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m)
-{
-    /* Take the request off the list of running requests */
-    if (m->nb_clusters != 0) {
-        QLIST_REMOVE(m, next_in_flight);
-    }
-
-    /* Restart all dependent requests */
-    if (!qemu_co_queue_empty(&m->dependent_requests)) {
-        qemu_co_mutex_unlock(&s->lock);
-        qemu_co_queue_restart_all(&m->dependent_requests);
-        qemu_co_mutex_lock(&s->lock);
-    }
-}
-
 static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
                            int64_t sector_num,
                            int remaining_sectors,
@@ -774,15 +828,11 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
     QEMUIOVector hd_qiov;
     uint64_t bytes_done = 0;
     uint8_t *cluster_data = NULL;
-    QCowL2Meta l2meta = {
-        .nb_clusters = 0,
-    };
+    QCowL2Meta *l2meta = NULL;
 
     trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
                                  remaining_sectors);
 
-    qemu_co_queue_init(&l2meta.dependent_requests);
-
     qemu_iovec_init(&hd_qiov, qiov->niov);
 
     s->cluster_cache_offset = -1; /* disable compressed cache */
@@ -791,6 +841,8 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
 
     while (remaining_sectors != 0) {
 
+        l2meta = NULL;
+
         trace_qcow2_writev_start_part(qemu_coroutine_self());
         index_in_cluster = sector_num & (s->cluster_sectors - 1);
         n_end = index_in_cluster + remaining_sectors;
@@ -800,17 +852,11 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
         }
 
         ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
-            index_in_cluster, n_end, &cur_nr_sectors, &l2meta);
+            index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta);
         if (ret < 0) {
             goto fail;
         }
 
-        if (l2meta.nb_clusters > 0 &&
-            (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)) {
-            qcow2_mark_dirty(bs);
-        }
-
-        cluster_offset = l2meta.cluster_offset;
         assert((cluster_offset & 511) == 0);
 
         qemu_iovec_reset(&hd_qiov);
@@ -835,8 +881,8 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
                 cur_nr_sectors * 512);
         }
 
-        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
         qemu_co_mutex_unlock(&s->lock);
+        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
         trace_qcow2_writev_data(qemu_coroutine_self(),
                                 (cluster_offset >> 9) + index_in_cluster);
         ret = bdrv_co_writev(bs->file,
@@ -847,12 +893,25 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
             goto fail;
         }
 
-        ret = qcow2_alloc_cluster_link_l2(bs, &l2meta);
-        if (ret < 0) {
-            goto fail;
-        }
+        while (l2meta != NULL) {
+            QCowL2Meta *next;
+
+            ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
+            if (ret < 0) {
+                goto fail;
+            }
+
+            /* Take the request off the list of running requests */
+            if (l2meta->nb_clusters != 0) {
+                QLIST_REMOVE(l2meta, next_in_flight);
+            }
+
+            qemu_co_queue_restart_all(&l2meta->dependent_requests);
 
-        run_dependent_requests(s, &l2meta);
+            next = l2meta->next;
+            g_free(l2meta);
+            l2meta = next;
+        }
 
         remaining_sectors -= cur_nr_sectors;
         sector_num += cur_nr_sectors;
@@ -862,10 +921,21 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
     ret = 0;
 
 fail:
-    run_dependent_requests(s, &l2meta);
-
     qemu_co_mutex_unlock(&s->lock);
 
+    while (l2meta != NULL) {
+        QCowL2Meta *next;
+
+        if (l2meta->nb_clusters != 0) {
+            QLIST_REMOVE(l2meta, next_in_flight);
+        }
+        qemu_co_queue_restart_all(&l2meta->dependent_requests);
+
+        next = l2meta->next;
+        g_free(l2meta);
+        l2meta = next;
+    }
+
     qemu_iovec_destroy(&hd_qiov);
     qemu_vfree(cluster_data);
     trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
@@ -902,6 +972,7 @@ static void qcow2_invalidate_cache(BlockDriverState *bs)
     AES_KEY aes_encrypt_key;
     AES_KEY aes_decrypt_key;
     uint32_t crypt_method = 0;
+    QDict *options;
 
     /*
      * Backing files are read-only which makes all of their metadata immutable,
@@ -916,8 +987,14 @@ static void qcow2_invalidate_cache(BlockDriverState *bs)
 
     qcow2_close(bs);
 
+    options = qdict_new();
+    qdict_put(options, QCOW2_OPT_LAZY_REFCOUNTS,
+              qbool_from_int(s->use_lazy_refcounts));
+
     memset(s, 0, sizeof(BDRVQcowState));
-    qcow2_open(bs, flags);
+    qcow2_open(bs, options, flags);
+
+    QDECREF(options);
 
     if (crypt_method) {
         s->crypt_method = crypt_method;
@@ -1128,31 +1205,34 @@ static int preallocate(BlockDriverState *bs)
 {
     uint64_t nb_sectors;
     uint64_t offset;
+    uint64_t host_offset = 0;
     int num;
     int ret;
-    QCowL2Meta meta;
+    QCowL2Meta *meta;
 
     nb_sectors = bdrv_getlength(bs) >> 9;
     offset = 0;
-    qemu_co_queue_init(&meta.dependent_requests);
-    meta.cluster_offset = 0;
 
     while (nb_sectors) {
         num = MIN(nb_sectors, INT_MAX >> 9);
-        ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, &meta);
+        ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num,
+                                         &host_offset, &meta);
         if (ret < 0) {
             return ret;
         }
 
-        ret = qcow2_alloc_cluster_link_l2(bs, &meta);
+        ret = qcow2_alloc_cluster_link_l2(bs, meta);
         if (ret < 0) {
-            qcow2_free_any_clusters(bs, meta.cluster_offset, meta.nb_clusters);
+            qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters,
+                                    QCOW2_DISCARD_NEVER);
             return ret;
         }
 
         /* There are no dependent requests, but we need to remove our request
          * from the list of in-flight requests */
-        run_dependent_requests(bs->opaque, &meta);
+        if (meta != NULL) {
+            QLIST_REMOVE(meta, next_in_flight);
+        }
 
         /* TODO Preallocate data if requested */
 
@@ -1165,10 +1245,10 @@ static int preallocate(BlockDriverState *bs)
      * all of the allocated clusters (otherwise we get failing reads after
      * EOF). Extend the image to the last allocated sector.
      */
-    if (meta.cluster_offset != 0) {
+    if (host_offset != 0) {
         uint8_t buf[512];
         memset(buf, 0, 512);
-        ret = bdrv_write(bs->file, (meta.cluster_offset >> 9) + num - 1, buf, 1);
+        ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1);
         if (ret < 0) {
             return ret;
         }
@@ -1216,7 +1296,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
         return ret;
     }
 
-    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
+    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR);
     if (ret < 0) {
         return ret;
     }
@@ -1268,7 +1348,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
      */
     BlockDriver* drv = bdrv_find_format("qcow2");
     assert(drv != NULL);
-    ret = bdrv_open(bs, filename,
+    ret = bdrv_open(bs, filename, NULL,
         BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv);
     if (ret < 0) {
         goto out;
@@ -1436,7 +1516,8 @@ static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
 static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
 {
     BDRVQcowState *s = bs->opaque;
-    int ret, new_l1_size;
+    int64_t new_l1_size;
+    int ret;
 
     if (offset & 511) {
         error_report("The new size must be a multiple of 512");
@@ -1493,8 +1574,21 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
         return 0;
     }
 
-    if (nb_sectors != s->cluster_sectors)
-        return -EINVAL;
+    if (nb_sectors != s->cluster_sectors) {
+        ret = -EINVAL;
+
+        /* Zero-pad last write if image size is not cluster aligned */
+        if (sector_num + nb_sectors == bs->total_sectors &&
+            nb_sectors < s->cluster_sectors) {
+            uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size);
+            memset(pad_buf, 0, s->cluster_size);
+            memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE);
+            ret = qcow2_write_compressed(bs, sector_num,
+                                         pad_buf, s->cluster_sectors);
+            qemu_vfree(pad_buf);
+        }
+        return ret;
+    }
 
     out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
 
@@ -1608,8 +1702,8 @@ static void dump_refcounts(BlockDriverState *bs)
 }
 #endif
 
-static int qcow2_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
-                              int64_t pos, int size)
+static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
+                              int64_t pos)
 {
     BDRVQcowState *s = bs->opaque;
     int growable = bs->growable;
@@ -1617,7 +1711,7 @@ static int qcow2_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
 
     BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
     bs->growable = 1;
-    ret = bdrv_pwrite(bs, qcow2_vm_state_offset(s) + pos, buf, size);
+    ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov);
     bs->growable = growable;
 
     return ret;
@@ -1691,6 +1785,7 @@ static BlockDriver bdrv_qcow2 = {
     .bdrv_close         = qcow2_close,
     .bdrv_reopen_prepare  = qcow2_reopen_prepare,
     .bdrv_create        = qcow2_create,
+    .bdrv_has_zero_init = bdrv_has_zero_init_1,
     .bdrv_co_is_allocated = qcow2_co_is_allocated,
     .bdrv_set_key       = qcow2_set_key,
     .bdrv_make_empty    = qcow2_make_empty,
diff --git a/block/qcow2.h b/block/qcow2.h
index b4eb65470..dba977141 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -25,8 +25,8 @@
 #ifndef BLOCK_QCOW2_H
 #define BLOCK_QCOW2_H
 
-#include "aes.h"
-#include "qemu-coroutine.h"
+#include "qemu/aes.h"
+#include "block/coroutine.h"
 
 //#define DEBUG_ALLOC
 //#define DEBUG_ALLOC2
@@ -58,6 +58,12 @@
 
 #define DEFAULT_CLUSTER_SIZE 65536
 
+
+#define QCOW2_OPT_LAZY_REFCOUNTS "lazy-refcounts"
+#define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request"
+#define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot"
+#define QCOW2_OPT_DISCARD_OTHER "pass-discard-other"
+
 typedef struct QCowHeader {
     uint32_t magic;
     uint32_t version;
@@ -126,12 +132,28 @@ enum {
     QCOW2_COMPAT_FEAT_MASK            = QCOW2_COMPAT_LAZY_REFCOUNTS,
 };
 
+enum qcow2_discard_type {
+    QCOW2_DISCARD_NEVER = 0,
+    QCOW2_DISCARD_ALWAYS,
+    QCOW2_DISCARD_REQUEST,
+    QCOW2_DISCARD_SNAPSHOT,
+    QCOW2_DISCARD_OTHER,
+    QCOW2_DISCARD_MAX
+};
+
 typedef struct Qcow2Feature {
     uint8_t type;
     uint8_t bit;
     char    name[46];
 } QEMU_PACKED Qcow2Feature;
 
+typedef struct Qcow2DiscardRegion {
+    BlockDriverState *bs;
+    uint64_t offset;
+    uint64_t bytes;
+    QTAILQ_ENTRY(Qcow2DiscardRegion) next;
+} Qcow2DiscardRegion;
+
 typedef struct BDRVQcowState {
     int cluster_bits;
     int cluster_size;
@@ -173,6 +195,9 @@ typedef struct BDRVQcowState {
 
     int flags;
     int qcow_version;
+    bool use_lazy_refcounts;
+
+    bool discard_passthrough[QCOW2_DISCARD_MAX];
 
     uint64_t incompatible_features;
     uint64_t compatible_features;
@@ -181,6 +206,8 @@ typedef struct BDRVQcowState {
     size_t unknown_header_fields_size;
     void* unknown_header_fields;
     QLIST_HEAD(, Qcow2UnknownHeaderExtension) unknown_header_ext;
+    QTAILQ_HEAD (, Qcow2DiscardRegion) discards;
+    bool cache_discards;
 } BDRVQcowState;
 
 /* XXX: use std qcow open function ? */
@@ -196,17 +223,59 @@ typedef struct QCowCreateState {
 
 struct QCowAIOCB;
 
-/* XXX This could be private for qcow2-cluster.c */
+typedef struct Qcow2COWRegion {
+    /**
+     * Offset of the COW region in bytes from the start of the first cluster
+     * touched by the request.
+     */
+    uint64_t    offset;
+
+    /** Number of sectors to copy */
+    int         nb_sectors;
+} Qcow2COWRegion;
+
+/**
+ * Describes an in-flight (part of a) write request that writes to clusters
+ * that are not referenced in their L2 table yet.
+ */
 typedef struct QCowL2Meta
 {
+    /** Guest offset of the first newly allocated cluster */
     uint64_t offset;
-    uint64_t cluster_offset;
+
+    /** Host offset of the first newly allocated cluster */
     uint64_t alloc_offset;
-    int n_start;
+
+    /**
+     * Number of sectors from the start of the first allocated cluster to
+     * the end of the (possibly shortened) request
+     */
     int nb_available;
+
+    /** Number of newly allocated clusters */
     int nb_clusters;
+
+    /**
+     * Requests that overlap with this allocation and wait to be restarted
+     * when the allocating request has completed.
+     */
     CoQueue dependent_requests;
 
+    /**
+     * The COW Region between the start of the first allocated cluster and the
+     * area the guest actually writes to.
+     */
+    Qcow2COWRegion cow_start;
+
+    /**
+     * The COW Region between the area the guest actually writes to and the
+     * end of the last allocated cluster.
+     */
+    Qcow2COWRegion cow_end;
+
+    /** Pointer to next L2Meta of the same write request */
+    struct QCowL2Meta *next;
+
     QLIST_ENTRY(QCowL2Meta) next_in_flight;
 } QCowL2Meta;
 
@@ -223,17 +292,32 @@ enum {
 
 #define REFT_OFFSET_MASK 0xffffffffffffff00ULL
 
+static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset)
+{
+    return offset & ~(s->cluster_size - 1);
+}
+
+static inline int64_t offset_into_cluster(BDRVQcowState *s, int64_t offset)
+{
+    return offset & (s->cluster_size - 1);
+}
+
 static inline int size_to_clusters(BDRVQcowState *s, int64_t size)
 {
     return (size + (s->cluster_size - 1)) >> s->cluster_bits;
 }
 
-static inline int size_to_l1(BDRVQcowState *s, int64_t size)
+static inline int64_t size_to_l1(BDRVQcowState *s, int64_t size)
 {
     int shift = s->cluster_bits + s->l2_bits;
     return (size + (1ULL << shift) - 1) >> shift;
 }
 
+static inline int offset_to_l2_index(BDRVQcowState *s, int64_t offset)
+{
+    return (offset >> s->cluster_bits) & (s->l2_size - 1);
+}
+
 static inline int64_t align_offset(int64_t offset, int n)
 {
     offset = (offset + n - 1) & ~(n - 1);
@@ -259,11 +343,24 @@ static inline bool qcow2_need_accurate_refcounts(BDRVQcowState *s)
     return !(s->incompatible_features & QCOW2_INCOMPAT_DIRTY);
 }
 
+static inline uint64_t l2meta_cow_start(QCowL2Meta *m)
+{
+    return m->offset + m->cow_start.offset;
+}
+
+static inline uint64_t l2meta_cow_end(QCowL2Meta *m)
+{
+    return m->offset + m->cow_end.offset
+        + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS);
+}
+
 // FIXME Need qcow2_ prefix to global functions
 
 /* qcow2.c functions */
 int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
                   int64_t sector_num, int nb_sectors);
+
+int qcow2_mark_dirty(BlockDriverState *bs);
 int qcow2_update_header(BlockDriverState *bs);
 
 /* qcow2-refcount.c functions */
@@ -275,9 +372,10 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
     int nb_clusters);
 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size);
 void qcow2_free_clusters(BlockDriverState *bs,
-    int64_t offset, int64_t size);
-void qcow2_free_any_clusters(BlockDriverState *bs,
-    uint64_t cluster_offset, int nb_clusters);
+                          int64_t offset, int64_t size,
+                          enum qcow2_discard_type type);
+void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
+                             int nb_clusters, enum qcow2_discard_type type);
 
 int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     int64_t l1_table_offset, int l1_size, int addend);
@@ -285,8 +383,11 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                           BdrvCheckMode fix);
 
+void qcow2_process_discards(BlockDriverState *bs, int ret);
+
 /* qcow2-cluster.c functions */
-int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size);
+int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
+                        bool exact_size);
 void qcow2_l2_cache_reset(BlockDriverState *bs);
 int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
 void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
@@ -297,7 +398,7 @@ void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
 int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
     int *num, uint64_t *cluster_offset);
 int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
-    int n_start, int n_end, int *num, QCowL2Meta *m);
+    int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m);
 uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
                                          uint64_t offset,
                                          int compressed_size);
diff --git a/block/qed-table.c b/block/qed-table.c
index de845ec3d..76d2dcccf 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -13,7 +13,7 @@
  */
 
 #include "trace.h"
-#include "qemu_socket.h" /* for EINPROGRESS on Windows */
+#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
 #include "qed.h"
 
 typedef struct {
diff --git a/block/qed.c b/block/qed.c
index 0b5374a20..f767b0528 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -12,11 +12,11 @@
  *
  */
 
-#include "qemu-timer.h"
+#include "qemu/timer.h"
 #include "trace.h"
 #include "qed.h"
-#include "qerror.h"
-#include "migration.h"
+#include "qapi/qmp/qerror.h"
+#include "migration/migration.h"
 
 static void qed_aio_cancel(BlockDriverAIOCB *blockacb)
 {
@@ -373,7 +373,7 @@ static void bdrv_qed_rebind(BlockDriverState *bs)
     s->bs = bs;
 }
 
-static int bdrv_qed_open(BlockDriverState *bs, int flags)
+static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVQEDState *s = bs->opaque;
     QEDHeader le_header;
@@ -390,7 +390,7 @@ static int bdrv_qed_open(BlockDriverState *bs, int flags)
     qed_header_le_to_cpu(&le_header, &s->header);
 
     if (s->header.magic != QED_MAGIC) {
-        return -EINVAL;
+        return -EMEDIUMTYPE;
     }
     if (s->header.features & ~QED_FEATURE_MASK) {
         /* image uses unsupported feature bits */
@@ -558,7 +558,7 @@ static int qed_create(const char *filename, uint32_t cluster_size,
         return ret;
     }
 
-    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR | BDRV_O_CACHE_WB);
+    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB);
     if (ret < 0) {
         return ret;
     }
@@ -1526,7 +1526,7 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs)
 
     bdrv_qed_close(bs);
     memset(s, 0, sizeof(BDRVQEDState));
-    bdrv_qed_open(bs, bs->open_flags);
+    bdrv_qed_open(bs, NULL, bs->open_flags);
 }
 
 static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result,
@@ -1574,6 +1574,7 @@ static BlockDriver bdrv_qed = {
     .bdrv_close               = bdrv_qed_close,
     .bdrv_reopen_prepare      = bdrv_qed_reopen_prepare,
     .bdrv_create              = bdrv_qed_create,
+    .bdrv_has_zero_init       = bdrv_has_zero_init_1,
     .bdrv_co_is_allocated     = bdrv_qed_co_is_allocated,
     .bdrv_make_empty          = bdrv_qed_make_empty,
     .bdrv_aio_readv           = bdrv_qed_aio_readv,
diff --git a/block/qed.h b/block/qed.h
index a063bf70a..2b4ddedf3 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -15,7 +15,7 @@
 #ifndef BLOCK_QED_H
 #define BLOCK_QED_H
 
-#include "block_int.h"
+#include "block/block_int.h"
 
 /* The layout of a QED file is as follows:
  *
diff --git a/block/raw-aio.h b/block/raw-aio.h
index e77f36114..c61f1595d 100644
--- a/block/raw-aio.h
+++ b/block/raw-aio.h
@@ -20,11 +20,14 @@
 #define QEMU_AIO_WRITE        0x0002
 #define QEMU_AIO_IOCTL        0x0004
 #define QEMU_AIO_FLUSH        0x0008
+#define QEMU_AIO_DISCARD      0x0010
 #define QEMU_AIO_TYPE_MASK \
-	(QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH)
+        (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \
+         QEMU_AIO_DISCARD)
 
 /* AIO flags */
 #define QEMU_AIO_MISALIGNED   0x1000
+#define QEMU_AIO_BLKDEV       0x2000
 
 
 /* linux-aio.c - Linux native implementation */
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 550c81f22..ba721d3f5 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -22,14 +22,13 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "qemu-timer.h"
-#include "qemu-char.h"
-#include "qemu-log.h"
-#include "block_int.h"
-#include "module.h"
+#include "qemu/timer.h"
+#include "qemu/log.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 #include "trace.h"
-#include "thread-pool.h"
-#include "iov.h"
+#include "block/thread-pool.h"
+#include "qemu/iov.h"
 #include "raw-aio.h"
 
 #if defined(__APPLE__) && (__MACH__)
@@ -60,6 +59,9 @@
 #ifdef CONFIG_FIEMAP
 #include <linux/fiemap.h>
 #endif
+#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
+#include <linux/falloc.h>
+#endif
 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
 #include <sys/disk.h>
 #include <sys/cdio.h>
@@ -139,6 +141,7 @@ typedef struct BDRVRawState {
 #ifdef CONFIG_XFS
     bool is_xfs : 1;
 #endif
+    bool has_discard : 1;
 } BDRVRawState;
 
 typedef struct BDRVRawReopenState {
@@ -160,7 +163,7 @@ typedef struct RawPosixAIOData {
         void *aio_ioctl_buf;
     };
     int aio_niov;
-    size_t aio_nbytes;
+    uint64_t aio_nbytes;
 #define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */
     off_t aio_offset;
     int aio_type;
@@ -259,15 +262,42 @@ error:
 }
 #endif
 
-static int raw_open_common(BlockDriverState *bs, const char *filename,
+static QemuOptsList raw_runtime_opts = {
+    .name = "raw",
+    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
+    .desc = {
+        {
+            .name = "filename",
+            .type = QEMU_OPT_STRING,
+            .help = "File name of the image",
+        },
+        { /* end of list */ }
+    },
+};
+
+static int raw_open_common(BlockDriverState *bs, QDict *options,
                            int bdrv_flags, int open_flags)
 {
     BDRVRawState *s = bs->opaque;
+    QemuOpts *opts;
+    Error *local_err = NULL;
+    const char *filename;
     int fd, ret;
 
+    opts = qemu_opts_create_nofail(&raw_runtime_opts);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (error_is_set(&local_err)) {
+        qerror_report_err(local_err);
+        error_free(local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    filename = qemu_opt_get(opts, "filename");
+
     ret = raw_normalize_devicepath(&filename);
     if (ret != 0) {
-        return ret;
+        goto fail;
     }
 
     s->open_flags = open_flags;
@@ -277,34 +307,40 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
     fd = qemu_open(filename, s->open_flags, 0644);
     if (fd < 0) {
         ret = -errno;
-        if (ret == -EROFS)
+        if (ret == -EROFS) {
             ret = -EACCES;
-        return ret;
+        }
+        goto fail;
     }
     s->fd = fd;
 
 #ifdef CONFIG_LINUX_AIO
     if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) {
         qemu_close(fd);
-        return -errno;
+        ret = -errno;
+        goto fail;
     }
 #endif
 
+    s->has_discard = 1;
 #ifdef CONFIG_XFS
     if (platform_test_xfs_fd(s->fd)) {
         s->is_xfs = 1;
     }
 #endif
 
-    return 0;
+    ret = 0;
+fail:
+    qemu_opts_del(opts);
+    return ret;
 }
 
-static int raw_open(BlockDriverState *bs, const char *filename, int flags)
+static int raw_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVRawState *s = bs->opaque;
 
     s->type = FTYPE_FILE;
-    return raw_open_common(bs, filename, flags, 0);
+    return raw_open_common(bs, options, flags, 0);
 }
 
 static int raw_reopen_prepare(BDRVReopenState *state,
@@ -341,11 +377,20 @@ static int raw_reopen_prepare(BDRVReopenState *state,
 
     raw_s->fd = -1;
 
-    int fcntl_flags = O_APPEND | O_ASYNC | O_NONBLOCK;
+    int fcntl_flags = O_APPEND | O_NONBLOCK;
 #ifdef O_NOATIME
     fcntl_flags |= O_NOATIME;
 #endif
 
+#ifdef O_ASYNC
+    /* Not all operating systems have O_ASYNC, and those that don't
+     * will not let us track the state into raw_s->open_flags (typically
+     * you achieve the same effect with an ioctl, for example I_SETSIG
+     * on Solaris). But we do not use O_ASYNC, so that's fine.
+     */
+    assert((s->open_flags & O_ASYNC) == 0);
+#endif
+
     if ((raw_s->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
         /* dup the original fd */
         /* TODO: use qemu fcntl wrapper */
@@ -431,22 +476,6 @@ static void raw_reopen_abort(BDRVReopenState *state)
 #endif
 */
 
-/*
- * Check if all memory in this vector is sector aligned.
- */
-static int qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
-{
-    int i;
-
-    for (i = 0; i < qiov->niov; i++) {
-        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
-            return 0;
-        }
-    }
-
-    return 1;
-}
-
 static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
 {
     int ret;
@@ -456,15 +485,7 @@ static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
         return -errno;
     }
 
-    /*
-     * This looks weird, but the aio code only considers a request
-     * successful if it has written the full number of bytes.
-     *
-     * Now we overload aio_nbytes as aio_ioctl_cmd for the ioctl command,
-     * so in fact we return the ioctl command here to make posix_aio_read()
-     * happy..
-     */
-    return aiocb->aio_nbytes;
+    return 0;
 }
 
 static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb)
@@ -643,6 +664,72 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
     return nbytes;
 }
 
+#ifdef CONFIG_XFS
+static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
+{
+    struct xfs_flock64 fl;
+
+    memset(&fl, 0, sizeof(fl));
+    fl.l_whence = SEEK_SET;
+    fl.l_start = offset;
+    fl.l_len = bytes;
+
+    if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) {
+        DEBUG_BLOCK_PRINT("cannot punch hole (%s)\n", strerror(errno));
+        return -errno;
+    }
+
+    return 0;
+}
+#endif
+
+static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
+{
+    int ret = -EOPNOTSUPP;
+    BDRVRawState *s = aiocb->bs->opaque;
+
+    if (s->has_discard == 0) {
+        return 0;
+    }
+
+    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
+#ifdef BLKDISCARD
+        do {
+            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
+            if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
+                return 0;
+            }
+        } while (errno == EINTR);
+
+        ret = -errno;
+#endif
+    } else {
+#ifdef CONFIG_XFS
+        if (s->is_xfs) {
+            return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
+        }
+#endif
+
+#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
+        do {
+            if (fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                          aiocb->aio_offset, aiocb->aio_nbytes) == 0) {
+                return 0;
+            }
+        } while (errno == EINTR);
+
+        ret = -errno;
+#endif
+    }
+
+    if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP ||
+        ret == -ENOTTY) {
+        s->has_discard = 0;
+        ret = 0;
+    }
+    return ret;
+}
+
 static int aio_worker(void *arg)
 {
     RawPosixAIOData *aiocb = arg;
@@ -677,6 +764,9 @@ static int aio_worker(void *arg)
     case QEMU_AIO_IOCTL:
         ret = handle_aiocb_ioctl(aiocb);
         break;
+    case QEMU_AIO_DISCARD:
+        ret = handle_aiocb_discard(aiocb);
+        break;
     default:
         fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
         ret = -EINVAL;
@@ -692,6 +782,7 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
         BlockDriverCompletionFunc *cb, void *opaque, int type)
 {
     RawPosixAIOData *acb = g_slice_new(RawPosixAIOData);
+    ThreadPool *pool;
 
     acb->bs = bs;
     acb->aio_type = type;
@@ -705,23 +796,8 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
     acb->aio_offset = sector_num * 512;
 
     trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
-    return thread_pool_submit_aio(aio_worker, acb, cb, opaque);
-}
-
-static BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd,
-        unsigned long int req, void *buf,
-        BlockDriverCompletionFunc *cb, void *opaque)
-{
-    RawPosixAIOData *acb = g_slice_new(RawPosixAIOData);
-
-    acb->bs = bs;
-    acb->aio_type = QEMU_AIO_IOCTL;
-    acb->aio_fildes = fd;
-    acb->aio_offset = 0;
-    acb->aio_ioctl_buf = buf;
-    acb->aio_ioctl_cmd = req;
-
-    return thread_pool_submit_aio(aio_worker, acb, cb, opaque);
+    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
+    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
 }
 
 static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
@@ -739,7 +815,7 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
      * driver that it needs to copy the buffer.
      */
     if ((bs->open_flags & BDRV_O_NOCACHE)) {
-        if (!qiov_is_aligned(bs, qiov)) {
+        if (!bdrv_qiov_is_aligned(bs, qiov)) {
             type |= QEMU_AIO_MISALIGNED;
 #ifdef CONFIG_LINUX_AIO
         } else if (s->use_aio) {
@@ -1093,37 +1169,14 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
     }
 }
 
-#ifdef CONFIG_XFS
-static int xfs_discard(BDRVRawState *s, int64_t sector_num, int nb_sectors)
-{
-    struct xfs_flock64 fl;
-
-    memset(&fl, 0, sizeof(fl));
-    fl.l_whence = SEEK_SET;
-    fl.l_start = sector_num << 9;
-    fl.l_len = (int64_t)nb_sectors << 9;
-
-    if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) {
-        DEBUG_BLOCK_PRINT("cannot punch hole (%s)\n", strerror(errno));
-        return -errno;
-    }
-
-    return 0;
-}
-#endif
-
-static coroutine_fn int raw_co_discard(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors)
+static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors,
+    BlockDriverCompletionFunc *cb, void *opaque)
 {
-#ifdef CONFIG_XFS
     BDRVRawState *s = bs->opaque;
 
-    if (s->is_xfs) {
-        return xfs_discard(s, sector_num, nb_sectors);
-    }
-#endif
-
-    return 0;
+    return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors,
+                       cb, opaque, QEMU_AIO_DISCARD);
 }
 
 static QEMUOptionParameter raw_create_options[] = {
@@ -1146,12 +1199,13 @@ static BlockDriver bdrv_file = {
     .bdrv_reopen_abort = raw_reopen_abort,
     .bdrv_close = raw_close,
     .bdrv_create = raw_create,
-    .bdrv_co_discard = raw_co_discard,
+    .bdrv_has_zero_init = bdrv_has_zero_init_1,
     .bdrv_co_is_allocated = raw_co_is_allocated,
 
     .bdrv_aio_readv = raw_aio_readv,
     .bdrv_aio_writev = raw_aio_writev,
     .bdrv_aio_flush = raw_aio_flush,
+    .bdrv_aio_discard = raw_aio_discard,
 
     .bdrv_truncate = raw_truncate,
     .bdrv_getlength = raw_getlength,
@@ -1238,9 +1292,44 @@ static int hdev_probe_device(const char *filename)
     return 0;
 }
 
-static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
+static int check_hdev_writable(BDRVRawState *s)
+{
+#if defined(BLKROGET)
+    /* Linux block devices can be configured "read-only" using blockdev(8).
+     * This is independent of device node permissions and therefore open(2)
+     * with O_RDWR succeeds.  Actual writes fail with EPERM.
+     *
+     * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
+     * check for read-only block devices so that Linux block devices behave
+     * properly.
+     */
+    struct stat st;
+    int readonly = 0;
+
+    if (fstat(s->fd, &st)) {
+        return -errno;
+    }
+
+    if (!S_ISBLK(st.st_mode)) {
+        return 0;
+    }
+
+    if (ioctl(s->fd, BLKROGET, &readonly) < 0) {
+        return -errno;
+    }
+
+    if (readonly) {
+        return -EACCES;
+    }
+#endif /* defined(BLKROGET) */
+    return 0;
+}
+
+static int hdev_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVRawState *s = bs->opaque;
+    int ret;
+    const char *filename = qdict_get_str(options, "filename");
 
 #if defined(__APPLE__) && defined(__MACH__)
     if (strstart(filename, "/dev/cdrom", NULL)) {
@@ -1262,6 +1351,7 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
                 qemu_close(fd);
             }
             filename = bsdPath;
+            qdict_put(options, "filename", qstring_from_str(filename));
         }
 
         if ( mediaIterator )
@@ -1281,7 +1371,20 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
     }
 #endif
 
-    return raw_open_common(bs, filename, flags, 0);
+    ret = raw_open_common(bs, options, flags, 0);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (flags & BDRV_O_RDWR) {
+        ret = check_hdev_writable(s);
+        if (ret < 0) {
+            raw_close(bs);
+            return ret;
+        }
+    }
+
+    return ret;
 }
 
 #if defined(__linux__)
@@ -1346,10 +1449,21 @@ static BlockDriverAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
         BlockDriverCompletionFunc *cb, void *opaque)
 {
     BDRVRawState *s = bs->opaque;
+    RawPosixAIOData *acb;
+    ThreadPool *pool;
 
     if (fd_open(bs) < 0)
         return NULL;
-    return paio_ioctl(bs, s->fd, req, buf, cb, opaque);
+
+    acb = g_slice_new(RawPosixAIOData);
+    acb->bs = bs;
+    acb->aio_type = QEMU_AIO_IOCTL;
+    acb->aio_fildes = s->fd;
+    acb->aio_offset = 0;
+    acb->aio_ioctl_buf = buf;
+    acb->aio_ioctl_cmd = req;
+    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
+    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
 }
 
 #elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
@@ -1371,6 +1485,19 @@ static int fd_open(BlockDriverState *bs)
 
 #endif /* !linux && !FreeBSD */
 
+static coroutine_fn BlockDriverAIOCB *hdev_aio_discard(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors,
+    BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (fd_open(bs) < 0) {
+        return NULL;
+    }
+    return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors,
+                       cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
+}
+
 static int hdev_create(const char *filename, QEMUOptionParameter *options)
 {
     int fd;
@@ -1401,11 +1528,6 @@ static int hdev_create(const char *filename, QEMUOptionParameter *options)
     return ret;
 }
 
-static int hdev_has_zero_init(BlockDriverState *bs)
-{
-    return 0;
-}
-
 static BlockDriver bdrv_host_device = {
     .format_name        = "host_device",
     .protocol_name        = "host_device",
@@ -1418,11 +1540,11 @@ static BlockDriver bdrv_host_device = {
     .bdrv_reopen_abort   = raw_reopen_abort,
     .bdrv_create        = hdev_create,
     .create_options     = raw_create_options,
-    .bdrv_has_zero_init = hdev_has_zero_init,
 
     .bdrv_aio_readv	= raw_aio_readv,
     .bdrv_aio_writev	= raw_aio_writev,
     .bdrv_aio_flush	= raw_aio_flush,
+    .bdrv_aio_discard   = hdev_aio_discard,
 
     .bdrv_truncate      = raw_truncate,
     .bdrv_getlength	= raw_getlength,
@@ -1437,7 +1559,7 @@ static BlockDriver bdrv_host_device = {
 };
 
 #ifdef __linux__
-static int floppy_open(BlockDriverState *bs, const char *filename, int flags)
+static int floppy_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVRawState *s = bs->opaque;
     int ret;
@@ -1445,7 +1567,7 @@ static int floppy_open(BlockDriverState *bs, const char *filename, int flags)
     s->type = FTYPE_FD;
 
     /* open will not fail even if no floppy is inserted, so add O_NONBLOCK */
-    ret = raw_open_common(bs, filename, flags, O_NONBLOCK);
+    ret = raw_open_common(bs, options, flags, O_NONBLOCK);
     if (ret)
         return ret;
 
@@ -1542,7 +1664,6 @@ static BlockDriver bdrv_host_floppy = {
     .bdrv_reopen_abort   = raw_reopen_abort,
     .bdrv_create        = hdev_create,
     .create_options     = raw_create_options,
-    .bdrv_has_zero_init = hdev_has_zero_init,
 
     .bdrv_aio_readv     = raw_aio_readv,
     .bdrv_aio_writev    = raw_aio_writev,
@@ -1559,14 +1680,14 @@ static BlockDriver bdrv_host_floppy = {
     .bdrv_eject         = floppy_eject,
 };
 
-static int cdrom_open(BlockDriverState *bs, const char *filename, int flags)
+static int cdrom_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVRawState *s = bs->opaque;
 
     s->type = FTYPE_CD;
 
     /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
-    return raw_open_common(bs, filename, flags, O_NONBLOCK);
+    return raw_open_common(bs, options, flags, O_NONBLOCK);
 }
 
 static int cdrom_probe_device(const char *filename)
@@ -1644,7 +1765,6 @@ static BlockDriver bdrv_host_cdrom = {
     .bdrv_reopen_abort   = raw_reopen_abort,
     .bdrv_create        = hdev_create,
     .create_options     = raw_create_options,
-    .bdrv_has_zero_init = hdev_has_zero_init,
 
     .bdrv_aio_readv     = raw_aio_readv,
     .bdrv_aio_writev    = raw_aio_writev,
@@ -1667,14 +1787,14 @@ static BlockDriver bdrv_host_cdrom = {
 #endif /* __linux__ */
 
 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
-static int cdrom_open(BlockDriverState *bs, const char *filename, int flags)
+static int cdrom_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVRawState *s = bs->opaque;
     int ret;
 
     s->type = FTYPE_CD;
 
-    ret = raw_open_common(bs, filename, flags, 0);
+    ret = raw_open_common(bs, options, flags, 0);
     if (ret)
         return ret;
 
@@ -1766,7 +1886,6 @@ static BlockDriver bdrv_host_cdrom = {
     .bdrv_reopen_abort   = raw_reopen_abort,
     .bdrv_create        = hdev_create,
     .create_options     = raw_create_options,
-    .bdrv_has_zero_init = hdev_has_zero_init,
 
     .bdrv_aio_readv     = raw_aio_readv,
     .bdrv_aio_writev    = raw_aio_writev,
@@ -1784,6 +1903,40 @@ static BlockDriver bdrv_host_cdrom = {
 };
 #endif /* __FreeBSD__ */
 
+#ifdef CONFIG_LINUX_AIO
+/**
+ * Return the file descriptor for Linux AIO
+ *
+ * This function is a layering violation and should be removed when it becomes
+ * possible to call the block layer outside the global mutex.  It allows the
+ * caller to hijack the file descriptor so I/O can be performed outside the
+ * block layer.
+ */
+int raw_get_aio_fd(BlockDriverState *bs)
+{
+    BDRVRawState *s;
+
+    if (!bs->drv) {
+        return -ENOMEDIUM;
+    }
+
+    if (bs->drv == bdrv_find_format("raw")) {
+        bs = bs->file;
+    }
+
+    /* raw-posix has several protocols so just check for raw_aio_readv */
+    if (bs->drv->bdrv_aio_readv != raw_aio_readv) {
+        return -ENOTSUP;
+    }
+
+    s = bs->opaque;
+    if (!s->use_aio) {
+        return -ENOTSUP;
+    }
+    return s->fd;
+}
+#endif /* CONFIG_LINUX_AIO */
+
 static void bdrv_file_init(void)
 {
     /*
diff --git a/block/raw-win32.c b/block/raw-win32.c
index 0c05c58c5..9b5b2af4e 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -22,13 +22,13 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "qemu-timer.h"
-#include "block_int.h"
-#include "module.h"
+#include "qemu/timer.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 #include "raw-aio.h"
 #include "trace.h"
-#include "thread-pool.h"
-#include "iov.h"
+#include "block/thread-pool.h"
+#include "qemu/iov.h"
 #include <windows.h>
 #include <winioctl.h>
 
@@ -144,6 +144,7 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile,
         BlockDriverCompletionFunc *cb, void *opaque, int type)
 {
     RawWin32AIOData *acb = g_slice_new(RawWin32AIOData);
+    ThreadPool *pool;
 
     acb->bs = bs;
     acb->hfile = hfile;
@@ -157,7 +158,8 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile,
     acb->aio_offset = sector_num * 512;
 
     trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
-    return thread_pool_submit_aio(aio_worker, acb, cb, opaque);
+    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
+    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
 }
 
 int qemu_ftruncate64(int fd, int64_t length)
@@ -219,20 +221,49 @@ static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped)
     }
 }
 
-static int raw_open(BlockDriverState *bs, const char *filename, int flags)
+static QemuOptsList raw_runtime_opts = {
+    .name = "raw",
+    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
+    .desc = {
+        {
+            .name = "filename",
+            .type = QEMU_OPT_STRING,
+            .help = "File name of the image",
+        },
+        { /* end of list */ }
+    },
+};
+
+static int raw_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVRawState *s = bs->opaque;
     int access_flags;
     DWORD overlapped;
+    QemuOpts *opts;
+    Error *local_err = NULL;
+    const char *filename;
+    int ret;
 
     s->type = FTYPE_FILE;
 
+    opts = qemu_opts_create_nofail(&raw_runtime_opts);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (error_is_set(&local_err)) {
+        qerror_report_err(local_err);
+        error_free(local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    filename = qemu_opt_get(opts, "filename");
+
     raw_parse_flags(flags, &access_flags, &overlapped);
-    
+
     if ((flags & BDRV_O_NATIVE_AIO) && aio == NULL) {
         aio = win32_aio_init();
         if (aio == NULL) {
-            return -EINVAL;
+            ret = -EINVAL;
+            goto fail;
         }
     }
 
@@ -242,20 +273,27 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
     if (s->hfile == INVALID_HANDLE_VALUE) {
         int err = GetLastError();
 
-        if (err == ERROR_ACCESS_DENIED)
-            return -EACCES;
-        return -EINVAL;
+        if (err == ERROR_ACCESS_DENIED) {
+            ret = -EACCES;
+        } else {
+            ret = -EINVAL;
+        }
+        goto fail;
     }
 
     if (flags & BDRV_O_NATIVE_AIO) {
-        int ret = win32_aio_attach(aio, s->hfile);
+        ret = win32_aio_attach(aio, s->hfile);
         if (ret < 0) {
             CloseHandle(s->hfile);
-            return ret;
+            goto fail;
         }
         s->aio = aio;
     }
-    return 0;
+
+    ret = 0;
+fail:
+    qemu_opts_del(opts);
+    return ret;
 }
 
 static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
@@ -303,13 +341,24 @@ static int raw_truncate(BlockDriverState *bs, int64_t offset)
 {
     BDRVRawState *s = bs->opaque;
     LONG low, high;
+    DWORD dwPtrLow;
 
     low = offset;
     high = offset >> 32;
-    if (!SetFilePointer(s->hfile, low, &high, FILE_BEGIN))
-	return -EIO;
-    if (!SetEndOfFile(s->hfile))
+
+    /*
+     * An error has occurred if the return value is INVALID_SET_FILE_POINTER
+     * and GetLastError doesn't return NO_ERROR.
+     */
+    dwPtrLow = SetFilePointer(s->hfile, low, &high, FILE_BEGIN);
+    if (dwPtrLow == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) {
+        fprintf(stderr, "SetFilePointer error: %lu\n", GetLastError());
+        return -EIO;
+    }
+    if (SetEndOfFile(s->hfile) == 0) {
+        fprintf(stderr, "SetEndOfFile error: %lu\n", GetLastError());
         return -EIO;
+    }
     return 0;
 }
 
@@ -410,6 +459,7 @@ static BlockDriver bdrv_file = {
     .bdrv_file_open	= raw_open,
     .bdrv_close		= raw_close,
     .bdrv_create	= raw_create,
+    .bdrv_has_zero_init = bdrv_has_zero_init_1,
 
     .bdrv_aio_readv     = raw_aio_readv,
     .bdrv_aio_writev    = raw_aio_writev,
@@ -481,12 +531,13 @@ static int hdev_probe_device(const char *filename)
     return 0;
 }
 
-static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
+static int hdev_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVRawState *s = bs->opaque;
     int access_flags, create_flags;
     DWORD overlapped;
     char device_name[64];
+    const char *filename = qdict_get_str(options, "filename");
 
     if (strstart(filename, "/dev/cdrom", NULL)) {
         if (find_cdrom(device_name, sizeof(device_name)) < 0)
@@ -520,11 +571,6 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
     return 0;
 }
 
-static int hdev_has_zero_init(BlockDriverState *bs)
-{
-    return 0;
-}
-
 static BlockDriver bdrv_host_device = {
     .format_name	= "host_device",
     .protocol_name	= "host_device",
@@ -532,7 +578,6 @@ static BlockDriver bdrv_host_device = {
     .bdrv_probe_device	= hdev_probe_device,
     .bdrv_file_open	= hdev_open,
     .bdrv_close		= raw_close,
-    .bdrv_has_zero_init = hdev_has_zero_init,
 
     .bdrv_aio_readv     = raw_aio_readv,
     .bdrv_aio_writev    = raw_aio_writev,
diff --git a/block/raw.c b/block/raw.c
index 253e949b8..47518253f 100644
--- a/block/raw.c
+++ b/block/raw.c
@@ -1,9 +1,32 @@
+/*
+ * Block driver for RAW format
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
 
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 
-static int raw_open(BlockDriverState *bs, int flags)
+static int raw_open(BlockDriverState *bs, QDict *options, int flags)
 {
     bs->sg = bs->file->sg;
     return 0;
@@ -42,6 +65,13 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
     return bdrv_co_is_allocated(bs->file, sector_num, nb_sectors, pnum);
 }
 
+static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs,
+                                            int64_t sector_num,
+                                            int nb_sectors)
+{
+    return bdrv_co_write_zeroes(bs->file, sector_num, nb_sectors);
+}
+
 static int64_t raw_getlength(BlockDriverState *bs)
 {
     return bdrv_getlength(bs->file);
@@ -114,6 +144,11 @@ static int raw_has_zero_init(BlockDriverState *bs)
     return bdrv_has_zero_init(bs->file);
 }
 
+static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    return bdrv_get_info(bs->file, bdi);
+}
+
 static BlockDriver bdrv_raw = {
     .format_name        = "raw",
 
@@ -128,10 +163,12 @@ static BlockDriver bdrv_raw = {
     .bdrv_co_readv          = raw_co_readv,
     .bdrv_co_writev         = raw_co_writev,
     .bdrv_co_is_allocated   = raw_co_is_allocated,
+    .bdrv_co_write_zeroes   = raw_co_write_zeroes,
     .bdrv_co_discard        = raw_co_discard,
 
     .bdrv_probe         = raw_probe,
     .bdrv_getlength     = raw_getlength,
+    .bdrv_get_info      = raw_get_info,
     .bdrv_truncate      = raw_truncate,
 
     .bdrv_is_inserted   = raw_is_inserted,
diff --git a/block/rbd.c b/block/rbd.c
index f3becc7a8..cb7175121 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -14,8 +14,8 @@
 #include <inttypes.h>
 
 #include "qemu-common.h"
-#include "qemu-error.h"
-#include "block_int.h"
+#include "qemu/error-report.h"
+#include "block/block_int.h"
 
 #include <rbd/librbd.h>
 
@@ -63,7 +63,8 @@
 typedef enum {
     RBD_AIO_READ,
     RBD_AIO_WRITE,
-    RBD_AIO_DISCARD
+    RBD_AIO_DISCARD,
+    RBD_AIO_FLUSH
 } RBDAIOCmd;
 
 typedef struct RBDAIOCB {
@@ -77,6 +78,7 @@ typedef struct RBDAIOCB {
     int error;
     struct BDRVRBDState *s;
     int cancelled;
+    int status;
 } RBDAIOCB;
 
 typedef struct RADOSCB {
@@ -376,16 +378,9 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
     RBDAIOCB *acb = rcb->acb;
     int64_t r;
 
-    if (acb->cancelled) {
-        qemu_vfree(acb->bounce);
-        qemu_aio_release(acb);
-        goto done;
-    }
-
     r = rcb->ret;
 
-    if (acb->cmd == RBD_AIO_WRITE ||
-        acb->cmd == RBD_AIO_DISCARD) {
+    if (acb->cmd != RBD_AIO_READ) {
         if (r < 0) {
             acb->ret = r;
             acb->error = 1;
@@ -409,7 +404,6 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
     /* Note that acb->bh can be NULL in case where the aio was cancelled */
     acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
     qemu_bh_schedule(acb->bh);
-done:
     g_free(rcb);
 }
 
@@ -447,7 +441,21 @@ static int qemu_rbd_aio_flush_cb(void *opaque)
     return (s->qemu_aio_count > 0);
 }
 
-static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags)
+/* TODO Convert to fine grained options */
+static QemuOptsList runtime_opts = {
+    .name = "rbd",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+    .desc = {
+        {
+            .name = "filename",
+            .type = QEMU_OPT_STRING,
+            .help = "Specification of the rbd image",
+        },
+        { /* end of list */ }
+    },
+};
+
+static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVRBDState *s = bs->opaque;
     char pool[RBD_MAX_POOL_NAME_SIZE];
@@ -455,20 +463,35 @@ static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags)
     char conf[RBD_MAX_CONF_SIZE];
     char clientname_buf[RBD_MAX_CONF_SIZE];
     char *clientname;
+    QemuOpts *opts;
+    Error *local_err = NULL;
+    const char *filename;
     int r;
 
+    opts = qemu_opts_create_nofail(&runtime_opts);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (error_is_set(&local_err)) {
+        qerror_report_err(local_err);
+        error_free(local_err);
+        qemu_opts_del(opts);
+        return -EINVAL;
+    }
+
+    filename = qemu_opt_get(opts, "filename");
+
     if (qemu_rbd_parsename(filename, pool, sizeof(pool),
                            snap_buf, sizeof(snap_buf),
                            s->name, sizeof(s->name),
                            conf, sizeof(conf)) < 0) {
-        return -EINVAL;
+        r = -EINVAL;
+        goto failed_opts;
     }
 
     clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
     r = rados_create(&s->cluster, clientname);
     if (r < 0) {
         error_report("error initializing");
-        return r;
+        goto failed_opts;
     }
 
     s->snap = NULL;
@@ -534,6 +557,7 @@ static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags)
                             NULL, qemu_rbd_aio_flush_cb, s);
 
 
+    qemu_opts_del(opts);
     return 0;
 
 failed:
@@ -543,6 +567,8 @@ failed_open:
 failed_shutdown:
     rados_shutdown(s->cluster);
     g_free(s->snap);
+failed_opts:
+    qemu_opts_del(opts);
     return r;
 }
 
@@ -568,6 +594,12 @@ static void qemu_rbd_aio_cancel(BlockDriverAIOCB *blockacb)
 {
     RBDAIOCB *acb = (RBDAIOCB *) blockacb;
     acb->cancelled = 1;
+
+    while (acb->status == -EINPROGRESS) {
+        qemu_aio_wait();
+    }
+
+    qemu_aio_release(acb);
 }
 
 static const AIOCBInfo rbd_aiocb_info = {
@@ -639,8 +671,11 @@ static void rbd_aio_bh_cb(void *opaque)
     acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
     qemu_bh_delete(acb->bh);
     acb->bh = NULL;
+    acb->status = 0;
 
-    qemu_aio_release(acb);
+    if (!acb->cancelled) {
+        qemu_aio_release(acb);
+    }
 }
 
 static int rbd_aio_discard_wrapper(rbd_image_t image,
@@ -655,6 +690,16 @@ static int rbd_aio_discard_wrapper(rbd_image_t image,
 #endif
 }
 
+static int rbd_aio_flush_wrapper(rbd_image_t image,
+                                 rbd_completion_t comp)
+{
+#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
+    return rbd_aio_flush(image, comp);
+#else
+    return -ENOTSUP;
+#endif
+}
+
 static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
                                        int64_t sector_num,
                                        QEMUIOVector *qiov,
@@ -675,7 +720,7 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
     acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque);
     acb->cmd = cmd;
     acb->qiov = qiov;
-    if (cmd == RBD_AIO_DISCARD) {
+    if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) {
         acb->bounce = NULL;
     } else {
         acb->bounce = qemu_blockalign(bs, qiov->size);
@@ -685,6 +730,7 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
     acb->s = s;
     acb->cancelled = 0;
     acb->bh = NULL;
+    acb->status = -EINPROGRESS;
 
     if (cmd == RBD_AIO_WRITE) {
         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
@@ -718,6 +764,9 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
     case RBD_AIO_DISCARD:
         r = rbd_aio_discard_wrapper(s->image, off, size, c);
         break;
+    case RBD_AIO_FLUSH:
+        r = rbd_aio_flush_wrapper(s->image, c);
+        break;
     default:
         r = -EINVAL;
     }
@@ -757,6 +806,16 @@ static BlockDriverAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
                          RBD_AIO_WRITE);
 }
 
+#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
+static BlockDriverAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs,
+                                            BlockDriverCompletionFunc *cb,
+                                            void *opaque)
+{
+    return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH);
+}
+
+#else
+
 static int qemu_rbd_co_flush(BlockDriverState *bs)
 {
 #if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 1)
@@ -767,6 +826,7 @@ static int qemu_rbd_co_flush(BlockDriverState *bs)
     return 0;
 #endif
 }
+#endif
 
 static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
@@ -936,6 +996,7 @@ static BlockDriver bdrv_rbd = {
     .bdrv_file_open     = qemu_rbd_open,
     .bdrv_close         = qemu_rbd_close,
     .bdrv_create        = qemu_rbd_create,
+    .bdrv_has_zero_init = bdrv_has_zero_init_1,
     .bdrv_get_info      = qemu_rbd_getinfo,
     .create_options     = qemu_rbd_create_options,
     .bdrv_getlength     = qemu_rbd_getlength,
@@ -944,7 +1005,12 @@ static BlockDriver bdrv_rbd = {
 
     .bdrv_aio_readv         = qemu_rbd_aio_readv,
     .bdrv_aio_writev        = qemu_rbd_aio_writev,
+
+#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
+    .bdrv_aio_flush         = qemu_rbd_aio_flush,
+#else
     .bdrv_co_flush_to_disk  = qemu_rbd_co_flush,
+#endif
 
 #ifdef LIBRBD_SUPPORTS_DISCARD
     .bdrv_aio_discard       = qemu_rbd_aio_discard,
diff --git a/block/sheepdog.c b/block/sheepdog.c
index a48f58cfe..afe053376 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -13,19 +13,22 @@
  */
 
 #include "qemu-common.h"
-#include "qemu-error.h"
-#include "qemu_socket.h"
-#include "block_int.h"
-#include "bitops.h"
+#include "qemu/uri.h"
+#include "qemu/error-report.h"
+#include "qemu/sockets.h"
+#include "block/block_int.h"
+#include "qemu/bitops.h"
 
 #define SD_PROTO_VER 0x01
 
 #define SD_DEFAULT_ADDR "localhost"
-#define SD_DEFAULT_PORT "7000"
+#define SD_DEFAULT_PORT 7000
 
 #define SD_OP_CREATE_AND_WRITE_OBJ  0x01
 #define SD_OP_READ_OBJ       0x02
 #define SD_OP_WRITE_OBJ      0x03
+/* 0x04 is used internally by Sheepdog */
+#define SD_OP_DISCARD_OBJ    0x05
 
 #define SD_OP_NEW_VDI        0x11
 #define SD_OP_LOCK_VDI       0x12
@@ -33,10 +36,12 @@
 #define SD_OP_GET_VDI_INFO   0x14
 #define SD_OP_READ_VDIS      0x15
 #define SD_OP_FLUSH_VDI      0x16
+#define SD_OP_DEL_VDI        0x17
 
 #define SD_FLAG_CMD_WRITE    0x01
 #define SD_FLAG_CMD_COW      0x02
-#define SD_FLAG_CMD_CACHE    0x04
+#define SD_FLAG_CMD_CACHE    0x04 /* Writeback mode for cache */
+#define SD_FLAG_CMD_DIRECT   0x08 /* Don't use cache */
 
 #define SD_RES_SUCCESS       0x00 /* Success */
 #define SD_RES_UNKNOWN       0x01 /* Unknown error */
@@ -63,6 +68,8 @@
 #define SD_RES_WAIT_FOR_FORMAT  0x16 /* Waiting for a format operation */
 #define SD_RES_WAIT_FOR_JOIN    0x17 /* Waiting for other nodes joining */
 #define SD_RES_JOIN_FAILED   0x18 /* Target node had failed to join sheepdog */
+#define SD_RES_HALT          0x19 /* Sheepdog is stopped serving IO request */
+#define SD_RES_READONLY      0x1A /* Object is read-only */
 
 /*
  * Object ID rules
@@ -84,7 +91,6 @@
 #define SD_NR_VDIS   (1U << 24)
 #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
 #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
-#define SECTOR_SIZE 512
 
 #define SD_INODE_SIZE (sizeof(SheepdogInode))
 #define CURRENT_VDI_ID 0
@@ -144,7 +150,7 @@ typedef struct SheepdogVdiReq {
     uint32_t id;
     uint32_t data_length;
     uint64_t vdi_size;
-    uint32_t base_vdi_id;
+    uint32_t vdi_id;
     uint32_t copies;
     uint32_t snapid;
     uint32_t pad[3];
@@ -236,14 +242,14 @@ static inline bool is_snapshot(struct SheepdogInode *inode)
     return !!inode->snap_ctime;
 }
 
-#undef dprintf
+#undef DPRINTF
 #ifdef DEBUG_SDOG
-#define dprintf(fmt, args...)                                       \
+#define DPRINTF(fmt, args...)                                       \
     do {                                                            \
         fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
     } while (0)
 #else
-#define dprintf(fmt, args...)
+#define DPRINTF(fmt, args...)
 #endif
 
 typedef struct SheepdogAIOCB SheepdogAIOCB;
@@ -265,6 +271,8 @@ typedef struct AIOReq {
 enum AIOCBState {
     AIOCB_WRITE_UDATA,
     AIOCB_READ_UDATA,
+    AIOCB_FLUSH_CACHE,
+    AIOCB_DISCARD_OBJ,
 };
 
 struct SheepdogAIOCB {
@@ -293,12 +301,12 @@ typedef struct BDRVSheepdogState {
 
     char name[SD_MAX_VDI_LEN];
     bool is_snapshot;
-    bool cache_enabled;
+    uint32_t cache_flags;
+    bool discard_supported;
 
-    char *addr;
-    char *port;
+    char *host_spec;
+    bool is_unix;
     int fd;
-    int flush_fd;
 
     CoMutex lock;
     Coroutine *co_send;
@@ -342,6 +350,8 @@ static const char * sd_strerror(int err)
         {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
         {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
         {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
+        {SD_RES_HALT, "Sheepdog is stopped serving IO request"},
+        {SD_RES_READONLY, "Object is read-only"},
     };
 
     for (i = 0; i < ARRAY_SIZE(errors); ++i) {
@@ -426,12 +436,11 @@ static const AIOCBInfo sd_aiocb_info = {
 };
 
 static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
-                                   int64_t sector_num, int nb_sectors,
-                                   BlockDriverCompletionFunc *cb, void *opaque)
+                                   int64_t sector_num, int nb_sectors)
 {
     SheepdogAIOCB *acb;
 
-    acb = qemu_aio_get(&sd_aiocb_info, bs, cb, opaque);
+    acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL);
 
     acb->qiov = qiov;
 
@@ -446,56 +455,31 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
     return acb;
 }
 
-static int connect_to_sdog(const char *addr, const char *port)
+static int connect_to_sdog(BDRVSheepdogState *s)
 {
-    char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
-    int fd, ret;
-    struct addrinfo hints, *res, *res0;
-
-    if (!addr) {
-        addr = SD_DEFAULT_ADDR;
-        port = SD_DEFAULT_PORT;
-    }
-
-    memset(&hints, 0, sizeof(hints));
-    hints.ai_socktype = SOCK_STREAM;
-
-    ret = getaddrinfo(addr, port, &hints, &res0);
-    if (ret) {
-        error_report("unable to get address info %s, %s",
-                     addr, strerror(errno));
-        return -errno;
-    }
-
-    for (res = res0; res; res = res->ai_next) {
-        ret = getnameinfo(res->ai_addr, res->ai_addrlen, hbuf, sizeof(hbuf),
-                          sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV);
-        if (ret) {
-            continue;
-        }
+    int fd;
+    Error *err = NULL;
 
-        fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
-        if (fd < 0) {
-            continue;
-        }
+    if (s->is_unix) {
+        fd = unix_connect(s->host_spec, &err);
+    } else {
+        fd = inet_connect(s->host_spec, &err);
 
-    reconnect:
-        ret = connect(fd, res->ai_addr, res->ai_addrlen);
-        if (ret < 0) {
-            if (errno == EINTR) {
-                goto reconnect;
+        if (err == NULL) {
+            int ret = socket_set_nodelay(fd);
+            if (ret < 0) {
+                error_report("%s", strerror(errno));
             }
-            close(fd);
-            break;
         }
+    }
 
-        dprintf("connected to %s:%s\n", addr, port);
-        goto success;
+    if (err != NULL) {
+        qerror_report_err(err);
+        error_free(err);
+    } else {
+        qemu_set_nonblock(fd);
     }
-    fd = -errno;
-    error_report("failed connect to %s:%s", addr, port);
-success:
-    freeaddrinfo(res0);
+
     return fd;
 }
 
@@ -525,6 +509,13 @@ static void restart_co_req(void *opaque)
     qemu_coroutine_enter(co, NULL);
 }
 
+static int have_co_req(void *opaque)
+{
+    /* this handler is set only when there is a pending request, so
+     * always returns 1. */
+    return 1;
+}
+
 typedef struct SheepdogReqCo {
     int sockfd;
     SheepdogReq *hdr;
@@ -547,15 +538,14 @@ static coroutine_fn void do_co_req(void *opaque)
     unsigned int *rlen = srco->rlen;
 
     co = qemu_coroutine_self();
-    qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, NULL, co);
+    qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, have_co_req, co);
 
-    socket_set_block(sockfd);
     ret = send_co_req(sockfd, hdr, data, wlen);
     if (ret < 0) {
         goto out;
     }
 
-    qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, NULL, co);
+    qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, have_co_req, co);
 
     ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
     if (ret < sizeof(*hdr)) {
@@ -578,8 +568,9 @@ static coroutine_fn void do_co_req(void *opaque)
     }
     ret = 0;
 out:
+    /* there is at most one request for this sockfd, so it is safe to
+     * set each handler to NULL. */
     qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL, NULL);
-    socket_set_nonblock(sockfd);
 
     srco->ret = ret;
     srco->finished = true;
@@ -615,6 +606,7 @@ static int do_req(int sockfd, SheepdogReq *hdr, void *data,
 static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
                            struct iovec *iov, int niov, bool create,
                            enum AIOCBState aiocb_type);
+static int coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req);
 
 
 static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid)
@@ -671,7 +663,7 @@ static void coroutine_fn aio_read_response(void *opaque)
     int ret;
     AIOReq *aio_req = NULL;
     SheepdogAIOCB *acb;
-    unsigned long idx;
+    uint64_t idx;
 
     if (QLIST_EMPTY(&s->inflight_aio_head)) {
         goto out;
@@ -714,16 +706,17 @@ static void coroutine_fn aio_read_response(void *opaque)
              * and max_dirty_data_idx are changed to include updated
              * index between them.
              */
-            s->inode.data_vdi_id[idx] = s->inode.vdi_id;
-            s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
-            s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
-
+            if (rsp.result == SD_RES_SUCCESS) {
+                s->inode.data_vdi_id[idx] = s->inode.vdi_id;
+                s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
+                s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
+            }
             /*
              * Some requests may be blocked because simultaneous
              * create requests are not allowed, so we search the
              * pending requests here.
              */
-            send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx));
+            send_pending_req(s, aio_req->oid);
         }
         break;
     case AIOCB_READ_UDATA:
@@ -734,11 +727,43 @@ static void coroutine_fn aio_read_response(void *opaque)
             goto out;
         }
         break;
+    case AIOCB_FLUSH_CACHE:
+        if (rsp.result == SD_RES_INVALID_PARMS) {
+            DPRINTF("disable cache since the server doesn't support it\n");
+            s->cache_flags = SD_FLAG_CMD_DIRECT;
+            rsp.result = SD_RES_SUCCESS;
+        }
+        break;
+    case AIOCB_DISCARD_OBJ:
+        switch (rsp.result) {
+        case SD_RES_INVALID_PARMS:
+            error_report("sheep(%s) doesn't support discard command",
+                         s->host_spec);
+            rsp.result = SD_RES_SUCCESS;
+            s->discard_supported = false;
+            break;
+        case SD_RES_SUCCESS:
+            idx = data_oid_to_idx(aio_req->oid);
+            s->inode.data_vdi_id[idx] = 0;
+            break;
+        default:
+            break;
+        }
     }
 
-    if (rsp.result != SD_RES_SUCCESS) {
+    switch (rsp.result) {
+    case SD_RES_SUCCESS:
+        break;
+    case SD_RES_READONLY:
+        ret = resend_aioreq(s, aio_req);
+        if (ret == SD_RES_SUCCESS) {
+            goto out;
+        }
+        /* fall through */
+    default:
         acb->ret = -EIO;
         error_report("%s", sd_strerror(rsp.result));
+        break;
     }
 
     free_aio_req(s, aio_req);
@@ -779,15 +804,6 @@ static int aio_flush_request(void *opaque)
         !QLIST_EMPTY(&s->pending_aio_head);
 }
 
-static int set_nodelay(int fd)
-{
-    int ret, opt;
-
-    opt = 1;
-    ret = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)&opt, sizeof(opt));
-    return ret;
-}
-
 /*
  * Return a socket discriptor to read/write objects.
  *
@@ -796,29 +812,86 @@ static int set_nodelay(int fd)
  */
 static int get_sheep_fd(BDRVSheepdogState *s)
 {
-    int ret, fd;
+    int fd;
 
-    fd = connect_to_sdog(s->addr, s->port);
+    fd = connect_to_sdog(s);
     if (fd < 0) {
-        error_report("%s", strerror(errno));
         return fd;
     }
 
-    socket_set_nonblock(fd);
+    qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request, s);
+    return fd;
+}
 
-    ret = set_nodelay(fd);
-    if (ret) {
-        error_report("%s", strerror(errno));
-        closesocket(fd);
-        return -errno;
+static int sd_parse_uri(BDRVSheepdogState *s, const char *filename,
+                        char *vdi, uint32_t *snapid, char *tag)
+{
+    URI *uri;
+    QueryParams *qp = NULL;
+    int ret = 0;
+
+    uri = uri_parse(filename);
+    if (!uri) {
+        return -EINVAL;
     }
 
-    qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request, s);
-    return fd;
+    /* transport */
+    if (!strcmp(uri->scheme, "sheepdog")) {
+        s->is_unix = false;
+    } else if (!strcmp(uri->scheme, "sheepdog+tcp")) {
+        s->is_unix = false;
+    } else if (!strcmp(uri->scheme, "sheepdog+unix")) {
+        s->is_unix = true;
+    } else {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if (uri->path == NULL || !strcmp(uri->path, "/")) {
+        ret = -EINVAL;
+        goto out;
+    }
+    pstrcpy(vdi, SD_MAX_VDI_LEN, uri->path + 1);
+
+    qp = query_params_parse(uri->query);
+    if (qp->n > 1 || (s->is_unix && !qp->n) || (!s->is_unix && qp->n)) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if (s->is_unix) {
+        /* sheepdog+unix:///vdiname?socket=path */
+        if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) {
+            ret = -EINVAL;
+            goto out;
+        }
+        s->host_spec = g_strdup(qp->p[0].value);
+    } else {
+        /* sheepdog[+tcp]://[host:port]/vdiname */
+        s->host_spec = g_strdup_printf("%s:%d", uri->server ?: SD_DEFAULT_ADDR,
+                                       uri->port ?: SD_DEFAULT_PORT);
+    }
+
+    /* snapshot tag */
+    if (uri->fragment) {
+        *snapid = strtoul(uri->fragment, NULL, 10);
+        if (*snapid == 0) {
+            pstrcpy(tag, SD_MAX_VDI_TAG_LEN, uri->fragment);
+        }
+    } else {
+        *snapid = CURRENT_VDI_ID; /* search current vdi */
+    }
+
+out:
+    if (qp) {
+        query_params_free(qp);
+    }
+    uri_free(uri);
+    return ret;
 }
 
 /*
- * Parse a filename
+ * Parse a filename (old syntax)
  *
  * filename must be one of the following formats:
  *   1. [vdiname]
@@ -837,9 +910,11 @@ static int get_sheep_fd(BDRVSheepdogState *s)
 static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
                          char *vdi, uint32_t *snapid, char *tag)
 {
-    char *p, *q;
-    int nr_sep;
+    char *p, *q, *uri;
+    const char *host_spec, *vdi_spec;
+    int nr_sep, ret;
 
+    strstart(filename, "sheepdog:", (const char **)&filename);
     p = q = g_strdup(filename);
 
     /* count the number of separators */
@@ -852,42 +927,37 @@ static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
     }
     p = q;
 
-    /* use the first two tokens as hostname and port number. */
+    /* use the first two tokens as host_spec. */
     if (nr_sep >= 2) {
-        s->addr = p;
+        host_spec = p;
         p = strchr(p, ':');
-        *p++ = '\0';
-
-        s->port = p;
+        p++;
         p = strchr(p, ':');
         *p++ = '\0';
     } else {
-        s->addr = NULL;
-        s->port = 0;
+        host_spec = "";
     }
 
-    pstrcpy(vdi, SD_MAX_VDI_LEN, p);
+    vdi_spec = p;
 
-    p = strchr(vdi, ':');
+    p = strchr(vdi_spec, ':');
     if (p) {
-        *p++ = '\0';
-        *snapid = strtoul(p, NULL, 10);
-        if (*snapid == 0) {
-            pstrcpy(tag, SD_MAX_VDI_TAG_LEN, p);
-        }
-    } else {
-        *snapid = CURRENT_VDI_ID; /* search current vdi */
+        *p++ = '#';
     }
 
-    if (s->addr == NULL) {
-        g_free(q);
-    }
+    uri = g_strdup_printf("sheepdog://%s/%s", host_spec, vdi_spec);
 
-    return 0;
+    ret = sd_parse_uri(s, uri, vdi, snapid, tag);
+
+    g_free(q);
+    g_free(uri);
+
+    return ret;
 }
 
-static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
-                         char *tag, uint32_t *vid, int for_snapshot)
+static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
+                         uint32_t snapid, const char *tag, uint32_t *vid,
+                         bool lock)
 {
     int ret, fd;
     SheepdogVdiReq hdr;
@@ -895,7 +965,7 @@ static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
     unsigned int wlen, rlen = 0;
     char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
 
-    fd = connect_to_sdog(s->addr, s->port);
+    fd = connect_to_sdog(s);
     if (fd < 0) {
         return fd;
     }
@@ -908,10 +978,10 @@ static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
     strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
 
     memset(&hdr, 0, sizeof(hdr));
-    if (for_snapshot) {
-        hdr.opcode = SD_OP_GET_VDI_INFO;
-    } else {
+    if (lock) {
         hdr.opcode = SD_OP_LOCK_VDI;
+    } else {
+        hdr.opcode = SD_OP_GET_VDI_INFO;
     }
     wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
     hdr.proto_ver = SD_PROTO_VER;
@@ -948,7 +1018,7 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
 {
     int nr_copies = s->inode.nr_copies;
     SheepdogObjReq hdr;
-    unsigned int wlen;
+    unsigned int wlen = 0;
     int ret;
     uint64_t oid = aio_req->oid;
     unsigned int datalen = aio_req->data_len;
@@ -962,22 +1032,30 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
 
     memset(&hdr, 0, sizeof(hdr));
 
-    if (aiocb_type == AIOCB_READ_UDATA) {
-        wlen = 0;
+    switch (aiocb_type) {
+    case AIOCB_FLUSH_CACHE:
+        hdr.opcode = SD_OP_FLUSH_VDI;
+        break;
+    case AIOCB_READ_UDATA:
         hdr.opcode = SD_OP_READ_OBJ;
         hdr.flags = flags;
-    } else if (create) {
-        wlen = datalen;
-        hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
-        hdr.flags = SD_FLAG_CMD_WRITE | flags;
-    } else {
+        break;
+    case AIOCB_WRITE_UDATA:
+        if (create) {
+            hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
+        } else {
+            hdr.opcode = SD_OP_WRITE_OBJ;
+        }
         wlen = datalen;
-        hdr.opcode = SD_OP_WRITE_OBJ;
         hdr.flags = SD_FLAG_CMD_WRITE | flags;
+        break;
+    case AIOCB_DISCARD_OBJ:
+        hdr.opcode = SD_OP_DISCARD_OBJ;
+        break;
     }
 
-    if (s->cache_enabled) {
-        hdr.flags |= SD_FLAG_CMD_CACHE;
+    if (s->cache_flags) {
+        hdr.flags |= s->cache_flags;
     }
 
     hdr.oid = oid;
@@ -1022,7 +1100,7 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
 
 static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
                              unsigned int datalen, uint64_t offset,
-                             bool write, bool create, bool cache)
+                             bool write, bool create, uint32_t cache_flags)
 {
     SheepdogObjReq hdr;
     SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
@@ -1046,9 +1124,7 @@ static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
         hdr.opcode = SD_OP_READ_OBJ;
     }
 
-    if (cache) {
-        hdr.flags |= SD_FLAG_CMD_CACHE;
-    }
+    hdr.flags |= cache_flags;
 
     hdr.oid = oid;
     hdr.data_length = datalen;
@@ -1071,21 +1147,119 @@ static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
 }
 
 static int read_object(int fd, char *buf, uint64_t oid, int copies,
-                       unsigned int datalen, uint64_t offset, bool cache)
+                       unsigned int datalen, uint64_t offset,
+                       uint32_t cache_flags)
 {
     return read_write_object(fd, buf, oid, copies, datalen, offset, false,
-                             false, cache);
+                             false, cache_flags);
 }
 
 static int write_object(int fd, char *buf, uint64_t oid, int copies,
                         unsigned int datalen, uint64_t offset, bool create,
-                        bool cache)
+                        uint32_t cache_flags)
 {
     return read_write_object(fd, buf, oid, copies, datalen, offset, true,
-                             create, cache);
+                             create, cache_flags);
 }
 
-static int sd_open(BlockDriverState *bs, const char *filename, int flags)
+/* update inode with the latest state */
+static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag)
+{
+    SheepdogInode *inode;
+    int ret = 0, fd;
+    uint32_t vid = 0;
+
+    fd = connect_to_sdog(s);
+    if (fd < 0) {
+        return -EIO;
+    }
+
+    inode = g_malloc(sizeof(s->inode));
+
+    ret = find_vdi_name(s, s->name, snapid, tag, &vid, false);
+    if (ret) {
+        goto out;
+    }
+
+    ret = read_object(fd, (char *)inode, vid_to_vdi_oid(vid),
+                      s->inode.nr_copies, sizeof(*inode), 0, s->cache_flags);
+    if (ret < 0) {
+        goto out;
+    }
+
+    if (inode->vdi_id != s->inode.vdi_id) {
+        memcpy(&s->inode, inode, sizeof(s->inode));
+    }
+
+out:
+    g_free(inode);
+    closesocket(fd);
+
+    return ret;
+}
+
+static int coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
+{
+    SheepdogAIOCB *acb = aio_req->aiocb;
+    bool create = false;
+    int ret;
+
+    ret = reload_inode(s, 0, "");
+    if (ret < 0) {
+        return ret;
+    }
+
+    aio_req->oid = vid_to_data_oid(s->inode.vdi_id,
+                                   data_oid_to_idx(aio_req->oid));
+
+    /* check whether this request becomes a CoW one */
+    if (acb->aiocb_type == AIOCB_WRITE_UDATA) {
+        int idx = data_oid_to_idx(aio_req->oid);
+        AIOReq *areq;
+
+        if (s->inode.data_vdi_id[idx] == 0) {
+            create = true;
+            goto out;
+        }
+        if (is_data_obj_writable(&s->inode, idx)) {
+            goto out;
+        }
+
+        /* link to the pending list if there is another CoW request to
+         * the same object */
+        QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) {
+            if (areq != aio_req && areq->oid == aio_req->oid) {
+                DPRINTF("simultaneous CoW to %" PRIx64 "\n", aio_req->oid);
+                QLIST_REMOVE(aio_req, aio_siblings);
+                QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings);
+                return SD_RES_SUCCESS;
+            }
+        }
+
+        aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx);
+        aio_req->flags |= SD_FLAG_CMD_COW;
+        create = true;
+    }
+out:
+    return add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
+                           create, acb->aiocb_type);
+}
+
+/* TODO Convert to fine grained options */
+static QemuOptsList runtime_opts = {
+    .name = "sheepdog",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+    .desc = {
+        {
+            .name = "filename",
+            .type = QEMU_OPT_STRING,
+            .help = "URL to the sheepdog image",
+        },
+        { /* end of list */ }
+    },
+};
+
+static int sd_open(BlockDriverState *bs, QDict *options, int flags)
 {
     int ret, fd;
     uint32_t vid = 0;
@@ -1093,8 +1267,20 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
     char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
     uint32_t snapid;
     char *buf = NULL;
+    QemuOpts *opts;
+    Error *local_err = NULL;
+    const char *filename;
+
+    opts = qemu_opts_create_nofail(&runtime_opts);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (error_is_set(&local_err)) {
+        qerror_report_err(local_err);
+        error_free(local_err);
+        ret = -EINVAL;
+        goto out;
+    }
 
-    strstart(filename, "sheepdog:", (const char **)&filename);
+    filename = qemu_opt_get(opts, "filename");
 
     QLIST_INIT(&s->inflight_aio_head);
     QLIST_INIT(&s->pending_aio_head);
@@ -1102,8 +1288,13 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
 
     memset(vdi, 0, sizeof(vdi));
     memset(tag, 0, sizeof(tag));
-    if (parse_vdiname(s, filename, vdi, &snapid, tag) < 0) {
-        ret = -EINVAL;
+
+    if (strstr(filename, "://")) {
+        ret = sd_parse_uri(s, filename, vdi, &snapid, tag);
+    } else {
+        ret = parse_vdiname(s, filename, vdi, &snapid, tag);
+    }
+    if (ret < 0) {
         goto out;
     }
     s->fd = get_sheep_fd(s);
@@ -1112,34 +1303,35 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
         goto out;
     }
 
-    ret = find_vdi_name(s, vdi, snapid, tag, &vid, 0);
+    ret = find_vdi_name(s, vdi, snapid, tag, &vid, true);
     if (ret) {
         goto out;
     }
 
-    s->cache_enabled = true;
-    s->flush_fd = connect_to_sdog(s->addr, s->port);
-    if (s->flush_fd < 0) {
-        error_report("failed to connect");
-        ret = s->flush_fd;
-        goto out;
+    /*
+     * QEMU block layer emulates writethrough cache as 'writeback + flush', so
+     * we always set SD_FLAG_CMD_CACHE (writeback cache) as default.
+     */
+    s->cache_flags = SD_FLAG_CMD_CACHE;
+    if (flags & BDRV_O_NOCACHE) {
+        s->cache_flags = SD_FLAG_CMD_DIRECT;
     }
+    s->discard_supported = true;
 
     if (snapid || tag[0] != '\0') {
-        dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
+        DPRINTF("%" PRIx32 " snapshot inode was open.\n", vid);
         s->is_snapshot = true;
     }
 
-    fd = connect_to_sdog(s->addr, s->port);
+    fd = connect_to_sdog(s);
     if (fd < 0) {
-        error_report("failed to connect");
         ret = fd;
         goto out;
     }
 
     buf = g_malloc(SD_INODE_SIZE);
     ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0,
-                      s->cache_enabled);
+                      s->cache_flags);
 
     closesocket(fd);
 
@@ -1151,9 +1343,10 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
     s->min_dirty_data_idx = UINT32_MAX;
     s->max_dirty_data_idx = 0;
 
-    bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
+    bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE;
     pstrcpy(s->name, sizeof(s->name), vdi);
     qemu_co_mutex_init(&s->lock);
+    qemu_opts_del(opts);
     g_free(buf);
     return 0;
 out:
@@ -1161,13 +1354,13 @@ out:
     if (s->fd >= 0) {
         closesocket(s->fd);
     }
+    qemu_opts_del(opts);
     g_free(buf);
     return ret;
 }
 
-static int do_sd_create(char *filename, int64_t vdi_size,
-                        uint32_t base_vid, uint32_t *vdi_id, int snapshot,
-                        const char *addr, const char *port)
+static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size,
+                        uint32_t base_vid, uint32_t *vdi_id, int snapshot)
 {
     SheepdogVdiReq hdr;
     SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
@@ -1175,7 +1368,7 @@ static int do_sd_create(char *filename, int64_t vdi_size,
     unsigned int wlen, rlen = 0;
     char buf[SD_MAX_VDI_LEN];
 
-    fd = connect_to_sdog(addr, port);
+    fd = connect_to_sdog(s);
     if (fd < 0) {
         return fd;
     }
@@ -1188,7 +1381,7 @@ static int do_sd_create(char *filename, int64_t vdi_size,
 
     memset(&hdr, 0, sizeof(hdr));
     hdr.opcode = SD_OP_NEW_VDI;
-    hdr.base_vdi_id = base_vid;
+    hdr.vdi_id = base_vid;
 
     wlen = SD_MAX_VDI_LEN;
 
@@ -1226,7 +1419,7 @@ static int sd_prealloc(const char *filename)
     void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
     int ret;
 
-    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
+    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR);
     if (ret < 0) {
         goto out;
     }
@@ -1271,17 +1464,17 @@ static int sd_create(const char *filename, QEMUOptionParameter *options)
     char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
     uint32_t snapid;
     bool prealloc = false;
-    const char *vdiname;
 
     s = g_malloc0(sizeof(BDRVSheepdogState));
 
-    strstart(filename, "sheepdog:", &vdiname);
-
     memset(vdi, 0, sizeof(vdi));
     memset(tag, 0, sizeof(tag));
-    if (parse_vdiname(s, vdiname, vdi, &snapid, tag) < 0) {
-        error_report("invalid filename");
-        ret = -EINVAL;
+    if (strstr(filename, "://")) {
+        ret = sd_parse_uri(s, filename, vdi, &snapid, tag);
+    } else {
+        ret = parse_vdiname(s, filename, vdi, &snapid, tag);
+    }
+    if (ret < 0) {
         goto out;
     }
 
@@ -1317,14 +1510,14 @@ static int sd_create(const char *filename, QEMUOptionParameter *options)
         BlockDriver *drv;
 
         /* Currently, only Sheepdog backing image is supported. */
-        drv = bdrv_find_protocol(backing_file);
+        drv = bdrv_find_protocol(backing_file, true);
         if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
             error_report("backing_file must be a sheepdog image");
             ret = -EINVAL;
             goto out;
         }
 
-        ret = bdrv_file_open(&bs, backing_file, 0);
+        ret = bdrv_file_open(&bs, backing_file, NULL, 0);
         if (ret < 0) {
             goto out;
         }
@@ -1342,7 +1535,7 @@ static int sd_create(const char *filename, QEMUOptionParameter *options)
         bdrv_delete(bs);
     }
 
-    ret = do_sd_create(vdi, vdi_size, base_vid, &vid, 0, s->addr, s->port);
+    ret = do_sd_create(s, vdi, vdi_size, base_vid, &vid, 0);
     if (!prealloc || ret) {
         goto out;
     }
@@ -1361,9 +1554,9 @@ static void sd_close(BlockDriverState *bs)
     unsigned int wlen, rlen = 0;
     int fd, ret;
 
-    dprintf("%s\n", s->name);
+    DPRINTF("%s\n", s->name);
 
-    fd = connect_to_sdog(s->addr, s->port);
+    fd = connect_to_sdog(s);
     if (fd < 0) {
         return;
     }
@@ -1371,6 +1564,7 @@ static void sd_close(BlockDriverState *bs)
     memset(&hdr, 0, sizeof(hdr));
 
     hdr.opcode = SD_OP_RELEASE_VDI;
+    hdr.vdi_id = s->inode.vdi_id;
     wlen = strlen(s->name) + 1;
     hdr.data_length = wlen;
     hdr.flags = SD_FLAG_CMD_WRITE;
@@ -1386,10 +1580,7 @@ static void sd_close(BlockDriverState *bs)
 
     qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL);
     closesocket(s->fd);
-    if (s->cache_enabled) {
-        closesocket(s->flush_fd);
-    }
-    g_free(s->addr);
+    g_free(s->host_spec);
 }
 
 static int64_t sd_getlength(BlockDriverState *bs)
@@ -1413,7 +1604,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
         return -EINVAL;
     }
 
-    fd = connect_to_sdog(s->addr, s->port);
+    fd = connect_to_sdog(s);
     if (fd < 0) {
         return fd;
     }
@@ -1422,7 +1613,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
     datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
     s->inode.vdi_size = offset;
     ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
-                       s->inode.nr_copies, datalen, 0, false, s->cache_enabled);
+                       s->inode.nr_copies, datalen, 0, false, s->cache_flags);
     close(fd);
 
     if (ret < 0) {
@@ -1476,6 +1667,43 @@ out:
     sd_finish_aiocb(acb);
 }
 
+/* Delete current working VDI on the snapshot chain */
+static bool sd_delete(BDRVSheepdogState *s)
+{
+    unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0;
+    SheepdogVdiReq hdr = {
+        .opcode = SD_OP_DEL_VDI,
+        .vdi_id = s->inode.vdi_id,
+        .data_length = wlen,
+        .flags = SD_FLAG_CMD_WRITE,
+    };
+    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
+    int fd, ret;
+
+    fd = connect_to_sdog(s);
+    if (fd < 0) {
+        return false;
+    }
+
+    ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen);
+    closesocket(fd);
+    if (ret) {
+        return false;
+    }
+    switch (rsp->result) {
+    case SD_RES_NO_VDI:
+        error_report("%s was already deleted", s->name);
+        /* fall through */
+    case SD_RES_SUCCESS:
+        break;
+    default:
+        error_report("%s, %s", sd_strerror(rsp->result), s->name);
+        return false;
+    }
+
+    return true;
+}
+
 /*
  * Create a writable VDI from a snapshot
  */
@@ -1484,28 +1712,34 @@ static int sd_create_branch(BDRVSheepdogState *s)
     int ret, fd;
     uint32_t vid;
     char *buf;
+    bool deleted;
 
-    dprintf("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);
+    DPRINTF("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);
 
     buf = g_malloc(SD_INODE_SIZE);
 
-    ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, 1,
-                       s->addr, s->port);
+    /*
+     * Even If deletion fails, we will just create extra snapshot based on
+     * the workding VDI which was supposed to be deleted. So no need to
+     * false bail out.
+     */
+    deleted = sd_delete(s);
+    ret = do_sd_create(s, s->name, s->inode.vdi_size, s->inode.vdi_id, &vid,
+                       !deleted);
     if (ret) {
         goto out;
     }
 
-    dprintf("%" PRIx32 " is created.\n", vid);
+    DPRINTF("%" PRIx32 " is created.\n", vid);
 
-    fd = connect_to_sdog(s->addr, s->port);
+    fd = connect_to_sdog(s);
     if (fd < 0) {
-        error_report("failed to connect");
         ret = fd;
         goto out;
     }
 
     ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
-                      SD_INODE_SIZE, 0, s->cache_enabled);
+                      SD_INODE_SIZE, 0, s->cache_flags);
 
     closesocket(fd);
 
@@ -1517,7 +1751,7 @@ static int sd_create_branch(BDRVSheepdogState *s)
 
     s->is_snapshot = false;
     ret = 0;
-    dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
+    DPRINTF("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
 
 out:
     g_free(buf);
@@ -1541,10 +1775,10 @@ static int coroutine_fn sd_co_rw_vector(void *p)
 {
     SheepdogAIOCB *acb = p;
     int ret = 0;
-    unsigned long len, done = 0, total = acb->nb_sectors * SECTOR_SIZE;
-    unsigned long idx = acb->sector_num * SECTOR_SIZE / SD_DATA_OBJ_SIZE;
+    unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
+    unsigned long idx = acb->sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE;
     uint64_t oid;
-    uint64_t offset = (acb->sector_num * SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
+    uint64_t offset = (acb->sector_num * BDRV_SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
     BDRVSheepdogState *s = acb->common.bs->opaque;
     SheepdogInode *inode = &s->inode;
     AIOReq *aio_req;
@@ -1593,16 +1827,25 @@ static int coroutine_fn sd_co_rw_vector(void *p)
                 flags = SD_FLAG_CMD_COW;
             }
             break;
+        case AIOCB_DISCARD_OBJ:
+            /*
+             * We discard the object only when the whole object is
+             * 1) allocated 2) trimmed. Otherwise, simply skip it.
+             */
+            if (len != SD_DATA_OBJ_SIZE || inode->data_vdi_id[idx] == 0) {
+                goto done;
+            }
+            break;
         default:
             break;
         }
 
         if (create) {
-            dprintf("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n",
+            DPRINTF("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n",
                     inode->vdi_id, oid,
                     vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
             oid = vid_to_data_oid(inode->vdi_id, idx);
-            dprintf("new oid %" PRIx64 "\n", oid);
+            DPRINTF("new oid %" PRIx64 "\n", oid);
         }
 
         aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done);
@@ -1654,14 +1897,14 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
     int ret;
 
     if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
-        ret = sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE);
+        ret = sd_truncate(bs, (sector_num + nb_sectors) * BDRV_SECTOR_SIZE);
         if (ret < 0) {
             return ret;
         }
         bs->total_sectors = sector_num + nb_sectors;
     }
 
-    acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL);
+    acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
     acb->aio_done_func = sd_write_done;
     acb->aiocb_type = AIOCB_WRITE_UDATA;
 
@@ -1682,7 +1925,7 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
     SheepdogAIOCB *acb;
     int ret;
 
-    acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL);
+    acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
     acb->aiocb_type = AIOCB_READ_UDATA;
     acb->aio_done_func = sd_finish_aiocb;
 
@@ -1700,39 +1943,31 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
 static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
 {
     BDRVSheepdogState *s = bs->opaque;
-    SheepdogObjReq hdr = { 0 };
-    SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
-    SheepdogInode *inode = &s->inode;
+    SheepdogAIOCB *acb;
+    AIOReq *aio_req;
     int ret;
-    unsigned int wlen = 0, rlen = 0;
 
-    if (!s->cache_enabled) {
+    if (s->cache_flags != SD_FLAG_CMD_CACHE) {
         return 0;
     }
 
-    hdr.opcode = SD_OP_FLUSH_VDI;
-    hdr.oid = vid_to_vdi_oid(inode->vdi_id);
+    acb = sd_aio_setup(bs, NULL, 0, 0);
+    acb->aiocb_type = AIOCB_FLUSH_CACHE;
+    acb->aio_done_func = sd_finish_aiocb;
 
-    ret = do_req(s->flush_fd, (SheepdogReq *)&hdr, NULL, &wlen, &rlen);
-    if (ret) {
-        error_report("failed to send a request to the sheep");
+    aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
+                            0, 0, 0, 0, 0);
+    QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
+    ret = add_aio_request(s, aio_req, NULL, 0, false, acb->aiocb_type);
+    if (ret < 0) {
+        error_report("add_aio_request is failed");
+        free_aio_req(s, aio_req);
+        qemu_aio_release(acb);
         return ret;
     }
 
-    if (rsp->result == SD_RES_INVALID_PARMS) {
-        dprintf("disable write cache since the server doesn't support it\n");
-
-        s->cache_enabled = false;
-        closesocket(s->flush_fd);
-        return 0;
-    }
-
-    if (rsp->result != SD_RES_SUCCESS) {
-        error_report("%s", sd_strerror(rsp->result));
-        return -EIO;
-    }
-
-    return 0;
+    qemu_coroutine_yield();
+    return acb->ret;
 }
 
 static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
@@ -1743,7 +1978,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
     SheepdogInode *inode;
     unsigned int datalen;
 
-    dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " "
+    DPRINTF("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " "
             "is_snapshot %d\n", sn_info->name, sn_info->id_str,
             s->name, sn_info->vm_state_size, s->is_snapshot);
 
@@ -1754,7 +1989,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
         return -EINVAL;
     }
 
-    dprintf("%s %s\n", sn_info->name, sn_info->id_str);
+    DPRINTF("%s %s\n", sn_info->name, sn_info->id_str);
 
     s->inode.vm_state_size = sn_info->vm_state_size;
     s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
@@ -1766,21 +2001,21 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
     datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
 
     /* refresh inode. */
-    fd = connect_to_sdog(s->addr, s->port);
+    fd = connect_to_sdog(s);
     if (fd < 0) {
         ret = fd;
         goto cleanup;
     }
 
     ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
-                       s->inode.nr_copies, datalen, 0, false, s->cache_enabled);
+                       s->inode.nr_copies, datalen, 0, false, s->cache_flags);
     if (ret < 0) {
         error_report("failed to write snapshot's inode.");
         goto cleanup;
     }
 
-    ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, 1,
-                       s->addr, s->port);
+    ret = do_sd_create(s, s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid,
+                       1);
     if (ret < 0) {
         error_report("failed to create inode for snapshot. %s",
                      strerror(errno));
@@ -1790,7 +2025,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
     inode = (SheepdogInode *)g_malloc(datalen);
 
     ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid),
-                      s->inode.nr_copies, datalen, 0, s->cache_enabled);
+                      s->inode.nr_copies, datalen, 0, s->cache_flags);
 
     if (ret < 0) {
         error_report("failed to read new inode info. %s", strerror(errno));
@@ -1798,7 +2033,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
     }
 
     memcpy(&s->inode, inode, datalen);
-    dprintf("s->inode: name %s snap_id %x oid %x\n",
+    DPRINTF("s->inode: name %s snap_id %x oid %x\n",
             s->inode.name, s->inode.snap_id, s->inode.vdi_id);
 
 cleanup:
@@ -1806,70 +2041,47 @@ cleanup:
     return ret;
 }
 
+/*
+ * We implement rollback(loadvm) operation to the specified snapshot by
+ * 1) switch to the snapshot
+ * 2) rely on sd_create_branch to delete working VDI and
+ * 3) create a new working VDI based on the speicified snapshot
+ */
 static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
 {
     BDRVSheepdogState *s = bs->opaque;
     BDRVSheepdogState *old_s;
-    char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
-    char *buf = NULL;
-    uint32_t vid;
+    char tag[SD_MAX_VDI_TAG_LEN];
     uint32_t snapid = 0;
-    int ret = 0, fd;
+    int ret = 0;
 
     old_s = g_malloc(sizeof(BDRVSheepdogState));
 
     memcpy(old_s, s, sizeof(BDRVSheepdogState));
 
-    pstrcpy(vdi, sizeof(vdi), s->name);
-
     snapid = strtoul(snapshot_id, NULL, 10);
     if (snapid) {
         tag[0] = 0;
     } else {
-        pstrcpy(tag, sizeof(tag), s->name);
+        pstrcpy(tag, sizeof(tag), snapshot_id);
     }
 
-    ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1);
+    ret = reload_inode(s, snapid, tag);
     if (ret) {
-        error_report("Failed to find_vdi_name");
         goto out;
     }
 
-    fd = connect_to_sdog(s->addr, s->port);
-    if (fd < 0) {
-        error_report("failed to connect");
-        ret = fd;
-        goto out;
-    }
-
-    buf = g_malloc(SD_INODE_SIZE);
-    ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
-                      SD_INODE_SIZE, 0, s->cache_enabled);
-
-    closesocket(fd);
-
+    ret = sd_create_branch(s);
     if (ret) {
         goto out;
     }
 
-    memcpy(&s->inode, buf, sizeof(s->inode));
-
-    if (!s->inode.vm_state_size) {
-        error_report("Invalid snapshot");
-        ret = -ENOENT;
-        goto out;
-    }
-
-    s->is_snapshot = true;
-
-    g_free(buf);
     g_free(old_s);
 
     return 0;
 out:
     /* recover bdrv_sd_state */
     memcpy(s, old_s, sizeof(BDRVSheepdogState));
-    g_free(buf);
     g_free(old_s);
 
     error_report("failed to open. recover old bdrv_sd_state.");
@@ -1899,7 +2111,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
 
     vdi_inuse = g_malloc(max);
 
-    fd = connect_to_sdog(s->addr, s->port);
+    fd = connect_to_sdog(s);
     if (fd < 0) {
         ret = fd;
         goto out;
@@ -1926,9 +2138,8 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
     hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
     start_nr = hval & (SD_NR_VDIS - 1);
 
-    fd = connect_to_sdog(s->addr, s->port);
+    fd = connect_to_sdog(s);
     if (fd < 0) {
-        error_report("failed to connect");
         ret = fd;
         goto out;
     }
@@ -1941,7 +2152,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
         /* we don't need to read entire object */
         ret = read_object(fd, (char *)&inode, vid_to_vdi_oid(vid),
                           0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0,
-                          s->cache_enabled);
+                          s->cache_flags);
 
         if (ret) {
             continue;
@@ -1982,10 +2193,11 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
     int fd, ret = 0, remaining = size;
     unsigned int data_len;
     uint64_t vmstate_oid;
-    uint32_t vdi_index;
     uint64_t offset;
+    uint32_t vdi_index;
+    uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id;
 
-    fd = connect_to_sdog(s->addr, s->port);
+    fd = connect_to_sdog(s);
     if (fd < 0) {
         return fd;
     }
@@ -1996,17 +2208,17 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
 
         data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset);
 
-        vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index);
+        vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index);
 
         create = (offset == 0);
         if (load) {
             ret = read_object(fd, (char *)data, vmstate_oid,
                               s->inode.nr_copies, data_len, offset,
-                              s->cache_enabled);
+                              s->cache_flags);
         } else {
             ret = write_object(fd, (char *)data, vmstate_oid,
                                s->inode.nr_copies, data_len, offset, create,
-                               s->cache_enabled);
+                               s->cache_flags);
         }
 
         if (ret < 0) {
@@ -2024,12 +2236,19 @@ cleanup:
     return ret;
 }
 
-static int sd_save_vmstate(BlockDriverState *bs, const uint8_t *data,
-                           int64_t pos, int size)
+static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
+                           int64_t pos)
 {
     BDRVSheepdogState *s = bs->opaque;
+    void *buf;
+    int ret;
+
+    buf = qemu_blockalign(bs, qiov->size);
+    qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
+    ret = do_load_save_vmstate(s, (uint8_t *) buf, pos, qiov->size, 0);
+    qemu_vfree(buf);
 
-    return do_load_save_vmstate(s, (uint8_t *)data, pos, size, 0);
+    return ret;
 }
 
 static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
@@ -2041,6 +2260,67 @@ static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
 }
 
 
+static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,
+                                      int nb_sectors)
+{
+    SheepdogAIOCB *acb;
+    QEMUIOVector dummy;
+    BDRVSheepdogState *s = bs->opaque;
+    int ret;
+
+    if (!s->discard_supported) {
+            return 0;
+    }
+
+    acb = sd_aio_setup(bs, &dummy, sector_num, nb_sectors);
+    acb->aiocb_type = AIOCB_DISCARD_OBJ;
+    acb->aio_done_func = sd_finish_aiocb;
+
+    ret = sd_co_rw_vector(acb);
+    if (ret <= 0) {
+        qemu_aio_release(acb);
+        return ret;
+    }
+
+    qemu_coroutine_yield();
+
+    return acb->ret;
+}
+
+static coroutine_fn int
+sd_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
+                   int *pnum)
+{
+    BDRVSheepdogState *s = bs->opaque;
+    SheepdogInode *inode = &s->inode;
+    unsigned long start = sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE,
+                  end = DIV_ROUND_UP((sector_num + nb_sectors) *
+                                     BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE);
+    unsigned long idx;
+    int ret = 1;
+
+    for (idx = start; idx < end; idx++) {
+        if (inode->data_vdi_id[idx] == 0) {
+            break;
+        }
+    }
+    if (idx == start) {
+        /* Get the longest length of unallocated sectors */
+        ret = 0;
+        for (idx = start + 1; idx < end; idx++) {
+            if (inode->data_vdi_id[idx] != 0) {
+                break;
+            }
+        }
+    }
+
+    *pnum = (idx - start) * SD_DATA_OBJ_SIZE / BDRV_SECTOR_SIZE;
+    if (*pnum > nb_sectors) {
+        *pnum = nb_sectors;
+    }
+    return ret;
+}
+
 static QEMUOptionParameter sd_create_options[] = {
     {
         .name = BLOCK_OPT_SIZE,
@@ -2060,19 +2340,78 @@ static QEMUOptionParameter sd_create_options[] = {
     { NULL }
 };
 
-BlockDriver bdrv_sheepdog = {
+static BlockDriver bdrv_sheepdog = {
     .format_name    = "sheepdog",
     .protocol_name  = "sheepdog",
     .instance_size  = sizeof(BDRVSheepdogState),
     .bdrv_file_open = sd_open,
     .bdrv_close     = sd_close,
     .bdrv_create    = sd_create,
+    .bdrv_has_zero_init = bdrv_has_zero_init_1,
+    .bdrv_getlength = sd_getlength,
+    .bdrv_truncate  = sd_truncate,
+
+    .bdrv_co_readv  = sd_co_readv,
+    .bdrv_co_writev = sd_co_writev,
+    .bdrv_co_flush_to_disk  = sd_co_flush_to_disk,
+    .bdrv_co_discard = sd_co_discard,
+    .bdrv_co_is_allocated = sd_co_is_allocated,
+
+    .bdrv_snapshot_create   = sd_snapshot_create,
+    .bdrv_snapshot_goto     = sd_snapshot_goto,
+    .bdrv_snapshot_delete   = sd_snapshot_delete,
+    .bdrv_snapshot_list     = sd_snapshot_list,
+
+    .bdrv_save_vmstate  = sd_save_vmstate,
+    .bdrv_load_vmstate  = sd_load_vmstate,
+
+    .create_options = sd_create_options,
+};
+
+static BlockDriver bdrv_sheepdog_tcp = {
+    .format_name    = "sheepdog",
+    .protocol_name  = "sheepdog+tcp",
+    .instance_size  = sizeof(BDRVSheepdogState),
+    .bdrv_file_open = sd_open,
+    .bdrv_close     = sd_close,
+    .bdrv_create    = sd_create,
+    .bdrv_has_zero_init = bdrv_has_zero_init_1,
+    .bdrv_getlength = sd_getlength,
+    .bdrv_truncate  = sd_truncate,
+
+    .bdrv_co_readv  = sd_co_readv,
+    .bdrv_co_writev = sd_co_writev,
+    .bdrv_co_flush_to_disk  = sd_co_flush_to_disk,
+    .bdrv_co_discard = sd_co_discard,
+    .bdrv_co_is_allocated = sd_co_is_allocated,
+
+    .bdrv_snapshot_create   = sd_snapshot_create,
+    .bdrv_snapshot_goto     = sd_snapshot_goto,
+    .bdrv_snapshot_delete   = sd_snapshot_delete,
+    .bdrv_snapshot_list     = sd_snapshot_list,
+
+    .bdrv_save_vmstate  = sd_save_vmstate,
+    .bdrv_load_vmstate  = sd_load_vmstate,
+
+    .create_options = sd_create_options,
+};
+
+static BlockDriver bdrv_sheepdog_unix = {
+    .format_name    = "sheepdog",
+    .protocol_name  = "sheepdog+unix",
+    .instance_size  = sizeof(BDRVSheepdogState),
+    .bdrv_file_open = sd_open,
+    .bdrv_close     = sd_close,
+    .bdrv_create    = sd_create,
+    .bdrv_has_zero_init = bdrv_has_zero_init_1,
     .bdrv_getlength = sd_getlength,
     .bdrv_truncate  = sd_truncate,
 
     .bdrv_co_readv  = sd_co_readv,
     .bdrv_co_writev = sd_co_writev,
     .bdrv_co_flush_to_disk  = sd_co_flush_to_disk,
+    .bdrv_co_discard = sd_co_discard,
+    .bdrv_co_is_allocated = sd_co_is_allocated,
 
     .bdrv_snapshot_create   = sd_snapshot_create,
     .bdrv_snapshot_goto     = sd_snapshot_goto,
@@ -2088,5 +2427,7 @@ BlockDriver bdrv_sheepdog = {
 static void bdrv_sheepdog_init(void)
 {
     bdrv_register(&bdrv_sheepdog);
+    bdrv_register(&bdrv_sheepdog_tcp);
+    bdrv_register(&bdrv_sheepdog_unix);
 }
 block_init(bdrv_sheepdog_init);
diff --git a/block/snapshot.c b/block/snapshot.c
new file mode 100644
index 000000000..6c6d9deea
--- /dev/null
+++ b/block/snapshot.c
@@ -0,0 +1,157 @@
+/*
+ * Block layer snapshot related functions
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "block/snapshot.h"
+#include "block/block_int.h"
+
+int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info,
+                       const char *name)
+{
+    QEMUSnapshotInfo *sn_tab, *sn;
+    int nb_sns, i, ret;
+
+    ret = -ENOENT;
+    nb_sns = bdrv_snapshot_list(bs, &sn_tab);
+    if (nb_sns < 0) {
+        return ret;
+    }
+    for (i = 0; i < nb_sns; i++) {
+        sn = &sn_tab[i];
+        if (!strcmp(sn->id_str, name) || !strcmp(sn->name, name)) {
+            *sn_info = *sn;
+            ret = 0;
+            break;
+        }
+    }
+    g_free(sn_tab);
+    return ret;
+}
+
+int bdrv_can_snapshot(BlockDriverState *bs)
+{
+    BlockDriver *drv = bs->drv;
+    if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
+        return 0;
+    }
+
+    if (!drv->bdrv_snapshot_create) {
+        if (bs->file != NULL) {
+            return bdrv_can_snapshot(bs->file);
+        }
+        return 0;
+    }
+
+    return 1;
+}
+
+int bdrv_snapshot_create(BlockDriverState *bs,
+                         QEMUSnapshotInfo *sn_info)
+{
+    BlockDriver *drv = bs->drv;
+    if (!drv) {
+        return -ENOMEDIUM;
+    }
+    if (drv->bdrv_snapshot_create) {
+        return drv->bdrv_snapshot_create(bs, sn_info);
+    }
+    if (bs->file) {
+        return bdrv_snapshot_create(bs->file, sn_info);
+    }
+    return -ENOTSUP;
+}
+
+int bdrv_snapshot_goto(BlockDriverState *bs,
+                       const char *snapshot_id)
+{
+    BlockDriver *drv = bs->drv;
+    int ret, open_ret;
+
+    if (!drv) {
+        return -ENOMEDIUM;
+    }
+    if (drv->bdrv_snapshot_goto) {
+        return drv->bdrv_snapshot_goto(bs, snapshot_id);
+    }
+
+    if (bs->file) {
+        drv->bdrv_close(bs);
+        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
+        open_ret = drv->bdrv_open(bs, NULL, bs->open_flags);
+        if (open_ret < 0) {
+            bdrv_delete(bs->file);
+            bs->drv = NULL;
+            return open_ret;
+        }
+        return ret;
+    }
+
+    return -ENOTSUP;
+}
+
+int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
+{
+    BlockDriver *drv = bs->drv;
+    if (!drv) {
+        return -ENOMEDIUM;
+    }
+    if (drv->bdrv_snapshot_delete) {
+        return drv->bdrv_snapshot_delete(bs, snapshot_id);
+    }
+    if (bs->file) {
+        return bdrv_snapshot_delete(bs->file, snapshot_id);
+    }
+    return -ENOTSUP;
+}
+
+int bdrv_snapshot_list(BlockDriverState *bs,
+                       QEMUSnapshotInfo **psn_info)
+{
+    BlockDriver *drv = bs->drv;
+    if (!drv) {
+        return -ENOMEDIUM;
+    }
+    if (drv->bdrv_snapshot_list) {
+        return drv->bdrv_snapshot_list(bs, psn_info);
+    }
+    if (bs->file) {
+        return bdrv_snapshot_list(bs->file, psn_info);
+    }
+    return -ENOTSUP;
+}
+
+int bdrv_snapshot_load_tmp(BlockDriverState *bs,
+        const char *snapshot_name)
+{
+    BlockDriver *drv = bs->drv;
+    if (!drv) {
+        return -ENOMEDIUM;
+    }
+    if (!bs->read_only) {
+        return -EINVAL;
+    }
+    if (drv->bdrv_snapshot_load_tmp) {
+        return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
+    }
+    return -ENOTSUP;
+}
diff --git a/block/ssh.c b/block/ssh.c
new file mode 100644
index 000000000..d7e7bf8dd
--- /dev/null
+++ b/block/ssh.c
@@ -0,0 +1,1076 @@
+/*
+ * Secure Shell (ssh) backend for QEMU.
+ *
+ * Copyright (C) 2013 Red Hat Inc., Richard W.M. Jones <rjones@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+#include <libssh2.h>
+#include <libssh2_sftp.h>
+
+#include "block/block_int.h"
+#include "qemu/sockets.h"
+#include "qemu/uri.h"
+#include "qapi/qmp/qint.h"
+
+/* DEBUG_SSH=1 enables the DPRINTF (debugging printf) statements in
+ * this block driver code.
+ *
+ * TRACE_LIBSSH2=<bitmask> enables tracing in libssh2 itself.  Note
+ * that this requires that libssh2 was specially compiled with the
+ * `./configure --enable-debug' option, so most likely you will have
+ * to compile it yourself.  The meaning of <bitmask> is described
+ * here: http://www.libssh2.org/libssh2_trace.html
+ */
+#define DEBUG_SSH     0
+#define TRACE_LIBSSH2 0 /* or try: LIBSSH2_TRACE_SFTP */
+
+#define DPRINTF(fmt, ...)                           \
+    do {                                            \
+        if (DEBUG_SSH) {                            \
+            fprintf(stderr, "ssh: %-15s " fmt "\n", \
+                    __func__, ##__VA_ARGS__);       \
+        }                                           \
+    } while (0)
+
+typedef struct BDRVSSHState {
+    /* Coroutine. */
+    CoMutex lock;
+
+    /* SSH connection. */
+    int sock;                         /* socket */
+    LIBSSH2_SESSION *session;         /* ssh session */
+    LIBSSH2_SFTP *sftp;               /* sftp session */
+    LIBSSH2_SFTP_HANDLE *sftp_handle; /* sftp remote file handle */
+
+    /* See ssh_seek() function below. */
+    int64_t offset;
+    bool offset_op_read;
+
+    /* File attributes at open.  We try to keep the .filesize field
+     * updated if it changes (eg by writing at the end of the file).
+     */
+    LIBSSH2_SFTP_ATTRIBUTES attrs;
+
+    /* Used to warn if 'flush' is not supported. */
+    char *hostport;
+    bool unsafe_flush_warning;
+} BDRVSSHState;
+
+static void ssh_state_init(BDRVSSHState *s)
+{
+    memset(s, 0, sizeof *s);
+    s->sock = -1;
+    s->offset = -1;
+    qemu_co_mutex_init(&s->lock);
+}
+
+static void ssh_state_free(BDRVSSHState *s)
+{
+    g_free(s->hostport);
+    if (s->sftp_handle) {
+        libssh2_sftp_close(s->sftp_handle);
+    }
+    if (s->sftp) {
+        libssh2_sftp_shutdown(s->sftp);
+    }
+    if (s->session) {
+        libssh2_session_disconnect(s->session,
+                                   "from qemu ssh client: "
+                                   "user closed the connection");
+        libssh2_session_free(s->session);
+    }
+    if (s->sock >= 0) {
+        close(s->sock);
+    }
+}
+
+/* Wrappers around error_report which make sure to dump as much
+ * information from libssh2 as possible.
+ */
+static void GCC_FMT_ATTR(2, 3)
+session_error_report(BDRVSSHState *s, const char *fs, ...)
+{
+    va_list args;
+
+    va_start(args, fs);
+    error_vprintf(fs, args);
+
+    if ((s)->session) {
+        char *ssh_err;
+        int ssh_err_code;
+
+        libssh2_session_last_error((s)->session, &ssh_err, NULL, 0);
+        /* This is not an errno.  See <libssh2.h>. */
+        ssh_err_code = libssh2_session_last_errno((s)->session);
+
+        error_printf(": %s (libssh2 error code: %d)", ssh_err, ssh_err_code);
+    }
+
+    va_end(args);
+    error_printf("\n");
+}
+
+static void GCC_FMT_ATTR(2, 3)
+sftp_error_report(BDRVSSHState *s, const char *fs, ...)
+{
+    va_list args;
+
+    va_start(args, fs);
+    error_vprintf(fs, args);
+
+    if ((s)->sftp) {
+        char *ssh_err;
+        int ssh_err_code;
+        unsigned long sftp_err_code;
+
+        libssh2_session_last_error((s)->session, &ssh_err, NULL, 0);
+        /* This is not an errno.  See <libssh2.h>. */
+        ssh_err_code = libssh2_session_last_errno((s)->session);
+        /* See <libssh2_sftp.h>. */
+        sftp_err_code = libssh2_sftp_last_error((s)->sftp);
+
+        error_printf(": %s (libssh2 error code: %d, sftp error code: %lu)",
+                     ssh_err, ssh_err_code, sftp_err_code);
+    }
+
+    va_end(args);
+    error_printf("\n");
+}
+
+static int parse_uri(const char *filename, QDict *options, Error **errp)
+{
+    URI *uri = NULL;
+    QueryParams *qp = NULL;
+    int i;
+
+    uri = uri_parse(filename);
+    if (!uri) {
+        return -EINVAL;
+    }
+
+    if (strcmp(uri->scheme, "ssh") != 0) {
+        error_setg(errp, "URI scheme must be 'ssh'");
+        goto err;
+    }
+
+    if (!uri->server || strcmp(uri->server, "") == 0) {
+        error_setg(errp, "missing hostname in URI");
+        goto err;
+    }
+
+    if (!uri->path || strcmp(uri->path, "") == 0) {
+        error_setg(errp, "missing remote path in URI");
+        goto err;
+    }
+
+    qp = query_params_parse(uri->query);
+    if (!qp) {
+        error_setg(errp, "could not parse query parameters");
+        goto err;
+    }
+
+    if(uri->user && strcmp(uri->user, "") != 0) {
+        qdict_put(options, "user", qstring_from_str(uri->user));
+    }
+
+    qdict_put(options, "host", qstring_from_str(uri->server));
+
+    if (uri->port) {
+        qdict_put(options, "port", qint_from_int(uri->port));
+    }
+
+    qdict_put(options, "path", qstring_from_str(uri->path));
+
+    /* Pick out any query parameters that we understand, and ignore
+     * the rest.
+     */
+    for (i = 0; i < qp->n; ++i) {
+        if (strcmp(qp->p[i].name, "host_key_check") == 0) {
+            qdict_put(options, "host_key_check",
+                      qstring_from_str(qp->p[i].value));
+        }
+    }
+
+    query_params_free(qp);
+    uri_free(uri);
+    return 0;
+
+ err:
+    if (qp) {
+      query_params_free(qp);
+    }
+    if (uri) {
+      uri_free(uri);
+    }
+    return -EINVAL;
+}
+
+static void ssh_parse_filename(const char *filename, QDict *options,
+                               Error **errp)
+{
+    if (qdict_haskey(options, "user") ||
+        qdict_haskey(options, "host") ||
+        qdict_haskey(options, "port") ||
+        qdict_haskey(options, "path") ||
+        qdict_haskey(options, "host_key_check")) {
+        error_setg(errp, "user, host, port, path, host_key_check cannot be used at the same time as a file option");
+        return;
+    }
+
+    parse_uri(filename, options, errp);
+}
+
+static int check_host_key_knownhosts(BDRVSSHState *s,
+                                     const char *host, int port)
+{
+    const char *home;
+    char *knh_file = NULL;
+    LIBSSH2_KNOWNHOSTS *knh = NULL;
+    struct libssh2_knownhost *found;
+    int ret, r;
+    const char *hostkey;
+    size_t len;
+    int type;
+
+    hostkey = libssh2_session_hostkey(s->session, &len, &type);
+    if (!hostkey) {
+        ret = -EINVAL;
+        session_error_report(s, "failed to read remote host key");
+        goto out;
+    }
+
+    knh = libssh2_knownhost_init(s->session);
+    if (!knh) {
+        ret = -EINVAL;
+        session_error_report(s, "failed to initialize known hosts support");
+        goto out;
+    }
+
+    home = getenv("HOME");
+    if (home) {
+        knh_file = g_strdup_printf("%s/.ssh/known_hosts", home);
+    } else {
+        knh_file = g_strdup_printf("/root/.ssh/known_hosts");
+    }
+
+    /* Read all known hosts from OpenSSH-style known_hosts file. */
+    libssh2_knownhost_readfile(knh, knh_file, LIBSSH2_KNOWNHOST_FILE_OPENSSH);
+
+    r = libssh2_knownhost_checkp(knh, host, port, hostkey, len,
+                                 LIBSSH2_KNOWNHOST_TYPE_PLAIN|
+                                 LIBSSH2_KNOWNHOST_KEYENC_RAW,
+                                 &found);
+    switch (r) {
+    case LIBSSH2_KNOWNHOST_CHECK_MATCH:
+        /* OK */
+        DPRINTF("host key OK: %s", found->key);
+        break;
+    case LIBSSH2_KNOWNHOST_CHECK_MISMATCH:
+        ret = -EINVAL;
+        session_error_report(s, "host key does not match the one in known_hosts (found key %s)",
+                             found->key);
+        goto out;
+    case LIBSSH2_KNOWNHOST_CHECK_NOTFOUND:
+        ret = -EINVAL;
+        session_error_report(s, "no host key was found in known_hosts");
+        goto out;
+    case LIBSSH2_KNOWNHOST_CHECK_FAILURE:
+        ret = -EINVAL;
+        session_error_report(s, "failure matching the host key with known_hosts");
+        goto out;
+    default:
+        ret = -EINVAL;
+        session_error_report(s, "unknown error matching the host key with known_hosts (%d)",
+                             r);
+        goto out;
+    }
+
+    /* known_hosts checking successful. */
+    ret = 0;
+
+ out:
+    if (knh != NULL) {
+        libssh2_knownhost_free(knh);
+    }
+    g_free(knh_file);
+    return ret;
+}
+
+static unsigned hex2decimal(char ch)
+{
+    if (ch >= '0' && ch <= '9') {
+        return (ch - '0');
+    } else if (ch >= 'a' && ch <= 'f') {
+        return 10 + (ch - 'a');
+    } else if (ch >= 'A' && ch <= 'F') {
+        return 10 + (ch - 'A');
+    }
+
+    return -1;
+}
+
+/* Compare the binary fingerprint (hash of host key) with the
+ * host_key_check parameter.
+ */
+static int compare_fingerprint(const unsigned char *fingerprint, size_t len,
+                               const char *host_key_check)
+{
+    unsigned c;
+
+    while (len > 0) {
+        while (*host_key_check == ':')
+            host_key_check++;
+        if (!qemu_isxdigit(host_key_check[0]) ||
+            !qemu_isxdigit(host_key_check[1]))
+            return 1;
+        c = hex2decimal(host_key_check[0]) * 16 +
+            hex2decimal(host_key_check[1]);
+        if (c - *fingerprint != 0)
+            return c - *fingerprint;
+        fingerprint++;
+        len--;
+        host_key_check += 2;
+    }
+    return *host_key_check - '\0';
+}
+
+static int
+check_host_key_hash(BDRVSSHState *s, const char *hash,
+                    int hash_type, size_t fingerprint_len)
+{
+    const char *fingerprint;
+
+    fingerprint = libssh2_hostkey_hash(s->session, hash_type);
+    if (!fingerprint) {
+        session_error_report(s, "failed to read remote host key");
+        return -EINVAL;
+    }
+
+    if(compare_fingerprint((unsigned char *) fingerprint, fingerprint_len,
+                           hash) != 0) {
+        error_report("remote host key does not match host_key_check '%s'",
+                     hash);
+        return -EPERM;
+    }
+
+    return 0;
+}
+
+static int check_host_key(BDRVSSHState *s, const char *host, int port,
+                          const char *host_key_check)
+{
+    /* host_key_check=no */
+    if (strcmp(host_key_check, "no") == 0) {
+        return 0;
+    }
+
+    /* host_key_check=md5:xx:yy:zz:... */
+    if (strncmp(host_key_check, "md5:", 4) == 0) {
+        return check_host_key_hash(s, &host_key_check[4],
+                                   LIBSSH2_HOSTKEY_HASH_MD5, 16);
+    }
+
+    /* host_key_check=sha1:xx:yy:zz:... */
+    if (strncmp(host_key_check, "sha1:", 5) == 0) {
+        return check_host_key_hash(s, &host_key_check[5],
+                                   LIBSSH2_HOSTKEY_HASH_SHA1, 20);
+    }
+
+    /* host_key_check=yes */
+    if (strcmp(host_key_check, "yes") == 0) {
+        return check_host_key_knownhosts(s, host, port);
+    }
+
+    error_report("unknown host_key_check setting (%s)", host_key_check);
+    return -EINVAL;
+}
+
+static int authenticate(BDRVSSHState *s, const char *user)
+{
+    int r, ret;
+    const char *userauthlist;
+    LIBSSH2_AGENT *agent = NULL;
+    struct libssh2_agent_publickey *identity;
+    struct libssh2_agent_publickey *prev_identity = NULL;
+
+    userauthlist = libssh2_userauth_list(s->session, user, strlen(user));
+    if (strstr(userauthlist, "publickey") == NULL) {
+        ret = -EPERM;
+        error_report("remote server does not support \"publickey\" authentication");
+        goto out;
+    }
+
+    /* Connect to ssh-agent and try each identity in turn. */
+    agent = libssh2_agent_init(s->session);
+    if (!agent) {
+        ret = -EINVAL;
+        session_error_report(s, "failed to initialize ssh-agent support");
+        goto out;
+    }
+    if (libssh2_agent_connect(agent)) {
+        ret = -ECONNREFUSED;
+        session_error_report(s, "failed to connect to ssh-agent");
+        goto out;
+    }
+    if (libssh2_agent_list_identities(agent)) {
+        ret = -EINVAL;
+        session_error_report(s, "failed requesting identities from ssh-agent");
+        goto out;
+    }
+
+    for(;;) {
+        r = libssh2_agent_get_identity(agent, &identity, prev_identity);
+        if (r == 1) {           /* end of list */
+            break;
+        }
+        if (r < 0) {
+            ret = -EINVAL;
+            session_error_report(s, "failed to obtain identity from ssh-agent");
+            goto out;
+        }
+        r = libssh2_agent_userauth(agent, user, identity);
+        if (r == 0) {
+            /* Authenticated! */
+            ret = 0;
+            goto out;
+        }
+        /* Failed to authenticate with this identity, try the next one. */
+        prev_identity = identity;
+    }
+
+    ret = -EPERM;
+    error_report("failed to authenticate using publickey authentication "
+                 "and the identities held by your ssh-agent");
+
+ out:
+    if (agent != NULL) {
+        /* Note: libssh2 implementation implicitly calls
+         * libssh2_agent_disconnect if necessary.
+         */
+        libssh2_agent_free(agent);
+    }
+
+    return ret;
+}
+
+static int connect_to_ssh(BDRVSSHState *s, QDict *options,
+                          int ssh_flags, int creat_mode)
+{
+    int r, ret;
+    Error *err = NULL;
+    const char *host, *user, *path, *host_key_check;
+    int port;
+
+    host = qdict_get_str(options, "host");
+
+    if (qdict_haskey(options, "port")) {
+        port = qdict_get_int(options, "port");
+    } else {
+        port = 22;
+    }
+
+    path = qdict_get_str(options, "path");
+
+    if (qdict_haskey(options, "user")) {
+        user = qdict_get_str(options, "user");
+    } else {
+        user = g_get_user_name();
+        if (!user) {
+            ret = -errno;
+            goto err;
+        }
+    }
+
+    if (qdict_haskey(options, "host_key_check")) {
+        host_key_check = qdict_get_str(options, "host_key_check");
+    } else {
+        host_key_check = "yes";
+    }
+
+    /* Construct the host:port name for inet_connect. */
+    g_free(s->hostport);
+    s->hostport = g_strdup_printf("%s:%d", host, port);
+
+    /* Open the socket and connect. */
+    s->sock = inet_connect(s->hostport, &err);
+    if (err != NULL) {
+        ret = -errno;
+        qerror_report_err(err);
+        error_free(err);
+        goto err;
+    }
+
+    /* Create SSH session. */
+    s->session = libssh2_session_init();
+    if (!s->session) {
+        ret = -EINVAL;
+        session_error_report(s, "failed to initialize libssh2 session");
+        goto err;
+    }
+
+#if TRACE_LIBSSH2 != 0
+    libssh2_trace(s->session, TRACE_LIBSSH2);
+#endif
+
+    r = libssh2_session_handshake(s->session, s->sock);
+    if (r != 0) {
+        ret = -EINVAL;
+        session_error_report(s, "failed to establish SSH session");
+        goto err;
+    }
+
+    /* Check the remote host's key against known_hosts. */
+    ret = check_host_key(s, host, port, host_key_check);
+    if (ret < 0) {
+        goto err;
+    }
+
+    /* Authenticate. */
+    ret = authenticate(s, user);
+    if (ret < 0) {
+        goto err;
+    }
+
+    /* Start SFTP. */
+    s->sftp = libssh2_sftp_init(s->session);
+    if (!s->sftp) {
+        session_error_report(s, "failed to initialize sftp handle");
+        ret = -EINVAL;
+        goto err;
+    }
+
+    /* Open the remote file. */
+    DPRINTF("opening file %s flags=0x%x creat_mode=0%o",
+            path, ssh_flags, creat_mode);
+    s->sftp_handle = libssh2_sftp_open(s->sftp, path, ssh_flags, creat_mode);
+    if (!s->sftp_handle) {
+        session_error_report(s, "failed to open remote file '%s'", path);
+        ret = -EINVAL;
+        goto err;
+    }
+
+    r = libssh2_sftp_fstat(s->sftp_handle, &s->attrs);
+    if (r < 0) {
+        sftp_error_report(s, "failed to read file attributes");
+        return -EINVAL;
+    }
+
+    /* Delete the options we've used; any not deleted will cause the
+     * block layer to give an error about unused options.
+     */
+    qdict_del(options, "host");
+    qdict_del(options, "port");
+    qdict_del(options, "user");
+    qdict_del(options, "path");
+    qdict_del(options, "host_key_check");
+
+    return 0;
+
+ err:
+    if (s->sftp_handle) {
+        libssh2_sftp_close(s->sftp_handle);
+    }
+    s->sftp_handle = NULL;
+    if (s->sftp) {
+        libssh2_sftp_shutdown(s->sftp);
+    }
+    s->sftp = NULL;
+    if (s->session) {
+        libssh2_session_disconnect(s->session,
+                                   "from qemu ssh client: "
+                                   "error opening connection");
+        libssh2_session_free(s->session);
+    }
+    s->session = NULL;
+
+    return ret;
+}
+
+static int ssh_file_open(BlockDriverState *bs, QDict *options, int bdrv_flags)
+{
+    BDRVSSHState *s = bs->opaque;
+    int ret;
+    int ssh_flags;
+
+    ssh_state_init(s);
+
+    ssh_flags = LIBSSH2_FXF_READ;
+    if (bdrv_flags & BDRV_O_RDWR) {
+        ssh_flags |= LIBSSH2_FXF_WRITE;
+    }
+
+    /* Start up SSH. */
+    ret = connect_to_ssh(s, options, ssh_flags, 0);
+    if (ret < 0) {
+        goto err;
+    }
+
+    /* Go non-blocking. */
+    libssh2_session_set_blocking(s->session, 0);
+
+    return 0;
+
+ err:
+    if (s->sock >= 0) {
+        close(s->sock);
+    }
+    s->sock = -1;
+
+    return ret;
+}
+
+static QEMUOptionParameter ssh_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    { NULL }
+};
+
+static int ssh_create(const char *filename, QEMUOptionParameter *options)
+{
+    int r, ret;
+    Error *local_err = NULL;
+    int64_t total_size = 0;
+    QDict *uri_options = NULL;
+    BDRVSSHState s;
+    ssize_t r2;
+    char c[1] = { '\0' };
+
+    ssh_state_init(&s);
+
+    /* Get desired file size. */
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            total_size = options->value.n;
+        }
+        options++;
+    }
+    DPRINTF("total_size=%" PRIi64, total_size);
+
+    uri_options = qdict_new();
+    r = parse_uri(filename, uri_options, &local_err);
+    if (r < 0) {
+        qerror_report_err(local_err);
+        error_free(local_err);
+        ret = r;
+        goto out;
+    }
+
+    r = connect_to_ssh(&s, uri_options,
+                       LIBSSH2_FXF_READ|LIBSSH2_FXF_WRITE|
+                       LIBSSH2_FXF_CREAT|LIBSSH2_FXF_TRUNC, 0644);
+    if (r < 0) {
+        ret = r;
+        goto out;
+    }
+
+    if (total_size > 0) {
+        libssh2_sftp_seek64(s.sftp_handle, total_size-1);
+        r2 = libssh2_sftp_write(s.sftp_handle, c, 1);
+        if (r2 < 0) {
+            sftp_error_report(&s, "truncate failed");
+            ret = -EINVAL;
+            goto out;
+        }
+        s.attrs.filesize = total_size;
+    }
+
+    ret = 0;
+
+ out:
+    ssh_state_free(&s);
+    if (uri_options != NULL) {
+        QDECREF(uri_options);
+    }
+    return ret;
+}
+
+static void ssh_close(BlockDriverState *bs)
+{
+    BDRVSSHState *s = bs->opaque;
+
+    ssh_state_free(s);
+}
+
+static int ssh_has_zero_init(BlockDriverState *bs)
+{
+    BDRVSSHState *s = bs->opaque;
+    /* Assume false, unless we can positively prove it's true. */
+    int has_zero_init = 0;
+
+    if (s->attrs.flags & LIBSSH2_SFTP_ATTR_PERMISSIONS) {
+        if (s->attrs.permissions & LIBSSH2_SFTP_S_IFREG) {
+            has_zero_init = 1;
+        }
+    }
+
+    return has_zero_init;
+}
+
+static void restart_coroutine(void *opaque)
+{
+    Coroutine *co = opaque;
+
+    DPRINTF("co=%p", co);
+
+    qemu_coroutine_enter(co, NULL);
+}
+
+/* Always true because when we have called set_fd_handler there is
+ * always a request being processed.
+ */
+static int return_true(void *opaque)
+{
+    return 1;
+}
+
+static coroutine_fn void set_fd_handler(BDRVSSHState *s)
+{
+    int r;
+    IOHandler *rd_handler = NULL, *wr_handler = NULL;
+    Coroutine *co = qemu_coroutine_self();
+
+    r = libssh2_session_block_directions(s->session);
+
+    if (r & LIBSSH2_SESSION_BLOCK_INBOUND) {
+        rd_handler = restart_coroutine;
+    }
+    if (r & LIBSSH2_SESSION_BLOCK_OUTBOUND) {
+        wr_handler = restart_coroutine;
+    }
+
+    DPRINTF("s->sock=%d rd_handler=%p wr_handler=%p", s->sock,
+            rd_handler, wr_handler);
+
+    qemu_aio_set_fd_handler(s->sock, rd_handler, wr_handler, return_true, co);
+}
+
+static coroutine_fn void clear_fd_handler(BDRVSSHState *s)
+{
+    DPRINTF("s->sock=%d", s->sock);
+    qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL, NULL);
+}
+
+/* A non-blocking call returned EAGAIN, so yield, ensuring the
+ * handlers are set up so that we'll be rescheduled when there is an
+ * interesting event on the socket.
+ */
+static coroutine_fn void co_yield(BDRVSSHState *s)
+{
+    set_fd_handler(s);
+    qemu_coroutine_yield();
+    clear_fd_handler(s);
+}
+
+/* SFTP has a function `libssh2_sftp_seek64' which seeks to a position
+ * in the remote file.  Notice that it just updates a field in the
+ * sftp_handle structure, so there is no network traffic and it cannot
+ * fail.
+ *
+ * However, `libssh2_sftp_seek64' does have a catastrophic effect on
+ * performance since it causes the handle to throw away all in-flight
+ * reads and buffered readahead data.  Therefore this function tries
+ * to be intelligent about when to call the underlying libssh2 function.
+ */
+#define SSH_SEEK_WRITE 0
+#define SSH_SEEK_READ  1
+#define SSH_SEEK_FORCE 2
+
+static void ssh_seek(BDRVSSHState *s, int64_t offset, int flags)
+{
+    bool op_read = (flags & SSH_SEEK_READ) != 0;
+    bool force = (flags & SSH_SEEK_FORCE) != 0;
+
+    if (force || op_read != s->offset_op_read || offset != s->offset) {
+        DPRINTF("seeking to offset=%" PRIi64, offset);
+        libssh2_sftp_seek64(s->sftp_handle, offset);
+        s->offset = offset;
+        s->offset_op_read = op_read;
+    }
+}
+
+static coroutine_fn int ssh_read(BDRVSSHState *s,
+                                 int64_t offset, size_t size,
+                                 QEMUIOVector *qiov)
+{
+    ssize_t r;
+    size_t got;
+    char *buf, *end_of_vec;
+    struct iovec *i;
+
+    DPRINTF("offset=%" PRIi64 " size=%zu", offset, size);
+
+    ssh_seek(s, offset, SSH_SEEK_READ);
+
+    /* This keeps track of the current iovec element ('i'), where we
+     * will write to next ('buf'), and the end of the current iovec
+     * ('end_of_vec').
+     */
+    i = &qiov->iov[0];
+    buf = i->iov_base;
+    end_of_vec = i->iov_base + i->iov_len;
+
+    /* libssh2 has a hard-coded limit of 2000 bytes per request,
+     * although it will also do readahead behind our backs.  Therefore
+     * we may have to do repeated reads here until we have read 'size'
+     * bytes.
+     */
+    for (got = 0; got < size; ) {
+    again:
+        DPRINTF("sftp_read buf=%p size=%zu", buf, end_of_vec - buf);
+        r = libssh2_sftp_read(s->sftp_handle, buf, end_of_vec - buf);
+        DPRINTF("sftp_read returned %zd", r);
+
+        if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) {
+            co_yield(s);
+            goto again;
+        }
+        if (r < 0) {
+            sftp_error_report(s, "read failed");
+            s->offset = -1;
+            return -EIO;
+        }
+        if (r == 0) {
+            /* EOF: Short read so pad the buffer with zeroes and return it. */
+            qemu_iovec_memset(qiov, got, 0, size - got);
+            return 0;
+        }
+
+        got += r;
+        buf += r;
+        s->offset += r;
+        if (buf >= end_of_vec && got < size) {
+            i++;
+            buf = i->iov_base;
+            end_of_vec = i->iov_base + i->iov_len;
+        }
+    }
+
+    return 0;
+}
+
+static coroutine_fn int ssh_co_readv(BlockDriverState *bs,
+                                     int64_t sector_num,
+                                     int nb_sectors, QEMUIOVector *qiov)
+{
+    BDRVSSHState *s = bs->opaque;
+    int ret;
+
+    qemu_co_mutex_lock(&s->lock);
+    ret = ssh_read(s, sector_num * BDRV_SECTOR_SIZE,
+                   nb_sectors * BDRV_SECTOR_SIZE, qiov);
+    qemu_co_mutex_unlock(&s->lock);
+
+    return ret;
+}
+
+static int ssh_write(BDRVSSHState *s,
+                     int64_t offset, size_t size,
+                     QEMUIOVector *qiov)
+{
+    ssize_t r;
+    size_t written;
+    char *buf, *end_of_vec;
+    struct iovec *i;
+
+    DPRINTF("offset=%" PRIi64 " size=%zu", offset, size);
+
+    ssh_seek(s, offset, SSH_SEEK_WRITE);
+
+    /* This keeps track of the current iovec element ('i'), where we
+     * will read from next ('buf'), and the end of the current iovec
+     * ('end_of_vec').
+     */
+    i = &qiov->iov[0];
+    buf = i->iov_base;
+    end_of_vec = i->iov_base + i->iov_len;
+
+    for (written = 0; written < size; ) {
+    again:
+        DPRINTF("sftp_write buf=%p size=%zu", buf, end_of_vec - buf);
+        r = libssh2_sftp_write(s->sftp_handle, buf, end_of_vec - buf);
+        DPRINTF("sftp_write returned %zd", r);
+
+        if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) {
+            co_yield(s);
+            goto again;
+        }
+        if (r < 0) {
+            sftp_error_report(s, "write failed");
+            s->offset = -1;
+            return -EIO;
+        }
+        /* The libssh2 API is very unclear about this.  A comment in
+         * the code says "nothing was acked, and no EAGAIN was
+         * received!" which apparently means that no data got sent
+         * out, and the underlying channel didn't return any EAGAIN
+         * indication.  I think this is a bug in either libssh2 or
+         * OpenSSH (server-side).  In any case, forcing a seek (to
+         * discard libssh2 internal buffers), and then trying again
+         * works for me.
+         */
+        if (r == 0) {
+            ssh_seek(s, offset + written, SSH_SEEK_WRITE|SSH_SEEK_FORCE);
+            co_yield(s);
+            goto again;
+        }
+
+        written += r;
+        buf += r;
+        s->offset += r;
+        if (buf >= end_of_vec && written < size) {
+            i++;
+            buf = i->iov_base;
+            end_of_vec = i->iov_base + i->iov_len;
+        }
+
+        if (offset + written > s->attrs.filesize)
+            s->attrs.filesize = offset + written;
+    }
+
+    return 0;
+}
+
+static coroutine_fn int ssh_co_writev(BlockDriverState *bs,
+                                      int64_t sector_num,
+                                      int nb_sectors, QEMUIOVector *qiov)
+{
+    BDRVSSHState *s = bs->opaque;
+    int ret;
+
+    qemu_co_mutex_lock(&s->lock);
+    ret = ssh_write(s, sector_num * BDRV_SECTOR_SIZE,
+                    nb_sectors * BDRV_SECTOR_SIZE, qiov);
+    qemu_co_mutex_unlock(&s->lock);
+
+    return ret;
+}
+
+static void unsafe_flush_warning(BDRVSSHState *s, const char *what)
+{
+    if (!s->unsafe_flush_warning) {
+        error_report("warning: ssh server %s does not support fsync",
+                     s->hostport);
+        if (what) {
+            error_report("to support fsync, you need %s", what);
+        }
+        s->unsafe_flush_warning = true;
+    }
+}
+
+#ifdef HAS_LIBSSH2_SFTP_FSYNC
+
+static coroutine_fn int ssh_flush(BDRVSSHState *s)
+{
+    int r;
+
+    DPRINTF("fsync");
+ again:
+    r = libssh2_sftp_fsync(s->sftp_handle);
+    if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) {
+        co_yield(s);
+        goto again;
+    }
+    if (r == LIBSSH2_ERROR_SFTP_PROTOCOL &&
+        libssh2_sftp_last_error(s->sftp) == LIBSSH2_FX_OP_UNSUPPORTED) {
+        unsafe_flush_warning(s, "OpenSSH >= 6.3");
+        return 0;
+    }
+    if (r < 0) {
+        sftp_error_report(s, "fsync failed");
+        return -EIO;
+    }
+
+    return 0;
+}
+
+static coroutine_fn int ssh_co_flush(BlockDriverState *bs)
+{
+    BDRVSSHState *s = bs->opaque;
+    int ret;
+
+    qemu_co_mutex_lock(&s->lock);
+    ret = ssh_flush(s);
+    qemu_co_mutex_unlock(&s->lock);
+
+    return ret;
+}
+
+#else /* !HAS_LIBSSH2_SFTP_FSYNC */
+
+static coroutine_fn int ssh_co_flush(BlockDriverState *bs)
+{
+    BDRVSSHState *s = bs->opaque;
+
+    unsafe_flush_warning(s, "libssh2 >= 1.4.4");
+    return 0;
+}
+
+#endif /* !HAS_LIBSSH2_SFTP_FSYNC */
+
+static int64_t ssh_getlength(BlockDriverState *bs)
+{
+    BDRVSSHState *s = bs->opaque;
+    int64_t length;
+
+    /* Note we cannot make a libssh2 call here. */
+    length = (int64_t) s->attrs.filesize;
+    DPRINTF("length=%" PRIi64, length);
+
+    return length;
+}
+
+static BlockDriver bdrv_ssh = {
+    .format_name                  = "ssh",
+    .protocol_name                = "ssh",
+    .instance_size                = sizeof(BDRVSSHState),
+    .bdrv_parse_filename          = ssh_parse_filename,
+    .bdrv_file_open               = ssh_file_open,
+    .bdrv_create                  = ssh_create,
+    .bdrv_close                   = ssh_close,
+    .bdrv_has_zero_init           = ssh_has_zero_init,
+    .bdrv_co_readv                = ssh_co_readv,
+    .bdrv_co_writev               = ssh_co_writev,
+    .bdrv_getlength               = ssh_getlength,
+    .bdrv_co_flush_to_disk        = ssh_co_flush,
+    .create_options               = ssh_create_options,
+};
+
+static void bdrv_ssh_init(void)
+{
+    int r;
+
+    r = libssh2_init(0);
+    if (r != 0) {
+        fprintf(stderr, "libssh2 initialization failed, %d\n", r);
+        exit(EXIT_FAILURE);
+    }
+
+    bdrv_register(&bdrv_ssh);
+}
+
+block_init(bdrv_ssh_init);
diff --git a/block/stream.c b/block/stream.c
index 0c0fc7a13..7fe9e486b 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -12,8 +12,8 @@
  */
 
 #include "trace.h"
-#include "block_int.h"
-#include "blockjob.h"
+#include "block/block_int.h"
+#include "block/blockjob.h"
 #include "qemu/ratelimit.h"
 
 enum {
@@ -108,7 +108,7 @@ static void coroutine_fn stream_run(void *opaque)
 
 wait:
         /* Note that even when no rate limit is applied we need to yield
-         * with no pending I/O here so that qemu_aio_flush() returns.
+         * with no pending I/O here so that bdrv_drain_all() returns.
          */
         block_job_sleep_ns(&s->common, rt_clock, delay_ns);
         if (block_job_is_cancelled(&s->common)) {
@@ -198,7 +198,7 @@ static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp)
     ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
 }
 
-static BlockJobType stream_job_type = {
+static const BlockJobType stream_job_type = {
     .instance_size = sizeof(StreamBlockJob),
     .job_type      = "stream",
     .set_speed     = stream_set_speed,
diff --git a/block/vdi.c b/block/vdi.c
index c8330b7ea..8a915257e 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -50,15 +50,15 @@
  */
 
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
-#include "migration.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "migration/migration.h"
 
 #if defined(CONFIG_UUID)
 #include <uuid/uuid.h>
 #else
 /* TODO: move uuid emulation to some central place in QEMU. */
-#include "sysemu.h"     /* UUID_FMT */
+#include "sysemu/sysemu.h"     /* UUID_FMT */
 typedef unsigned char uuid_t[16];
 #endif
 
@@ -246,7 +246,7 @@ static void vdi_header_print(VdiHeader *header)
 {
     char uuid[37];
     logout("text        %s", header->text);
-    logout("signature   0x%04x\n", header->signature);
+    logout("signature   0x%08x\n", header->signature);
     logout("header size 0x%04x\n", header->header_size);
     logout("image type  0x%04x\n", header->image_type);
     logout("image flags 0x%04x\n", header->image_flags);
@@ -364,15 +364,17 @@ static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename)
     return result;
 }
 
-static int vdi_open(BlockDriverState *bs, int flags)
+static int vdi_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVVdiState *s = bs->opaque;
     VdiHeader header;
     size_t bmap_size;
+    int ret;
 
     logout("\n");
 
-    if (bdrv_read(bs->file, 0, (uint8_t *)&header, 1) < 0) {
+    ret = bdrv_read(bs->file, 0, (uint8_t *)&header, 1);
+    if (ret < 0) {
         goto fail;
     }
 
@@ -390,33 +392,45 @@ static int vdi_open(BlockDriverState *bs, int flags)
         header.disk_size &= ~(SECTOR_SIZE - 1);
     }
 
-    if (header.version != VDI_VERSION_1_1) {
+    if (header.signature != VDI_SIGNATURE) {
+        logout("bad vdi signature %08x\n", header.signature);
+        ret = -EMEDIUMTYPE;
+        goto fail;
+    } else if (header.version != VDI_VERSION_1_1) {
         logout("unsupported version %u.%u\n",
                header.version >> 16, header.version & 0xffff);
+        ret = -ENOTSUP;
         goto fail;
     } else if (header.offset_bmap % SECTOR_SIZE != 0) {
         /* We only support block maps which start on a sector boundary. */
         logout("unsupported block map offset 0x%x B\n", header.offset_bmap);
+        ret = -ENOTSUP;
         goto fail;
     } else if (header.offset_data % SECTOR_SIZE != 0) {
         /* We only support data blocks which start on a sector boundary. */
         logout("unsupported data offset 0x%x B\n", header.offset_data);
+        ret = -ENOTSUP;
         goto fail;
     } else if (header.sector_size != SECTOR_SIZE) {
         logout("unsupported sector size %u B\n", header.sector_size);
+        ret = -ENOTSUP;
         goto fail;
     } else if (header.block_size != 1 * MiB) {
         logout("unsupported block size %u B\n", header.block_size);
+        ret = -ENOTSUP;
         goto fail;
     } else if (header.disk_size >
                (uint64_t)header.blocks_in_image * header.block_size) {
         logout("unsupported disk size %" PRIu64 " B\n", header.disk_size);
+        ret = -ENOTSUP;
         goto fail;
     } else if (!uuid_is_null(header.uuid_link)) {
         logout("link uuid != 0, unsupported\n");
+        ret = -ENOTSUP;
         goto fail;
     } else if (!uuid_is_null(header.uuid_parent)) {
         logout("parent uuid != 0, unsupported\n");
+        ret = -ENOTSUP;
         goto fail;
     }
 
@@ -429,10 +443,9 @@ static int vdi_open(BlockDriverState *bs, int flags)
 
     bmap_size = header.blocks_in_image * sizeof(uint32_t);
     bmap_size = (bmap_size + SECTOR_SIZE - 1) / SECTOR_SIZE;
-    if (bmap_size > 0) {
-        s->bmap = g_malloc(bmap_size * SECTOR_SIZE);
-    }
-    if (bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size) < 0) {
+    s->bmap = g_malloc(bmap_size * SECTOR_SIZE);
+    ret = bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size);
+    if (ret < 0) {
         goto fail_free_bmap;
     }
 
@@ -448,7 +461,7 @@ static int vdi_open(BlockDriverState *bs, int flags)
     g_free(s->bmap);
 
  fail:
-    return -1;
+    return ret;
 }
 
 static int vdi_reopen_prepare(BDRVReopenState *state,
@@ -766,6 +779,7 @@ static BlockDriver bdrv_vdi = {
     .bdrv_close = vdi_close,
     .bdrv_reopen_prepare = vdi_reopen_prepare,
     .bdrv_create = vdi_create,
+    .bdrv_has_zero_init = bdrv_has_zero_init_1,
     .bdrv_co_is_allocated = vdi_co_is_allocated,
     .bdrv_make_empty = vdi_make_empty,
 
diff --git a/block/vhdx.c b/block/vhdx.c
new file mode 100644
index 000000000..e9704b1fd
--- /dev/null
+++ b/block/vhdx.c
@@ -0,0 +1,972 @@
+/*
+ * Block driver for Hyper-V VHDX Images
+ *
+ * Copyright (c) 2013 Red Hat, Inc.,
+ *
+ * Authors:
+ *  Jeff Cody <jcody@redhat.com>
+ *
+ *  This is based on the "VHDX Format Specification v0.95", published 4/12/2012
+ *  by Microsoft:
+ *      https://www.microsoft.com/en-us/download/details.aspx?id=29681
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "qemu/crc32c.h"
+#include "block/vhdx.h"
+
+
+/* Several metadata and region table data entries are identified by
+ * guids in  a MS-specific GUID format. */
+
+
+/* ------- Known Region Table GUIDs ---------------------- */
+static const MSGUID bat_guid =      { .data1 = 0x2dc27766,
+                                      .data2 = 0xf623,
+                                      .data3 = 0x4200,
+                                      .data4 = { 0x9d, 0x64, 0x11, 0x5e,
+                                                 0x9b, 0xfd, 0x4a, 0x08} };
+
+static const MSGUID metadata_guid = { .data1 = 0x8b7ca206,
+                                      .data2 = 0x4790,
+                                      .data3 = 0x4b9a,
+                                      .data4 = { 0xb8, 0xfe, 0x57, 0x5f,
+                                                 0x05, 0x0f, 0x88, 0x6e} };
+
+
+
+/* ------- Known Metadata Entry GUIDs ---------------------- */
+static const MSGUID file_param_guid =   { .data1 = 0xcaa16737,
+                                          .data2 = 0xfa36,
+                                          .data3 = 0x4d43,
+                                          .data4 = { 0xb3, 0xb6, 0x33, 0xf0,
+                                                     0xaa, 0x44, 0xe7, 0x6b} };
+
+static const MSGUID virtual_size_guid = { .data1 = 0x2FA54224,
+                                          .data2 = 0xcd1b,
+                                          .data3 = 0x4876,
+                                          .data4 = { 0xb2, 0x11, 0x5d, 0xbe,
+                                                     0xd8, 0x3b, 0xf4, 0xb8} };
+
+static const MSGUID page83_guid =       { .data1 = 0xbeca12ab,
+                                          .data2 = 0xb2e6,
+                                          .data3 = 0x4523,
+                                          .data4 = { 0x93, 0xef, 0xc3, 0x09,
+                                                     0xe0, 0x00, 0xc7, 0x46} };
+
+
+static const MSGUID phys_sector_guid =  { .data1 = 0xcda348c7,
+                                          .data2 = 0x445d,
+                                          .data3 = 0x4471,
+                                          .data4 = { 0x9c, 0xc9, 0xe9, 0x88,
+                                                     0x52, 0x51, 0xc5, 0x56} };
+
+static const MSGUID parent_locator_guid = { .data1 = 0xa8d35f2d,
+                                            .data2 = 0xb30b,
+                                            .data3 = 0x454d,
+                                            .data4 = { 0xab, 0xf7, 0xd3,
+                                                       0xd8, 0x48, 0x34,
+                                                       0xab, 0x0c} };
+
+static const MSGUID logical_sector_guid = { .data1 = 0x8141bf1d,
+                                            .data2 = 0xa96f,
+                                            .data3 = 0x4709,
+                                            .data4 = { 0xba, 0x47, 0xf2,
+                                                       0x33, 0xa8, 0xfa,
+                                                       0xab, 0x5f} };
+
+/* Each parent type must have a valid GUID; this is for parent images
+ * of type 'VHDX'.  If we were to allow e.g. a QCOW2 parent, we would
+ * need to make up our own QCOW2 GUID type */
+static const MSGUID parent_vhdx_guid = { .data1 = 0xb04aefb7,
+                                         .data2 = 0xd19e,
+                                         .data3 = 0x4a81,
+                                         .data4 = { 0xb7, 0x89, 0x25, 0xb8,
+                                                    0xe9, 0x44, 0x59, 0x13} };
+
+
+#define META_FILE_PARAMETER_PRESENT      0x01
+#define META_VIRTUAL_DISK_SIZE_PRESENT   0x02
+#define META_PAGE_83_PRESENT             0x04
+#define META_LOGICAL_SECTOR_SIZE_PRESENT 0x08
+#define META_PHYS_SECTOR_SIZE_PRESENT    0x10
+#define META_PARENT_LOCATOR_PRESENT      0x20
+
+#define META_ALL_PRESENT    \
+    (META_FILE_PARAMETER_PRESENT | META_VIRTUAL_DISK_SIZE_PRESENT | \
+     META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \
+     META_PHYS_SECTOR_SIZE_PRESENT)
+
+typedef struct VHDXMetadataEntries {
+    VHDXMetadataTableEntry file_parameters_entry;
+    VHDXMetadataTableEntry virtual_disk_size_entry;
+    VHDXMetadataTableEntry page83_data_entry;
+    VHDXMetadataTableEntry logical_sector_size_entry;
+    VHDXMetadataTableEntry phys_sector_size_entry;
+    VHDXMetadataTableEntry parent_locator_entry;
+    uint16_t present;
+} VHDXMetadataEntries;
+
+
+typedef struct VHDXSectorInfo {
+    uint32_t bat_idx;       /* BAT entry index */
+    uint32_t sectors_avail; /* sectors available in payload block */
+    uint32_t bytes_left;    /* bytes left in the block after data to r/w */
+    uint32_t bytes_avail;   /* bytes available in payload block */
+    uint64_t file_offset;   /* absolute offset in bytes, in file */
+    uint64_t block_offset;  /* block offset, in bytes */
+} VHDXSectorInfo;
+
+
+
+typedef struct BDRVVHDXState {
+    CoMutex lock;
+
+    int curr_header;
+    VHDXHeader *headers[2];
+
+    VHDXRegionTableHeader rt;
+    VHDXRegionTableEntry bat_rt;         /* region table for the BAT */
+    VHDXRegionTableEntry metadata_rt;    /* region table for the metadata */
+
+    VHDXMetadataTableHeader metadata_hdr;
+    VHDXMetadataEntries metadata_entries;
+
+    VHDXFileParameters params;
+    uint32_t block_size;
+    uint32_t block_size_bits;
+    uint32_t sectors_per_block;
+    uint32_t sectors_per_block_bits;
+
+    uint64_t virtual_disk_size;
+    uint32_t logical_sector_size;
+    uint32_t physical_sector_size;
+
+    uint64_t chunk_ratio;
+    uint32_t chunk_ratio_bits;
+    uint32_t logical_sector_size_bits;
+
+    uint32_t bat_entries;
+    VHDXBatEntry *bat;
+    uint64_t bat_offset;
+
+    VHDXParentLocatorHeader parent_header;
+    VHDXParentLocatorEntry *parent_entries;
+
+} BDRVVHDXState;
+
+uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size,
+                            int crc_offset)
+{
+    uint32_t crc_new;
+    uint32_t crc_orig;
+    assert(buf != NULL);
+
+    if (crc_offset > 0) {
+        memcpy(&crc_orig, buf + crc_offset, sizeof(crc_orig));
+        memset(buf + crc_offset, 0, sizeof(crc_orig));
+    }
+
+    crc_new = crc32c(crc, buf, size);
+    if (crc_offset > 0) {
+        memcpy(buf + crc_offset, &crc_orig, sizeof(crc_orig));
+    }
+
+    return crc_new;
+}
+
+/* Validates the checksum of the buffer, with an in-place CRC.
+ *
+ * Zero is substituted during crc calculation for the original crc field,
+ * and the crc field is restored afterwards.  But the buffer will be modifed
+ * during the calculation, so this may not be not suitable for multi-threaded
+ * use.
+ *
+ * crc_offset: byte offset in buf of the buffer crc
+ * buf: buffer pointer
+ * size: size of buffer (must be > crc_offset+4)
+ *
+ * returns true if checksum is valid, false otherwise
+ */
+bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset)
+{
+    uint32_t crc_orig;
+    uint32_t crc;
+
+    assert(buf != NULL);
+    assert(size > (crc_offset + 4));
+
+    memcpy(&crc_orig, buf + crc_offset, sizeof(crc_orig));
+    crc_orig = le32_to_cpu(crc_orig);
+
+    crc = vhdx_checksum_calc(0xffffffff, buf, size, crc_offset);
+
+    return crc == crc_orig;
+}
+
+
+/*
+ * Per the MS VHDX Specification, for every VHDX file:
+ *      - The header section is fixed size - 1 MB
+ *      - The header section is always the first "object"
+ *      - The first 64KB of the header is the File Identifier
+ *      - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile")
+ *      - The following 512 bytes constitute a UTF-16 string identifiying the
+ *        software that created the file, and is optional and diagnostic only.
+ *
+ *  Therefore, we probe by looking for the vhdxfile signature "vhdxfile"
+ */
+static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    if (buf_size >= 8 && !memcmp(buf, "vhdxfile", 8)) {
+        return 100;
+    }
+    return 0;
+}
+
+/* All VHDX structures on disk are little endian */
+static void vhdx_header_le_import(VHDXHeader *h)
+{
+    assert(h != NULL);
+
+    le32_to_cpus(&h->signature);
+    le32_to_cpus(&h->checksum);
+    le64_to_cpus(&h->sequence_number);
+
+    leguid_to_cpus(&h->file_write_guid);
+    leguid_to_cpus(&h->data_write_guid);
+    leguid_to_cpus(&h->log_guid);
+
+    le16_to_cpus(&h->log_version);
+    le16_to_cpus(&h->version);
+    le32_to_cpus(&h->log_length);
+    le64_to_cpus(&h->log_offset);
+}
+
+
+/* opens the specified header block from the VHDX file header section */
+static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s)
+{
+    int ret = 0;
+    VHDXHeader *header1;
+    VHDXHeader *header2;
+    bool h1_valid = false;
+    bool h2_valid = false;
+    uint64_t h1_seq = 0;
+    uint64_t h2_seq = 0;
+    uint8_t *buffer;
+
+    header1 = qemu_blockalign(bs, sizeof(VHDXHeader));
+    header2 = qemu_blockalign(bs, sizeof(VHDXHeader));
+
+    buffer = qemu_blockalign(bs, VHDX_HEADER_SIZE);
+
+    s->headers[0] = header1;
+    s->headers[1] = header2;
+
+    /* We have to read the whole VHDX_HEADER_SIZE instead of
+     * sizeof(VHDXHeader), because the checksum is over the whole
+     * region */
+    ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE);
+    if (ret < 0) {
+        goto fail;
+    }
+    /* copy over just the relevant portion that we need */
+    memcpy(header1, buffer, sizeof(VHDXHeader));
+    vhdx_header_le_import(header1);
+
+    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
+        !memcmp(&header1->signature, "head", 4)             &&
+        header1->version == 1) {
+        h1_seq = header1->sequence_number;
+        h1_valid = true;
+    }
+
+    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
+    if (ret < 0) {
+        goto fail;
+    }
+    /* copy over just the relevant portion that we need */
+    memcpy(header2, buffer, sizeof(VHDXHeader));
+    vhdx_header_le_import(header2);
+
+    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
+        !memcmp(&header2->signature, "head", 4)             &&
+        header2->version == 1) {
+        h2_seq = header2->sequence_number;
+        h2_valid = true;
+    }
+
+    /* If there is only 1 valid header (or no valid headers), we
+     * don't care what the sequence numbers are */
+    if (h1_valid && !h2_valid) {
+        s->curr_header = 0;
+    } else if (!h1_valid && h2_valid) {
+        s->curr_header = 1;
+    } else if (!h1_valid && !h2_valid) {
+        ret = -EINVAL;
+        goto fail;
+    } else {
+        /* If both headers are valid, then we choose the active one by the
+         * highest sequence number.  If the sequence numbers are equal, that is
+         * invalid */
+        if (h1_seq > h2_seq) {
+            s->curr_header = 0;
+        } else if (h2_seq > h1_seq) {
+            s->curr_header = 1;
+        } else {
+            ret = -EINVAL;
+            goto fail;
+        }
+    }
+
+    ret = 0;
+
+    goto exit;
+
+fail:
+    qerror_report(ERROR_CLASS_GENERIC_ERROR, "No valid VHDX header found");
+    qemu_vfree(header1);
+    qemu_vfree(header2);
+    s->headers[0] = NULL;
+    s->headers[1] = NULL;
+exit:
+    qemu_vfree(buffer);
+    return ret;
+}
+
+
+static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
+{
+    int ret = 0;
+    uint8_t *buffer;
+    int offset = 0;
+    VHDXRegionTableEntry rt_entry;
+    uint32_t i;
+    bool bat_rt_found = false;
+    bool metadata_rt_found = false;
+
+    /* We have to read the whole 64KB block, because the crc32 is over the
+     * whole block */
+    buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE);
+
+    ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer,
+                     VHDX_HEADER_BLOCK_SIZE);
+    if (ret < 0) {
+        goto fail;
+    }
+    memcpy(&s->rt, buffer, sizeof(s->rt));
+    le32_to_cpus(&s->rt.signature);
+    le32_to_cpus(&s->rt.checksum);
+    le32_to_cpus(&s->rt.entry_count);
+    le32_to_cpus(&s->rt.reserved);
+    offset += sizeof(s->rt);
+
+    if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
+        memcmp(&s->rt.signature, "regi", 4)) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    /* Per spec, maximum region table entry count is 2047 */
+    if (s->rt.entry_count > 2047) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    for (i = 0; i < s->rt.entry_count; i++) {
+        memcpy(&rt_entry, buffer + offset, sizeof(rt_entry));
+        offset += sizeof(rt_entry);
+
+        leguid_to_cpus(&rt_entry.guid);
+        le64_to_cpus(&rt_entry.file_offset);
+        le32_to_cpus(&rt_entry.length);
+        le32_to_cpus(&rt_entry.data_bits);
+
+        /* see if we recognize the entry */
+        if (guid_eq(rt_entry.guid, bat_guid)) {
+            /* must be unique; if we have already found it this is invalid */
+            if (bat_rt_found) {
+                ret = -EINVAL;
+                goto fail;
+            }
+            bat_rt_found = true;
+            s->bat_rt = rt_entry;
+            continue;
+        }
+
+        if (guid_eq(rt_entry.guid, metadata_guid)) {
+            /* must be unique; if we have already found it this is invalid */
+            if (metadata_rt_found) {
+                ret = -EINVAL;
+                goto fail;
+            }
+            metadata_rt_found = true;
+            s->metadata_rt = rt_entry;
+            continue;
+        }
+
+        if (rt_entry.data_bits & VHDX_REGION_ENTRY_REQUIRED) {
+            /* cannot read vhdx file - required region table entry that
+             * we do not understand.  per spec, we must fail to open */
+            ret = -ENOTSUP;
+            goto fail;
+        }
+    }
+    ret = 0;
+
+fail:
+    qemu_vfree(buffer);
+    return ret;
+}
+
+
+
+/* Metadata initial parser
+ *
+ * This loads all the metadata entry fields.  This may cause additional
+ * fields to be processed (e.g. parent locator, etc..).
+ *
+ * There are 5 Metadata items that are always required:
+ *      - File Parameters (block size, has a parent)
+ *      - Virtual Disk Size (size, in bytes, of the virtual drive)
+ *      - Page 83 Data (scsi page 83 guid)
+ *      - Logical Sector Size (logical sector size in bytes, either 512 or
+ *                             4096.  We only support 512 currently)
+ *      - Physical Sector Size (512 or 4096)
+ *
+ * Also, if the File Parameters indicate this is a differencing file,
+ * we must also look for the Parent Locator metadata item.
+ */
+static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
+{
+    int ret = 0;
+    uint8_t *buffer;
+    int offset = 0;
+    uint32_t i = 0;
+    VHDXMetadataTableEntry md_entry;
+
+    buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE);
+
+    ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer,
+                     VHDX_METADATA_TABLE_MAX_SIZE);
+    if (ret < 0) {
+        goto exit;
+    }
+    memcpy(&s->metadata_hdr, buffer, sizeof(s->metadata_hdr));
+    offset += sizeof(s->metadata_hdr);
+
+    le64_to_cpus(&s->metadata_hdr.signature);
+    le16_to_cpus(&s->metadata_hdr.reserved);
+    le16_to_cpus(&s->metadata_hdr.entry_count);
+
+    if (memcmp(&s->metadata_hdr.signature, "metadata", 8)) {
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    s->metadata_entries.present = 0;
+
+    if ((s->metadata_hdr.entry_count * sizeof(md_entry)) >
+        (VHDX_METADATA_TABLE_MAX_SIZE - offset)) {
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    for (i = 0; i < s->metadata_hdr.entry_count; i++) {
+        memcpy(&md_entry, buffer + offset, sizeof(md_entry));
+        offset += sizeof(md_entry);
+
+        leguid_to_cpus(&md_entry.item_id);
+        le32_to_cpus(&md_entry.offset);
+        le32_to_cpus(&md_entry.length);
+        le32_to_cpus(&md_entry.data_bits);
+        le32_to_cpus(&md_entry.reserved2);
+
+        if (guid_eq(md_entry.item_id, file_param_guid)) {
+            if (s->metadata_entries.present & META_FILE_PARAMETER_PRESENT) {
+                ret = -EINVAL;
+                goto exit;
+            }
+            s->metadata_entries.file_parameters_entry = md_entry;
+            s->metadata_entries.present |= META_FILE_PARAMETER_PRESENT;
+            continue;
+        }
+
+        if (guid_eq(md_entry.item_id, virtual_size_guid)) {
+            if (s->metadata_entries.present & META_VIRTUAL_DISK_SIZE_PRESENT) {
+                ret = -EINVAL;
+                goto exit;
+            }
+            s->metadata_entries.virtual_disk_size_entry = md_entry;
+            s->metadata_entries.present |= META_VIRTUAL_DISK_SIZE_PRESENT;
+            continue;
+        }
+
+        if (guid_eq(md_entry.item_id, page83_guid)) {
+            if (s->metadata_entries.present & META_PAGE_83_PRESENT) {
+                ret = -EINVAL;
+                goto exit;
+            }
+            s->metadata_entries.page83_data_entry = md_entry;
+            s->metadata_entries.present |= META_PAGE_83_PRESENT;
+            continue;
+        }
+
+        if (guid_eq(md_entry.item_id, logical_sector_guid)) {
+            if (s->metadata_entries.present &
+                META_LOGICAL_SECTOR_SIZE_PRESENT) {
+                ret = -EINVAL;
+                goto exit;
+            }
+            s->metadata_entries.logical_sector_size_entry = md_entry;
+            s->metadata_entries.present |= META_LOGICAL_SECTOR_SIZE_PRESENT;
+            continue;
+        }
+
+        if (guid_eq(md_entry.item_id, phys_sector_guid)) {
+            if (s->metadata_entries.present & META_PHYS_SECTOR_SIZE_PRESENT) {
+                ret = -EINVAL;
+                goto exit;
+            }
+            s->metadata_entries.phys_sector_size_entry = md_entry;
+            s->metadata_entries.present |= META_PHYS_SECTOR_SIZE_PRESENT;
+            continue;
+        }
+
+        if (guid_eq(md_entry.item_id, parent_locator_guid)) {
+            if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) {
+                ret = -EINVAL;
+                goto exit;
+            }
+            s->metadata_entries.parent_locator_entry = md_entry;
+            s->metadata_entries.present |= META_PARENT_LOCATOR_PRESENT;
+            continue;
+        }
+
+        if (md_entry.data_bits & VHDX_META_FLAGS_IS_REQUIRED) {
+            /* cannot read vhdx file - required region table entry that
+             * we do not understand.  per spec, we must fail to open */
+            ret = -ENOTSUP;
+            goto exit;
+        }
+    }
+
+    if (s->metadata_entries.present != META_ALL_PRESENT) {
+        ret = -ENOTSUP;
+        goto exit;
+    }
+
+    ret = bdrv_pread(bs->file,
+                     s->metadata_entries.file_parameters_entry.offset
+                                         + s->metadata_rt.file_offset,
+                     &s->params,
+                     sizeof(s->params));
+
+    if (ret < 0) {
+        goto exit;
+    }
+
+    le32_to_cpus(&s->params.block_size);
+    le32_to_cpus(&s->params.data_bits);
+
+
+    /* We now have the file parameters, so we can tell if this is a
+     * differencing file (i.e.. has_parent), is dynamic or fixed
+     * sized (leave_blocks_allocated), and the block size */
+
+    /* The parent locator required iff the file parameters has_parent set */
+    if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
+        if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) {
+            /* TODO: parse  parent locator fields */
+            ret = -ENOTSUP; /* temp, until differencing files are supported */
+            goto exit;
+        } else {
+            /* if has_parent is set, but there is not parent locator present,
+             * then that is an invalid combination */
+            ret = -EINVAL;
+            goto exit;
+        }
+    }
+
+    /* determine virtual disk size, logical sector size,
+     * and phys sector size */
+
+    ret = bdrv_pread(bs->file,
+                     s->metadata_entries.virtual_disk_size_entry.offset
+                                           + s->metadata_rt.file_offset,
+                     &s->virtual_disk_size,
+                     sizeof(uint64_t));
+    if (ret < 0) {
+        goto exit;
+    }
+    ret = bdrv_pread(bs->file,
+                     s->metadata_entries.logical_sector_size_entry.offset
+                                             + s->metadata_rt.file_offset,
+                     &s->logical_sector_size,
+                     sizeof(uint32_t));
+    if (ret < 0) {
+        goto exit;
+    }
+    ret = bdrv_pread(bs->file,
+                     s->metadata_entries.phys_sector_size_entry.offset
+                                          + s->metadata_rt.file_offset,
+                     &s->physical_sector_size,
+                     sizeof(uint32_t));
+    if (ret < 0) {
+        goto exit;
+    }
+
+    le64_to_cpus(&s->virtual_disk_size);
+    le32_to_cpus(&s->logical_sector_size);
+    le32_to_cpus(&s->physical_sector_size);
+
+    if (s->logical_sector_size == 0 || s->params.block_size == 0) {
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    /* both block_size and sector_size are guaranteed powers of 2 */
+    s->sectors_per_block = s->params.block_size / s->logical_sector_size;
+    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
+                     (uint64_t)s->logical_sector_size /
+                     (uint64_t)s->params.block_size;
+
+    /* These values are ones we will want to use for division / multiplication
+     * later on, and they are all guaranteed (per the spec) to be powers of 2,
+     * so we can take advantage of that for shift operations during
+     * reads/writes */
+    if (s->logical_sector_size & (s->logical_sector_size - 1)) {
+        ret = -EINVAL;
+        goto exit;
+    }
+    if (s->sectors_per_block & (s->sectors_per_block - 1)) {
+        ret = -EINVAL;
+        goto exit;
+    }
+    if (s->chunk_ratio & (s->chunk_ratio - 1)) {
+        ret = -EINVAL;
+        goto exit;
+    }
+    s->block_size = s->params.block_size;
+    if (s->block_size & (s->block_size - 1)) {
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    s->logical_sector_size_bits = 31 - clz32(s->logical_sector_size);
+    s->sectors_per_block_bits =   31 - clz32(s->sectors_per_block);
+    s->chunk_ratio_bits =         63 - clz64(s->chunk_ratio);
+    s->block_size_bits =          31 - clz32(s->block_size);
+
+    ret = 0;
+
+exit:
+    qemu_vfree(buffer);
+    return ret;
+}
+
+/* Parse the replay log.  Per the VHDX spec, if the log is present
+ * it must be replayed prior to opening the file, even read-only.
+ *
+ * If read-only, we must replay the log in RAM (or refuse to open
+ * a dirty VHDX file read-only */
+static int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s)
+{
+    int ret = 0;
+    int i;
+    VHDXHeader *hdr;
+
+    hdr = s->headers[s->curr_header];
+
+    /* either the log guid, or log length is zero,
+     * then a replay log is present */
+    for (i = 0; i < sizeof(hdr->log_guid.data4); i++) {
+        ret |= hdr->log_guid.data4[i];
+    }
+    if (hdr->log_guid.data1 == 0 &&
+        hdr->log_guid.data2 == 0 &&
+        hdr->log_guid.data3 == 0 &&
+        ret == 0) {
+        goto exit;
+    }
+
+    /* per spec, only log version of 0 is supported */
+    if (hdr->log_version != 0) {
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    if (hdr->log_length == 0) {
+        goto exit;
+    }
+
+    /* We currently do not support images with logs to replay */
+    ret = -ENOTSUP;
+
+exit:
+    return ret;
+}
+
+
+static int vhdx_open(BlockDriverState *bs, QDict *options, int flags)
+{
+    BDRVVHDXState *s = bs->opaque;
+    int ret = 0;
+    uint32_t i;
+    uint64_t signature;
+    uint32_t data_blocks_cnt, bitmap_blocks_cnt;
+
+
+    s->bat = NULL;
+
+    qemu_co_mutex_init(&s->lock);
+
+    /* validate the file signature */
+    ret = bdrv_pread(bs->file, 0, &signature, sizeof(uint64_t));
+    if (ret < 0) {
+        goto fail;
+    }
+    if (memcmp(&signature, "vhdxfile", 8)) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    ret = vhdx_parse_header(bs, s);
+    if (ret) {
+        goto fail;
+    }
+
+    ret = vhdx_parse_log(bs, s);
+    if (ret) {
+        goto fail;
+    }
+
+    ret = vhdx_open_region_tables(bs, s);
+    if (ret) {
+        goto fail;
+    }
+
+    ret = vhdx_parse_metadata(bs, s);
+    if (ret) {
+        goto fail;
+    }
+    s->block_size = s->params.block_size;
+
+    /* the VHDX spec dictates that virtual_disk_size is always a multiple of
+     * logical_sector_size */
+    bs->total_sectors = s->virtual_disk_size >> s->logical_sector_size_bits;
+
+    data_blocks_cnt = s->virtual_disk_size >> s->block_size_bits;
+    if (s->virtual_disk_size - (data_blocks_cnt << s->block_size_bits)) {
+        data_blocks_cnt++;
+    }
+    bitmap_blocks_cnt = data_blocks_cnt >> s->chunk_ratio_bits;
+    if (data_blocks_cnt - (bitmap_blocks_cnt << s->chunk_ratio_bits)) {
+        bitmap_blocks_cnt++;
+    }
+
+    if (s->parent_entries) {
+        s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1);
+    } else {
+        s->bat_entries = data_blocks_cnt +
+                         ((data_blocks_cnt - 1) >> s->chunk_ratio_bits);
+    }
+
+    s->bat_offset = s->bat_rt.file_offset;
+
+    if (s->bat_entries > s->bat_rt.length / sizeof(VHDXBatEntry)) {
+        /* BAT allocation is not large enough for all entries */
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    s->bat = qemu_blockalign(bs, s->bat_rt.length);
+
+    ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    for (i = 0; i < s->bat_entries; i++) {
+        le64_to_cpus(&s->bat[i]);
+    }
+
+    if (flags & BDRV_O_RDWR) {
+        ret = -ENOTSUP;
+        goto fail;
+    }
+
+    /* TODO: differencing files, write */
+
+    return 0;
+fail:
+    qemu_vfree(s->headers[0]);
+    qemu_vfree(s->headers[1]);
+    qemu_vfree(s->bat);
+    qemu_vfree(s->parent_entries);
+    return ret;
+}
+
+static int vhdx_reopen_prepare(BDRVReopenState *state,
+                               BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
+
+/*
+ * Perform sector to block offset translations, to get various
+ * sector and file offsets into the image.  See VHDXSectorInfo
+ */
+static void vhdx_block_translate(BDRVVHDXState *s, int64_t sector_num,
+                                 int nb_sectors, VHDXSectorInfo *sinfo)
+{
+    uint32_t block_offset;
+
+    sinfo->bat_idx = sector_num >> s->sectors_per_block_bits;
+    /* effectively a modulo - this gives us the offset into the block
+     * (in sector sizes) for our sector number */
+    block_offset = sector_num - (sinfo->bat_idx << s->sectors_per_block_bits);
+    /* the chunk ratio gives us the interleaving of the sector
+     * bitmaps, so we need to advance our page block index by the
+     * sector bitmaps entry number */
+    sinfo->bat_idx += sinfo->bat_idx >> s->chunk_ratio_bits;
+
+    /* the number of sectors we can read/write in this cycle */
+    sinfo->sectors_avail = s->sectors_per_block - block_offset;
+
+    sinfo->bytes_left = sinfo->sectors_avail << s->logical_sector_size_bits;
+
+    if (sinfo->sectors_avail > nb_sectors) {
+        sinfo->sectors_avail = nb_sectors;
+    }
+
+    sinfo->bytes_avail = sinfo->sectors_avail << s->logical_sector_size_bits;
+
+    sinfo->file_offset = s->bat[sinfo->bat_idx] >> VHDX_BAT_FILE_OFF_BITS;
+
+    sinfo->block_offset = block_offset << s->logical_sector_size_bits;
+
+    /* The file offset must be past the header section, so must be > 0 */
+    if (sinfo->file_offset == 0) {
+        return;
+    }
+
+    /* block offset is the offset in vhdx logical sectors, in
+     * the payload data block. Convert that to a byte offset
+     * in the block, and add in the payload data block offset
+     * in the file, in bytes, to get the final read address */
+
+    sinfo->file_offset <<= 20;  /* now in bytes, rather than 1MB units */
+    sinfo->file_offset += sinfo->block_offset;
+}
+
+
+
+static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num,
+                                      int nb_sectors, QEMUIOVector *qiov)
+{
+    BDRVVHDXState *s = bs->opaque;
+    int ret = 0;
+    VHDXSectorInfo sinfo;
+    uint64_t bytes_done = 0;
+    QEMUIOVector hd_qiov;
+
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+
+    qemu_co_mutex_lock(&s->lock);
+
+    while (nb_sectors > 0) {
+        /* We are a differencing file, so we need to inspect the sector bitmap
+         * to see if we have the data or not */
+        if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
+            /* not supported yet */
+            ret = -ENOTSUP;
+            goto exit;
+        } else {
+            vhdx_block_translate(s, sector_num, nb_sectors, &sinfo);
+
+            qemu_iovec_reset(&hd_qiov);
+            qemu_iovec_concat(&hd_qiov, qiov,  bytes_done, sinfo.bytes_avail);
+
+            /* check the payload block state */
+            switch (s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK) {
+            case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */
+            case PAYLOAD_BLOCK_UNDEFINED:   /* fall through */
+            case PAYLOAD_BLOCK_UNMAPPED:    /* fall through */
+            case PAYLOAD_BLOCK_ZERO:
+                /* return zero */
+                qemu_iovec_memset(&hd_qiov, 0, 0, sinfo.bytes_avail);
+                break;
+            case PAYLOAD_BLOCK_FULL_PRESENT:
+                qemu_co_mutex_unlock(&s->lock);
+                ret = bdrv_co_readv(bs->file,
+                                    sinfo.file_offset >> BDRV_SECTOR_BITS,
+                                    sinfo.sectors_avail, &hd_qiov);
+                qemu_co_mutex_lock(&s->lock);
+                if (ret < 0) {
+                    goto exit;
+                }
+                break;
+            case PAYLOAD_BLOCK_PARTIALLY_PRESENT:
+                /* we don't yet support difference files, fall through
+                 * to error */
+            default:
+                ret = -EIO;
+                goto exit;
+                break;
+            }
+            nb_sectors -= sinfo.sectors_avail;
+            sector_num += sinfo.sectors_avail;
+            bytes_done += sinfo.bytes_avail;
+        }
+    }
+    ret = 0;
+exit:
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&hd_qiov);
+    return ret;
+}
+
+
+
+static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
+                                      int nb_sectors, QEMUIOVector *qiov)
+{
+    return -ENOTSUP;
+}
+
+
+static void vhdx_close(BlockDriverState *bs)
+{
+    BDRVVHDXState *s = bs->opaque;
+    qemu_vfree(s->headers[0]);
+    qemu_vfree(s->headers[1]);
+    qemu_vfree(s->bat);
+    qemu_vfree(s->parent_entries);
+}
+
+static BlockDriver bdrv_vhdx = {
+    .format_name            = "vhdx",
+    .instance_size          = sizeof(BDRVVHDXState),
+    .bdrv_probe             = vhdx_probe,
+    .bdrv_open              = vhdx_open,
+    .bdrv_close             = vhdx_close,
+    .bdrv_reopen_prepare    = vhdx_reopen_prepare,
+    .bdrv_co_readv          = vhdx_co_readv,
+    .bdrv_co_writev         = vhdx_co_writev,
+};
+
+static void bdrv_vhdx_init(void)
+{
+    bdrv_register(&bdrv_vhdx);
+}
+
+block_init(bdrv_vhdx_init);
diff --git a/block/vhdx.h b/block/vhdx.h
new file mode 100644
index 000000000..fb687ed2d
--- /dev/null
+++ b/block/vhdx.h
@@ -0,0 +1,325 @@
+/*
+ * Block driver for Hyper-V VHDX Images
+ *
+ * Copyright (c) 2013 Red Hat, Inc.,
+ *
+ * Authors:
+ *  Jeff Cody <jcody@redhat.com>
+ *
+ *  This is based on the "VHDX Format Specification v0.95", published 4/12/2012
+ *  by Microsoft:
+ *      https://www.microsoft.com/en-us/download/details.aspx?id=29681
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#ifndef BLOCK_VHDX_H
+#define BLOCK_VHDX_H
+
+/* Structures and fields present in the VHDX file */
+
+/* The header section has the following blocks,
+ * each block is 64KB:
+ *
+ * _____________________________________________________________________________
+ * | File Id. |   Header 1    | Header 2   | Region Table |  Reserved (768KB)  |
+ * |----------|---------------|------------|--------------|--------------------|
+ * |          |               |            |              |                    |
+ * 0.........64KB...........128KB........192KB..........256KB................1MB
+ */
+
+#define VHDX_HEADER_BLOCK_SIZE      (64*1024)
+
+#define VHDX_FILE_ID_OFFSET         0
+#define VHDX_HEADER1_OFFSET         (VHDX_HEADER_BLOCK_SIZE*1)
+#define VHDX_HEADER2_OFFSET         (VHDX_HEADER_BLOCK_SIZE*2)
+#define VHDX_REGION_TABLE_OFFSET    (VHDX_HEADER_BLOCK_SIZE*3)
+
+
+/*
+ * A note on the use of MS-GUID fields.  For more details on the GUID,
+ * please see: https://en.wikipedia.org/wiki/Globally_unique_identifier.
+ *
+ * The VHDX specification only states that these are MS GUIDs, and which
+ * bytes are data1-data4. It makes no mention of what algorithm should be used
+ * to generate the GUID, nor what standard.  However, looking at the specified
+ * known GUID fields, it appears the GUIDs are:
+ *  Standard/DCE GUID type  (noted by 10b in the MSB of byte 0 of .data4)
+ *  Random algorithm        (noted by 0x4XXX for .data3)
+ */
+
+/* ---- HEADER SECTION STRUCTURES ---- */
+
+/* These structures are ones that are defined in the VHDX specification
+ * document */
+
+typedef struct VHDXFileIdentifier {
+    uint64_t    signature;              /* "vhdxfile" in ASCII */
+    uint16_t    creator[256];           /* optional; utf-16 string to identify
+                                           the vhdx file creator.  Diagnotistic
+                                           only */
+} VHDXFileIdentifier;
+
+
+/* the guid is a 16 byte unique ID - the definition for this used by
+ * Microsoft is not just 16 bytes though - it is a structure that is defined,
+ * so we need to follow it here so that endianness does not trip us up */
+
+typedef struct MSGUID {
+    uint32_t  data1;
+    uint16_t  data2;
+    uint16_t  data3;
+    uint8_t   data4[8];
+} MSGUID;
+
+#define guid_eq(a, b) \
+    (memcmp(&(a), &(b), sizeof(MSGUID)) == 0)
+
+#define VHDX_HEADER_SIZE (4*1024)   /* although the vhdx_header struct in disk
+                                       is only 582 bytes, for purposes of crc
+                                       the header is the first 4KB of the 64KB
+                                       block */
+
+/* The full header is 4KB, although the actual header data is much smaller.
+ * But for the checksum calculation, it is over the entire 4KB structure,
+ * not just the defined portion of it */
+typedef struct QEMU_PACKED VHDXHeader {
+    uint32_t    signature;              /* "head" in ASCII */
+    uint32_t    checksum;               /* CRC-32C hash of the whole header */
+    uint64_t    sequence_number;        /* Seq number of this header.  Each
+                                           VHDX file has 2 of these headers,
+                                           and only the header with the highest
+                                           sequence number is valid */
+    MSGUID      file_write_guid;       /* 128 bit unique identifier. Must be
+                                           updated to new, unique value before
+                                           the first modification is made to
+                                           file */
+    MSGUID      data_write_guid;        /* 128 bit unique identifier. Must be
+                                           updated to new, unique value before
+                                           the first modification is made to
+                                           visible data.   Visbile data is
+                                           defined as:
+                                                    - system & user metadata
+                                                    - raw block data
+                                                    - disk size
+                                                    - any change that will
+                                                      cause the virtual disk
+                                                      sector read to differ
+
+                                           This does not need to change if
+                                           blocks are re-arranged */
+    MSGUID      log_guid;               /* 128 bit unique identifier. If zero,
+                                           there is no valid log. If non-zero,
+                                           log entries with this guid are
+                                           valid. */
+    uint16_t    log_version;            /* version of the log format. Mustn't be
+                                           zero, unless log_guid is also zero */
+    uint16_t    version;                /* version of th evhdx file.  Currently,
+                                           only supported version is "1" */
+    uint32_t    log_length;             /* length of the log.  Must be multiple
+                                           of 1MB */
+    uint64_t    log_offset;             /* byte offset in the file of the log.
+                                           Must also be a multiple of 1MB */
+} VHDXHeader;
+
+/* Header for the region table block */
+typedef struct QEMU_PACKED VHDXRegionTableHeader {
+    uint32_t    signature;              /* "regi" in ASCII */
+    uint32_t    checksum;               /* CRC-32C hash of the 64KB table */
+    uint32_t    entry_count;            /* number of valid entries */
+    uint32_t    reserved;
+} VHDXRegionTableHeader;
+
+/* Individual region table entry.  There may be a maximum of 2047 of these
+ *
+ *  There are two known region table properties.  Both are required.
+ *  BAT (block allocation table):  2DC27766F62342009D64115E9BFD4A08
+ *  Metadata:                      8B7CA20647904B9AB8FE575F050F886E
+ */
+#define VHDX_REGION_ENTRY_REQUIRED  0x01    /* if set, parser must understand
+                                               this entry in order to open
+                                               file */
+typedef struct QEMU_PACKED VHDXRegionTableEntry {
+    MSGUID      guid;                   /* 128-bit unique identifier */
+    uint64_t    file_offset;            /* offset of the object in the file.
+                                           Must be multiple of 1MB */
+    uint32_t    length;                 /* length, in bytes, of the object */
+    uint32_t    data_bits;
+} VHDXRegionTableEntry;
+
+
+/* ---- LOG ENTRY STRUCTURES ---- */
+#define VHDX_LOG_HDR_SIZE 64
+typedef struct QEMU_PACKED VHDXLogEntryHeader {
+    uint32_t    signature;              /* "loge" in ASCII */
+    uint32_t    checksum;               /* CRC-32C hash of the 64KB table */
+    uint32_t    entry_length;           /* length in bytes, multiple of 1MB */
+    uint32_t    tail;                   /* byte offset of first log entry of a
+                                           seq, where this entry is the last
+                                           entry */
+    uint64_t    sequence_number;        /* incremented with each log entry.
+                                           May not be zero. */
+    uint32_t    descriptor_count;       /* number of descriptors in this log
+                                           entry, must be >= 0 */
+    uint32_t    reserved;
+    MSGUID      log_guid;               /* value of the log_guid from
+                                           vhdx_header.  If not found in
+                                           vhdx_header, it is invalid */
+    uint64_t    flushed_file_offset;    /* see spec for full details - this
+                                           should be vhdx file size in bytes */
+    uint64_t    last_file_offset;       /* size in bytes that all allocated
+                                           file structures fit into */
+} VHDXLogEntryHeader;
+
+#define VHDX_LOG_DESC_SIZE 32
+
+typedef struct QEMU_PACKED VHDXLogDescriptor {
+    uint32_t    signature;              /* "zero" or "desc" in ASCII */
+    union  {
+        uint32_t    reserved;           /* zero desc */
+        uint32_t    trailing_bytes;     /* data desc: bytes 4092-4096 of the
+                                           data sector */
+    };
+    union {
+        uint64_t    zero_length;        /* zero desc: length of the section to
+                                           zero */
+        uint64_t    leading_bytes;      /* data desc: bytes 0-7 of the data
+                                           sector */
+    };
+    uint64_t    file_offset;            /* file offset to write zeros - multiple
+                                           of 4kB */
+    uint64_t    sequence_number;        /* must match same field in
+                                           vhdx_log_entry_header */
+} VHDXLogDescriptor;
+
+typedef struct QEMU_PACKED VHDXLogDataSector {
+    uint32_t    data_signature;         /* "data" in ASCII */
+    uint32_t    sequence_high;          /* 4 MSB of 8 byte sequence_number */
+    uint8_t     data[4084];             /* raw data, bytes 8-4091 (inclusive).
+                                           see the data descriptor field for the
+                                           other mising bytes */
+    uint32_t    sequence_low;           /* 4 LSB of 8 byte sequence_number */
+} VHDXLogDataSector;
+
+
+
+/* block states - different state values depending on whether it is a
+ * payload block, or a sector block. */
+
+#define PAYLOAD_BLOCK_NOT_PRESENT       0
+#define PAYLOAD_BLOCK_UNDEFINED         1
+#define PAYLOAD_BLOCK_ZERO              2
+#define PAYLOAD_BLOCK_UNMAPPED          5
+#define PAYLOAD_BLOCK_FULL_PRESENT      6
+#define PAYLOAD_BLOCK_PARTIALLY_PRESENT 7
+
+#define SB_BLOCK_NOT_PRESENT    0
+#define SB_BLOCK_PRESENT        6
+
+/* per the spec */
+#define VHDX_MAX_SECTORS_PER_BLOCK  (1<<23)
+
+/* upper 44 bits are the file offset in 1MB units lower 3 bits are the state
+   other bits are reserved */
+#define VHDX_BAT_STATE_BIT_MASK 0x07
+#define VHDX_BAT_FILE_OFF_BITS (64-44)
+typedef uint64_t VHDXBatEntry;
+
+/* ---- METADATA REGION STRUCTURES ---- */
+
+#define VHDX_METADATA_ENTRY_SIZE 32
+#define VHDX_METADATA_MAX_ENTRIES 2047  /* not including the header */
+#define VHDX_METADATA_TABLE_MAX_SIZE \
+    (VHDX_METADATA_ENTRY_SIZE * (VHDX_METADATA_MAX_ENTRIES+1))
+typedef struct QEMU_PACKED VHDXMetadataTableHeader {
+    uint64_t    signature;              /* "metadata" in ASCII */
+    uint16_t    reserved;
+    uint16_t    entry_count;            /* number table entries. <= 2047 */
+    uint32_t    reserved2[5];
+} VHDXMetadataTableHeader;
+
+#define VHDX_META_FLAGS_IS_USER         0x01    /* max 1024 entries */
+#define VHDX_META_FLAGS_IS_VIRTUAL_DISK 0x02    /* virtual disk metadata if set,
+                                                   otherwise file metdata */
+#define VHDX_META_FLAGS_IS_REQUIRED     0x04    /* parse must understand this
+                                                   entry to open the file */
+typedef struct QEMU_PACKED VHDXMetadataTableEntry {
+    MSGUID      item_id;                /* 128-bit identifier for metadata */
+    uint32_t    offset;                 /* byte offset of the metadata.  At
+                                           least 64kB.  Relative to start of
+                                           metadata region */
+                                        /* note: if length = 0, so is offset */
+    uint32_t    length;                 /* length of metadata. <= 1MB. */
+    uint32_t    data_bits;      /* least-significant 3 bits are flags, the
+                                   rest are reserved (see above) */
+    uint32_t    reserved2;
+} VHDXMetadataTableEntry;
+
+#define VHDX_PARAMS_LEAVE_BLOCKS_ALLOCED 0x01   /* Do not change any blocks to
+                                                   be BLOCK_NOT_PRESENT.
+                                                   If set indicates a fixed
+                                                   size VHDX file */
+#define VHDX_PARAMS_HAS_PARENT           0x02    /* has parent / backing file */
+typedef struct QEMU_PACKED VHDXFileParameters {
+    uint32_t    block_size;             /* size of each payload block, always
+                                           power of 2, <= 256MB and >= 1MB. */
+    uint32_t data_bits;     /* least-significant 2 bits are flags, the rest
+                               are reserved (see above) */
+} VHDXFileParameters;
+
+typedef struct QEMU_PACKED VHDXVirtualDiskSize {
+    uint64_t    virtual_disk_size;      /* Size of the virtual disk, in bytes.
+                                           Must be multiple of the sector size,
+                                           max of 64TB */
+} VHDXVirtualDiskSize;
+
+typedef struct QEMU_PACKED VHDXPage83Data {
+    MSGUID      page_83_data[16];       /* unique id for scsi devices that
+                                           support page 0x83 */
+} VHDXPage83Data;
+
+typedef struct QEMU_PACKED VHDXVirtualDiskLogicalSectorSize {
+    uint32_t    logical_sector_size;    /* virtual disk sector size (in bytes).
+                                           Can only be 512 or 4096 bytes */
+} VHDXVirtualDiskLogicalSectorSize;
+
+typedef struct QEMU_PACKED VHDXVirtualDiskPhysicalSectorSize {
+    uint32_t    physical_sector_size;   /* physical sector size (in bytes).
+                                           Can only be 512 or 4096 bytes */
+} VHDXVirtualDiskPhysicalSectorSize;
+
+typedef struct QEMU_PACKED VHDXParentLocatorHeader {
+    MSGUID      locator_type[16];       /* type of the parent virtual disk. */
+    uint16_t    reserved;
+    uint16_t    key_value_count;        /* number of key/value pairs for this
+                                           locator */
+} VHDXParentLocatorHeader;
+
+/* key and value strings are UNICODE strings, UTF-16 LE encoding, no NULs */
+typedef struct QEMU_PACKED VHDXParentLocatorEntry {
+    uint32_t    key_offset;             /* offset in metadata for key, > 0 */
+    uint32_t    value_offset;           /* offset in metadata for value, >0 */
+    uint16_t    key_length;             /* length of entry key, > 0 */
+    uint16_t    value_length;           /* length of entry value, > 0 */
+} VHDXParentLocatorEntry;
+
+
+/* ----- END VHDX SPECIFICATION STRUCTURES ---- */
+
+
+uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size,
+                            int crc_offset);
+
+bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset);
+
+
+static void leguid_to_cpus(MSGUID *guid)
+{
+    le32_to_cpus(&guid->data1);
+    le16_to_cpus(&guid->data2);
+    le16_to_cpus(&guid->data3);
+}
+
+#endif
diff --git a/block/vmdk.c b/block/vmdk.c
index 51398c0c0..346bb5cad 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -24,19 +24,33 @@
  */
 
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
-#include "migration.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "migration/migration.h"
 #include <zlib.h>
 
 #define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
 #define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
 #define VMDK4_COMPRESSION_DEFLATE 1
+#define VMDK4_FLAG_NL_DETECT (1 << 0)
 #define VMDK4_FLAG_RGD (1 << 1)
+/* Zeroed-grain enable bit */
+#define VMDK4_FLAG_ZERO_GRAIN   (1 << 2)
 #define VMDK4_FLAG_COMPRESS (1 << 16)
 #define VMDK4_FLAG_MARKER (1 << 17)
 #define VMDK4_GD_AT_END 0xffffffffffffffffULL
 
+#define VMDK_GTE_ZEROED 0x1
+
+/* VMDK internal error codes */
+#define VMDK_OK      0
+#define VMDK_ERROR   (-1)
+/* Cluster not allocated */
+#define VMDK_UNALLOC (-2)
+#define VMDK_ZEROED  (-3)
+
+#define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain"
+
 typedef struct {
     uint32_t version;
     uint32_t flags;
@@ -48,19 +62,20 @@ typedef struct {
     uint32_t cylinders;
     uint32_t heads;
     uint32_t sectors_per_track;
-} VMDK3Header;
+} QEMU_PACKED VMDK3Header;
 
 typedef struct {
     uint32_t version;
     uint32_t flags;
-    int64_t capacity;
-    int64_t granularity;
-    int64_t desc_offset;
-    int64_t desc_size;
-    int32_t num_gtes_per_gte;
-    int64_t rgd_offset;
-    int64_t gd_offset;
-    int64_t grain_offset;
+    uint64_t capacity;
+    uint64_t granularity;
+    uint64_t desc_offset;
+    uint64_t desc_size;
+    /* Number of GrainTableEntries per GrainTable */
+    uint32_t num_gtes_per_gt;
+    uint64_t rgd_offset;
+    uint64_t gd_offset;
+    uint64_t grain_offset;
     char filler[1];
     char check_bytes[4];
     uint16_t compressAlgorithm;
@@ -73,6 +88,8 @@ typedef struct VmdkExtent {
     bool flat;
     bool compressed;
     bool has_marker;
+    bool has_zero_grain;
+    int version;
     int64_t sectors;
     int64_t end_sector;
     int64_t flat_start_offset;
@@ -93,7 +110,7 @@ typedef struct VmdkExtent {
 
 typedef struct BDRVVmdkState {
     CoMutex lock;
-    int desc_offset;
+    uint64_t desc_offset;
     bool cid_updated;
     uint32_t parent_cid;
     int num_extents;
@@ -108,13 +125,14 @@ typedef struct VmdkMetaData {
     unsigned int l2_index;
     unsigned int l2_offset;
     int valid;
+    uint32_t *l2_cache_entry;
 } VmdkMetaData;
 
 typedef struct VmdkGrainMarker {
     uint64_t lba;
     uint32_t size;
     uint8_t  data[0];
-} VmdkGrainMarker;
+} QEMU_PACKED VmdkGrainMarker;
 
 enum {
     MARKER_END_OF_STREAM    = 0,
@@ -368,15 +386,22 @@ static int vmdk_parent_open(BlockDriverState *bs)
 
 /* Create and append extent to the extent array. Return the added VmdkExtent
  * address. return NULL if allocation failed. */
-static VmdkExtent *vmdk_add_extent(BlockDriverState *bs,
+static int vmdk_add_extent(BlockDriverState *bs,
                            BlockDriverState *file, bool flat, int64_t sectors,
                            int64_t l1_offset, int64_t l1_backup_offset,
                            uint32_t l1_size,
-                           int l2_size, unsigned int cluster_sectors)
+                           int l2_size, uint64_t cluster_sectors,
+                           VmdkExtent **new_extent)
 {
     VmdkExtent *extent;
     BDRVVmdkState *s = bs->opaque;
 
+    if (cluster_sectors > 0x200000) {
+        /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
+        error_report("invalid granularity, image may be corrupt");
+        return -EINVAL;
+    }
+
     s->extents = g_realloc(s->extents,
                               (s->num_extents + 1) * sizeof(VmdkExtent));
     extent = &s->extents[s->num_extents];
@@ -399,7 +424,10 @@ static VmdkExtent *vmdk_add_extent(BlockDriverState *bs,
         extent->end_sector = extent->sectors;
     }
     bs->total_sectors = extent->end_sector;
-    return extent;
+    if (new_extent) {
+        *new_extent = extent;
+    }
+    return 0;
 }
 
 static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent)
@@ -458,12 +486,17 @@ static int vmdk_open_vmdk3(BlockDriverState *bs,
     if (ret < 0) {
         return ret;
     }
-    extent = vmdk_add_extent(bs,
+
+    ret = vmdk_add_extent(bs,
                              bs->file, false,
                              le32_to_cpu(header.disk_sectors),
                              le32_to_cpu(header.l1dir_offset) << 9,
                              0, 1 << 6, 1 << 9,
-                             le32_to_cpu(header.granularity));
+                             le32_to_cpu(header.granularity),
+                             &extent);
+    if (ret < 0) {
+        return ret;
+    }
     ret = vmdk_init_tables(bs, extent);
     if (ret) {
         /* free extent allocated by vmdk_add_extent */
@@ -473,7 +506,7 @@ static int vmdk_open_vmdk3(BlockDriverState *bs,
 }
 
 static int vmdk_open_desc_file(BlockDriverState *bs, int flags,
-                               int64_t desc_offset);
+                               uint64_t desc_offset);
 
 static int vmdk_open_vmdk4(BlockDriverState *bs,
                            BlockDriverState *file,
@@ -490,8 +523,11 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
     if (ret < 0) {
         return ret;
     }
-    if (header.capacity == 0 && header.desc_offset) {
-        return vmdk_open_desc_file(bs, flags, header.desc_offset << 9);
+    if (header.capacity == 0) {
+        uint64_t desc_offset = le64_to_cpu(header.desc_offset);
+        if (desc_offset) {
+            return vmdk_open_desc_file(bs, flags, desc_offset << 9);
+        }
     }
 
     if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
@@ -541,26 +577,54 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
         header = footer.header;
     }
 
-    l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gte)
+    if (le32_to_cpu(header.version) >= 3) {
+        char buf[64];
+        snprintf(buf, sizeof(buf), "VMDK version %d",
+                 le32_to_cpu(header.version));
+        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+                bs->device_name, "vmdk", buf);
+        return -ENOTSUP;
+    }
+
+    if (le32_to_cpu(header.num_gtes_per_gt) > 512) {
+        error_report("L2 table size too big");
+        return -EINVAL;
+    }
+
+    l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gt)
                         * le64_to_cpu(header.granularity);
     if (l1_entry_sectors == 0) {
         return -EINVAL;
     }
     l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
                 / l1_entry_sectors;
+    if (l1_size > 512 * 1024 * 1024) {
+        /* although with big capacity and small l1_entry_sectors, we can get a
+         * big l1_size, we don't want unbounded value to allocate the table.
+         * Limit it to 512M, which is 16PB for default cluster and L2 table
+         * size */
+        error_report("L1 size too big");
+        return -EFBIG;
+    }
     if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
         l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
     }
-    extent = vmdk_add_extent(bs, file, false,
+    ret = vmdk_add_extent(bs, file, false,
                           le64_to_cpu(header.capacity),
                           le64_to_cpu(header.gd_offset) << 9,
                           l1_backup_offset,
                           l1_size,
-                          le32_to_cpu(header.num_gtes_per_gte),
-                          le64_to_cpu(header.granularity));
+                          le32_to_cpu(header.num_gtes_per_gt),
+                          le64_to_cpu(header.granularity),
+                          &extent);
+    if (ret < 0) {
+        return ret;
+    }
     extent->compressed =
         le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
     extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
+    extent->version = le32_to_cpu(header.version);
+    extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN;
     ret = vmdk_init_tables(bs, extent);
     if (ret) {
         /* free extent allocated by vmdk_add_extent */
@@ -578,22 +642,22 @@ static int vmdk_parse_description(const char *desc, const char *opt_name,
 
     opt_pos = strstr(desc, opt_name);
     if (!opt_pos) {
-        return -1;
+        return VMDK_ERROR;
     }
     /* Skip "=\"" following opt_name */
     opt_pos += strlen(opt_name) + 2;
     if (opt_pos >= end) {
-        return -1;
+        return VMDK_ERROR;
     }
     opt_end = opt_pos;
     while (opt_end < end && *opt_end != '"') {
         opt_end++;
     }
     if (opt_end == end || buf_size < opt_end - opt_pos + 1) {
-        return -1;
+        return VMDK_ERROR;
     }
     pstrcpy(buf, opt_end - opt_pos + 1, opt_pos);
-    return 0;
+    return VMDK_OK;
 }
 
 /* Open an extent file and append to bs array */
@@ -616,7 +680,7 @@ static int vmdk_open_sparse(BlockDriverState *bs,
             return vmdk_open_vmdk4(bs, file, flags);
             break;
         default:
-            return -EINVAL;
+            return -EMEDIUMTYPE;
             break;
     }
 }
@@ -641,7 +705,7 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
          * RW [size in sectors] SPARSE "file-name.vmdk"
          */
         flat_offset = -1;
-        ret = sscanf(p, "%10s %" SCNd64 " %10s %511s %" SCNd64,
+        ret = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64,
                 access, &sectors, type, fname, &flat_offset);
         if (ret < 4 || strcmp(access, "RW")) {
             goto next_line;
@@ -653,14 +717,6 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
             return -EINVAL;
         }
 
-        /* trim the quotation marks around */
-        if (fname[0] == '"') {
-            memmove(fname, fname + 1, strlen(fname));
-            if (strlen(fname) <= 1 || fname[strlen(fname) - 1] != '"') {
-                return -EINVAL;
-            }
-            fname[strlen(fname) - 1] = '\0';
-        }
         if (sectors <= 0 ||
             (strcmp(type, "FLAT") && strcmp(type, "SPARSE")) ||
             (strcmp(access, "RW"))) {
@@ -669,7 +725,7 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
 
         path_combine(extent_path, sizeof(extent_path),
                 desc_file_path, fname);
-        ret = bdrv_file_open(&extent_file, extent_path, bs->open_flags);
+        ret = bdrv_file_open(&extent_file, extent_path, NULL, bs->open_flags);
         if (ret) {
             return ret;
         }
@@ -679,8 +735,11 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
             /* FLAT extent */
             VmdkExtent *extent;
 
-            extent = vmdk_add_extent(bs, extent_file, true, sectors,
-                            0, 0, 0, 0, sectors);
+            ret = vmdk_add_extent(bs, extent_file, true, sectors,
+                            0, 0, 0, 0, sectors, &extent);
+            if (ret < 0) {
+                return ret;
+            }
             extent->flat_start_offset = flat_offset << 9;
         } else if (!strcmp(type, "SPARSE")) {
             /* SPARSE extent */
@@ -705,33 +764,46 @@ next_line:
 }
 
 static int vmdk_open_desc_file(BlockDriverState *bs, int flags,
-                               int64_t desc_offset)
+                               uint64_t desc_offset)
 {
     int ret;
-    char buf[2048];
+    char *buf = NULL;
     char ct[128];
     BDRVVmdkState *s = bs->opaque;
+    int64_t size;
 
-    ret = bdrv_pread(bs->file, desc_offset, buf, sizeof(buf));
+    size = bdrv_getlength(bs->file);
+    if (size < 0) {
+        return -EINVAL;
+    }
+
+    size = MIN(size, 1 << 20);  /* avoid unbounded allocation */
+    buf = g_malloc0(size + 1);
+
+    ret = bdrv_pread(bs->file, desc_offset, buf, size);
     if (ret < 0) {
-        return ret;
+        goto exit;
     }
-    buf[2047] = '\0';
     if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) {
-        return -EINVAL;
+        ret = -EMEDIUMTYPE;
+        goto exit;
     }
     if (strcmp(ct, "monolithicFlat") &&
         strcmp(ct, "twoGbMaxExtentSparse") &&
         strcmp(ct, "twoGbMaxExtentFlat")) {
         fprintf(stderr,
                 "VMDK: Not supported image type \"%s\""".\n", ct);
-        return -ENOTSUP;
+        ret = -ENOTSUP;
+        goto exit;
     }
     s->desc_offset = 0;
-    return vmdk_parse_extents(buf, bs, bs->file->filename);
+    ret = vmdk_parse_extents(buf, bs, bs->file->filename);
+exit:
+    g_free(buf);
+    return ret;
 }
 
-static int vmdk_open(BlockDriverState *bs, int flags)
+static int vmdk_open(BlockDriverState *bs, QDict *options, int flags)
 {
     int ret;
     BDRVVmdkState *s = bs->opaque;
@@ -771,16 +843,17 @@ static int get_whole_cluster(BlockDriverState *bs,
                 uint64_t offset,
                 bool allocate)
 {
-    /* 128 sectors * 512 bytes each = grain size 64KB */
-    uint8_t  whole_grain[extent->cluster_sectors * 512];
+    int ret = VMDK_OK;
+    uint8_t *whole_grain = NULL;
 
     /* we will be here if it's first write on non-exist grain(cluster).
      * try to read from parent image, if exist */
     if (bs->backing_hd) {
-        int ret;
-
+        whole_grain =
+            qemu_blockalign(bs, extent->cluster_sectors << BDRV_SECTOR_BITS);
         if (!vmdk_is_cid_valid(bs)) {
-            return -1;
+            ret = VMDK_ERROR;
+            goto exit;
         }
 
         /* floor offset to cluster */
@@ -788,30 +861,35 @@ static int get_whole_cluster(BlockDriverState *bs,
         ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
                 extent->cluster_sectors);
         if (ret < 0) {
-            return -1;
+            ret = VMDK_ERROR;
+            goto exit;
         }
 
         /* Write grain only into the active image */
         ret = bdrv_write(extent->file, cluster_offset, whole_grain,
                 extent->cluster_sectors);
         if (ret < 0) {
-            return -1;
+            ret = VMDK_ERROR;
+            goto exit;
         }
     }
-    return 0;
+exit:
+    qemu_vfree(whole_grain);
+    return ret;
 }
 
 static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
 {
+    uint32_t offset;
+    QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
+    offset = cpu_to_le32(m_data->offset);
     /* update L2 table */
     if (bdrv_pwrite_sync(
                 extent->file,
                 ((int64_t)m_data->l2_offset * 512)
                     + (m_data->l2_index * sizeof(m_data->offset)),
-                &(m_data->offset),
-                sizeof(m_data->offset)
-            ) < 0) {
-        return -1;
+                &offset, sizeof(offset)) < 0) {
+        return VMDK_ERROR;
     }
     /* update backup L2 table */
     if (extent->l1_backup_table_offset != 0) {
@@ -820,13 +898,15 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
                     extent->file,
                     ((int64_t)m_data->l2_offset * 512)
                         + (m_data->l2_index * sizeof(m_data->offset)),
-                    &(m_data->offset), sizeof(m_data->offset)
-                ) < 0) {
-            return -1;
+                    &offset, sizeof(offset)) < 0) {
+            return VMDK_ERROR;
         }
     }
+    if (m_data->l2_cache_entry) {
+        *m_data->l2_cache_entry = offset;
+    }
 
-    return 0;
+    return VMDK_OK;
 }
 
 static int get_cluster_offset(BlockDriverState *bs,
@@ -838,24 +918,25 @@ static int get_cluster_offset(BlockDriverState *bs,
 {
     unsigned int l1_index, l2_offset, l2_index;
     int min_index, i, j;
-    uint32_t min_count, *l2_table, tmp = 0;
+    uint32_t min_count, *l2_table;
+    bool zeroed = false;
 
     if (m_data) {
         m_data->valid = 0;
     }
     if (extent->flat) {
         *cluster_offset = extent->flat_start_offset;
-        return 0;
+        return VMDK_OK;
     }
 
     offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
     l1_index = (offset >> 9) / extent->l1_entry_sectors;
     if (l1_index >= extent->l1_size) {
-        return -1;
+        return VMDK_ERROR;
     }
     l2_offset = extent->l1_table[l1_index];
     if (!l2_offset) {
-        return -1;
+        return VMDK_UNALLOC;
     }
     for (i = 0; i < L2_CACHE_SIZE; i++) {
         if (l2_offset == extent->l2_cache_offsets[i]) {
@@ -885,7 +966,7 @@ static int get_cluster_offset(BlockDriverState *bs,
                 l2_table,
                 extent->l2_size * sizeof(uint32_t)
             ) != extent->l2_size * sizeof(uint32_t)) {
-        return -1;
+        return VMDK_ERROR;
     }
 
     extent->l2_cache_offsets[min_index] = l2_offset;
@@ -894,9 +975,21 @@ static int get_cluster_offset(BlockDriverState *bs,
     l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
     *cluster_offset = le32_to_cpu(l2_table[l2_index]);
 
-    if (!*cluster_offset) {
+    if (m_data) {
+        m_data->valid = 1;
+        m_data->l1_index = l1_index;
+        m_data->l2_index = l2_index;
+        m_data->offset = *cluster_offset;
+        m_data->l2_offset = l2_offset;
+        m_data->l2_cache_entry = &l2_table[l2_index];
+    }
+    if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) {
+        zeroed = true;
+    }
+
+    if (!*cluster_offset || zeroed) {
         if (!allocate) {
-            return -1;
+            return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
         }
 
         /* Avoid the L2 tables update for the images that have snapshots. */
@@ -909,8 +1002,7 @@ static int get_cluster_offset(BlockDriverState *bs,
         }
 
         *cluster_offset >>= 9;
-        tmp = cpu_to_le32(*cluster_offset);
-        l2_table[l2_index] = tmp;
+        l2_table[l2_index] = cpu_to_le32(*cluster_offset);
 
         /* First of all we write grain itself, to avoid race condition
          * that may to corrupt the image.
@@ -919,19 +1011,15 @@ static int get_cluster_offset(BlockDriverState *bs,
          */
         if (get_whole_cluster(
                 bs, extent, *cluster_offset, offset, allocate) == -1) {
-            return -1;
+            return VMDK_ERROR;
         }
 
         if (m_data) {
-            m_data->offset = tmp;
-            m_data->l1_index = l1_index;
-            m_data->l2_index = l2_index;
-            m_data->l2_offset = l2_offset;
-            m_data->valid = 1;
+            m_data->offset = *cluster_offset;
         }
     }
     *cluster_offset <<= 9;
-    return 0;
+    return VMDK_OK;
 }
 
 static VmdkExtent *find_extent(BDRVVmdkState *s,
@@ -967,8 +1055,8 @@ static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs,
     ret = get_cluster_offset(bs, extent, NULL,
                             sector_num * 512, 0, &offset);
     qemu_co_mutex_unlock(&s->lock);
-    /* get_cluster_offset returning 0 means success */
-    ret = !ret;
+
+    ret = (ret == VMDK_OK || ret == VMDK_ZEROED);
 
     index_in_cluster = sector_num % extent->cluster_sectors;
     n = extent->cluster_sectors - index_in_cluster;
@@ -1111,9 +1199,9 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
         if (n > nb_sectors) {
             n = nb_sectors;
         }
-        if (ret) {
+        if (ret != VMDK_OK) {
             /* if not allocated, try to read from parent image, if exist */
-            if (bs->backing_hd) {
+            if (bs->backing_hd && ret != VMDK_ZEROED) {
                 if (!vmdk_is_cid_valid(bs)) {
                     return -EINVAL;
                 }
@@ -1150,8 +1238,19 @@ static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num,
     return ret;
 }
 
+/**
+ * vmdk_write:
+ * @zeroed:       buf is ignored (data is zero), use zeroed_grain GTE feature
+ *                if possible, otherwise return -ENOTSUP.
+ * @zero_dry_run: used for zeroed == true only, don't update L2 table, just try
+ *                with each cluster. By dry run we can find if the zero write
+ *                is possible without modifying image data.
+ *
+ * Returns: error code with 0 for success.
+ */
 static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
-                     const uint8_t *buf, int nb_sectors)
+                      const uint8_t *buf, int nb_sectors,
+                      bool zeroed, bool zero_dry_run)
 {
     BDRVVmdkState *s = bs->opaque;
     VmdkExtent *extent = NULL;
@@ -1181,7 +1280,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                                 sector_num << 9, !extent->compressed,
                                 &cluster_offset);
         if (extent->compressed) {
-            if (ret == 0) {
+            if (ret == VMDK_OK) {
                 /* Refuse write to allocated cluster for streamOptimized */
                 fprintf(stderr,
                         "VMDK: can't write to allocated cluster"
@@ -1197,7 +1296,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                                         &cluster_offset);
             }
         }
-        if (ret) {
+        if (ret == VMDK_ERROR) {
             return -EINVAL;
         }
         extent_begin_sector = extent->end_sector - extent->sectors;
@@ -1207,17 +1306,34 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
         if (n > nb_sectors) {
             n = nb_sectors;
         }
-
-        ret = vmdk_write_extent(extent,
-                        cluster_offset, index_in_cluster * 512,
-                        buf, n, sector_num);
-        if (ret) {
-            return ret;
-        }
-        if (m_data.valid) {
-            /* update L2 tables */
-            if (vmdk_L2update(extent, &m_data) == -1) {
-                return -EIO;
+        if (zeroed) {
+            /* Do zeroed write, buf is ignored */
+            if (extent->has_zero_grain &&
+                    index_in_cluster == 0 &&
+                    n >= extent->cluster_sectors) {
+                n = extent->cluster_sectors;
+                if (!zero_dry_run) {
+                    m_data.offset = VMDK_GTE_ZEROED;
+                    /* update L2 tables */
+                    if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
+                        return -EIO;
+                    }
+                }
+            } else {
+                return -ENOTSUP;
+            }
+        } else {
+            ret = vmdk_write_extent(extent,
+                            cluster_offset, index_in_cluster * 512,
+                            buf, n, sector_num);
+            if (ret) {
+                return ret;
+            }
+            if (m_data.valid) {
+                /* update L2 tables */
+                if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
+                    return -EIO;
+                }
             }
         }
         nb_sectors -= n;
@@ -1243,14 +1359,31 @@ static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num,
     int ret;
     BDRVVmdkState *s = bs->opaque;
     qemu_co_mutex_lock(&s->lock);
-    ret = vmdk_write(bs, sector_num, buf, nb_sectors);
+    ret = vmdk_write(bs, sector_num, buf, nb_sectors, false, false);
+    qemu_co_mutex_unlock(&s->lock);
+    return ret;
+}
+
+static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs,
+                                             int64_t sector_num,
+                                             int nb_sectors)
+{
+    int ret;
+    BDRVVmdkState *s = bs->opaque;
+    qemu_co_mutex_lock(&s->lock);
+    /* write zeroes could fail if sectors not aligned to cluster, test it with
+     * dry_run == true before really updating image */
+    ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true);
+    if (!ret) {
+        ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false);
+    }
     qemu_co_mutex_unlock(&s->lock);
     return ret;
 }
 
 
 static int vmdk_create_extent(const char *filename, int64_t filesize,
-                              bool flat, bool compress)
+                              bool flat, bool compress, bool zeroed_grain)
 {
     int ret, i;
     int fd = 0;
@@ -1272,18 +1405,19 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     }
     magic = cpu_to_be32(VMDK4_MAGIC);
     memset(&header, 0, sizeof(header));
-    header.version = 1;
-    header.flags =
-        3 | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0);
+    header.version = zeroed_grain ? 2 : 1;
+    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
+                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
+                   | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
     header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
     header.capacity = filesize / 512;
     header.granularity = 128;
-    header.num_gtes_per_gte = 512;
+    header.num_gtes_per_gt = 512;
 
     grains = (filesize / 512 + header.granularity - 1) / header.granularity;
-    gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
+    gt_size = ((header.num_gtes_per_gt * sizeof(uint32_t)) + 511) >> 9;
     gt_count =
-        (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte;
+        (grains + header.num_gtes_per_gt - 1) / header.num_gtes_per_gt;
     gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
 
     header.desc_offset = 1;
@@ -1299,7 +1433,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     header.flags = cpu_to_le32(header.flags);
     header.capacity = cpu_to_le64(header.capacity);
     header.granularity = cpu_to_le64(header.granularity);
-    header.num_gtes_per_gte = cpu_to_le32(header.num_gtes_per_gte);
+    header.num_gtes_per_gt = cpu_to_le32(header.num_gtes_per_gt);
     header.desc_offset = cpu_to_le64(header.desc_offset);
     header.desc_size = cpu_to_le64(header.desc_size);
     header.rgd_offset = cpu_to_le64(header.rgd_offset);
@@ -1365,7 +1499,7 @@ static int filename_decompose(const char *filename, char *path, char *prefix,
 
     if (filename == NULL || !strlen(filename)) {
         fprintf(stderr, "Vmdk: no filename provided.\n");
-        return -1;
+        return VMDK_ERROR;
     }
     p = strrchr(filename, '/');
     if (p == NULL) {
@@ -1377,7 +1511,7 @@ static int filename_decompose(const char *filename, char *path, char *prefix,
     if (p != NULL) {
         p++;
         if (p - filename >= buf_len) {
-            return -1;
+            return VMDK_ERROR;
         }
         pstrcpy(path, p - filename + 1, filename);
     } else {
@@ -1390,51 +1524,12 @@ static int filename_decompose(const char *filename, char *path, char *prefix,
         postfix[0] = '\0';
     } else {
         if (q - p >= buf_len) {
-            return -1;
+            return VMDK_ERROR;
         }
         pstrcpy(prefix, q - p + 1, p);
         pstrcpy(postfix, buf_len, q);
     }
-    return 0;
-}
-
-static int relative_path(char *dest, int dest_size,
-        const char *base, const char *target)
-{
-    int i = 0;
-    int n = 0;
-    const char *p, *q;
-#ifdef _WIN32
-    const char *sep = "\\";
-#else
-    const char *sep = "/";
-#endif
-
-    if (!(dest && base && target)) {
-        return -1;
-    }
-    if (path_is_absolute(target)) {
-        pstrcpy(dest, dest_size, target);
-        return 0;
-    }
-    while (base[i] == target[i]) {
-        i++;
-    }
-    p = &base[i];
-    q = &target[i];
-    while (*p) {
-        if (*p == *sep) {
-            n++;
-        }
-        p++;
-    }
-    dest[0] = '\0';
-    for (; n; n--) {
-        pstrcat(dest, dest_size, "..");
-        pstrcat(dest, dest_size, sep);
-    }
-    pstrcat(dest, dest_size, q);
-    return 0;
+    return VMDK_OK;
 }
 
 static int vmdk_create(const char *filename, QEMUOptionParameter *options)
@@ -1442,6 +1537,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
     int fd, idx = 0;
     char desc[BUF_SIZE];
     int64_t total_size = 0, filesize;
+    const char *adapter_type = NULL;
     const char *backing_file = NULL;
     const char *fmt = NULL;
     int flags = 0;
@@ -1453,6 +1549,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
     const char *desc_extent_line;
     char parent_desc_line[BUF_SIZE] = "";
     uint32_t parent_cid = 0xffffffff;
+    uint32_t number_heads = 16;
+    bool zeroed_grain = false;
     const char desc_template[] =
         "# Disk DescriptorFile\n"
         "version=1\n"
@@ -1469,9 +1567,9 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
         "\n"
         "ddb.virtualHWVersion = \"%d\"\n"
         "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
-        "ddb.geometry.heads = \"16\"\n"
+        "ddb.geometry.heads = \"%d\"\n"
         "ddb.geometry.sectors = \"63\"\n"
-        "ddb.adapterType = \"ide\"\n";
+        "ddb.adapterType = \"%s\"\n";
 
     if (filename_decompose(filename, path, prefix, postfix, PATH_MAX)) {
         return -EINVAL;
@@ -1480,15 +1578,33 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
     while (options && options->name) {
         if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
             total_size = options->value.n;
+        } else if (!strcmp(options->name, BLOCK_OPT_ADAPTER_TYPE)) {
+            adapter_type = options->value.s;
         } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
             backing_file = options->value.s;
         } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
             flags |= options->value.n ? BLOCK_FLAG_COMPAT6 : 0;
         } else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) {
             fmt = options->value.s;
+        } else if (!strcmp(options->name, BLOCK_OPT_ZEROED_GRAIN)) {
+            zeroed_grain |= options->value.n;
         }
         options++;
     }
+    if (!adapter_type) {
+        adapter_type = "ide";
+    } else if (strcmp(adapter_type, "ide") &&
+               strcmp(adapter_type, "buslogic") &&
+               strcmp(adapter_type, "lsilogic") &&
+               strcmp(adapter_type, "legacyESX")) {
+        fprintf(stderr, "VMDK: Unknown adapter type: '%s'.\n", adapter_type);
+        return -EINVAL;
+    }
+    if (strcmp(adapter_type, "ide") != 0) {
+        /* that's the number of heads with which vmware operates when
+           creating, exporting, etc. vmdk files with a non-ide adapter type */
+        number_heads = 255;
+    }
     if (!fmt) {
         /* Default format to monolithicSparse */
         fmt = "monolithicSparse";
@@ -1515,9 +1631,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
         return -ENOTSUP;
     }
     if (backing_file) {
-        char parent_filename[PATH_MAX];
         BlockDriverState *bs = bdrv_new("");
-        ret = bdrv_open(bs, backing_file, 0, NULL);
+        ret = bdrv_open(bs, backing_file, NULL, 0, NULL);
         if (ret != 0) {
             bdrv_delete(bs);
             return ret;
@@ -1528,10 +1643,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
         }
         parent_cid = vmdk_read_cid(bs, 0);
         bdrv_delete(bs);
-        relative_path(parent_filename, sizeof(parent_filename),
-                      filename, backing_file);
         snprintf(parent_desc_line, sizeof(parent_desc_line),
-                "parentFileNameHint=\"%s\"", parent_filename);
+                "parentFileNameHint=\"%s\"", backing_file);
     }
 
     /* Create extents */
@@ -1558,7 +1671,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
         snprintf(ext_filename, sizeof(ext_filename), "%s%s",
                 path, desc_filename);
 
-        if (vmdk_create_extent(ext_filename, size, flat, compress)) {
+        if (vmdk_create_extent(ext_filename, size,
+                               flat, compress, zeroed_grain)) {
             return -EINVAL;
         }
         filesize -= size;
@@ -1576,7 +1690,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
             parent_desc_line,
             ext_desc_lines,
             (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
-            total_size / (int64_t)(63 * 16 * 512));
+            total_size / (int64_t)(63 * number_heads * 512), number_heads,
+                adapter_type);
     if (split || flat) {
         fd = qemu_open(filename,
                        O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
@@ -1654,6 +1769,23 @@ static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs)
     return ret;
 }
 
+static int vmdk_has_zero_init(BlockDriverState *bs)
+{
+    int i;
+    BDRVVmdkState *s = bs->opaque;
+
+    /* If has a flat extent and its underlying storage doesn't have zero init,
+     * return 0. */
+    for (i = 0; i < s->num_extents; i++) {
+        if (s->extents[i].flat) {
+            if (!bdrv_has_zero_init(s->extents[i].file)) {
+                return 0;
+            }
+        }
+    }
+    return 1;
+}
+
 static QEMUOptionParameter vmdk_create_options[] = {
     {
         .name = BLOCK_OPT_SIZE,
@@ -1661,6 +1793,12 @@ static QEMUOptionParameter vmdk_create_options[] = {
         .help = "Virtual disk size"
     },
     {
+        .name = BLOCK_OPT_ADAPTER_TYPE,
+        .type = OPT_STRING,
+        .help = "Virtual adapter type, can be one of "
+                "ide (default), lsilogic, buslogic or legacyESX"
+    },
+    {
         .name = BLOCK_OPT_BACKING_FILE,
         .type = OPT_STRING,
         .help = "File name of a base image"
@@ -1677,24 +1815,31 @@ static QEMUOptionParameter vmdk_create_options[] = {
             "VMDK flat extent format, can be one of "
             "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} "
     },
+    {
+        .name = BLOCK_OPT_ZEROED_GRAIN,
+        .type = OPT_FLAG,
+        .help = "Enable efficient zero writes using the zeroed-grain GTE feature"
+    },
     { NULL }
 };
 
 static BlockDriver bdrv_vmdk = {
-    .format_name    = "vmdk",
-    .instance_size  = sizeof(BDRVVmdkState),
-    .bdrv_probe     = vmdk_probe,
-    .bdrv_open      = vmdk_open,
-    .bdrv_reopen_prepare = vmdk_reopen_prepare,
-    .bdrv_read      = vmdk_co_read,
-    .bdrv_write     = vmdk_co_write,
-    .bdrv_close     = vmdk_close,
-    .bdrv_create    = vmdk_create,
-    .bdrv_co_flush_to_disk  = vmdk_co_flush,
-    .bdrv_co_is_allocated   = vmdk_co_is_allocated,
-    .bdrv_get_allocated_file_size  = vmdk_get_allocated_file_size,
-
-    .create_options = vmdk_create_options,
+    .format_name                  = "vmdk",
+    .instance_size                = sizeof(BDRVVmdkState),
+    .bdrv_probe                   = vmdk_probe,
+    .bdrv_open                    = vmdk_open,
+    .bdrv_reopen_prepare          = vmdk_reopen_prepare,
+    .bdrv_read                    = vmdk_co_read,
+    .bdrv_write                   = vmdk_co_write,
+    .bdrv_co_write_zeroes         = vmdk_co_write_zeroes,
+    .bdrv_close                   = vmdk_close,
+    .bdrv_create                  = vmdk_create,
+    .bdrv_co_flush_to_disk        = vmdk_co_flush,
+    .bdrv_co_is_allocated         = vmdk_co_is_allocated,
+    .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
+    .bdrv_has_zero_init           = vmdk_has_zero_init,
+
+    .create_options               = vmdk_create_options,
 };
 
 static void bdrv_vmdk_init(void)
diff --git a/block/vpc.c b/block/vpc.c
index b6bf52f14..fe4f311d5 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -23,9 +23,12 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
-#include "migration.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "migration/migration.h"
+#if defined(CONFIG_UUID)
+#include <uuid/uuid.h>
+#endif
 
 /**************************************************************/
 
@@ -152,7 +155,7 @@ static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
     return 0;
 }
 
-static int vpc_open(BlockDriverState *bs, int flags)
+static int vpc_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVVPCState *s = bs->opaque;
     int i;
@@ -160,24 +163,33 @@ static int vpc_open(BlockDriverState *bs, int flags)
     struct vhd_dyndisk_header* dyndisk_header;
     uint8_t buf[HEADER_SIZE];
     uint32_t checksum;
-    int err = -1;
     int disk_type = VHD_DYNAMIC;
+    int ret;
 
-    if (bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE) != HEADER_SIZE)
+    ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE);
+    if (ret < 0) {
         goto fail;
+    }
 
     footer = (struct vhd_footer*) s->footer_buf;
     if (strncmp(footer->creator, "conectix", 8)) {
         int64_t offset = bdrv_getlength(bs->file);
-        if (offset < HEADER_SIZE) {
+        if (offset < 0) {
+            ret = offset;
+            goto fail;
+        } else if (offset < HEADER_SIZE) {
+            ret = -EINVAL;
             goto fail;
         }
+
         /* If a fixed disk, the footer is found only at the end of the file */
-        if (bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf, HEADER_SIZE)
-                != HEADER_SIZE) {
+        ret = bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf,
+                         HEADER_SIZE);
+        if (ret < 0) {
             goto fail;
         }
         if (strncmp(footer->creator, "conectix", 8)) {
+            ret = -EMEDIUMTYPE;
             goto fail;
         }
         disk_type = VHD_FIXED;
@@ -198,20 +210,23 @@ static int vpc_open(BlockDriverState *bs, int flags)
     bs->total_sectors = (int64_t)
         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
 
-    if (bs->total_sectors >= 65535 * 16 * 255) {
-        err = -EFBIG;
+    /* Allow a maximum disk size of approximately 2 TB */
+    if (bs->total_sectors >= 65535LL * 255 * 255) {
+        ret = -EFBIG;
         goto fail;
     }
 
     if (disk_type == VHD_DYNAMIC) {
-        if (bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf,
-                HEADER_SIZE) != HEADER_SIZE) {
+        ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf,
+                         HEADER_SIZE);
+        if (ret < 0) {
             goto fail;
         }
 
         dyndisk_header = (struct vhd_dyndisk_header *) buf;
 
         if (strncmp(dyndisk_header->magic, "cxsparse", 8)) {
+            ret = -EINVAL;
             goto fail;
         }
 
@@ -222,8 +237,10 @@ static int vpc_open(BlockDriverState *bs, int flags)
         s->pagetable = g_malloc(s->max_table_entries * 4);
 
         s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
-        if (bdrv_pread(bs->file, s->bat_offset, s->pagetable,
-                s->max_table_entries * 4) != s->max_table_entries * 4) {
+
+        ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable,
+                         s->max_table_entries * 4);
+        if (ret < 0) {
             goto fail;
         }
 
@@ -261,8 +278,13 @@ static int vpc_open(BlockDriverState *bs, int flags)
     migrate_add_blocker(s->migration_blocker);
 
     return 0;
- fail:
-    return err;
+
+fail:
+    g_free(s->pagetable);
+#ifdef CACHE
+    g_free(s->pageentry_u8);
+#endif
+    return ret;
 }
 
 static int vpc_reopen_prepare(BDRVReopenState *state,
@@ -524,19 +546,27 @@ static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num,
  * Note that the geometry doesn't always exactly match total_sectors but
  * may round it down.
  *
- * Returns 0 on success, -EFBIG if the size is larger than 127 GB
+ * Returns 0 on success, -EFBIG if the size is larger than ~2 TB. Override
+ * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
+ * and instead allow up to 255 heads.
  */
 static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
     uint8_t* heads, uint8_t* secs_per_cyl)
 {
     uint32_t cyls_times_heads;
 
-    if (total_sectors > 65535 * 16 * 255)
+    /* Allow a maximum disk size of approximately 2 TB */
+    if (total_sectors > 65535LL * 255 * 255) {
         return -EFBIG;
+    }
 
     if (total_sectors > 65535 * 16 * 63) {
         *secs_per_cyl = 255;
-        *heads = 16;
+        if (total_sectors > 65535 * 16 * 255) {
+            *heads = 255;
+        } else {
+            *heads = 16;
+        }
         cyls_times_heads = total_sectors / *secs_per_cyl;
     } else {
         *secs_per_cyl = 17;
@@ -739,7 +769,9 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options)
 
     footer->type = be32_to_cpu(disk_type);
 
-    /* TODO uuid is missing */
+#if defined(CONFIG_UUID)
+    uuid_generate(footer->uuid);
+#endif
 
     footer->checksum = be32_to_cpu(vpc_checksum(buf, HEADER_SIZE));
 
@@ -754,6 +786,18 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options)
     return ret;
 }
 
+static int vpc_has_zero_init(BlockDriverState *bs)
+{
+    BDRVVPCState *s = bs->opaque;
+    struct vhd_footer *footer =  (struct vhd_footer *) s->footer_buf;
+
+    if (cpu_to_be32(footer->type) == VHD_FIXED) {
+        return bdrv_has_zero_init(bs->file);
+    } else {
+        return 1;
+    }
+}
+
 static void vpc_close(BlockDriverState *bs)
 {
     BDRVVPCState *s = bs->opaque;
@@ -786,16 +830,17 @@ static BlockDriver bdrv_vpc = {
     .format_name    = "vpc",
     .instance_size  = sizeof(BDRVVPCState),
 
-    .bdrv_probe     = vpc_probe,
-    .bdrv_open      = vpc_open,
-    .bdrv_close     = vpc_close,
-    .bdrv_reopen_prepare = vpc_reopen_prepare,
-    .bdrv_create    = vpc_create,
+    .bdrv_probe             = vpc_probe,
+    .bdrv_open              = vpc_open,
+    .bdrv_close             = vpc_close,
+    .bdrv_reopen_prepare    = vpc_reopen_prepare,
+    .bdrv_create            = vpc_create,
 
     .bdrv_read              = vpc_co_read,
     .bdrv_write             = vpc_co_write,
 
-    .create_options = vpc_create_options,
+    .create_options         = vpc_create_options,
+    .bdrv_has_zero_init     = vpc_has_zero_init,
 };
 
 static void bdrv_vpc_init(void)
diff --git a/block/vvfat.c b/block/vvfat.c
index 59d3c5b8a..cd3b8edd9 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -1,4 +1,4 @@
-/* vim:set shiftwidth=4 ts=8: */
+/* vim:set shiftwidth=4 ts=4: */
 /*
  * QEMU Block driver for virtual VFAT (shadows a local directory)
  *
@@ -25,9 +25,11 @@
 #include <sys/stat.h>
 #include <dirent.h>
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
-#include "migration.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "migration/migration.h"
+#include "qapi/qmp/qint.h"
+#include "qapi/qmp/qbool.h"
 
 #ifndef S_IWGRP
 #define S_IWGRP 0
@@ -529,13 +531,9 @@ static inline uint8_t fat_chksum(const direntry_t* entry)
 /* if return_time==0, this returns the fat_date, else the fat_time */
 static uint16_t fat_datetime(time_t time,int return_time) {
     struct tm* t;
-#ifdef _WIN32
-    t=localtime(&time); /* this is not thread safe */
-#else
     struct tm t1;
     t = &t1;
     localtime_r(&time,t);
-#endif
     if(return_time)
 	return cpu_to_le16((t->tm_sec/2)|(t->tm_min<<5)|(t->tm_hour<<11));
     return cpu_to_le16((t->tm_mday)|((t->tm_mon+1)<<5)|((t->tm_year-80)<<9));
@@ -992,10 +990,90 @@ static void vvfat_rebind(BlockDriverState *bs)
     s->bs = bs;
 }
 
-static int vvfat_open(BlockDriverState *bs, const char* dirname, int flags)
+static QemuOptsList runtime_opts = {
+    .name = "vvfat",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+    .desc = {
+        {
+            .name = "dir",
+            .type = QEMU_OPT_STRING,
+            .help = "Host directory to map to the vvfat device",
+        },
+        {
+            .name = "fat-type",
+            .type = QEMU_OPT_NUMBER,
+            .help = "FAT type (12, 16 or 32)",
+        },
+        {
+            .name = "floppy",
+            .type = QEMU_OPT_BOOL,
+            .help = "Create a floppy rather than a hard disk image",
+        },
+        {
+            .name = "rw",
+            .type = QEMU_OPT_BOOL,
+            .help = "Make the image writable",
+        },
+        { /* end of list */ }
+    },
+};
+
+static void vvfat_parse_filename(const char *filename, QDict *options,
+                                 Error **errp)
+{
+    int fat_type = 0;
+    bool floppy = false;
+    bool rw = false;
+    int i;
+
+    if (!strstart(filename, "fat:", NULL)) {
+        error_setg(errp, "File name string must start with 'fat:'");
+        return;
+    }
+
+    /* Parse options */
+    if (strstr(filename, ":32:")) {
+        fat_type = 32;
+    } else if (strstr(filename, ":16:")) {
+        fat_type = 16;
+    } else if (strstr(filename, ":12:")) {
+        fat_type = 12;
+    }
+
+    if (strstr(filename, ":floppy:")) {
+        floppy = true;
+    }
+
+    if (strstr(filename, ":rw:")) {
+        rw = true;
+    }
+
+    /* Get the directory name without options */
+    i = strrchr(filename, ':') - filename;
+    assert(i >= 3);
+    if (filename[i - 2] == ':' && qemu_isalpha(filename[i - 1])) {
+        /* workaround for DOS drive names */
+        filename += i - 1;
+    } else {
+        filename += i + 1;
+    }
+
+    /* Fill in the options QDict */
+    qdict_put(options, "dir", qstring_from_str(filename));
+    qdict_put(options, "fat-type", qint_from_int(fat_type));
+    qdict_put(options, "floppy", qbool_from_int(floppy));
+    qdict_put(options, "rw", qbool_from_int(rw));
+}
+
+static int vvfat_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVVVFATState *s = bs->opaque;
-    int i, cyls, heads, secs;
+    int cyls, heads, secs;
+    bool floppy;
+    const char *dirname;
+    QemuOpts *opts;
+    Error *local_err = NULL;
+    int ret;
 
 #ifdef DEBUG
     vvv = s;
@@ -1006,6 +1084,65 @@ DLOG(if (stderr == NULL) {
     setbuf(stderr, NULL);
 })
 
+    opts = qemu_opts_create_nofail(&runtime_opts);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (error_is_set(&local_err)) {
+        qerror_report_err(local_err);
+        error_free(local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    dirname = qemu_opt_get(opts, "dir");
+    if (!dirname) {
+        qerror_report(ERROR_CLASS_GENERIC_ERROR, "vvfat block driver requires "
+                      "a 'dir' option");
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    s->fat_type = qemu_opt_get_number(opts, "fat-type", 0);
+    floppy = qemu_opt_get_bool(opts, "floppy", false);
+
+    if (floppy) {
+        /* 1.44MB or 2.88MB floppy.  2.88MB can be FAT12 (default) or FAT16. */
+        if (!s->fat_type) {
+            s->fat_type = 12;
+            secs = 36;
+            s->sectors_per_cluster = 2;
+        } else {
+            secs = s->fat_type == 12 ? 18 : 36;
+            s->sectors_per_cluster = 1;
+        }
+        s->first_sectors_number = 1;
+        cyls = 80;
+        heads = 2;
+    } else {
+        /* 32MB or 504MB disk*/
+        if (!s->fat_type) {
+            s->fat_type = 16;
+        }
+        cyls = s->fat_type == 12 ? 64 : 1024;
+        heads = 16;
+        secs = 63;
+    }
+
+    switch (s->fat_type) {
+    case 32:
+	    fprintf(stderr, "Big fat greek warning: FAT32 has not been tested. "
+                "You are welcome to do so!\n");
+        break;
+    case 16:
+    case 12:
+        break;
+    default:
+        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Valid FAT types are only "
+                      "12, 16 and 32");
+        ret = -EINVAL;
+        goto fail;
+    }
+
+
     s->bs = bs;
 
     /* LATER TODO: if FAT32, adjust */
@@ -1021,63 +1158,24 @@ DLOG(if (stderr == NULL) {
     s->fat2 = NULL;
     s->downcase_short_names = 1;
 
-    if (!strstart(dirname, "fat:", NULL))
-	return -1;
-
-    if (strstr(dirname, ":32:")) {
-	fprintf(stderr, "Big fat greek warning: FAT32 has not been tested. You are welcome to do so!\n");
-	s->fat_type = 32;
-    } else if (strstr(dirname, ":16:")) {
-	s->fat_type = 16;
-    } else if (strstr(dirname, ":12:")) {
-	s->fat_type = 12;
-    }
-
-    if (strstr(dirname, ":floppy:")) {
-	/* 1.44MB or 2.88MB floppy.  2.88MB can be FAT12 (default) or FAT16. */
-	if (!s->fat_type) {
-	    s->fat_type = 12;
-            secs = 36;
-	    s->sectors_per_cluster=2;
-	} else {
-            secs = s->fat_type == 12 ? 18 : 36;
-	    s->sectors_per_cluster=1;
-	}
-	s->first_sectors_number = 1;
-        cyls = 80;
-        heads = 2;
-    } else {
-	/* 32MB or 504MB disk*/
-	if (!s->fat_type) {
-	    s->fat_type = 16;
-	}
-        cyls = s->fat_type == 12 ? 64 : 1024;
-        heads = 16;
-        secs = 63;
-    }
     fprintf(stderr, "vvfat %s chs %d,%d,%d\n",
             dirname, cyls, heads, secs);
 
     s->sector_count = cyls * heads * secs - (s->first_sectors_number - 1);
 
-    if (strstr(dirname, ":rw:")) {
-	if (enable_write_target(s))
-	    return -1;
-	bs->read_only = 0;
+    if (qemu_opt_get_bool(opts, "rw", false)) {
+        ret = enable_write_target(s);
+        if (ret < 0) {
+            goto fail;
+        }
+        bs->read_only = 0;
     }
 
-    i = strrchr(dirname, ':') - dirname;
-    assert(i >= 3);
-    if (dirname[i-2] == ':' && qemu_isalpha(dirname[i-1]))
-	/* workaround for DOS drive names */
-	dirname += i-1;
-    else
-	dirname += i+1;
-
     bs->total_sectors = cyls * heads * secs;
 
     if (init_directories(s, dirname, heads, secs)) {
-	return -1;
+        ret = -EIO;
+        goto fail;
     }
 
     s->sector_count = s->faked_sectors + s->sectors_per_cluster*s->cluster_count;
@@ -1097,7 +1195,10 @@ DLOG(if (stderr == NULL) {
         migrate_add_blocker(s->migration_blocker);
     }
 
-    return 0;
+    ret = 0;
+fail:
+    qemu_opts_del(opts);
+    return ret;
 }
 
 static inline void vvfat_close_current_file(BDRVVVFATState *s)
@@ -2816,9 +2917,7 @@ static int enable_write_target(BDRVVVFATState *s)
     s->qcow_filename = g_malloc(1024);
     ret = get_tmp_filename(s->qcow_filename, 1024);
     if (ret < 0) {
-        g_free(s->qcow_filename);
-        s->qcow_filename = NULL;
-        return ret;
+        goto err;
     }
 
     bdrv_qcow = bdrv_find_format("qcow");
@@ -2826,18 +2925,18 @@ static int enable_write_target(BDRVVVFATState *s)
     set_option_parameter_int(options, BLOCK_OPT_SIZE, s->sector_count * 512);
     set_option_parameter(options, BLOCK_OPT_BACKING_FILE, "fat:");
 
-    if (bdrv_create(bdrv_qcow, s->qcow_filename, options) < 0)
-	return -1;
+    ret = bdrv_create(bdrv_qcow, s->qcow_filename, options);
+    if (ret < 0) {
+        goto err;
+    }
 
     s->qcow = bdrv_new("");
-    if (s->qcow == NULL) {
-        return -1;
-    }
 
-    ret = bdrv_open(s->qcow, s->qcow_filename,
+    ret = bdrv_open(s->qcow, s->qcow_filename, NULL,
             BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow);
     if (ret < 0) {
-	return ret;
+        bdrv_delete(s->qcow);
+        goto err;
     }
 
 #ifndef _WIN32
@@ -2850,6 +2949,11 @@ static int enable_write_target(BDRVVVFATState *s)
     *(void**)s->bs->backing_hd->opaque = s;
 
     return 0;
+
+err:
+    g_free(s->qcow_filename);
+    s->qcow_filename = NULL;
+    return ret;
 }
 
 static void vvfat_close(BlockDriverState *bs)
@@ -2869,15 +2973,18 @@ static void vvfat_close(BlockDriverState *bs)
 }
 
 static BlockDriver bdrv_vvfat = {
-    .format_name	= "vvfat",
-    .instance_size	= sizeof(BDRVVVFATState),
-    .bdrv_file_open	= vvfat_open,
-    .bdrv_rebind	= vvfat_rebind,
-    .bdrv_read          = vvfat_co_read,
-    .bdrv_write         = vvfat_co_write,
-    .bdrv_close		= vvfat_close,
-    .bdrv_co_is_allocated = vvfat_co_is_allocated,
-    .protocol_name	= "fat",
+    .format_name            = "vvfat",
+    .protocol_name          = "fat",
+    .instance_size          = sizeof(BDRVVVFATState),
+
+    .bdrv_parse_filename    = vvfat_parse_filename,
+    .bdrv_file_open         = vvfat_open,
+    .bdrv_close             = vvfat_close,
+    .bdrv_rebind            = vvfat_rebind,
+
+    .bdrv_read              = vvfat_co_read,
+    .bdrv_write             = vvfat_co_write,
+    .bdrv_co_is_allocated   = vvfat_co_is_allocated,
 };
 
 static void bdrv_vvfat_init(void)
diff --git a/block/win32-aio.c b/block/win32-aio.c
index 4704ee06c..fcb7c754d 100644
--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@@ -22,13 +22,13 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "qemu-timer.h"
-#include "block_int.h"
-#include "module.h"
-#include "qemu-common.h"
-#include "qemu-aio.h"
+#include "qemu/timer.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "block/aio.h"
 #include "raw-aio.h"
-#include "event_notifier.h"
+#include "qemu/event_notifier.h"
+#include "qemu/iov.h"
 #include <windows.h>
 #include <winioctl.h>
 
@@ -80,15 +80,9 @@ static void win32_aio_process_completion(QEMUWin32AIOState *s,
     if (!waiocb->is_linear) {
         if (ret == 0 && waiocb->is_read) {
             QEMUIOVector *qiov = waiocb->qiov;
-            char *p = waiocb->buf;
-            int i;
-
-            for (i = 0; i < qiov->niov; ++i) {
-                memcpy(p, qiov->iov[i].iov_base, qiov->iov[i].iov_len);
-                p += qiov->iov[i].iov_len;
-            }
-            g_free(waiocb->buf);
+            iov_from_buf(qiov->iov, qiov->niov, 0, waiocb->buf, qiov->size);
         }
+        qemu_vfree(waiocb->buf);
     }
 
 
@@ -153,13 +147,7 @@ BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,
     if (qiov->niov > 1) {
         waiocb->buf = qemu_blockalign(bs, qiov->size);
         if (type & QEMU_AIO_WRITE) {
-            char *p = waiocb->buf;
-            int i;
-
-            for (i = 0; i < qiov->niov; ++i) {
-                memcpy(p, qiov->iov[i].iov_base, qiov->iov[i].iov_len);
-                p += qiov->iov[i].iov_len;
-            }
+            iov_to_buf(qiov->iov, qiov->niov, 0, waiocb->buf, qiov->size);
         }
         waiocb->is_linear = false;
     } else {