Diffstat (limited to 'block')
48 files changed, 10437 insertions(+), 3134 deletions(-)
diff --git a/block/Makefile.objs b/block/Makefile.objs index 4cf9aa499..fd88c03ec 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -1,8 +1,9 @@ -block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o +block-obj-y += raw_bsd.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-y += qed-check.o -block-obj-y += vhdx.o +block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o +block-obj-$(CONFIG_QUORUM) += quorum.o block-obj-y += parallels.o blkdebug.o blkverify.o block-obj-y += snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o @@ -10,8 +11,9 @@ block-obj-$(CONFIG_POSIX) += raw-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o ifeq ($(CONFIG_POSIX),y) -block-obj-y += nbd.o sheepdog.o +block-obj-y += nbd.o nbd-client.o sheepdog.o block-obj-$(CONFIG_LIBISCSI) += iscsi.o +block-obj-$(CONFIG_LIBNFS) += nfs.o block-obj-$(CONFIG_CURL) += curl.o block-obj-$(CONFIG_RBD) += rbd.o block-obj-$(CONFIG_GLUSTERFS) += gluster.o @@ -23,4 +25,15 @@ common-obj-y += commit.o common-obj-y += mirror.o common-obj-y += backup.o -$(obj)/curl.o: QEMU_CFLAGS+=$(CURL_CFLAGS) +iscsi.o-cflags := $(LIBISCSI_CFLAGS) +iscsi.o-libs := $(LIBISCSI_LIBS) +curl.o-cflags := $(CURL_CFLAGS) +curl.o-libs := $(CURL_LIBS) +rbd.o-cflags := $(RBD_CFLAGS) +rbd.o-libs := $(RBD_LIBS) +gluster.o-cflags := $(GLUSTERFS_CFLAGS) +gluster.o-libs := $(GLUSTERFS_LIBS) +ssh.o-cflags := $(LIBSSH2_CFLAGS) +ssh.o-libs := $(LIBSSH2_LIBS) +qcow.o-libs := -lz +linux-aio.o-libs := -laio diff --git a/block/backup.c b/block/backup.c index 6ae8a05a3..15a2e55e8 100644 --- a/block/backup.c +++ b/block/backup.c @@ -138,7 +138,8 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, if (buffer_is_zero(iov.iov_base, iov.iov_len)) { ret = bdrv_co_write_zeroes(job->target, - start * BACKUP_SECTORS_PER_CLUSTER, n); + start * BACKUP_SECTORS_PER_CLUSTER, + n, BDRV_REQ_MAY_UNMAP); } else { ret = bdrv_co_writev(job->target, start * BACKUP_SECTORS_PER_CLUSTER, n, @@ -180,8 +181,13 @@ static int coroutine_fn backup_before_write_notify( void *opaque) { BdrvTrackedRequest *req = opaque; + int64_t sector_num = req->offset >> BDRV_SECTOR_BITS; + int nb_sectors = req->bytes >> BDRV_SECTOR_BITS; - return backup_do_cow(req->bs, req->sector_num, req->nb_sectors, NULL); + assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + + return backup_do_cow(req->bs, sector_num, nb_sectors, NULL); } static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp) @@ -202,9 +208,9 @@ static void backup_iostatus_reset(BlockJob *job) bdrv_iostatus_reset(s->target); } -static const BlockJobType backup_job_type = { +static const BlockJobDriver backup_job_driver = { .instance_size = sizeof(BackupBlockJob), - .job_type = "backup", + .job_type = BLOCK_JOB_TYPE_BACKUP, .set_speed = backup_set_speed, .iostatus_reset = backup_iostatus_reset, }; @@ -272,9 +278,9 @@ static void coroutine_fn backup_run(void *opaque) uint64_t delay_ns = ratelimit_calculate_delay( &job->limit, job->sectors_read); job->sectors_read = 0; - block_job_sleep_ns(&job->common, rt_clock, delay_ns); + block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns); } else { - block_job_sleep_ns(&job->common, rt_clock, 0); + block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0); } if 
(block_job_is_cancelled(&job->common)) { @@ -289,14 +295,14 @@ static void coroutine_fn backup_run(void *opaque) * backing file. */ for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) { - /* bdrv_co_is_allocated() only returns true/false based - * on the first set of sectors it comes accross that + /* bdrv_is_allocated() only returns true/false based + * on the first set of sectors it comes across that * are are all in the same state. * For that reason we must verify each sector in the * backup cluster length. We end up copying more than * needed but at some point that is always the case. */ alloced = - bdrv_co_is_allocated(bs, + bdrv_is_allocated(bs, start * BACKUP_SECTORS_PER_CLUSTER + i, BACKUP_SECTORS_PER_CLUSTER - i, &n); i += n; @@ -338,7 +344,7 @@ static void coroutine_fn backup_run(void *opaque) hbitmap_free(job->bitmap); bdrv_iostatus_disable(target); - bdrv_delete(target); + bdrv_unref(target); block_job_completed(&job->common, ret); } @@ -370,7 +376,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, return; } - BackupBlockJob *job = block_job_create(&backup_job_type, bs, speed, + BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed, cb, opaque, errp); if (!job) { return; diff --git a/block/blkdebug.c b/block/blkdebug.c index ccb627ad9..380c73610 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -168,6 +168,7 @@ static const char *event_names[BLKDBG_EVENT_MAX] = { [BLKDBG_REFTABLE_LOAD] = "reftable_load", [BLKDBG_REFTABLE_GROW] = "reftable_grow", + [BLKDBG_REFTABLE_UPDATE] = "reftable_update", [BLKDBG_REFBLOCK_LOAD] = "refblock_load", [BLKDBG_REFBLOCK_UPDATE] = "refblock_update", @@ -185,6 +186,14 @@ static const char *event_names[BLKDBG_EVENT_MAX] = { [BLKDBG_FLUSH_TO_OS] = "flush_to_os", [BLKDBG_FLUSH_TO_DISK] = "flush_to_disk", + + [BLKDBG_PWRITEV_RMW_HEAD] = "pwritev_rmw.head", + [BLKDBG_PWRITEV_RMW_AFTER_HEAD] = "pwritev_rmw.after_head", + [BLKDBG_PWRITEV_RMW_TAIL] = "pwritev_rmw.tail", + [BLKDBG_PWRITEV_RMW_AFTER_TAIL] = "pwritev_rmw.after_tail", + [BLKDBG_PWRITEV] = "pwritev", + [BLKDBG_PWRITEV_ZERO] = "pwritev_zero", + [BLKDBG_PWRITEV_DONE] = "pwritev_done", }; static int get_event_by_name(const char *name, BlkDebugEvent *event) @@ -270,19 +279,33 @@ static void remove_rule(BlkdebugRule *rule) g_free(rule); } -static int read_config(BDRVBlkdebugState *s, const char *filename) +static int read_config(BDRVBlkdebugState *s, const char *filename, + QDict *options, Error **errp) { - FILE *f; + FILE *f = NULL; int ret; struct add_rule_data d; + Error *local_err = NULL; - f = fopen(filename, "r"); - if (f == NULL) { - return -errno; + if (filename) { + f = fopen(filename, "r"); + if (f == NULL) { + error_setg_errno(errp, errno, "Could not read blkdebug config file"); + return -errno; + } + + ret = qemu_config_parse(f, config_groups, filename); + if (ret < 0) { + error_setg(errp, "Could not parse blkdebug config file"); + ret = -EINVAL; + goto fail; + } } - ret = qemu_config_parse(f, config_groups, filename); - if (ret < 0) { + qemu_config_parse_qdict(options, config_groups, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; goto fail; } @@ -297,7 +320,9 @@ static int read_config(BDRVBlkdebugState *s, const char *filename) fail: qemu_opts_reset(&inject_error_opts); qemu_opts_reset(&set_state_opts); - fclose(f); + if (f) { + fclose(f); + } return ret; } @@ -309,7 +334,9 @@ static void blkdebug_parse_filename(const char *filename, QDict *options, /* Parse the blkdebug: prefix */ if (!strstart(filename, 
"blkdebug:", &filename)) { - error_setg(errp, "File name string must start with 'blkdebug:'"); + /* There was no prefix; therefore, all options have to be already + present in the QDict (except for the filename) */ + qdict_put(options, "x-image", qstring_from_str(filename)); return; } @@ -345,53 +372,68 @@ static QemuOptsList runtime_opts = { .type = QEMU_OPT_STRING, .help = "[internal use only, will be removed]", }, + { + .name = "align", + .type = QEMU_OPT_SIZE, + .help = "Required alignment in bytes", + }, { /* end of list */ } }, }; -static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags) +static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVBlkdebugState *s = bs->opaque; QemuOpts *opts; Error *local_err = NULL; - const char *filename, *config; + const char *config; + uint64_t align; int ret; - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; - goto fail; + goto out; } - /* Read rules from config file */ + /* Read rules from config file or command line options */ config = qemu_opt_get(opts, "config"); - if (config) { - ret = read_config(s, config); - if (ret < 0) { - goto fail; - } + ret = read_config(s, config, options, errp); + if (ret) { + goto out; } /* Set initial state */ s->state = 1; /* Open the backing file */ - filename = qemu_opt_get(opts, "x-image"); - if (filename == NULL) { - ret = -EINVAL; - goto fail; + assert(bs->file == NULL); + ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-image"), options, "image", + flags | BDRV_O_PROTOCOL, false, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto out; } - ret = bdrv_file_open(&bs->file, filename, NULL, flags); - if (ret < 0) { - goto fail; + /* Set request alignment */ + align = qemu_opt_get_size(opts, "align", bs->request_alignment); + if (align > 0 && align < INT_MAX && !(align & (align - 1))) { + bs->request_alignment = align; + } else { + error_setg(errp, "Invalid alignment"); + ret = -EINVAL; + goto fail_unref; } ret = 0; -fail: + goto out; + +fail_unref: + bdrv_unref(bs->file); +out: qemu_opts_del(opts); return ret; } @@ -590,9 +632,9 @@ static int blkdebug_debug_breakpoint(BlockDriverState *bs, const char *event, static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag) { BDRVBlkdebugState *s = bs->opaque; - BlkdebugSuspendedReq *r; + BlkdebugSuspendedReq *r, *next; - QLIST_FOREACH(r, &s->suspended_reqs, next) { + QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, next) { if (!strcmp(r->tag, tag)) { qemu_coroutine_enter(r->co, NULL); return 0; @@ -601,6 +643,31 @@ static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag) return -ENOENT; } +static int blkdebug_debug_remove_breakpoint(BlockDriverState *bs, + const char *tag) +{ + BDRVBlkdebugState *s = bs->opaque; + BlkdebugSuspendedReq *r, *r_next; + BlkdebugRule *rule, *next; + int i, ret = -ENOENT; + + for (i = 0; i < BLKDBG_EVENT_MAX; i++) { + QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) { + if (rule->action == ACTION_SUSPEND && + !strcmp(rule->options.suspend.tag, tag)) { + remove_rule(rule); + ret = 0; + } + } + } + QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, r_next) { + if (!strcmp(r->tag, tag)) { + qemu_coroutine_enter(r->co, NULL); + ret = 0; + } + } + return ret; +} 
static bool blkdebug_debug_is_suspended(BlockDriverState *bs, const char *tag) { @@ -635,6 +702,8 @@ static BlockDriver bdrv_blkdebug = { .bdrv_debug_event = blkdebug_debug_event, .bdrv_debug_breakpoint = blkdebug_debug_breakpoint, + .bdrv_debug_remove_breakpoint + = blkdebug_debug_remove_breakpoint, .bdrv_debug_resume = blkdebug_debug_resume, .bdrv_debug_is_suspended = blkdebug_debug_is_suspended, }; diff --git a/block/blkverify.c b/block/blkverify.c index 1d58cc393..e1c31171c 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -78,7 +78,9 @@ static void blkverify_parse_filename(const char *filename, QDict *options, /* Parse the blkverify: prefix */ if (!strstart(filename, "blkverify:", &filename)) { - error_setg(errp, "File name string must start with 'blkverify:'"); + /* There was no prefix; therefore, all options have to be already + present in the QDict (except for the filename) */ + qdict_put(options, "x-image", qstring_from_str(filename)); return; } @@ -116,46 +118,37 @@ static QemuOptsList runtime_opts = { }, }; -static int blkverify_open(BlockDriverState *bs, QDict *options, int flags) +static int blkverify_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVBlkverifyState *s = bs->opaque; QemuOpts *opts; Error *local_err = NULL; - const char *filename, *raw; int ret; - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } - /* Parse the raw image filename */ - raw = qemu_opt_get(opts, "x-raw"); - if (raw == NULL) { - ret = -EINVAL; - goto fail; - } - - ret = bdrv_file_open(&bs->file, raw, NULL, flags); + /* Open the raw file */ + assert(bs->file == NULL); + ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-raw"), options, + "raw", flags | BDRV_O_PROTOCOL, false, &local_err); if (ret < 0) { + error_propagate(errp, local_err); goto fail; } /* Open the test file */ - filename = qemu_opt_get(opts, "x-image"); - if (filename == NULL) { - ret = -EINVAL; - goto fail; - } - - s->test_file = bdrv_new(""); - ret = bdrv_open(s->test_file, filename, NULL, flags, NULL); + assert(s->test_file == NULL); + ret = bdrv_open_image(&s->test_file, qemu_opt_get(opts, "x-image"), options, + "test", flags, false, &local_err); if (ret < 0) { - bdrv_delete(s->test_file); + error_propagate(errp, local_err); s->test_file = NULL; goto fail; } @@ -169,7 +162,7 @@ static void blkverify_close(BlockDriverState *bs) { BDRVBlkverifyState *s = bs->opaque; - bdrv_delete(s->test_file); + bdrv_unref(s->test_file); s->test_file = NULL; } @@ -180,110 +173,6 @@ static int64_t blkverify_getlength(BlockDriverState *bs) return bdrv_getlength(s->test_file); } -/** - * Check that I/O vector contents are identical - * - * @a: I/O vector - * @b: I/O vector - * @ret: Offset to first mismatching byte or -1 if match - */ -static ssize_t blkverify_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) -{ - int i; - ssize_t offset = 0; - - assert(a->niov == b->niov); - for (i = 0; i < a->niov; i++) { - size_t len = 0; - uint8_t *p = (uint8_t *)a->iov[i].iov_base; - uint8_t *q = (uint8_t *)b->iov[i].iov_base; - - assert(a->iov[i].iov_len == b->iov[i].iov_len); - while (len < a->iov[i].iov_len && *p++ == *q++) { - len++; - } - - offset += len; - - if (len != a->iov[i].iov_len) { - return offset; - } - } - return -1; -} - -typedef 
struct { - int src_index; - struct iovec *src_iov; - void *dest_base; -} IOVectorSortElem; - -static int sortelem_cmp_src_base(const void *a, const void *b) -{ - const IOVectorSortElem *elem_a = a; - const IOVectorSortElem *elem_b = b; - - /* Don't overflow */ - if (elem_a->src_iov->iov_base < elem_b->src_iov->iov_base) { - return -1; - } else if (elem_a->src_iov->iov_base > elem_b->src_iov->iov_base) { - return 1; - } else { - return 0; - } -} - -static int sortelem_cmp_src_index(const void *a, const void *b) -{ - const IOVectorSortElem *elem_a = a; - const IOVectorSortElem *elem_b = b; - - return elem_a->src_index - elem_b->src_index; -} - -/** - * Copy contents of I/O vector - * - * The relative relationships of overlapping iovecs are preserved. This is - * necessary to ensure identical semantics in the cloned I/O vector. - */ -static void blkverify_iovec_clone(QEMUIOVector *dest, const QEMUIOVector *src, - void *buf) -{ - IOVectorSortElem sortelems[src->niov]; - void *last_end; - int i; - - /* Sort by source iovecs by base address */ - for (i = 0; i < src->niov; i++) { - sortelems[i].src_index = i; - sortelems[i].src_iov = &src->iov[i]; - } - qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_base); - - /* Allocate buffer space taking into account overlapping iovecs */ - last_end = NULL; - for (i = 0; i < src->niov; i++) { - struct iovec *cur = sortelems[i].src_iov; - ptrdiff_t rewind = 0; - - /* Detect overlap */ - if (last_end && last_end > cur->iov_base) { - rewind = last_end - cur->iov_base; - } - - sortelems[i].dest_base = buf - rewind; - buf += cur->iov_len - MIN(rewind, cur->iov_len); - last_end = MAX(cur->iov_base + cur->iov_len, last_end); - } - - /* Sort by source iovec index and build destination iovec */ - qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_index); - for (i = 0; i < src->niov; i++) { - qemu_iovec_add(dest, sortelems[i].dest_base, src->iov[i].iov_len); - } -} - static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, @@ -347,7 +236,7 @@ static void blkverify_aio_cb(void *opaque, int ret) static void blkverify_verify_readv(BlkverifyAIOCB *acb) { - ssize_t offset = blkverify_iovec_compare(acb->qiov, &acb->raw_qiov); + ssize_t offset = qemu_iovec_compare(acb->qiov, &acb->raw_qiov); if (offset != -1) { blkverify_err(acb, "contents mismatch in sector %" PRId64, acb->sector_num + (int64_t)(offset / BDRV_SECTOR_SIZE)); @@ -365,7 +254,7 @@ static BlockDriverAIOCB *blkverify_aio_readv(BlockDriverState *bs, acb->verify = blkverify_verify_readv; acb->buf = qemu_blockalign(bs->file, qiov->size); qemu_iovec_init(&acb->raw_qiov, acb->qiov->niov); - blkverify_iovec_clone(&acb->raw_qiov, qiov, acb->buf); + qemu_iovec_clone(&acb->raw_qiov, qiov, acb->buf); bdrv_aio_readv(s->test_file, sector_num, qiov, nb_sectors, blkverify_aio_cb, acb); @@ -399,6 +288,20 @@ static BlockDriverAIOCB *blkverify_aio_flush(BlockDriverState *bs, return bdrv_aio_flush(s->test_file, cb, opaque); } +static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate) +{ + BDRVBlkverifyState *s = bs->opaque; + + bool perm = bdrv_recurse_is_first_non_filter(bs->file, candidate); + + if (perm) { + return true; + } + + return bdrv_recurse_is_first_non_filter(s->test_file, candidate); +} + static BlockDriver bdrv_blkverify = { .format_name = "blkverify", .protocol_name = "blkverify", @@ -412,6 +315,9 @@ static BlockDriver bdrv_blkverify = { .bdrv_aio_readv = 
blkverify_aio_readv, .bdrv_aio_writev = blkverify_aio_writev, .bdrv_aio_flush = blkverify_aio_flush, + + .is_filter = true, + .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter, }; static void bdrv_blkverify_init(void) diff --git a/block/bochs.c b/block/bochs.c index d7078c077..eacf956e7 100644 --- a/block/bochs.c +++ b/block/bochs.c @@ -39,56 +39,41 @@ // not allocated: 0xffffffff // always little-endian -struct bochs_header_v1 { - char magic[32]; // "Bochs Virtual HD Image" - char type[16]; // "Redolog" - char subtype[16]; // "Undoable" / "Volatile" / "Growing" - uint32_t version; - uint32_t header; // size of header - - union { - struct { - uint32_t catalog; // num of entries - uint32_t bitmap; // bitmap size - uint32_t extent; // extent size - uint64_t disk; // disk size - char padding[HEADER_SIZE - 64 - 8 - 20]; - } redolog; - char padding[HEADER_SIZE - 64 - 8]; - } extra; -}; - -// always little-endian struct bochs_header { - char magic[32]; // "Bochs Virtual HD Image" - char type[16]; // "Redolog" - char subtype[16]; // "Undoable" / "Volatile" / "Growing" + char magic[32]; /* "Bochs Virtual HD Image" */ + char type[16]; /* "Redolog" */ + char subtype[16]; /* "Undoable" / "Volatile" / "Growing" */ uint32_t version; - uint32_t header; // size of header + uint32_t header; /* size of header */ + + uint32_t catalog; /* num of entries */ + uint32_t bitmap; /* bitmap size */ + uint32_t extent; /* extent size */ union { - struct { - uint32_t catalog; // num of entries - uint32_t bitmap; // bitmap size - uint32_t extent; // extent size - uint32_t reserved; // for ??? - uint64_t disk; // disk size - char padding[HEADER_SIZE - 64 - 8 - 24]; - } redolog; - char padding[HEADER_SIZE - 64 - 8]; + struct { + uint32_t reserved; /* for ??? 
*/ + uint64_t disk; /* disk size */ + char padding[HEADER_SIZE - 64 - 20 - 12]; + } QEMU_PACKED redolog; + struct { + uint64_t disk; /* disk size */ + char padding[HEADER_SIZE - 64 - 20 - 8]; + } QEMU_PACKED redolog_v1; + char padding[HEADER_SIZE - 64 - 20]; } extra; -}; +} QEMU_PACKED; typedef struct BDRVBochsState { CoMutex lock; uint32_t *catalog_bitmap; - int catalog_size; + uint32_t catalog_size; - int data_offset; + uint32_t data_offset; - int bitmap_blocks; - int extent_blocks; - int extent_size; + uint32_t bitmap_blocks; + uint32_t extent_blocks; + uint32_t extent_size; } BDRVBochsState; static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename) @@ -108,12 +93,12 @@ static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int bochs_open(BlockDriverState *bs, QDict *options, int flags) +static int bochs_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVBochsState *s = bs->opaque; - int i; + uint32_t i; struct bochs_header bochs; - struct bochs_header_v1 header_v1; int ret; bs->read_only = 1; // no write support yet @@ -128,17 +113,24 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags) strcmp(bochs.subtype, GROWING_TYPE) || ((le32_to_cpu(bochs.version) != HEADER_VERSION) && (le32_to_cpu(bochs.version) != HEADER_V1))) { - return -EMEDIUMTYPE; + error_setg(errp, "Image not in Bochs format"); + return -EINVAL; } if (le32_to_cpu(bochs.version) == HEADER_V1) { - memcpy(&header_v1, &bochs, sizeof(bochs)); - bs->total_sectors = le64_to_cpu(header_v1.extra.redolog.disk) / 512; + bs->total_sectors = le64_to_cpu(bochs.extra.redolog_v1.disk) / 512; } else { - bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512; + bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512; + } + + /* Limit to 1M entries to avoid unbounded allocation. This is what is + * needed for the largest image that bximage can create (~8 TB). 
*/ + s->catalog_size = le32_to_cpu(bochs.catalog); + if (s->catalog_size > 0x100000) { + error_setg(errp, "Catalog size is too large"); + return -EFBIG; } - s->catalog_size = le32_to_cpu(bochs.extra.redolog.catalog); s->catalog_bitmap = g_malloc(s->catalog_size * 4); ret = bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap, @@ -152,10 +144,34 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags) s->data_offset = le32_to_cpu(bochs.header) + (s->catalog_size * 4); - s->bitmap_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.bitmap) - 1) / 512; - s->extent_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.extent) - 1) / 512; + s->bitmap_blocks = 1 + (le32_to_cpu(bochs.bitmap) - 1) / 512; + s->extent_blocks = 1 + (le32_to_cpu(bochs.extent) - 1) / 512; - s->extent_size = le32_to_cpu(bochs.extra.redolog.extent); + s->extent_size = le32_to_cpu(bochs.extent); + if (s->extent_size < BDRV_SECTOR_SIZE) { + /* bximage actually never creates extents smaller than 4k */ + error_setg(errp, "Extent size must be at least 512"); + ret = -EINVAL; + goto fail; + } else if (!is_power_of_2(s->extent_size)) { + error_setg(errp, "Extent size %" PRIu32 " is not a power of two", + s->extent_size); + ret = -EINVAL; + goto fail; + } else if (s->extent_size > 0x800000) { + error_setg(errp, "Extent size %" PRIu32 " is too large", + s->extent_size); + ret = -EINVAL; + goto fail; + } + + if (s->catalog_size < DIV_ROUND_UP(bs->total_sectors, + s->extent_size / BDRV_SECTOR_SIZE)) + { + error_setg(errp, "Catalog size is too small for this disk size"); + ret = -EINVAL; + goto fail; + } qemu_co_mutex_init(&s->lock); return 0; @@ -168,8 +184,8 @@ fail: static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) { BDRVBochsState *s = bs->opaque; - int64_t offset = sector_num * 512; - int64_t extent_index, extent_offset, bitmap_offset; + uint64_t offset = sector_num * 512; + uint64_t extent_index, extent_offset, bitmap_offset; char bitmap_entry; // seek to sector @@ -180,8 +196,9 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) return -1; /* not allocated */ } - bitmap_offset = s->data_offset + (512 * s->catalog_bitmap[extent_index] * - (s->extent_blocks + s->bitmap_blocks)); + bitmap_offset = s->data_offset + + (512 * (uint64_t) s->catalog_bitmap[extent_index] * + (s->extent_blocks + s->bitmap_blocks)); /* read in bitmap for current extent */ if (bdrv_pread(bs->file, bitmap_offset + (extent_offset / 8), diff --git a/block/cloop.c b/block/cloop.c index 6ea7cf404..b6ad50fbb 100644 --- a/block/cloop.c +++ b/block/cloop.c @@ -26,6 +26,9 @@ #include "qemu/module.h" #include <zlib.h> +/* Maximum compressed block size */ +#define MAX_BLOCK_SIZE (64 * 1024 * 1024) + typedef struct BDRVCloopState { CoMutex lock; uint32_t block_size; @@ -53,7 +56,8 @@ static int cloop_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int cloop_open(BlockDriverState *bs, QDict *options, int flags) +static int cloop_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVCloopState *s = bs->opaque; uint32_t offsets_size, max_compressed_block_size = 1, i; @@ -67,6 +71,26 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags) return ret; } s->block_size = be32_to_cpu(s->block_size); + if (s->block_size % 512) { + error_setg(errp, "block_size %u must be a multiple of 512", + s->block_size); + return -EINVAL; + } + if (s->block_size == 0) { + error_setg(errp, "block_size cannot be zero"); + return -EINVAL; + } + + /* cloop's 
create_compressed_fs.c warns about block sizes beyond 256 KB but + * we can accept more. Prevent ridiculous values like 4 GB - 1 since we + * need a buffer this big. + */ + if (s->block_size > MAX_BLOCK_SIZE) { + error_setg(errp, "block_size %u must be %u MB or less", + s->block_size, + MAX_BLOCK_SIZE / (1024 * 1024)); + return -EINVAL; + } ret = bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4); if (ret < 0) { @@ -75,7 +99,23 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags) s->n_blocks = be32_to_cpu(s->n_blocks); /* read offsets */ - offsets_size = s->n_blocks * sizeof(uint64_t); + if (s->n_blocks > (UINT32_MAX - 1) / sizeof(uint64_t)) { + /* Prevent integer overflow */ + error_setg(errp, "n_blocks %u must be %zu or less", + s->n_blocks, + (UINT32_MAX - 1) / sizeof(uint64_t)); + return -EINVAL; + } + offsets_size = (s->n_blocks + 1) * sizeof(uint64_t); + if (offsets_size > 512 * 1024 * 1024) { + /* Prevent ridiculous offsets_size which causes memory allocation to + * fail or overflows bdrv_pread() size. In practice the 512 MB + * offsets[] limit supports 16 TB images at 256 KB block size. + */ + error_setg(errp, "image requires too many offsets, " + "try increasing block size"); + return -EINVAL; + } s->offsets = g_malloc(offsets_size); ret = bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size); @@ -83,13 +123,37 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } - for(i=0;i<s->n_blocks;i++) { + for (i = 0; i < s->n_blocks + 1; i++) { + uint64_t size; + s->offsets[i] = be64_to_cpu(s->offsets[i]); - if (i > 0) { - uint32_t size = s->offsets[i] - s->offsets[i - 1]; - if (size > max_compressed_block_size) { - max_compressed_block_size = size; - } + if (i == 0) { + continue; + } + + if (s->offsets[i] < s->offsets[i - 1]) { + error_setg(errp, "offsets not monotonically increasing at " + "index %u, image file is corrupt", i); + ret = -EINVAL; + goto fail; + } + + size = s->offsets[i] - s->offsets[i - 1]; + + /* Compressed blocks should be smaller than the uncompressed block size + * but maybe compression performed poorly so the compressed block is + * actually bigger. Clamp down on unrealistic values to prevent + * ridiculous s->compressed_block allocation. + */ + if (size > 2 * MAX_BLOCK_SIZE) { + error_setg(errp, "invalid compressed block size at index %u, " + "image file is corrupt", i); + ret = -EINVAL; + goto fail; + } + + if (size > max_compressed_block_size) { + max_compressed_block_size = size; } } @@ -179,9 +243,7 @@ static coroutine_fn int cloop_co_read(BlockDriverState *bs, int64_t sector_num, static void cloop_close(BlockDriverState *bs) { BDRVCloopState *s = bs->opaque; - if (s->n_blocks > 0) { - g_free(s->offsets); - } + g_free(s->offsets); g_free(s->compressed_block); g_free(s->uncompressed_block); inflateEnd(&s->zstream); diff --git a/block/commit.c b/block/commit.c index 2227fc2e6..acec4ac5a 100644 --- a/block/commit.c +++ b/block/commit.c @@ -103,14 +103,14 @@ wait: /* Note that even when no rate limit is applied we need to yield * with no pending I/O here so that bdrv_drain_all() returns. 
*/ - block_job_sleep_ns(&s->common, rt_clock, delay_ns); + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); if (block_job_is_cancelled(&s->common)) { break; } /* Copy if allocated above the base */ - ret = bdrv_co_is_allocated_above(top, base, sector_num, - COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE, - &n); + ret = bdrv_is_allocated_above(top, base, sector_num, + COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE, + &n); copy = (ret == 1); trace_commit_one_iteration(s, sector_num, n, ret); if (copy) { @@ -173,9 +173,9 @@ static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp) ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); } -static const BlockJobType commit_job_type = { +static const BlockJobDriver commit_job_driver = { .instance_size = sizeof(CommitBlockJob), - .job_type = "commit", + .job_type = BLOCK_JOB_TYPE_COMMIT, .set_speed = commit_set_speed, }; @@ -198,13 +198,7 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, return; } - /* Once we support top == active layer, remove this check */ - if (top == bs) { - error_setg(errp, - "Top image as the active layer is currently unsupported"); - return; - } - + assert(top != bs); if (top == base) { error_setg(errp, "Invalid files for merge: top and base are the same"); return; @@ -238,7 +232,7 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, } - s = block_job_create(&commit_job_type, bs, speed, cb, opaque, errp); + s = block_job_create(&commit_job_driver, bs, speed, cb, opaque, errp); if (!s) { return; } diff --git a/block/cow.c b/block/cow.c index 1cc2e89c7..30deb88de 100644 --- a/block/cow.c +++ b/block/cow.c @@ -58,7 +58,8 @@ static int cow_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int cow_open(BlockDriverState *bs, QDict *options, int flags) +static int cow_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVCowState *s = bs->opaque; struct cow_header_v2 cow_header; @@ -73,7 +74,8 @@ static int cow_open(BlockDriverState *bs, QDict *options, int flags) } if (be32_to_cpu(cow_header.magic) != COW_MAGIC) { - ret = -EMEDIUMTYPE; + error_setg(errp, "Image not in COW format"); + ret = -EINVAL; goto fail; } @@ -81,7 +83,7 @@ static int cow_open(BlockDriverState *bs, QDict *options, int flags) char version[64]; snprintf(version, sizeof(version), "COW version %d", cow_header.version); - qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, bs->device_name, "cow", version); ret = -ENOTSUP; goto fail; @@ -102,42 +104,45 @@ static int cow_open(BlockDriverState *bs, QDict *options, int flags) return ret; } -/* - * XXX(hch): right now these functions are extremely inefficient. - * We should just read the whole bitmap we'll need in one go instead. 
- */ -static inline int cow_set_bit(BlockDriverState *bs, int64_t bitnum) +static inline void cow_set_bits(uint8_t *bitmap, int start, int64_t nb_sectors) { - uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8; - uint8_t bitmap; - int ret; - - ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap)); - if (ret < 0) { - return ret; + int64_t bitnum = start, last = start + nb_sectors; + while (bitnum < last) { + if ((bitnum & 7) == 0 && bitnum + 8 <= last) { + bitmap[bitnum / 8] = 0xFF; + bitnum += 8; + continue; + } + bitmap[bitnum/8] |= (1 << (bitnum % 8)); + bitnum++; } +} - bitmap |= (1 << (bitnum % 8)); +#define BITS_PER_BITMAP_SECTOR (512 * 8) - ret = bdrv_pwrite_sync(bs->file, offset, &bitmap, sizeof(bitmap)); - if (ret < 0) { - return ret; - } - return 0; +/* Cannot use bitmap.c on big-endian machines. */ +static int cow_test_bit(int64_t bitnum, const uint8_t *bitmap) +{ + return (bitmap[bitnum / 8] & (1 << (bitnum & 7))) != 0; } -static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum) +static int cow_find_streak(const uint8_t *bitmap, int value, int start, int nb_sectors) { - uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8; - uint8_t bitmap; - int ret; - - ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap)); - if (ret < 0) { - return ret; + int streak_value = value ? 0xFF : 0; + int last = MIN(start + nb_sectors, BITS_PER_BITMAP_SECTOR); + int bitnum = start; + while (bitnum < last) { + if ((bitnum & 7) == 0 && bitmap[bitnum / 8] == streak_value) { + bitnum += 8; + continue; + } + if (cow_test_bit(bitnum, bitmap) == value) { + bitnum++; + continue; + } + break; } - - return !!(bitmap & (1 << (bitnum % 8))); + return MIN(bitnum, last) - start; } /* Return true if first block has been changed (ie. current version is @@ -146,40 +151,100 @@ static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum) static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *num_same) { - int changed; + int64_t bitnum = sector_num + sizeof(struct cow_header_v2) * 8; + uint64_t offset = (bitnum / 8) & -BDRV_SECTOR_SIZE; + bool first = true; + int changed = 0, same = 0; - if (nb_sectors == 0) { - *num_same = nb_sectors; - return 0; - } + do { + int ret; + uint8_t bitmap[BDRV_SECTOR_SIZE]; - changed = is_bit_set(bs, sector_num); - if (changed < 0) { - return 0; /* XXX: how to return I/O errors? */ - } + bitnum &= BITS_PER_BITMAP_SECTOR - 1; + int sector_bits = MIN(nb_sectors, BITS_PER_BITMAP_SECTOR - bitnum); - for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) { - if (is_bit_set(bs, sector_num + *num_same) != changed) - break; - } + ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap)); + if (ret < 0) { + return ret; + } + + if (first) { + changed = cow_test_bit(bitnum, bitmap); + first = false; + } + + same += cow_find_streak(bitmap, changed, bitnum, nb_sectors); + + bitnum += sector_bits; + nb_sectors -= sector_bits; + offset += BDRV_SECTOR_SIZE; + } while (nb_sectors); + *num_same = same; return changed; } +static int64_t coroutine_fn cow_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *num_same) +{ + BDRVCowState *s = bs->opaque; + int ret = cow_co_is_allocated(bs, sector_num, nb_sectors, num_same); + int64_t offset = s->cow_sectors_offset + (sector_num << BDRV_SECTOR_BITS); + if (ret < 0) { + return ret; + } + return (ret ? 
BDRV_BLOCK_DATA : 0) | offset | BDRV_BLOCK_OFFSET_VALID; +} + static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { - int error = 0; - int i; + int64_t bitnum = sector_num + sizeof(struct cow_header_v2) * 8; + uint64_t offset = (bitnum / 8) & -BDRV_SECTOR_SIZE; + bool first = true; + int sector_bits; + + for ( ; nb_sectors; + bitnum += sector_bits, + nb_sectors -= sector_bits, + offset += BDRV_SECTOR_SIZE) { + int ret, set; + uint8_t bitmap[BDRV_SECTOR_SIZE]; + + bitnum &= BITS_PER_BITMAP_SECTOR - 1; + sector_bits = MIN(nb_sectors, BITS_PER_BITMAP_SECTOR - bitnum); + + ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap)); + if (ret < 0) { + return ret; + } + + /* Skip over any already set bits */ + set = cow_find_streak(bitmap, 1, bitnum, sector_bits); + bitnum += set; + sector_bits -= set; + nb_sectors -= set; + if (!sector_bits) { + continue; + } - for (i = 0; i < nb_sectors; i++) { - error = cow_set_bit(bs, sector_num + i); - if (error) { - break; + if (first) { + ret = bdrv_flush(bs->file); + if (ret < 0) { + return ret; + } + first = false; + } + + cow_set_bits(bitmap, bitnum, sector_bits); + + ret = bdrv_pwrite(bs->file, offset, &bitmap, sizeof(bitmap)); + if (ret < 0) { + return ret; } } - return error; + return 0; } static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num, @@ -189,7 +254,11 @@ static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num, int ret, n; while (nb_sectors > 0) { - if (bdrv_co_is_allocated(bs, sector_num, nb_sectors, &n)) { + ret = cow_co_is_allocated(bs, sector_num, nb_sectors, &n); + if (ret < 0) { + return ret; + } + if (ret) { ret = bdrv_pread(bs->file, s->cow_sectors_offset + sector_num * 512, buf, n * 512); @@ -255,12 +324,14 @@ static void cow_close(BlockDriverState *bs) { } -static int cow_create(const char *filename, QEMUOptionParameter *options) +static int cow_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { struct cow_header_v2 cow_header; struct stat st; int64_t image_sectors = 0; const char *image_filename = NULL; + Error *local_err = NULL; int ret; BlockDriverState *cow_bs; @@ -274,13 +345,17 @@ static int cow_create(const char *filename, QEMUOptionParameter *options) options++; } - ret = bdrv_create_file(filename, options); + ret = bdrv_create_file(filename, options, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } - ret = bdrv_file_open(&cow_bs, filename, NULL, BDRV_O_RDWR); + cow_bs = NULL; + ret = bdrv_open(&cow_bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } @@ -314,7 +389,7 @@ static int cow_create(const char *filename, QEMUOptionParameter *options) } exit: - bdrv_delete(cow_bs); + bdrv_unref(cow_bs); return ret; } @@ -344,7 +419,7 @@ static BlockDriver bdrv_cow = { .bdrv_read = cow_co_read, .bdrv_write = cow_co_write, - .bdrv_co_is_allocated = cow_co_is_allocated, + .bdrv_co_get_block_status = cow_co_get_block_status, .create_options = cow_create_options, }; diff --git a/block/curl.c b/block/curl.c index 82d39ff53..1b9b1f634 100644 --- a/block/curl.c +++ b/block/curl.c @@ -34,6 +34,11 @@ #define DPRINTF(fmt, ...) 
do { } while (0) #endif +#if LIBCURL_VERSION_NUM >= 0x071000 +/* The multi interface timer callback was introduced in 7.16.0 */ +#define NEED_CURL_TIMER_CALLBACK +#endif + #define PROTOCOLS (CURLPROTO_HTTP | CURLPROTO_HTTPS | \ CURLPROTO_FTP | CURLPROTO_FTPS | \ CURLPROTO_TFTP) @@ -77,6 +82,7 @@ typedef struct CURLState typedef struct BDRVCURLState { CURLM *multi; + QEMUTimer timer; size_t len; CURLState states[CURL_NUM_STATES]; char *url; @@ -86,7 +92,23 @@ typedef struct BDRVCURLState { static void curl_clean_state(CURLState *s); static void curl_multi_do(void *arg); -static int curl_aio_flush(void *opaque); + +#ifdef NEED_CURL_TIMER_CALLBACK +static int curl_timer_cb(CURLM *multi, long timeout_ms, void *opaque) +{ + BDRVCURLState *s = opaque; + + DPRINTF("CURL: timer callback timeout_ms %ld\n", timeout_ms); + if (timeout_ms == -1) { + timer_del(&s->timer); + } else { + int64_t timeout_ns = (int64_t)timeout_ms * 1000 * 1000; + timer_mod(&s->timer, + qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ns); + } + return 0; +} +#endif static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, void *s, void *sp) @@ -94,17 +116,16 @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, fd); switch (action) { case CURL_POLL_IN: - qemu_aio_set_fd_handler(fd, curl_multi_do, NULL, curl_aio_flush, s); + qemu_aio_set_fd_handler(fd, curl_multi_do, NULL, s); break; case CURL_POLL_OUT: - qemu_aio_set_fd_handler(fd, NULL, curl_multi_do, curl_aio_flush, s); + qemu_aio_set_fd_handler(fd, NULL, curl_multi_do, s); break; case CURL_POLL_INOUT: - qemu_aio_set_fd_handler(fd, curl_multi_do, curl_multi_do, - curl_aio_flush, s); + qemu_aio_set_fd_handler(fd, curl_multi_do, curl_multi_do, s); break; case CURL_POLL_REMOVE: - qemu_aio_set_fd_handler(fd, NULL, NULL, NULL, NULL); + qemu_aio_set_fd_handler(fd, NULL, NULL, NULL); break; } @@ -136,6 +157,11 @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque) if (!s || !s->orig_buf) goto read_end; + if (s->buf_off >= s->buf_len) { + /* buffer full, read nothing */ + return 0; + } + realsize = MIN(realsize, s->buf_len - s->buf_off); memcpy(s->orig_buf + s->buf_off, ptr, realsize); s->buf_off += realsize; @@ -211,20 +237,10 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len, return FIND_RET_NONE; } -static void curl_multi_do(void *arg) +static void curl_multi_read(BDRVCURLState *s) { - BDRVCURLState *s = (BDRVCURLState *)arg; - int running; - int r; int msgs_in_queue; - if (!s->multi) - return; - - do { - r = curl_multi_socket_all(s->multi, &running); - } while(r == CURLM_CALL_MULTI_PERFORM); - /* Try to find done transfers, so we can free the easy * handle again. 
*/ do { @@ -268,6 +284,41 @@ static void curl_multi_do(void *arg) } while(msgs_in_queue); } +static void curl_multi_do(void *arg) +{ + BDRVCURLState *s = (BDRVCURLState *)arg; + int running; + int r; + + if (!s->multi) { + return; + } + + do { + r = curl_multi_socket_all(s->multi, &running); + } while(r == CURLM_CALL_MULTI_PERFORM); + + curl_multi_read(s); +} + +static void curl_multi_timeout_do(void *arg) +{ +#ifdef NEED_CURL_TIMER_CALLBACK + BDRVCURLState *s = (BDRVCURLState *)arg; + int running; + + if (!s->multi) { + return; + } + + curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running); + + curl_multi_read(s); +#else + abort(); +#endif +} + static CURLState *curl_init_state(BDRVCURLState *s) { CURLState *state = NULL; @@ -397,7 +448,8 @@ static QemuOptsList runtime_opts = { }, }; -static int curl_open(BlockDriverState *bs, QDict *options, int flags) +static int curl_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVCURLState *s = bs->opaque; CURLState *state = NULL; @@ -409,30 +461,27 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags) static int inited = 0; if (flags & BDRV_O_RDWR) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, - "curl block device does not support writes"); + error_setg(errp, "curl block device does not support writes"); return -EROFS; } - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); goto out_noclean; } s->readahead_size = qemu_opt_get_size(opts, "readahead", READ_AHEAD_SIZE); if ((s->readahead_size & 0x1ff) != 0) { - fprintf(stderr, "HTTP_READAHEAD_SIZE %zd is not a multiple of 512\n", - s->readahead_size); + error_setg(errp, "HTTP_READAHEAD_SIZE %zd is not a multiple of 512", + s->readahead_size); goto out_noclean; } file = qemu_opt_get(opts, "url"); if (file == NULL) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, "curl block driver requires " - "an 'url' option"); + error_setg(errp, "curl block driver requires an 'url' option"); goto out_noclean; } @@ -474,12 +523,20 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags) curl_easy_cleanup(state->curl); state->curl = NULL; + aio_timer_init(bdrv_get_aio_context(bs), &s->timer, + QEMU_CLOCK_REALTIME, SCALE_NS, + curl_multi_timeout_do, s); + // Now we know the file exists and its size, so let's // initialize the multi interface! s->multi = curl_multi_init(); curl_multi_setopt(s->multi, CURLMOPT_SOCKETDATA, s); curl_multi_setopt(s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb); +#ifdef NEED_CURL_TIMER_CALLBACK + curl_multi_setopt(s->multi, CURLMOPT_TIMERDATA, s); + curl_multi_setopt(s->multi, CURLMOPT_TIMERFUNCTION, curl_timer_cb); +#endif curl_multi_do(s); qemu_opts_del(opts); @@ -495,21 +552,6 @@ out_noclean: return -EINVAL; } -static int curl_aio_flush(void *opaque) -{ - BDRVCURLState *s = opaque; - int i, j; - - for (i=0; i < CURL_NUM_STATES; i++) { - for(j=0; j < CURL_NUM_ACB; j++) { - if (s->states[i].acb[j]) { - return 1; - } - } - } - return 0; -} - static void curl_aio_cancel(BlockDriverAIOCB *blockacb) { // Do we have to implement canceling? Seems to work without... 
@@ -589,12 +631,6 @@ static BlockDriverAIOCB *curl_aio_readv(BlockDriverState *bs, acb->nb_sectors = nb_sectors; acb->bh = qemu_bh_new(curl_readv_bh_cb, acb); - - if (!acb->bh) { - DPRINTF("CURL: qemu_bh_new failed\n"); - return NULL; - } - qemu_bh_schedule(acb->bh); return &acb->common; } @@ -619,6 +655,9 @@ static void curl_close(BlockDriverState *bs) } if (s->multi) curl_multi_cleanup(s->multi); + + timer_del(&s->timer); + g_free(s->url); } diff --git a/block/dmg.c b/block/dmg.c index 3141cb5b8..856402e1f 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -27,6 +27,14 @@ #include "qemu/module.h" #include <zlib.h> +enum { + /* Limit chunk sizes to prevent unreasonable amounts of memory being used + * or truncating when converting to 32-bit types + */ + DMG_LENGTHS_MAX = 64 * 1024 * 1024, /* 64 MB */ + DMG_SECTORCOUNTS_MAX = DMG_LENGTHS_MAX / 512, +}; + typedef struct BDRVDMGState { CoMutex lock; /* each chunk contains a certain number of sectors, @@ -92,12 +100,44 @@ static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result) return 0; } -static int dmg_open(BlockDriverState *bs, QDict *options, int flags) +/* Increase max chunk sizes, if necessary. This function is used to calculate + * the buffer sizes needed for compressed/uncompressed chunk I/O. + */ +static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, + uint32_t *max_compressed_size, + uint32_t *max_sectors_per_chunk) +{ + uint32_t compressed_size = 0; + uint32_t uncompressed_sectors = 0; + + switch (s->types[chunk]) { + case 0x80000005: /* zlib compressed */ + compressed_size = s->lengths[chunk]; + uncompressed_sectors = s->sectorcounts[chunk]; + break; + case 1: /* copy */ + uncompressed_sectors = (s->lengths[chunk] + 511) / 512; + break; + case 2: /* zero */ + uncompressed_sectors = s->sectorcounts[chunk]; + break; + } + + if (compressed_size > *max_compressed_size) { + *max_compressed_size = compressed_size; + } + if (uncompressed_sectors > *max_sectors_per_chunk) { + *max_sectors_per_chunk = uncompressed_sectors; + } +} + +static int dmg_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVDMGState *s = bs->opaque; - uint64_t info_begin,info_end,last_in_offset,last_out_offset; + uint64_t info_begin, info_end, last_in_offset, last_out_offset; uint32_t count, tmp; - uint32_t max_compressed_size=1,max_sectors_per_chunk=1,i; + uint32_t max_compressed_size = 1, max_sectors_per_chunk = 1, i; int64_t offset; int ret; @@ -159,37 +199,40 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } - if (type == 0x6d697368 && count >= 244) { - int new_size, chunk_count; + if (type == 0x6d697368 && count >= 244) { + size_t new_size; + uint32_t chunk_count; offset += 4; offset += 200; - chunk_count = (count-204)/40; - new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count); - s->types = g_realloc(s->types, new_size/2); - s->offsets = g_realloc(s->offsets, new_size); - s->lengths = g_realloc(s->lengths, new_size); - s->sectors = g_realloc(s->sectors, new_size); - s->sectorcounts = g_realloc(s->sectorcounts, new_size); + chunk_count = (count - 204) / 40; + new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count); + s->types = g_realloc(s->types, new_size / 2); + s->offsets = g_realloc(s->offsets, new_size); + s->lengths = g_realloc(s->lengths, new_size); + s->sectors = g_realloc(s->sectors, new_size); + s->sectorcounts = g_realloc(s->sectorcounts, new_size); for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) { ret = read_uint32(bs, offset, &s->types[i]); if 
(ret < 0) { goto fail; } - offset += 4; - if(s->types[i]!=0x80000005 && s->types[i]!=1 && s->types[i]!=2) { - if(s->types[i]==0xffffffff) { - last_in_offset = s->offsets[i-1]+s->lengths[i-1]; - last_out_offset = s->sectors[i-1]+s->sectorcounts[i-1]; - } - chunk_count--; - i--; - offset += 36; - continue; - } - offset += 4; + offset += 4; + if (s->types[i] != 0x80000005 && s->types[i] != 1 && + s->types[i] != 2) { + if (s->types[i] == 0xffffffff && i > 0) { + last_in_offset = s->offsets[i - 1] + s->lengths[i - 1]; + last_out_offset = s->sectors[i - 1] + + s->sectorcounts[i - 1]; + } + chunk_count--; + i--; + offset += 36; + continue; + } + offset += 4; ret = read_uint64(bs, offset, &s->sectors[i]); if (ret < 0) { @@ -204,6 +247,14 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags) } offset += 8; + if (s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { + error_report("sector count %" PRIu64 " for chunk %u is " + "larger than max (%u)", + s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); + ret = -EINVAL; + goto fail; + } + ret = read_uint64(bs, offset, &s->offsets[i]); if (ret < 0) { goto fail; @@ -217,19 +268,25 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags) } offset += 8; - if(s->lengths[i]>max_compressed_size) - max_compressed_size = s->lengths[i]; - if(s->sectorcounts[i]>max_sectors_per_chunk) - max_sectors_per_chunk = s->sectorcounts[i]; - } - s->n_chunks+=chunk_count; - } + if (s->lengths[i] > DMG_LENGTHS_MAX) { + error_report("length %" PRIu64 " for chunk %u is larger " + "than max (%u)", + s->lengths[i], i, DMG_LENGTHS_MAX); + ret = -EINVAL; + goto fail; + } + + update_max_chunk_size(s, i, &max_compressed_size, + &max_sectors_per_chunk); + } + s->n_chunks += chunk_count; + } } /* initialize zlib engine */ - s->compressed_chunk = g_malloc(max_compressed_size+1); - s->uncompressed_chunk = g_malloc(512*max_sectors_per_chunk); - if(inflateInit(&s->zstream) != Z_OK) { + s->compressed_chunk = g_malloc(max_compressed_size + 1); + s->uncompressed_chunk = g_malloc(512 * max_sectors_per_chunk); + if (inflateInit(&s->zstream) != Z_OK) { ret = -EINVAL; goto fail; } @@ -251,83 +308,82 @@ fail: } static inline int is_sector_in_chunk(BDRVDMGState* s, - uint32_t chunk_num,int sector_num) + uint32_t chunk_num, uint64_t sector_num) { - if(chunk_num>=s->n_chunks || s->sectors[chunk_num]>sector_num || - s->sectors[chunk_num]+s->sectorcounts[chunk_num]<=sector_num) - return 0; - else - return -1; + if (chunk_num >= s->n_chunks || s->sectors[chunk_num] > sector_num || + s->sectors[chunk_num] + s->sectorcounts[chunk_num] <= sector_num) { + return 0; + } else { + return -1; + } } -static inline uint32_t search_chunk(BDRVDMGState* s,int sector_num) +static inline uint32_t search_chunk(BDRVDMGState *s, uint64_t sector_num) { /* binary search */ - uint32_t chunk1=0,chunk2=s->n_chunks,chunk3; - while(chunk1!=chunk2) { - chunk3 = (chunk1+chunk2)/2; - if(s->sectors[chunk3]>sector_num) - chunk2 = chunk3; - else if(s->sectors[chunk3]+s->sectorcounts[chunk3]>sector_num) - return chunk3; - else - chunk1 = chunk3; + uint32_t chunk1 = 0, chunk2 = s->n_chunks, chunk3; + while (chunk1 != chunk2) { + chunk3 = (chunk1 + chunk2) / 2; + if (s->sectors[chunk3] > sector_num) { + chunk2 = chunk3; + } else if (s->sectors[chunk3] + s->sectorcounts[chunk3] > sector_num) { + return chunk3; + } else { + chunk1 = chunk3; + } } return s->n_chunks; /* error */ } -static inline int dmg_read_chunk(BlockDriverState *bs, int sector_num) +static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t 
sector_num) { BDRVDMGState *s = bs->opaque; - if(!is_sector_in_chunk(s,s->current_chunk,sector_num)) { - int ret; - uint32_t chunk = search_chunk(s,sector_num); - - if(chunk>=s->n_chunks) - return -1; - - s->current_chunk = s->n_chunks; - switch(s->types[chunk]) { - case 0x80000005: { /* zlib compressed */ - int i; - - /* we need to buffer, because only the chunk as whole can be - * inflated. */ - i=0; - do { - ret = bdrv_pread(bs->file, s->offsets[chunk] + i, - s->compressed_chunk+i, s->lengths[chunk]-i); - if(ret<0 && errno==EINTR) - ret=0; - i+=ret; - } while(ret>=0 && ret+i<s->lengths[chunk]); - - if (ret != s->lengths[chunk]) - return -1; - - s->zstream.next_in = s->compressed_chunk; - s->zstream.avail_in = s->lengths[chunk]; - s->zstream.next_out = s->uncompressed_chunk; - s->zstream.avail_out = 512*s->sectorcounts[chunk]; - ret = inflateReset(&s->zstream); - if(ret != Z_OK) - return -1; - ret = inflate(&s->zstream, Z_FINISH); - if(ret != Z_STREAM_END || s->zstream.total_out != 512*s->sectorcounts[chunk]) - return -1; - break; } - case 1: /* copy */ - ret = bdrv_pread(bs->file, s->offsets[chunk], + if (!is_sector_in_chunk(s, s->current_chunk, sector_num)) { + int ret; + uint32_t chunk = search_chunk(s, sector_num); + + if (chunk >= s->n_chunks) { + return -1; + } + + s->current_chunk = s->n_chunks; + switch (s->types[chunk]) { + case 0x80000005: { /* zlib compressed */ + /* we need to buffer, because only the chunk as whole can be + * inflated. */ + ret = bdrv_pread(bs->file, s->offsets[chunk], + s->compressed_chunk, s->lengths[chunk]); + if (ret != s->lengths[chunk]) { + return -1; + } + + s->zstream.next_in = s->compressed_chunk; + s->zstream.avail_in = s->lengths[chunk]; + s->zstream.next_out = s->uncompressed_chunk; + s->zstream.avail_out = 512 * s->sectorcounts[chunk]; + ret = inflateReset(&s->zstream); + if (ret != Z_OK) { + return -1; + } + ret = inflate(&s->zstream, Z_FINISH); + if (ret != Z_STREAM_END || + s->zstream.total_out != 512 * s->sectorcounts[chunk]) { + return -1; + } + break; } + case 1: /* copy */ + ret = bdrv_pread(bs->file, s->offsets[chunk], s->uncompressed_chunk, s->lengths[chunk]); - if (ret != s->lengths[chunk]) - return -1; - break; - case 2: /* zero */ - memset(s->uncompressed_chunk, 0, 512*s->sectorcounts[chunk]); - break; - } - s->current_chunk = chunk; + if (ret != s->lengths[chunk]) { + return -1; + } + break; + case 2: /* zero */ + memset(s->uncompressed_chunk, 0, 512 * s->sectorcounts[chunk]); + break; + } + s->current_chunk = chunk; } return 0; } @@ -338,12 +394,14 @@ static int dmg_read(BlockDriverState *bs, int64_t sector_num, BDRVDMGState *s = bs->opaque; int i; - for(i=0;i<nb_sectors;i++) { - uint32_t sector_offset_in_chunk; - if(dmg_read_chunk(bs, sector_num+i) != 0) - return -1; - sector_offset_in_chunk = sector_num+i-s->sectors[s->current_chunk]; - memcpy(buf+i*512,s->uncompressed_chunk+sector_offset_in_chunk*512,512); + for (i = 0; i < nb_sectors; i++) { + uint32_t sector_offset_in_chunk; + if (dmg_read_chunk(bs, sector_num + i) != 0) { + return -1; + } + sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk]; + memcpy(buf + i * 512, + s->uncompressed_chunk + sector_offset_in_chunk * 512, 512); } return 0; } @@ -375,12 +433,12 @@ static void dmg_close(BlockDriverState *bs) } static BlockDriver bdrv_dmg = { - .format_name = "dmg", - .instance_size = sizeof(BDRVDMGState), - .bdrv_probe = dmg_probe, - .bdrv_open = dmg_open, - .bdrv_read = dmg_co_read, - .bdrv_close = dmg_close, + .format_name = "dmg", + .instance_size = 
sizeof(BDRVDMGState), + .bdrv_probe = dmg_probe, + .bdrv_open = dmg_open, + .bdrv_read = dmg_co_read, + .bdrv_close = dmg_close, }; static void bdrv_dmg_init(void) diff --git a/block/gluster.c b/block/gluster.c index 645b7f12a..883608564 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -3,43 +3,26 @@ * * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com> * - * Pipe handling mechanism in AIO implementation is derived from - * block/rbd.c. Hence, + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. * - * Copyright (C) 2010-2011 Christian Brunner <chb@muc.de>, - * Josh Durgin <josh.durgin@dreamhost.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. */ #include <glusterfs/api/glfs.h> #include "block/block_int.h" -#include "qemu/sockets.h" #include "qemu/uri.h" typedef struct GlusterAIOCB { - BlockDriverAIOCB common; int64_t size; int ret; - bool *finished; QEMUBH *bh; + Coroutine *coroutine; } GlusterAIOCB; typedef struct BDRVGlusterState { struct glfs *glfs; - int fds[2]; struct glfs_fd *fd; - int qemu_aio_count; - int event_reader_pos; - GlusterAIOCB *event_acb; } BDRVGlusterState; -#define GLUSTER_FD_READ 0 -#define GLUSTER_FD_WRITE 1 - typedef struct GlusterConf { char *server; int port; @@ -50,11 +33,13 @@ typedef struct GlusterConf { static void qemu_gluster_gconf_free(GlusterConf *gconf) { - g_free(gconf->server); - g_free(gconf->volname); - g_free(gconf->image); - g_free(gconf->transport); - g_free(gconf); + if (gconf) { + g_free(gconf->server); + g_free(gconf->volname); + g_free(gconf->image); + g_free(gconf->transport); + g_free(gconf); + } } static int parse_volume_options(GlusterConf *gconf, char *path) @@ -95,7 +80,7 @@ static int parse_volume_options(GlusterConf *gconf, char *path) * 'server' specifies the server where the volume file specification for * the given volume resides. This can be either hostname, ipv4 address * or ipv6 address. ipv6 address needs to be within square brackets [ ]. - * If transport type is 'unix', then 'server' field should not be specifed. + * If transport type is 'unix', then 'server' field should not be specified. * The 'socket' field needs to be populated with the path to unix domain * socket. * @@ -132,7 +117,7 @@ static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename) } /* transport */ - if (!strcmp(uri->scheme, "gluster")) { + if (!uri->scheme || !strcmp(uri->scheme, "gluster")) { gconf->transport = g_strdup("tcp"); } else if (!strcmp(uri->scheme, "gluster+tcp")) { gconf->transport = g_strdup("tcp"); @@ -168,7 +153,7 @@ static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename) } gconf->server = g_strdup(qp->p[0].value); } else { - gconf->server = g_strdup(uri->server); + gconf->server = g_strdup(uri->server ? 
uri->server : "localhost"); gconf->port = uri->port; } @@ -180,7 +165,8 @@ out: return ret; } -static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename) +static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename, + Error **errp) { struct glfs *glfs = NULL; int ret; @@ -188,8 +174,8 @@ static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename) ret = qemu_gluster_parseuri(gconf, filename); if (ret < 0) { - error_report("Usage: file=gluster[+transport]://[server[:port]]/" - "volname/image[?socket=...]"); + error_setg(errp, "Usage: file=gluster[+transport]://[server[:port]]/" + "volname/image[?socket=...]"); errno = -ret; goto out; } @@ -216,9 +202,11 @@ static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename) ret = glfs_init(glfs); if (ret) { - error_report("Gluster connection failed for server=%s port=%d " - "volume=%s image=%s transport=%s", gconf->server, gconf->port, - gconf->volname, gconf->image, gconf->transport); + error_setg_errno(errp, errno, + "Gluster connection failed for server=%s port=%d " + "volume=%s image=%s transport=%s", gconf->server, + gconf->port, gconf->volname, gconf->image, + gconf->transport); goto out; } return glfs; @@ -232,54 +220,32 @@ out: return NULL; } -static void qemu_gluster_complete_aio(GlusterAIOCB *acb, BDRVGlusterState *s) +static void qemu_gluster_complete_aio(void *opaque) { - int ret; - bool *finished = acb->finished; - BlockDriverCompletionFunc *cb = acb->common.cb; - void *opaque = acb->common.opaque; - - if (!acb->ret || acb->ret == acb->size) { - ret = 0; /* Success */ - } else if (acb->ret < 0) { - ret = acb->ret; /* Read/Write failed */ - } else { - ret = -EIO; /* Partial read/write - fail it */ - } + GlusterAIOCB *acb = (GlusterAIOCB *)opaque; - s->qemu_aio_count--; - qemu_aio_release(acb); - cb(opaque, ret); - if (finished) { - *finished = true; - } + qemu_bh_delete(acb->bh); + acb->bh = NULL; + qemu_coroutine_enter(acb->coroutine, NULL); } -static void qemu_gluster_aio_event_reader(void *opaque) +/* + * AIO callback routine called from GlusterFS thread. 
+ */ +static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) { - BDRVGlusterState *s = opaque; - ssize_t ret; - - do { - char *p = (char *)&s->event_acb; - - ret = read(s->fds[GLUSTER_FD_READ], p + s->event_reader_pos, - sizeof(s->event_acb) - s->event_reader_pos); - if (ret > 0) { - s->event_reader_pos += ret; - if (s->event_reader_pos == sizeof(s->event_acb)) { - s->event_reader_pos = 0; - qemu_gluster_complete_aio(s->event_acb, s); - } - } - } while (ret < 0 && errno == EINTR); -} + GlusterAIOCB *acb = (GlusterAIOCB *)arg; -static int qemu_gluster_aio_flush_cb(void *opaque) -{ - BDRVGlusterState *s = opaque; + if (!ret || ret == acb->size) { + acb->ret = 0; /* Success */ + } else if (ret < 0) { + acb->ret = ret; /* Read/Write failed */ + } else { + acb->ret = -EIO; /* Partial read/write - fail it */ + } - return (s->qemu_aio_count > 0); + acb->bh = qemu_bh_new(qemu_gluster_complete_aio, acb); + qemu_bh_schedule(acb->bh); } /* TODO Convert to fine grained options */ @@ -296,60 +262,57 @@ static QemuOptsList runtime_opts = { }, }; +static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags) +{ + assert(open_flags != NULL); + + *open_flags |= O_BINARY; + + if (bdrv_flags & BDRV_O_RDWR) { + *open_flags |= O_RDWR; + } else { + *open_flags |= O_RDONLY; + } + + if ((bdrv_flags & BDRV_O_NOCACHE)) { + *open_flags |= O_DIRECT; + } +} + static int qemu_gluster_open(BlockDriverState *bs, QDict *options, - int bdrv_flags) + int bdrv_flags, Error **errp) { BDRVGlusterState *s = bs->opaque; - int open_flags = O_BINARY; + int open_flags = 0; int ret = 0; GlusterConf *gconf = g_malloc0(sizeof(GlusterConf)); QemuOpts *opts; Error *local_err = NULL; const char *filename; - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto out; } filename = qemu_opt_get(opts, "filename"); - - s->glfs = qemu_gluster_init(gconf, filename); + s->glfs = qemu_gluster_init(gconf, filename, errp); if (!s->glfs) { ret = -errno; goto out; } - if (bdrv_flags & BDRV_O_RDWR) { - open_flags |= O_RDWR; - } else { - open_flags |= O_RDONLY; - } - - if ((bdrv_flags & BDRV_O_NOCACHE)) { - open_flags |= O_DIRECT; - } + qemu_gluster_parse_flags(bdrv_flags, &open_flags); s->fd = glfs_open(s->glfs, gconf->image, open_flags); if (!s->fd) { ret = -errno; - goto out; } - ret = qemu_pipe(s->fds); - if (ret < 0) { - ret = -errno; - goto out; - } - fcntl(s->fds[GLUSTER_FD_READ], F_SETFL, O_NONBLOCK); - qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], - qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s); - out: qemu_opts_del(opts); qemu_gluster_gconf_free(gconf); @@ -365,24 +328,180 @@ out: return ret; } +typedef struct BDRVGlusterReopenState { + struct glfs *glfs; + struct glfs_fd *fd; +} BDRVGlusterReopenState; + + +static int qemu_gluster_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + int ret = 0; + BDRVGlusterReopenState *reop_s; + GlusterConf *gconf = NULL; + int open_flags = 0; + + assert(state != NULL); + assert(state->bs != NULL); + + state->opaque = g_malloc0(sizeof(BDRVGlusterReopenState)); + reop_s = state->opaque; + + qemu_gluster_parse_flags(state->flags, &open_flags); + + gconf = g_malloc0(sizeof(GlusterConf)); + + reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, 
errp); + if (reop_s->glfs == NULL) { + ret = -errno; + goto exit; + } + + reop_s->fd = glfs_open(reop_s->glfs, gconf->image, open_flags); + if (reop_s->fd == NULL) { + /* reop_s->glfs will be cleaned up in _abort */ + ret = -errno; + goto exit; + } + +exit: + /* state->opaque will be freed in either the _abort or _commit */ + qemu_gluster_gconf_free(gconf); + return ret; +} + +static void qemu_gluster_reopen_commit(BDRVReopenState *state) +{ + BDRVGlusterReopenState *reop_s = state->opaque; + BDRVGlusterState *s = state->bs->opaque; + + + /* close the old */ + if (s->fd) { + glfs_close(s->fd); + } + if (s->glfs) { + glfs_fini(s->glfs); + } + + /* use the newly opened image / connection */ + s->fd = reop_s->fd; + s->glfs = reop_s->glfs; + + g_free(state->opaque); + state->opaque = NULL; + + return; +} + + +static void qemu_gluster_reopen_abort(BDRVReopenState *state) +{ + BDRVGlusterReopenState *reop_s = state->opaque; + + if (reop_s == NULL) { + return; + } + + if (reop_s->fd) { + glfs_close(reop_s->fd); + } + + if (reop_s->glfs) { + glfs_fini(reop_s->glfs); + } + + g_free(state->opaque); + state->opaque = NULL; + + return; +} + +#ifdef CONFIG_GLUSTERFS_ZEROFILL +static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ + int ret; + GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); + BDRVGlusterState *s = bs->opaque; + off_t size = nb_sectors * BDRV_SECTOR_SIZE; + off_t offset = sector_num * BDRV_SECTOR_SIZE; + + acb->size = size; + acb->ret = 0; + acb->coroutine = qemu_coroutine_self(); + + ret = glfs_zerofill_async(s->fd, offset, size, &gluster_finish_aiocb, acb); + if (ret < 0) { + ret = -errno; + goto out; + } + + qemu_coroutine_yield(); + ret = acb->ret; + +out: + g_slice_free(GlusterAIOCB, acb); + return ret; +} + +static inline bool gluster_supports_zerofill(void) +{ + return 1; +} + +static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, + int64_t size) +{ + return glfs_zerofill(fd, offset, size); +} + +#else +static inline bool gluster_supports_zerofill(void) +{ + return 0; +} + +static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, + int64_t size) +{ + return 0; +} +#endif + static int qemu_gluster_create(const char *filename, - QEMUOptionParameter *options) + QEMUOptionParameter *options, Error **errp) { struct glfs *glfs; struct glfs_fd *fd; int ret = 0; + int prealloc = 0; int64_t total_size = 0; GlusterConf *gconf = g_malloc0(sizeof(GlusterConf)); - glfs = qemu_gluster_init(gconf, filename); + glfs = qemu_gluster_init(gconf, filename, errp); if (!glfs) { - ret = -errno; + ret = -EINVAL; goto out; } while (options && options->name) { if (!strcmp(options->name, BLOCK_OPT_SIZE)) { total_size = options->value.n / BDRV_SECTOR_SIZE; + } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) { + if (!options->value.s || !strcmp(options->value.s, "off")) { + prealloc = 0; + } else if (!strcmp(options->value.s, "full") && + gluster_supports_zerofill()) { + prealloc = 1; + } else { + error_setg(errp, "Invalid preallocation mode: '%s'" + " or GlusterFS doesn't support zerofill API", + options->value.s); + ret = -EINVAL; + goto out; + } } options++; } @@ -392,9 +511,15 @@ static int qemu_gluster_create(const char *filename, if (!fd) { ret = -errno; } else { - if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) { + if (!glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE)) { + if (prealloc && qemu_gluster_zerofill(fd, 0, + total_size * BDRV_SECTOR_SIZE)) { + ret =
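The qemu_gluster_reopen_prepare/commit/abort trio above follows the block layer's two-phase reopen contract: prepare() builds a second, fully opened connection without disturbing the live one; commit() swaps the pointers and tears down the old state; abort() tears down only the staged state. Reduced to its shape, with hypothetical names (Conn, conn_open, conn_close):

    /* Sketch of the two-phase reopen contract; the guest never observes
     * a half-open state on either path. */
    typedef struct Conn Conn;
    typedef struct ReopenState {
        Conn *old_conn;    /* live connection, untouched by prepare() */
        Conn *new_conn;    /* staged connection */
    } ReopenState;

    static int reopen_prepare(ReopenState *st, const char *filename, int flags)
    {
        st->new_conn = conn_open(filename, flags);  /* may fail safely */
        return st->new_conn ? 0 : -errno;
    }

    static void reopen_commit(ReopenState *st)
    {
        conn_close(st->old_conn);      /* point of no return */
        st->old_conn = st->new_conn;
        st->new_conn = NULL;
    }

    static void reopen_abort(ReopenState *st)
    {
        if (st->new_conn) {
            conn_close(st->new_conn);  /* old connection still intact */
            st->new_conn = NULL;
        }
    }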
-errno; + } + } else { ret = -errno; } + if (glfs_close(fd) != 0) { ret = -errno; } @@ -407,72 +532,18 @@ out: return ret; } -static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb) -{ - GlusterAIOCB *acb = (GlusterAIOCB *)blockacb; - bool finished = false; - - acb->finished = &finished; - while (!finished) { - qemu_aio_wait(); - } -} - -static const AIOCBInfo gluster_aiocb_info = { - .aiocb_size = sizeof(GlusterAIOCB), - .cancel = qemu_gluster_aio_cancel, -}; - -static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) -{ - GlusterAIOCB *acb = (GlusterAIOCB *)arg; - BlockDriverState *bs = acb->common.bs; - BDRVGlusterState *s = bs->opaque; - int retval; - - acb->ret = ret; - retval = qemu_write_full(s->fds[GLUSTER_FD_WRITE], &acb, sizeof(acb)); - if (retval != sizeof(acb)) { - /* - * Gluster AIO callback thread failed to notify the waiting - * QEMU thread about IO completion. - * - * Complete this IO request and make the disk inaccessible for - * subsequent reads and writes. - */ - error_report("Gluster failed to notify QEMU about IO completion"); - - qemu_mutex_lock_iothread(); /* We are in gluster thread context */ - acb->common.cb(acb->common.opaque, -EIO); - qemu_aio_release(acb); - s->qemu_aio_count--; - close(s->fds[GLUSTER_FD_READ]); - close(s->fds[GLUSTER_FD_WRITE]); - qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL, - NULL); - bs->drv = NULL; /* Make the disk inaccessible */ - qemu_mutex_unlock_iothread(); - } -} - -static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque, int write) +static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write) { int ret; - GlusterAIOCB *acb; + GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); BDRVGlusterState *s = bs->opaque; - size_t size; - off_t offset; - - offset = sector_num * BDRV_SECTOR_SIZE; - size = nb_sectors * BDRV_SECTOR_SIZE; - s->qemu_aio_count++; + size_t size = nb_sectors * BDRV_SECTOR_SIZE; + off_t offset = sector_num * BDRV_SECTOR_SIZE; - acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque); acb->size = size; acb->ret = 0; - acb->finished = NULL; + acb->coroutine = qemu_coroutine_self(); if (write) { ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0, @@ -483,14 +554,16 @@ static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs, } if (ret < 0) { + ret = -errno; goto out; } - return &acb->common; + + qemu_coroutine_yield(); + ret = acb->ret; out: - s->qemu_aio_count--; - qemu_aio_release(acb); - return NULL; + g_slice_free(GlusterAIOCB, acb); + return ret; } static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset) @@ -506,75 +579,68 @@ static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset) return 0; } -static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) +static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); + return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0); } -static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) +static coroutine_fn int 
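All of the converted gluster request functions share one shape: allocate a per-request ACB, record the current coroutine, fire the async call, yield, and return the result that the completion BH stored. A sketch reusing the hypothetical names from the earlier AIOCompletion example; submit_async() stands in for glfs_pwritev_async()/glfs_preadv_async()/glfs_fsync_async():

    static coroutine_fn int co_request(Conn *conn, off_t offset, size_t size)
    {
        AIOCompletion *c = g_slice_new(AIOCompletion);
        int ret;

        c->ret = 0;
        c->co = qemu_coroutine_self();

        ret = submit_async(conn, offset, size, library_cb, c);
        if (ret < 0) {
            ret = -errno;
            goto out;
        }

        qemu_coroutine_yield();        /* woken by completion_bh() */
        ret = c->ret;
    out:
        g_slice_free(AIOCompletion, c);
        return ret;
    }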
qemu_gluster_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); + return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1); } -static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs, - BlockDriverCompletionFunc *cb, void *opaque) +static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs) { int ret; - GlusterAIOCB *acb; + GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); BDRVGlusterState *s = bs->opaque; - acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque); acb->size = 0; acb->ret = 0; - acb->finished = NULL; - s->qemu_aio_count++; + acb->coroutine = qemu_coroutine_self(); ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb); if (ret < 0) { + ret = -errno; goto out; } - return &acb->common; + + qemu_coroutine_yield(); + ret = acb->ret; out: - s->qemu_aio_count--; - qemu_aio_release(acb); - return NULL; + g_slice_free(GlusterAIOCB, acb); + return ret; } #ifdef CONFIG_GLUSTERFS_DISCARD -static BlockDriverAIOCB *qemu_gluster_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BlockDriverCompletionFunc *cb, - void *opaque) +static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) { int ret; - GlusterAIOCB *acb; + GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); BDRVGlusterState *s = bs->opaque; - size_t size; - off_t offset; + size_t size = nb_sectors * BDRV_SECTOR_SIZE; + off_t offset = sector_num * BDRV_SECTOR_SIZE; - offset = sector_num * BDRV_SECTOR_SIZE; - size = nb_sectors * BDRV_SECTOR_SIZE; - - acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque); acb->size = 0; acb->ret = 0; - acb->finished = NULL; - s->qemu_aio_count++; + acb->coroutine = qemu_coroutine_self(); ret = glfs_discard_async(s->fd, offset, size, &gluster_finish_aiocb, acb); if (ret < 0) { + ret = -errno; goto out; } - return &acb->common; + + qemu_coroutine_yield(); + ret = acb->ret; out: - s->qemu_aio_count--; - qemu_aio_release(acb); - return NULL; + g_slice_free(GlusterAIOCB, acb); + return ret; } #endif @@ -609,10 +675,6 @@ static void qemu_gluster_close(BlockDriverState *bs) { BDRVGlusterState *s = bs->opaque; - close(s->fds[GLUSTER_FD_READ]); - close(s->fds[GLUSTER_FD_WRITE]); - qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL, NULL); - if (s->fd) { glfs_close(s->fd); s->fd = NULL; @@ -632,6 +694,11 @@ static QEMUOptionParameter qemu_gluster_create_options[] = { .type = OPT_SIZE, .help = "Virtual disk size" }, + { + .name = BLOCK_OPT_PREALLOC, + .type = OPT_STRING, + .help = "Preallocation mode (allowed values: off, full)" + }, { NULL } }; @@ -639,18 +706,25 @@ static BlockDriver bdrv_gluster = { .format_name = "gluster", .protocol_name = "gluster", .instance_size = sizeof(BDRVGlusterState), + .bdrv_needs_filename = true, .bdrv_file_open = qemu_gluster_open, + .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, + .bdrv_reopen_commit = qemu_gluster_reopen_commit, + .bdrv_reopen_abort = qemu_gluster_reopen_abort, .bdrv_close = qemu_gluster_close, .bdrv_create = qemu_gluster_create, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, - .bdrv_aio_readv = qemu_gluster_aio_readv, - .bdrv_aio_writev = qemu_gluster_aio_writev, - .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = 
qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_aio_discard = qemu_gluster_aio_discard, + .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif .create_options = qemu_gluster_create_options, }; @@ -659,18 +733,25 @@ static BlockDriver bdrv_gluster_tcp = { .format_name = "gluster", .protocol_name = "gluster+tcp", .instance_size = sizeof(BDRVGlusterState), + .bdrv_needs_filename = true, .bdrv_file_open = qemu_gluster_open, + .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, + .bdrv_reopen_commit = qemu_gluster_reopen_commit, + .bdrv_reopen_abort = qemu_gluster_reopen_abort, .bdrv_close = qemu_gluster_close, .bdrv_create = qemu_gluster_create, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, - .bdrv_aio_readv = qemu_gluster_aio_readv, - .bdrv_aio_writev = qemu_gluster_aio_writev, - .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_aio_discard = qemu_gluster_aio_discard, + .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif .create_options = qemu_gluster_create_options, }; @@ -679,18 +760,25 @@ static BlockDriver bdrv_gluster_unix = { .format_name = "gluster", .protocol_name = "gluster+unix", .instance_size = sizeof(BDRVGlusterState), + .bdrv_needs_filename = true, .bdrv_file_open = qemu_gluster_open, + .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, + .bdrv_reopen_commit = qemu_gluster_reopen_commit, + .bdrv_reopen_abort = qemu_gluster_reopen_abort, .bdrv_close = qemu_gluster_close, .bdrv_create = qemu_gluster_create, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, - .bdrv_aio_readv = qemu_gluster_aio_readv, - .bdrv_aio_writev = qemu_gluster_aio_writev, - .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_aio_discard = qemu_gluster_aio_discard, + .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif .create_options = qemu_gluster_create_options, }; @@ -699,18 +787,25 @@ static BlockDriver bdrv_gluster_rdma = { .format_name = "gluster", .protocol_name = "gluster+rdma", .instance_size = sizeof(BDRVGlusterState), + .bdrv_needs_filename = true, .bdrv_file_open = qemu_gluster_open, + .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, + .bdrv_reopen_commit = qemu_gluster_reopen_commit, + .bdrv_reopen_abort = qemu_gluster_reopen_abort, .bdrv_close = qemu_gluster_close, .bdrv_create = qemu_gluster_create, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, - .bdrv_aio_readv = qemu_gluster_aio_readv, - .bdrv_aio_writev = qemu_gluster_aio_writev, - .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_co_readv = 
qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_aio_discard = qemu_gluster_aio_discard, + .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif .create_options = qemu_gluster_create_options, }; diff --git a/block/iscsi.c b/block/iscsi.c index e7c1c2b53..f425573df 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -2,6 +2,7 @@ * QEMU Block driver for iSCSI images * * Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com> + * Copyright (c) 2012-2013 Peter Lieven <pl@kamp.de> * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -33,6 +34,8 @@ #include "trace.h" #include "block/scsi.h" #include "qemu/iov.h" +#include "sysemu/sysemu.h" +#include "qmp-commands.h" #include <iscsi/iscsi.h> #include <iscsi/scsi-lowlevel.h> @@ -50,8 +53,24 @@ typedef struct IscsiLun { uint64_t num_blocks; int events; QEMUTimer *nop_timer; + uint8_t lbpme; + uint8_t lbprz; + uint8_t has_write_same; + struct scsi_inquiry_logical_block_provisioning lbp; + struct scsi_inquiry_block_limits bl; + unsigned char *zeroblock; } IscsiLun; +typedef struct IscsiTask { + int status; + int complete; + int retries; + int do_retry; + struct scsi_task *task; + Coroutine *co; + QEMUBH *bh; +} IscsiTask; + typedef struct IscsiAIOCB { BlockDriverAIOCB common; QEMUIOVector *qiov; @@ -105,6 +124,50 @@ iscsi_schedule_bh(IscsiAIOCB *acb) qemu_bh_schedule(acb->bh); } +static void iscsi_co_generic_bh_cb(void *opaque) +{ + struct IscsiTask *iTask = opaque; + qemu_bh_delete(iTask->bh); + qemu_coroutine_enter(iTask->co, NULL); +} + +static void +iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *opaque) +{ + struct IscsiTask *iTask = opaque; + struct scsi_task *task = command_data; + + iTask->complete = 1; + iTask->status = status; + iTask->do_retry = 0; + iTask->task = task; + + if (iTask->retries-- > 0 && status == SCSI_STATUS_CHECK_CONDITION + && task->sense.key == SCSI_SENSE_UNIT_ATTENTION) { + error_report("iSCSI CheckCondition: %s", iscsi_get_error(iscsi)); + iTask->do_retry = 1; + goto out; + } + + if (status != SCSI_STATUS_GOOD) { + error_report("iSCSI Failure: %s", iscsi_get_error(iscsi)); + } + +out: + if (iTask->co) { + iTask->bh = qemu_bh_new(iscsi_co_generic_bh_cb, iTask); + qemu_bh_schedule(iTask->bh); + } +} + +static void iscsi_co_init_iscsitask(IscsiLun *iscsilun, struct IscsiTask *iTask) +{ + *iTask = (struct IscsiTask) { + .co = qemu_coroutine_self(), + .retries = ISCSI_CMD_RETRIES, + }; +} static void iscsi_abort_task_cb(struct iscsi_context *iscsi, int status, void *command_data, @@ -146,13 +209,6 @@ static const AIOCBInfo iscsi_aiocb_info = { static void iscsi_process_read(void *arg); static void iscsi_process_write(void *arg); -static int iscsi_process_flush(void *arg) -{ - IscsiLun *iscsilun = arg; - - return iscsi_queue_length(iscsilun->iscsi) > 0; -} - static void iscsi_set_events(IscsiLun *iscsilun) { @@ -166,7 +222,6 @@ iscsi_set_events(IscsiLun *iscsilun) qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), iscsi_process_read, (ev & POLLOUT) ? 
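iscsi_co_generic_cb() above centralizes the retry policy that the old per-command callbacks each duplicated: a CHECK CONDITION with the UNIT ATTENTION sense key (for instance after a target reset) is transient, so the request is replayed a bounded number of times. Every converted iscsi_co_* function then drives it with the same loop; an annotated fragment, where the hypothetical issue_task() stands in for the command-specific libiscsi call:

    iscsi_co_init_iscsitask(iscsilun, &iTask);  /* retries = ISCSI_CMD_RETRIES */
    retry:
        /* real callers pass command-specific arguments here */
        if (issue_task(iscsilun, iscsi_co_generic_cb, &iTask) == NULL) {
            return -ENOMEM;
        }
        while (!iTask.complete) {
            iscsi_set_events(iscsilun);  /* (re)arm read/write fd handlers */
            qemu_coroutine_yield();      /* resumed by the completion BH */
        }
        if (iTask.do_retry) {            /* transient UNIT ATTENTION */
            iTask.complete = 0;
            goto retry;
        }
        return iTask.status == SCSI_STATUS_GOOD ? 0 : -EIO;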
iscsi_process_write : NULL, - iscsi_process_flush, iscsilun); } @@ -194,44 +249,6 @@ iscsi_process_write(void *arg) iscsi_set_events(iscsilun); } -static int -iscsi_aio_writev_acb(IscsiAIOCB *acb); - -static void -iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status, - void *command_data, void *opaque) -{ - IscsiAIOCB *acb = opaque; - - trace_iscsi_aio_write16_cb(iscsi, status, acb, acb->canceled); - - g_free(acb->buf); - acb->buf = NULL; - - if (acb->canceled != 0) { - return; - } - - acb->status = 0; - if (status != 0) { - if (status == SCSI_STATUS_CHECK_CONDITION - && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION - && acb->retries-- > 0) { - scsi_free_scsi_task(acb->task); - acb->task = NULL; - if (iscsi_aio_writev_acb(acb) == 0) { - iscsi_set_events(acb->iscsilun); - return; - } - } - error_report("Failed to write16 data to iSCSI lun. %s", - iscsi_get_error(iscsi)); - acb->status = -EIO; - } - - iscsi_schedule_bh(acb); -} - static int64_t sector_lun2qemu(int64_t sector, IscsiLun *iscsilun) { return sector * iscsilun->block_size / BDRV_SECTOR_SIZE; @@ -256,406 +273,182 @@ static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors, return 1; } -static int -iscsi_aio_writev_acb(IscsiAIOCB *acb) +static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) { - struct iscsi_context *iscsi = acb->iscsilun->iscsi; - size_t size; - uint32_t num_sectors; + IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; uint64_t lba; -#if !defined(LIBISCSI_FEATURE_IOVECTOR) - struct iscsi_data data; -#endif - int ret; - - acb->canceled = 0; - acb->bh = NULL; - acb->status = -EINPROGRESS; - acb->buf = NULL; + uint32_t num_sectors; + uint8_t *data = NULL; + uint8_t *buf = NULL; - /* this will allow us to get rid of 'buf' completely */ - size = acb->nb_sectors * BDRV_SECTOR_SIZE; + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + return -EINVAL; + } + lba = sector_qemu2lun(sector_num, iscsilun); + num_sectors = sector_qemu2lun(nb_sectors, iscsilun); #if !defined(LIBISCSI_FEATURE_IOVECTOR) - data.size = MIN(size, acb->qiov->size); - /* if the iovec only contains one buffer we can pass it directly */ - if (acb->qiov->niov == 1) { - data.data = acb->qiov->iov[0].iov_base; + if (iov->niov == 1) { + data = iov->iov[0].iov_base; } else { - acb->buf = g_malloc(data.size); - qemu_iovec_to_buf(acb->qiov, 0, acb->buf, data.size); - data.data = acb->buf; + size_t size = MIN(nb_sectors * BDRV_SECTOR_SIZE, iov->size); + buf = g_malloc(size); + qemu_iovec_to_buf(iov, 0, buf, size); + data = buf; } #endif - - acb->task = malloc(sizeof(struct scsi_task)); - if (acb->task == NULL) { - error_report("iSCSI: Failed to allocate task for scsi WRITE16 " - "command. 
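Without LIBISCSI_FEATURE_IOVECTOR, the conversion above cannot hand the scatter list to libiscsi, so a multi-element iovec is flattened into one bounce buffer before WRITE(16); a single-element iovec is passed through untouched. The gather step in isolation (sketch of the hunk's logic):

    uint8_t *data, *bounce = NULL;

    if (iov->niov == 1) {
        data = iov->iov[0].iov_base;     /* zero-copy fast path */
    } else {
        size_t size = MIN(nb_sectors * BDRV_SECTOR_SIZE, iov->size);

        bounce = g_malloc(size);
        qemu_iovec_to_buf(iov, 0, bounce, size);  /* gather into one buffer */
        data = bounce;
    }
    /* ... issue the write and yield ... */
    g_free(bounce);                      /* g_free(NULL) is a no-op */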
%s", iscsi_get_error(iscsi)); - return -1; + iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: + iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba, + data, num_sectors * iscsilun->block_size, + iscsilun->block_size, 0, 0, 0, 0, 0, + iscsi_co_generic_cb, &iTask); + if (iTask.task == NULL) { + g_free(buf); + return -ENOMEM; } - memset(acb->task, 0, sizeof(struct scsi_task)); - - acb->task->xfer_dir = SCSI_XFER_WRITE; - acb->task->cdb_size = 16; - acb->task->cdb[0] = 0x8a; - lba = sector_qemu2lun(acb->sector_num, acb->iscsilun); - *(uint32_t *)&acb->task->cdb[2] = htonl(lba >> 32); - *(uint32_t *)&acb->task->cdb[6] = htonl(lba & 0xffffffff); - num_sectors = sector_qemu2lun(acb->nb_sectors, acb->iscsilun); - *(uint32_t *)&acb->task->cdb[10] = htonl(num_sectors); - acb->task->expxferlen = size; - #if defined(LIBISCSI_FEATURE_IOVECTOR) - ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task, - iscsi_aio_write16_cb, - NULL, - acb); -#else - ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task, - iscsi_aio_write16_cb, - &data, - acb); + scsi_task_set_iov_out(iTask.task, (struct scsi_iovec *) iov->iov, + iov->niov); #endif - if (ret != 0) { - scsi_free_scsi_task(acb->task); - g_free(acb->buf); - return -1; + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); } -#if defined(LIBISCSI_FEATURE_IOVECTOR) - scsi_task_set_iov_out(acb->task, (struct scsi_iovec*) acb->qiov->iov, acb->qiov->niov); -#endif - - return 0; -} - -static BlockDriverAIOCB * -iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque) -{ - IscsiLun *iscsilun = bs->opaque; - IscsiAIOCB *acb; - - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - return NULL; + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; } - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); - trace_iscsi_aio_writev(iscsilun->iscsi, sector_num, nb_sectors, opaque, acb); - - acb->iscsilun = iscsilun; - acb->qiov = qiov; - acb->nb_sectors = nb_sectors; - acb->sector_num = sector_num; - acb->retries = ISCSI_CMD_RETRIES; - - if (iscsi_aio_writev_acb(acb) != 0) { - qemu_aio_release(acb); - return NULL; + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; } - iscsi_set_events(iscsilun); - return &acb->common; -} - -static int -iscsi_aio_readv_acb(IscsiAIOCB *acb); - -static void -iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status, - void *command_data, void *opaque) -{ - IscsiAIOCB *acb = opaque; - - trace_iscsi_aio_read16_cb(iscsi, status, acb, acb->canceled); + g_free(buf); - if (acb->canceled != 0) { - return; - } - - acb->status = 0; - if (status != 0) { - if (status == SCSI_STATUS_CHECK_CONDITION - && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION - && acb->retries-- > 0) { - scsi_free_scsi_task(acb->task); - acb->task = NULL; - if (iscsi_aio_readv_acb(acb) == 0) { - iscsi_set_events(acb->iscsilun); - return; - } - } - error_report("Failed to read16 data from iSCSI lun. 
%s", - iscsi_get_error(iscsi)); - acb->status = -EIO; + if (iTask.status != SCSI_STATUS_GOOD) { + return -EIO; } - iscsi_schedule_bh(acb); + return 0; } -static int -iscsi_aio_readv_acb(IscsiAIOCB *acb) +static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) { - struct iscsi_context *iscsi = acb->iscsilun->iscsi; - size_t size; + IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; uint64_t lba; uint32_t num_sectors; - int ret; #if !defined(LIBISCSI_FEATURE_IOVECTOR) int i; #endif - acb->canceled = 0; - acb->bh = NULL; - acb->status = -EINPROGRESS; - acb->buf = NULL; - - size = acb->nb_sectors * BDRV_SECTOR_SIZE; - - acb->task = malloc(sizeof(struct scsi_task)); - if (acb->task == NULL) { - error_report("iSCSI: Failed to allocate task for scsi READ16 " - "command. %s", iscsi_get_error(iscsi)); - return -1; + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + return -EINVAL; } - memset(acb->task, 0, sizeof(struct scsi_task)); - acb->task->xfer_dir = SCSI_XFER_READ; - acb->task->expxferlen = size; - lba = sector_qemu2lun(acb->sector_num, acb->iscsilun); - num_sectors = sector_qemu2lun(acb->nb_sectors, acb->iscsilun); + lba = sector_qemu2lun(sector_num, iscsilun); + num_sectors = sector_qemu2lun(nb_sectors, iscsilun); - switch (acb->iscsilun->type) { + iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: + switch (iscsilun->type) { case TYPE_DISK: - acb->task->cdb_size = 16; - acb->task->cdb[0] = 0x88; - *(uint32_t *)&acb->task->cdb[2] = htonl(lba >> 32); - *(uint32_t *)&acb->task->cdb[6] = htonl(lba & 0xffffffff); - *(uint32_t *)&acb->task->cdb[10] = htonl(num_sectors); + iTask.task = iscsi_read16_task(iscsilun->iscsi, iscsilun->lun, lba, + num_sectors * iscsilun->block_size, + iscsilun->block_size, 0, 0, 0, 0, 0, + iscsi_co_generic_cb, &iTask); break; default: - acb->task->cdb_size = 10; - acb->task->cdb[0] = 0x28; - *(uint32_t *)&acb->task->cdb[2] = htonl(lba); - *(uint16_t *)&acb->task->cdb[7] = htons(num_sectors); + iTask.task = iscsi_read10_task(iscsilun->iscsi, iscsilun->lun, lba, + num_sectors * iscsilun->block_size, + iscsilun->block_size, +#if !defined(CONFIG_LIBISCSI_1_4) /* API change from 1.4.0 to 1.5.0 */ + 0, 0, 0, 0, 0, +#endif + iscsi_co_generic_cb, &iTask); break; } - - ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task, - iscsi_aio_read16_cb, - NULL, - acb); - if (ret != 0) { - scsi_free_scsi_task(acb->task); - return -1; + if (iTask.task == NULL) { + return -ENOMEM; } - #if defined(LIBISCSI_FEATURE_IOVECTOR) - scsi_task_set_iov_in(acb->task, (struct scsi_iovec*) acb->qiov->iov, acb->qiov->niov); + scsi_task_set_iov_in(iTask.task, (struct scsi_iovec *) iov->iov, iov->niov); #else - for (i = 0; i < acb->qiov->niov; i++) { - scsi_task_add_data_in_buffer(acb->task, - acb->qiov->iov[i].iov_len, - acb->qiov->iov[i].iov_base); + for (i = 0; i < iov->niov; i++) { + scsi_task_add_data_in_buffer(iTask.task, + iov->iov[i].iov_len, + iov->iov[i].iov_base); } #endif - return 0; -} -static BlockDriverAIOCB * -iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque) -{ - IscsiLun *iscsilun = bs->opaque; - IscsiAIOCB *acb; - - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - return NULL; + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); } - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); - trace_iscsi_aio_readv(iscsilun->iscsi, sector_num, 
nb_sectors, opaque, acb); - - acb->nb_sectors = nb_sectors; - acb->sector_num = sector_num; - acb->iscsilun = iscsilun; - acb->qiov = qiov; - acb->retries = ISCSI_CMD_RETRIES; - - if (iscsi_aio_readv_acb(acb) != 0) { - qemu_aio_release(acb); - return NULL; + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; } - iscsi_set_events(iscsilun); - return &acb->common; -} - -static int -iscsi_aio_flush_acb(IscsiAIOCB *acb); - -static void -iscsi_synccache10_cb(struct iscsi_context *iscsi, int status, - void *command_data, void *opaque) -{ - IscsiAIOCB *acb = opaque; - - if (acb->canceled != 0) { - return; + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; } - acb->status = 0; - if (status != 0) { - if (status == SCSI_STATUS_CHECK_CONDITION - && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION - && acb->retries-- > 0) { - scsi_free_scsi_task(acb->task); - acb->task = NULL; - if (iscsi_aio_flush_acb(acb) == 0) { - iscsi_set_events(acb->iscsilun); - return; - } - } - error_report("Failed to sync10 data on iSCSI lun. %s", - iscsi_get_error(iscsi)); - acb->status = -EIO; - } - - iscsi_schedule_bh(acb); -} - -static int -iscsi_aio_flush_acb(IscsiAIOCB *acb) -{ - struct iscsi_context *iscsi = acb->iscsilun->iscsi; - - acb->canceled = 0; - acb->bh = NULL; - acb->status = -EINPROGRESS; - acb->buf = NULL; - - acb->task = iscsi_synchronizecache10_task(iscsi, acb->iscsilun->lun, - 0, 0, 0, 0, - iscsi_synccache10_cb, - acb); - if (acb->task == NULL) { - error_report("iSCSI: Failed to send synchronizecache10 command. %s", - iscsi_get_error(iscsi)); - return -1; + if (iTask.status != SCSI_STATUS_GOOD) { + return -EIO; } return 0; } -static BlockDriverAIOCB * -iscsi_aio_flush(BlockDriverState *bs, - BlockDriverCompletionFunc *cb, void *opaque) +static int coroutine_fn iscsi_co_flush(BlockDriverState *bs) { IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; - IscsiAIOCB *acb; - - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); - - acb->iscsilun = iscsilun; - acb->retries = ISCSI_CMD_RETRIES; - - if (iscsi_aio_flush_acb(acb) != 0) { - qemu_aio_release(acb); - return NULL; + if (bs->sg) { + return 0; } - iscsi_set_events(iscsilun); - - return &acb->common; -} - -static int iscsi_aio_discard_acb(IscsiAIOCB *acb); + iscsi_co_init_iscsitask(iscsilun, &iTask); -static void -iscsi_unmap_cb(struct iscsi_context *iscsi, int status, - void *command_data, void *opaque) -{ - IscsiAIOCB *acb = opaque; - - if (acb->canceled != 0) { - return; +retry: + if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0, + 0, iscsi_co_generic_cb, &iTask) == NULL) { + return -ENOMEM; } - acb->status = 0; - if (status != 0) { - if (status == SCSI_STATUS_CHECK_CONDITION - && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION - && acb->retries-- > 0) { - scsi_free_scsi_task(acb->task); - acb->task = NULL; - if (iscsi_aio_discard_acb(acb) == 0) { - iscsi_set_events(acb->iscsilun); - return; - } - } - error_report("Failed to unmap data on iSCSI lun. 
%s", - iscsi_get_error(iscsi)); - acb->status = -EIO; + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); } - iscsi_schedule_bh(acb); -} - -static int iscsi_aio_discard_acb(IscsiAIOCB *acb) { - struct iscsi_context *iscsi = acb->iscsilun->iscsi; - struct unmap_list list[1]; - - acb->canceled = 0; - acb->bh = NULL; - acb->status = -EINPROGRESS; - acb->buf = NULL; - - list[0].lba = sector_qemu2lun(acb->sector_num, acb->iscsilun); - list[0].num = acb->nb_sectors * BDRV_SECTOR_SIZE / acb->iscsilun->block_size; - - acb->task = iscsi_unmap_task(iscsi, acb->iscsilun->lun, - 0, 0, &list[0], 1, - iscsi_unmap_cb, - acb); - if (acb->task == NULL) { - error_report("iSCSI: Failed to send unmap command. %s", - iscsi_get_error(iscsi)); - return -1; + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; } - return 0; -} - -static BlockDriverAIOCB * -iscsi_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) -{ - IscsiLun *iscsilun = bs->opaque; - IscsiAIOCB *acb; - - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); - - acb->iscsilun = iscsilun; - acb->nb_sectors = nb_sectors; - acb->sector_num = sector_num; - acb->retries = ISCSI_CMD_RETRIES; - - if (iscsi_aio_discard_acb(acb) != 0) { - qemu_aio_release(acb); - return NULL; + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; } - iscsi_set_events(iscsilun); + if (iTask.status != SCSI_STATUS_GOOD) { + return -EIO; + } - return &acb->common; + return 0; } #ifdef __linux__ @@ -850,7 +643,234 @@ iscsi_getlength(BlockDriverState *bs) return len; } -static int parse_chap(struct iscsi_context *iscsi, const char *target) +#if defined(LIBISCSI_FEATURE_IOVECTOR) + +static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + IscsiLun *iscsilun = bs->opaque; + struct scsi_get_lba_status *lbas = NULL; + struct scsi_lba_status_descriptor *lbasd = NULL; + struct IscsiTask iTask; + int64_t ret; + + iscsi_co_init_iscsitask(iscsilun, &iTask); + + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + ret = -EINVAL; + goto out; + } + + /* default to all sectors allocated */ + ret = BDRV_BLOCK_DATA; + ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID; + *pnum = nb_sectors; + + /* LUN does not support logical block provisioning */ + if (iscsilun->lbpme == 0) { + goto out; + } + +retry: + if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun, + sector_qemu2lun(sector_num, iscsilun), + 8 + 16, iscsi_co_generic_cb, + &iTask) == NULL) { + ret = -ENOMEM; + goto out; + } + + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); + } + + if (iTask.do_retry) { + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; + } + iTask.complete = 0; + goto retry; + } + + if (iTask.status != SCSI_STATUS_GOOD) { + /* in case the get_lba_status_callout fails (i.e. 
+ * because the device is busy or the cmd is not + * supported), we pretend all blocks are allocated + * for backwards compatibility */ + goto out; + } + + lbas = scsi_datain_unmarshall(iTask.task); + if (lbas == NULL) { + ret = -EIO; + goto out; + } + + lbasd = &lbas->descriptors[0]; + + if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) { + ret = -EIO; + goto out; + } + + *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun); + if (*pnum > nb_sectors) { + *pnum = nb_sectors; + } + + if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED || + lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) { + ret &= ~BDRV_BLOCK_DATA; + if (iscsilun->lbprz) { + ret |= BDRV_BLOCK_ZERO; + } + } + +out: + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + } + return ret; +} + +#endif /* LIBISCSI_FEATURE_IOVECTOR */ + +static int +coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; + struct unmap_list list; + + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + return -EINVAL; + } + + if (!iscsilun->lbp.lbpu) { + /* UNMAP is not supported by the target */ + return 0; + } + + list.lba = sector_qemu2lun(sector_num, iscsilun); + list.num = sector_qemu2lun(nb_sectors, iscsilun); + + iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: + if (iscsi_unmap_task(iscsilun->iscsi, iscsilun->lun, 0, 0, &list, 1, + iscsi_co_generic_cb, &iTask) == NULL) { + return -ENOMEM; + } + + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); + } + + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; + } + + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; + } + + if (iTask.status == SCSI_STATUS_CHECK_CONDITION) { + /* the target might fail with a check condition if it + is not happy with the alignment of the UNMAP request; + we silently fail in this case */ + return 0; + } + + if (iTask.status != SCSI_STATUS_GOOD) { + return -EIO; + } + + return 0; +} + +#if defined(SCSI_SENSE_ASCQ_CAPACITY_DATA_HAS_CHANGED) + +static int +coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; + uint64_t lba; + uint32_t nb_blocks; + + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + return -EINVAL; + } + + if (!(flags & BDRV_REQ_MAY_UNMAP) && !iscsilun->has_write_same) { + /* WRITE SAME without UNMAP is not supported by the target */ + return -ENOTSUP; + } + + if ((flags & BDRV_REQ_MAY_UNMAP) && !iscsilun->lbp.lbpws) { + /* WRITE SAME with UNMAP is not supported by the target */ + return -ENOTSUP; + } + + lba = sector_qemu2lun(sector_num, iscsilun); + nb_blocks = sector_qemu2lun(nb_sectors, iscsilun); + + if (iscsilun->zeroblock == NULL) { + iscsilun->zeroblock = g_malloc0(iscsilun->block_size); + } + + iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: + if (iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba, + iscsilun->zeroblock, iscsilun->block_size, + nb_blocks, 0, !!(flags & BDRV_REQ_MAY_UNMAP), + 0, 0, iscsi_co_generic_cb, &iTask) == NULL) { + return -ENOMEM; + } + + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); + } + + if (iTask.status == SCSI_STATUS_CHECK_CONDITION && + iTask.task->sense.key == SCSI_SENSE_ILLEGAL_REQUEST && + (iTask.task->sense.ascq == SCSI_SENSE_ASCQ_INVALID_OPERATION_CODE || + iTask.task->sense.ascq ==
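The WRITE SAME path above never allocates a request-sized buffer: the command transfers exactly one logical block and the target replicates it across the whole LBA range, so a single lazily allocated block of zeroes is enough for any request size. The relevant two steps, as in the hunk:

    if (iscsilun->zeroblock == NULL) {
        iscsilun->zeroblock = g_malloc0(iscsilun->block_size);  /* one block */
    }
    /* WRITE SAME(16) sends this single block; the target writes it to each
     * of nb_blocks LBAs, optionally unmapping when MAY_UNMAP is set. */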
SCSI_SENSE_ASCQ_INVALID_FIELD_IN_CDB)) { + /* WRITE SAME is not supported by the target */ + iscsilun->has_write_same = false; + scsi_free_scsi_task(iTask.task); + return -ENOTSUP; + } + + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; + } + + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; + } + + if (iTask.status != SCSI_STATUS_GOOD) { + return -EIO; + } + + return 0; +} + +#endif /* SCSI_SENSE_ASCQ_CAPACITY_DATA_HAS_CHANGED */ + +static void parse_chap(struct iscsi_context *iscsi, const char *target, + Error **errp) { QemuOptsList *list; QemuOpts *opts; @@ -859,37 +879,35 @@ static int parse_chap(struct iscsi_context *iscsi, const char *target) list = qemu_find_opts("iscsi"); if (!list) { - return 0; + return; } opts = qemu_opts_find(list, target); if (opts == NULL) { opts = QTAILQ_FIRST(&list->head); if (!opts) { - return 0; + return; } } user = qemu_opt_get(opts, "user"); if (!user) { - return 0; + return; } password = qemu_opt_get(opts, "password"); if (!password) { - error_report("CHAP username specified but no password was given"); - return -1; + error_setg(errp, "CHAP username specified but no password was given"); + return; } if (iscsi_set_initiator_username_pwd(iscsi, user, password)) { - error_report("Failed to set initiator username and password"); - return -1; + error_setg(errp, "Failed to set initiator username and password"); } - - return 0; } -static void parse_header_digest(struct iscsi_context *iscsi, const char *target) +static void parse_header_digest(struct iscsi_context *iscsi, const char *target, + Error **errp) { QemuOptsList *list; QemuOpts *opts; @@ -922,7 +940,7 @@ static void parse_header_digest(struct iscsi_context *iscsi, const char *target) } else if (!strcmp(digest, "NONE-CRC32C")) { iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C); } else { - error_report("Invalid header-digest setting : %s", digest); + error_setg(errp, "Invalid header-digest setting : %s", digest); } } @@ -930,8 +948,9 @@ static char *parse_initiator_name(const char *target) { QemuOptsList *list; QemuOpts *opts; - const char *name = NULL; - const char *iscsi_name = qemu_get_vm_name(); + const char *name; + char *iscsi_name; + UuidInfo *uuid_info; list = qemu_find_opts("iscsi"); if (list) { @@ -941,16 +960,22 @@ static char *parse_initiator_name(const char *target) } if (opts) { name = qemu_opt_get(opts, "initiator-name"); + if (name) { + return g_strdup(name); + } } } - if (name) { - return g_strdup(name); + uuid_info = qmp_query_uuid(NULL); + if (strcmp(uuid_info->UUID, UUID_NONE) == 0) { + name = qemu_get_vm_name(); } else { - return g_strdup_printf("iqn.2008-11.org.linux-kvm%s%s", - iscsi_name ? ":" : "", - iscsi_name ? iscsi_name : ""); + name = uuid_info->UUID; } + iscsi_name = g_strdup_printf("iqn.2008-11.org.linux-kvm%s%s", + name ? ":" : "", name ? 
name : ""); + qapi_free_UuidInfo(uuid_info); + return iscsi_name; } #if defined(LIBISCSI_FEATURE_NOP_COUNTER) @@ -968,17 +993,16 @@ static void iscsi_nop_timed_event(void *opaque) return; } - qemu_mod_timer(iscsilun->nop_timer, qemu_get_clock_ms(rt_clock) + NOP_INTERVAL); + timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); iscsi_set_events(iscsilun); } #endif -static int iscsi_readcapacity_sync(IscsiLun *iscsilun) +static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp) { struct scsi_task *task = NULL; struct scsi_readcapacity10 *rc10 = NULL; struct scsi_readcapacity16 *rc16 = NULL; - int ret = 0; int retries = ISCSI_CMD_RETRIES; do { @@ -993,11 +1017,12 @@ static int iscsi_readcapacity_sync(IscsiLun *iscsilun) if (task != NULL && task->status == SCSI_STATUS_GOOD) { rc16 = scsi_datain_unmarshall(task); if (rc16 == NULL) { - error_report("iSCSI: Failed to unmarshall readcapacity16 data."); - ret = -EINVAL; + error_setg(errp, "iSCSI: Failed to unmarshall readcapacity16 data."); } else { iscsilun->block_size = rc16->block_length; iscsilun->num_blocks = rc16->returned_lba + 1; + iscsilun->lbpme = rc16->lbpme; + iscsilun->lbprz = rc16->lbprz; } } break; @@ -1006,8 +1031,7 @@ static int iscsi_readcapacity_sync(IscsiLun *iscsilun) if (task != NULL && task->status == SCSI_STATUS_GOOD) { rc10 = scsi_datain_unmarshall(task); if (rc10 == NULL) { - error_report("iSCSI: Failed to unmarshall readcapacity10 data."); - ret = -EINVAL; + error_setg(errp, "iSCSI: Failed to unmarshall readcapacity10 data."); } else { iscsilun->block_size = rc10->block_size; if (rc10->lba == 0) { @@ -1020,20 +1044,18 @@ static int iscsi_readcapacity_sync(IscsiLun *iscsilun) } break; default: - return 0; + return; } } while (task != NULL && task->status == SCSI_STATUS_CHECK_CONDITION && task->sense.key == SCSI_SENSE_UNIT_ATTENTION && retries-- > 0); if (task == NULL || task->status != SCSI_STATUS_GOOD) { - error_report("iSCSI: failed to send readcapacity10 command."); - ret = -EINVAL; + error_setg(errp, "iSCSI: failed to send readcapacity10 command."); } if (task) { scsi_free_scsi_task(task); } - return ret; } /* TODO Convert to fine grained options */ @@ -1050,45 +1072,88 @@ static QemuOptsList runtime_opts = { }, }; +static struct scsi_task *iscsi_do_inquiry(struct iscsi_context *iscsi, int lun, + int evpd, int pc, void **inq, Error **errp) +{ + int full_size; + struct scsi_task *task = NULL; + task = iscsi_inquiry_sync(iscsi, lun, evpd, pc, 64); + if (task == NULL || task->status != SCSI_STATUS_GOOD) { + goto fail; + } + full_size = scsi_datain_getfullsize(task); + if (full_size > task->datain.size) { + scsi_free_scsi_task(task); + + /* we need more data for the full list */ + task = iscsi_inquiry_sync(iscsi, lun, evpd, pc, full_size); + if (task == NULL || task->status != SCSI_STATUS_GOOD) { + goto fail; + } + } + + *inq = scsi_datain_unmarshall(task); + if (*inq == NULL) { + error_setg(errp, "iSCSI: failed to unmarshall inquiry datain blob"); + goto fail; + } + + return task; + +fail: + if (!error_is_set(errp)) { + error_setg(errp, "iSCSI: Inquiry command failed : %s", + iscsi_get_error(iscsi)); + } + if (task != NULL) { + scsi_free_scsi_task(task); + } + return NULL; +} + /* * We support iscsi url's on the form * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun> + * + * Note: flags are currently not used by iscsi_open. If this function + * is changed such that flags are used, please examine iscsi_reopen_prepare() + * to see if needs to be changed as well. 
*/ -static int iscsi_open(BlockDriverState *bs, QDict *options, int flags) +static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { IscsiLun *iscsilun = bs->opaque; struct iscsi_context *iscsi = NULL; struct iscsi_url *iscsi_url = NULL; struct scsi_task *task = NULL; struct scsi_inquiry_standard *inq = NULL; + struct scsi_inquiry_supported_pages *inq_vpd; char *initiator_name = NULL; QemuOpts *opts; Error *local_err = NULL; const char *filename; - int ret; + int i, ret; if ((BDRV_SECTOR_SIZE % 512) != 0) { - error_report("iSCSI: Invalid BDRV_SECTOR_SIZE. " - "BDRV_SECTOR_SIZE(%lld) is not a multiple " - "of 512", BDRV_SECTOR_SIZE); + error_setg(errp, "iSCSI: Invalid BDRV_SECTOR_SIZE. " + "BDRV_SECTOR_SIZE(%lld) is not a multiple " + "of 512", BDRV_SECTOR_SIZE); return -EINVAL; } - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto out; } filename = qemu_opt_get(opts, "filename"); - iscsi_url = iscsi_parse_full_url(iscsi, filename); if (iscsi_url == NULL) { - error_report("Failed to parse URL : %s", filename); + error_setg(errp, "Failed to parse URL : %s", filename); ret = -EINVAL; goto out; } @@ -1099,13 +1164,13 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags) iscsi = iscsi_create_context(initiator_name); if (iscsi == NULL) { - error_report("iSCSI: Failed to create iSCSI context."); + error_setg(errp, "iSCSI: Failed to create iSCSI context."); ret = -ENOMEM; goto out; } if (iscsi_set_targetname(iscsi, iscsi_url->target)) { - error_report("iSCSI: Failed to set target name."); + error_setg(errp, "iSCSI: Failed to set target name."); ret = -EINVAL; goto out; } @@ -1114,21 +1179,22 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags) ret = iscsi_set_initiator_username_pwd(iscsi, iscsi_url->user, iscsi_url->passwd); if (ret != 0) { - error_report("Failed to set initiator username and password"); + error_setg(errp, "Failed to set initiator username and password"); ret = -EINVAL; goto out; } } /* check if we got CHAP username/password via the options */ - if (parse_chap(iscsi, iscsi_url->target) != 0) { - error_report("iSCSI: Failed to set CHAP user/password"); + parse_chap(iscsi, iscsi_url->target, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); ret = -EINVAL; goto out; } if (iscsi_set_session_type(iscsi, ISCSI_SESSION_NORMAL) != 0) { - error_report("iSCSI: Failed to set session type to normal."); + error_setg(errp, "iSCSI: Failed to set session type to normal."); ret = -EINVAL; goto out; } @@ -1136,10 +1202,15 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags) iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C); /* check if we got HEADER_DIGEST via the options */ - parse_header_digest(iscsi, iscsi_url->target); + parse_header_digest(iscsi, iscsi_url->target, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto out; + } if (iscsi_full_connect_sync(iscsi, iscsi_url->portal, iscsi_url->lun) != 0) { - error_report("iSCSI: Failed to connect to LUN : %s", + error_setg(errp, "iSCSI: Failed to connect to LUN : %s", iscsi_get_error(iscsi)); ret = -EINVAL; goto out; @@ -1147,42 +1218,82 @@ static int iscsi_open(BlockDriverState *bs, QDict 
*options, int flags) iscsilun->iscsi = iscsi; iscsilun->lun = iscsi_url->lun; + iscsilun->has_write_same = true; - task = iscsi_inquiry_sync(iscsi, iscsilun->lun, 0, 0, 36); - - if (task == NULL || task->status != SCSI_STATUS_GOOD) { - error_report("iSCSI: failed to send inquiry command."); - ret = -EINVAL; - goto out; - } - - inq = scsi_datain_unmarshall(task); - if (inq == NULL) { - error_report("iSCSI: Failed to unmarshall inquiry data."); + task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 0, 0, + (void **) &inq, errp); + if (task == NULL) { ret = -EINVAL; goto out; } - iscsilun->type = inq->periperal_device_type; + scsi_free_scsi_task(task); + task = NULL; - if ((ret = iscsi_readcapacity_sync(iscsilun)) != 0) { + iscsi_readcapacity_sync(iscsilun, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); + ret = -EINVAL; goto out; } bs->total_sectors = sector_lun2qemu(iscsilun->num_blocks, iscsilun); + bs->request_alignment = iscsilun->block_size; - /* Medium changer or tape. We dont have any emulation for this so this must - * be sg ioctl compatible. We force it to be sg, otherwise qemu will try - * to read from the device to guess the image format. + /* We don't have any emulation for devices other than disks and CD-ROMs, so + * this must be sg ioctl compatible. We force it to be sg, otherwise qemu + * will try to read from the device to guess the image format. */ - if (iscsilun->type == TYPE_MEDIUM_CHANGER || - iscsilun->type == TYPE_TAPE) { + if (iscsilun->type != TYPE_DISK && iscsilun->type != TYPE_ROM) { bs->sg = 1; } + task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, + SCSI_INQUIRY_PAGECODE_SUPPORTED_VPD_PAGES, + (void **) &inq_vpd, errp); + if (task == NULL) { + ret = -EINVAL; + goto out; + } + for (i = 0; i < inq_vpd->num_pages; i++) { + struct scsi_task *inq_task; + struct scsi_inquiry_logical_block_provisioning *inq_lbp; + struct scsi_inquiry_block_limits *inq_bl; + switch (inq_vpd->pages[i]) { + case SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING: + inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, + SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING, + (void **) &inq_lbp, errp); + if (inq_task == NULL) { + ret = -EINVAL; + goto out; + } + memcpy(&iscsilun->lbp, inq_lbp, + sizeof(struct scsi_inquiry_logical_block_provisioning)); + scsi_free_scsi_task(inq_task); + break; + case SCSI_INQUIRY_PAGECODE_BLOCK_LIMITS: + inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, + SCSI_INQUIRY_PAGECODE_BLOCK_LIMITS, + (void **) &inq_bl, errp); + if (inq_task == NULL) { + ret = -EINVAL; + goto out; + } + memcpy(&iscsilun->bl, inq_bl, + sizeof(struct scsi_inquiry_block_limits)); + scsi_free_scsi_task(inq_task); + break; + default: + break; + } + } + scsi_free_scsi_task(task); + task = NULL; + #if defined(LIBISCSI_FEATURE_NOP_COUNTER) /* Set up a timer for sending out iSCSI NOPs */ - iscsilun->nop_timer = qemu_new_timer_ms(rt_clock, iscsi_nop_timed_event, iscsilun); - qemu_mod_timer(iscsilun->nop_timer, qemu_get_clock_ms(rt_clock) + NOP_INTERVAL); + iscsilun->nop_timer = timer_new_ms(QEMU_CLOCK_REALTIME, iscsi_nop_timed_event, iscsilun); + timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); #endif out: @@ -1212,25 +1323,66 @@ static void iscsi_close(BlockDriverState *bs) struct iscsi_context *iscsi = iscsilun->iscsi; if (iscsilun->nop_timer) { - qemu_del_timer(iscsilun->nop_timer); - qemu_free_timer(iscsilun->nop_timer); + timer_del(iscsilun->nop_timer); + timer_free(iscsilun->nop_timer); } - 
qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL, NULL); + qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL); iscsi_destroy_context(iscsi); + g_free(iscsilun->zeroblock); memset(iscsilun, 0, sizeof(IscsiLun)); } +static int iscsi_refresh_limits(BlockDriverState *bs) +{ + IscsiLun *iscsilun = bs->opaque; + + /* We don't actually refresh here, but just return data queried in + * iscsi_open(): iscsi targets don't change their limits. */ + if (iscsilun->lbp.lbpu) { + if (iscsilun->bl.max_unmap < 0xffffffff) { + bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap, + iscsilun); + } + bs->bl.discard_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran, + iscsilun); + } + + if (iscsilun->bl.max_ws_len < 0xffffffff) { + bs->bl.max_write_zeroes = sector_lun2qemu(iscsilun->bl.max_ws_len, + iscsilun); + } + if (iscsilun->lbp.lbpws) { + bs->bl.write_zeroes_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran, + iscsilun); + } + bs->bl.opt_transfer_length = sector_lun2qemu(iscsilun->bl.opt_xfer_len, + iscsilun); + return 0; +} + +/* Since iscsi_open() ignores bdrv_flags, there is nothing to do here in + * prepare. Note that this will not re-establish a connection with an iSCSI + * target - it is effectively a NOP. */ +static int iscsi_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + /* NOP */ + return 0; +} + static int iscsi_truncate(BlockDriverState *bs, int64_t offset) { IscsiLun *iscsilun = bs->opaque; - int ret = 0; + Error *local_err = NULL; if (iscsilun->type != TYPE_DISK) { return -ENOTSUP; } - if ((ret = iscsi_readcapacity_sync(iscsilun)) != 0) { - return ret; + iscsi_readcapacity_sync(iscsilun, &local_err); + if (local_err != NULL) { + error_free(local_err); + return -EIO; } if (offset > iscsi_getlength(bs)) { @@ -1240,20 +1392,16 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset) return 0; } -static int iscsi_has_zero_init(BlockDriverState *bs) -{ - return 0; -} - -static int iscsi_create(const char *filename, QEMUOptionParameter *options) +static int iscsi_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int ret = 0; int64_t total_size = 0; - BlockDriverState bs; + BlockDriverState *bs; IscsiLun *iscsilun = NULL; QDict *bs_options; - memset(&bs, 0, sizeof(BlockDriverState)); + bs = bdrv_new(""); /* Read out options */ while (options && options->name) { @@ -1263,26 +1411,26 @@ static int iscsi_create(const char *filename, QEMUOptionParameter *options) options++; } - bs.opaque = g_malloc0(sizeof(struct IscsiLun)); - iscsilun = bs.opaque; + bs->opaque = g_malloc0(sizeof(struct IscsiLun)); + iscsilun = bs->opaque; bs_options = qdict_new(); qdict_put(bs_options, "filename", qstring_from_str(filename)); - ret = iscsi_open(&bs, bs_options, 0); + ret = iscsi_open(bs, bs_options, 0, NULL); QDECREF(bs_options); if (ret != 0) { goto out; } if (iscsilun->nop_timer) { - qemu_del_timer(iscsilun->nop_timer); - qemu_free_timer(iscsilun->nop_timer); + timer_del(iscsilun->nop_timer); + timer_free(iscsilun->nop_timer); } if (iscsilun->type != TYPE_DISK) { ret = -ENODEV; goto out; } - if (bs.total_sectors < total_size) { + if (bs->total_sectors < total_size) { ret = -ENOSPC; goto out; } @@ -1292,10 +1440,27 @@ out: if (iscsilun->iscsi != NULL) { iscsi_destroy_context(iscsilun->iscsi); } - g_free(bs.opaque); + g_free(bs->opaque); + bs->opaque = NULL; + bdrv_unref(bs); return ret; } +static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + IscsiLun *iscsilun = bs->opaque; + 
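iscsi_refresh_limits() above translates target-reported limits, which are expressed in target blocks, into the block layer's 512-byte-sector units via sector_lun2qemu(), and skips fields the target reports as unlimited (0xffffffff). A worked example, assuming a hypothetical 4 KiB target block size:

    /* bl.max_unmap = 4096 blocks, block_size = 4096:
     *   4096 blocks * (4096 / 512) = 32768 sectors = 16 MiB */
    bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap, iscsilun);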
bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz; + bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws; + /* Guess the internal cluster (page) size of the iscsi target by the means + * of opt_unmap_gran. Transfer the unmap granularity only if it has a + * reasonable size for bdi->cluster_size */ + if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 64 * 1024 && + iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { + bdi->cluster_size = iscsilun->bl.opt_unmap_gran * iscsilun->block_size; + } + return 0; +} + static QEMUOptionParameter iscsi_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -1310,20 +1475,28 @@ static BlockDriver bdrv_iscsi = { .protocol_name = "iscsi", .instance_size = sizeof(IscsiLun), + .bdrv_needs_filename = true, .bdrv_file_open = iscsi_open, .bdrv_close = iscsi_close, .bdrv_create = iscsi_create, .create_options = iscsi_create_options, + .bdrv_reopen_prepare = iscsi_reopen_prepare, .bdrv_getlength = iscsi_getlength, + .bdrv_get_info = iscsi_get_info, .bdrv_truncate = iscsi_truncate, + .bdrv_refresh_limits = iscsi_refresh_limits, - .bdrv_aio_readv = iscsi_aio_readv, - .bdrv_aio_writev = iscsi_aio_writev, - .bdrv_aio_flush = iscsi_aio_flush, - - .bdrv_aio_discard = iscsi_aio_discard, - .bdrv_has_zero_init = iscsi_has_zero_init, +#if defined(LIBISCSI_FEATURE_IOVECTOR) + .bdrv_co_get_block_status = iscsi_co_get_block_status, +#endif + .bdrv_co_discard = iscsi_co_discard, +#if defined(SCSI_SENSE_ASCQ_CAPACITY_DATA_HAS_CHANGED) + .bdrv_co_write_zeroes = iscsi_co_write_zeroes, +#endif + .bdrv_co_readv = iscsi_co_readv, + .bdrv_co_writev = iscsi_co_writev, + .bdrv_co_flush_to_disk = iscsi_co_flush, #ifdef __linux__ .bdrv_ioctl = iscsi_ioctl, diff --git a/block/linux-aio.c b/block/linux-aio.c index ee0f8d10c..53434e2df 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -39,7 +39,6 @@ struct qemu_laiocb { struct qemu_laio_state { io_context_t ctx; EventNotifier e; - int count; }; static inline ssize_t io_event_ret(struct io_event *ev) @@ -55,8 +54,6 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s, { int ret; - s->count--; - ret = laiocb->ret; if (ret != -ECANCELED) { if (ret == laiocb->nbytes) { @@ -101,13 +98,6 @@ static void qemu_laio_completion_cb(EventNotifier *e) } } -static int qemu_laio_flush_cb(EventNotifier *e) -{ - struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e); - - return (s->count > 0) ? 
1 : 0; -} - static void laio_cancel(BlockDriverAIOCB *blockacb) { struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb; @@ -177,14 +167,11 @@ BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, goto out_free_aiocb; } io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e)); - s->count++; if (io_submit(s->ctx, 1, &iocbs) < 0) - goto out_dec_count; + goto out_free_aiocb; return &laiocb->common; -out_dec_count: - s->count--; out_free_aiocb: qemu_aio_release(laiocb); return NULL; @@ -203,8 +190,7 @@ void *laio_init(void) goto out_close_efd; } - qemu_aio_set_event_notifier(&s->e, qemu_laio_completion_cb, - qemu_laio_flush_cb); + qemu_aio_set_event_notifier(&s->e, qemu_laio_completion_cb); return s; diff --git a/block/mirror.c b/block/mirror.c index bed4a7ead..0ef41f999 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -31,7 +31,8 @@ typedef struct MirrorBlockJob { BlockJob common; RateLimit limit; BlockDriverState *target; - MirrorSyncMode mode; + BlockDriverState *base; + bool is_none_mode; BlockdevOnError on_source_error, on_target_error; bool synced; bool should_complete; @@ -39,6 +40,7 @@ typedef struct MirrorBlockJob { int64_t granularity; size_t buf_size; unsigned long *cow_bitmap; + BdrvDirtyBitmap *dirty_bitmap; HBitmapIter hbi; uint8_t *buf; QSIMPLEQ_HEAD(, MirrorBuffer) buf_free; @@ -94,8 +96,16 @@ static void mirror_iteration_done(MirrorOp *op, int ret) bitmap_set(s->cow_bitmap, chunk_num, nb_chunks); } + qemu_iovec_destroy(&op->qiov); g_slice_free(MirrorOp, op); - qemu_coroutine_enter(s->common.co, NULL); + + /* Enter coroutine when it is not sleeping. The coroutine sleeps to + * rate-limit itself. The coroutine will eventually resume since there is + * a sleep timeout so don't wake it early. + */ + if (s->common.busy) { + qemu_coroutine_enter(s->common.co, NULL); + } } static void mirror_write_complete(void *opaque, int ret) @@ -136,18 +146,20 @@ static void mirror_read_complete(void *opaque, int ret) mirror_write_complete, op); } -static void coroutine_fn mirror_iteration(MirrorBlockJob *s) +static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) { BlockDriverState *source = s->common.bs; int nb_sectors, sectors_per_chunk, nb_chunks; int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector; + uint64_t delay_ns; MirrorOp *op; s->sector_num = hbitmap_iter_next(&s->hbi); if (s->sector_num < 0) { - bdrv_dirty_iter_init(source, &s->hbi); + bdrv_dirty_iter_init(source, s->dirty_bitmap, &s->hbi); s->sector_num = hbitmap_iter_next(&s->hbi); - trace_mirror_restart_iter(s, bdrv_get_dirty_count(source)); + trace_mirror_restart_iter(s, + bdrv_get_dirty_count(source, s->dirty_bitmap)); assert(s->sector_num >= 0); } @@ -183,7 +195,7 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s) do { int added_sectors, added_chunks; - if (!bdrv_get_dirty(source, next_sector) || + if (!bdrv_get_dirty(source, s->dirty_bitmap, next_sector) || test_bit(next_chunk, s->in_flight_bitmap)) { assert(nb_sectors > 0); break; @@ -227,7 +239,12 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s) nb_chunks += added_chunks; next_sector += added_sectors; next_chunk += added_chunks; - } while (next_sector < end); + if (!s->synced && s->common.speed) { + delay_ns = ratelimit_calculate_delay(&s->limit, added_sectors); + } else { + delay_ns = 0; + } + } while (delay_ns == 0 && next_sector < end); /* Allocate a MirrorOp that is used as an AIO callback. 
*/ op = g_slice_new(MirrorOp); @@ -249,7 +266,8 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s) /* Advance the HBitmapIter in parallel, so that we do not examine * the same sector twice. */ - if (next_sector > hbitmap_next_sector && bdrv_get_dirty(source, next_sector)) { + if (next_sector > hbitmap_next_sector + && bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) { hbitmap_next_sector = hbitmap_iter_next(&s->hbi); } @@ -263,6 +281,7 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s) trace_mirror_one_iteration(s, sector_num, nb_sectors); bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, mirror_read_complete, op); + return delay_ns; } static void mirror_free_init(MirrorBlockJob *s) @@ -332,14 +351,13 @@ static void coroutine_fn mirror_run(void *opaque) sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; mirror_free_init(s); - if (s->mode != MIRROR_SYNC_MODE_NONE) { + if (!s->is_none_mode) { /* First part, loop on the sectors and initialize the dirty bitmap. */ - BlockDriverState *base; - base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd; + BlockDriverState *base = s->base; for (sector_num = 0; sector_num < end; ) { int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1; - ret = bdrv_co_is_allocated_above(bs, base, - sector_num, next - sector_num, &n); + ret = bdrv_is_allocated_above(bs, base, + sector_num, next - sector_num, &n); if (ret < 0) { goto immediate_exit; @@ -355,10 +373,10 @@ static void coroutine_fn mirror_run(void *opaque) } } - bdrv_dirty_iter_init(bs, &s->hbi); - last_pause_ns = qemu_get_clock_ns(rt_clock); + bdrv_dirty_iter_init(bs, s->dirty_bitmap, &s->hbi); + last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); for (;;) { - uint64_t delay_ns; + uint64_t delay_ns = 0; int64_t cnt; bool should_complete; @@ -367,14 +385,14 @@ static void coroutine_fn mirror_run(void *opaque) goto immediate_exit; } - cnt = bdrv_get_dirty_count(bs); + cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); /* Note that even when no rate limit is applied we need to yield * periodically with no pending I/O so that qemu_aio_flush() returns. * We do so every SLICE_TIME nanoseconds, or when there is an error, * or when the source is clean, whichever comes first. 
*/ - if (qemu_get_clock_ns(rt_clock) - last_pause_ns < SLICE_TIME && + if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns < SLICE_TIME && s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) { if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 || (cnt == 0 && s->in_flight > 0)) { @@ -382,8 +400,10 @@ static void coroutine_fn mirror_run(void *opaque) qemu_coroutine_yield(); continue; } else if (cnt != 0) { - mirror_iteration(s); - continue; + delay_ns = mirror_iteration(s); + if (delay_ns == 0) { + continue; + } } } @@ -409,7 +429,7 @@ static void coroutine_fn mirror_run(void *opaque) should_complete = s->should_complete || block_job_is_cancelled(&s->common); - cnt = bdrv_get_dirty_count(bs); + cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); } } @@ -424,28 +444,21 @@ static void coroutine_fn mirror_run(void *opaque) */ trace_mirror_before_drain(s, cnt); bdrv_drain_all(); - cnt = bdrv_get_dirty_count(bs); + cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); } ret = 0; - trace_mirror_before_sleep(s, cnt, s->synced); + trace_mirror_before_sleep(s, cnt, s->synced, delay_ns); if (!s->synced) { /* Publish progress */ s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE; - - if (s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, sectors_per_chunk); - } else { - delay_ns = 0; - } - - block_job_sleep_ns(&s->common, rt_clock, delay_ns); + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); if (block_job_is_cancelled(&s->common)) { break; } } else if (!should_complete) { delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0); - block_job_sleep_ns(&s->common, rt_clock, delay_ns); + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); } else if (cnt == 0) { /* The two disks are in sync. Exit and report successful * completion. 
@@ -454,7 +467,7 @@ static void coroutine_fn mirror_run(void *opaque) s->common.cancelled = false; break; } - last_pause_ns = qemu_get_clock_ns(rt_clock); + last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); } immediate_exit: @@ -471,16 +484,22 @@ immediate_exit: qemu_vfree(s->buf); g_free(s->cow_bitmap); g_free(s->in_flight_bitmap); - bdrv_set_dirty_tracking(bs, 0); + bdrv_release_dirty_bitmap(bs, s->dirty_bitmap); bdrv_iostatus_disable(s->target); if (s->should_complete && ret == 0) { if (bdrv_get_flags(s->target) != bdrv_get_flags(s->common.bs)) { bdrv_reopen(s->target, bdrv_get_flags(s->common.bs), NULL); } bdrv_swap(s->target, s->common.bs); + if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) { + /* drop the bs loop chain formed by the swap: break the loop then + * trigger the unref from the top one */ + BlockDriverState *p = s->base->backing_hd; + s->base->backing_hd = NULL; + bdrv_unref(p); + } } - bdrv_close(s->target); - bdrv_delete(s->target); + bdrv_unref(s->target); block_job_completed(&s->common, ret); } @@ -505,14 +524,12 @@ static void mirror_iostatus_reset(BlockJob *job) static void mirror_complete(BlockJob *job, Error **errp) { MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + Error *local_err = NULL; int ret; - ret = bdrv_open_backing_file(s->target, NULL); + ret = bdrv_open_backing_file(s->target, NULL, &local_err); if (ret < 0) { - char backing_filename[PATH_MAX]; - bdrv_get_full_backing_filename(s->target, backing_filename, - sizeof(backing_filename)); - error_setg_file_open(errp, -ret, backing_filename); + error_propagate(errp, local_err); return; } if (!s->synced) { @@ -524,20 +541,32 @@ static void mirror_complete(BlockJob *job, Error **errp) block_job_resume(job); } -static const BlockJobType mirror_job_type = { +static const BlockJobDriver mirror_job_driver = { .instance_size = sizeof(MirrorBlockJob), - .job_type = "mirror", + .job_type = BLOCK_JOB_TYPE_MIRROR, .set_speed = mirror_set_speed, .iostatus_reset= mirror_iostatus_reset, .complete = mirror_complete, }; -void mirror_start(BlockDriverState *bs, BlockDriverState *target, - int64_t speed, int64_t granularity, int64_t buf_size, - MirrorSyncMode mode, BlockdevOnError on_source_error, - BlockdevOnError on_target_error, - BlockDriverCompletionFunc *cb, - void *opaque, Error **errp) +static const BlockJobDriver commit_active_job_driver = { + .instance_size = sizeof(MirrorBlockJob), + .job_type = BLOCK_JOB_TYPE_COMMIT, + .set_speed = mirror_set_speed, + .iostatus_reset + = mirror_iostatus_reset, + .complete = mirror_complete, +}; + +static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, + int64_t speed, int64_t granularity, + int64_t buf_size, + BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + BlockDriverCompletionFunc *cb, + void *opaque, Error **errp, + const BlockJobDriver *driver, + bool is_none_mode, BlockDriverState *base) { MirrorBlockJob *s; @@ -562,7 +591,8 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, return; } - s = block_job_create(&mirror_job_type, bs, speed, cb, opaque, errp); + + s = block_job_create(driver, bs, speed, cb, opaque, errp); if (!s) { return; } @@ -570,11 +600,12 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, s->on_source_error = on_source_error; s->on_target_error = on_target_error; s->target = target; - s->mode = mode; + s->is_none_mode = is_none_mode; + s->base = base; s->granularity = granularity; s->buf_size = MAX(buf_size, granularity); - bdrv_set_dirty_tracking(bs, 
granularity); + s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity); bdrv_set_enable_write_cache(s->target, true); bdrv_set_on_error(s->target, on_target_error, on_target_error); bdrv_iostatus_enable(s->target); @@ -582,3 +613,80 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, trace_mirror_start(bs, s, s->common.co, opaque); qemu_coroutine_enter(s->common.co, s); } + +void mirror_start(BlockDriverState *bs, BlockDriverState *target, + int64_t speed, int64_t granularity, int64_t buf_size, + MirrorSyncMode mode, BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + BlockDriverCompletionFunc *cb, + void *opaque, Error **errp) +{ + bool is_none_mode; + BlockDriverState *base; + + is_none_mode = mode == MIRROR_SYNC_MODE_NONE; + base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL; + mirror_start_job(bs, target, speed, granularity, buf_size, + on_source_error, on_target_error, cb, opaque, errp, + &mirror_job_driver, is_none_mode, base); +} + +void commit_active_start(BlockDriverState *bs, BlockDriverState *base, + int64_t speed, + BlockdevOnError on_error, + BlockDriverCompletionFunc *cb, + void *opaque, Error **errp) +{ + int64_t length, base_length; + int orig_base_flags; + int ret; + Error *local_err = NULL; + + orig_base_flags = bdrv_get_flags(base); + + if (bdrv_reopen(base, bs->open_flags, errp)) { + return; + } + + length = bdrv_getlength(bs); + if (length < 0) { + error_setg_errno(errp, -length, + "Unable to determine length of %s", bs->filename); + goto error_restore_flags; + } + + base_length = bdrv_getlength(base); + if (base_length < 0) { + error_setg_errno(errp, -base_length, + "Unable to determine length of %s", base->filename); + goto error_restore_flags; + } + + if (length > base_length) { + ret = bdrv_truncate(base, length); + if (ret < 0) { + error_setg_errno(errp, -ret, + "Top image %s is larger than base image %s, and " + "resize of base image failed", + bs->filename, base->filename); + goto error_restore_flags; + } + } + + bdrv_ref(base); + mirror_start_job(bs, base, speed, 0, 0, + on_error, on_error, cb, opaque, &local_err, + &commit_active_job_driver, false, base); + if (error_is_set(&local_err)) { + error_propagate(errp, local_err); + goto error_restore_flags; + } + + return; + +error_restore_flags: + /* ignore error and errp for bdrv_reopen, because we want to propagate + * the original error */ + bdrv_reopen(base, orig_base_flags, NULL); + return; +} diff --git a/block/nbd-client.c b/block/nbd-client.c new file mode 100644 index 000000000..7d698cb61 --- /dev/null +++ b/block/nbd-client.c @@ -0,0 +1,388 @@ +/* + * QEMU Block driver for NBD + * + * Copyright (C) 2008 Bull S.A.S. + * Author: Laurent Vivier <Laurent.Vivier@bull.net> + * + * Some parts: + * Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "nbd-client.h" +#include "qemu/sockets.h" + +#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs)) +#define INDEX_TO_HANDLE(bs, index) ((index) ^ ((uint64_t)(intptr_t)bs)) + +static void nbd_recv_coroutines_enter_all(NbdClientSession *s) +{ + int i; + + for (i = 0; i < MAX_NBD_REQUESTS; i++) { + if (s->recv_coroutine[i]) { + qemu_coroutine_enter(s->recv_coroutine[i], NULL); + } + } +} + +static void nbd_teardown_connection(NbdClientSession *client) +{ + /* finish any pending coroutines */ + shutdown(client->sock, 2); + nbd_recv_coroutines_enter_all(client); + + qemu_aio_set_fd_handler(client->sock, NULL, NULL, NULL); + closesocket(client->sock); + client->sock = -1; +} + +static void nbd_reply_ready(void *opaque) +{ + NbdClientSession *s = opaque; + uint64_t i; + int ret; + + if (s->reply.handle == 0) { + /* No reply already in flight. Fetch a header. It is possible + * that another thread has done the same thing in parallel, so + * the socket is not readable anymore. + */ + ret = nbd_receive_reply(s->sock, &s->reply); + if (ret == -EAGAIN) { + return; + } + if (ret < 0) { + s->reply.handle = 0; + goto fail; + } + } + + /* There's no need for a mutex on the receive side, because the + * handler acts as a synchronization point and ensures that only + * one coroutine is called until the reply finishes. */ + i = HANDLE_TO_INDEX(s, s->reply.handle); + if (i >= MAX_NBD_REQUESTS) { + goto fail; + } + + if (s->recv_coroutine[i]) { + qemu_coroutine_enter(s->recv_coroutine[i], NULL); + return; + } + +fail: + nbd_teardown_connection(s); +} + +static void nbd_restart_write(void *opaque) +{ + NbdClientSession *s = opaque; + + qemu_coroutine_enter(s->send_coroutine, NULL); +} + +static int nbd_co_send_request(NbdClientSession *s, + struct nbd_request *request, + QEMUIOVector *qiov, int offset) +{ + int rc, ret; + + qemu_co_mutex_lock(&s->send_mutex); + s->send_coroutine = qemu_coroutine_self(); + qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write, s); + if (qiov) { + if (!s->is_unix) { + socket_set_cork(s->sock, 1); + } + rc = nbd_send_request(s->sock, request); + if (rc >= 0) { + ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov, + offset, request->len); + if (ret != request->len) { + rc = -EIO; + } + } + if (!s->is_unix) { + socket_set_cork(s->sock, 0); + } + } else { + rc = nbd_send_request(s->sock, request); + } + qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL, s); + s->send_coroutine = NULL; + qemu_co_mutex_unlock(&s->send_mutex); + return rc; +} + +static void nbd_co_receive_reply(NbdClientSession *s, + struct nbd_request *request, struct nbd_reply *reply, + QEMUIOVector *qiov, int offset) +{ + int ret; + + /* Wait until we're woken up by the read handler. TODO: perhaps + * peek at the next reply and avoid yielding if it's ours? 
*/ + qemu_coroutine_yield(); + *reply = s->reply; + if (reply->handle != request->handle) { + reply->error = EIO; + } else { + if (qiov && reply->error == 0) { + ret = qemu_co_recvv(s->sock, qiov->iov, qiov->niov, + offset, request->len); + if (ret != request->len) { + reply->error = EIO; + } + } + + /* Tell the read handler to read another header. */ + s->reply.handle = 0; + } +} + +static void nbd_coroutine_start(NbdClientSession *s, + struct nbd_request *request) +{ + int i; + + /* Poor man semaphore. The free_sema is locked when no other request + * can be accepted, and unlocked after receiving one reply. */ + if (s->in_flight >= MAX_NBD_REQUESTS - 1) { + qemu_co_mutex_lock(&s->free_sema); + assert(s->in_flight < MAX_NBD_REQUESTS); + } + s->in_flight++; + + for (i = 0; i < MAX_NBD_REQUESTS; i++) { + if (s->recv_coroutine[i] == NULL) { + s->recv_coroutine[i] = qemu_coroutine_self(); + break; + } + } + + assert(i < MAX_NBD_REQUESTS); + request->handle = INDEX_TO_HANDLE(s, i); +} + +static void nbd_coroutine_end(NbdClientSession *s, + struct nbd_request *request) +{ + int i = HANDLE_TO_INDEX(s, request->handle); + s->recv_coroutine[i] = NULL; + if (s->in_flight-- == MAX_NBD_REQUESTS) { + qemu_co_mutex_unlock(&s->free_sema); + } +} + +static int nbd_co_readv_1(NbdClientSession *client, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov, + int offset) +{ + struct nbd_request request = { .type = NBD_CMD_READ }; + struct nbd_reply reply; + ssize_t ret; + + request.from = sector_num * 512; + request.len = nb_sectors * 512; + + nbd_coroutine_start(client, &request); + ret = nbd_co_send_request(client, &request, NULL, 0); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(client, &request, &reply, qiov, offset); + } + nbd_coroutine_end(client, &request); + return -reply.error; + +} + +static int nbd_co_writev_1(NbdClientSession *client, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov, + int offset) +{ + struct nbd_request request = { .type = NBD_CMD_WRITE }; + struct nbd_reply reply; + ssize_t ret; + + if (!bdrv_enable_write_cache(client->bs) && + (client->nbdflags & NBD_FLAG_SEND_FUA)) { + request.type |= NBD_CMD_FLAG_FUA; + } + + request.from = sector_num * 512; + request.len = nb_sectors * 512; + + nbd_coroutine_start(client, &request); + ret = nbd_co_send_request(client, &request, qiov, offset); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(client, &request, &reply, NULL, 0); + } + nbd_coroutine_end(client, &request); + return -reply.error; +} + +/* qemu-nbd has a limit of slightly less than 1M per request. Try to + * remain aligned to 4K. 
*/ +#define NBD_MAX_SECTORS 2040 + +int nbd_client_session_co_readv(NbdClientSession *client, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + int offset = 0; + int ret; + while (nb_sectors > NBD_MAX_SECTORS) { + ret = nbd_co_readv_1(client, sector_num, + NBD_MAX_SECTORS, qiov, offset); + if (ret < 0) { + return ret; + } + offset += NBD_MAX_SECTORS * 512; + sector_num += NBD_MAX_SECTORS; + nb_sectors -= NBD_MAX_SECTORS; + } + return nbd_co_readv_1(client, sector_num, nb_sectors, qiov, offset); +} + +int nbd_client_session_co_writev(NbdClientSession *client, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + int offset = 0; + int ret; + while (nb_sectors > NBD_MAX_SECTORS) { + ret = nbd_co_writev_1(client, sector_num, + NBD_MAX_SECTORS, qiov, offset); + if (ret < 0) { + return ret; + } + offset += NBD_MAX_SECTORS * 512; + sector_num += NBD_MAX_SECTORS; + nb_sectors -= NBD_MAX_SECTORS; + } + return nbd_co_writev_1(client, sector_num, nb_sectors, qiov, offset); +} + +int nbd_client_session_co_flush(NbdClientSession *client) +{ + struct nbd_request request = { .type = NBD_CMD_FLUSH }; + struct nbd_reply reply; + ssize_t ret; + + if (!(client->nbdflags & NBD_FLAG_SEND_FLUSH)) { + return 0; + } + + if (client->nbdflags & NBD_FLAG_SEND_FUA) { + request.type |= NBD_CMD_FLAG_FUA; + } + + request.from = 0; + request.len = 0; + + nbd_coroutine_start(client, &request); + ret = nbd_co_send_request(client, &request, NULL, 0); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(client, &request, &reply, NULL, 0); + } + nbd_coroutine_end(client, &request); + return -reply.error; +} + +int nbd_client_session_co_discard(NbdClientSession *client, int64_t sector_num, + int nb_sectors) +{ + struct nbd_request request = { .type = NBD_CMD_TRIM }; + struct nbd_reply reply; + ssize_t ret; + + if (!(client->nbdflags & NBD_FLAG_SEND_TRIM)) { + return 0; + } + request.from = sector_num * 512; + request.len = nb_sectors * 512; + + nbd_coroutine_start(client, &request); + ret = nbd_co_send_request(client, &request, NULL, 0); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(client, &request, &reply, NULL, 0); + } + nbd_coroutine_end(client, &request); + return -reply.error; + +} + +void nbd_client_session_close(NbdClientSession *client) +{ + struct nbd_request request = { + .type = NBD_CMD_DISC, + .from = 0, + .len = 0 + }; + + if (!client->bs) { + return; + } + if (client->sock == -1) { + return; + } + + nbd_send_request(client->sock, &request); + + nbd_teardown_connection(client); + client->bs = NULL; +} + +int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs, + int sock, const char *export) +{ + int ret; + + /* NBD handshake */ + logout("session init %s\n", export); + qemu_set_block(sock); + ret = nbd_receive_negotiate(sock, export, + &client->nbdflags, &client->size, + &client->blocksize); + if (ret < 0) { + logout("Failed to negotiate with the NBD server\n"); + closesocket(sock); + return ret; + } + + qemu_co_mutex_init(&client->send_mutex); + qemu_co_mutex_init(&client->free_sema); + client->bs = bs; + client->sock = sock; + + /* Now that we're connected, set the socket to be non-blocking and + * kick the reply mechanism. 
*/ + qemu_set_nonblock(sock); + qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL, client); + + logout("Established connection with NBD server\n"); + return 0; +} diff --git a/block/nbd-client.h b/block/nbd-client.h new file mode 100644 index 000000000..f2a63378b --- /dev/null +++ b/block/nbd-client.h @@ -0,0 +1,50 @@ +#ifndef NBD_CLIENT_H +#define NBD_CLIENT_H + +#include "qemu-common.h" +#include "block/nbd.h" +#include "block/block_int.h" + +/* #define DEBUG_NBD */ + +#if defined(DEBUG_NBD) +#define logout(fmt, ...) \ + fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__) +#else +#define logout(fmt, ...) ((void)0) +#endif + +#define MAX_NBD_REQUESTS 16 + +typedef struct NbdClientSession { + int sock; + uint32_t nbdflags; + off_t size; + size_t blocksize; + + CoMutex send_mutex; + CoMutex free_sema; + Coroutine *send_coroutine; + int in_flight; + + Coroutine *recv_coroutine[MAX_NBD_REQUESTS]; + struct nbd_reply reply; + + bool is_unix; + + BlockDriverState *bs; +} NbdClientSession; + +int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs, + int sock, const char *export_name); +void nbd_client_session_close(NbdClientSession *client); + +int nbd_client_session_co_discard(NbdClientSession *client, int64_t sector_num, + int nb_sectors); +int nbd_client_session_co_flush(NbdClientSession *client); +int nbd_client_session_co_writev(NbdClientSession *client, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov); +int nbd_client_session_co_readv(NbdClientSession *client, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov); + +#endif /* NBD_CLIENT_H */ diff --git a/block/nbd.c b/block/nbd.c index 9c480b8f2..55124239d 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -26,8 +26,7 @@ * THE SOFTWARE. */ -#include "qemu-common.h" -#include "block/nbd.h" +#include "block/nbd-client.h" #include "qemu/uri.h" #include "block/block_int.h" #include "qemu/module.h" @@ -40,37 +39,9 @@ #define EN_OPTSTR ":exportname=" -/* #define DEBUG_NBD */ - -#if defined(DEBUG_NBD) -#define logout(fmt, ...) \ - fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__) -#else -#define logout(fmt, ...) 
((void)0) -#endif - -#define MAX_NBD_REQUESTS 16 -#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs)) -#define INDEX_TO_HANDLE(bs, index) ((index) ^ ((uint64_t)(intptr_t)bs)) - typedef struct BDRVNBDState { - int sock; - uint32_t nbdflags; - off_t size; - size_t blocksize; - - CoMutex send_mutex; - CoMutex free_sema; - Coroutine *send_coroutine; - int in_flight; - - Coroutine *recv_coroutine[MAX_NBD_REQUESTS]; - struct nbd_reply reply; - - bool is_unix; + NbdClientSession client; QemuOpts *socket_opts; - - char *export_name; /* An NBD server may export several devices */ } BDRVNBDState; static int nbd_parse_uri(const char *filename, QDict *options) @@ -217,204 +188,49 @@ out: g_free(file); } -static int nbd_config(BDRVNBDState *s, QDict *options) +static void nbd_config(BDRVNBDState *s, QDict *options, char **export, + Error **errp) { Error *local_err = NULL; - if (qdict_haskey(options, "path")) { - if (qdict_haskey(options, "host")) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, "path and host may not " - "be used at the same time."); - return -EINVAL; + if (qdict_haskey(options, "path") == qdict_haskey(options, "host")) { + if (qdict_haskey(options, "path")) { + error_setg(errp, "path and host may not be used at the same time."); + } else { + error_setg(errp, "one of path and host must be specified."); } - s->is_unix = true; - } else if (qdict_haskey(options, "host")) { - s->is_unix = false; - } else { - return -EINVAL; + return; } - s->socket_opts = qemu_opts_create_nofail(&socket_optslist); + s->client.is_unix = qdict_haskey(options, "path"); + s->socket_opts = qemu_opts_create(&socket_optslist, NULL, 0, + &error_abort); qemu_opts_absorb_qdict(s->socket_opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); - return -EINVAL; + if (local_err) { + error_propagate(errp, local_err); + return; } if (!qemu_opt_get(s->socket_opts, "port")) { qemu_opt_set_number(s->socket_opts, "port", NBD_DEFAULT_PORT); } - s->export_name = g_strdup(qdict_get_try_str(options, "export")); - if (s->export_name) { + *export = g_strdup(qdict_get_try_str(options, "export")); + if (*export) { qdict_del(options, "export"); } - - return 0; -} - - -static void nbd_coroutine_start(BDRVNBDState *s, struct nbd_request *request) -{ - int i; - - /* Poor man semaphore. The free_sema is locked when no other request - * can be accepted, and unlocked after receiving one reply. */ - if (s->in_flight >= MAX_NBD_REQUESTS - 1) { - qemu_co_mutex_lock(&s->free_sema); - assert(s->in_flight < MAX_NBD_REQUESTS); - } - s->in_flight++; - - for (i = 0; i < MAX_NBD_REQUESTS; i++) { - if (s->recv_coroutine[i] == NULL) { - s->recv_coroutine[i] = qemu_coroutine_self(); - break; - } - } - - assert(i < MAX_NBD_REQUESTS); - request->handle = INDEX_TO_HANDLE(s, i); -} - -static int nbd_have_request(void *opaque) -{ - BDRVNBDState *s = opaque; - - return s->in_flight > 0; -} - -static void nbd_reply_ready(void *opaque) -{ - BDRVNBDState *s = opaque; - uint64_t i; - int ret; - - if (s->reply.handle == 0) { - /* No reply already in flight. Fetch a header. It is possible - * that another thread has done the same thing in parallel, so - * the socket is not readable anymore. 
- */ - ret = nbd_receive_reply(s->sock, &s->reply); - if (ret == -EAGAIN) { - return; - } - if (ret < 0) { - s->reply.handle = 0; - goto fail; - } - } - - /* There's no need for a mutex on the receive side, because the - * handler acts as a synchronization point and ensures that only - * one coroutine is called until the reply finishes. */ - i = HANDLE_TO_INDEX(s, s->reply.handle); - if (i >= MAX_NBD_REQUESTS) { - goto fail; - } - - if (s->recv_coroutine[i]) { - qemu_coroutine_enter(s->recv_coroutine[i], NULL); - return; - } - -fail: - for (i = 0; i < MAX_NBD_REQUESTS; i++) { - if (s->recv_coroutine[i]) { - qemu_coroutine_enter(s->recv_coroutine[i], NULL); - } - } -} - -static void nbd_restart_write(void *opaque) -{ - BDRVNBDState *s = opaque; - qemu_coroutine_enter(s->send_coroutine, NULL); -} - -static int nbd_co_send_request(BDRVNBDState *s, struct nbd_request *request, - QEMUIOVector *qiov, int offset) -{ - int rc, ret; - - qemu_co_mutex_lock(&s->send_mutex); - s->send_coroutine = qemu_coroutine_self(); - qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write, - nbd_have_request, s); - if (qiov) { - if (!s->is_unix) { - socket_set_cork(s->sock, 1); - } - rc = nbd_send_request(s->sock, request); - if (rc >= 0) { - ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov, - offset, request->len); - if (ret != request->len) { - rc = -EIO; - } - } - if (!s->is_unix) { - socket_set_cork(s->sock, 0); - } - } else { - rc = nbd_send_request(s->sock, request); - } - qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL, - nbd_have_request, s); - s->send_coroutine = NULL; - qemu_co_mutex_unlock(&s->send_mutex); - return rc; -} - -static void nbd_co_receive_reply(BDRVNBDState *s, struct nbd_request *request, - struct nbd_reply *reply, - QEMUIOVector *qiov, int offset) -{ - int ret; - - /* Wait until we're woken up by the read handler. TODO: perhaps - * peek at the next reply and avoid yielding if it's ours? */ - qemu_coroutine_yield(); - *reply = s->reply; - if (reply->handle != request->handle) { - reply->error = EIO; - } else { - if (qiov && reply->error == 0) { - ret = qemu_co_recvv(s->sock, qiov->iov, qiov->niov, - offset, request->len); - if (ret != request->len) { - reply->error = EIO; - } - } - - /* Tell the read handler to read another header. 
*/ - s->reply.handle = 0; - } -} - -static void nbd_coroutine_end(BDRVNBDState *s, struct nbd_request *request) -{ - int i = HANDLE_TO_INDEX(s, request->handle); - s->recv_coroutine[i] = NULL; - if (s->in_flight-- == MAX_NBD_REQUESTS) { - qemu_co_mutex_unlock(&s->free_sema); - } } -static int nbd_establish_connection(BlockDriverState *bs) +static int nbd_establish_connection(BlockDriverState *bs, Error **errp) { BDRVNBDState *s = bs->opaque; int sock; - int ret; - off_t size; - size_t blocksize; - if (s->is_unix) { - sock = unix_socket_outgoing(qemu_opt_get(s->socket_opts, "path")); + if (s->client.is_unix) { + sock = unix_connect_opts(s->socket_opts, errp, NULL, NULL); } else { - sock = tcp_socket_outgoing_opts(s->socket_opts); + sock = inet_connect_opts(s->socket_opts, errp, NULL, NULL); if (sock >= 0) { socket_set_nodelay(sock); } @@ -426,226 +242,85 @@ static int nbd_establish_connection(BlockDriverState *bs) return -errno; } - /* NBD handshake */ - ret = nbd_receive_negotiate(sock, s->export_name, &s->nbdflags, &size, - &blocksize); - if (ret < 0) { - logout("Failed to negotiate with the NBD server\n"); - closesocket(sock); - return ret; - } - - /* Now that we're connected, set the socket to be non-blocking and - * kick the reply mechanism. */ - qemu_set_nonblock(sock); - qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL, - nbd_have_request, s); - - s->sock = sock; - s->size = size; - s->blocksize = blocksize; - - logout("Established connection with NBD server\n"); - return 0; -} - -static void nbd_teardown_connection(BlockDriverState *bs) -{ - BDRVNBDState *s = bs->opaque; - struct nbd_request request; - - request.type = NBD_CMD_DISC; - request.from = 0; - request.len = 0; - nbd_send_request(s->sock, &request); - - qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL, NULL); - closesocket(s->sock); + return sock; } -static int nbd_open(BlockDriverState *bs, QDict *options, int flags) +static int nbd_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVNBDState *s = bs->opaque; - int result; - - qemu_co_mutex_init(&s->send_mutex); - qemu_co_mutex_init(&s->free_sema); + char *export = NULL; + int result, sock; + Error *local_err = NULL; /* Pop the config into our state object. Exit if invalid. */ - result = nbd_config(s, options); - if (result != 0) { - return result; + nbd_config(s, options, &export, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return -EINVAL; } /* establish TCP connection, return error if it fails * TODO: Configurable retry-until-timeout behaviour. 
*/ - result = nbd_establish_connection(bs); - - return result; -} - -static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, - int offset) -{ - BDRVNBDState *s = bs->opaque; - struct nbd_request request; - struct nbd_reply reply; - ssize_t ret; - - request.type = NBD_CMD_READ; - request.from = sector_num * 512; - request.len = nb_sectors * 512; - - nbd_coroutine_start(s, &request); - ret = nbd_co_send_request(s, &request, NULL, 0); - if (ret < 0) { - reply.error = -ret; - } else { - nbd_co_receive_reply(s, &request, &reply, qiov, offset); - } - nbd_coroutine_end(s, &request); - return -reply.error; - -} - -static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, - int offset) -{ - BDRVNBDState *s = bs->opaque; - struct nbd_request request; - struct nbd_reply reply; - ssize_t ret; - - request.type = NBD_CMD_WRITE; - if (!bdrv_enable_write_cache(bs) && (s->nbdflags & NBD_FLAG_SEND_FUA)) { - request.type |= NBD_CMD_FLAG_FUA; + sock = nbd_establish_connection(bs, errp); + if (sock < 0) { + return sock; } - request.from = sector_num * 512; - request.len = nb_sectors * 512; - - nbd_coroutine_start(s, &request); - ret = nbd_co_send_request(s, &request, qiov, offset); - if (ret < 0) { - reply.error = -ret; - } else { - nbd_co_receive_reply(s, &request, &reply, NULL, 0); - } - nbd_coroutine_end(s, &request); - return -reply.error; + /* NBD handshake */ + result = nbd_client_session_init(&s->client, bs, sock, export); + g_free(export); + return result; } -/* qemu-nbd has a limit of slightly less than 1M per request. Try to - * remain aligned to 4K. */ -#define NBD_MAX_SECTORS 2040 - static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - int offset = 0; - int ret; - while (nb_sectors > NBD_MAX_SECTORS) { - ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); - if (ret < 0) { - return ret; - } - offset += NBD_MAX_SECTORS * 512; - sector_num += NBD_MAX_SECTORS; - nb_sectors -= NBD_MAX_SECTORS; - } - return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset); + BDRVNBDState *s = bs->opaque; + + return nbd_client_session_co_readv(&s->client, sector_num, + nb_sectors, qiov); } static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - int offset = 0; - int ret; - while (nb_sectors > NBD_MAX_SECTORS) { - ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); - if (ret < 0) { - return ret; - } - offset += NBD_MAX_SECTORS * 512; - sector_num += NBD_MAX_SECTORS; - nb_sectors -= NBD_MAX_SECTORS; - } - return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset); + BDRVNBDState *s = bs->opaque; + + return nbd_client_session_co_writev(&s->client, sector_num, + nb_sectors, qiov); } static int nbd_co_flush(BlockDriverState *bs) { BDRVNBDState *s = bs->opaque; - struct nbd_request request; - struct nbd_reply reply; - ssize_t ret; - - if (!(s->nbdflags & NBD_FLAG_SEND_FLUSH)) { - return 0; - } - - request.type = NBD_CMD_FLUSH; - if (s->nbdflags & NBD_FLAG_SEND_FUA) { - request.type |= NBD_CMD_FLAG_FUA; - } - - request.from = 0; - request.len = 0; - nbd_coroutine_start(s, &request); - ret = nbd_co_send_request(s, &request, NULL, 0); - if (ret < 0) { - reply.error = -ret; - } else { - nbd_co_receive_reply(s, &request, &reply, NULL, 0); - } - nbd_coroutine_end(s, &request); - return -reply.error; + return nbd_client_session_co_flush(&s->client); } static int 
nbd_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { BDRVNBDState *s = bs->opaque; - struct nbd_request request; - struct nbd_reply reply; - ssize_t ret; - if (!(s->nbdflags & NBD_FLAG_SEND_TRIM)) { - return 0; - } - request.type = NBD_CMD_TRIM; - request.from = sector_num * 512; - request.len = nb_sectors * 512; - - nbd_coroutine_start(s, &request); - ret = nbd_co_send_request(s, &request, NULL, 0); - if (ret < 0) { - reply.error = -ret; - } else { - nbd_co_receive_reply(s, &request, &reply, NULL, 0); - } - nbd_coroutine_end(s, &request); - return -reply.error; + return nbd_client_session_co_discard(&s->client, sector_num, + nb_sectors); } static void nbd_close(BlockDriverState *bs) { BDRVNBDState *s = bs->opaque; - g_free(s->export_name); - qemu_opts_del(s->socket_opts); - nbd_teardown_connection(bs); + qemu_opts_del(s->socket_opts); + nbd_client_session_close(&s->client); } static int64_t nbd_getlength(BlockDriverState *bs) { BDRVNBDState *s = bs->opaque; - return s->size; + return s->client.size; } static BlockDriver bdrv_nbd = { diff --git a/block/nfs.c b/block/nfs.c new file mode 100644 index 000000000..98aa363e4 --- /dev/null +++ b/block/nfs.c @@ -0,0 +1,442 @@ +/* + * QEMU Block driver for native access to files on NFS shares + * + * Copyright (c) 2014 Peter Lieven <pl@kamp.de> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "config-host.h" + +#include <poll.h> +#include "qemu-common.h" +#include "qemu/config-file.h" +#include "qemu/error-report.h" +#include "block/block_int.h" +#include "trace.h" +#include "qemu/iov.h" +#include "qemu/uri.h" +#include "sysemu/sysemu.h" +#include <nfsc/libnfs.h> + +typedef struct NFSClient { + struct nfs_context *context; + struct nfsfh *fh; + int events; + bool has_zero_init; +} NFSClient; + +typedef struct NFSRPC { + int ret; + int complete; + QEMUIOVector *iov; + struct stat *st; + Coroutine *co; + QEMUBH *bh; +} NFSRPC; + +static void nfs_process_read(void *arg); +static void nfs_process_write(void *arg); + +static void nfs_set_events(NFSClient *client) +{ + int ev = nfs_which_events(client->context); + if (ev != client->events) { + qemu_aio_set_fd_handler(nfs_get_fd(client->context), + (ev & POLLIN) ? nfs_process_read : NULL, + (ev & POLLOUT) ? 
nfs_process_write : NULL, + client); + + } + client->events = ev; +} + +static void nfs_process_read(void *arg) +{ + NFSClient *client = arg; + nfs_service(client->context, POLLIN); + nfs_set_events(client); +} + +static void nfs_process_write(void *arg) +{ + NFSClient *client = arg; + nfs_service(client->context, POLLOUT); + nfs_set_events(client); +} + +static void nfs_co_init_task(NFSClient *client, NFSRPC *task) +{ + *task = (NFSRPC) { + .co = qemu_coroutine_self(), + }; +} + +static void nfs_co_generic_bh_cb(void *opaque) +{ + NFSRPC *task = opaque; + qemu_bh_delete(task->bh); + qemu_coroutine_enter(task->co, NULL); +} + +static void +nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data, + void *private_data) +{ + NFSRPC *task = private_data; + task->complete = 1; + task->ret = ret; + if (task->ret > 0 && task->iov) { + if (task->ret <= task->iov->size) { + qemu_iovec_from_buf(task->iov, 0, data, task->ret); + } else { + task->ret = -EIO; + } + } + if (task->ret == 0 && task->st) { + memcpy(task->st, data, sizeof(struct stat)); + } + if (task->ret < 0) { + error_report("NFS Error: %s", nfs_get_error(nfs)); + } + if (task->co) { + task->bh = qemu_bh_new(nfs_co_generic_bh_cb, task); + qemu_bh_schedule(task->bh); + } +} + +static int coroutine_fn nfs_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + NFSClient *client = bs->opaque; + NFSRPC task; + + nfs_co_init_task(client, &task); + task.iov = iov; + + if (nfs_pread_async(client->context, client->fh, + sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE, + nfs_co_generic_cb, &task) != 0) { + return -ENOMEM; + } + + while (!task.complete) { + nfs_set_events(client); + qemu_coroutine_yield(); + } + + if (task.ret < 0) { + return task.ret; + } + + /* zero pad short reads */ + if (task.ret < iov->size) { + qemu_iovec_memset(iov, task.ret, 0, iov->size - task.ret); + } + + return 0; +} + +static int coroutine_fn nfs_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + NFSClient *client = bs->opaque; + NFSRPC task; + char *buf = NULL; + + nfs_co_init_task(client, &task); + + buf = g_malloc(nb_sectors * BDRV_SECTOR_SIZE); + qemu_iovec_to_buf(iov, 0, buf, nb_sectors * BDRV_SECTOR_SIZE); + + if (nfs_pwrite_async(client->context, client->fh, + sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE, + buf, nfs_co_generic_cb, &task) != 0) { + g_free(buf); + return -ENOMEM; + } + + while (!task.complete) { + nfs_set_events(client); + qemu_coroutine_yield(); + } + + g_free(buf); + + if (task.ret != nb_sectors * BDRV_SECTOR_SIZE) { + return task.ret < 0 ? 
task.ret : -EIO; + } + + return 0; +} + +static int coroutine_fn nfs_co_flush(BlockDriverState *bs) +{ + NFSClient *client = bs->opaque; + NFSRPC task; + + nfs_co_init_task(client, &task); + + if (nfs_fsync_async(client->context, client->fh, nfs_co_generic_cb, + &task) != 0) { + return -ENOMEM; + } + + while (!task.complete) { + nfs_set_events(client); + qemu_coroutine_yield(); + } + + return task.ret; +} + +/* TODO Convert to fine grained options */ +static QemuOptsList runtime_opts = { + .name = "nfs", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "URL to the NFS file", + }, + { /* end of list */ } + }, +}; + +static void nfs_client_close(NFSClient *client) +{ + if (client->context) { + if (client->fh) { + nfs_close(client->context, client->fh); + } + qemu_aio_set_fd_handler(nfs_get_fd(client->context), NULL, NULL, NULL); + nfs_destroy_context(client->context); + } + memset(client, 0, sizeof(NFSClient)); +} + +static void nfs_file_close(BlockDriverState *bs) +{ + NFSClient *client = bs->opaque; + nfs_client_close(client); +} + +static int64_t nfs_client_open(NFSClient *client, const char *filename, + int flags, Error **errp) +{ + int ret = -EINVAL, i; + struct stat st; + URI *uri; + QueryParams *qp = NULL; + char *file = NULL, *strp = NULL; + + uri = uri_parse(filename); + if (!uri) { + error_setg(errp, "Invalid URL specified"); + goto fail; + } + strp = strrchr(uri->path, '/'); + if (strp == NULL) { + error_setg(errp, "Invalid URL specified"); + goto fail; + } + file = g_strdup(strp); + *strp = 0; + + client->context = nfs_init_context(); + if (client->context == NULL) { + error_setg(errp, "Failed to init NFS context"); + goto fail; + } + + qp = query_params_parse(uri->query); + for (i = 0; i < qp->n; i++) { + if (!qp->p[i].value) { + error_setg(errp, "Value for NFS parameter expected: %s", + qp->p[i].name); + goto fail; + } + if (!strncmp(qp->p[i].name, "uid", 3)) { + nfs_set_uid(client->context, atoi(qp->p[i].value)); + } else if (!strncmp(qp->p[i].name, "gid", 3)) { + nfs_set_gid(client->context, atoi(qp->p[i].value)); + } else if (!strncmp(qp->p[i].name, "tcp-syncnt", 10)) { + nfs_set_tcp_syncnt(client->context, atoi(qp->p[i].value)); + } else { + error_setg(errp, "Unknown NFS parameter name: %s", + qp->p[i].name); + goto fail; + } + } + + ret = nfs_mount(client->context, uri->server, uri->path); + if (ret < 0) { + error_setg(errp, "Failed to mount nfs share: %s", + nfs_get_error(client->context)); + goto fail; + } + + if (flags & O_CREAT) { + ret = nfs_creat(client->context, file, 0600, &client->fh); + if (ret < 0) { + error_setg(errp, "Failed to create file: %s", + nfs_get_error(client->context)); + goto fail; + } + } else { + ret = nfs_open(client->context, file, flags, &client->fh); + if (ret < 0) { + error_setg(errp, "Failed to open file : %s", + nfs_get_error(client->context)); + goto fail; + } + } + + ret = nfs_fstat(client->context, client->fh, &st); + if (ret < 0) { + error_setg(errp, "Failed to fstat file: %s", + nfs_get_error(client->context)); + goto fail; + } + + ret = DIV_ROUND_UP(st.st_size, BDRV_SECTOR_SIZE); + client->has_zero_init = S_ISREG(st.st_mode); + goto out; +fail: + nfs_client_close(client); +out: + if (qp) { + query_params_free(qp); + } + uri_free(uri); + g_free(file); + return ret; +} + +static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { + NFSClient *client = bs->opaque; + int64_t ret; + QemuOpts *opts; + Error *local_err = NULL; + + 
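/* [Editor's aside; illustrative, not part of the patch] nfs_client_open()
 * above takes libnfs-style URLs; the only query parameters it accepts are
 * uid, gid and tcp-syncnt. A hypothetical invocation:
 *
 *     qemu-img info nfs://192.168.0.1/export/vm.raw?uid=0&gid=0
 *
 * and the synchronous libnfs sequence it wraps (error handling elided,
 * host and paths hypothetical):
 *
 *     struct nfs_context *ctx = nfs_init_context();
 *     struct nfsfh *fh;
 *     nfs_mount(ctx, "192.168.0.1", "/export");
 *     nfs_open(ctx, "/vm.raw", O_RDWR, &fh);
 */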
opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + error_propagate(errp, local_err); + return -EINVAL; + } + ret = nfs_client_open(client, qemu_opt_get(opts, "filename"), + (flags & BDRV_O_RDWR) ? O_RDWR : O_RDONLY, + errp); + if (ret < 0) { + return ret; + } + bs->total_sectors = ret; + return 0; +} + +static int nfs_file_create(const char *url, QEMUOptionParameter *options, + Error **errp) +{ + int ret = 0; + int64_t total_size = 0; + NFSClient *client = g_malloc0(sizeof(NFSClient)); + + /* Read out options */ + while (options && options->name) { + if (!strcmp(options->name, "size")) { + total_size = options->value.n; + } + options++; + } + + ret = nfs_client_open(client, url, O_CREAT, errp); + if (ret < 0) { + goto out; + } + ret = nfs_ftruncate(client->context, client->fh, total_size); + nfs_client_close(client); +out: + g_free(client); + return ret; +} + +static int nfs_has_zero_init(BlockDriverState *bs) +{ + NFSClient *client = bs->opaque; + return client->has_zero_init; +} + +static int64_t nfs_get_allocated_file_size(BlockDriverState *bs) +{ + NFSClient *client = bs->opaque; + NFSRPC task = {0}; + struct stat st; + + task.st = &st; + if (nfs_fstat_async(client->context, client->fh, nfs_co_generic_cb, + &task) != 0) { + return -ENOMEM; + } + + while (!task.complete) { + nfs_set_events(client); + qemu_aio_wait(); + } + + return (task.ret < 0 ? task.ret : st.st_blocks * st.st_blksize); +} + +static int nfs_file_truncate(BlockDriverState *bs, int64_t offset) +{ + NFSClient *client = bs->opaque; + return nfs_ftruncate(client->context, client->fh, offset); +} + +static BlockDriver bdrv_nfs = { + .format_name = "nfs", + .protocol_name = "nfs", + + .instance_size = sizeof(NFSClient), + .bdrv_needs_filename = true, + .bdrv_has_zero_init = nfs_has_zero_init, + .bdrv_get_allocated_file_size = nfs_get_allocated_file_size, + .bdrv_truncate = nfs_file_truncate, + + .bdrv_file_open = nfs_file_open, + .bdrv_close = nfs_file_close, + .bdrv_create = nfs_file_create, + + .bdrv_co_readv = nfs_co_readv, + .bdrv_co_writev = nfs_co_writev, + .bdrv_co_flush_to_disk = nfs_co_flush, +}; + +static void nfs_block_init(void) +{ + bdrv_register(&bdrv_nfs); +} + +block_init(nfs_block_init); diff --git a/block/parallels.c b/block/parallels.c index 18b3ac0b2..1a5bd350b 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -49,9 +49,9 @@ typedef struct BDRVParallelsState { CoMutex lock; uint32_t *catalog_bitmap; - int catalog_size; + unsigned int catalog_size; - int tracks; + unsigned int tracks; } BDRVParallelsState; static int parallels_probe(const uint8_t *buf, int buf_size, const char *filename) @@ -68,7 +68,8 @@ static int parallels_probe(const uint8_t *buf, int buf_size, const char *filenam return 0; } -static int parallels_open(BlockDriverState *bs, QDict *options, int flags) +static int parallels_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVParallelsState *s = bs->opaque; int i; @@ -84,15 +85,26 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags) if (memcmp(ph.magic, HEADER_MAGIC, 16) || (le32_to_cpu(ph.version) != HEADER_VERSION)) { - ret = -EMEDIUMTYPE; + error_setg(errp, "Image not in Parallels format"); + ret = -EINVAL; goto fail; } bs->total_sectors = le32_to_cpu(ph.nb_sectors); s->tracks = le32_to_cpu(ph.tracks); + if (s->tracks == 0) { + error_setg(errp, "Invalid image: Zero sectors per track"); + ret = -EINVAL; + goto fail; + } 
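/* [Editor's aside; illustrative, not part of the patch] The INT_MAX / 4 bound
 * added below protects the subsequent g_malloc(s->catalog_size * 4) and
 * bdrv_pread(..., s->catalog_size * 4) from integer overflow when a corrupt
 * or crafted Parallels header advertises a huge catalog. The guard, spelled
 * out:
 *
 *     uint32_t entries = le32_to_cpu(ph.catalog_entries);
 *     if (entries > INT_MAX / 4) {   // 4 bytes per catalog bitmap entry
 *         return -EFBIG;             // reject instead of wrapping the size
 *     }
 */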
s->catalog_size = le32_to_cpu(ph.catalog_entries); + if (s->catalog_size > INT_MAX / 4) { + error_setg(errp, "Catalog too large"); + ret = -EFBIG; + goto fail; + } s->catalog_bitmap = g_malloc(s->catalog_size * 4); ret = bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4); diff --git a/block/qapi.c b/block/qapi.c index a4bc4113b..8f2b4dbe7 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -25,6 +25,63 @@ #include "block/qapi.h" #include "block/block_int.h" #include "qmp-commands.h" +#include "qapi-visit.h" +#include "qapi/qmp-output-visitor.h" +#include "qapi/qmp/types.h" + +BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) +{ + BlockDeviceInfo *info = g_malloc0(sizeof(*info)); + + info->file = g_strdup(bs->filename); + info->ro = bs->read_only; + info->drv = g_strdup(bs->drv->format_name); + info->encrypted = bs->encrypted; + info->encryption_key_missing = bdrv_key_required(bs); + + if (bs->node_name[0]) { + info->has_node_name = true; + info->node_name = g_strdup(bs->node_name); + } + + if (bs->backing_file[0]) { + info->has_backing_file = true; + info->backing_file = g_strdup(bs->backing_file); + } + + info->backing_file_depth = bdrv_get_backing_file_depth(bs); + + if (bs->io_limits_enabled) { + ThrottleConfig cfg; + throttle_get_config(&bs->throttle_state, &cfg); + info->bps = cfg.buckets[THROTTLE_BPS_TOTAL].avg; + info->bps_rd = cfg.buckets[THROTTLE_BPS_READ].avg; + info->bps_wr = cfg.buckets[THROTTLE_BPS_WRITE].avg; + + info->iops = cfg.buckets[THROTTLE_OPS_TOTAL].avg; + info->iops_rd = cfg.buckets[THROTTLE_OPS_READ].avg; + info->iops_wr = cfg.buckets[THROTTLE_OPS_WRITE].avg; + + info->has_bps_max = cfg.buckets[THROTTLE_BPS_TOTAL].max; + info->bps_max = cfg.buckets[THROTTLE_BPS_TOTAL].max; + info->has_bps_rd_max = cfg.buckets[THROTTLE_BPS_READ].max; + info->bps_rd_max = cfg.buckets[THROTTLE_BPS_READ].max; + info->has_bps_wr_max = cfg.buckets[THROTTLE_BPS_WRITE].max; + info->bps_wr_max = cfg.buckets[THROTTLE_BPS_WRITE].max; + + info->has_iops_max = cfg.buckets[THROTTLE_OPS_TOTAL].max; + info->iops_max = cfg.buckets[THROTTLE_OPS_TOTAL].max; + info->has_iops_rd_max = cfg.buckets[THROTTLE_OPS_READ].max; + info->iops_rd_max = cfg.buckets[THROTTLE_OPS_READ].max; + info->has_iops_wr_max = cfg.buckets[THROTTLE_OPS_WRITE].max; + info->iops_wr_max = cfg.buckets[THROTTLE_OPS_WRITE].max; + + info->has_iops_size = cfg.op_size; + info->iops_size = cfg.op_size; + } + + return info; +} /* * Returns 0 on success, with *p_list either set to describe snapshot @@ -134,6 +191,9 @@ void bdrv_query_image_info(BlockDriverState *bs, info->dirty_flag = bdi.is_dirty; info->has_dirty_flag = true; } + info->format_specific = bdrv_get_specific_info(bs); + info->has_format_specific = info->format_specific != NULL; + backing_filename = bs->backing_file; if (backing_filename[0] != '\0') { info->backing_filename = g_strdup(backing_filename); @@ -198,50 +258,20 @@ void bdrv_query_info(BlockDriverState *bs, info->io_status = bs->iostatus; } - if (bs->dirty_bitmap) { - info->has_dirty = true; - info->dirty = g_malloc0(sizeof(*info->dirty)); - info->dirty->count = bdrv_get_dirty_count(bs) * BDRV_SECTOR_SIZE; - info->dirty->granularity = - ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bs->dirty_bitmap)); + if (!QLIST_EMPTY(&bs->dirty_bitmaps)) { + info->has_dirty_bitmaps = true; + info->dirty_bitmaps = bdrv_query_dirty_bitmaps(bs); } if (bs->drv) { info->has_inserted = true; - info->inserted = g_malloc0(sizeof(*info->inserted)); - info->inserted->file = g_strdup(bs->filename); - 
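/* [Editor's aside; illustrative, not part of the patch] The new
 * bdrv_block_device_info() above follows the QAPI optional-member convention:
 * every optional field f travels as a (has_f, f) pair, so an unconfigured
 * throttle bucket (max == 0) is simply reported as absent:
 *
 *     info->has_bps_max = cfg.buckets[THROTTLE_BPS_TOTAL].max;  // 0 => absent
 *     info->bps_max     = cfg.buckets[THROTTLE_BPS_TOTAL].max;
 */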
info->inserted->ro = bs->read_only; - info->inserted->drv = g_strdup(bs->drv->format_name); - info->inserted->encrypted = bs->encrypted; - info->inserted->encryption_key_missing = bdrv_key_required(bs); - - if (bs->backing_file[0]) { - info->inserted->has_backing_file = true; - info->inserted->backing_file = g_strdup(bs->backing_file); - } - - info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs); - - if (bs->io_limits_enabled) { - info->inserted->bps = - bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; - info->inserted->bps_rd = - bs->io_limits.bps[BLOCK_IO_LIMIT_READ]; - info->inserted->bps_wr = - bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE]; - info->inserted->iops = - bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; - info->inserted->iops_rd = - bs->io_limits.iops[BLOCK_IO_LIMIT_READ]; - info->inserted->iops_wr = - bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE]; - } + info->inserted = bdrv_block_device_info(bs); bs0 = bs; p_image_info = &info->inserted->image; while (1) { bdrv_query_image_info(bs0, p_image_info, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { error_propagate(errp, local_err); goto err; } @@ -289,6 +319,11 @@ BlockStats *bdrv_query_stats(const BlockDriverState *bs) s->parent = bdrv_query_stats(bs->file); } + if (bs->backing_hd) { + s->has_backing = true; + s->backing = bdrv_query_stats(bs->backing_hd); + } + return s; } @@ -301,7 +336,7 @@ BlockInfoList *qmp_query_block(Error **errp) while ((bs = bdrv_next(bs))) { BlockInfoList *info = g_malloc0(sizeof(*info)); bdrv_query_info(bs, &info->value, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { error_propagate(errp, local_err); goto err; } @@ -397,6 +432,119 @@ void bdrv_snapshot_dump(fprintf_function func_fprintf, void *f, } } +static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation, + QDict *dict); +static void dump_qlist(fprintf_function func_fprintf, void *f, int indentation, + QList *list); + +static void dump_qobject(fprintf_function func_fprintf, void *f, + int comp_indent, QObject *obj) +{ + switch (qobject_type(obj)) { + case QTYPE_QINT: { + QInt *value = qobject_to_qint(obj); + func_fprintf(f, "%" PRId64, qint_get_int(value)); + break; + } + case QTYPE_QSTRING: { + QString *value = qobject_to_qstring(obj); + func_fprintf(f, "%s", qstring_get_str(value)); + break; + } + case QTYPE_QDICT: { + QDict *value = qobject_to_qdict(obj); + dump_qdict(func_fprintf, f, comp_indent, value); + break; + } + case QTYPE_QLIST: { + QList *value = qobject_to_qlist(obj); + dump_qlist(func_fprintf, f, comp_indent, value); + break; + } + case QTYPE_QFLOAT: { + QFloat *value = qobject_to_qfloat(obj); + func_fprintf(f, "%g", qfloat_get_double(value)); + break; + } + case QTYPE_QBOOL: { + QBool *value = qobject_to_qbool(obj); + func_fprintf(f, "%s", qbool_get_int(value) ? "true" : "false"); + break; + } + case QTYPE_QERROR: { + QString *value = qerror_human((QError *)obj); + func_fprintf(f, "%s", qstring_get_str(value)); + break; + } + case QTYPE_NONE: + break; + case QTYPE_MAX: + default: + abort(); + } +} + +static void dump_qlist(fprintf_function func_fprintf, void *f, int indentation, + QList *list) +{ + const QListEntry *entry; + int i = 0; + + for (entry = qlist_first(list); entry; entry = qlist_next(entry), i++) { + qtype_code type = qobject_type(entry->value); + bool composite = (type == QTYPE_QDICT || type == QTYPE_QLIST); + const char *format = composite ? 
"%*s[%i]:\n" : "%*s[%i]: "; + + func_fprintf(f, format, indentation * 4, "", i); + dump_qobject(func_fprintf, f, indentation + 1, entry->value); + if (!composite) { + func_fprintf(f, "\n"); + } + } +} + +static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation, + QDict *dict) +{ + const QDictEntry *entry; + + for (entry = qdict_first(dict); entry; entry = qdict_next(dict, entry)) { + qtype_code type = qobject_type(entry->value); + bool composite = (type == QTYPE_QDICT || type == QTYPE_QLIST); + const char *format = composite ? "%*s%s:\n" : "%*s%s: "; + char key[strlen(entry->key) + 1]; + int i; + + /* replace dashes with spaces in key (variable) names */ + for (i = 0; entry->key[i]; i++) { + key[i] = entry->key[i] == '-' ? ' ' : entry->key[i]; + } + key[i] = 0; + + func_fprintf(f, format, indentation * 4, "", key); + dump_qobject(func_fprintf, f, indentation + 1, entry->value); + if (!composite) { + func_fprintf(f, "\n"); + } + } +} + +void bdrv_image_info_specific_dump(fprintf_function func_fprintf, void *f, + ImageInfoSpecific *info_spec) +{ + Error *local_err = NULL; + QmpOutputVisitor *ov = qmp_output_visitor_new(); + QObject *obj, *data; + + visit_type_ImageInfoSpecific(qmp_output_get_visitor(ov), &info_spec, NULL, + &local_err); + obj = qmp_output_get_qobject(ov); + assert(qobject_type(obj) == QTYPE_QDICT); + data = qdict_get(qobject_to_qdict(obj), "data"); + dump_qobject(func_fprintf, f, 1, data); + qmp_output_visitor_cleanup(ov); +} + void bdrv_image_info_dump(fprintf_function func_fprintf, void *f, ImageInfo *info) { @@ -467,4 +615,9 @@ void bdrv_image_info_dump(fprintf_function func_fprintf, void *f, func_fprintf(f, "\n"); } } + + if (info->has_format_specific) { + func_fprintf(f, "Format specific information:\n"); + bdrv_image_info_specific_dump(func_fprintf, f, info->format_specific); + } } diff --git a/block/qcow.c b/block/qcow.c index 5239bd68f..d5a7d5fd1 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -92,7 +92,8 @@ static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int qcow_open(BlockDriverState *bs, QDict *options, int flags) +static int qcow_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVQcowState *s = bs->opaque; int len, i, shift, ret; @@ -112,23 +113,26 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags) be64_to_cpus(&header.l1_table_offset); if (header.magic != QCOW_MAGIC) { - ret = -EMEDIUMTYPE; + error_setg(errp, "Image not in qcow format"); + ret = -EINVAL; goto fail; } if (header.version != QCOW_VERSION) { char version[64]; snprintf(version, sizeof(version), "QCOW version %d", header.version); - qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bs->device_name, "qcow", version); + error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bs->device_name, "qcow", version); ret = -ENOTSUP; goto fail; } if (header.size <= 1 || header.cluster_bits < 9) { + error_setg(errp, "invalid value in qcow header"); ret = -EINVAL; goto fail; } if (header.crypt_method > QCOW_CRYPT_AES) { + error_setg(errp, "invalid encryption method in qcow header"); ret = -EINVAL; goto fail; } @@ -395,7 +399,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, return cluster_offset; } -static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { BDRVQcowState *s = bs->opaque; @@ -410,7 +414,14 @@ static int coroutine_fn 
qcow_co_is_allocated(BlockDriverState *bs, if (n > nb_sectors) n = nb_sectors; *pnum = n; - return (cluster_offset != 0); + if (!cluster_offset) { + return 0; + } + if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->crypt_method) { + return BDRV_BLOCK_DATA; + } + cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | cluster_offset; } static int decompress_buffer(uint8_t *out_buf, int out_buf_size, @@ -651,7 +662,8 @@ static void qcow_close(BlockDriverState *bs) error_free(s->migration_blocker); } -static int qcow_create(const char *filename, QEMUOptionParameter *options) +static int qcow_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int header_size, backing_filename_len, l1_size, shift, i; QCowHeader header; @@ -659,6 +671,7 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) int64_t total_size = 0; const char *backing_file = NULL; int flags = 0; + Error *local_err = NULL; int ret; BlockDriverState *qcow_bs; @@ -674,13 +687,17 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) options++; } - ret = bdrv_create_file(filename, options); + ret = bdrv_create_file(filename, options, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } - ret = bdrv_file_open(&qcow_bs, filename, NULL, BDRV_O_RDWR); + qcow_bs = NULL; + ret = bdrv_open(&qcow_bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } @@ -706,7 +723,7 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) backing_file = NULL; } header.cluster_bits = 9; /* 512 byte cluster to avoid copying - unmodifyed sectors */ + unmodified sectors */ header.l2_bits = 12; /* 32 KB L2 tables */ } else { header.cluster_bits = 12; /* 4 KB clusters */ @@ -751,7 +768,7 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) g_free(tmp); ret = 0; exit: - bdrv_delete(qcow_bs); + bdrv_unref(qcow_bs); return ret; } @@ -896,7 +913,7 @@ static BlockDriver bdrv_qcow = { .bdrv_co_readv = qcow_co_readv, .bdrv_co_writev = qcow_co_writev, - .bdrv_co_is_allocated = qcow_co_is_allocated, + .bdrv_co_get_block_status = qcow_co_get_block_status, .bdrv_set_key = qcow_set_key, .bdrv_make_empty = qcow_make_empty, diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c index 2f3114ecc..8ecbb5bc0 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -115,6 +115,21 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) } if (c == s->refcount_block_cache) { + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_BLOCK, + c->entries[i].offset, s->cluster_size); + } else if (c == s->l2_table_cache) { + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2, + c->entries[i].offset, s->cluster_size); + } else { + ret = qcow2_pre_write_overlap_check(bs, 0, + c->entries[i].offset, s->cluster_size); + } + + if (ret < 0) { + return ret; + } + + if (c == s->refcount_block_cache) { BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART); } else if (c == s->l2_table_cache) { BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); @@ -185,6 +200,24 @@ void qcow2_cache_depends_on_flush(Qcow2Cache *c) c->depends_on_flush = true; } +int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c) +{ + int ret, i; + + ret = qcow2_cache_flush(bs, c); + if (ret < 0) { + return ret; + } + + for (i = 0; i < c->size; i++) { + assert(c->entries[i].ref == 0); + c->entries[i].offset = 0; + 
c->entries[i].cache_hits = 0; + } + + return 0; +} + static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c) { int i; diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index cca76d4fc..331ab0802 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -35,6 +35,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, BDRVQcowState *s = bs->opaque; int new_l1_size2, ret, i; uint64_t *new_l1_table; + int64_t old_l1_table_offset, old_l1_size; int64_t new_l1_table_offset, new_l1_size; uint8_t data[12]; @@ -54,7 +55,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, } } - if (new_l1_size > INT_MAX) { + if (new_l1_size > INT_MAX / sizeof(uint64_t)) { return -EFBIG; } @@ -80,6 +81,14 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, goto fail; } + /* the L1 position has not yet been updated, so these clusters must + * indeed be completely free */ + ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset, + new_l1_size2); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); for(i = 0; i < s->l1_size; i++) new_l1_table[i] = cpu_to_be64(new_l1_table[i]); @@ -92,17 +101,19 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, /* set new table */ BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE); cpu_to_be32w((uint32_t*)data, new_l1_size); - cpu_to_be64wu((uint64_t*)(data + 4), new_l1_table_offset); + stq_be_p(data + 4, new_l1_table_offset); ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data,sizeof(data)); if (ret < 0) { goto fail; } g_free(s->l1_table); - qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t), - QCOW2_DISCARD_OTHER); + old_l1_table_offset = s->l1_table_offset; s->l1_table_offset = new_l1_table_offset; s->l1_table = new_l1_table; + old_l1_size = s->l1_size; s->l1_size = new_l1_size; + qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t), + QCOW2_DISCARD_OTHER); return 0; fail: g_free(new_l1_table); @@ -137,7 +148,7 @@ static int l2_load(BlockDriverState *bs, uint64_t l2_offset, * and we really don't want bdrv_pread to perform a read-modify-write) */ #define L1_ENTRIES_PER_SECTOR (512 / 8) -static int write_l1_entry(BlockDriverState *bs, int l1_index) +int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index) { BDRVQcowState *s = bs->opaque; uint64_t buf[L1_ENTRIES_PER_SECTOR]; @@ -149,6 +160,12 @@ static int write_l1_entry(BlockDriverState *bs, int l1_index) buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]); } + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1, + s->l1_table_offset + 8 * l1_start_index, sizeof(buf)); + if (ret < 0) { + return ret; + } + BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index, buf, sizeof(buf)); @@ -173,7 +190,7 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) { BDRVQcowState *s = bs->opaque; uint64_t old_l2_offset; - uint64_t *l2_table; + uint64_t *l2_table = NULL; int64_t l2_offset; int ret; @@ -185,7 +202,8 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t)); if (l2_offset < 0) { - return l2_offset; + ret = l2_offset; + goto fail; } ret = qcow2_cache_flush(bs, s->refcount_block_cache); @@ -198,7 +216,7 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) trace_qcow2_l2_allocate_get_empty(bs, l1_index); ret = qcow2_cache_get_empty(bs, 
s->l2_table_cache, l2_offset, (void**) table); if (ret < 0) { - return ret; + goto fail; } l2_table = *table; @@ -239,7 +257,7 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) /* update the L1 entry */ trace_qcow2_l2_allocate_write_l1(bs, l1_index); s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED; - ret = write_l1_entry(bs, l1_index); + ret = qcow2_write_l1_entry(bs, l1_index); if (ret < 0) { goto fail; } @@ -250,8 +268,14 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) fail: trace_qcow2_l2_allocate_done(bs, l1_index, ret); - qcow2_cache_put(bs, s->l2_table_cache, (void**) table); + if (l2_table != NULL) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) table); + } s->l1_table[l1_index] = old_l2_offset; + if (l2_offset > 0) { + qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t), + QCOW2_DISCARD_ALWAYS); + } return ret; } @@ -263,23 +287,26 @@ fail: * cluster which may require a different handling) */ static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size, - uint64_t *l2_table, uint64_t start, uint64_t stop_flags) + uint64_t *l2_table, uint64_t stop_flags) { int i; - uint64_t mask = stop_flags | L2E_OFFSET_MASK; - uint64_t offset = be64_to_cpu(l2_table[0]) & mask; + uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED; + uint64_t first_entry = be64_to_cpu(l2_table[0]); + uint64_t offset = first_entry & mask; if (!offset) return 0; - for (i = start; i < start + nb_clusters; i++) { + assert(qcow2_get_cluster_type(first_entry) != QCOW2_CLUSTER_COMPRESSED); + + for (i = 0; i < nb_clusters; i++) { uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask; if (offset + (uint64_t) i * cluster_size != l2_entry) { break; } } - return (i - start); + return i; } static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table) @@ -332,15 +359,6 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, struct iovec iov; int n, ret; - /* - * If this is the last cluster and it is only partially used, we must only - * copy until the end of the image, or bdrv_check_request will fail for the - * bdrv_read/write calls below. - */ - if (start_sect + n_end > bs->total_sectors) { - n_end = bs->total_sectors - start_sect; - } - n = n_end - n_start; if (n <= 0) { return 0; @@ -353,6 +371,10 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); + if (!bs->drv) { + return -ENOMEDIUM; + } + /* Call .bdrv_co_readv() directly instead of using the public block-layer * interface. This avoids double I/O throttling and request tracking, * which can lead to deadlock when block layer copy-on-read is enabled. 
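The count_contiguous_clusters() rework above boils down to one rule: a run of L2 entries is contiguous as long as the masked bits (host offset, compression flag, plus any caller-supplied stop flags) keep lining up with "first offset + i * cluster_size". A minimal standalone sketch of that rule, assuming the entries have already been byte-swapped to host order and with the mask constants inlined from qcow2.h (count_contiguous is an illustrative name, not the QEMU function itself):

    #include <stdint.h>

    #define QCOW_OFLAG_COMPRESSED (1ULL << 62)          /* as in qcow2.h */
    #define L2E_OFFSET_MASK       0x00fffffffffffe00ULL /* as in qcow2.h */

    /* Count how many consecutive L2 entries map physically contiguous
     * clusters.  Compressed clusters always terminate the run, because
     * QCOW_OFLAG_COMPRESSED is unconditionally part of the mask. */
    static int count_contiguous(uint64_t nb_clusters, int cluster_size,
                                const uint64_t *l2_table, uint64_t stop_flags)
    {
        uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED;
        uint64_t offset = l2_table[0] & mask;  /* assumed host byte order */
        uint64_t i;

        if (!offset) {
            return 0;
        }
        for (i = 0; i < nb_clusters; i++) {
            if (offset + i * (uint64_t)cluster_size != (l2_table[i] & mask)) {
                break;
            }
        }
        return (int)i;
    }

The real helper additionally asserts that the first entry is not compressed; compressed runs are handled separately by its callers.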
@@ -368,6 +390,12 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, &s->aes_encrypt_key); } + ret = qcow2_pre_write_overlap_check(bs, 0, + cluster_offset + n_start * BDRV_SECTOR_SIZE, n * BDRV_SECTOR_SIZE); + if (ret < 0) { + goto out; + } + BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov); if (ret < 0) { @@ -463,11 +491,11 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, break; case QCOW2_CLUSTER_ZERO: if (s->qcow_version < 3) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); return -EIO; } c = count_contiguous_clusters(nb_clusters, s->cluster_size, - &l2_table[l2_index], 0, - QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); + &l2_table[l2_index], QCOW_OFLAG_ZERO); *cluster_offset = 0; break; case QCOW2_CLUSTER_UNALLOCATED: @@ -478,8 +506,7 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, case QCOW2_CLUSTER_NORMAL: /* how many allocated clusters ? */ c = count_contiguous_clusters(nb_clusters, s->cluster_size, - &l2_table[l2_index], 0, - QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); + &l2_table[l2_index], QCOW_OFLAG_ZERO); *cluster_offset &= L2E_OFFSET_MASK; break; default: @@ -695,6 +722,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) } qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + assert(l2_index + m->nb_clusters <= s->l2_size); for (i = 0; i < m->nb_clusters; i++) { /* if two concurrent writes happen to the same unallocated cluster * each write allocates separate cluster and writes data concurrently. @@ -908,7 +936,7 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, /* We keep all QCOW_OFLAG_COPIED clusters */ keep_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size, - &l2_table[l2_index], 0, + &l2_table[l2_index], QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO); assert(keep_clusters <= nb_clusters); @@ -1150,7 +1178,7 @@ fail: * Return 0 on success and -errno in error cases */ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, - int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m) + int *num, uint64_t *host_offset, QCowL2Meta **m) { BDRVQcowState *s = bs->opaque; uint64_t start, remaining; @@ -1158,15 +1186,13 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, uint64_t cur_bytes; int ret; - trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, - n_start, n_end); + trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *num); - assert(n_start * BDRV_SECTOR_SIZE == offset_into_cluster(s, offset)); - offset = start_of_cluster(s, offset); + assert((offset & ~BDRV_SECTOR_MASK) == 0); again: - start = offset + (n_start << BDRV_SECTOR_BITS); - remaining = (n_end - n_start) << BDRV_SECTOR_BITS; + start = offset; + remaining = *num << BDRV_SECTOR_BITS; cluster_offset = 0; *host_offset = 0; cur_bytes = 0; @@ -1252,7 +1278,7 @@ again: } } - *num = (n_end - n_start) - (remaining >> BDRV_SECTOR_BITS); + *num -= remaining >> BDRV_SECTOR_BITS; assert(*num > 0); assert(*host_offset != 0); @@ -1317,7 +1343,7 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) * clusters. 
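 * (Partial clusters at either edge are never discarded: the caller,
 * qcow2_discard_clusters(), first rounds the start up and the end down
 * to cluster boundaries.  With the default 64 KiB clusters, a request
 * covering [10 KiB, 200 KiB) thus shrinks to [64 KiB, 192 KiB), i.e.
 * clusters 1 and 2 only.)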
*/ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, - unsigned int nb_clusters) + unsigned int nb_clusters, enum qcow2_discard_type type) { BDRVQcowState *s = bs->opaque; uint64_t *l2_table; @@ -1337,16 +1363,34 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, uint64_t old_offset; old_offset = be64_to_cpu(l2_table[l2_index + i]); - if ((old_offset & L2E_OFFSET_MASK) == 0) { + + /* + * Make sure that a discarded area reads back as zeroes for v3 images + * (we cannot do it for v2 without actually writing a zero-filled + * buffer). We can skip the operation if the cluster is already marked + * as zero, or if it's unallocated and we don't have a backing file. + * + * TODO We might want to use bdrv_get_block_status(bs) here, but we're + * holding s->lock, so that doesn't work today. + */ + if (old_offset & QCOW_OFLAG_ZERO) { + continue; + } + + if ((old_offset & L2E_OFFSET_MASK) == 0 && !bs->backing_hd) { continue; } /* First remove L2 entries */ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); - l2_table[l2_index + i] = cpu_to_be64(0); + if (s->qcow_version >= 3) { + l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); + } else { + l2_table[l2_index + i] = cpu_to_be64(0); + } /* Then decrease the refcount */ - qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); + qcow2_free_any_clusters(bs, old_offset, 1, type); } ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); @@ -1358,7 +1402,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, } int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, - int nb_sectors) + int nb_sectors, enum qcow2_discard_type type) { BDRVQcowState *s = bs->opaque; uint64_t end_offset; @@ -1369,7 +1413,7 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, /* Round start up and end down */ offset = align_offset(offset, s->cluster_size); - end_offset &= ~(s->cluster_size - 1); + end_offset = start_of_cluster(s, end_offset); if (offset > end_offset) { return 0; @@ -1381,7 +1425,7 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, /* Each L2 table is handled by its own loop iteration */ while (nb_clusters > 0) { - ret = discard_single_l2(bs, offset, nb_clusters); + ret = discard_single_l2(bs, offset, nb_clusters, type); if (ret < 0) { goto fail; } @@ -1476,3 +1520,255 @@ fail: return ret; } + +/* + * Expands all zero clusters in a specific L1 table (or deallocates them, for + * non-backed non-pre-allocated zero clusters). + * + * expanded_clusters is a bitmap where every bit corresponds to one cluster in + * the image file; a bit gets set if the corresponding cluster has been used for + * zero expansion (i.e., has been filled with zeroes and is referenced from an + * L2 table). nb_clusters contains the total cluster count of the image file, + * i.e., the number of bits in expanded_clusters. 
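+ *
+ * With this layout, testing and setting the bit for a given cluster_index
+ * is plain byte/bit arithmetic, exactly as done in the function body:
+ *
+ *     if ((*expanded_clusters)[cluster_index / 8] &
+ *             (1 << (cluster_index % 8))) {
+ *         ... this cluster has already been zero-expanded ...
+ *     }
+ *     (*expanded_clusters)[cluster_index / 8] |= 1 << (cluster_index % 8);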
+ */ +static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, + int l1_size, uint8_t **expanded_clusters, + uint64_t *nb_clusters) +{ + BDRVQcowState *s = bs->opaque; + bool is_active_l1 = (l1_table == s->l1_table); + uint64_t *l2_table = NULL; + int ret; + int i, j; + + if (!is_active_l1) { + /* inactive L2 tables require a buffer to be stored in when loading + * them from disk */ + l2_table = qemu_blockalign(bs, s->cluster_size); + } + + for (i = 0; i < l1_size; i++) { + uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK; + bool l2_dirty = false; + + if (!l2_offset) { + /* unallocated */ + continue; + } + + if (is_active_l1) { + /* get active L2 tables from cache */ + ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, + (void **)&l2_table); + } else { + /* load inactive L2 tables from disk */ + ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE, + (void *)l2_table, s->cluster_sectors); + } + if (ret < 0) { + goto fail; + } + + for (j = 0; j < s->l2_size; j++) { + uint64_t l2_entry = be64_to_cpu(l2_table[j]); + int64_t offset = l2_entry & L2E_OFFSET_MASK, cluster_index; + int cluster_type = qcow2_get_cluster_type(l2_entry); + bool preallocated = offset != 0; + + if (cluster_type == QCOW2_CLUSTER_NORMAL) { + cluster_index = offset >> s->cluster_bits; + assert((cluster_index >= 0) && (cluster_index < *nb_clusters)); + if ((*expanded_clusters)[cluster_index / 8] & + (1 << (cluster_index % 8))) { + /* Probably a shared L2 table; this cluster was a zero + * cluster which has been expanded, its refcount + * therefore most likely requires an update. */ + ret = qcow2_update_cluster_refcount(bs, cluster_index, 1, + QCOW2_DISCARD_NEVER); + if (ret < 0) { + goto fail; + } + /* Since we just increased the refcount, the COPIED flag may + * no longer be set. 
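+ * QCOW_OFLAG_COPIED is only valid while the refcount is exactly 1; after
+ * the increment above it is at least 2, so the flag has to be cleared.
+ * This is the same invariant that check_oflag_copied() in
+ * qcow2-refcount.c verifies and repairs.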
*/ + l2_table[j] = cpu_to_be64(l2_entry & ~QCOW_OFLAG_COPIED); + l2_dirty = true; + } + continue; + } + else if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_ZERO) { + continue; + } + + if (!preallocated) { + if (!bs->backing_hd) { + /* not backed; therefore we can simply deallocate the + * cluster */ + l2_table[j] = 0; + l2_dirty = true; + continue; + } + + offset = qcow2_alloc_clusters(bs, s->cluster_size); + if (offset < 0) { + ret = offset; + goto fail; + } + } + + ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size); + if (ret < 0) { + if (!preallocated) { + qcow2_free_clusters(bs, offset, s->cluster_size, + QCOW2_DISCARD_ALWAYS); + } + goto fail; + } + + ret = bdrv_write_zeroes(bs->file, offset / BDRV_SECTOR_SIZE, + s->cluster_sectors, 0); + if (ret < 0) { + if (!preallocated) { + qcow2_free_clusters(bs, offset, s->cluster_size, + QCOW2_DISCARD_ALWAYS); + } + goto fail; + } + + l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED); + l2_dirty = true; + + cluster_index = offset >> s->cluster_bits; + + if (cluster_index >= *nb_clusters) { + uint64_t old_bitmap_size = (*nb_clusters + 7) / 8; + uint64_t new_bitmap_size; + /* The offset may lie beyond the old end of the underlying image + * file for growable files only */ + assert(bs->file->growable); + *nb_clusters = size_to_clusters(s, bs->file->total_sectors * + BDRV_SECTOR_SIZE); + new_bitmap_size = (*nb_clusters + 7) / 8; + *expanded_clusters = g_realloc(*expanded_clusters, + new_bitmap_size); + /* clear the newly allocated space */ + memset(&(*expanded_clusters)[old_bitmap_size], 0, + new_bitmap_size - old_bitmap_size); + } + + assert((cluster_index >= 0) && (cluster_index < *nb_clusters)); + (*expanded_clusters)[cluster_index / 8] |= 1 << (cluster_index % 8); + } + + if (is_active_l1) { + if (l2_dirty) { + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + qcow2_cache_depends_on_flush(s->l2_table_cache); + } + ret = qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table); + if (ret < 0) { + l2_table = NULL; + goto fail; + } + } else { + if (l2_dirty) { + ret = qcow2_pre_write_overlap_check(bs, + QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset, + s->cluster_size); + if (ret < 0) { + goto fail; + } + + ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE, + (void *)l2_table, s->cluster_sectors); + if (ret < 0) { + goto fail; + } + } + } + } + + ret = 0; + +fail: + if (l2_table) { + if (!is_active_l1) { + qemu_vfree(l2_table); + } else { + if (ret < 0) { + qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table); + } else { + ret = qcow2_cache_put(bs, s->l2_table_cache, + (void **)&l2_table); + } + } + } + return ret; +} + +/* + * For backed images, expands all zero clusters on the image. For non-backed + * images, deallocates all non-pre-allocated zero clusters (and claims the + * allocation for pre-allocated ones). This is important for downgrading to a + * qcow2 version which doesn't yet support metadata zero clusters. 
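+ *
+ * A typical trigger for this function is the image downgrade path, e.g.
+ *
+ *     qemu-img amend -o compat=0.10 image.qcow2
+ *
+ * since the v2 (compat=0.10) format has no zero-cluster L2 entry type.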
+ */ +int qcow2_expand_zero_clusters(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l1_table = NULL; + uint64_t nb_clusters; + uint8_t *expanded_clusters; + int ret; + int i, j; + + nb_clusters = size_to_clusters(s, bs->file->total_sectors * + BDRV_SECTOR_SIZE); + expanded_clusters = g_malloc0((nb_clusters + 7) / 8); + + ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size, + &expanded_clusters, &nb_clusters); + if (ret < 0) { + goto fail; + } + + /* Inactive L1 tables may point to active L2 tables - therefore it is + * necessary to flush the L2 table cache before trying to access the L2 + * tables pointed to by inactive L1 entries (else we might try to expand + * zero clusters that have already been expanded); furthermore, it is also + * necessary to empty the L2 table cache, since it may contain tables which + * are now going to be modified directly on disk, bypassing the cache. + * qcow2_cache_empty() does both for us. */ + ret = qcow2_cache_empty(bs, s->l2_table_cache); + if (ret < 0) { + goto fail; + } + + for (i = 0; i < s->nb_snapshots; i++) { + int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) + + BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE; + + l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE); + + ret = bdrv_read(bs->file, s->snapshots[i].l1_table_offset / + BDRV_SECTOR_SIZE, (void *)l1_table, l1_sectors); + if (ret < 0) { + goto fail; + } + + for (j = 0; j < s->snapshots[i].l1_size; j++) { + be64_to_cpus(&l1_table[j]); + } + + ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size, + &expanded_clusters, &nb_clusters); + if (ret < 0) { + goto fail; + } + } + + ret = 0; + +fail: + g_free(expanded_clusters); + g_free(l1_table); + return ret; +} diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 1244693f3..a37ee4501 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -25,8 +25,10 @@ #include "qemu-common.h" #include "block/block_int.h" #include "block/qcow2.h" +#include "qemu/range.h" +#include "qapi/qmp/types.h" -static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size); +static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size); static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, int64_t offset, int64_t length, int addend, enum qcow2_discard_type type); @@ -38,8 +40,10 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, int qcow2_refcount_init(BlockDriverState *bs) { BDRVQcowState *s = bs->opaque; - int ret, refcount_table_size2, i; + unsigned int refcount_table_size2, i; + int ret; + assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t)); refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t); s->refcount_table = g_malloc(refcount_table_size2); if (s->refcount_table_size > 0) { @@ -85,7 +89,7 @@ static int load_refcount_block(BlockDriverState *bs, static int get_refcount(BlockDriverState *bs, int64_t cluster_index) { BDRVQcowState *s = bs->opaque; - int refcount_table_index, block_index; + uint64_t refcount_table_index, block_index; int64_t refcount_block_offset; int ret; uint16_t *refcount_block; @@ -94,7 +98,8 @@ static int get_refcount(BlockDriverState *bs, int64_t cluster_index) refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); if (refcount_table_index >= s->refcount_table_size) return 0; - refcount_block_offset = s->refcount_table[refcount_table_index]; + refcount_block_offset = + s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK; if 
(!refcount_block_offset) return 0; @@ -189,10 +194,11 @@ static int alloc_refcount_block(BlockDriverState *bs, * they can describe them themselves. * * - We need to consider that at this point we are inside update_refcounts - * and doing the initial refcount increase. This means that some clusters - * have already been allocated by the caller, but their refcount isn't - * accurate yet. free_cluster_index tells us where this allocation ends - * as long as we don't overwrite it by freeing clusters. + * and potentially doing an initial refcount increase. This means that + * some clusters have already been allocated by the caller, but their + * refcount isn't accurate yet. If we allocate clusters for metadata, we + * need to return -EAGAIN to signal the caller that it needs to restart + * the search for free clusters. * * - alloc_clusters_noref and qcow2_free_clusters may load a different * refcount block into the cache @@ -277,7 +283,10 @@ static int alloc_refcount_block(BlockDriverState *bs, } s->refcount_table[refcount_table_index] = new_block; - return 0; + + /* The new refcount block may be where the caller intended to put its + * data, so let it restart the search. */ + return -EAGAIN; } ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block); @@ -300,8 +309,11 @@ static int alloc_refcount_block(BlockDriverState *bs, /* Calculate the number of refcount blocks needed so far */ uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT); - uint64_t blocks_used = (s->free_cluster_index + - refcount_block_clusters - 1) / refcount_block_clusters; + uint64_t blocks_used = DIV_ROUND_UP(cluster_index, refcount_block_clusters); + + if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) { + return -EFBIG; + } /* And now we need at least one block more for the new metadata */ uint64_t table_size = next_refcount_table_size(s, blocks_used + 1); @@ -334,8 +346,6 @@ static int alloc_refcount_block(BlockDriverState *bs, uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size); uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t)); - assert(meta_offset >= (s->free_cluster_index * s->cluster_size)); - /* Fill the new refcount table */ memcpy(new_table, s->refcount_table, s->refcount_table_size * sizeof(uint64_t)); @@ -398,18 +408,19 @@ static int alloc_refcount_block(BlockDriverState *bs, s->refcount_table_size = table_size; s->refcount_table_offset = table_offset; - /* Free old table. Remember, we must not change free_cluster_index */ - uint64_t old_free_cluster_index = s->free_cluster_index; + /* Free old table. */ qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t), QCOW2_DISCARD_OTHER); - s->free_cluster_index = old_free_cluster_index; ret = load_refcount_block(bs, new_block, (void**) refcount_block); if (ret < 0) { return ret; } - return 0; + /* If we were trying to do the initial refcount update for some cluster + * allocation, we might have used the same clusters to store newly + * allocated metadata. Make the caller search some new space. 
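+ *
+ * Callers therefore retry the whole allocation; qcow2_alloc_clusters()
+ * below does exactly this:
+ *
+ *     do {
+ *         offset = alloc_clusters_noref(bs, size);
+ *         if (offset < 0) {
+ *             return offset;
+ *         }
+ *         ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
+ *     } while (ret == -EAGAIN);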
*/ + return -EAGAIN; fail_table: g_free(new_table); @@ -513,8 +524,8 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, s->l2_table_cache); } - start = offset & ~(s->cluster_size - 1); - last = (offset + length - 1) & ~(s->cluster_size - 1); + start = start_of_cluster(s, offset); + last = start_of_cluster(s, offset + length - 1); for(cluster_offset = start; cluster_offset <= last; cluster_offset += s->cluster_size) { @@ -599,10 +610,10 @@ fail: * If the return value is non-negative, it is the new refcount of the cluster. * If it is negative, it is -errno and indicates an error. */ -static int update_cluster_refcount(BlockDriverState *bs, - int64_t cluster_index, - int addend, - enum qcow2_discard_type type) +int qcow2_update_cluster_refcount(BlockDriverState *bs, + int64_t cluster_index, + int addend, + enum qcow2_discard_type type) { BDRVQcowState *s = bs->opaque; int ret; @@ -624,15 +635,16 @@ static int update_cluster_refcount(BlockDriverState *bs, /* return < 0 if error */ -static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size) +static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size) { BDRVQcowState *s = bs->opaque; - int i, nb_clusters, refcount; + uint64_t i, nb_clusters; + int refcount; nb_clusters = size_to_clusters(s, size); retry: for(i = 0; i < nb_clusters; i++) { - int64_t next_cluster_index = s->free_cluster_index++; + uint64_t next_cluster_index = s->free_cluster_index++; refcount = get_refcount(bs, next_cluster_index); if (refcount < 0) { @@ -649,18 +661,21 @@ retry: return (s->free_cluster_index - nb_clusters) << s->cluster_bits; } -int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size) +int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size) { int64_t offset; int ret; BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC); - offset = alloc_clusters_noref(bs, size); - if (offset < 0) { - return offset; - } + do { + offset = alloc_clusters_noref(bs, size); + if (offset < 0) { + return offset; + } + + ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER); + } while (ret == -EAGAIN); - ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER); if (ret < 0) { return ret; } @@ -673,33 +688,36 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, { BDRVQcowState *s = bs->opaque; uint64_t cluster_index; - uint64_t old_free_cluster_index; - int i, refcount, ret; + uint64_t i; + int refcount, ret; - /* Check how many clusters there are free */ - cluster_index = offset >> s->cluster_bits; - for(i = 0; i < nb_clusters; i++) { - refcount = get_refcount(bs, cluster_index++); + assert(nb_clusters >= 0); + if (nb_clusters == 0) { + return 0; + } - if (refcount < 0) { - return refcount; - } else if (refcount != 0) { - break; + do { + /* Check how many clusters there are free */ + cluster_index = offset >> s->cluster_bits; + for(i = 0; i < nb_clusters; i++) { + refcount = get_refcount(bs, cluster_index++); + + if (refcount < 0) { + return refcount; + } else if (refcount != 0) { + break; + } } - } - /* And then allocate them */ - old_free_cluster_index = s->free_cluster_index; - s->free_cluster_index = cluster_index + i; + /* And then allocate them */ + ret = update_refcount(bs, offset, i << s->cluster_bits, 1, + QCOW2_DISCARD_NEVER); + } while (ret == -EAGAIN); - ret = update_refcount(bs, offset, i << s->cluster_bits, 1, - QCOW2_DISCARD_NEVER); if (ret < 0) { return ret; } - s->free_cluster_index = old_free_cluster_index; - return i; } @@ -722,7 +740,7 @@ int64_t 
qcow2_alloc_bytes(BlockDriverState *bs, int size) } redo: free_in_cluster = s->cluster_size - - (s->free_byte_offset & (s->cluster_size - 1)); + offset_into_cluster(s, s->free_byte_offset); if (size <= free_in_cluster) { /* enough space in current cluster */ offset = s->free_byte_offset; @@ -730,20 +748,20 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) free_in_cluster -= size; if (free_in_cluster == 0) s->free_byte_offset = 0; - if ((offset & (s->cluster_size - 1)) != 0) - update_cluster_refcount(bs, offset >> s->cluster_bits, 1, - QCOW2_DISCARD_NEVER); + if (offset_into_cluster(s, offset) != 0) + qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1, + QCOW2_DISCARD_NEVER); } else { offset = qcow2_alloc_clusters(bs, s->cluster_size); if (offset < 0) { return offset; } - cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1); + cluster_offset = start_of_cluster(s, s->free_byte_offset); if ((cluster_offset + s->cluster_size) == offset) { /* we are lucky: contiguous data */ offset = s->free_byte_offset; - update_cluster_refcount(bs, offset >> s->cluster_bits, 1, - QCOW2_DISCARD_NEVER); + qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1, + QCOW2_DISCARD_NEVER); s->free_byte_offset += size; } else { s->free_byte_offset = offset; @@ -752,8 +770,8 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) } /* The cluster refcount was incremented, either by qcow2_alloc_clusters() - * or explicitly by update_cluster_refcount(). Refcount blocks must be - * flushed before the caller's L2 table updates. + * or explicitly by qcow2_update_cluster_refcount(). Refcount blocks must + * be flushed before the caller's L2 table updates. */ qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache); return offset; @@ -794,11 +812,13 @@ void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, } break; case QCOW2_CLUSTER_NORMAL: - qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, - nb_clusters << s->cluster_bits, type); + case QCOW2_CLUSTER_ZERO: + if (l2_entry & L2E_OFFSET_MASK) { + qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, + nb_clusters << s->cluster_bits, type); + } break; case QCOW2_CLUSTER_UNALLOCATED: - case QCOW2_CLUSTER_ZERO: break; default: abort(); @@ -861,15 +881,17 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, } for(j = 0; j < s->l2_size; j++) { + uint64_t cluster_index; + offset = be64_to_cpu(l2_table[j]); - if (offset != 0) { - old_offset = offset; - offset &= ~QCOW_OFLAG_COPIED; - if (offset & QCOW_OFLAG_COMPRESSED) { + old_offset = offset; + offset &= ~QCOW_OFLAG_COPIED; + + switch (qcow2_get_cluster_type(offset)) { + case QCOW2_CLUSTER_COMPRESSED: nb_csectors = ((offset >> s->csize_shift) & s->csize_mask) + 1; if (addend != 0) { - int ret; ret = update_refcount(bs, (offset & s->cluster_offset_mask) & ~511, nb_csectors * 512, addend, @@ -880,11 +902,20 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, } /* compressed clusters are never modified */ refcount = 2; - } else { - uint64_t cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits; + break; + + case QCOW2_CLUSTER_NORMAL: + case QCOW2_CLUSTER_ZERO: + cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits; + if (!cluster_index) { + /* unallocated */ + refcount = 0; + break; + } if (addend != 0) { - refcount = update_cluster_refcount(bs, cluster_index, addend, - QCOW2_DISCARD_SNAPSHOT); + refcount = qcow2_update_cluster_refcount(bs, + cluster_index, addend, + QCOW2_DISCARD_SNAPSHOT); } else { refcount = 
get_refcount(bs, cluster_index); } @@ -893,19 +924,26 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, ret = refcount; goto fail; } - } + break; - if (refcount == 1) { - offset |= QCOW_OFLAG_COPIED; - } - if (offset != old_offset) { - if (addend > 0) { - qcow2_cache_set_dependency(bs, s->l2_table_cache, - s->refcount_block_cache); - } - l2_table[j] = cpu_to_be64(offset); - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + case QCOW2_CLUSTER_UNALLOCATED: + refcount = 0; + break; + + default: + abort(); + } + + if (refcount == 1) { + offset |= QCOW_OFLAG_COPIED; + } + if (offset != old_offset) { + if (addend > 0) { + qcow2_cache_set_dependency(bs, s->l2_table_cache, + s->refcount_block_cache); } + l2_table[j] = cpu_to_be64(offset); + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); } } @@ -916,8 +954,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, if (addend != 0) { - refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend, - QCOW2_DISCARD_SNAPSHOT); + refcount = qcow2_update_cluster_refcount(bs, l2_offset >> + s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT); } else { refcount = get_refcount(bs, l2_offset >> s->cluster_bits); } @@ -982,22 +1020,17 @@ static void inc_refcounts(BlockDriverState *bs, int64_t offset, int64_t size) { BDRVQcowState *s = bs->opaque; - int64_t start, last, cluster_offset; - int k; + uint64_t start, last, cluster_offset, k; if (size <= 0) return; - start = offset & ~(s->cluster_size - 1); - last = (offset + size - 1) & ~(s->cluster_size - 1); + start = start_of_cluster(s, offset); + last = start_of_cluster(s, offset + size - 1); for(cluster_offset = start; cluster_offset <= last; cluster_offset += s->cluster_size) { k = cluster_offset >> s->cluster_bits; - if (k < 0) { - fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n", - cluster_offset); - res->corruptions++; - } else if (k >= refcount_table_size) { + if (k >= refcount_table_size) { fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after " "the end of the image file, can't properly check refcounts.\n", cluster_offset); @@ -1014,7 +1047,6 @@ static void inc_refcounts(BlockDriverState *bs, /* Flags for check_refcounts_l1() and check_refcounts_l2() */ enum { - CHECK_OFLAG_COPIED = 0x1, /* check QCOW_OFLAG_COPIED matches refcount */ CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */ }; @@ -1033,7 +1065,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, BDRVQcowState *s = bs->opaque; uint64_t *l2_table, l2_entry; uint64_t next_contiguous_offset = 0; - int i, l2_size, nb_csectors, refcount; + int i, l2_size, nb_csectors; /* Read L2 table from disk */ l2_size = s->l2_size * sizeof(uint64_t); @@ -1085,23 +1117,8 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, case QCOW2_CLUSTER_NORMAL: { - /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ uint64_t offset = l2_entry & L2E_OFFSET_MASK; - if (flags & CHECK_OFLAG_COPIED) { - refcount = get_refcount(bs, offset >> s->cluster_bits); - if (refcount < 0) { - fprintf(stderr, "Can't get refcount for offset %" - PRIx64 ": %s\n", l2_entry, strerror(-refcount)); - goto fail; - } - if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) { - fprintf(stderr, "ERROR OFLAG_COPIED: offset=%" - PRIx64 " refcount=%d\n", l2_entry, refcount); - res->corruptions++; - } - } - if (flags & CHECK_FRAG_INFO) { res->bfi.allocated_clusters++; if (next_contiguous_offset && @@ -1116,7 +1133,7 @@ static int check_refcounts_l2(BlockDriverState 
*bs, BdrvCheckResult *res, offset, s->cluster_size); /* Correct offsets are cluster aligned */ - if (offset & (s->cluster_size - 1)) { + if (offset_into_cluster(s, offset)) { fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not " "properly aligned; L2 entry corrupted.\n", offset); res->corruptions++; @@ -1158,7 +1175,7 @@ static int check_refcounts_l1(BlockDriverState *bs, { BDRVQcowState *s = bs->opaque; uint64_t *l1_table, l2_offset, l1_size2; - int i, refcount, ret; + int i, ret; l1_size2 = l1_size * sizeof(uint64_t); @@ -1182,29 +1199,13 @@ static int check_refcounts_l1(BlockDriverState *bs, for(i = 0; i < l1_size; i++) { l2_offset = l1_table[i]; if (l2_offset) { - /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ - if (flags & CHECK_OFLAG_COPIED) { - refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) - >> s->cluster_bits); - if (refcount < 0) { - fprintf(stderr, "Can't get refcount for l2_offset %" - PRIx64 ": %s\n", l2_offset, strerror(-refcount)); - goto fail; - } - if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) { - fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64 - " refcount=%d\n", l2_offset, refcount); - res->corruptions++; - } - } - /* Mark L2 table as used */ l2_offset &= L1E_OFFSET_MASK; inc_refcounts(bs, res, refcount_table, refcount_table_size, l2_offset, s->cluster_size); /* L2 tables are cluster aligned */ - if (l2_offset & (s->cluster_size - 1)) { + if (offset_into_cluster(s, l2_offset)) { fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " "cluster aligned; L1 entry corrupted\n", l2_offset); res->corruptions++; @@ -1229,6 +1230,240 @@ fail: } /* + * Checks the OFLAG_COPIED flag for all L1 and L2 entries. + * + * This function does not print an error message nor does it increment + * check_errors if get_refcount fails (this is because such an error will have + * been already detected and sufficiently signaled by the calling function + * (qcow2_check_refcounts) by the time this function is called). + */ +static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size); + int ret; + int refcount; + int i, j; + + for (i = 0; i < s->l1_size; i++) { + uint64_t l1_entry = s->l1_table[i]; + uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK; + bool l2_dirty = false; + + if (!l2_offset) { + continue; + } + + refcount = get_refcount(bs, l2_offset >> s->cluster_bits); + if (refcount < 0) { + /* don't print message nor increment check_errors */ + continue; + } + if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) { + fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d " + "l1_entry=%" PRIx64 " refcount=%d\n", + fix & BDRV_FIX_ERRORS ? "Repairing" : + "ERROR", + i, l1_entry, refcount); + if (fix & BDRV_FIX_ERRORS) { + s->l1_table[i] = refcount == 1 + ? 
l1_entry | QCOW_OFLAG_COPIED + : l1_entry & ~QCOW_OFLAG_COPIED; + ret = qcow2_write_l1_entry(bs, i); + if (ret < 0) { + res->check_errors++; + goto fail; + } + res->corruptions_fixed++; + } else { + res->corruptions++; + } + } + + ret = bdrv_pread(bs->file, l2_offset, l2_table, + s->l2_size * sizeof(uint64_t)); + if (ret < 0) { + fprintf(stderr, "ERROR: Could not read L2 table: %s\n", + strerror(-ret)); + res->check_errors++; + goto fail; + } + + for (j = 0; j < s->l2_size; j++) { + uint64_t l2_entry = be64_to_cpu(l2_table[j]); + uint64_t data_offset = l2_entry & L2E_OFFSET_MASK; + int cluster_type = qcow2_get_cluster_type(l2_entry); + + if ((cluster_type == QCOW2_CLUSTER_NORMAL) || + ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) { + refcount = get_refcount(bs, data_offset >> s->cluster_bits); + if (refcount < 0) { + /* don't print message nor increment check_errors */ + continue; + } + if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) { + fprintf(stderr, "%s OFLAG_COPIED data cluster: " + "l2_entry=%" PRIx64 " refcount=%d\n", + fix & BDRV_FIX_ERRORS ? "Repairing" : + "ERROR", + l2_entry, refcount); + if (fix & BDRV_FIX_ERRORS) { + l2_table[j] = cpu_to_be64(refcount == 1 + ? l2_entry | QCOW_OFLAG_COPIED + : l2_entry & ~QCOW_OFLAG_COPIED); + l2_dirty = true; + res->corruptions_fixed++; + } else { + res->corruptions++; + } + } + } + } + + if (l2_dirty) { + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2, + l2_offset, s->cluster_size); + if (ret < 0) { + fprintf(stderr, "ERROR: Could not write L2 table; metadata " + "overlap check failed: %s\n", strerror(-ret)); + res->check_errors++; + goto fail; + } + + ret = bdrv_pwrite(bs->file, l2_offset, l2_table, s->cluster_size); + if (ret < 0) { + fprintf(stderr, "ERROR: Could not write L2 table: %s\n", + strerror(-ret)); + res->check_errors++; + goto fail; + } + } + } + + ret = 0; + +fail: + qemu_vfree(l2_table); + return ret; +} + +/* + * Writes one sector of the refcount table to the disk + */ +#define RT_ENTRIES_PER_SECTOR (512 / sizeof(uint64_t)) +static int write_reftable_entry(BlockDriverState *bs, int rt_index) +{ + BDRVQcowState *s = bs->opaque; + uint64_t buf[RT_ENTRIES_PER_SECTOR]; + int rt_start_index; + int i, ret; + + rt_start_index = rt_index & ~(RT_ENTRIES_PER_SECTOR - 1); + for (i = 0; i < RT_ENTRIES_PER_SECTOR; i++) { + buf[i] = cpu_to_be64(s->refcount_table[rt_start_index + i]); + } + + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_TABLE, + s->refcount_table_offset + rt_start_index * sizeof(uint64_t), + sizeof(buf)); + if (ret < 0) { + return ret; + } + + BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE); + ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset + + rt_start_index * sizeof(uint64_t), buf, sizeof(buf)); + if (ret < 0) { + return ret; + } + + return 0; +} + +/* + * Allocates a new cluster for the given refcount block (represented by its + * offset in the image file) and copies the current content there. This function + * does _not_ decrement the reference count for the currently occupied cluster. + * + * This function prints an informative message to stderr on error (and returns + * -errno); on success, the offset of the newly allocated cluster is returned. 
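+ *
+ * This is only called from the repair path of qcow2_check_refcounts()
+ * (e.g. "qemu-img check -r all"): when a refcount block turns out to be
+ * referenced more than once, it is moved to a freshly allocated cluster,
+ * leaving the old cluster to its remaining users.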
+ */ +static int64_t realloc_refcount_block(BlockDriverState *bs, int reftable_index, + uint64_t offset) +{ + BDRVQcowState *s = bs->opaque; + int64_t new_offset = 0; + void *refcount_block = NULL; + int ret; + + /* allocate new refcount block */ + new_offset = qcow2_alloc_clusters(bs, s->cluster_size); + if (new_offset < 0) { + fprintf(stderr, "Could not allocate new cluster: %s\n", + strerror(-new_offset)); + ret = new_offset; + goto done; + } + + /* fetch current refcount block content */ + ret = qcow2_cache_get(bs, s->refcount_block_cache, offset, &refcount_block); + if (ret < 0) { + fprintf(stderr, "Could not fetch refcount block: %s\n", strerror(-ret)); + goto fail_free_cluster; + } + + /* new block has not yet been entered into refcount table, therefore it is + * no refcount block yet (regarding this check) */ + ret = qcow2_pre_write_overlap_check(bs, 0, new_offset, s->cluster_size); + if (ret < 0) { + fprintf(stderr, "Could not write refcount block; metadata overlap " + "check failed: %s\n", strerror(-ret)); + /* the image will be marked corrupt, so don't even attempt on freeing + * the cluster */ + goto done; + } + + /* write to new block */ + ret = bdrv_write(bs->file, new_offset / BDRV_SECTOR_SIZE, refcount_block, + s->cluster_sectors); + if (ret < 0) { + fprintf(stderr, "Could not write refcount block: %s\n", strerror(-ret)); + goto fail_free_cluster; + } + + /* update refcount table */ + assert(!offset_into_cluster(s, new_offset)); + s->refcount_table[reftable_index] = new_offset; + ret = write_reftable_entry(bs, reftable_index); + if (ret < 0) { + fprintf(stderr, "Could not update refcount table: %s\n", + strerror(-ret)); + goto fail_free_cluster; + } + + goto done; + +fail_free_cluster: + qcow2_free_clusters(bs, new_offset, s->cluster_size, QCOW2_DISCARD_OTHER); + +done: + if (refcount_block) { + /* This should never fail, as it would only do so if the given refcount + * block cannot be found in the cache. As this is impossible as long as + * there are no bugs, assert the success. */ + int tmp = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); + assert(tmp == 0); + } + + if (ret < 0) { + return ret; + } + + return new_offset; +} + +/* * Checks an image for refcount consistency. 
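+ * The check recomputes all refcounts from scratch: a uint16_t shadow
+ * table with one entry per host cluster is filled via inc_refcounts()
+ * for every reachable L1/L2 table, data cluster, snapshot table and
+ * refcount structure, then compared against the refcounts actually
+ * stored in the image; check_oflag_copied() additionally verifies that
+ * each QCOW_OFLAG_COPIED flag matches a refcount of exactly 1.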
* * Returns 0 if no errors are found, the number of errors in case the image is @@ -1238,14 +1473,19 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) { BDRVQcowState *s = bs->opaque; - int64_t size, i, highest_cluster; - int nb_clusters, refcount1, refcount2; + int64_t size, i, highest_cluster, nb_clusters; + int refcount1, refcount2; QCowSnapshot *sn; uint16_t *refcount_table; int ret; size = bdrv_getlength(bs->file); nb_clusters = size_to_clusters(s, size); + if (nb_clusters > INT_MAX) { + res->check_errors++; + return -EFBIG; + } + refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t)); res->bfi.total_clusters = @@ -1257,8 +1497,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, /* current L1 table */ ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, - s->l1_table_offset, s->l1_size, - CHECK_OFLAG_COPIED | CHECK_FRAG_INFO); + s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO); if (ret < 0) { goto fail; } @@ -1286,7 +1525,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, cluster = offset >> s->cluster_bits; /* Refcount blocks are cluster aligned */ - if (offset & (s->cluster_size - 1)) { + if (offset_into_cluster(s, offset)) { fprintf(stderr, "ERROR refcount block %" PRId64 " is not " "cluster aligned; refcount table entry corrupted\n", i); res->corruptions++; @@ -1304,10 +1543,39 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, inc_refcounts(bs, res, refcount_table, nb_clusters, offset, s->cluster_size); if (refcount_table[cluster] != 1) { - fprintf(stderr, "ERROR refcount block %" PRId64 + fprintf(stderr, "%s refcount block %" PRId64 " refcount=%d\n", + fix & BDRV_FIX_ERRORS ? "Repairing" : + "ERROR", i, refcount_table[cluster]); - res->corruptions++; + + if (fix & BDRV_FIX_ERRORS) { + int64_t new_offset; + + new_offset = realloc_refcount_block(bs, i, offset); + if (new_offset < 0) { + res->corruptions++; + continue; + } + + /* update refcounts */ + if ((new_offset >> s->cluster_bits) >= nb_clusters) { + /* increase refcount_table size if necessary */ + int old_nb_clusters = nb_clusters; + nb_clusters = (new_offset >> s->cluster_bits) + 1; + refcount_table = g_realloc(refcount_table, + nb_clusters * sizeof(uint16_t)); + memset(&refcount_table[old_nb_clusters], 0, (nb_clusters + - old_nb_clusters) * sizeof(uint16_t)); + } + refcount_table[cluster]--; + inc_refcounts(bs, res, refcount_table, nb_clusters, + new_offset, s->cluster_size); + + res->corruptions_fixed++; + } else { + res->corruptions++; + } } } } @@ -1363,6 +1631,12 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, } } + /* check OFLAG_COPIED */ + ret = check_oflag_copied(bs, res, fix); + if (ret < 0) { + goto fail; + } + res->image_end_offset = (highest_cluster + 1) * s->cluster_size; ret = 0; @@ -1372,3 +1646,173 @@ fail: return ret; } +#define overlaps_with(ofs, sz) \ + ranges_overlap(offset, size, ofs, sz) + +/* + * Checks if the given offset into the image file is actually free to use by + * looking for overlaps with important metadata sections (L1/L2 tables etc.), + * i.e. a sanity check without relying on the refcount tables. + * + * The ign parameter specifies what checks not to perform (being a bitmask of + * QCow2MetadataOverlap values), i.e., what sections to ignore. 
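+ *
+ * For example, rewriting an inactive L2 table in place must ignore
+ * exactly the sections such a write is allowed to touch, which is what
+ * expand_zero_clusters_in_l1() does through the pre-write wrapper below:
+ *
+ *     ret = qcow2_pre_write_overlap_check(bs,
+ *         QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset,
+ *         s->cluster_size);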
+ * + * Returns: + * - 0 if writing to this offset will not affect the mentioned metadata + * - a positive QCow2MetadataOverlap value indicating one overlapping section + * - a negative value (-errno) indicating an error while performing a check, + * e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2 + */ +int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, + int64_t size) +{ + BDRVQcowState *s = bs->opaque; + int chk = s->overlap_check & ~ign; + int i, j; + + if (!size) { + return 0; + } + + if (chk & QCOW2_OL_MAIN_HEADER) { + if (offset < s->cluster_size) { + return QCOW2_OL_MAIN_HEADER; + } + } + + /* align range to test to cluster boundaries */ + size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size); + offset = start_of_cluster(s, offset); + + if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) { + if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) { + return QCOW2_OL_ACTIVE_L1; + } + } + + if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) { + if (overlaps_with(s->refcount_table_offset, + s->refcount_table_size * sizeof(uint64_t))) { + return QCOW2_OL_REFCOUNT_TABLE; + } + } + + if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) { + if (overlaps_with(s->snapshots_offset, s->snapshots_size)) { + return QCOW2_OL_SNAPSHOT_TABLE; + } + } + + if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) { + for (i = 0; i < s->nb_snapshots; i++) { + if (s->snapshots[i].l1_size && + overlaps_with(s->snapshots[i].l1_table_offset, + s->snapshots[i].l1_size * sizeof(uint64_t))) { + return QCOW2_OL_INACTIVE_L1; + } + } + } + + if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) { + for (i = 0; i < s->l1_size; i++) { + if ((s->l1_table[i] & L1E_OFFSET_MASK) && + overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK, + s->cluster_size)) { + return QCOW2_OL_ACTIVE_L2; + } + } + } + + if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) { + for (i = 0; i < s->refcount_table_size; i++) { + if ((s->refcount_table[i] & REFT_OFFSET_MASK) && + overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK, + s->cluster_size)) { + return QCOW2_OL_REFCOUNT_BLOCK; + } + } + } + + if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) { + for (i = 0; i < s->nb_snapshots; i++) { + uint64_t l1_ofs = s->snapshots[i].l1_table_offset; + uint32_t l1_sz = s->snapshots[i].l1_size; + uint64_t l1_sz2 = l1_sz * sizeof(uint64_t); + uint64_t *l1 = g_malloc(l1_sz2); + int ret; + + ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2); + if (ret < 0) { + g_free(l1); + return ret; + } + + for (j = 0; j < l1_sz; j++) { + uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK; + if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) { + g_free(l1); + return QCOW2_OL_INACTIVE_L2; + } + } + + g_free(l1); + } + } + + return 0; +} + +static const char *metadata_ol_names[] = { + [QCOW2_OL_MAIN_HEADER_BITNR] = "qcow2_header", + [QCOW2_OL_ACTIVE_L1_BITNR] = "active L1 table", + [QCOW2_OL_ACTIVE_L2_BITNR] = "active L2 table", + [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table", + [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block", + [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table", + [QCOW2_OL_INACTIVE_L1_BITNR] = "inactive L1 table", + [QCOW2_OL_INACTIVE_L2_BITNR] = "inactive L2 table", +}; + +/* + * First performs a check for metadata overlaps (through + * qcow2_check_metadata_overlap); if that fails with a negative value (error + * while performing a check), that value is returned. 
If an impending overlap + * is detected, the BDS will be made unusable, the qcow2 file marked corrupt + * and -EIO returned. + * + * Returns 0 if there were neither overlaps nor errors while checking for + * overlaps; or a negative value (-errno) on error. + */ +int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, + int64_t size) +{ + int ret = qcow2_check_metadata_overlap(bs, ign, offset, size); + + if (ret < 0) { + return ret; + } else if (ret > 0) { + int metadata_ol_bitnr = ffs(ret) - 1; + char *message; + QObject *data; + + assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR); + + fprintf(stderr, "qcow2: Preventing invalid write on metadata (overlaps " + "with %s); image marked as corrupt.\n", + metadata_ol_names[metadata_ol_bitnr]); + message = g_strdup_printf("Prevented %s overwrite", + metadata_ol_names[metadata_ol_bitnr]); + data = qobject_from_jsonf("{ 'device': %s, 'msg': %s, 'offset': %" + PRId64 ", 'size': %" PRId64 " }", bs->device_name, message, + offset, size); + monitor_protocol_event(QEVENT_BLOCK_IMAGE_CORRUPTED, data); + g_free(message); + qobject_decref(data); + + qcow2_mark_corrupt(bs); + bs->drv = NULL; /* make BDS unusable */ + return -EIO; + } + + return 0; +} diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 0caac9055..0aa9defbe 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -26,31 +26,6 @@ #include "block/block_int.h" #include "block/qcow2.h" -typedef struct QEMU_PACKED QCowSnapshotHeader { - /* header is 8 byte aligned */ - uint64_t l1_table_offset; - - uint32_t l1_size; - uint16_t id_str_size; - uint16_t name_size; - - uint32_t date_sec; - uint32_t date_nsec; - - uint64_t vm_clock_nsec; - - uint32_t vm_state_size; - uint32_t extra_data_size; /* for extension */ - /* extra data follows */ - /* id_str follows */ - /* name follows */ -} QCowSnapshotHeader; - -typedef struct QEMU_PACKED QCowSnapshotExtraData { - uint64_t vm_state_size_large; - uint64_t disk_size; -} QCowSnapshotExtraData; - void qcow2_free_snapshots(BlockDriverState *bs) { BDRVQcowState *s = bs->opaque; @@ -141,8 +116,14 @@ int qcow2_read_snapshots(BlockDriverState *bs) } offset += name_size; sn->name[name_size] = '\0'; + + if (offset - s->snapshots_offset > QCOW_MAX_SNAPSHOTS_SIZE) { + ret = -EFBIG; + goto fail; + } } + assert(offset - s->snapshots_offset <= INT_MAX); s->snapshots_size = offset - s->snapshots_offset; return 0; @@ -163,7 +144,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs) uint32_t nb_snapshots; uint64_t snapshots_offset; } QEMU_PACKED header_data; - int64_t offset, snapshots_offset; + int64_t offset, snapshots_offset = 0; int ret; /* compute the size of the snapshots */ @@ -175,20 +156,36 @@ static int qcow2_write_snapshots(BlockDriverState *bs) offset += sizeof(extra); offset += strlen(sn->id_str); offset += strlen(sn->name); + + if (offset > QCOW_MAX_SNAPSHOTS_SIZE) { + ret = -EFBIG; + goto fail; + } } + + assert(offset <= INT_MAX); snapshots_size = offset; /* Allocate space for the new snapshot list */ snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size); offset = snapshots_offset; if (offset < 0) { - return offset; + ret = offset; + goto fail; } ret = bdrv_flush(bs); if (ret < 0) { - return ret; + goto fail; } + /* The snapshot list position has not yet been updated, so these clusters + * must indeed be completely free */ + ret = qcow2_pre_write_overlap_check(bs, 0, offset, snapshots_size); + if (ret < 0) { + goto fail; + } + + /* Write all snapshots to the new list */ for(i = 0; i < s->nb_snapshots; 
i++) { sn = s->snapshots + i; @@ -211,6 +208,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs) id_str_size = strlen(sn->id_str); name_size = strlen(sn->name); + assert(id_str_size <= UINT16_MAX && name_size <= UINT16_MAX); h.id_str_size = cpu_to_be16(id_str_size); h.name_size = cpu_to_be16(name_size); offset = align_offset(offset, 8); @@ -269,6 +267,10 @@ static int qcow2_write_snapshots(BlockDriverState *bs) return 0; fail: + if (snapshots_offset > 0) { + qcow2_free_clusters(bs, snapshots_offset, snapshots_size, + QCOW2_DISCARD_ALWAYS); + } return ret; } @@ -277,7 +279,8 @@ static void find_new_snapshot_id(BlockDriverState *bs, { BDRVQcowState *s = bs->opaque; QCowSnapshot *sn; - int i, id, id_max = 0; + int i; + unsigned long id, id_max = 0; for(i = 0; i < s->nb_snapshots; i++) { sn = s->snapshots + i; @@ -285,34 +288,50 @@ static void find_new_snapshot_id(BlockDriverState *bs, if (id > id_max) id_max = id; } - snprintf(id_str, id_str_size, "%d", id_max + 1); + snprintf(id_str, id_str_size, "%lu", id_max + 1); } -static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str) +static int find_snapshot_by_id_and_name(BlockDriverState *bs, + const char *id, + const char *name) { BDRVQcowState *s = bs->opaque; int i; - for(i = 0; i < s->nb_snapshots; i++) { - if (!strcmp(s->snapshots[i].id_str, id_str)) - return i; + if (id && name) { + for (i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].id_str, id) && + !strcmp(s->snapshots[i].name, name)) { + return i; + } + } + } else if (id) { + for (i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].id_str, id)) { + return i; + } + } + } else if (name) { + for (i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].name, name)) { + return i; + } + } } + return -1; } -static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name) +static int find_snapshot_by_id_or_name(BlockDriverState *bs, + const char *id_or_name) { - BDRVQcowState *s = bs->opaque; - int i, ret; + int ret; - ret = find_snapshot_by_id(bs, name); - if (ret >= 0) + ret = find_snapshot_by_id_and_name(bs, id_or_name, NULL); + if (ret >= 0) { return ret; - for(i = 0; i < s->nb_snapshots; i++) { - if (!strcmp(s->snapshots[i].name, name)) - return i; } - return -1; + return find_snapshot_by_id_and_name(bs, NULL, id_or_name); } /* if no id is provided, a new one is constructed */ @@ -326,6 +345,10 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) uint64_t *l1_table = NULL; int64_t l1_table_offset; + if (s->nb_snapshots >= QCOW_MAX_SNAPSHOTS) { + return -EFBIG; + } + memset(sn, 0, sizeof(*sn)); /* Generate an ID if it wasn't passed */ @@ -334,7 +357,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) } /* Check that the ID is unique */ - if (find_snapshot_by_id(bs, sn_info->id_str) >= 0) { + if (find_snapshot_by_id_and_name(bs, sn_info->id_str, NULL) >= 0) { return -EEXIST; } @@ -363,6 +386,12 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) l1_table[i] = cpu_to_be64(s->l1_table[i]); } + ret = qcow2_pre_write_overlap_check(bs, 0, sn->l1_table_offset, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table, s->l1_size * sizeof(uint64_t)); if (ret < 0) { @@ -396,11 +425,19 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) if (ret < 0) { g_free(s->snapshots); s->snapshots = old_snapshot_list; + s->nb_snapshots--; goto fail; } 
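Editor's note: a standalone sketch of the consolidated lookup introduced above, collapsing its three loops into one predicate. The Snap type and the sample table are invented for the example.

    #include <stdio.h>
    #include <string.h>

    typedef struct { const char *id_str; const char *name; } Snap;

    /* Match by id and name, id only, or name only -- the same three cases
     * as find_snapshot_by_id_and_name(); returns the index or -1. */
    static int find_snapshot(const Snap *tab, int n, const char *id,
                             const char *name)
    {
        for (int i = 0; i < n; i++) {
            if (id && strcmp(tab[i].id_str, id) != 0) continue;
            if (name && strcmp(tab[i].name, name) != 0) continue;
            if (id || name) return i;   /* both NULL: never matches */
        }
        return -1;
    }

    int main(void)
    {
        Snap tab[] = { { "1", "before-upgrade" }, { "2", "nightly" } };
        printf("%d\n", find_snapshot(tab, 2, "2", NULL));       /* 1 */
        printf("%d\n", find_snapshot(tab, 2, NULL, "nightly")); /* 1 */
        printf("%d\n", find_snapshot(tab, 2, "1", "nightly"));  /* -1 */
        return 0;
    }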
g_free(old_snapshot_list); + /* The VM state isn't needed any more in the active L1 table; in fact, it + * hurts by causing expensive COW for the next snapshot. */ + qcow2_discard_clusters(bs, qcow2_vm_state_offset(s), + align_offset(sn->vm_state_size, s->cluster_size) + >> BDRV_SECTOR_BITS, + QCOW2_DISCARD_NEVER); + #ifdef DEBUG_ALLOC { BdrvCheckResult result = {0}; @@ -475,6 +512,12 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) goto fail; } + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1, + s->l1_table_offset, cur_l1_bytes); + if (ret < 0) { + goto fail; + } + ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table, cur_l1_bytes); if (ret < 0) { @@ -531,15 +574,19 @@ fail: return ret; } -int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +int qcow2_snapshot_delete(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp) { BDRVQcowState *s = bs->opaque; QCowSnapshot sn; int snapshot_index, ret; /* Search the snapshot */ - snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); + snapshot_index = find_snapshot_by_id_and_name(bs, snapshot_id, name); if (snapshot_index < 0) { + error_setg(errp, "Can't find the snapshot"); return -ENOENT; } sn = s->snapshots[snapshot_index]; @@ -551,6 +598,8 @@ int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) s->nb_snapshots--; ret = qcow2_write_snapshots(bs); if (ret < 0) { + error_setg_errno(errp, -ret, + "Failed to remove snapshot from snapshot list"); return ret; } @@ -568,6 +617,7 @@ int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) ret = qcow2_update_snapshot_refcount(bs, sn.l1_table_offset, sn.l1_size, -1); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to free the cluster and L1 table"); return ret; } qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t), @@ -576,6 +626,8 @@ int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) /* must update the copied flag on the current cluster offsets */ ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); if (ret < 0) { + error_setg_errno(errp, -ret, + "Failed to update snapshot status in disk"); return ret; } @@ -617,7 +669,10 @@ int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) return s->nb_snapshots; } -int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name) +int qcow2_snapshot_load_tmp(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp) { int i, snapshot_index; BDRVQcowState *s = bs->opaque; @@ -629,18 +684,25 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name) assert(bs->read_only); /* Search the snapshot */ - snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_name); + snapshot_index = find_snapshot_by_id_and_name(bs, snapshot_id, name); if (snapshot_index < 0) { + error_setg(errp, + "Can't find snapshot"); return -ENOENT; } sn = &s->snapshots[snapshot_index]; /* Allocate and read in the snapshot's L1 table */ - new_l1_bytes = s->l1_size * sizeof(uint64_t); + if (sn->l1_size > QCOW_MAX_L1_SIZE) { + error_setg(errp, "Snapshot L1 table too large"); + return -EFBIG; + } + new_l1_bytes = sn->l1_size * sizeof(uint64_t); new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512)); ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes); if (ret < 0) { + error_setg(errp, "Failed to read l1 table for snapshot"); g_free(new_l1_table); return ret; } diff --git 
a/block/qcow2.c b/block/qcow2.c index 3376901bd..e903d971c 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -52,7 +52,7 @@ typedef struct { uint32_t magic; uint32_t len; -} QCowExtension; +} QEMU_PACKED QCowExtension; #define QCOW2_EXT_MAGIC_END 0 #define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA @@ -79,7 +79,8 @@ static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) * return 0 upon success, non-0 otherwise */ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, - uint64_t end_offset, void **p_feature_table) + uint64_t end_offset, void **p_feature_table, + Error **errp) { BDRVQcowState *s = bs->opaque; QCowExtension ext; @@ -100,10 +101,10 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, printf("attempting to read extended header in offset %lu\n", offset); #endif - if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) { - fprintf(stderr, "qcow2_read_extension: ERROR: " - "pread fail from offset %" PRIu64 "\n", - offset); + ret = bdrv_pread(bs->file, offset, &ext, sizeof(ext)); + if (ret < 0) { + error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: " + "pread fail from offset %" PRIu64, offset); return 1; } be32_to_cpus(&ext.magic); @@ -113,7 +114,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, printf("ext.magic = 0x%x\n", ext.magic); #endif if (ext.len > end_offset - offset) { - error_report("Header extension too large"); + error_setg(errp, "Header extension too large"); return -EINVAL; } @@ -123,14 +124,16 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, case QCOW2_EXT_MAGIC_BACKING_FORMAT: if (ext.len >= sizeof(bs->backing_format)) { - fprintf(stderr, "ERROR: ext_backing_format: len=%u too large" - " (>=%zu)\n", - ext.len, sizeof(bs->backing_format)); + error_setg(errp, "ERROR: ext_backing_format: len=%u too large" + " (>=%zu)", ext.len, sizeof(bs->backing_format)); return 2; } - if (bdrv_pread(bs->file, offset , bs->backing_format, - ext.len) != ext.len) + ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len); + if (ret < 0) { + error_setg_errno(errp, -ret, "ERROR: ext_backing_format: " + "Could not read format name"); return 3; + } bs->backing_format[ext.len] = '\0'; #ifdef DEBUG_EXT printf("Qcow2: Got format extension %s\n", bs->backing_format); @@ -142,6 +145,8 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature)); ret = bdrv_pread(bs->file, offset , feature_table, ext.len); if (ret < 0) { + error_setg_errno(errp, -ret, "ERROR: ext_feature_table: " + "Could not read table"); return ret; } @@ -161,6 +166,8 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, ret = bdrv_pread(bs->file, offset , uext->data, uext->len); if (ret < 0) { + error_setg_errno(errp, -ret, "ERROR: unknown extension: " + "Could not read data"); return ret; } } @@ -184,8 +191,8 @@ static void cleanup_unknown_header_ext(BlockDriverState *bs) } } -static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs, - const char *fmt, ...) +static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs, + Error **errp, const char *fmt, ...) 
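Editor's note on the extension loop reworked in the hunk above: qcow2 header extensions are a sequence of big-endian (magic, length) pairs with data padded so each entry starts on an 8-byte boundary, terminated by magic 0. A self-contained parser over an in-memory buffer, as a sketch — the sample buffer is made up; the backing-format magic 0xE2792ACA is the one defined above.

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t be32(const uint8_t *p)
    {
        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] << 8) | p[3];
    }

    /* Walk header extensions: (be32 magic, be32 length), `length` data
     * bytes, padding to the next multiple of 8; magic 0 ends the list. */
    static void walk_extensions(const uint8_t *buf, size_t len)
    {
        size_t off = 0;
        while (off + 8 <= len) {
            uint32_t magic = be32(buf + off);
            uint32_t elen = be32(buf + off + 4);
            if (magic == 0) break;            /* QCOW2_EXT_MAGIC_END */
            if (elen > len - off - 8) break;  /* truncated entry */
            printf("extension 0x%08x, %u bytes\n", magic, elen);
            off += 8 + ((elen + 7) & ~(size_t)7);
        }
    }

    int main(void)
    {
        /* Backing-format extension carrying "raw", padded to 8 bytes,
         * followed by the end marker. */
        static const uint8_t buf[] = {
            0xE2, 0x79, 0x2A, 0xCA,  0, 0, 0, 3,
            'r', 'a', 'w', 0, 0, 0, 0, 0,
            0, 0, 0, 0,  0, 0, 0, 0,
        };
        walk_extensions(buf, sizeof(buf));
        return 0;
    }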
{ char msg[64]; va_list ap; @@ -194,17 +201,17 @@ static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs, vsnprintf(msg, sizeof(msg), fmt, ap); va_end(ap); - qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bs->device_name, "qcow2", msg); + error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, bs->device_name, "qcow2", + msg); } static void report_unsupported_feature(BlockDriverState *bs, - Qcow2Feature *table, uint64_t mask) + Error **errp, Qcow2Feature *table, uint64_t mask) { while (table && table->name[0] != '\0') { if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) { if (mask & (1 << table->bit)) { - report_unsupported(bs, "%.46s",table->name); + report_unsupported(bs, errp, "%.46s", table->name); mask &= ~(1 << table->bit); } } @@ -212,7 +219,8 @@ static void report_unsupported_feature(BlockDriverState *bs, } if (mask) { - report_unsupported(bs, "Unknown incompatible feature: %" PRIx64, mask); + report_unsupported(bs, errp, "Unknown incompatible feature: %" PRIx64, + mask); } } @@ -261,12 +269,46 @@ static int qcow2_mark_clean(BlockDriverState *bs) BDRVQcowState *s = bs->opaque; if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { + int ret; + + s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY; + + ret = bdrv_flush(bs); + if (ret < 0) { + return ret; + } + + return qcow2_update_header(bs); + } + return 0; +} + +/* + * Marks the image as corrupt. + */ +int qcow2_mark_corrupt(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + + s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT; + return qcow2_update_header(bs); +} + +/* + * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes + * before if necessary. + */ +int qcow2_mark_consistent(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + + if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) { int ret = bdrv_flush(bs); if (ret < 0) { return ret; } - s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY; + s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT; return qcow2_update_header(bs); } return 0; @@ -281,11 +323,41 @@ static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, } if (fix && result->check_errors == 0 && result->corruptions == 0) { - return qcow2_mark_clean(bs); + ret = qcow2_mark_clean(bs); + if (ret < 0) { + return ret; + } + return qcow2_mark_consistent(bs); } return ret; } +static int validate_table_offset(BlockDriverState *bs, uint64_t offset, + uint64_t entries, size_t entry_len) +{ + BDRVQcowState *s = bs->opaque; + uint64_t size; + + /* Use signed INT64_MAX as the maximum even for uint64_t header fields, + * because values will be passed to qemu functions taking int64_t. 
*/ + if (entries > INT64_MAX / entry_len) { + return -EINVAL; + } + + size = entries * entry_len; + + if (INT64_MAX - size < offset) { + return -EINVAL; + } + + /* Tables must be cluster aligned */ + if (offset & (s->cluster_size - 1)) { + return -EINVAL; + } + + return 0; +} + static QemuOptsList qcow2_runtime_opts = { .name = "qcow2", .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head), @@ -311,22 +383,84 @@ static QemuOptsList qcow2_runtime_opts = { .type = QEMU_OPT_BOOL, .help = "Generate discard requests when other clusters are freed", }, + { + .name = QCOW2_OPT_OVERLAP, + .type = QEMU_OPT_STRING, + .help = "Selects which overlap checks to perform from a range of " + "templates (none, constant, cached, all)", + }, + { + .name = QCOW2_OPT_OVERLAP_MAIN_HEADER, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into the main qcow2 header", + }, + { + .name = QCOW2_OPT_OVERLAP_ACTIVE_L1, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into the active L1 table", + }, + { + .name = QCOW2_OPT_OVERLAP_ACTIVE_L2, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into an active L2 table", + }, + { + .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into the refcount table", + }, + { + .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into a refcount block", + }, + { + .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into the snapshot table", + }, + { + .name = QCOW2_OPT_OVERLAP_INACTIVE_L1, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into an inactive L1 table", + }, + { + .name = QCOW2_OPT_OVERLAP_INACTIVE_L2, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into an inactive L2 table", + }, { /* end of list */ } }, }; -static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) +static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = { + [QCOW2_OL_MAIN_HEADER_BITNR] = QCOW2_OPT_OVERLAP_MAIN_HEADER, + [QCOW2_OL_ACTIVE_L1_BITNR] = QCOW2_OPT_OVERLAP_ACTIVE_L1, + [QCOW2_OL_ACTIVE_L2_BITNR] = QCOW2_OPT_OVERLAP_ACTIVE_L2, + [QCOW2_OL_REFCOUNT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE, + [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK, + [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE, + [QCOW2_OL_INACTIVE_L1_BITNR] = QCOW2_OPT_OVERLAP_INACTIVE_L1, + [QCOW2_OL_INACTIVE_L2_BITNR] = QCOW2_OPT_OVERLAP_INACTIVE_L2, +}; + +static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVQcowState *s = bs->opaque; - int len, i, ret = 0; + unsigned int len, i; + int ret = 0; QCowHeader header; QemuOpts *opts; Error *local_err = NULL; uint64_t ext_end; uint64_t l1_vm_state_index; + const char *opt_overlap_check; + int overlap_check_template = 0; ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read qcow2 header"); goto fail; } be32_to_cpus(&header.magic); @@ -344,17 +478,30 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) be32_to_cpus(&header.nb_snapshots); if (header.magic != QCOW_MAGIC) { - ret = -EMEDIUMTYPE; + error_setg(errp, "Image is not in qcow2 format"); + ret = -EINVAL; goto fail; } if (header.version < 2 || header.version > 3) { - report_unsupported(bs, "QCOW version %d", header.version); + report_unsupported(bs, errp, "QCOW version %d", header.version); ret = -ENOTSUP; 
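Editor's note: the overflow discipline in validate_table_offset() above generalizes; here is a standalone version with the cluster size passed as a parameter instead of read from BDRVQcowState (the values in main() are arbitrary).

    #include <stdint.h>
    #include <stdio.h>

    /* Reject entries * entry_len overflowing INT64_MAX, offset + size
     * overflowing, and offsets that are not cluster aligned -- the same
     * three conditions, in the same order, as validate_table_offset(). */
    static int validate_table(uint64_t offset, uint64_t entries,
                              size_t entry_len, uint64_t cluster_size)
    {
        if (entries > INT64_MAX / entry_len) return -1;  /* mul overflow */
        uint64_t size = entries * entry_len;
        if (INT64_MAX - size < offset) return -1;        /* add overflow */
        if (offset & (cluster_size - 1)) return -1;      /* misaligned */
        return 0;
    }

    int main(void)
    {
        printf("%d\n", validate_table(65536, 1 << 20, 8, 65536));    /* 0 */
        printf("%d\n", validate_table(65537, 1 << 20, 8, 65536));    /* -1 */
        printf("%d\n", validate_table(0, UINT64_MAX / 4, 8, 65536)); /* -1 */
        return 0;
    }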
goto fail; } s->qcow_version = header.version; + /* Initialise cluster size */ + if (header.cluster_bits < MIN_CLUSTER_BITS || + header.cluster_bits > MAX_CLUSTER_BITS) { + error_setg(errp, "Unsupported cluster size: 2^%i", header.cluster_bits); + ret = -EINVAL; + goto fail; + } + + s->cluster_bits = header.cluster_bits; + s->cluster_size = 1 << s->cluster_bits; + s->cluster_sectors = 1 << (s->cluster_bits - 9); + /* Initialise version 3 header fields */ if (header.version == 2) { header.incompatible_features = 0; @@ -368,6 +515,18 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) be64_to_cpus(&header.autoclear_features); be32_to_cpus(&header.refcount_order); be32_to_cpus(&header.header_length); + + if (header.header_length < 104) { + error_setg(errp, "qcow2 header too short"); + ret = -EINVAL; + goto fail; + } + } + + if (header.header_length > s->cluster_size) { + error_setg(errp, "qcow2 header exceeds cluster size"); + ret = -EINVAL; + goto fail; } if (header.header_length > sizeof(header)) { @@ -376,10 +535,18 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields, s->unknown_header_fields_size); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read unknown qcow2 header " + "fields"); goto fail; } } + if (header.backing_file_offset > s->cluster_size) { + error_setg(errp, "Invalid backing file offset"); + ret = -EINVAL; + goto fail; + } + if (header.backing_file_offset) { ext_end = header.backing_file_offset; } else { @@ -394,28 +561,38 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) { void *feature_table = NULL; qcow2_read_extensions(bs, header.header_length, ext_end, - &feature_table); - report_unsupported_feature(bs, feature_table, + &feature_table, NULL); + report_unsupported_feature(bs, errp, feature_table, s->incompatible_features & ~QCOW2_INCOMPAT_MASK); ret = -ENOTSUP; + g_free(feature_table); goto fail; } + if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) { + /* Corrupt images may not be written to unless they are being repaired + */ + if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) { + error_setg(errp, "qcow2: Image is corrupt; cannot be opened " + "read/write"); + ret = -EACCES; + goto fail; + } + } + /* Check support for various header values */ if (header.refcount_order != 4) { - report_unsupported(bs, "%d bit reference counts", + report_unsupported(bs, errp, "%d bit reference counts", 1 << header.refcount_order); ret = -ENOTSUP; goto fail; } + s->refcount_order = header.refcount_order; - if (header.cluster_bits < MIN_CLUSTER_BITS || - header.cluster_bits > MAX_CLUSTER_BITS) { - ret = -EINVAL; - goto fail; - } if (header.crypt_method > QCOW_CRYPT_AES) { + error_setg(errp, "Unsupported encryption method: %i", + header.crypt_method); ret = -EINVAL; goto fail; } @@ -423,27 +600,57 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) if (s->crypt_method_header) { bs->encrypted = 1; } - s->cluster_bits = header.cluster_bits; - s->cluster_size = 1 << s->cluster_bits; - s->cluster_sectors = 1 << (s->cluster_bits - 9); + s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */ s->l2_size = 1 << s->l2_bits; bs->total_sectors = header.size / 512; s->csize_shift = (62 - (s->cluster_bits - 8)); s->csize_mask = (1 << (s->cluster_bits - 8)) - 1; s->cluster_offset_mask = (1LL << s->csize_shift) - 1; + s->refcount_table_offset = 
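Editor's note: the hunk above moves the cluster-geometry computation ahead of every field that depends on it; the derived values are pure arithmetic on cluster_bits. A quick sketch, assuming the usual 9..21 range for MIN_CLUSTER_BITS/MAX_CLUSTER_BITS (those constants' values are not shown in this diff).

    #include <stdio.h>

    int main(void)
    {
        for (int cluster_bits = 9; cluster_bits <= 21; cluster_bits += 6) {
            int cluster_size = 1 << cluster_bits;
            int cluster_sectors = 1 << (cluster_bits - 9); /* 512 B sectors */
            int l2_bits = cluster_bits - 3;  /* one cluster of 8-byte entries */
            printf("2^%d: cluster %d bytes, %d sectors, L2 covers %lld bytes\n",
                   cluster_bits, cluster_size, cluster_sectors,
                   (long long)(1 << l2_bits) * cluster_size);
        }
        return 0;
    }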
header.refcount_table_offset; s->refcount_table_size = header.refcount_table_clusters << (s->cluster_bits - 3); - s->snapshots_offset = header.snapshots_offset; - s->nb_snapshots = header.nb_snapshots; + if (header.refcount_table_clusters > qcow2_max_refcount_clusters(s)) { + error_setg(errp, "Reference count table too large"); + ret = -EINVAL; + goto fail; + } + + ret = validate_table_offset(bs, s->refcount_table_offset, + s->refcount_table_size, sizeof(uint64_t)); + if (ret < 0) { + error_setg(errp, "Invalid reference count table offset"); + goto fail; + } + + /* Snapshot table offset/length */ + if (header.nb_snapshots > QCOW_MAX_SNAPSHOTS) { + error_setg(errp, "Too many snapshots"); + ret = -EINVAL; + goto fail; + } + + ret = validate_table_offset(bs, header.snapshots_offset, + header.nb_snapshots, + sizeof(QCowSnapshotHeader)); + if (ret < 0) { + error_setg(errp, "Invalid snapshot table offset"); + goto fail; + } /* read the level 1 table */ + if (header.l1_size > QCOW_MAX_L1_SIZE) { + error_setg(errp, "Active L1 table too large"); + ret = -EFBIG; + goto fail; + } s->l1_size = header.l1_size; l1_vm_state_index = size_to_l1(s, header.size); if (l1_vm_state_index > INT_MAX) { + error_setg(errp, "Image is too big"); ret = -EFBIG; goto fail; } @@ -452,16 +659,27 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) /* the L1 table must contain at least enough entries to put header.size bytes */ if (s->l1_size < s->l1_vm_state_index) { + error_setg(errp, "L1 table is too small"); ret = -EINVAL; goto fail; } + + ret = validate_table_offset(bs, header.l1_table_offset, + header.l1_size, sizeof(uint64_t)); + if (ret < 0) { + error_setg(errp, "Invalid L1 table offset"); + goto fail; + } s->l1_table_offset = header.l1_table_offset; + + if (s->l1_size > 0) { s->l1_table = g_malloc0( align_offset(s->l1_size * sizeof(uint64_t), 512)); ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read L1 table"); goto fail; } for(i = 0;i < s->l1_size; i++) { @@ -482,6 +700,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) ret = qcow2_refcount_init(bs); if (ret != 0) { + error_setg_errno(errp, -ret, "Could not initialize refcount handling"); goto fail; } @@ -489,7 +708,9 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) QTAILQ_INIT(&s->discards); /* read qcow2 extensions */ - if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) { + if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL, + &local_err)) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } @@ -497,27 +718,36 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) /* read the backing file name */ if (header.backing_file_offset != 0) { len = header.backing_file_size; - if (len > 1023) { - len = 1023; + if (len > MIN(1023, s->cluster_size - header.backing_file_offset)) { + error_setg(errp, "Backing file name too long"); + ret = -EINVAL; + goto fail; } ret = bdrv_pread(bs->file, header.backing_file_offset, bs->backing_file, len); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read backing file name"); goto fail; } bs->backing_file[len] = '\0'; } + /* Internal snapshots */ + s->snapshots_offset = header.snapshots_offset; + s->nb_snapshots = header.nb_snapshots; + ret = qcow2_read_snapshots(bs); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read snapshots"); goto fail; } /* Clear unknown autoclear feature bits 
*/ - if (!bs->read_only && s->autoclear_features != 0) { + if (!bs->read_only && !(flags & BDRV_O_INCOMING) && s->autoclear_features) { s->autoclear_features = 0; ret = qcow2_update_header(bs); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not update qcow2 header"); goto fail; } } @@ -526,22 +756,22 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) qemu_co_mutex_init(&s->lock); /* Repair image if dirty */ - if (!(flags & BDRV_O_CHECK) && !bs->read_only && + if (!(flags & (BDRV_O_CHECK | BDRV_O_INCOMING)) && !bs->read_only && (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) { BdrvCheckResult result = {0}; ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not repair dirty image"); goto fail; } } /* Enable lazy_refcounts according to image and command line options */ - opts = qemu_opts_create_nofail(&qcow2_runtime_opts); + opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } @@ -559,11 +789,38 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) s->discard_passthrough[QCOW2_DISCARD_OTHER] = qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false); + opt_overlap_check = qemu_opt_get(opts, "overlap-check") ?: "cached"; + if (!strcmp(opt_overlap_check, "none")) { + overlap_check_template = 0; + } else if (!strcmp(opt_overlap_check, "constant")) { + overlap_check_template = QCOW2_OL_CONSTANT; + } else if (!strcmp(opt_overlap_check, "cached")) { + overlap_check_template = QCOW2_OL_CACHED; + } else if (!strcmp(opt_overlap_check, "all")) { + overlap_check_template = QCOW2_OL_ALL; + } else { + error_setg(errp, "Unsupported value '%s' for qcow2 option " + "'overlap-check'. 
Allowed are either of the following: " + "none, constant, cached, all", opt_overlap_check); + qemu_opts_del(opts); + ret = -EINVAL; + goto fail; + } + + s->overlap_check = 0; + for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) { + /* overlap-check defines a template bitmask, but every flag may be + * overwritten through the associated boolean option */ + s->overlap_check |= + qemu_opt_get_bool(opts, overlap_bool_option_names[i], + overlap_check_template & (1 << i)) << i; + } + qemu_opts_del(opts); if (s->use_lazy_refcounts && s->qcow_version < 3) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, "Lazy refcounts require " - "a qcow2 image with at least qemu 1.1 compatibility level"); + error_setg(errp, "Lazy refcounts require a qcow2 image with at least " + "qemu 1.1 compatibility level"); ret = -EINVAL; goto fail; } @@ -582,14 +839,28 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) qcow2_free_snapshots(bs); qcow2_refcount_close(bs); g_free(s->l1_table); + /* else pre-write overlap checks in cache_destroy may crash */ + s->l1_table = NULL; if (s->l2_table_cache) { qcow2_cache_destroy(bs, s->l2_table_cache); } + if (s->refcount_block_cache) { + qcow2_cache_destroy(bs, s->refcount_block_cache); + } g_free(s->cluster_cache); qemu_vfree(s->cluster_data); return ret; } +static int qcow2_refresh_limits(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + + bs->bl.write_zeroes_alignment = s->cluster_sectors; + + return 0; +} + static int qcow2_set_key(BlockDriverState *bs, const char *key) { BDRVQcowState *s = bs->opaque; @@ -632,32 +903,56 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key) return 0; } -/* We have nothing to do for QCOW2 reopen, stubs just return - * success */ +/* We have no actual commit/abort logic for qcow2, but we need to write out any + * unwritten data if we reopen read-only. 
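Editor's note: how the overlap-check template composes with the per-check booleans parsed just above — a sketch with qemu_opt_get_bool() replaced by a plain tri-state array. The 0x7f value for the cached tier follows from the bit numbers in the qcow2.h hunk further down (bits 0 through 6).

    #include <stdio.h>

    #define NBITS 8  /* QCOW2_OL_MAX_BITNR in the qcow2.h hunk below */

    /* override[i]: -1 = option not given (keep the template bit),
     * 0/1 = explicit per-check boolean. */
    static int build_overlap_mask(int tmpl, const int *override)
    {
        int mask = 0;
        for (int i = 0; i < NBITS; i++) {
            int bit = override[i] >= 0 ? override[i] : !!(tmpl & (1 << i));
            mask |= bit << i;
        }
        return mask;
    }

    int main(void)
    {
        int cached = 0x7f;  /* QCOW2_OL_CACHED: bits 0-6 */
        int override[NBITS] = { -1, -1, -1, -1, -1, -1, -1, 1 };
        /* "overlap-check=cached" plus "overlap-check.inactive-l2=on" */
        printf("0x%02x\n", build_overlap_mask(cached, override));  /* 0xff */
        return 0;
    }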
*/ static int qcow2_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue, Error **errp) { + int ret; + + if ((state->flags & BDRV_O_RDWR) == 0) { + ret = bdrv_flush(state->bs); + if (ret < 0) { + return ret; + } + + ret = qcow2_mark_clean(state->bs); + if (ret < 0) { + return ret; + } + } + return 0; } -static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { BDRVQcowState *s = bs->opaque; uint64_t cluster_offset; - int ret; + int index_in_cluster, ret; + int64_t status = 0; *pnum = nb_sectors; - /* FIXME We can get errors here, but the bdrv_co_is_allocated interface - * can't pass them on today */ qemu_co_mutex_lock(&s->lock); ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset); qemu_co_mutex_unlock(&s->lock); if (ret < 0) { - *pnum = 0; + return ret; } - return (cluster_offset != 0) || (ret == QCOW2_CLUSTER_ZERO); + if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED && + !s->crypt_method) { + index_in_cluster = sector_num & (s->cluster_sectors - 1); + cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); + status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset; + } + if (ret == QCOW2_CLUSTER_ZERO) { + status |= BDRV_BLOCK_ZERO; + } else if (ret != QCOW2_CLUSTER_UNALLOCATED) { + status |= BDRV_BLOCK_DATA; + } + return status; } /* handle reading after the end of the backing file */ @@ -821,7 +1116,6 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, { BDRVQcowState *s = bs->opaque; int index_in_cluster; - int n_end; int ret; int cur_nr_sectors; /* number of sectors in current iteration */ uint64_t cluster_offset; @@ -845,14 +1139,16 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, trace_qcow2_writev_start_part(qemu_coroutine_self()); index_in_cluster = sector_num & (s->cluster_sectors - 1); - n_end = index_in_cluster + remaining_sectors; + cur_nr_sectors = remaining_sectors; if (s->crypt_method && - n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) { - n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; + cur_nr_sectors > + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster) { + cur_nr_sectors = + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster; } ret = qcow2_alloc_cluster_offset(bs, sector_num << 9, - index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta); + &cur_nr_sectors, &cluster_offset, &l2meta); if (ret < 0) { goto fail; } @@ -881,6 +1177,13 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, cur_nr_sectors * 512); } + ret = qcow2_pre_write_overlap_check(bs, 0, + cluster_offset + index_in_cluster * BDRV_SECTOR_SIZE, + cur_nr_sectors * BDRV_SECTOR_SIZE); + if (ret < 0) { + goto fail; + } + qemu_co_mutex_unlock(&s->lock); BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); trace_qcow2_writev_data(qemu_coroutine_self(), @@ -947,11 +1250,15 @@ static void qcow2_close(BlockDriverState *bs) { BDRVQcowState *s = bs->opaque; g_free(s->l1_table); + /* else pre-write overlap checks in cache_destroy may crash */ + s->l1_table = NULL; - qcow2_cache_flush(bs, s->l2_table_cache); - qcow2_cache_flush(bs, s->refcount_block_cache); + if (!(bs->open_flags & BDRV_O_INCOMING)) { + qcow2_cache_flush(bs, s->l2_table_cache); + qcow2_cache_flush(bs, s->refcount_block_cache); - qcow2_mark_clean(bs); + qcow2_mark_clean(bs); + } qcow2_cache_destroy(bs, s->l2_table_cache); qcow2_cache_destroy(bs, s->refcount_block_cache); @@ -965,7 +1272,7 @@ static void 
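Editor's note: the flag mapping in qcow2_co_get_block_status() above, reduced to a table-style sketch. The enum values here are illustrative, not QEMU's BDRV_BLOCK_* constants, and the real function additionally ORs the host cluster offset into the returned status.

    #include <stdio.h>
    #include <stdint.h>

    enum { BLOCK_DATA = 1, BLOCK_ZERO = 2, BLOCK_OFFSET_VALID = 4 };
    enum { CL_UNALLOCATED, CL_ZERO, CL_NORMAL, CL_COMPRESSED };

    /* A host offset is only reported for plain, unencrypted clusters;
     * zero clusters report ZERO; everything but unallocated reports DATA. */
    static int64_t status_for(int type, uint64_t host_offset, int encrypted)
    {
        int64_t status = 0;
        if (host_offset && type != CL_COMPRESSED && !encrypted) {
            status |= BLOCK_OFFSET_VALID;
        }
        if (type == CL_ZERO) {
            status |= BLOCK_ZERO;
        } else if (type != CL_UNALLOCATED) {
            status |= BLOCK_DATA;
        }
        return status;
    }

    int main(void)
    {
        printf("normal:  %lld\n", (long long)status_for(CL_NORMAL, 65536, 0));
        printf("zero:    %lld\n", (long long)status_for(CL_ZERO, 0, 0));
        printf("unalloc: %lld\n", (long long)status_for(CL_UNALLOCATED, 0, 0));
        return 0;
    }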
qcow2_close(BlockDriverState *bs) qcow2_free_snapshots(bs); } -static void qcow2_invalidate_cache(BlockDriverState *bs) +static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) { BDRVQcowState *s = bs->opaque; int flags = s->flags; @@ -973,6 +1280,8 @@ static void qcow2_invalidate_cache(BlockDriverState *bs) AES_KEY aes_decrypt_key; uint32_t crypt_method = 0; QDict *options; + Error *local_err = NULL; + int ret; /* * Backing files are read-only which makes all of their metadata immutable, @@ -987,12 +1296,25 @@ static void qcow2_invalidate_cache(BlockDriverState *bs) qcow2_close(bs); - options = qdict_new(); - qdict_put(options, QCOW2_OPT_LAZY_REFCOUNTS, - qbool_from_int(s->use_lazy_refcounts)); + bdrv_invalidate_cache(bs->file, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } memset(s, 0, sizeof(BDRVQcowState)); - qcow2_open(bs, options, flags); + options = qdict_clone_shallow(bs->options); + + ret = qcow2_open(bs, options, flags, &local_err); + if (local_err) { + error_setg(errp, "Could not reopen qcow2 layer: %s", + error_get_pretty(local_err)); + error_free(local_err); + return; + } else if (ret < 0) { + error_setg_errno(errp, -ret, "Could not reopen qcow2 layer"); + return; + } QDECREF(options); @@ -1076,7 +1398,7 @@ int qcow2_update_header(BlockDriverState *bs) .incompatible_features = cpu_to_be64(s->incompatible_features), .compatible_features = cpu_to_be64(s->compatible_features), .autoclear_features = cpu_to_be64(s->autoclear_features), - .refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT), + .refcount_order = cpu_to_be32(s->refcount_order), .header_length = cpu_to_be32(header_length), }; @@ -1130,6 +1452,11 @@ int qcow2_update_header(BlockDriverState *bs) .name = "dirty bit", }, { + .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, + .bit = QCOW2_INCOMPAT_CORRUPT_BITNR, + .name = "corrupt bit", + }, + { .type = QCOW2_FEAT_TYPE_COMPATIBLE, .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, .name = "lazy refcounts", @@ -1210,34 +1537,39 @@ static int preallocate(BlockDriverState *bs) int ret; QCowL2Meta *meta; - nb_sectors = bdrv_getlength(bs) >> 9; + nb_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS; offset = 0; while (nb_sectors) { - num = MIN(nb_sectors, INT_MAX >> 9); - ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, + num = MIN(nb_sectors, INT_MAX >> BDRV_SECTOR_BITS); + ret = qcow2_alloc_cluster_offset(bs, offset, &num, &host_offset, &meta); if (ret < 0) { return ret; } - ret = qcow2_alloc_cluster_link_l2(bs, meta); - if (ret < 0) { - qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters, - QCOW2_DISCARD_NEVER); - return ret; - } + while (meta) { + QCowL2Meta *next = meta->next; + + ret = qcow2_alloc_cluster_link_l2(bs, meta); + if (ret < 0) { + qcow2_free_any_clusters(bs, meta->alloc_offset, + meta->nb_clusters, QCOW2_DISCARD_NEVER); + return ret; + } - /* There are no dependent requests, but we need to remove our request - * from the list of in-flight requests */ - if (meta != NULL) { + /* There are no dependent requests, but we need to remove our + * request from the list of in-flight requests */ QLIST_REMOVE(meta, next_in_flight); + + g_free(meta); + meta = next; } /* TODO Preallocate data if requested */ nb_sectors -= num; - offset += num << 9; + offset += num << BDRV_SECTOR_BITS; } /* @@ -1246,9 +1578,10 @@ static int preallocate(BlockDriverState *bs) * EOF). Extend the image to the last allocated sector. 
*/ if (host_offset != 0) { - uint8_t buf[512]; - memset(buf, 0, 512); - ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1); + uint8_t buf[BDRV_SECTOR_SIZE]; + memset(buf, 0, BDRV_SECTOR_SIZE); + ret = bdrv_write(bs->file, (host_offset >> BDRV_SECTOR_BITS) + num - 1, + buf, 1); if (ret < 0) { return ret; } @@ -1260,7 +1593,8 @@ static int preallocate(BlockDriverState *bs) static int qcow2_create2(const char *filename, int64_t total_size, const char *backing_file, const char *backing_format, int flags, size_t cluster_size, int prealloc, - QEMUOptionParameter *options, int version) + QEMUOptionParameter *options, int version, + Error **errp) { /* Calculate cluster_bits */ int cluster_bits; @@ -1268,9 +1602,8 @@ static int qcow2_create2(const char *filename, int64_t total_size, if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || (1 << cluster_bits) != cluster_size) { - error_report( - "Cluster size must be a power of two between %d and %dk", - 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10)); + error_setg(errp, "Cluster size must be a power of two between %d and " + "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10)); return -EINVAL; } @@ -1287,59 +1620,72 @@ static int qcow2_create2(const char *filename, int64_t total_size, * size for any qcow2 image. */ BlockDriverState* bs; - QCowHeader header; - uint8_t* refcount_table; + QCowHeader *header; + uint64_t* refcount_table; + Error *local_err = NULL; int ret; - ret = bdrv_create_file(filename, options); + ret = bdrv_create_file(filename, options, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } - ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR); + bs = NULL; + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + NULL, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } /* Write the header */ - memset(&header, 0, sizeof(header)); - header.magic = cpu_to_be32(QCOW_MAGIC); - header.version = cpu_to_be32(version); - header.cluster_bits = cpu_to_be32(cluster_bits); - header.size = cpu_to_be64(0); - header.l1_table_offset = cpu_to_be64(0); - header.l1_size = cpu_to_be32(0); - header.refcount_table_offset = cpu_to_be64(cluster_size); - header.refcount_table_clusters = cpu_to_be32(1); - header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT); - header.header_length = cpu_to_be32(sizeof(header)); + QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header)); + header = g_malloc0(cluster_size); + *header = (QCowHeader) { + .magic = cpu_to_be32(QCOW_MAGIC), + .version = cpu_to_be32(version), + .cluster_bits = cpu_to_be32(cluster_bits), + .size = cpu_to_be64(0), + .l1_table_offset = cpu_to_be64(0), + .l1_size = cpu_to_be32(0), + .refcount_table_offset = cpu_to_be64(cluster_size), + .refcount_table_clusters = cpu_to_be32(1), + .refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT), + .header_length = cpu_to_be32(sizeof(*header)), + }; if (flags & BLOCK_FLAG_ENCRYPT) { - header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); + header->crypt_method = cpu_to_be32(QCOW_CRYPT_AES); } else { - header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); + header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); } if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) { - header.compatible_features |= + header->compatible_features |= cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); } - ret = bdrv_pwrite(bs, 0, &header, sizeof(header)); + ret = bdrv_pwrite(bs, 0, header, cluster_size); + g_free(header); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write qcow2 
header"); goto out; } - /* Write an empty refcount table */ - refcount_table = g_malloc0(cluster_size); - ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size); + /* Write a refcount table with one refcount block */ + refcount_table = g_malloc0(2 * cluster_size); + refcount_table[0] = cpu_to_be64(2 * cluster_size); + ret = bdrv_pwrite(bs, cluster_size, refcount_table, 2 * cluster_size); g_free(refcount_table); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write refcount table"); goto out; } - bdrv_close(bs); + bdrv_unref(bs); + bs = NULL; /* * And now open the image and make it consistent first (i.e. increase the @@ -1348,14 +1694,17 @@ static int qcow2_create2(const char *filename, int64_t total_size, */ BlockDriver* drv = bdrv_find_format("qcow2"); assert(drv != NULL); - ret = bdrv_open(bs, filename, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv); + ret = bdrv_open(&bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv, &local_err); if (ret < 0) { + error_propagate(errp, local_err); goto out; } - ret = qcow2_alloc_clusters(bs, 2 * cluster_size); + ret = qcow2_alloc_clusters(bs, 3 * cluster_size); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 " + "header and refcount table"); goto out; } else if (ret != 0) { @@ -1366,6 +1715,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, /* Okay, now that we have a valid image, let's give it the right size */ ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not resize image"); goto out; } @@ -1373,6 +1723,8 @@ static int qcow2_create2(const char *filename, int64_t total_size, if (backing_file) { ret = bdrv_change_backing_file(bs, backing_file, backing_format); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not assign backing file '%s' " + "with format '%s'", backing_file, backing_format); goto out; } } @@ -1384,17 +1736,33 @@ static int qcow2_create2(const char *filename, int64_t total_size, ret = preallocate(bs); qemu_co_mutex_unlock(&s->lock); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not preallocate metadata"); goto out; } } + bdrv_unref(bs); + bs = NULL; + + /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning */ + ret = bdrv_open(&bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_BACKING, + drv, &local_err); + if (local_err) { + error_propagate(errp, local_err); + goto out; + } + ret = 0; out: - bdrv_delete(bs); + if (bs) { + bdrv_unref(bs); + } return ret; } -static int qcow2_create(const char *filename, QEMUOptionParameter *options) +static int qcow2_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { const char *backing_file = NULL; const char *backing_fmt = NULL; @@ -1402,7 +1770,9 @@ static int qcow2_create(const char *filename, QEMUOptionParameter *options) int flags = 0; size_t cluster_size = DEFAULT_CLUSTER_SIZE; int prealloc = 0; - int version = 2; + int version = 3; + Error *local_err = NULL; + int ret; /* Read out options */ while (options && options->name) { @@ -1424,18 +1794,20 @@ static int qcow2_create(const char *filename, QEMUOptionParameter *options) } else if (!strcmp(options->value.s, "metadata")) { prealloc = 1; } else { - fprintf(stderr, "Invalid preallocation mode: '%s'\n", - options->value.s); + error_setg(errp, "Invalid preallocation mode: '%s'", + options->value.s); return -EINVAL; } } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) { - if 
(!options->value.s || !strcmp(options->value.s, "0.10")) { + if (!options->value.s) { + /* keep the default */ + } else if (!strcmp(options->value.s, "0.10")) { version = 2; } else if (!strcmp(options->value.s, "1.1")) { version = 3; } else { - fprintf(stderr, "Invalid compatibility level: '%s'\n", - options->value.s); + error_setg(errp, "Invalid compatibility level: '%s'", + options->value.s); return -EINVAL; } } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) { @@ -1445,43 +1817,27 @@ static int qcow2_create(const char *filename, QEMUOptionParameter *options) } if (backing_file && prealloc) { - fprintf(stderr, "Backing file and preallocation cannot be used at " - "the same time\n"); + error_setg(errp, "Backing file and preallocation cannot be used at " + "the same time"); return -EINVAL; } if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) { - fprintf(stderr, "Lazy refcounts only supported with compatibility " - "level 1.1 and above (use compat=1.1 or greater)\n"); + error_setg(errp, "Lazy refcounts only supported with compatibility " + "level 1.1 and above (use compat=1.1 or greater)"); return -EINVAL; } - return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags, - cluster_size, prealloc, options, version); -} - -static int qcow2_make_empty(BlockDriverState *bs) -{ -#if 0 - /* XXX: not correct */ - BDRVQcowState *s = bs->opaque; - uint32_t l1_length = s->l1_size * sizeof(uint64_t); - int ret; - - memset(s->l1_table, 0, l1_length); - if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0) - return -1; - ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length); - if (ret < 0) - return ret; - - l2_cache_reset(bs); -#endif - return 0; + ret = qcow2_create2(filename, sectors, backing_file, backing_fmt, flags, + cluster_size, prealloc, options, version, &local_err); + if (local_err) { + error_propagate(errp, local_err); + } + return ret; } static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) { int ret; BDRVQcowState *s = bs->opaque; @@ -1508,7 +1864,7 @@ static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, qemu_co_mutex_lock(&s->lock); ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors); + nb_sectors, QCOW2_DISCARD_REQUEST); qemu_co_mutex_unlock(&s->lock); return ret; } @@ -1631,6 +1987,12 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, goto fail; } cluster_offset &= s->cluster_offset_mask; + + ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); if (ret < 0) { @@ -1668,19 +2030,43 @@ static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) return 0; } -static int64_t qcow2_vm_state_offset(BDRVQcowState *s) -{ - return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); -} - static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { BDRVQcowState *s = bs->opaque; + bdi->unallocated_blocks_are_zero = true; + bdi->can_write_zeroes_with_unmap = (s->qcow_version >= 3); bdi->cluster_size = s->cluster_size; bdi->vm_state_offset = qcow2_vm_state_offset(s); return 0; } +static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + ImageInfoSpecific *spec_info = g_new(ImageInfoSpecific, 1); + + *spec_info = 
(ImageInfoSpecific){ + .kind = IMAGE_INFO_SPECIFIC_KIND_QCOW2, + { + .qcow2 = g_new(ImageInfoSpecificQCow2, 1), + }, + }; + if (s->qcow_version == 2) { + *spec_info->qcow2 = (ImageInfoSpecificQCow2){ + .compat = g_strdup("0.10"), + }; + } else if (s->qcow_version == 3) { + *spec_info->qcow2 = (ImageInfoSpecificQCow2){ + .compat = g_strdup("1.1"), + .lazy_refcounts = s->compatible_features & + QCOW2_COMPAT_LAZY_REFCOUNTS, + .has_lazy_refcounts = true, + }; + } + + return spec_info; +} + #if 0 static void dump_refcounts(BlockDriverState *bs) { @@ -1706,13 +2092,22 @@ static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) { BDRVQcowState *s = bs->opaque; + int64_t total_sectors = bs->total_sectors; int growable = bs->growable; + bool zero_beyond_eof = bs->zero_beyond_eof; int ret; BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); bs->growable = 1; + bs->zero_beyond_eof = false; ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov); bs->growable = growable; + bs->zero_beyond_eof = zero_beyond_eof; + + /* bdrv_co_do_writev will have increased the total_sectors value to include + * the VM state - the VM state is however not an actual part of the block + * device, therefore, we need to restore the old value. */ + bs->total_sectors = total_sectors; return ret; } @@ -1722,16 +2117,212 @@ static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf, { BDRVQcowState *s = bs->opaque; int growable = bs->growable; + bool zero_beyond_eof = bs->zero_beyond_eof; int ret; BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); bs->growable = 1; + bs->zero_beyond_eof = false; ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size); bs->growable = growable; + bs->zero_beyond_eof = zero_beyond_eof; return ret; } +/* + * Downgrades an image's version. To achieve this, any incompatible features + * have to be removed. 
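Editor's note: the compat string reported by the new specific-info hook above is a direct mapping from the on-disk version number to the value users pass at creation time:

    #include <stdio.h>

    /* qcow_version -> "compat" string, as in qcow2_get_specific_info(). */
    static const char *compat_str(int qcow_version)
    {
        return qcow_version == 2 ? "0.10"
             : qcow_version == 3 ? "1.1"
             : "unknown";
    }

    int main(void)
    {
        printf("%s %s\n", compat_str(2), compat_str(3));  /* 0.10 1.1 */
        return 0;
    }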
+ */ +static int qcow2_downgrade(BlockDriverState *bs, int target_version) +{ + BDRVQcowState *s = bs->opaque; + int current_version = s->qcow_version; + int ret; + + if (target_version == current_version) { + return 0; + } else if (target_version > current_version) { + return -EINVAL; + } else if (target_version != 2) { + return -EINVAL; + } + + if (s->refcount_order != 4) { + /* we would have to convert the image to a refcount_order == 4 image + * here; however, since qemu (at the time of writing this) does not + * support anything different than 4 anyway, there is no point in doing + * so right now; however, we should error out (if qemu supports this in + * the future and this code has not been adapted) */ + error_report("qcow2_downgrade: Image refcount orders other than 4 are " + "currently not supported."); + return -ENOTSUP; + } + + /* clear incompatible features */ + if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { + ret = qcow2_mark_clean(bs); + if (ret < 0) { + return ret; + } + } + + /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in + * the first place; if that happens nonetheless, returning -ENOTSUP is the + * best thing to do anyway */ + + if (s->incompatible_features) { + return -ENOTSUP; + } + + /* since we can ignore compatible features, we can set them to 0 as well */ + s->compatible_features = 0; + /* if lazy refcounts have been used, they have already been fixed through + * clearing the dirty flag */ + + /* clearing autoclear features is trivial */ + s->autoclear_features = 0; + + ret = qcow2_expand_zero_clusters(bs); + if (ret < 0) { + return ret; + } + + s->qcow_version = target_version; + ret = qcow2_update_header(bs); + if (ret < 0) { + s->qcow_version = current_version; + return ret; + } + return 0; +} + +static int qcow2_amend_options(BlockDriverState *bs, + QEMUOptionParameter *options) +{ + BDRVQcowState *s = bs->opaque; + int old_version = s->qcow_version, new_version = old_version; + uint64_t new_size = 0; + const char *backing_file = NULL, *backing_format = NULL; + bool lazy_refcounts = s->use_lazy_refcounts; + int ret; + int i; + + for (i = 0; options[i].name; i++) + { + if (!options[i].assigned) { + /* only change explicitly defined options */ + continue; + } + + if (!strcmp(options[i].name, "compat")) { + if (!options[i].value.s) { + /* preserve default */ + } else if (!strcmp(options[i].value.s, "0.10")) { + new_version = 2; + } else if (!strcmp(options[i].value.s, "1.1")) { + new_version = 3; + } else { + fprintf(stderr, "Unknown compatibility level %s.\n", + options[i].value.s); + return -EINVAL; + } + } else if (!strcmp(options[i].name, "preallocation")) { + fprintf(stderr, "Cannot change preallocation mode.\n"); + return -ENOTSUP; + } else if (!strcmp(options[i].name, "size")) { + new_size = options[i].value.n; + } else if (!strcmp(options[i].name, "backing_file")) { + backing_file = options[i].value.s; + } else if (!strcmp(options[i].name, "backing_fmt")) { + backing_format = options[i].value.s; + } else if (!strcmp(options[i].name, "encryption")) { + if ((options[i].value.n != !!s->crypt_method)) { + fprintf(stderr, "Changing the encryption flag is not " + "supported.\n"); + return -ENOTSUP; + } + } else if (!strcmp(options[i].name, "cluster_size")) { + if (options[i].value.n != s->cluster_size) { + fprintf(stderr, "Changing the cluster size is not " + "supported.\n"); + return -ENOTSUP; + } + } else if (!strcmp(options[i].name, "lazy_refcounts")) { + lazy_refcounts = options[i].value.n; + } else { + /* if this 
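Editor's note: the early-outs at the top of qcow2_downgrade() above reduce to a small decision table; a sketch (-EINVAL prints as -22 on Linux):

    #include <errno.h>
    #include <stdio.h>

    /* 0 = nothing to do, -EINVAL = upgrade request or unsupported target,
     * 1 = proceed -- matching the function's early-out logic. */
    static int check_downgrade(int current, int target)
    {
        if (target == current) return 0;
        if (target > current)  return -EINVAL;  /* not a downgrade */
        if (target != 2)       return -EINVAL;  /* only v3 -> v2 supported */
        return 1;
    }

    int main(void)
    {
        printf("%d %d %d\n", check_downgrade(3, 3), check_downgrade(2, 3),
               check_downgrade(3, 2));  /* 0 -22 1 */
        return 0;
    }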
assertion fails, this probably means a new option was + * added without having it covered here */ + assert(false); + } + } + + if (new_version != old_version) { + if (new_version > old_version) { + /* Upgrade */ + s->qcow_version = new_version; + ret = qcow2_update_header(bs); + if (ret < 0) { + s->qcow_version = old_version; + return ret; + } + } else { + ret = qcow2_downgrade(bs, new_version); + if (ret < 0) { + return ret; + } + } + } + + if (backing_file || backing_format) { + ret = qcow2_change_backing_file(bs, backing_file ?: bs->backing_file, + backing_format ?: bs->backing_format); + if (ret < 0) { + return ret; + } + } + + if (s->use_lazy_refcounts != lazy_refcounts) { + if (lazy_refcounts) { + if (s->qcow_version < 3) { + fprintf(stderr, "Lazy refcounts only supported with compatibility " + "level 1.1 and above (use compat=1.1 or greater)\n"); + return -EINVAL; + } + s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS; + ret = qcow2_update_header(bs); + if (ret < 0) { + s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS; + return ret; + } + s->use_lazy_refcounts = true; + } else { + /* make image clean first */ + ret = qcow2_mark_clean(bs); + if (ret < 0) { + return ret; + } + /* now disallow lazy refcounts */ + s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS; + ret = qcow2_update_header(bs); + if (ret < 0) { + s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS; + return ret; + } + s->use_lazy_refcounts = false; + } + } + + if (new_size) { + ret = bdrv_truncate(bs, new_size); + if (ret < 0) { + return ret; + } + } + + return 0; +} + static QEMUOptionParameter qcow2_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -1786,9 +2377,8 @@ static BlockDriver bdrv_qcow2 = { .bdrv_reopen_prepare = qcow2_reopen_prepare, .bdrv_create = qcow2_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_is_allocated = qcow2_co_is_allocated, + .bdrv_co_get_block_status = qcow2_co_get_block_status, .bdrv_set_key = qcow2_set_key, - .bdrv_make_empty = qcow2_make_empty, .bdrv_co_readv = qcow2_co_readv, .bdrv_co_writev = qcow2_co_writev, @@ -1805,16 +2395,19 @@ static BlockDriver bdrv_qcow2 = { .bdrv_snapshot_list = qcow2_snapshot_list, .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp, .bdrv_get_info = qcow2_get_info, + .bdrv_get_specific_info = qcow2_get_specific_info, .bdrv_save_vmstate = qcow2_save_vmstate, .bdrv_load_vmstate = qcow2_load_vmstate, .bdrv_change_backing_file = qcow2_change_backing_file, + .bdrv_refresh_limits = qcow2_refresh_limits, .bdrv_invalidate_cache = qcow2_invalidate_cache, .create_options = qcow2_create_options, .bdrv_check = qcow2_check, + .bdrv_amend_options = qcow2_amend_options, }; static void bdrv_qcow2_init(void) diff --git a/block/qcow2.h b/block/qcow2.h index dba977141..b49424b85 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -38,13 +38,26 @@ #define QCOW_CRYPT_AES 1 #define QCOW_MAX_CRYPT_CLUSTERS 32 +#define QCOW_MAX_SNAPSHOTS 65536 + +/* 8 MB refcount table is enough for 2 PB images at 64k cluster size + * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */ +#define QCOW_MAX_REFTABLE_SIZE 0x800000 + +/* 32 MB L1 table is enough for 2 PB images at 64k cluster size + * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */ +#define QCOW_MAX_L1_SIZE 0x2000000 + +/* Allow for an average of 1k per snapshot table entry, should be plenty of + * space for snapshot names and IDs */ +#define QCOW_MAX_SNAPSHOTS_SIZE (1024 * QCOW_MAX_SNAPSHOTS) /* indicate that the refcount of the referenced cluster is exactly one. 
*/ -#define QCOW_OFLAG_COPIED (1LL << 63) +#define QCOW_OFLAG_COPIED (1ULL << 63) /* indicate that the cluster is compressed (they never have the copied flag) */ -#define QCOW_OFLAG_COMPRESSED (1LL << 62) +#define QCOW_OFLAG_COMPRESSED (1ULL << 62) /* The cluster reads as all zeros */ -#define QCOW_OFLAG_ZERO (1LL << 0) +#define QCOW_OFLAG_ZERO (1ULL << 0) #define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */ @@ -63,6 +76,15 @@ #define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request" #define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot" #define QCOW2_OPT_DISCARD_OTHER "pass-discard-other" +#define QCOW2_OPT_OVERLAP "overlap-check" +#define QCOW2_OPT_OVERLAP_MAIN_HEADER "overlap-check.main-header" +#define QCOW2_OPT_OVERLAP_ACTIVE_L1 "overlap-check.active-l1" +#define QCOW2_OPT_OVERLAP_ACTIVE_L2 "overlap-check.active-l2" +#define QCOW2_OPT_OVERLAP_REFCOUNT_TABLE "overlap-check.refcount-table" +#define QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK "overlap-check.refcount-block" +#define QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE "overlap-check.snapshot-table" +#define QCOW2_OPT_OVERLAP_INACTIVE_L1 "overlap-check.inactive-l1" +#define QCOW2_OPT_OVERLAP_INACTIVE_L2 "overlap-check.inactive-l2" typedef struct QCowHeader { uint32_t magic; @@ -86,7 +108,33 @@ typedef struct QCowHeader { uint32_t refcount_order; uint32_t header_length; -} QCowHeader; +} QEMU_PACKED QCowHeader; + +typedef struct QEMU_PACKED QCowSnapshotHeader { + /* header is 8 byte aligned */ + uint64_t l1_table_offset; + + uint32_t l1_size; + uint16_t id_str_size; + uint16_t name_size; + + uint32_t date_sec; + uint32_t date_nsec; + + uint64_t vm_clock_nsec; + + uint32_t vm_state_size; + uint32_t extra_data_size; /* for extension */ + /* extra data follows */ + /* id_str follows */ + /* name follows */ +} QCowSnapshotHeader; + +typedef struct QEMU_PACKED QCowSnapshotExtraData { + uint64_t vm_state_size_large; + uint64_t disk_size; +} QCowSnapshotExtraData; + typedef struct QCowSnapshot { uint64_t l1_table_offset; @@ -119,9 +167,12 @@ enum { /* Incompatible feature bits */ enum { QCOW2_INCOMPAT_DIRTY_BITNR = 0, + QCOW2_INCOMPAT_CORRUPT_BITNR = 1, QCOW2_INCOMPAT_DIRTY = 1 << QCOW2_INCOMPAT_DIRTY_BITNR, + QCOW2_INCOMPAT_CORRUPT = 1 << QCOW2_INCOMPAT_CORRUPT_BITNR, - QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY, + QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY + | QCOW2_INCOMPAT_CORRUPT, }; /* Compatible feature bits */ @@ -179,8 +230,8 @@ typedef struct BDRVQcowState { uint64_t *refcount_table; uint64_t refcount_table_offset; uint32_t refcount_table_size; - int64_t free_cluster_index; - int64_t free_byte_offset; + uint64_t free_cluster_index; + uint64_t free_byte_offset; CoMutex lock; @@ -190,15 +241,18 @@ typedef struct BDRVQcowState { AES_KEY aes_decrypt_key; uint64_t snapshots_offset; int snapshots_size; - int nb_snapshots; + unsigned int nb_snapshots; QCowSnapshot *snapshots; int flags; int qcow_version; bool use_lazy_refcounts; + int refcount_order; bool discard_passthrough[QCOW2_DISCARD_MAX]; + int overlap_check; /* bitmask of Qcow2MetadataOverlap values */ + uint64_t incompatible_features; uint64_t compatible_features; uint64_t autoclear_features; @@ -286,11 +340,50 @@ enum { QCOW2_CLUSTER_ZERO }; -#define L1E_OFFSET_MASK 0x00ffffffffffff00ULL -#define L2E_OFFSET_MASK 0x00ffffffffffff00ULL +typedef enum QCow2MetadataOverlap { + QCOW2_OL_MAIN_HEADER_BITNR = 0, + QCOW2_OL_ACTIVE_L1_BITNR = 1, + QCOW2_OL_ACTIVE_L2_BITNR = 2, + QCOW2_OL_REFCOUNT_TABLE_BITNR = 3, + QCOW2_OL_REFCOUNT_BLOCK_BITNR = 4, + QCOW2_OL_SNAPSHOT_TABLE_BITNR = 5, + 
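Editor's note: the coverage figures in the new limit comments check out arithmetically — a QCOW_MAX_REFTABLE_SIZE table holds size/8 entries, each naming a refcount block of cluster_size/2 two-byte refcounts, each refcount covering one cluster:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        const uint64_t reftable_size = 0x800000;  /* QCOW_MAX_REFTABLE_SIZE */
        const uint64_t cluster_sizes[] = { 512, 65536, 2 * 1024 * 1024 };

        for (int i = 0; i < 3; i++) {
            uint64_t cs = cluster_sizes[i];
            uint64_t entries = reftable_size / 8;  /* 8-byte table entries */
            uint64_t refs_per_block = cs / 2;      /* 16-bit refcounts */
            uint64_t image_bytes = entries * refs_per_block * cs;
            /* prints 128, 2097152 (2 PiB), 2147483648 (2 EiB) GiB --
             * the 128 GB / 2 PB / 2 EB figures in the comment above */
            printf("%7llu-byte clusters -> %llu GiB\n",
                   (unsigned long long)cs,
                   (unsigned long long)(image_bytes >> 30));
        }
        return 0;
    }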
QCOW2_OL_INACTIVE_L1_BITNR = 6, + QCOW2_OL_INACTIVE_L2_BITNR = 7, + + QCOW2_OL_MAX_BITNR = 8, + + QCOW2_OL_NONE = 0, + QCOW2_OL_MAIN_HEADER = (1 << QCOW2_OL_MAIN_HEADER_BITNR), + QCOW2_OL_ACTIVE_L1 = (1 << QCOW2_OL_ACTIVE_L1_BITNR), + QCOW2_OL_ACTIVE_L2 = (1 << QCOW2_OL_ACTIVE_L2_BITNR), + QCOW2_OL_REFCOUNT_TABLE = (1 << QCOW2_OL_REFCOUNT_TABLE_BITNR), + QCOW2_OL_REFCOUNT_BLOCK = (1 << QCOW2_OL_REFCOUNT_BLOCK_BITNR), + QCOW2_OL_SNAPSHOT_TABLE = (1 << QCOW2_OL_SNAPSHOT_TABLE_BITNR), + QCOW2_OL_INACTIVE_L1 = (1 << QCOW2_OL_INACTIVE_L1_BITNR), + /* NOTE: Checking overlaps with inactive L2 tables will result in bdrv + * reads. */ + QCOW2_OL_INACTIVE_L2 = (1 << QCOW2_OL_INACTIVE_L2_BITNR), +} QCow2MetadataOverlap; + +/* Perform all overlap checks which can be done in constant time */ +#define QCOW2_OL_CONSTANT \ + (QCOW2_OL_MAIN_HEADER | QCOW2_OL_ACTIVE_L1 | QCOW2_OL_REFCOUNT_TABLE | \ + QCOW2_OL_SNAPSHOT_TABLE) + +/* Perform all overlap checks which don't require disk access */ +#define QCOW2_OL_CACHED \ + (QCOW2_OL_CONSTANT | QCOW2_OL_ACTIVE_L2 | QCOW2_OL_REFCOUNT_BLOCK | \ + QCOW2_OL_INACTIVE_L1) + +/* Perform all overlap checks */ +#define QCOW2_OL_ALL \ + (QCOW2_OL_CACHED | QCOW2_OL_INACTIVE_L2) + +#define L1E_OFFSET_MASK 0x00fffffffffffe00ULL +#define L2E_OFFSET_MASK 0x00fffffffffffe00ULL #define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL -#define REFT_OFFSET_MASK 0xffffffffffffff00ULL +#define REFT_OFFSET_MASK 0xfffffffffffffe00ULL static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset) { @@ -324,6 +417,16 @@ static inline int64_t align_offset(int64_t offset, int n) return offset; } +static inline int64_t qcow2_vm_state_offset(BDRVQcowState *s) +{ + return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); +} + +static inline uint64_t qcow2_max_refcount_clusters(BDRVQcowState *s) +{ + return QCOW_MAX_REFTABLE_SIZE >> s->cluster_bits; +} + static inline int qcow2_get_cluster_type(uint64_t l2_entry) { if (l2_entry & QCOW_OFLAG_COMPRESSED) { @@ -361,13 +464,18 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, int64_t sector_num, int nb_sectors); int qcow2_mark_dirty(BlockDriverState *bs); +int qcow2_mark_corrupt(BlockDriverState *bs); +int qcow2_mark_consistent(BlockDriverState *bs); int qcow2_update_header(BlockDriverState *bs); /* qcow2-refcount.c functions */ int qcow2_refcount_init(BlockDriverState *bs); void qcow2_refcount_close(BlockDriverState *bs); -int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size); +int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index, + int addend, enum qcow2_discard_type type); + +int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size); int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, int nb_clusters); int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size); @@ -385,9 +493,15 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, void qcow2_process_discards(BlockDriverState *bs, int ret); +int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, + int64_t size); +int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, + int64_t size); + /* qcow2-cluster.c functions */ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, bool exact_size); +int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index); void qcow2_l2_cache_reset(BlockDriverState *bs); int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); void 
qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, @@ -398,22 +512,30 @@ void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, int *num, uint64_t *cluster_offset); int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, - int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m); + int *num, uint64_t *host_offset, QCowL2Meta **m); uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset, int compressed_size); int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m); int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, - int nb_sectors); + int nb_sectors, enum qcow2_discard_type type); int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors); +int qcow2_expand_zero_clusters(BlockDriverState *bs); + /* qcow2-snapshot.c functions */ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info); int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id); -int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id); +int qcow2_snapshot_delete(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp); int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab); -int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name); +int qcow2_snapshot_load_tmp(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp); void qcow2_free_snapshots(BlockDriverState *bs); int qcow2_read_snapshots(BlockDriverState *bs); @@ -428,6 +550,8 @@ int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, Qcow2Cache *dependency); void qcow2_cache_depends_on_flush(Qcow2Cache *c); +int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c); + int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, void **table); int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, diff --git a/block/qed.c b/block/qed.c index f767b0528..3bd9db9c8 100644 --- a/block/qed.c +++ b/block/qed.c @@ -353,10 +353,10 @@ static void qed_start_need_check_timer(BDRVQEDState *s) { trace_qed_start_need_check_timer(s); - /* Use vm_clock so we don't alter the image file while suspended for + /* Use QEMU_CLOCK_VIRTUAL so we don't alter the image file while suspended for * migration. 
*/ - qemu_mod_timer(s->need_check_timer, qemu_get_clock_ns(vm_clock) + + timer_mod(s->need_check_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + get_ticks_per_sec() * QED_NEED_CHECK_TIMEOUT); } @@ -364,7 +364,7 @@ static void qed_start_need_check_timer(BDRVQEDState *s) static void qed_cancel_need_check_timer(BDRVQEDState *s) { trace_qed_cancel_need_check_timer(s); - qemu_del_timer(s->need_check_timer); + timer_del(s->need_check_timer); } static void bdrv_qed_rebind(BlockDriverState *bs) @@ -373,7 +373,8 @@ static void bdrv_qed_rebind(BlockDriverState *bs) s->bs = bs; } -static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags) +static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVQEDState *s = bs->opaque; QEDHeader le_header; @@ -390,14 +391,15 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags) qed_header_le_to_cpu(&le_header, &s->header); if (s->header.magic != QED_MAGIC) { - return -EMEDIUMTYPE; + error_setg(errp, "Image not in QED format"); + return -EINVAL; } if (s->header.features & ~QED_FEATURE_MASK) { /* image uses unsupported feature bits */ char buf[64]; snprintf(buf, sizeof(buf), "%" PRIx64, s->header.features & ~QED_FEATURE_MASK); - qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, bs->device_name, "QED", buf); return -ENOTSUP; } @@ -494,7 +496,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags) } } - s->need_check_timer = qemu_new_timer_ns(vm_clock, + s->need_check_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, qed_need_check_timer_cb, s); out: @@ -505,6 +507,15 @@ out: return ret; } +static int bdrv_qed_refresh_limits(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + + bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS; + + return 0; +} + /* We have nothing to do for QED reopen, stubs just return * success */ static int bdrv_qed_reopen_prepare(BDRVReopenState *state, @@ -518,7 +529,7 @@ static void bdrv_qed_close(BlockDriverState *bs) BDRVQEDState *s = bs->opaque; qed_cancel_need_check_timer(s); - qemu_free_timer(s->need_check_timer); + timer_free(s->need_check_timer); /* Ensure writes reach stable storage */ bdrv_flush(bs->file); @@ -535,7 +546,8 @@ static void bdrv_qed_close(BlockDriverState *bs) static int qed_create(const char *filename, uint32_t cluster_size, uint64_t image_size, uint32_t table_size, - const char *backing_file, const char *backing_fmt) + const char *backing_file, const char *backing_fmt, + Error **errp) { QEDHeader header = { .magic = QED_MAGIC, @@ -550,16 +562,22 @@ static int qed_create(const char *filename, uint32_t cluster_size, QEDHeader le_header; uint8_t *l1_table = NULL; size_t l1_size = header.cluster_size * header.table_size; + Error *local_err = NULL; int ret = 0; - BlockDriverState *bs = NULL; + BlockDriverState *bs; - ret = bdrv_create_file(filename, NULL); + ret = bdrv_create_file(filename, NULL, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } - ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB); + bs = NULL; + ret = bdrv_open(&bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, NULL, + &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } @@ -599,11 +617,12 @@ static int qed_create(const char *filename, uint32_t cluster_size, ret = 0; /* success */ out: g_free(l1_table); - bdrv_delete(bs); + bdrv_unref(bs); return ret; } -static int bdrv_qed_create(const 
char *filename, QEMUOptionParameter *options) +static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { uint64_t image_size = 0; uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE; @@ -648,54 +667,70 @@ static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options) } return qed_create(filename, cluster_size, image_size, table_size, - backing_file, backing_fmt); + backing_file, backing_fmt, errp); } typedef struct { + BlockDriverState *bs; Coroutine *co; - int is_allocated; + uint64_t pos; + int64_t status; int *pnum; } QEDIsAllocatedCB; static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len) { QEDIsAllocatedCB *cb = opaque; + BDRVQEDState *s = cb->bs->opaque; *cb->pnum = len / BDRV_SECTOR_SIZE; - cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO); + switch (ret) { + case QED_CLUSTER_FOUND: + offset |= qed_offset_into_cluster(s, cb->pos); + cb->status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; + break; + case QED_CLUSTER_ZERO: + cb->status = BDRV_BLOCK_ZERO; + break; + case QED_CLUSTER_L2: + case QED_CLUSTER_L1: + cb->status = 0; + break; + default: + assert(ret < 0); + cb->status = ret; + break; + } + if (cb->co) { qemu_coroutine_enter(cb->co, NULL); } } -static int coroutine_fn bdrv_qed_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { BDRVQEDState *s = bs->opaque; - uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE; QEDIsAllocatedCB cb = { - .is_allocated = -1, + .bs = bs, + .pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE, + .status = BDRV_BLOCK_OFFSET_MASK, .pnum = pnum, }; QEDRequest request = { .l2_table = NULL }; - qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb); + qed_find_cluster(s, &request, cb.pos, len, qed_is_allocated_cb, &cb); /* Now sleep if the callback wasn't invoked immediately */ - while (cb.is_allocated == -1) { + while (cb.status == BDRV_BLOCK_OFFSET_MASK) { cb.co = qemu_coroutine_self(); qemu_coroutine_yield(); } qed_unref_l2_cache_entry(request.l2_table); - return cb.is_allocated; -} - -static int bdrv_qed_make_empty(BlockDriverState *bs) -{ - return -ENOTSUP; + return cb.status; } static BDRVQEDState *acb_to_s(QEDAIOCB *acb) @@ -1368,7 +1403,8 @@ static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret) static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) + int nb_sectors, + BdrvRequestFlags flags) { BlockDriverAIOCB *blockacb; BDRVQEDState *s = bs->opaque; @@ -1445,6 +1481,8 @@ static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) memset(bdi, 0, sizeof(*bdi)); bdi->cluster_size = s->header.cluster_size; bdi->is_dirty = s->header.features & QED_F_NEED_CHECK; + bdi->unallocated_blocks_are_zero = true; + bdi->can_write_zeroes_with_unmap = true; return 0; } @@ -1520,13 +1558,31 @@ static int bdrv_qed_change_backing_file(BlockDriverState *bs, return ret; } -static void bdrv_qed_invalidate_cache(BlockDriverState *bs) +static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp) { BDRVQEDState *s = bs->opaque; + Error *local_err = NULL; + int ret; bdrv_qed_close(bs); + + bdrv_invalidate_cache(bs->file, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + memset(s, 0, sizeof(BDRVQEDState)); - bdrv_qed_open(bs, NULL, bs->open_flags); + ret = 
bdrv_qed_open(bs, NULL, bs->open_flags, &local_err); + if (local_err) { + error_setg(errp, "Could not reopen qed layer: %s", + error_get_pretty(local_err)); + error_free(local_err); + return; + } else if (ret < 0) { + error_setg_errno(errp, -ret, "Could not reopen qed layer"); + return; + } } static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result, @@ -1575,14 +1631,14 @@ static BlockDriver bdrv_qed = { .bdrv_reopen_prepare = bdrv_qed_reopen_prepare, .bdrv_create = bdrv_qed_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_is_allocated = bdrv_qed_co_is_allocated, - .bdrv_make_empty = bdrv_qed_make_empty, + .bdrv_co_get_block_status = bdrv_qed_co_get_block_status, .bdrv_aio_readv = bdrv_qed_aio_readv, .bdrv_aio_writev = bdrv_qed_aio_writev, .bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes, .bdrv_truncate = bdrv_qed_truncate, .bdrv_getlength = bdrv_qed_getlength, .bdrv_get_info = bdrv_qed_get_info, + .bdrv_refresh_limits = bdrv_qed_refresh_limits, .bdrv_change_backing_file = bdrv_qed_change_backing_file, .bdrv_invalidate_cache = bdrv_qed_invalidate_cache, .bdrv_check = bdrv_qed_check, diff --git a/block/qed.h b/block/qed.h index 2b4ddedf3..5d65bea07 100644 --- a/block/qed.h +++ b/block/qed.h @@ -100,7 +100,7 @@ typedef struct { /* if (features & QED_F_BACKING_FILE) */ uint32_t backing_filename_offset; /* in bytes from start of header */ uint32_t backing_filename_size; /* in bytes */ -} QEDHeader; +} QEMU_PACKED QEDHeader; typedef struct { uint64_t offsets[0]; /* in bytes */ diff --git a/block/quorum.c b/block/quorum.c new file mode 100644 index 000000000..7f580a83b --- /dev/null +++ b/block/quorum.c @@ -0,0 +1,877 @@ +/* + * Quorum Block filter + * + * Copyright (C) 2012-2014 Nodalink, EURL. + * + * Author: + * Benoît Canet <benoit.canet@irqsave.net> + * + * Based on the design and code of blkverify.c (Copyright (C) 2010 IBM, Corp) + * and blkmirror.c (Copyright (C) 2011 Red Hat, Inc). + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include <gnutls/gnutls.h> +#include <gnutls/crypto.h> +#include "block/block_int.h" +#include "qapi/qmp/qjson.h" + +#define HASH_LENGTH 32 + +#define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold" +#define QUORUM_OPT_BLKVERIFY "blkverify" + +/* This union holds a vote hash value */ +typedef union QuorumVoteValue { + char h[HASH_LENGTH]; /* SHA-256 hash */ + int64_t l; /* simpler 64 bits hash */ +} QuorumVoteValue; + +/* A vote item */ +typedef struct QuorumVoteItem { + int index; + QLIST_ENTRY(QuorumVoteItem) next; +} QuorumVoteItem; + +/* this structure is a vote version. A version is the set of votes sharing the + * same vote value. + * The set of votes will be tracked with the items field and its cardinality is + * vote_count. + */ +typedef struct QuorumVoteVersion { + QuorumVoteValue value; + int index; + int vote_count; + QLIST_HEAD(, QuorumVoteItem) items; + QLIST_ENTRY(QuorumVoteVersion) next; +} QuorumVoteVersion; + +/* this structure holds a group of vote versions together */ +typedef struct QuorumVotes { + QLIST_HEAD(, QuorumVoteVersion) vote_list; + bool (*compare)(QuorumVoteValue *a, QuorumVoteValue *b); +} QuorumVotes; + +/* the following structure holds the state of one quorum instance */ +typedef struct BDRVQuorumState { + BlockDriverState **bs; /* children BlockDriverStates */ + int num_children; /* children count */ + int threshold; /* if less than threshold children reads gave the + * same result a quorum error occurs. 
+ */ + bool is_blkverify; /* true if the driver is in blkverify mode + * Writes are mirrored on two children devices. + * On reads the two children devices' contents are + * compared and if a difference is spotted its + * location is printed and the code aborts. + * It is useful to debug other block drivers by + * comparing them with a reference one. + */ +} BDRVQuorumState; + +typedef struct QuorumAIOCB QuorumAIOCB; + +/* Quorum will create one instance of the following structure per operation it + * performs on its children. + * So for each read/write operation coming from the upper layer there will be + * $children_count QuorumChildRequest. + */ +typedef struct QuorumChildRequest { + BlockDriverAIOCB *aiocb; + QEMUIOVector qiov; + uint8_t *buf; + int ret; + QuorumAIOCB *parent; +} QuorumChildRequest; + +/* Quorum will use the following structure to track progress of each read/write + * operation received by the upper layer. + * This structure hold pointers to the QuorumChildRequest structures instances + * used to do operations on each children and track overall progress. + */ +struct QuorumAIOCB { + BlockDriverAIOCB common; + + /* Request metadata */ + uint64_t sector_num; + int nb_sectors; + + QEMUIOVector *qiov; /* calling IOV */ + + QuorumChildRequest *qcrs; /* individual child requests */ + int count; /* number of completed AIOCB */ + int success_count; /* number of successfully completed AIOCB */ + + QuorumVotes votes; + + bool is_read; + int vote_ret; +}; + +static void quorum_vote(QuorumAIOCB *acb); + +static void quorum_aio_cancel(BlockDriverAIOCB *blockacb) +{ + QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common); + BDRVQuorumState *s = acb->common.bs->opaque; + int i; + + /* cancel all callbacks */ + for (i = 0; i < s->num_children; i++) { + bdrv_aio_cancel(acb->qcrs[i].aiocb); + } + + g_free(acb->qcrs); + qemu_aio_release(acb); +} + +static AIOCBInfo quorum_aiocb_info = { + .aiocb_size = sizeof(QuorumAIOCB), + .cancel = quorum_aio_cancel, +}; + +static void quorum_aio_finalize(QuorumAIOCB *acb) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + int i, ret = 0; + + if (acb->vote_ret) { + ret = acb->vote_ret; + } + + acb->common.cb(acb->common.opaque, ret); + + if (acb->is_read) { + for (i = 0; i < s->num_children; i++) { + qemu_vfree(acb->qcrs[i].buf); + qemu_iovec_destroy(&acb->qcrs[i].qiov); + } + } + + g_free(acb->qcrs); + qemu_aio_release(acb); +} + +static bool quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b) +{ + return !memcmp(a->h, b->h, HASH_LENGTH); +} + +static bool quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b) +{ + return a->l == b->l; +} + +static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, + BlockDriverState *bs, + QEMUIOVector *qiov, + uint64_t sector_num, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + QuorumAIOCB *acb = qemu_aio_get(&quorum_aiocb_info, bs, cb, opaque); + int i; + + acb->common.bs->opaque = s; + acb->sector_num = sector_num; + acb->nb_sectors = nb_sectors; + acb->qiov = qiov; + acb->qcrs = g_new0(QuorumChildRequest, s->num_children); + acb->count = 0; + acb->success_count = 0; + acb->votes.compare = quorum_sha256_compare; + QLIST_INIT(&acb->votes.vote_list); + acb->is_read = false; + acb->vote_ret = 0; + + for (i = 0; i < s->num_children; i++) { + acb->qcrs[i].buf = NULL; + acb->qcrs[i].ret = 0; + acb->qcrs[i].parent = acb; + } + + return acb; +} + +static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret) +{ + QObject *data; + assert(node_name); + data = 
qobject_from_jsonf("{ 'node-name': %s" + ", 'sector-num': %" PRId64 + ", 'sectors-count': %d }", + node_name, acb->sector_num, acb->nb_sectors); + if (ret < 0) { + QDict *dict = qobject_to_qdict(data); + qdict_put(dict, "error", qstring_from_str(strerror(-ret))); + } + monitor_protocol_event(QEVENT_QUORUM_REPORT_BAD, data); + qobject_decref(data); +} + +static void quorum_report_failure(QuorumAIOCB *acb) +{ + QObject *data; + const char *reference = acb->common.bs->device_name[0] ? + acb->common.bs->device_name : + acb->common.bs->node_name; + data = qobject_from_jsonf("{ 'reference': %s" + ", 'sector-num': %" PRId64 + ", 'sectors-count': %d }", + reference, acb->sector_num, acb->nb_sectors); + monitor_protocol_event(QEVENT_QUORUM_FAILURE, data); + qobject_decref(data); +} + +static int quorum_vote_error(QuorumAIOCB *acb); + +static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + + if (acb->success_count < s->threshold) { + acb->vote_ret = quorum_vote_error(acb); + quorum_report_failure(acb); + return true; + } + + return false; +} + +static void quorum_aio_cb(void *opaque, int ret) +{ + QuorumChildRequest *sacb = opaque; + QuorumAIOCB *acb = sacb->parent; + BDRVQuorumState *s = acb->common.bs->opaque; + + sacb->ret = ret; + acb->count++; + if (ret == 0) { + acb->success_count++; + } else { + quorum_report_bad(acb, sacb->aiocb->bs->node_name, ret); + } + assert(acb->count <= s->num_children); + assert(acb->success_count <= s->num_children); + if (acb->count < s->num_children) { + return; + } + + /* Do the vote on read */ + if (acb->is_read) { + quorum_vote(acb); + } else { + quorum_has_too_much_io_failed(acb); + } + + quorum_aio_finalize(acb); +} + +static void quorum_report_bad_versions(BDRVQuorumState *s, + QuorumAIOCB *acb, + QuorumVoteValue *value) +{ + QuorumVoteVersion *version; + QuorumVoteItem *item; + + QLIST_FOREACH(version, &acb->votes.vote_list, next) { + if (acb->votes.compare(&version->value, value)) { + continue; + } + QLIST_FOREACH(item, &version->items, next) { + quorum_report_bad(acb, s->bs[item->index]->node_name, 0); + } + } +} + +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) +{ + int i; + assert(dest->niov == source->niov); + assert(dest->size == source->size); + for (i = 0; i < source->niov; i++) { + assert(dest->iov[i].iov_len == source->iov[i].iov_len); + memcpy(dest->iov[i].iov_base, + source->iov[i].iov_base, + source->iov[i].iov_len); + } +} + +static void quorum_count_vote(QuorumVotes *votes, + QuorumVoteValue *value, + int index) +{ + QuorumVoteVersion *v = NULL, *version = NULL; + QuorumVoteItem *item; + + /* look if we have something with this hash */ + QLIST_FOREACH(v, &votes->vote_list, next) { + if (votes->compare(&v->value, value)) { + version = v; + break; + } + } + + /* It's a version not yet in the list add it */ + if (!version) { + version = g_new0(QuorumVoteVersion, 1); + QLIST_INIT(&version->items); + memcpy(&version->value, value, sizeof(version->value)); + version->index = index; + version->vote_count = 0; + QLIST_INSERT_HEAD(&votes->vote_list, version, next); + } + + version->vote_count++; + + item = g_new0(QuorumVoteItem, 1); + item->index = index; + QLIST_INSERT_HEAD(&version->items, item, next); +} + +static void quorum_free_vote_list(QuorumVotes *votes) +{ + QuorumVoteVersion *version, *next_version; + QuorumVoteItem *item, *next_item; + + QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version) { + QLIST_REMOVE(version, next); + 
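For orientation, the vote bookkeeping in quorum_count_vote() and quorum_free_vote_list() above groups identical results into "versions", counts how many children produced each one, and later accepts the most popular version only if its count reaches the configured threshold. Below is a minimal, self-contained sketch of that counting idea, using plain arrays and integer values instead of QEMU's QLIST and SHA-256 hashes; everything in it is illustrative, not QEMU code.

#include <stdio.h>

#define MAX_CHILDREN 8

static int vote_winner(const int *results, int n, int threshold, int *winner)
{
    int values[MAX_CHILDREN];   /* distinct result values ("versions") */
    int counts[MAX_CHILDREN];   /* how many children returned each value */
    int nversions = 0;

    for (int i = 0; i < n; i++) {
        int v;
        for (v = 0; v < nversions; v++) {
            if (values[v] == results[i]) {
                counts[v]++;            /* existing version: one more vote */
                break;
            }
        }
        if (v == nversions) {           /* new version: add it to the list */
            values[nversions] = results[i];
            counts[nversions] = 1;
            nversions++;
        }
    }

    if (nversions == 0) {
        return -1;
    }

    int best = 0;
    for (int v = 1; v < nversions; v++) {
        if (counts[v] > counts[best]) {
            best = v;                   /* most represented version so far */
        }
    }

    if (counts[best] < threshold) {
        return -1;                      /* no quorum reached */
    }
    *winner = values[best];
    return 0;
}

int main(void)
{
    int results[] = { 42, 42, 7 };      /* two children agree, one differs */
    int winner;

    if (vote_winner(results, 3, 2, &winner) == 0) {
        printf("quorum reached, winning value: %d\n", winner);
    } else {
        printf("quorum failure\n");
    }
    return 0;
}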
QLIST_FOREACH_SAFE(item, &version->items, next, next_item) { + QLIST_REMOVE(item, next); + g_free(item); + } + g_free(version); + } +} + +static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue *hash) +{ + int j, ret; + gnutls_hash_hd_t dig; + QEMUIOVector *qiov = &acb->qcrs[i].qiov; + + ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256); + + if (ret < 0) { + return ret; + } + + for (j = 0; j < qiov->niov; j++) { + ret = gnutls_hash(dig, qiov->iov[j].iov_base, qiov->iov[j].iov_len); + if (ret < 0) { + break; + } + } + + gnutls_hash_deinit(dig, (void *) hash); + return ret; +} + +static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes) +{ + int max = 0; + QuorumVoteVersion *candidate, *winner = NULL; + + QLIST_FOREACH(candidate, &votes->vote_list, next) { + if (candidate->vote_count > max) { + max = candidate->vote_count; + winner = candidate; + } + } + + return winner; +} + +/* qemu_iovec_compare is handy for blkverify mode because it returns the first + * differing byte location. Yet it is handcoded to compare vectors one byte + * after another so it does not benefit from the libc SIMD optimizations. + * quorum_iovec_compare is written for speed and should be used in the non + * blkverify mode of quorum. + */ +static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) +{ + int i; + int result; + + assert(a->niov == b->niov); + for (i = 0; i < a->niov; i++) { + assert(a->iov[i].iov_len == b->iov[i].iov_len); + result = memcmp(a->iov[i].iov_base, + b->iov[i].iov_base, + a->iov[i].iov_len); + if (result) { + return false; + } + } + + return true; +} + +static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb, + const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ", + acb->sector_num, acb->nb_sectors); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + va_end(ap); + exit(1); +} + +static bool quorum_compare(QuorumAIOCB *acb, + QEMUIOVector *a, + QEMUIOVector *b) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + ssize_t offset; + + /* This driver will replace blkverify in this particular case */ + if (s->is_blkverify) { + offset = qemu_iovec_compare(a, b); + if (offset != -1) { + quorum_err(acb, "contents mismatch in sector %" PRId64, + acb->sector_num + + (uint64_t)(offset / BDRV_SECTOR_SIZE)); + } + return true; + } + + return quorum_iovec_compare(a, b); +} + +/* Do a vote to get the error code */ +static int quorum_vote_error(QuorumAIOCB *acb) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + QuorumVoteVersion *winner = NULL; + QuorumVotes error_votes; + QuorumVoteValue result_value; + int i, ret = 0; + bool error = false; + + QLIST_INIT(&error_votes.vote_list); + error_votes.compare = quorum_64bits_compare; + + for (i = 0; i < s->num_children; i++) { + ret = acb->qcrs[i].ret; + if (ret) { + error = true; + result_value.l = ret; + quorum_count_vote(&error_votes, &result_value, i); + } + } + + if (error) { + winner = quorum_get_vote_winner(&error_votes); + ret = winner->value.l; + } + + quorum_free_vote_list(&error_votes); + + return ret; +} + +static void quorum_vote(QuorumAIOCB *acb) +{ + bool quorum = true; + int i, j, ret; + QuorumVoteValue hash; + BDRVQuorumState *s = acb->common.bs->opaque; + QuorumVoteVersion *winner; + + if (quorum_has_too_much_io_failed(acb)) { + return; + } + + /* get the index of the first successful read */ + for (i = 0; i < s->num_children; i++) { + if (!acb->qcrs[i].ret) { + break; + } + } + + assert(i < s->num_children); + + /* compare this 
read with all other successful reads stopping at quorum + * failure + */ + for (j = i + 1; j < s->num_children; j++) { + if (acb->qcrs[j].ret) { + continue; + } + quorum = quorum_compare(acb, &acb->qcrs[i].qiov, &acb->qcrs[j].qiov); + if (!quorum) { + break; + } + } + + /* Every successful read agrees */ + if (quorum) { + quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov); + return; + } + + /* compute hashes for each successful read, also store indexes */ + for (i = 0; i < s->num_children; i++) { + if (acb->qcrs[i].ret) { + continue; + } + ret = quorum_compute_hash(acb, i, &hash); + /* if ever the hash computation failed */ + if (ret < 0) { + acb->vote_ret = ret; + goto free_exit; + } + quorum_count_vote(&acb->votes, &hash, i); + } + + /* vote to select the most represented version */ + winner = quorum_get_vote_winner(&acb->votes); + + /* if the winner count is smaller than threshold the read fails */ + if (winner->vote_count < s->threshold) { + quorum_report_failure(acb); + acb->vote_ret = -EIO; + goto free_exit; + } + + /* we have a winner: copy it */ + quorum_copy_qiov(acb->qiov, &acb->qcrs[winner->index].qiov); + + /* some versions are bad print them */ + quorum_report_bad_versions(s, acb, &winner->value); + +free_exit: + /* free lists */ + quorum_free_vote_list(&acb->votes); +} + +static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + BDRVQuorumState *s = bs->opaque; + QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, + nb_sectors, cb, opaque); + int i; + + acb->is_read = true; + + for (i = 0; i < s->num_children; i++) { + acb->qcrs[i].buf = qemu_blockalign(s->bs[i], qiov->size); + qemu_iovec_init(&acb->qcrs[i].qiov, qiov->niov); + qemu_iovec_clone(&acb->qcrs[i].qiov, qiov, acb->qcrs[i].buf); + } + + for (i = 0; i < s->num_children; i++) { + bdrv_aio_readv(s->bs[i], sector_num, &acb->qcrs[i].qiov, nb_sectors, + quorum_aio_cb, &acb->qcrs[i]); + } + + return &acb->common; +} + +static BlockDriverAIOCB *quorum_aio_writev(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + BDRVQuorumState *s = bs->opaque; + QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, nb_sectors, + cb, opaque); + int i; + + for (i = 0; i < s->num_children; i++) { + acb->qcrs[i].aiocb = bdrv_aio_writev(s->bs[i], sector_num, qiov, + nb_sectors, &quorum_aio_cb, + &acb->qcrs[i]); + } + + return &acb->common; +} + +static int64_t quorum_getlength(BlockDriverState *bs) +{ + BDRVQuorumState *s = bs->opaque; + int64_t result; + int i; + + /* check that all file have the same length */ + result = bdrv_getlength(s->bs[0]); + if (result < 0) { + return result; + } + for (i = 1; i < s->num_children; i++) { + int64_t value = bdrv_getlength(s->bs[i]); + if (value < 0) { + return value; + } + if (value != result) { + return -EIO; + } + } + + return result; +} + +static void quorum_invalidate_cache(BlockDriverState *bs, Error **errp) +{ + BDRVQuorumState *s = bs->opaque; + Error *local_err = NULL; + int i; + + for (i = 0; i < s->num_children; i++) { + bdrv_invalidate_cache(s->bs[i], &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + } +} + +static coroutine_fn int quorum_co_flush(BlockDriverState *bs) +{ + BDRVQuorumState *s = bs->opaque; + QuorumVoteVersion *winner = NULL; + QuorumVotes error_votes; + QuorumVoteValue result_value; + int i; + int result = 0; + + 
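As an aside, quorum_compute_hash() earlier drives gnutls' incremental hash interface over each element of a child's I/O vector. The following standalone sketch shows that gnutls pattern in isolation, assuming only the public gnutls_hash_init()/gnutls_hash()/gnutls_hash_deinit() API (build with -lgnutls); it is an editorial example, not QEMU code.

#include <stdio.h>
#include <string.h>
#include <gnutls/gnutls.h>
#include <gnutls/crypto.h>

int main(void)
{
    const char *fragments[] = { "hello, ", "world" };   /* stand-in iovec */
    unsigned char digest[32];                           /* SHA-256 size */
    gnutls_hash_hd_t dig;
    int ret;

    ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256);
    if (ret < 0) {
        return 1;
    }

    /* One gnutls_hash() call per vector element, as in the driver code */
    for (int i = 0; i < 2; i++) {
        ret = gnutls_hash(dig, fragments[i], strlen(fragments[i]));
        if (ret < 0) {
            break;
        }
    }

    /* Deinit collects the digest and frees the handle */
    gnutls_hash_deinit(dig, digest);

    for (int i = 0; i < 32; i++) {
        printf("%02x", digest[i]);
    }
    printf("\n");
    return ret < 0 ? 1 : 0;
}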
QLIST_INIT(&error_votes.vote_list); + error_votes.compare = quorum_64bits_compare; + + for (i = 0; i < s->num_children; i++) { + result = bdrv_co_flush(s->bs[i]); + result_value.l = result; + quorum_count_vote(&error_votes, &result_value, i); + } + + winner = quorum_get_vote_winner(&error_votes); + result = winner->value.l; + + quorum_free_vote_list(&error_votes); + + return result; +} + +static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate) +{ + BDRVQuorumState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_children; i++) { + bool perm = bdrv_recurse_is_first_non_filter(s->bs[i], + candidate); + if (perm) { + return true; + } + } + + return false; +} + +static int quorum_valid_threshold(int threshold, int num_children, Error **errp) +{ + + if (threshold < 1) { + error_set(errp, QERR_INVALID_PARAMETER_VALUE, + "vote-threshold", "value >= 1"); + return -ERANGE; + } + + if (threshold > num_children) { + error_setg(errp, "threshold may not exceed children count"); + return -ERANGE; + } + + return 0; +} + +static QemuOptsList quorum_runtime_opts = { + .name = "quorum", + .head = QTAILQ_HEAD_INITIALIZER(quorum_runtime_opts.head), + .desc = { + { + .name = QUORUM_OPT_VOTE_THRESHOLD, + .type = QEMU_OPT_NUMBER, + .help = "The number of vote needed for reaching quorum", + }, + { + .name = QUORUM_OPT_BLKVERIFY, + .type = QEMU_OPT_BOOL, + .help = "Trigger block verify mode if set", + }, + { /* end of list */ } + }, +}; + +static int quorum_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVQuorumState *s = bs->opaque; + Error *local_err = NULL; + QemuOpts *opts; + bool *opened; + QDict *sub = NULL; + QList *list = NULL; + const QListEntry *lentry; + int i; + int ret = 0; + + qdict_flatten(options); + qdict_extract_subqdict(options, &sub, "children."); + qdict_array_split(sub, &list); + + if (qdict_size(sub)) { + error_setg(&local_err, "Invalid option children.%s", + qdict_first(sub)->key); + ret = -EINVAL; + goto exit; + } + + /* count how many different children are present */ + s->num_children = qlist_size(list); + if (s->num_children < 2) { + error_setg(&local_err, + "Number of provided children must be greater than 1"); + ret = -EINVAL; + goto exit; + } + + opts = qemu_opts_create(&quorum_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + ret = -EINVAL; + goto exit; + } + + s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0); + + /* and validate it against s->num_children */ + ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err); + if (ret < 0) { + goto exit; + } + + /* is the driver in blkverify mode */ + if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) && + s->num_children == 2 && s->threshold == 2) { + s->is_blkverify = true; + } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) { + fprintf(stderr, "blkverify mode is set by setting blkverify=on " + "and using two files with vote_threshold=2\n"); + } + + /* allocate the children BlockDriverState array */ + s->bs = g_new0(BlockDriverState *, s->num_children); + opened = g_new0(bool, s->num_children); + + for (i = 0, lentry = qlist_first(list); lentry; + lentry = qlist_next(lentry), i++) { + QDict *d; + QString *string; + + switch (qobject_type(lentry->value)) + { + /* List of options */ + case QTYPE_QDICT: + d = qobject_to_qdict(lentry->value); + QINCREF(d); + ret = bdrv_open(&s->bs[i], NULL, NULL, d, flags, NULL, + &local_err); + 
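Further down, quorum_open() relies on qdict_flatten(), qdict_extract_subqdict() and qdict_array_split() to turn flat "children.<n>.<option>" keys into one option dictionary per child. A self-contained sketch of that key-splitting idea follows, with a hypothetical child_index() helper standing in for the QDict machinery (illustrative only; not the QEMU API):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Return the child index encoded in a "children.<n>.<rest>" key, or -1 if
 * the key is not a child option; *rest points at the per-child option name */
static int child_index(const char *key, const char **rest)
{
    char *end;
    long n;

    if (strncmp(key, "children.", 9) != 0) {
        return -1;
    }
    n = strtol(key + 9, &end, 10);
    if (end == key + 9 || *end != '.') {
        return -1;                       /* no numeric index after the dot */
    }
    *rest = end + 1;
    return (int)n;
}

int main(void)
{
    const char *keys[] = {
        "children.0.file.filename",
        "children.1.file.filename",
        "vote-threshold",
    };

    for (int i = 0; i < 3; i++) {
        const char *rest;
        int n = child_index(keys[i], &rest);
        if (n >= 0) {
            printf("child %d option: %s\n", n, rest);
        } else {
            printf("top-level option: %s\n", keys[i]);
        }
    }
    return 0;
}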
break; + + /* QMP reference */ + case QTYPE_QSTRING: + string = qobject_to_qstring(lentry->value); + ret = bdrv_open(&s->bs[i], NULL, qstring_get_str(string), NULL, + flags, NULL, &local_err); + break; + + default: + error_setg(&local_err, "Specification of child block device %i " + "is invalid", i); + ret = -EINVAL; + } + + if (ret < 0) { + goto close_exit; + } + opened[i] = true; + } + + g_free(opened); + goto exit; + +close_exit: + /* cleanup on error */ + for (i = 0; i < s->num_children; i++) { + if (!opened[i]) { + continue; + } + bdrv_unref(s->bs[i]); + } + g_free(s->bs); + g_free(opened); +exit: + /* propagate error */ + if (error_is_set(&local_err)) { + error_propagate(errp, local_err); + } + QDECREF(list); + QDECREF(sub); + return ret; +} + +static void quorum_close(BlockDriverState *bs) +{ + BDRVQuorumState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_children; i++) { + bdrv_unref(s->bs[i]); + } + + g_free(s->bs); +} + +static BlockDriver bdrv_quorum = { + .format_name = "quorum", + .protocol_name = "quorum", + + .instance_size = sizeof(BDRVQuorumState), + + .bdrv_file_open = quorum_open, + .bdrv_close = quorum_close, + + .bdrv_co_flush_to_disk = quorum_co_flush, + + .bdrv_getlength = quorum_getlength, + + .bdrv_aio_readv = quorum_aio_readv, + .bdrv_aio_writev = quorum_aio_writev, + .bdrv_invalidate_cache = quorum_invalidate_cache, + + .is_filter = true, + .bdrv_recurse_is_first_non_filter = quorum_recurse_is_first_non_filter, +}; + +static void bdrv_quorum_init(void) +{ + bdrv_register(&bdrv_quorum); +} + +block_init(bdrv_quorum_init); diff --git a/block/raw-aio.h b/block/raw-aio.h index c61f1595d..7ad0a8a0a 100644 --- a/block/raw-aio.h +++ b/block/raw-aio.h @@ -21,9 +21,10 @@ #define QEMU_AIO_IOCTL 0x0004 #define QEMU_AIO_FLUSH 0x0008 #define QEMU_AIO_DISCARD 0x0010 +#define QEMU_AIO_WRITE_ZEROES 0x0020 #define QEMU_AIO_TYPE_MASK \ (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \ - QEMU_AIO_DISCARD) + QEMU_AIO_DISCARD|QEMU_AIO_WRITE_ZEROES) /* AIO flags */ #define QEMU_AIO_MISALIGNED 0x1000 diff --git a/block/raw-posix.c b/block/raw-posix.c index ba721d3f5..1688e16c6 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -127,6 +127,8 @@ typedef struct BDRVRawState { int fd; int type; int open_flags; + size_t buf_align; + #if defined(__linux__) /* linux floppy specific */ int64_t fd_open_time; @@ -139,9 +141,11 @@ typedef struct BDRVRawState { void *aio_ctx; #endif #ifdef CONFIG_XFS - bool is_xfs : 1; + bool is_xfs:1; #endif - bool has_discard : 1; + bool has_discard:1; + bool has_write_zeroes:1; + bool discard_zeroes:1; } BDRVRawState; typedef struct BDRVRawReopenState { @@ -211,6 +215,76 @@ static int raw_normalize_devicepath(const char **filename) } #endif +static void raw_probe_alignment(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + char *buf; + unsigned int sector_size; + + /* For /dev/sg devices the alignment is not really used. + With buffered I/O, we don't have any restrictions. 
*/ + if (bs->sg || !(s->open_flags & O_DIRECT)) { + bs->request_alignment = 1; + s->buf_align = 1; + return; + } + + /* Try a few ioctls to get the right size */ + bs->request_alignment = 0; + s->buf_align = 0; + +#ifdef BLKSSZGET + if (ioctl(s->fd, BLKSSZGET, &sector_size) >= 0) { + bs->request_alignment = sector_size; + } +#endif +#ifdef DKIOCGETBLOCKSIZE + if (ioctl(s->fd, DKIOCGETBLOCKSIZE, &sector_size) >= 0) { + bs->request_alignment = sector_size; + } +#endif +#ifdef DIOCGSECTORSIZE + if (ioctl(s->fd, DIOCGSECTORSIZE, &sector_size) >= 0) { + bs->request_alignment = sector_size; + } +#endif +#ifdef CONFIG_XFS + if (s->is_xfs) { + struct dioattr da; + if (xfsctl(NULL, s->fd, XFS_IOC_DIOINFO, &da) >= 0) { + bs->request_alignment = da.d_miniosz; + /* The kernel returns wrong information for d_mem */ + /* s->buf_align = da.d_mem; */ + } + } +#endif + + /* If we could not get the sizes so far, we can only guess them */ + if (!s->buf_align) { + size_t align; + buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE); + for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { + if (pread(s->fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) { + s->buf_align = align; + break; + } + } + qemu_vfree(buf); + } + + if (!bs->request_alignment) { + size_t align; + buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE); + for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { + if (pread(s->fd, buf, align, 0) >= 0) { + bs->request_alignment = align; + break; + } + } + qemu_vfree(buf); + } +} + static void raw_parse_flags(int bdrv_flags, int *open_flags) { assert(open_flags != NULL); @@ -262,6 +336,17 @@ error: } #endif +static void raw_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The filename does not have to be prefixed by the protocol name, since + * "file" is the default protocol; therefore, the return value of this + * function call can be ignored.
*/ + strstart(filename, "file:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + static QemuOptsList raw_runtime_opts = { .name = "raw", .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), @@ -276,19 +361,19 @@ static QemuOptsList raw_runtime_opts = { }; static int raw_open_common(BlockDriverState *bs, QDict *options, - int bdrv_flags, int open_flags) + int bdrv_flags, int open_flags, Error **errp) { BDRVRawState *s = bs->opaque; QemuOpts *opts; Error *local_err = NULL; const char *filename; int fd, ret; + struct stat st; - opts = qemu_opts_create_nofail(&raw_runtime_opts); + opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } @@ -297,6 +382,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, ret = raw_normalize_devicepath(&filename); if (ret != 0) { + error_setg_errno(errp, -ret, "Could not normalize device path"); goto fail; } @@ -318,14 +404,43 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) { qemu_close(fd); ret = -errno; + error_setg_errno(errp, -ret, "Could not set AIO state"); + goto fail; + } +#endif + + s->has_discard = true; + s->has_write_zeroes = true; + + if (fstat(s->fd, &st) < 0) { + error_setg_errno(errp, errno, "Could not stat file"); goto fail; } + if (S_ISREG(st.st_mode)) { + s->discard_zeroes = true; + } + if (S_ISBLK(st.st_mode)) { +#ifdef BLKDISCARDZEROES + unsigned int arg; + if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) { + s->discard_zeroes = true; + } +#endif +#ifdef __linux__ + /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do + * not rely on the contents of discarded blocks unless using O_DIRECT. + * Same for BLKZEROOUT. 
+ */ + if (!(bs->open_flags & BDRV_O_NOCACHE)) { + s->discard_zeroes = false; + s->has_write_zeroes = false; + } #endif + } - s->has_discard = 1; #ifdef CONFIG_XFS if (platform_test_xfs_fd(s->fd)) { - s->is_xfs = 1; + s->is_xfs = true; } #endif @@ -335,12 +450,19 @@ fail: return ret; } -static int raw_open(BlockDriverState *bs, QDict *options, int flags) +static int raw_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; + Error *local_err = NULL; + int ret; s->type = FTYPE_FILE; - return raw_open_common(bs, options, flags, 0); + ret = raw_open_common(bs, options, flags, 0, &local_err); + if (local_err) { + error_propagate(errp, local_err); + } + return ret; } static int raw_reopen_prepare(BDRVReopenState *state, @@ -365,6 +487,7 @@ static int raw_reopen_prepare(BDRVReopenState *state, * valid in the 'false' condition even if aio_ctx is set, and raw_set_aio() * won't override aio_ctx if aio_ctx is non-NULL */ if (raw_set_aio(&s->aio_ctx, &raw_s->use_aio, state->flags)) { + error_setg(errp, "Could not set AIO state"); return -1; } #endif @@ -416,13 +539,13 @@ static int raw_reopen_prepare(BDRVReopenState *state, assert(!(raw_s->open_flags & O_CREAT)); raw_s->fd = qemu_open(state->bs->filename, raw_s->open_flags); if (raw_s->fd == -1) { + error_setg_errno(errp, errno, "Could not reopen file"); ret = -1; } } return ret; } - static void raw_reopen_commit(BDRVReopenState *state) { BDRVRawReopenState *raw_s = state->opaque; @@ -458,23 +581,15 @@ static void raw_reopen_abort(BDRVReopenState *state) state->opaque = NULL; } +static int raw_refresh_limits(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; -/* XXX: use host sector size if necessary with: -#ifdef DIOCGSECTORSIZE - { - unsigned int sectorsize = 512; - if (!ioctl(fd, DIOCGSECTORSIZE, &sectorsize) && - sectorsize > bufsize) - bufsize = sectorsize; - } -#endif -#ifdef CONFIG_COCOA - uint32_t blockSize = 512; - if ( !ioctl( fd, DKIOCGETBLOCKSIZE, &blockSize ) && blockSize > bufsize) { - bufsize = blockSize; - } -#endif -*/ + raw_probe_alignment(bs); + bs->bl.opt_mem_alignment = s->buf_align; + + return 0; +} static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) { @@ -665,6 +780,23 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) } #ifdef CONFIG_XFS +static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes) +{ + struct xfs_flock64 fl; + + memset(&fl, 0, sizeof(fl)); + fl.l_whence = SEEK_SET; + fl.l_start = offset; + fl.l_len = bytes; + + if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) { + DEBUG_BLOCK_PRINT("cannot write zero range (%s)\n", strerror(errno)); + return -errno; + } + + return 0; +} + static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) { struct xfs_flock64 fl; @@ -683,13 +815,49 @@ static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) } #endif +static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) +{ + int ret = -EOPNOTSUPP; + BDRVRawState *s = aiocb->bs->opaque; + + if (s->has_write_zeroes == 0) { + return -ENOTSUP; + } + + if (aiocb->aio_type & QEMU_AIO_BLKDEV) { +#ifdef BLKZEROOUT + do { + uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; + if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) { + return 0; + } + } while (errno == EINTR); + + ret = -errno; +#endif + } else { +#ifdef CONFIG_XFS + if (s->is_xfs) { + return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes); + } +#endif + } + + if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP || + ret == -ENOTTY)
{ + s->has_write_zeroes = false; + ret = -ENOTSUP; + } + return ret; +} + static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) { int ret = -EOPNOTSUPP; BDRVRawState *s = aiocb->bs->opaque; - if (s->has_discard == 0) { - return 0; + if (!s->has_discard) { + return -ENOTSUP; } if (aiocb->aio_type & QEMU_AIO_BLKDEV) { @@ -724,8 +892,8 @@ static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP || ret == -ENOTTY) { - s->has_discard = 0; - ret = 0; + s->has_discard = false; + ret = -ENOTSUP; } return ret; } @@ -767,6 +935,9 @@ static int aio_worker(void *arg) case QEMU_AIO_DISCARD: ret = handle_aiocb_discard(aiocb); break; + case QEMU_AIO_WRITE_ZEROES: + ret = handle_aiocb_write_zeroes(aiocb); + break; default: fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); ret = -EINVAL; @@ -777,6 +948,29 @@ static int aio_worker(void *arg) return ret; } +static int paio_submit_co(BlockDriverState *bs, int fd, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + int type) +{ + RawPosixAIOData *acb = g_slice_new(RawPosixAIOData); + ThreadPool *pool; + + acb->bs = bs; + acb->aio_type = type; + acb->aio_fildes = fd; + + if (qiov) { + acb->aio_iov = qiov->iov; + acb->aio_niov = qiov->niov; + } + acb->aio_nbytes = nb_sectors * 512; + acb->aio_offset = sector_num * 512; + + trace_paio_submit_co(sector_num, nb_sectors, type); + pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); + return thread_pool_submit_co(pool, aio_worker, acb); +} + static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque, int type) @@ -1040,12 +1234,15 @@ static int64_t raw_get_allocated_file_size(BlockDriverState *bs) return (int64_t)st.st_blocks * 512; } -static int raw_create(const char *filename, QEMUOptionParameter *options) +static int raw_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int fd; int result = 0; int64_t total_size = 0; + strstart(filename, "file:", &filename); + /* Read out options */ while (options && options->name) { if (!strcmp(options->name, BLOCK_OPT_SIZE)) { @@ -1058,12 +1255,15 @@ static int raw_create(const char *filename, QEMUOptionParameter *options) 0644); if (fd < 0) { result = -errno; + error_setg_errno(errp, -result, "Could not create file"); } else { if (ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) { result = -errno; + error_setg_errno(errp, -result, "Could not resize file"); } if (qemu_close(fd) != 0) { result = -errno; + error_setg_errno(errp, -result, "Could not close the new file"); } } return result; @@ -1084,12 +1284,12 @@ static int raw_create(const char *filename, QEMUOptionParameter *options) * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes * beyond the end of the disk image it will be clamped. 
*/ -static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { off_t start, data, hole; - int ret; + int64_t ret; ret = fd_open(bs); if (ret < 0) { @@ -1097,6 +1297,7 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, } start = sector_num * BDRV_SECTOR_SIZE; + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; #ifdef CONFIG_FIEMAP @@ -1114,7 +1315,7 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) { /* Assume everything is allocated. */ *pnum = nb_sectors; - return 1; + return ret; } if (f.fm.fm_mapped_extents == 0) { @@ -1127,6 +1328,9 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, } else { data = f.fe.fe_logical; hole = f.fe.fe_logical + f.fe.fe_length; + if (f.fe.fe_flags & FIEMAP_EXTENT_UNWRITTEN) { + ret |= BDRV_BLOCK_ZERO; + } } #elif defined SEEK_HOLE && defined SEEK_DATA @@ -1141,7 +1345,7 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, /* Most likely EINVAL. Assume everything is allocated. */ *pnum = nb_sectors; - return 1; + return ret; } if (hole > start) { @@ -1154,19 +1358,21 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, } } #else - *pnum = nb_sectors; - return 1; + data = 0; + hole = start + nb_sectors * BDRV_SECTOR_SIZE; #endif if (data <= start) { /* On a data extent, compute sectors to the end of the extent. */ *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE); - return 1; } else { /* On a hole, compute sectors to the beginning of the next extent. */ *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); - return 0; + ret &= ~BDRV_BLOCK_DATA; + ret |= BDRV_BLOCK_ZERO; } + + return ret; } static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs, @@ -1179,6 +1385,31 @@ static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs, cb, opaque, QEMU_AIO_DISCARD); } +static int coroutine_fn raw_co_write_zeroes( + BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + BDRVRawState *s = bs->opaque; + + if (!(flags & BDRV_REQ_MAY_UNMAP)) { + return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + QEMU_AIO_WRITE_ZEROES); + } else if (s->discard_zeroes) { + return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + QEMU_AIO_DISCARD); + } + return -ENOTSUP; +} + +static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVRawState *s = bs->opaque; + + bdi->unallocated_blocks_are_zero = s->discard_zeroes; + bdi->can_write_zeroes_with_unmap = s->discard_zeroes; + return 0; +} + static QEMUOptionParameter raw_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -1192,7 +1423,9 @@ static BlockDriver bdrv_file = { .format_name = "file", .protocol_name = "file", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, .bdrv_probe = NULL, /* no probe for protocols */ + .bdrv_parse_filename = raw_parse_filename, .bdrv_file_open = raw_open, .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, @@ -1200,15 +1433,18 @@ static BlockDriver bdrv_file = { .bdrv_close = raw_close, .bdrv_create = raw_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_is_allocated = raw_co_is_allocated, + .bdrv_co_get_block_status = raw_co_get_block_status, + .bdrv_co_write_zeroes = raw_co_write_zeroes, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, 
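The raw_co_get_block_status() code above prefers FIEMAP where available and otherwise probes with lseek(SEEK_HOLE)/lseek(SEEK_DATA), falling back to "everything is allocated" when neither works. Below is a standalone Linux sketch of the lseek-based probe; it is an editorial example whose error handling mirrors the driver's fallback, not QEMU code.

#define _GNU_SOURCE     /* for SEEK_HOLE/SEEK_DATA on glibc */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    if (argc != 2) {
        fprintf(stderr, "usage: %s <file>\n", argv[0]);
        return 1;
    }

    int fd = open(argv[1], O_RDONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    off_t start = 0;
    off_t hole = lseek(fd, start, SEEK_HOLE);   /* end of data after start */
    off_t data = lseek(fd, start, SEEK_DATA);   /* first data after start */

    if (hole < 0 || data < 0) {
        /* Most likely EINVAL or ENXIO: treat the file as fully allocated,
         * just as the driver above assumes everything is data on failure */
        perror("lseek");
        close(fd);
        return 1;
    }

    if (data <= start) {
        printf("offset 0 is data, extent runs to %lld\n", (long long)hole);
    } else {
        printf("offset 0 is a hole, next data at %lld\n", (long long)data);
    }

    close(fd);
    return 0;
}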
.bdrv_aio_flush = raw_aio_flush, .bdrv_aio_discard = raw_aio_discard, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, + .bdrv_get_info = raw_get_info, .bdrv_get_allocated_file_size = raw_get_allocated_file_size, @@ -1325,9 +1561,20 @@ static int check_hdev_writable(BDRVRawState *s) return 0; } -static int hdev_open(BlockDriverState *bs, QDict *options, int flags) +static void hdev_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The prefix is optional, just as for "file". */ + strstart(filename, "host_device:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + +static int hdev_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; + Error *local_err = NULL; int ret; const char *filename = qdict_get_str(options, "filename"); @@ -1371,8 +1618,11 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags) } #endif - ret = raw_open_common(bs, options, flags, 0); + ret = raw_open_common(bs, options, flags, 0, &local_err); if (ret < 0) { + if (local_err) { + error_propagate(errp, local_err); + } return ret; } @@ -1380,6 +1630,7 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags) ret = check_hdev_writable(s); if (ret < 0) { raw_close(bs); + error_setg_errno(errp, -ret, "The device is not writable"); return ret; } } @@ -1498,12 +1749,45 @@ static coroutine_fn BlockDriverAIOCB *hdev_aio_discard(BlockDriverState *bs, cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); } -static int hdev_create(const char *filename, QEMUOptionParameter *options) +static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ + BDRVRawState *s = bs->opaque; + int rc; + + rc = fd_open(bs); + if (rc < 0) { + return rc; + } + if (!(flags & BDRV_REQ_MAY_UNMAP)) { + return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV); + } else if (s->discard_zeroes) { + return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); + } + return -ENOTSUP; +} + +static int hdev_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int fd; int ret = 0; struct stat stat_buf; int64_t total_size = 0; + bool has_prefix; + + /* This function is used by all three protocol block drivers and therefore + * any of these three prefixes may be given. + * The return value has to be stored somewhere, otherwise this is an error + * due to -Werror=unused-value. 
*/ + has_prefix = + strstart(filename, "host_device:", &filename) || + strstart(filename, "host_cdrom:" , &filename) || + strstart(filename, "host_floppy:", &filename); + + (void)has_prefix; /* Read out options */ while (options && options->name) { @@ -1514,15 +1798,23 @@ static int hdev_create(const char *filename, QEMUOptionParameter *options) } fd = qemu_open(filename, O_WRONLY | O_BINARY); - if (fd < 0) - return -errno; + if (fd < 0) { + ret = -errno; + error_setg_errno(errp, -ret, "Could not open device"); + return ret; + } - if (fstat(fd, &stat_buf) < 0) + if (fstat(fd, &stat_buf) < 0) { ret = -errno; - else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) + error_setg_errno(errp, -ret, "Could not stat device"); + } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) { + error_setg(errp, + "The given file is neither a block nor a character device"); ret = -ENODEV; - else if (lseek(fd, 0, SEEK_END) < total_size * BDRV_SECTOR_SIZE) + } else if (lseek(fd, 0, SEEK_END) < total_size * BDRV_SECTOR_SIZE) { + error_setg(errp, "Device is too small"); ret = -ENOSPC; + } qemu_close(fd); return ret; @@ -1532,7 +1824,9 @@ static BlockDriver bdrv_host_device = { .format_name = "host_device", .protocol_name = "host_device", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, .bdrv_probe_device = hdev_probe_device, + .bdrv_parse_filename = hdev_parse_filename, .bdrv_file_open = hdev_open, .bdrv_close = raw_close, .bdrv_reopen_prepare = raw_reopen_prepare, @@ -1540,14 +1834,17 @@ static BlockDriver bdrv_host_device = { .bdrv_reopen_abort = raw_reopen_abort, .bdrv_create = hdev_create, .create_options = raw_create_options, + .bdrv_co_write_zeroes = hdev_co_write_zeroes, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, .bdrv_aio_discard = hdev_aio_discard, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, + .bdrv_get_info = raw_get_info, .bdrv_get_allocated_file_size = raw_get_allocated_file_size, @@ -1559,17 +1856,32 @@ static BlockDriver bdrv_host_device = { }; #ifdef __linux__ -static int floppy_open(BlockDriverState *bs, QDict *options, int flags) +static void floppy_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The prefix is optional, just as for "file". 
*/ + strstart(filename, "host_floppy:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + +static int floppy_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; + Error *local_err = NULL; int ret; s->type = FTYPE_FD; /* open will not fail even if no floppy is inserted, so add O_NONBLOCK */ - ret = raw_open_common(bs, options, flags, O_NONBLOCK); - if (ret) + ret = raw_open_common(bs, options, flags, O_NONBLOCK, &local_err); + if (ret) { + if (local_err) { + error_propagate(errp, local_err); + } return ret; + } /* close fd so that we can reopen it as needed */ qemu_close(s->fd); @@ -1656,7 +1968,9 @@ static BlockDriver bdrv_host_floppy = { .format_name = "host_floppy", .protocol_name = "host_floppy", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, .bdrv_probe_device = floppy_probe_device, + .bdrv_parse_filename = floppy_parse_filename, .bdrv_file_open = floppy_open, .bdrv_close = raw_close, .bdrv_reopen_prepare = raw_reopen_prepare, @@ -1668,9 +1982,11 @@ static BlockDriver bdrv_host_floppy = { .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, - .bdrv_getlength = raw_getlength, + .bdrv_getlength = raw_getlength, + .has_variable_length = true, .bdrv_get_allocated_file_size = raw_get_allocated_file_size, @@ -1679,15 +1995,35 @@ static BlockDriver bdrv_host_floppy = { .bdrv_media_changed = floppy_media_changed, .bdrv_eject = floppy_eject, }; +#endif + +#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) +static void cdrom_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The prefix is optional, just as for "file". 
*/ + strstart(filename, "host_cdrom:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} +#endif -static int cdrom_open(BlockDriverState *bs, QDict *options, int flags) +#ifdef __linux__ +static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; + Error *local_err = NULL; + int ret; s->type = FTYPE_CD; /* open will not fail even if no CD is inserted, so add O_NONBLOCK */ - return raw_open_common(bs, options, flags, O_NONBLOCK); + ret = raw_open_common(bs, options, flags, O_NONBLOCK, &local_err); + if (local_err) { + error_propagate(errp, local_err); + } + return ret; } static int cdrom_probe_device(const char *filename) @@ -1757,7 +2093,9 @@ static BlockDriver bdrv_host_cdrom = { .format_name = "host_cdrom", .protocol_name = "host_cdrom", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, .bdrv_probe_device = cdrom_probe_device, + .bdrv_parse_filename = cdrom_parse_filename, .bdrv_file_open = cdrom_open, .bdrv_close = raw_close, .bdrv_reopen_prepare = raw_reopen_prepare, @@ -1769,9 +2107,11 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, - .bdrv_getlength = raw_getlength, + .bdrv_getlength = raw_getlength, + .has_variable_length = true, .bdrv_get_allocated_file_size = raw_get_allocated_file_size, @@ -1787,16 +2127,22 @@ static BlockDriver bdrv_host_cdrom = { #endif /* __linux__ */ #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) -static int cdrom_open(BlockDriverState *bs, QDict *options, int flags) +static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; + Error *local_err = NULL; int ret; s->type = FTYPE_CD; - ret = raw_open_common(bs, options, flags, 0); - if (ret) + ret = raw_open_common(bs, options, flags, 0, &local_err); + if (ret) { + if (local_err) { + error_propagate(errp, local_err); + } return ret; + } /* make sure the door isn't locked at this time */ ioctl(s->fd, CDIOCALLOW); @@ -1878,7 +2224,9 @@ static BlockDriver bdrv_host_cdrom = { .format_name = "host_cdrom", .protocol_name = "host_cdrom", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, .bdrv_probe_device = cdrom_probe_device, + .bdrv_parse_filename = cdrom_parse_filename, .bdrv_file_open = cdrom_open, .bdrv_close = raw_close, .bdrv_reopen_prepare = raw_reopen_prepare, @@ -1890,9 +2238,11 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, - .bdrv_getlength = raw_getlength, + .bdrv_getlength = raw_getlength, + .has_variable_length = true, .bdrv_get_allocated_file_size = raw_get_allocated_file_size, diff --git a/block/raw-win32.c b/block/raw-win32.c index 9b5b2af4e..48cb2c225 100644 --- a/block/raw-win32.c +++ b/block/raw-win32.c @@ -85,6 +85,7 @@ static size_t handle_aiocb_rw(RawWin32AIOData *aiocb) ret_count = 0; } if (ret_count != len) { + offset += ret_count; break; } offset += len; @@ -201,6 +202,35 @@ static int set_sparse(int fd) NULL, 0, NULL, 0, &returned, NULL); } +static void raw_probe_alignment(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + DWORD sectorsPerCluster, freeClusters, totalClusters, count; + DISK_GEOMETRY_EX dg; + BOOL status; + + if (s->type == 
FTYPE_CD) { + bs->request_alignment = 2048; + return; + } + if (s->type == FTYPE_HARDDISK) { + status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX, + NULL, 0, &dg, sizeof(dg), &count, NULL); + if (status != 0) { + bs->request_alignment = dg.Geometry.BytesPerSector; + return; + } + /* try GetDiskFreeSpace too */ + } + + if (s->drive_path[0]) { + GetDiskFreeSpace(s->drive_path, &sectorsPerCluster, + &dg.Geometry.BytesPerSector, + &freeClusters, &totalClusters); + bs->request_alignment = dg.Geometry.BytesPerSector; + } +} + static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped) { assert(access_flags != NULL); @@ -221,6 +251,17 @@ static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped) } } +static void raw_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The filename does not have to be prefixed by the protocol name, since + * "file" is the default protocol; therefore, the return value of this + * function call can be ignored. */ + strstart(filename, "file:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + static QemuOptsList raw_runtime_opts = { .name = "raw", .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), @@ -234,7 +275,8 @@ static QemuOptsList raw_runtime_opts = { }, }; -static int raw_open(BlockDriverState *bs, QDict *options, int flags) +static int raw_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; int access_flags; @@ -246,11 +288,10 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags) s->type = FTYPE_FILE; - opts = qemu_opts_create_nofail(&raw_runtime_opts); + opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } @@ -262,11 +303,23 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags) if ((flags & BDRV_O_NATIVE_AIO) && aio == NULL) { aio = win32_aio_init(); if (aio == NULL) { + error_setg(errp, "Could not initialize AIO"); ret = -EINVAL; goto fail; } } + if (filename[0] && filename[1] == ':') { + snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", filename[0]); + } else if (filename[0] == '\\' && filename[1] == '\\') { + s->drive_path[0] = 0; + } else { + /* Relative path.
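raw_probe_alignment() above tries the drive geometry ioctl first and, failing that, asks GetDiskFreeSpace() about the volume root remembered in s->drive_path. A standalone Windows sketch of that fallback (assumes an ANSI build, where GetDiskFreeSpace() resolves to GetDiskFreeSpaceA(); the "C:\\" root is a placeholder):

    #include <stdio.h>
    #include <windows.h>

    int main(void)
    {
        DWORD sectorsPerCluster, bytesPerSector, freeClusters, totalClusters;

        /* The first argument must name the volume root, which is what
         * raw_open() stores in s->drive_path ("C:\\" is a placeholder). */
        if (GetDiskFreeSpace("C:\\", &sectorsPerCluster, &bytesPerSector,
                             &freeClusters, &totalClusters)) {
            /* The value raw_probe_alignment() would feed into
             * bs->request_alignment when the geometry ioctl is unavailable. */
            printf("bytes per sector: %lu\n", (unsigned long)bytesPerSector);
        } else {
            fprintf(stderr, "GetDiskFreeSpace failed: %lu\n",
                    (unsigned long)GetLastError());
        }
        return 0;
    }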
*/ + char buf[MAX_PATH]; + GetCurrentDirectory(MAX_PATH, buf); + snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", buf[0]); + } + s->hfile = CreateFile(filename, access_flags, FILE_SHARE_READ, NULL, OPEN_EXISTING, overlapped, NULL); @@ -285,11 +338,13 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags) ret = win32_aio_attach(aio, s->hfile); if (ret < 0) { CloseHandle(s->hfile); + error_setg_errno(errp, -ret, "Could not enable AIO"); goto fail; } s->aio = aio; } + raw_probe_alignment(bs); ret = 0; fail: qemu_opts_del(opts); @@ -420,11 +475,14 @@ static int64_t raw_get_allocated_file_size(BlockDriverState *bs) return st.st_size; } -static int raw_create(const char *filename, QEMUOptionParameter *options) +static int raw_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int fd; int64_t total_size = 0; + strstart(filename, "file:", &filename); + /* Read out options */ while (options && options->name) { if (!strcmp(options->name, BLOCK_OPT_SIZE)) { @@ -435,8 +493,10 @@ static int raw_create(const char *filename, QEMUOptionParameter *options) fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); - if (fd < 0) + if (fd < 0) { + error_setg_errno(errp, errno, "Could not create file"); return -EIO; + } set_sparse(fd); ftruncate(fd, total_size * 512); qemu_close(fd); @@ -456,6 +516,8 @@ static BlockDriver bdrv_file = { .format_name = "file", .protocol_name = "file", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, + .bdrv_parse_filename = raw_parse_filename, .bdrv_file_open = raw_open, .bdrv_close = raw_close, .bdrv_create = raw_create, @@ -531,17 +593,44 @@ static int hdev_probe_device(const char *filename) return 0; } -static int hdev_open(BlockDriverState *bs, QDict *options, int flags) +static void hdev_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The prefix is optional, just as for "file". 
*/ + strstart(filename, "host_device:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + +static int hdev_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; int access_flags, create_flags; + int ret = 0; DWORD overlapped; char device_name[64]; - const char *filename = qdict_get_str(options, "filename"); + + Error *local_err = NULL; + const char *filename; + + QemuOpts *opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, + &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto done; + } + + filename = qemu_opt_get(opts, "filename"); if (strstart(filename, "/dev/cdrom", NULL)) { - if (find_cdrom(device_name, sizeof(device_name)) < 0) - return -ENOENT; + if (find_cdrom(device_name, sizeof(device_name)) < 0) { + error_setg(errp, "Could not open CD-ROM drive"); + ret = -ENOENT; + goto done; + } filename = device_name; } else { /* transform drive letters into device name */ @@ -564,17 +653,26 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags) if (s->hfile == INVALID_HANDLE_VALUE) { int err = GetLastError(); - if (err == ERROR_ACCESS_DENIED) - return -EACCES; - return -1; + if (err == ERROR_ACCESS_DENIED) { + ret = -EACCES; + } else { + ret = -EINVAL; + } + error_setg_errno(errp, -ret, "Could not open device"); + goto done; } - return 0; + +done: + qemu_opts_del(opts); + return ret; } static BlockDriver bdrv_host_device = { .format_name = "host_device", .protocol_name = "host_device", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, + .bdrv_parse_filename = hdev_parse_filename, .bdrv_probe_device = hdev_probe_device, .bdrv_file_open = hdev_open, .bdrv_close = raw_close, @@ -583,7 +681,9 @@ static BlockDriver bdrv_host_device = { .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, - .bdrv_getlength = raw_getlength, + .bdrv_getlength = raw_getlength, + .has_variable_length = true, + .bdrv_get_allocated_file_size = raw_get_allocated_file_size, }; diff --git a/block/raw.c b/block/raw_bsd.c index 47518253f..01ea692a4 100644 --- a/block/raw.c +++ b/block/raw_bsd.c @@ -1,13 +1,17 @@ -/* - * Block driver for RAW format +/* BlockDriver implementation for "raw" * - * Copyright (c) 2006 Fabrice Bellard + * Copyright (C) 2010, 2013, Red Hat, Inc. 
+ * Copyright (C) 2010, Blue Swirl <blauwirbel@gmail.com> + * Copyright (C) 2009, Anthony Liguori <aliguori@us.ibm.com> + * + * Author: + * Laszlo Ersek <lersek@redhat.com> * * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in @@ -15,27 +19,27 @@ * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
*/ -#include "qemu-common.h" #include "block/block_int.h" -#include "qemu/module.h" +#include "qemu/option.h" -static int raw_open(BlockDriverState *bs, QDict *options, int flags) -{ - bs->sg = bs->file->sg; - return 0; -} +static QEMUOptionParameter raw_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { 0 } +}; -/* We have nothing to do for raw reopen, stubs just return - * success */ -static int raw_reopen_prepare(BDRVReopenState *state, - BlockReopenQueue *queue, Error **errp) +static int raw_reopen_prepare(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error **errp) { return 0; } @@ -54,22 +58,26 @@ static int coroutine_fn raw_co_writev(BlockDriverState *bs, int64_t sector_num, return bdrv_co_writev(bs->file, sector_num, nb_sectors, qiov); } -static void raw_close(BlockDriverState *bs) -{ -} - -static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { - return bdrv_co_is_allocated(bs->file, sector_num, nb_sectors, pnum); + *pnum = nb_sectors; + return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA | + (sector_num << BDRV_SECTOR_BITS); } static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors) + int64_t sector_num, int nb_sectors, + BdrvRequestFlags flags) { - return bdrv_co_write_zeroes(bs->file, sector_num, nb_sectors); + return bdrv_co_write_zeroes(bs->file, sector_num, nb_sectors, flags); +} + +static int coroutine_fn raw_co_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + return bdrv_co_discard(bs->file, sector_num, nb_sectors); } static int64_t raw_getlength(BlockDriverState *bs) @@ -77,20 +85,20 @@ static int64_t raw_getlength(BlockDriverState *bs) return bdrv_getlength(bs->file); } -static int raw_truncate(BlockDriverState *bs, int64_t offset) +static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { - return bdrv_truncate(bs->file, offset); + return bdrv_get_info(bs->file, bdi); } -static int raw_probe(const uint8_t *buf, int buf_size, const char *filename) +static int raw_refresh_limits(BlockDriverState *bs) { - return 1; /* everything can be opened as raw image */ + bs->bl = bs->file->bl; + return 0; } -static int coroutine_fn raw_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) +static int raw_truncate(BlockDriverState *bs, int64_t offset) { - return bdrv_co_discard(bs->file, sector_num, nb_sectors); + return bdrv_truncate(bs->file, offset); } static int raw_is_inserted(BlockDriverState *bs) @@ -115,73 +123,79 @@ static void raw_lock_medium(BlockDriverState *bs, bool locked) static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) { - return bdrv_ioctl(bs->file, req, buf); + return bdrv_ioctl(bs->file, req, buf); } static BlockDriverAIOCB *raw_aio_ioctl(BlockDriverState *bs, - unsigned long int req, void *buf, - BlockDriverCompletionFunc *cb, void *opaque) -{ - return bdrv_aio_ioctl(bs->file, req, buf, cb, opaque); -} - -static int raw_create(const char *filename, QEMUOptionParameter *options) + unsigned long int req, void *buf, + BlockDriverCompletionFunc *cb, + void *opaque) { - return bdrv_create_file(filename, options); + return bdrv_aio_ioctl(bs->file, req, buf, cb, opaque); } -static QEMUOptionParameter raw_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { NULL } -}; - 
static int raw_has_zero_init(BlockDriverState *bs) { return bdrv_has_zero_init(bs->file); } -static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +static int raw_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { - return bdrv_get_info(bs->file, bdi); -} - -static BlockDriver bdrv_raw = { - .format_name = "raw", - - /* It's really 0, but we need to make g_malloc() happy */ - .instance_size = 1, - - .bdrv_open = raw_open, - .bdrv_close = raw_close, - - .bdrv_reopen_prepare = raw_reopen_prepare, + Error *local_err = NULL; + int ret; - .bdrv_co_readv = raw_co_readv, - .bdrv_co_writev = raw_co_writev, - .bdrv_co_is_allocated = raw_co_is_allocated, - .bdrv_co_write_zeroes = raw_co_write_zeroes, - .bdrv_co_discard = raw_co_discard, + ret = bdrv_create_file(filename, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + } + return ret; +} - .bdrv_probe = raw_probe, - .bdrv_getlength = raw_getlength, - .bdrv_get_info = raw_get_info, - .bdrv_truncate = raw_truncate, +static int raw_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + bs->sg = bs->file->sg; + return 0; +} - .bdrv_is_inserted = raw_is_inserted, - .bdrv_media_changed = raw_media_changed, - .bdrv_eject = raw_eject, - .bdrv_lock_medium = raw_lock_medium, +static void raw_close(BlockDriverState *bs) +{ +} - .bdrv_ioctl = raw_ioctl, - .bdrv_aio_ioctl = raw_aio_ioctl, +static int raw_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + /* smallest possible positive score so that raw is used if and only if no + * other block driver works + */ + return 1; +} - .bdrv_create = raw_create, - .create_options = raw_create_options, - .bdrv_has_zero_init = raw_has_zero_init, +static BlockDriver bdrv_raw = { + .format_name = "raw", + .bdrv_probe = &raw_probe, + .bdrv_reopen_prepare = &raw_reopen_prepare, + .bdrv_open = &raw_open, + .bdrv_close = &raw_close, + .bdrv_create = &raw_create, + .bdrv_co_readv = &raw_co_readv, + .bdrv_co_writev = &raw_co_writev, + .bdrv_co_write_zeroes = &raw_co_write_zeroes, + .bdrv_co_discard = &raw_co_discard, + .bdrv_co_get_block_status = &raw_co_get_block_status, + .bdrv_truncate = &raw_truncate, + .bdrv_getlength = &raw_getlength, + .has_variable_length = true, + .bdrv_get_info = &raw_get_info, + .bdrv_refresh_limits = &raw_refresh_limits, + .bdrv_is_inserted = &raw_is_inserted, + .bdrv_media_changed = &raw_media_changed, + .bdrv_eject = &raw_eject, + .bdrv_lock_medium = &raw_lock_medium, + .bdrv_ioctl = &raw_ioctl, + .bdrv_aio_ioctl = &raw_aio_ioctl, + .create_options = &raw_create_options[0], + .bdrv_has_zero_init = &raw_has_zero_init }; static void bdrv_raw_init(void) diff --git a/block/rbd.c b/block/rbd.c index cb7175121..dbc79f452 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -95,19 +95,13 @@ typedef struct RADOSCB { #define RBD_FD_WRITE 1 typedef struct BDRVRBDState { - int fds[2]; rados_t cluster; rados_ioctx_t io_ctx; rbd_image_t image; char name[RBD_MAX_IMAGE_NAME_SIZE]; - int qemu_aio_count; char *snap; - int event_reader_pos; - RADOSCB *event_rcb; } BDRVRBDState; -static void rbd_aio_bh_cb(void *opaque); - static int qemu_rbd_next_tok(char *dst, int dst_len, char *src, char delim, const char *name, @@ -288,7 +282,8 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf) return ret; } -static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options) +static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int64_t bytes = 0; int64_t objsize; 
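The rbd hunks that follow retire the driver's self-pipe: a completion arriving on a librados thread no longer writes an rcb pointer into a pipe drained by qemu_rbd_aio_event_reader(); rbd_finish_aiocb() now only schedules a bottom half, and the real work runs in rbd_finish_bh() on the QEMU side. A standalone model of that handoff, with a toy mutex-protected BH queue standing in for qemu_bh_new()/qemu_bh_schedule() (build with -lpthread):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Toy bottom half: foreign threads may only enqueue; the callback runs
     * later in the "main loop", where shared state is safe to touch. */
    typedef struct BH {
        void (*cb)(void *opaque);
        void *opaque;
        struct BH *next;
    } BH;

    static BH *bh_list;
    static pthread_mutex_t bh_lock = PTHREAD_MUTEX_INITIALIZER;

    static void bh_schedule(void (*cb)(void *), void *opaque)
    {
        BH *bh = malloc(sizeof(*bh));
        bh->cb = cb;
        bh->opaque = opaque;
        pthread_mutex_lock(&bh_lock);
        bh->next = bh_list;
        bh_list = bh;
        pthread_mutex_unlock(&bh_lock);
    }

    static void completion_cb(void *opaque)
    {
        printf("completed in main loop: ret=%d\n", *(int *)opaque);
    }

    /* Like rbd_finish_aiocb(): record the result, schedule, return. */
    static void *librados_like_thread(void *arg)
    {
        *(int *)arg = 4096;                  /* pretend 4 KiB were written */
        bh_schedule(completion_cb, arg);
        return NULL;
    }

    int main(void)
    {
        int ret = 0;
        pthread_t t;
        pthread_create(&t, NULL, librados_like_thread, &ret);
        pthread_join(t, NULL);

        /* One main-loop iteration: drain the scheduled bottom halves. */
        pthread_mutex_lock(&bh_lock);
        while (bh_list) {
            BH *bh = bh_list;
            bh_list = bh->next;
            bh->cb(bh->opaque);
            free(bh);
        }
        pthread_mutex_unlock(&bh_lock);
        return 0;
    }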
@@ -369,9 +364,8 @@ static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options) } /* - * This aio completion is being called from qemu_rbd_aio_event_reader() - * and runs in qemu context. It schedules a bh, but just in case the aio - * was not cancelled before. + * This aio completion is being called from rbd_finish_bh() and runs in qemu + * BH context. */ static void qemu_rbd_complete_aio(RADOSCB *rcb) { @@ -401,44 +395,19 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb) acb->ret = r; } } - /* Note that acb->bh can be NULL in case where the aio was cancelled */ - acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb); - qemu_bh_schedule(acb->bh); - g_free(rcb); -} - -/* - * aio fd read handler. It runs in the qemu context and calls the - * completion handling of completed rados aio operations. - */ -static void qemu_rbd_aio_event_reader(void *opaque) -{ - BDRVRBDState *s = opaque; - - ssize_t ret; - do { - char *p = (char *)&s->event_rcb; - - /* now read the rcb pointer that was sent from a non qemu thread */ - ret = read(s->fds[RBD_FD_READ], p + s->event_reader_pos, - sizeof(s->event_rcb) - s->event_reader_pos); - if (ret > 0) { - s->event_reader_pos += ret; - if (s->event_reader_pos == sizeof(s->event_rcb)) { - s->event_reader_pos = 0; - qemu_rbd_complete_aio(s->event_rcb); - s->qemu_aio_count--; - } - } - } while (ret < 0 && errno == EINTR); -} + g_free(rcb); -static int qemu_rbd_aio_flush_cb(void *opaque) -{ - BDRVRBDState *s = opaque; + if (acb->cmd == RBD_AIO_READ) { + qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + } + qemu_vfree(acb->bounce); + acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); + acb->status = 0; - return (s->qemu_aio_count > 0); + if (!acb->cancelled) { + qemu_aio_release(acb); + } } /* TODO Convert to fine grained options */ @@ -455,7 +424,8 @@ static QemuOptsList runtime_opts = { }, }; -static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags) +static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRBDState *s = bs->opaque; char pool[RBD_MAX_POOL_NAME_SIZE]; @@ -468,9 +438,9 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags) const char *filename; int r; - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { qerror_report_err(local_err); error_free(local_err); qemu_opts_del(opts); @@ -545,23 +515,9 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags) bs->read_only = (s->snap != NULL); - s->event_reader_pos = 0; - r = qemu_pipe(s->fds); - if (r < 0) { - error_report("error opening eventfd"); - goto failed; - } - fcntl(s->fds[0], F_SETFL, O_NONBLOCK); - fcntl(s->fds[1], F_SETFL, O_NONBLOCK); - qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], qemu_rbd_aio_event_reader, - NULL, qemu_rbd_aio_flush_cb, s); - - qemu_opts_del(opts); return 0; -failed: - rbd_close(s->image); failed_open: rados_ioctx_destroy(s->io_ctx); failed_shutdown: @@ -576,10 +532,6 @@ static void qemu_rbd_close(BlockDriverState *bs) { BDRVRBDState *s = bs->opaque; - close(s->fds[0]); - close(s->fds[1]); - qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], NULL, NULL, NULL, NULL); - rbd_close(s->image); rados_ioctx_destroy(s->io_ctx); g_free(s->snap); @@ -607,34 +559,11 @@ static const AIOCBInfo rbd_aiocb_info = { .cancel = qemu_rbd_aio_cancel, }; -static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB 
*rcb) +static void rbd_finish_bh(void *opaque) { - int ret = 0; - while (1) { - fd_set wfd; - int fd = s->fds[RBD_FD_WRITE]; - - /* send the op pointer to the qemu thread that is responsible - for the aio/op completion. Must do it in a qemu thread context */ - ret = write(fd, (void *)&rcb, sizeof(rcb)); - if (ret >= 0) { - break; - } - if (errno == EINTR) { - continue; - } - if (errno != EAGAIN) { - break; - } - - FD_ZERO(&wfd); - FD_SET(fd, &wfd); - do { - ret = select(fd + 1, NULL, &wfd, NULL, NULL); - } while (ret < 0 && errno == EINTR); - } - - return ret; + RADOSCB *rcb = opaque; + qemu_bh_delete(rcb->acb->bh); + qemu_rbd_complete_aio(rcb); } /* @@ -642,40 +571,18 @@ static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB *rcb) * * Note: this function is being called from a non qemu thread so * we need to be careful about what we do here. Generally we only - * write to the block notification pipe, and do the rest of the - * io completion handling from qemu_rbd_aio_event_reader() which - * runs in a qemu context. + * schedule a BH, and do the rest of the io completion handling + * from rbd_finish_bh() which runs in a qemu context. */ static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb) { - int ret; + RBDAIOCB *acb = rcb->acb; + rcb->ret = rbd_aio_get_return_value(c); rbd_aio_release(c); - ret = qemu_rbd_send_pipe(rcb->s, rcb); - if (ret < 0) { - error_report("failed writing to acb->s->fds"); - g_free(rcb); - } -} - -/* Callback when all queued rbd_aio requests are complete */ - -static void rbd_aio_bh_cb(void *opaque) -{ - RBDAIOCB *acb = opaque; - if (acb->cmd == RBD_AIO_READ) { - qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); - } - qemu_vfree(acb->bounce); - acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); - qemu_bh_delete(acb->bh); - acb->bh = NULL; - acb->status = 0; - - if (!acb->cancelled) { - qemu_aio_release(acb); - } + acb->bh = qemu_bh_new(rbd_finish_bh, rcb); + qemu_bh_schedule(acb->bh); } static int rbd_aio_discard_wrapper(rbd_image_t image, @@ -741,8 +648,6 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs, off = sector_num * BDRV_SECTOR_SIZE; size = nb_sectors * BDRV_SECTOR_SIZE; - s->qemu_aio_count++; /* All the RADOSCB */ - rcb = g_malloc(sizeof(RADOSCB)); rcb->done = 0; rcb->acb = acb; @@ -779,7 +684,6 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs, failed: g_free(rcb); - s->qemu_aio_count--; qemu_aio_release(acb); return NULL; } @@ -903,12 +807,31 @@ static int qemu_rbd_snap_create(BlockDriverState *bs, } static int qemu_rbd_snap_remove(BlockDriverState *bs, - const char *snapshot_name) + const char *snapshot_id, + const char *snapshot_name, + Error **errp) { BDRVRBDState *s = bs->opaque; int r; + if (!snapshot_name) { + error_setg(errp, "rbd need a valid snapshot name"); + return -EINVAL; + } + + /* If snapshot_id is specified, it must be equal to name, see + qemu_rbd_snap_list() */ + if (snapshot_id && strcmp(snapshot_id, snapshot_name)) { + error_setg(errp, + "rbd do not support snapshot id, it should be NULL or " + "equal to snapshot name"); + return -EINVAL; + } + r = rbd_snap_remove(s->image, snapshot_name); + if (r < 0) { + error_setg_errno(errp, -r, "Failed to remove the snapshot"); + } return r; } @@ -934,7 +857,7 @@ static int qemu_rbd_snap_list(BlockDriverState *bs, do { snaps = g_malloc(sizeof(*snaps) * max_snaps); snap_count = rbd_snap_list(s->image, snaps, &max_snaps); - if (snap_count < 0) { + if (snap_count <= 0) { g_free(snaps); } } while (snap_count == -ERANGE); @@ -958,6 
+881,7 @@ static int qemu_rbd_snap_list(BlockDriverState *bs, sn_info->vm_clock_nsec = 0; } rbd_snap_list_end(snaps); + g_free(snaps); done: *psn_tab = sn_tab; @@ -993,6 +917,7 @@ static QEMUOptionParameter qemu_rbd_create_options[] = { static BlockDriver bdrv_rbd = { .format_name = "rbd", .instance_size = sizeof(BDRVRBDState), + .bdrv_needs_filename = true, .bdrv_file_open = qemu_rbd_open, .bdrv_close = qemu_rbd_close, .bdrv_create = qemu_rbd_create, diff --git a/block/sheepdog.c b/block/sheepdog.c index afe053376..0eb33ee80 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -91,6 +91,14 @@ #define SD_NR_VDIS (1U << 24) #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22) #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS) +/* + * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and + * (SD_EC_MAX_STRIP - 1) for parity strips + * + * SD_MAX_COPIES is sum of number of data strips and parity strips. + */ +#define SD_EC_MAX_STRIP 16 +#define SD_MAX_COPIES (SD_EC_MAX_STRIP * 2 - 1) #define SD_INODE_SIZE (sizeof(SheepdogInode)) #define CURRENT_VDI_ID 0 @@ -125,8 +133,9 @@ typedef struct SheepdogObjReq { uint32_t data_length; uint64_t oid; uint64_t cow_oid; - uint32_t copies; - uint32_t rsvd; + uint8_t copies; + uint8_t copy_policy; + uint8_t reserved[6]; uint64_t offset; } SheepdogObjReq; @@ -138,7 +147,9 @@ typedef struct SheepdogObjRsp { uint32_t id; uint32_t data_length; uint32_t result; - uint32_t copies; + uint8_t copies; + uint8_t copy_policy; + uint8_t reserved[2]; uint32_t pad[6]; } SheepdogObjRsp; @@ -150,8 +161,10 @@ typedef struct SheepdogVdiReq { uint32_t id; uint32_t data_length; uint64_t vdi_size; - uint32_t vdi_id; - uint32_t copies; + uint32_t base_vdi_id; + uint8_t copies; + uint8_t copy_policy; + uint8_t reserved[2]; uint32_t snapid; uint32_t pad[3]; } SheepdogVdiReq; @@ -222,6 +235,11 @@ static inline uint64_t data_oid_to_idx(uint64_t oid) return oid & (MAX_DATA_OBJS - 1); } +static inline uint32_t oid_to_vid(uint64_t oid) +{ + return (oid & ~VDI_BIT) >> VDI_SPACE_SHIFT; +} + static inline uint64_t vid_to_vdi_oid(uint32_t vid) { return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT); @@ -289,11 +307,14 @@ struct SheepdogAIOCB { Coroutine *coroutine; void (*aio_done_func)(SheepdogAIOCB *); - bool canceled; + bool cancelable; + bool *finished; int nr_pending; }; typedef struct BDRVSheepdogState { + BlockDriverState *bs; + SheepdogInode inode; uint32_t min_dirty_data_idx; @@ -313,8 +334,11 @@ typedef struct BDRVSheepdogState { Coroutine *co_recv; uint32_t aioreq_seq_num; + + /* Every aio request must be linked to either of these queues. 
*/ QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head; QLIST_HEAD(pending_aio_head, AIOReq) pending_aio_head; + QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head; } BDRVSheepdogState; static const char * sd_strerror(int err) @@ -403,6 +427,7 @@ static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req) { SheepdogAIOCB *acb = aio_req->aiocb; + acb->cancelable = false; QLIST_REMOVE(aio_req, aio_siblings); g_free(aio_req); @@ -411,23 +436,68 @@ static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req) static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb) { - if (!acb->canceled) { - qemu_coroutine_enter(acb->coroutine, NULL); + qemu_coroutine_enter(acb->coroutine, NULL); + if (acb->finished) { + *acb->finished = true; } qemu_aio_release(acb); } +/* + * Check whether the specified acb can be canceled + * + * We can cancel aio when any request belonging to the acb is: + * - Not processed by the sheepdog server. + * - Not linked to the inflight queue. + */ +static bool sd_acb_cancelable(const SheepdogAIOCB *acb) +{ + BDRVSheepdogState *s = acb->common.bs->opaque; + AIOReq *aioreq; + + if (!acb->cancelable) { + return false; + } + + QLIST_FOREACH(aioreq, &s->inflight_aio_head, aio_siblings) { + if (aioreq->aiocb == acb) { + return false; + } + } + + return true; +} + static void sd_aio_cancel(BlockDriverAIOCB *blockacb) { SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb; + BDRVSheepdogState *s = acb->common.bs->opaque; + AIOReq *aioreq, *next; + bool finished = false; + + acb->finished = &finished; + while (!finished) { + if (sd_acb_cancelable(acb)) { + /* Remove outstanding requests from pending and failed queues. */ + QLIST_FOREACH_SAFE(aioreq, &s->pending_aio_head, aio_siblings, + next) { + if (aioreq->aiocb == acb) { + free_aio_req(s, aioreq); + } + } + QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings, + next) { + if (aioreq->aiocb == acb) { + free_aio_req(s, aioreq); + } + } - /* - * Sheepdog cannot cancel the requests which are already sent to - * the servers, so we just complete the request with -EIO here. - */ - acb->ret = -EIO; - qemu_coroutine_enter(acb->coroutine, NULL); - acb->canceled = true; + assert(acb->nr_pending == 0); + sd_finish_aiocb(acb); + return; + } + qemu_aio_wait(); + } } static const AIOCBInfo sd_aiocb_info = { @@ -448,7 +518,8 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, acb->nb_sectors = nb_sectors; acb->aio_done_func = NULL; - acb->canceled = false; + acb->cancelable = true; + acb->finished = NULL; acb->coroutine = qemu_coroutine_self(); acb->ret = 0; acb->nr_pending = 0; @@ -489,13 +560,13 @@ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data, int ret; ret = qemu_co_send(sockfd, hdr, sizeof(*hdr)); - if (ret < sizeof(*hdr)) { + if (ret != sizeof(*hdr)) { error_report("failed to send a req, %s", strerror(errno)); return ret; } ret = qemu_co_send(sockfd, data, *wlen); - if (ret < *wlen) { + if (ret != *wlen) { error_report("failed to send a req, %s", strerror(errno)); } @@ -509,13 +580,6 @@ static void restart_co_req(void *opaque) qemu_coroutine_enter(co, NULL); } -static int have_co_req(void *opaque) -{ - /* this handler is set only when there is a pending request, so - * always returns 1. 
*/ - return 1; -} - typedef struct SheepdogReqCo { int sockfd; SheepdogReq *hdr; @@ -538,17 +602,17 @@ static coroutine_fn void do_co_req(void *opaque) unsigned int *rlen = srco->rlen; co = qemu_coroutine_self(); - qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, have_co_req, co); + qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, co); ret = send_co_req(sockfd, hdr, data, wlen); if (ret < 0) { goto out; } - qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, have_co_req, co); + qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, co); ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr)); - if (ret < sizeof(*hdr)) { + if (ret != sizeof(*hdr)) { error_report("failed to get a rsp, %s", strerror(errno)); ret = -errno; goto out; @@ -560,7 +624,7 @@ static coroutine_fn void do_co_req(void *opaque) if (*rlen) { ret = qemu_co_recv(sockfd, data, *rlen); - if (ret < *rlen) { + if (ret != *rlen) { error_report("failed to get the data, %s", strerror(errno)); ret = -errno; goto out; @@ -570,7 +634,7 @@ static coroutine_fn void do_co_req(void *opaque) out: /* there is at most one request for this sockfd, so it is safe to * set each handler to NULL. */ - qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL, NULL); + qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL); srco->ret = ret; srco->finished = true; @@ -603,11 +667,13 @@ static int do_req(int sockfd, SheepdogReq *hdr, void *data, return srco.ret; } -static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, +static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, struct iovec *iov, int niov, bool create, enum AIOCBState aiocb_type); -static int coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req); - +static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req); +static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag); +static int get_sheep_fd(BDRVSheepdogState *s); +static void co_write_request(void *opaque); static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid) { @@ -630,22 +696,59 @@ static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid) { AIOReq *aio_req; SheepdogAIOCB *acb; - int ret; while ((aio_req = find_pending_req(s, oid)) != NULL) { acb = aio_req->aiocb; /* move aio_req from pending list to inflight one */ QLIST_REMOVE(aio_req, aio_siblings); QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - ret = add_aio_request(s, aio_req, acb->qiov->iov, - acb->qiov->niov, false, acb->aiocb_type); - if (ret < 0) { - error_report("add_aio_request is failed"); - free_aio_req(s, aio_req); - if (!acb->nr_pending) { - sd_finish_aiocb(acb); - } + add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, false, + acb->aiocb_type); + } +} + +static coroutine_fn void reconnect_to_sdog(void *opaque) +{ + BDRVSheepdogState *s = opaque; + AIOReq *aio_req, *next; + + qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL); + close(s->fd); + s->fd = -1; + + /* Wait for outstanding write requests to be completed. */ + while (s->co_send != NULL) { + co_write_request(opaque); + } + + /* Try to reconnect to the sheepdog server every second. */ + while (s->fd < 0) { + s->fd = get_sheep_fd(s); + if (s->fd < 0) { + DPRINTF("Wait for connection to be established\n"); + co_aio_sleep_ns(bdrv_get_aio_context(s->bs), QEMU_CLOCK_REALTIME, + 1000000000ULL); } + }; + + /* + * Now we have to resend all the requests in the inflight queue.
However, + * resend_aioreq() can yield and newly created requests can be added to the + * inflight queue before the coroutine is resumed. To avoid mixing them, we + * have to move all the inflight requests to the failed queue before + * resend_aioreq() is called. + */ + QLIST_FOREACH_SAFE(aio_req, &s->inflight_aio_head, aio_siblings, next) { + QLIST_REMOVE(aio_req, aio_siblings); + QLIST_INSERT_HEAD(&s->failed_aio_head, aio_req, aio_siblings); + } + + /* Resend all the failed aio requests. */ + while (!QLIST_EMPTY(&s->failed_aio_head)) { + aio_req = QLIST_FIRST(&s->failed_aio_head); + QLIST_REMOVE(aio_req, aio_siblings); + QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); + resend_aioreq(s, aio_req); } } @@ -665,15 +768,11 @@ static void coroutine_fn aio_read_response(void *opaque) SheepdogAIOCB *acb; uint64_t idx; - if (QLIST_EMPTY(&s->inflight_aio_head)) { - goto out; - } - /* read a header */ ret = qemu_co_recv(fd, &rsp, sizeof(rsp)); - if (ret < 0) { + if (ret != sizeof(rsp)) { error_report("failed to get the header, %s", strerror(errno)); - goto out; + goto err; } /* find the right aio_req from the inflight aio list */ @@ -684,7 +783,7 @@ static void coroutine_fn aio_read_response(void *opaque) } if (!aio_req) { error_report("cannot find aio_req %x", rsp.id); - goto out; + goto err; } acb = aio_req->aiocb; @@ -722,9 +821,9 @@ static void coroutine_fn aio_read_response(void *opaque) case AIOCB_READ_UDATA: ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov, aio_req->iov_offset, rsp.data_length); - if (ret < 0) { + if (ret != rsp.data_length) { error_report("failed to get the data, %s", strerror(errno)); - goto out; + goto err; } break; case AIOCB_FLUSH_CACHE: @@ -755,11 +854,20 @@ static void coroutine_fn aio_read_response(void *opaque) case SD_RES_SUCCESS: break; case SD_RES_READONLY: - ret = resend_aioreq(s, aio_req); - if (ret == SD_RES_SUCCESS) { - goto out; + if (s->inode.vdi_id == oid_to_vid(aio_req->oid)) { + ret = reload_inode(s, 0, ""); + if (ret < 0) { + goto err; + } } - /* fall through */ + if (is_data_obj(aio_req->oid)) { + aio_req->oid = vid_to_data_oid(s->inode.vdi_id, + data_oid_to_idx(aio_req->oid)); + } else { + aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id); + } + resend_aioreq(s, aio_req); + goto out; default: acb->ret = -EIO; error_report("%s", sd_strerror(rsp.result)); @@ -776,6 +884,10 @@ static void coroutine_fn aio_read_response(void *opaque) } out: s->co_recv = NULL; + return; +err: + s->co_recv = NULL; + reconnect_to_sdog(opaque); } static void co_read_response(void *opaque) @@ -796,18 +908,10 @@ static void co_write_request(void *opaque) qemu_coroutine_enter(s->co_send, NULL); } -static int aio_flush_request(void *opaque) -{ - BDRVSheepdogState *s = opaque; - - return !QLIST_EMPTY(&s->inflight_aio_head) || - !QLIST_EMPTY(&s->pending_aio_head); -} - /* - * Return a socket discriptor to read/write objects. + * Return a socket descriptor to read/write objects. * - * We cannot use this discriptor for other operations because + * We cannot use this descriptor for other operations because * the block driver may be on waiting response from the server. 
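reconnect_to_sdog() above retries once per second via co_aio_sleep_ns() until get_sheep_fd() succeeds, then replays the outstanding requests through the failed queue. The retry loop reduced to a plain POSIX sketch (try_connect() is hypothetical, and nanosleep() stands in for the coroutine sleep):

    #include <stdio.h>
    #include <time.h>

    /* Hypothetical connect attempt: fd >= 0 on success, -1 on failure. */
    static int try_connect(void)
    {
        static int attempts;
        return ++attempts < 3 ? -1 : 42;
    }

    int main(void)
    {
        int fd = -1;

        /* Same shape as reconnect_to_sdog(): retry at a fixed one-second
         * interval until the connection is re-established. */
        while (fd < 0) {
            fd = try_connect();
            if (fd < 0) {
                struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
                nanosleep(&ts, NULL);   /* stands in for co_aio_sleep_ns() */
            }
        }
        printf("reconnected, fd=%d\n", fd);
        return 0;
    }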
*/ static int get_sheep_fd(BDRVSheepdogState *s) @@ -819,7 +923,7 @@ static int get_sheep_fd(BDRVSheepdogState *s) return fd; } - qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request, s); + qemu_aio_set_fd_handler(fd, co_read_response, NULL, s); return fd; } @@ -1012,7 +1116,7 @@ out: return ret; } -static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, +static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, struct iovec *iov, int niov, bool create, enum AIOCBState aiocb_type) { @@ -1069,36 +1173,30 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, qemu_co_mutex_lock(&s->lock); s->co_send = qemu_coroutine_self(); - qemu_aio_set_fd_handler(s->fd, co_read_response, co_write_request, - aio_flush_request, s); + qemu_aio_set_fd_handler(s->fd, co_read_response, co_write_request, s); socket_set_cork(s->fd, 1); /* send a header */ ret = qemu_co_send(s->fd, &hdr, sizeof(hdr)); - if (ret < 0) { - qemu_co_mutex_unlock(&s->lock); + if (ret != sizeof(hdr)) { error_report("failed to send a req, %s", strerror(errno)); - return -errno; + goto out; } if (wlen) { ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset, wlen); - if (ret < 0) { - qemu_co_mutex_unlock(&s->lock); + if (ret != wlen) { error_report("failed to send a data, %s", strerror(errno)); - return -errno; } } - +out: socket_set_cork(s->fd, 0); - qemu_aio_set_fd_handler(s->fd, co_read_response, NULL, - aio_flush_request, s); + qemu_aio_set_fd_handler(s->fd, co_read_response, NULL, s); + s->co_send = NULL; qemu_co_mutex_unlock(&s->lock); - - return 0; } -static int read_write_object(int fd, char *buf, uint64_t oid, int copies, +static int read_write_object(int fd, char *buf, uint64_t oid, uint8_t copies, unsigned int datalen, uint64_t offset, bool write, bool create, uint32_t cache_flags) { @@ -1146,7 +1244,7 @@ static int read_write_object(int fd, char *buf, uint64_t oid, int copies, } } -static int read_object(int fd, char *buf, uint64_t oid, int copies, +static int read_object(int fd, char *buf, uint64_t oid, uint8_t copies, unsigned int datalen, uint64_t offset, uint32_t cache_flags) { @@ -1154,7 +1252,7 @@ static int read_object(int fd, char *buf, uint64_t oid, int copies, false, cache_flags); } -static int write_object(int fd, char *buf, uint64_t oid, int copies, +static int write_object(int fd, char *buf, uint64_t oid, uint8_t copies, unsigned int datalen, uint64_t offset, bool create, uint32_t cache_flags) { @@ -1198,51 +1296,62 @@ out: return ret; } -static int coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req) +/* Return true if the specified request is linked to the pending list. */ +static bool check_simultaneous_create(BDRVSheepdogState *s, AIOReq *aio_req) { - SheepdogAIOCB *acb = aio_req->aiocb; - bool create = false; - int ret; - - ret = reload_inode(s, 0, ""); - if (ret < 0) { - return ret; + AIOReq *areq; + QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) { + if (areq != aio_req && areq->oid == aio_req->oid) { + /* + * Sheepdog cannot handle simultaneous create requests to the same + * object, so we cannot send the request until the previous request + * finishes. 
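The comment continuing below belongs to check_simultaneous_create(): a create aimed at an object that already has a request in flight is parked on the pending queue, because Sheepdog cannot handle two simultaneous creates of one object. A toy standalone model of that dedup-by-oid test (a fixed array instead of the QLIST):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy model of the check: a request must be deferred while another
     * request to the same object ID is still in flight. */
    #define INFLIGHT_MAX 4
    static uint64_t inflight[INFLIGHT_MAX];
    static int n_inflight;

    static bool must_defer(uint64_t oid)
    {
        for (int i = 0; i < n_inflight; i++) {
            if (inflight[i] == oid) {
                return true;    /* park it on the pending queue */
            }
        }
        return false;
    }

    int main(void)
    {
        inflight[n_inflight++] = 0x8000000100000001ULL;
        printf("%d\n", must_defer(0x8000000100000001ULL));  /* prints 1 */
        printf("%d\n", must_defer(0x8000000100000002ULL));  /* prints 0 */
        return 0;
    }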
+ */ + DPRINTF("simultaneous create to %" PRIx64 "\n", aio_req->oid); + aio_req->flags = 0; + aio_req->base_oid = 0; + QLIST_REMOVE(aio_req, aio_siblings); + QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings); + return true; + } } - aio_req->oid = vid_to_data_oid(s->inode.vdi_id, - data_oid_to_idx(aio_req->oid)); + return false; +} + +static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req) +{ + SheepdogAIOCB *acb = aio_req->aiocb; + bool create = false; /* check whether this request becomes a CoW one */ - if (acb->aiocb_type == AIOCB_WRITE_UDATA) { + if (acb->aiocb_type == AIOCB_WRITE_UDATA && is_data_obj(aio_req->oid)) { int idx = data_oid_to_idx(aio_req->oid); - AIOReq *areq; - if (s->inode.data_vdi_id[idx] == 0) { - create = true; - goto out; - } if (is_data_obj_writable(&s->inode, idx)) { goto out; } - /* link to the pending list if there is another CoW request to - * the same object */ - QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) { - if (areq != aio_req && areq->oid == aio_req->oid) { - DPRINTF("simultaneous CoW to %" PRIx64 "\n", aio_req->oid); - QLIST_REMOVE(aio_req, aio_siblings); - QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings); - return SD_RES_SUCCESS; - } + if (check_simultaneous_create(s, aio_req)) { + return; } - aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx); - aio_req->flags |= SD_FLAG_CMD_COW; + if (s->inode.data_vdi_id[idx]) { + aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx); + aio_req->flags |= SD_FLAG_CMD_COW; + } create = true; } out: - return add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, - create, acb->aiocb_type); + if (is_data_obj(aio_req->oid)) { + add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, create, + acb->aiocb_type); + } else { + struct iovec iov; + iov.iov_base = &s->inode; + iov.iov_len = sizeof(s->inode); + add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA); + } } /* TODO Convert to fine grained options */ @@ -1259,7 +1368,8 @@ static QemuOptsList runtime_opts = { }, }; -static int sd_open(BlockDriverState *bs, QDict *options, int flags) +static int sd_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { int ret, fd; uint32_t vid = 0; @@ -1271,9 +1381,11 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags) Error *local_err = NULL; const char *filename; - opts = qemu_opts_create_nofail(&runtime_opts); + s->bs = bs; + + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { qerror_report_err(local_err); error_free(local_err); ret = -EINVAL; @@ -1284,6 +1396,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags) QLIST_INIT(&s->inflight_aio_head); QLIST_INIT(&s->pending_aio_head); + QLIST_INIT(&s->failed_aio_head); s->fd = -1; memset(vdi, 0, sizeof(vdi)); @@ -1350,7 +1463,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags) g_free(buf); return 0; out: - qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL); + qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL); if (s->fd >= 0) { closesocket(s->fd); } @@ -1359,8 +1472,7 @@ out: return ret; } -static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size, - uint32_t base_vid, uint32_t *vdi_id, int snapshot) +static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot) { SheepdogVdiReq hdr; SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; @@ -1377,11 +1489,11 
@@ static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size, * does not fit in buf? For now, just truncate and avoid buffer overrun. */ memset(buf, 0, sizeof(buf)); - pstrcpy(buf, sizeof(buf), filename); + pstrcpy(buf, sizeof(buf), s->name); memset(&hdr, 0, sizeof(hdr)); hdr.opcode = SD_OP_NEW_VDI; - hdr.vdi_id = base_vid; + hdr.base_vdi_id = s->inode.vdi_id; wlen = SD_MAX_VDI_LEN; @@ -1389,7 +1501,9 @@ static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size, hdr.snapid = snapshot; hdr.data_length = wlen; - hdr.vdi_size = vdi_size; + hdr.vdi_size = s->inode.vdi_size; + hdr.copy_policy = s->inode.copy_policy; + hdr.copies = s->inode.nr_copies; ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen); @@ -1400,7 +1514,7 @@ static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size, } if (rsp->result != SD_RES_SUCCESS) { - error_report("%s, %s", sd_strerror(rsp->result), filename); + error_report("%s, %s", sd_strerror(rsp->result), s->inode.name); return -EIO; } @@ -1417,10 +1531,14 @@ static int sd_prealloc(const char *filename) uint32_t idx, max_idx; int64_t vdi_size; void *buf = g_malloc0(SD_DATA_OBJ_SIZE); + Error *local_err = NULL; int ret; - ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR); + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + NULL, &local_err); if (ret < 0) { + qerror_report_err(local_err); + error_free(local_err); goto out; } @@ -1447,32 +1565,86 @@ static int sd_prealloc(const char *filename) } out: if (bs) { - bdrv_delete(bs); + bdrv_unref(bs); } g_free(buf); return ret; } -static int sd_create(const char *filename, QEMUOptionParameter *options) +/* + * Sheepdog supports two kinds of redundancy, full replication and erasure + * coding. + * + * # create a fully replicated vdi with x copies + * -o redundancy=x (1 <= x <= SD_MAX_COPIES) + * + * # create an erasure coded vdi with x data strips and y parity strips + * -o redundancy=x:y (x must be one of {2,4,8,16} and 1 <= y < SD_EC_MAX_STRIP) + */ +static int parse_redundancy(BDRVSheepdogState *s, const char *opt) +{ + struct SheepdogInode *inode = &s->inode; + const char *n1, *n2; + long copy, parity; + char p[10]; + + pstrcpy(p, sizeof(p), opt); + n1 = strtok(p, ":"); + n2 = strtok(NULL, ":"); + + if (!n1) { + return -EINVAL; + } + + copy = strtol(n1, NULL, 10); + if (copy > SD_MAX_COPIES || copy < 1) { + return -EINVAL; + } + if (!n2) { + inode->copy_policy = 0; + inode->nr_copies = copy; + return 0; + } + + if (copy != 2 && copy != 4 && copy != 8 && copy != 16) { + return -EINVAL; + } + + parity = strtol(n2, NULL, 10); + if (parity >= SD_EC_MAX_STRIP || parity < 1) { + return -EINVAL; + } + + /* + * 4 bits for parity and 4 bits for data.
+ * We have to compress upper data bits because it can't represent 16 + */ + inode->copy_policy = ((copy / 2) << 4) + parity; + inode->nr_copies = copy + parity; + + return 0; +} + +static int sd_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int ret = 0; - uint32_t vid = 0, base_vid = 0; - int64_t vdi_size = 0; + uint32_t vid = 0; char *backing_file = NULL; BDRVSheepdogState *s; - char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN]; + char tag[SD_MAX_VDI_TAG_LEN]; uint32_t snapid; bool prealloc = false; + Error *local_err = NULL; s = g_malloc0(sizeof(BDRVSheepdogState)); - memset(vdi, 0, sizeof(vdi)); memset(tag, 0, sizeof(tag)); if (strstr(filename, "://")) { - ret = sd_parse_uri(s, filename, vdi, &snapid, tag); + ret = sd_parse_uri(s, filename, s->name, &snapid, tag); } else { - ret = parse_vdiname(s, filename, vdi, &snapid, tag); + ret = parse_vdiname(s, filename, s->name, &snapid, tag); } if (ret < 0) { goto out; @@ -1480,7 +1652,7 @@ static int sd_create(const char *filename, QEMUOptionParameter *options) while (options && options->name) { if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - vdi_size = options->value.n; + s->inode.vdi_size = options->value.n; } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { backing_file = options->value.s; } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) { @@ -1494,11 +1666,18 @@ static int sd_create(const char *filename, QEMUOptionParameter *options) ret = -EINVAL; goto out; } + } else if (!strcmp(options->name, BLOCK_OPT_REDUNDANCY)) { + if (options->value.s) { + ret = parse_redundancy(s, options->value.s); + if (ret < 0) { + goto out; + } + } } options++; } - if (vdi_size > SD_MAX_VDI_SIZE) { + if (s->inode.vdi_size > SD_MAX_VDI_SIZE) { error_report("too big image size"); ret = -EINVAL; goto out; @@ -1506,7 +1685,7 @@ static int sd_create(const char *filename, QEMUOptionParameter *options) if (backing_file) { BlockDriverState *bs; - BDRVSheepdogState *s; + BDRVSheepdogState *base; BlockDriver *drv; /* Currently, only Sheepdog backing image is supported. 
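That packing squeezes both strip counts into one byte: the data-strip count, always a power of two up to 16, is halved into the high nibble, and the parity count occupies the low nibble. A worked example for "-o redundancy=4:2":

    #include <stdio.h>

    int main(void)
    {
        /* "-o redundancy=4:2": 4 data strips, 2 parity strips. */
        long copy = 4, parity = 2;

        /* Same packing as parse_redundancy(): 4 bits of parity, and the
         * data strip count stored halved so that 16 still fits in 4 bits. */
        unsigned char copy_policy = ((copy / 2) << 4) + parity;
        unsigned char nr_copies = copy + parity;

        printf("copy_policy=0x%02x nr_copies=%d\n",
               (unsigned)copy_policy, (int)nr_copies);
        /* prints: copy_policy=0x22 nr_copies=6 */
        return 0;
    }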
*/ @@ -1517,25 +1696,28 @@ static int sd_create(const char *filename, QEMUOptionParameter *options) goto out; } - ret = bdrv_file_open(&bs, backing_file, NULL, 0); + bs = NULL; + ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_PROTOCOL, NULL, + &local_err); if (ret < 0) { + qerror_report_err(local_err); + error_free(local_err); goto out; } - s = bs->opaque; + base = bs->opaque; - if (!is_snapshot(&s->inode)) { + if (!is_snapshot(&base->inode)) { error_report("cannot clone from a non snapshot vdi"); - bdrv_delete(bs); + bdrv_unref(bs); ret = -EINVAL; goto out; } - - base_vid = s->inode.vdi_id; - bdrv_delete(bs); + s->inode.vdi_id = base->inode.vdi_id; + bdrv_unref(bs); } - ret = do_sd_create(s, vdi, vdi_size, base_vid, &vid, 0); + ret = do_sd_create(s, &vid, 0); if (!prealloc || ret) { goto out; } @@ -1564,7 +1746,7 @@ static void sd_close(BlockDriverState *bs) memset(&hdr, 0, sizeof(hdr)); hdr.opcode = SD_OP_RELEASE_VDI; - hdr.vdi_id = s->inode.vdi_id; + hdr.base_vdi_id = s->inode.vdi_id; wlen = strlen(s->name) + 1; hdr.data_length = wlen; hdr.flags = SD_FLAG_CMD_WRITE; @@ -1578,7 +1760,7 @@ static void sd_close(BlockDriverState *bs) error_report("%s, %s", sd_strerror(rsp->result), s->name); } - qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL); + qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL); closesocket(s->fd); g_free(s->host_spec); } @@ -1630,7 +1812,6 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset) */ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb) { - int ret; BDRVSheepdogState *s = acb->common.bs->opaque; struct iovec iov; AIOReq *aio_req; @@ -1652,18 +1833,13 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb) aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), data_len, offset, 0, 0, offset); QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - ret = add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA); - if (ret) { - free_aio_req(s, aio_req); - acb->ret = -EIO; - goto out; - } + add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA); acb->aio_done_func = sd_finish_aiocb; acb->aiocb_type = AIOCB_WRITE_UDATA; return; } -out: + sd_finish_aiocb(acb); } @@ -1673,7 +1849,7 @@ static bool sd_delete(BDRVSheepdogState *s) unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0; SheepdogVdiReq hdr = { .opcode = SD_OP_DEL_VDI, - .vdi_id = s->inode.vdi_id, + .base_vdi_id = s->inode.vdi_id, .data_length = wlen, .flags = SD_FLAG_CMD_WRITE, }; @@ -1720,12 +1896,11 @@ static int sd_create_branch(BDRVSheepdogState *s) /* * Even If deletion fails, we will just create extra snapshot based on - * the workding VDI which was supposed to be deleted. So no need to + * the working VDI which was supposed to be deleted. So no need to * false bail out. */ deleted = sd_delete(s); - ret = do_sd_create(s, s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, - !deleted); + ret = do_sd_create(s, &vid, !deleted); if (ret) { goto out; } @@ -1849,35 +2024,16 @@ static int coroutine_fn sd_co_rw_vector(void *p) } aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done); + QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); if (create) { - AIOReq *areq; - QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) { - if (areq->oid == oid) { - /* - * Sheepdog cannot handle simultaneous create - * requests to the same object. So we cannot send - * the request until the previous request - * finishes. 
- */ - aio_req->flags = 0; - aio_req->base_oid = 0; - QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, - aio_siblings); - goto done; - } + if (check_simultaneous_create(s, aio_req)) { + goto done; } } - QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, - create, acb->aiocb_type); - if (ret < 0) { - error_report("add_aio_request is failed"); - free_aio_req(s, aio_req); - acb->ret = -EIO; - goto out; - } + add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, create, + acb->aiocb_type); done: offset = 0; idx++; @@ -1895,13 +2051,14 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, { SheepdogAIOCB *acb; int ret; + int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE; + BDRVSheepdogState *s = bs->opaque; - if (bs->growable && sector_num + nb_sectors > bs->total_sectors) { - ret = sd_truncate(bs, (sector_num + nb_sectors) * BDRV_SECTOR_SIZE); + if (bs->growable && offset > s->inode.vdi_size) { + ret = sd_truncate(bs, offset); if (ret < 0) { return ret; } - bs->total_sectors = sector_num + nb_sectors; } acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors); @@ -1945,7 +2102,6 @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs) BDRVSheepdogState *s = bs->opaque; SheepdogAIOCB *acb; AIOReq *aio_req; - int ret; if (s->cache_flags != SD_FLAG_CMD_CACHE) { return 0; @@ -1958,13 +2114,7 @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs) aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), 0, 0, 0, 0, 0); QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - ret = add_aio_request(s, aio_req, NULL, 0, false, acb->aiocb_type); - if (ret < 0) { - error_report("add_aio_request is failed"); - free_aio_req(s, aio_req); - qemu_aio_release(acb); - return ret; - } + add_aio_request(s, aio_req, NULL, 0, false, acb->aiocb_type); qemu_coroutine_yield(); return acb->ret; @@ -2014,8 +2164,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) goto cleanup; } - ret = do_sd_create(s, s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, - 1); + ret = do_sd_create(s, &new_vid, 1); if (ret < 0) { error_report("failed to create inode for snapshot. %s", strerror(errno)); @@ -2045,7 +2194,7 @@ cleanup: * We implement rollback(loadvm) operation to the specified snapshot by * 1) switch to the snapshot * 2) rely on sd_create_branch to delete working VDI and - * 3) create a new working VDI based on the speicified snapshot + * 3) create a new working VDI based on the specified snapshot */ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) { @@ -2089,7 +2238,10 @@ out: return ret; } -static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +static int sd_snapshot_delete(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp) { /* FIXME: Delete specified snapshot id. 
*/ return 0; @@ -2287,17 +2439,18 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, return acb->ret; } -static coroutine_fn int -sd_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - int *pnum) +static coroutine_fn int64_t +sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, + int *pnum) { BDRVSheepdogState *s = bs->opaque; SheepdogInode *inode = &s->inode; - unsigned long start = sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE, + uint64_t offset = sector_num * BDRV_SECTOR_SIZE; + unsigned long start = offset / SD_DATA_OBJ_SIZE, end = DIV_ROUND_UP((sector_num + nb_sectors) * BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE); unsigned long idx; - int ret = 1; + int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; for (idx = start; idx < end; idx++) { if (inode->data_vdi_id[idx] == 0) { @@ -2321,6 +2474,22 @@ sd_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, return ret; } +static int64_t sd_get_allocated_file_size(BlockDriverState *bs) +{ + BDRVSheepdogState *s = bs->opaque; + SheepdogInode *inode = &s->inode; + unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE); + uint64_t size = 0; + + for (i = 0; i < last; i++) { + if (inode->data_vdi_id[i] == 0) { + continue; + } + size += SD_DATA_OBJ_SIZE; + } + return size; +} + static QEMUOptionParameter sd_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -2337,6 +2506,11 @@ static QEMUOptionParameter sd_create_options[] = { .type = OPT_STRING, .help = "Preallocation mode (allowed values: off, full)" }, + { + .name = BLOCK_OPT_REDUNDANCY, + .type = OPT_STRING, + .help = "Redundancy of the image" + }, { NULL } }; @@ -2344,18 +2518,20 @@ static BlockDriver bdrv_sheepdog = { .format_name = "sheepdog", .protocol_name = "sheepdog", .instance_size = sizeof(BDRVSheepdogState), + .bdrv_needs_filename = true, .bdrv_file_open = sd_open, .bdrv_close = sd_close, .bdrv_create = sd_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_getlength = sd_getlength, + .bdrv_get_allocated_file_size = sd_get_allocated_file_size, .bdrv_truncate = sd_truncate, .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, .bdrv_co_discard = sd_co_discard, - .bdrv_co_is_allocated = sd_co_is_allocated, + .bdrv_co_get_block_status = sd_co_get_block_status, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, @@ -2372,18 +2548,20 @@ static BlockDriver bdrv_sheepdog_tcp = { .format_name = "sheepdog", .protocol_name = "sheepdog+tcp", .instance_size = sizeof(BDRVSheepdogState), + .bdrv_needs_filename = true, .bdrv_file_open = sd_open, .bdrv_close = sd_close, .bdrv_create = sd_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_getlength = sd_getlength, + .bdrv_get_allocated_file_size = sd_get_allocated_file_size, .bdrv_truncate = sd_truncate, .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, .bdrv_co_discard = sd_co_discard, - .bdrv_co_is_allocated = sd_co_is_allocated, + .bdrv_co_get_block_status = sd_co_get_block_status, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, @@ -2400,18 +2578,20 @@ static BlockDriver bdrv_sheepdog_unix = { .format_name = "sheepdog", .protocol_name = "sheepdog+unix", .instance_size = sizeof(BDRVSheepdogState), + .bdrv_needs_filename = true, .bdrv_file_open = sd_open, .bdrv_close = sd_close, .bdrv_create = sd_create, .bdrv_has_zero_init = 
bdrv_has_zero_init_1, .bdrv_getlength = sd_getlength, + .bdrv_get_allocated_file_size = sd_get_allocated_file_size, .bdrv_truncate = sd_truncate, .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, .bdrv_co_discard = sd_co_discard, - .bdrv_co_is_allocated = sd_co_is_allocated, + .bdrv_co_get_block_status = sd_co_get_block_status, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, diff --git a/block/snapshot.c b/block/snapshot.c index 6c6d9deea..85c52ff45 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -25,6 +25,24 @@ #include "block/snapshot.h" #include "block/block_int.h" +QemuOptsList internal_snapshot_opts = { + .name = "snapshot", + .head = QTAILQ_HEAD_INITIALIZER(internal_snapshot_opts.head), + .desc = { + { + .name = SNAPSHOT_OPT_ID, + .type = QEMU_OPT_STRING, + .help = "snapshot id" + },{ + .name = SNAPSHOT_OPT_NAME, + .type = QEMU_OPT_STRING, + .help = "snapshot name" + },{ + /* end of list */ + } + }, +}; + int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info, const char *name) { @@ -48,6 +66,79 @@ int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info, return ret; } +/** + * Look up an internal snapshot by @id and @name. + * @bs: block device to search + * @id: unique snapshot ID, or NULL + * @name: snapshot name, or NULL + * @sn_info: location to store information on the snapshot found + * @errp: location to store error; set only on real failures, not when no + * snapshot matches + * + * This function traverses the snapshot list in @bs to find a matching + * snapshot; @id and @name are the matching conditions: + * If both @id and @name are specified, find the first one with id @id and + * name @name. + * If only @id is specified, find the first one with id @id. + * If only @name is specified, find the first one with name @name. + * If none is specified, abort(). + * + * Returns: true when a snapshot is found and @sn_info has been filled, false + * on error or when no snapshot matches. If all operations succeed but no + * matching snapshot is found, @errp will NOT be set.
+ */ +bool bdrv_snapshot_find_by_id_and_name(BlockDriverState *bs, + const char *id, + const char *name, + QEMUSnapshotInfo *sn_info, + Error **errp) +{ + QEMUSnapshotInfo *sn_tab, *sn; + int nb_sns, i; + bool ret = false; + + assert(id || name); + + nb_sns = bdrv_snapshot_list(bs, &sn_tab); + if (nb_sns < 0) { + error_setg_errno(errp, -nb_sns, "Failed to get a snapshot list"); + return false; + } else if (nb_sns == 0) { + return false; + } + + if (id && name) { + for (i = 0; i < nb_sns; i++) { + sn = &sn_tab[i]; + if (!strcmp(sn->id_str, id) && !strcmp(sn->name, name)) { + *sn_info = *sn; + ret = true; + break; + } + } + } else if (id) { + for (i = 0; i < nb_sns; i++) { + sn = &sn_tab[i]; + if (!strcmp(sn->id_str, id)) { + *sn_info = *sn; + ret = true; + break; + } + } + } else if (name) { + for (i = 0; i < nb_sns; i++) { + sn = &sn_tab[i]; + if (!strcmp(sn->name, name)) { + *sn_info = *sn; + ret = true; + break; + } + } + } + + g_free(sn_tab); + return ret; +} + int bdrv_can_snapshot(BlockDriverState *bs) { BlockDriver *drv = bs->drv; @@ -97,9 +188,9 @@ int bdrv_snapshot_goto(BlockDriverState *bs, if (bs->file) { drv->bdrv_close(bs); ret = bdrv_snapshot_goto(bs->file, snapshot_id); - open_ret = drv->bdrv_open(bs, NULL, bs->open_flags); + open_ret = drv->bdrv_open(bs, NULL, bs->open_flags, NULL); if (open_ret < 0) { - bdrv_delete(bs->file); + bdrv_unref(bs->file); bs->drv = NULL; return open_ret; } @@ -109,21 +200,73 @@ int bdrv_snapshot_goto(BlockDriverState *bs, return -ENOTSUP; } -int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +/** + * Delete an internal snapshot by @snapshot_id and @name. + * @bs: block device used in the operation + * @snapshot_id: unique snapshot ID, or NULL + * @name: snapshot name, or NULL + * @errp: location to store error + * + * If both @snapshot_id and @name are specified, delete the first one with + * id @snapshot_id and name @name. + * If only @snapshot_id is specified, delete the first one with id + * @snapshot_id. + * If only @name is specified, delete the first one with name @name. + * If none is specified, return -EINVAL. + * + * Returns: 0 on success, -errno on failure. If @bs is not inserted, return + * -ENOMEDIUM. If @snapshot_id and @name are both NULL, return -EINVAL. If @bs + * does not support internal snapshot deletion, return -ENOTSUP. If @bs does + * not support parameter @snapshot_id or @name, or one of them is not correctly + * specified, return -EINVAL. If @bs can't find one matching @id and @name, + * return -ENOENT. If @errp != NULL, it will always be filled with an error + * message on failure.
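For illustration, a hypothetical caller (not part of this patch) could resolve a user-supplied string that may be either a snapshot ID or a snapshot name by trying both, mirroring the *_by_id_or_name helpers added below; the -EIO return here is arbitrary:

static int resolve_snapshot(BlockDriverState *bs, const char *id_or_name,
                            QEMUSnapshotInfo *sn, Error **errp)
{
    Error *local_err = NULL;

    /* Try the string as an ID first, then fall back to matching by name. */
    if (bdrv_snapshot_find_by_id_and_name(bs, id_or_name, NULL, sn,
                                          &local_err)) {
        return 0;
    }
    if (local_err) {
        error_propagate(errp, local_err);
        return -EIO;    /* real error during the lookup */
    }
    if (bdrv_snapshot_find_by_id_and_name(bs, NULL, id_or_name, sn,
                                          &local_err)) {
        return 0;
    }
    if (local_err) {
        error_propagate(errp, local_err);
        return -EIO;
    }
    return -ENOENT;     /* lookup succeeded but nothing matched */
}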
+ */ +int bdrv_snapshot_delete(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp) { BlockDriver *drv = bs->drv; if (!drv) { + error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); return -ENOMEDIUM; } + if (!snapshot_id && !name) { + error_setg(errp, "snapshot_id and name are both NULL"); + return -EINVAL; + } if (drv->bdrv_snapshot_delete) { - return drv->bdrv_snapshot_delete(bs, snapshot_id); + return drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp); } if (bs->file) { - return bdrv_snapshot_delete(bs->file, snapshot_id); + return bdrv_snapshot_delete(bs->file, snapshot_id, name, errp); } + error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + drv->format_name, bdrv_get_device_name(bs), + "internal snapshot deletion"); return -ENOTSUP; } +void bdrv_snapshot_delete_by_id_or_name(BlockDriverState *bs, + const char *id_or_name, + Error **errp) +{ + int ret; + Error *local_err = NULL; + + ret = bdrv_snapshot_delete(bs, id_or_name, NULL, &local_err); + if (ret == -ENOENT || ret == -EINVAL) { + error_free(local_err); + local_err = NULL; + ret = bdrv_snapshot_delete(bs, NULL, id_or_name, &local_err); + } + + if (ret < 0) { + error_propagate(errp, local_err); + } +} + int bdrv_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_info) { @@ -140,18 +283,71 @@ int bdrv_snapshot_list(BlockDriverState *bs, return -ENOTSUP; } +/** + * Temporarily load an internal snapshot by @snapshot_id and @name. + * @bs: block device used in the operation + * @snapshot_id: unique snapshot ID, or NULL + * @name: snapshot name, or NULL + * @errp: location to store error + * + * If both @snapshot_id and @name are specified, load the first one with + * id @snapshot_id and name @name. + * If only @snapshot_id is specified, load the first one with id + * @snapshot_id. + * If only @name is specified, load the first one with name @name. + * If none is specified, return -EINVAL. + * + * Returns: 0 on success, -errno on failure. If @bs is not inserted, return + * -ENOMEDIUM. If @bs is not readonly, return -EINVAL. If @bs does not support + * internal snapshots, return -ENOTSUP. If qemu can't find a matching @id and + * @name, return -ENOENT. If @errp != NULL, it will always be filled on + * failure.
+ */ int bdrv_snapshot_load_tmp(BlockDriverState *bs, - const char *snapshot_name) + const char *snapshot_id, + const char *name, + Error **errp) { BlockDriver *drv = bs->drv; + if (!drv) { + error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); return -ENOMEDIUM; } + if (!snapshot_id && !name) { + error_setg(errp, "snapshot_id and name are both NULL"); + return -EINVAL; + } if (!bs->read_only) { + error_setg(errp, "Device is not readonly"); return -EINVAL; } if (drv->bdrv_snapshot_load_tmp) { - return drv->bdrv_snapshot_load_tmp(bs, snapshot_name); + return drv->bdrv_snapshot_load_tmp(bs, snapshot_id, name, errp); } + error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + drv->format_name, bdrv_get_device_name(bs), + "temporarily load internal snapshot"); return -ENOTSUP; } + +int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, + const char *id_or_name, + Error **errp) +{ + int ret; + Error *local_err = NULL; + + ret = bdrv_snapshot_load_tmp(bs, id_or_name, NULL, &local_err); + if (ret == -ENOENT || ret == -EINVAL) { + error_free(local_err); + local_err = NULL; + ret = bdrv_snapshot_load_tmp(bs, NULL, id_or_name, &local_err); + } + + if (local_err) { + error_propagate(errp, local_err); + } + + return ret; +} diff --git a/block/ssh.c b/block/ssh.c index d7e7bf8dd..aa63c9d20 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -608,7 +608,8 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options, return ret; } -static int ssh_file_open(BlockDriverState *bs, QDict *options, int bdrv_flags) +static int ssh_file_open(BlockDriverState *bs, QDict *options, int bdrv_flags, + Error **errp) { BDRVSSHState *s = bs->opaque; int ret; @@ -650,7 +651,8 @@ static QEMUOptionParameter ssh_create_options[] = { { NULL } }; -static int ssh_create(const char *filename, QEMUOptionParameter *options) +static int ssh_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int r, ret; Error *local_err = NULL; @@ -740,14 +742,6 @@ static void restart_coroutine(void *opaque) qemu_coroutine_enter(co, NULL); } -/* Always true because when we have called set_fd_handler there is - * always a request being processed. - */ -static int return_true(void *opaque) -{ - return 1; -} - static coroutine_fn void set_fd_handler(BDRVSSHState *s) { int r; @@ -766,13 +760,13 @@ static coroutine_fn void set_fd_handler(BDRVSSHState *s) DPRINTF("s->sock=%d rd_handler=%p wr_handler=%p", s->sock, rd_handler, wr_handler); - qemu_aio_set_fd_handler(s->sock, rd_handler, wr_handler, return_true, co); + qemu_aio_set_fd_handler(s->sock, rd_handler, wr_handler, co); } static coroutine_fn void clear_fd_handler(BDRVSSHState *s) { DPRINTF("s->sock=%d", s->sock); - qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL, NULL); + qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL); } /* A non-blocking call returned EAGAIN, so yield, ensuring the diff --git a/block/stream.c b/block/stream.c index 7fe9e486b..dd0b4ac3d 100644 --- a/block/stream.c +++ b/block/stream.c @@ -57,6 +57,11 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base, BlockDriverState *intermediate; intermediate = top->backing_hd; + /* Must assign before bdrv_delete() to prevent traversing a dangling pointer + * while we delete backing image instances.
+ */ + top->backing_hd = base; + while (intermediate) { BlockDriverState *unused; @@ -68,9 +73,10 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base, unused = intermediate; intermediate = intermediate->backing_hd; unused->backing_hd = NULL; - bdrv_delete(unused); + bdrv_unref(unused); } - top->backing_hd = base; + + bdrv_refresh_limits(top); } static void coroutine_fn stream_run(void *opaque) @@ -84,6 +90,11 @@ static void coroutine_fn stream_run(void *opaque) int n = 0; void *buf; + if (!bs->backing_hd) { + block_job_completed(&s->common, 0); + return; + } + s->common.len = bdrv_getlength(bs); if (s->common.len < 0) { block_job_completed(&s->common, s->common.len); @@ -110,21 +121,22 @@ wait: /* Note that even when no rate limit is applied we need to yield * with no pending I/O here so that bdrv_drain_all() returns. */ - block_job_sleep_ns(&s->common, rt_clock, delay_ns); + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); if (block_job_is_cancelled(&s->common)) { break; } - ret = bdrv_co_is_allocated(bs, sector_num, - STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n); + copy = false; + + ret = bdrv_is_allocated(bs, sector_num, + STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n); if (ret == 1) { /* Allocated in the top, no need to copy. */ - copy = false; - } else { + } else if (ret >= 0) { /* Copy if allocated in the intermediate images. Limit to the * known-unallocated area [sector_num, sector_num+n). */ - ret = bdrv_co_is_allocated_above(bs->backing_hd, base, - sector_num, n, &n); + ret = bdrv_is_allocated_above(bs->backing_hd, base, + sector_num, n, &n); /* Finish early if end of backing file has been reached */ if (ret == 0 && n == 0) { @@ -134,7 +146,7 @@ wait: copy = (ret == 1); } trace_stream_one_iteration(s, sector_num, n, ret); - if (ret >= 0 && copy) { + if (copy) { if (s->common.speed) { delay_ns = ratelimit_calculate_delay(&s->limit, n); if (delay_ns > 0) { @@ -198,9 +210,9 @@ static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp) ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); } -static const BlockJobType stream_job_type = { +static const BlockJobDriver stream_job_driver = { .instance_size = sizeof(StreamBlockJob), - .job_type = "stream", + .job_type = BLOCK_JOB_TYPE_STREAM, .set_speed = stream_set_speed, }; @@ -219,7 +231,7 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base, return; } - s = block_job_create(&stream_job_type, bs, speed, cb, opaque, errp); + s = block_job_create(&stream_job_driver, bs, speed, cb, opaque, errp); if (!s) { return; } diff --git a/block/vdi.c b/block/vdi.c index 8a915257e..820cd376b 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -31,7 +31,7 @@ * Allocation of blocks could be optimized (less writes to block map and * header). * - * Read and write of adjacents blocks could be done in one operation + * Read and write of adjacent blocks could be done in one operation * (current code uses one operation per block (1 MiB). 
* * The code is not thread safe (missing locks for changes in header and @@ -120,6 +120,11 @@ typedef unsigned char uuid_t[16]; #define VDI_IS_ALLOCATED(X) ((X) < VDI_DISCARDED) +/* max blocks in image is (0xffffffff / 4) */ +#define VDI_BLOCKS_IN_IMAGE_MAX 0x3fffffff +#define VDI_DISK_SIZE_MAX ((uint64_t)VDI_BLOCKS_IN_IMAGE_MAX * \ + (uint64_t)DEFAULT_CLUSTER_SIZE) + #if !defined(CONFIG_UUID) static inline void uuid_generate(uuid_t out) { @@ -165,7 +170,7 @@ typedef struct { uuid_t uuid_link; uuid_t uuid_parent; uint64_t unused2[7]; -} VdiHeader; +} QEMU_PACKED VdiHeader; typedef struct { /* The block map entries are little endian (even in memory). */ @@ -331,6 +336,7 @@ static int vdi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) logout("\n"); bdi->cluster_size = s->block_size; bdi->vm_state_offset = 0; + bdi->unallocated_blocks_are_zero = true; return 0; } @@ -364,7 +370,8 @@ static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename) return result; } -static int vdi_open(BlockDriverState *bs, QDict *options, int flags) +static int vdi_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVVdiState *s = bs->opaque; VdiHeader header; @@ -383,6 +390,14 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags) vdi_header_print(&header); #endif + if (header.disk_size > VDI_DISK_SIZE_MAX) { + error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64 + ", max supported is 0x%" PRIx64 ")", + header.disk_size, VDI_DISK_SIZE_MAX); + ret = -ENOTSUP; + goto fail; + } + if (header.disk_size % SECTOR_SIZE != 0) { /* 'VBoxManage convertfromraw' can create images with odd disk sizes. We accept them but round the disk size to the next multiple of @@ -393,43 +408,56 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags) } if (header.signature != VDI_SIGNATURE) { - logout("bad vdi signature %08x\n", header.signature); - ret = -EMEDIUMTYPE; + error_setg(errp, "Image not in VDI format (bad signature %08x)", header.signature); + ret = -EINVAL; goto fail; } else if (header.version != VDI_VERSION_1_1) { - logout("unsupported version %u.%u\n", - header.version >> 16, header.version & 0xffff); + error_setg(errp, "unsupported VDI image (version %u.%u)", + header.version >> 16, header.version & 0xffff); ret = -ENOTSUP; goto fail; } else if (header.offset_bmap % SECTOR_SIZE != 0) { /* We only support block maps which start on a sector boundary. */ - logout("unsupported block map offset 0x%x B\n", header.offset_bmap); + error_setg(errp, "unsupported VDI image (unaligned block map offset " + "0x%x)", header.offset_bmap); ret = -ENOTSUP; goto fail; } else if (header.offset_data % SECTOR_SIZE != 0) { /* We only support data blocks which start on a sector boundary. 
*/ - logout("unsupported data offset 0x%x B\n", header.offset_data); + error_setg(errp, "unsupported VDI image (unaligned data offset 0x%x)", + header.offset_data); ret = -ENOTSUP; goto fail; } else if (header.sector_size != SECTOR_SIZE) { - logout("unsupported sector size %u B\n", header.sector_size); + error_setg(errp, "unsupported VDI image (sector size %u is not %u)", + header.sector_size, SECTOR_SIZE); ret = -ENOTSUP; goto fail; - } else if (header.block_size != 1 * MiB) { - logout("unsupported block size %u B\n", header.block_size); + } else if (header.block_size != DEFAULT_CLUSTER_SIZE) { + error_setg(errp, "unsupported VDI image (block size %u is not %u)", + header.block_size, DEFAULT_CLUSTER_SIZE); ret = -ENOTSUP; goto fail; } else if (header.disk_size > (uint64_t)header.blocks_in_image * header.block_size) { - logout("unsupported disk size %" PRIu64 " B\n", header.disk_size); + error_setg(errp, "unsupported VDI image (disk size %" PRIu64 ", " + "image bitmap has room for %" PRIu64 ")", + header.disk_size, + (uint64_t)header.blocks_in_image * header.block_size); ret = -ENOTSUP; goto fail; } else if (!uuid_is_null(header.uuid_link)) { - logout("link uuid != 0, unsupported\n"); + error_setg(errp, "unsupported VDI image (non-NULL link UUID)"); ret = -ENOTSUP; goto fail; } else if (!uuid_is_null(header.uuid_parent)) { - logout("parent uuid != 0, unsupported\n"); + error_setg(errp, "unsupported VDI image (non-NULL parent UUID)"); + ret = -ENOTSUP; + goto fail; + } else if (header.blocks_in_image > VDI_BLOCKS_IN_IMAGE_MAX) { + error_setg(errp, "unsupported VDI image " + "(too many blocks %u, max is %u)", + header.blocks_in_image, VDI_BLOCKS_IN_IMAGE_MAX); ret = -ENOTSUP; goto fail; } @@ -470,7 +498,7 @@ static int vdi_reopen_prepare(BDRVReopenState *state, return 0; } -static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { /* TODO: Check for too large sector_num (in bdrv_is_allocated or here). 
*/ @@ -479,12 +507,23 @@ static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs, size_t sector_in_block = sector_num % s->block_sectors; int n_sectors = s->block_sectors - sector_in_block; uint32_t bmap_entry = le32_to_cpu(s->bmap[bmap_index]); + uint64_t offset; + int result; + logout("%p, %" PRId64 ", %d, %p\n", bs, sector_num, nb_sectors, pnum); if (n_sectors > nb_sectors) { n_sectors = nb_sectors; } *pnum = n_sectors; - return VDI_IS_ALLOCATED(bmap_entry); + result = VDI_IS_ALLOCATED(bmap_entry); + if (!result) { + return 0; + } + + offset = s->header.offset_data + + (uint64_t)bmap_entry * s->block_size + + sector_in_block * SECTOR_SIZE; + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; } static int vdi_co_read(BlockDriverState *bs, @@ -633,7 +672,8 @@ static int vdi_co_write(BlockDriverState *bs, return ret; } -static int vdi_create(const char *filename, QEMUOptionParameter *options) +static int vdi_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int fd; int result = 0; @@ -668,11 +708,20 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options) options++; } + if (bytes > VDI_DISK_SIZE_MAX) { + result = -ENOTSUP; + error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64 + ", max supported is 0x%" PRIx64 ")", + bytes, VDI_DISK_SIZE_MAX); + goto exit; + } + fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, 0644); if (fd < 0) { - return -errno; + result = -errno; + goto exit; } /* We need enough blocks to store the given disk size, @@ -733,6 +782,7 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options) result = -errno; } +exit: return result; } @@ -780,7 +830,7 @@ static BlockDriver bdrv_vdi = { .bdrv_reopen_prepare = vdi_reopen_prepare, .bdrv_create = vdi_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_is_allocated = vdi_co_is_allocated, + .bdrv_co_get_block_status = vdi_co_get_block_status, .bdrv_make_empty = vdi_make_empty, .bdrv_read = vdi_co_read, diff --git a/block/vhdx-endian.c b/block/vhdx-endian.c new file mode 100644 index 000000000..fe879ed99 --- /dev/null +++ b/block/vhdx-endian.c @@ -0,0 +1,216 @@ +/* + * Block driver for Hyper-V VHDX Images + * + * Copyright (c) 2013 Red Hat, Inc., + * + * Authors: + * Jeff Cody <jcody@redhat.com> + * + * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 + * by Microsoft: + * https://www.microsoft.com/en-us/download/details.aspx?id=34750 + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/vhdx.h" + +#include <uuid/uuid.h> + + +/* + * All the VHDX formats on disk are little endian - the following + * are helper import/export functions to correctly convert + * endianness from disk read to native cpu format, and back again. 
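To illustrate the intended usage of these helpers (hypothetical function, for illustration only): structures are read from disk in little-endian form, converted once on import, and converted back on export, so the rest of the driver works purely in native CPU endianness:

static int example_read_vhdx_header(BlockDriverState *bs, uint64_t offset,
                                    VHDXHeader *header)
{
    /* On-disk data is little endian; convert to CPU order after reading. */
    int ret = bdrv_pread(bs->file, offset, header, sizeof(*header));
    if (ret < 0) {
        return ret;
    }
    vhdx_header_le_import(header);
    /* header->sequence_number etc. are now in native CPU endianness. */
    return 0;
}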
+ */ + + +/* VHDX File Header */ + + +void vhdx_header_le_import(VHDXHeader *h) +{ + assert(h != NULL); + + le32_to_cpus(&h->signature); + le32_to_cpus(&h->checksum); + le64_to_cpus(&h->sequence_number); + + leguid_to_cpus(&h->file_write_guid); + leguid_to_cpus(&h->data_write_guid); + leguid_to_cpus(&h->log_guid); + + le16_to_cpus(&h->log_version); + le16_to_cpus(&h->version); + le32_to_cpus(&h->log_length); + le64_to_cpus(&h->log_offset); +} + +void vhdx_header_le_export(VHDXHeader *orig_h, VHDXHeader *new_h) +{ + assert(orig_h != NULL); + assert(new_h != NULL); + + new_h->signature = cpu_to_le32(orig_h->signature); + new_h->checksum = cpu_to_le32(orig_h->checksum); + new_h->sequence_number = cpu_to_le64(orig_h->sequence_number); + + new_h->file_write_guid = orig_h->file_write_guid; + new_h->data_write_guid = orig_h->data_write_guid; + new_h->log_guid = orig_h->log_guid; + + cpu_to_leguids(&new_h->file_write_guid); + cpu_to_leguids(&new_h->data_write_guid); + cpu_to_leguids(&new_h->log_guid); + + new_h->log_version = cpu_to_le16(orig_h->log_version); + new_h->version = cpu_to_le16(orig_h->version); + new_h->log_length = cpu_to_le32(orig_h->log_length); + new_h->log_offset = cpu_to_le64(orig_h->log_offset); +} + + +/* VHDX Log Headers */ + + +void vhdx_log_desc_le_import(VHDXLogDescriptor *d) +{ + assert(d != NULL); + + le32_to_cpus(&d->signature); + le32_to_cpus(&d->trailing_bytes); + le64_to_cpus(&d->leading_bytes); + le64_to_cpus(&d->file_offset); + le64_to_cpus(&d->sequence_number); +} + +void vhdx_log_desc_le_export(VHDXLogDescriptor *d) +{ + assert(d != NULL); + + cpu_to_le32s(&d->signature); + cpu_to_le32s(&d->trailing_bytes); + cpu_to_le64s(&d->leading_bytes); + cpu_to_le64s(&d->file_offset); + cpu_to_le64s(&d->sequence_number); +} + +void vhdx_log_data_le_export(VHDXLogDataSector *d) +{ + assert(d != NULL); + + cpu_to_le32s(&d->data_signature); + cpu_to_le32s(&d->sequence_high); + cpu_to_le32s(&d->sequence_low); +} + +void vhdx_log_entry_hdr_le_import(VHDXLogEntryHeader *hdr) +{ + assert(hdr != NULL); + + le32_to_cpus(&hdr->signature); + le32_to_cpus(&hdr->checksum); + le32_to_cpus(&hdr->entry_length); + le32_to_cpus(&hdr->tail); + le64_to_cpus(&hdr->sequence_number); + le32_to_cpus(&hdr->descriptor_count); + leguid_to_cpus(&hdr->log_guid); + le64_to_cpus(&hdr->flushed_file_offset); + le64_to_cpus(&hdr->last_file_offset); +} + +void vhdx_log_entry_hdr_le_export(VHDXLogEntryHeader *hdr) +{ + assert(hdr != NULL); + + cpu_to_le32s(&hdr->signature); + cpu_to_le32s(&hdr->checksum); + cpu_to_le32s(&hdr->entry_length); + cpu_to_le32s(&hdr->tail); + cpu_to_le64s(&hdr->sequence_number); + cpu_to_le32s(&hdr->descriptor_count); + cpu_to_leguids(&hdr->log_guid); + cpu_to_le64s(&hdr->flushed_file_offset); + cpu_to_le64s(&hdr->last_file_offset); +} + + +/* Region table entries */ +void vhdx_region_header_le_import(VHDXRegionTableHeader *hdr) +{ + assert(hdr != NULL); + + le32_to_cpus(&hdr->signature); + le32_to_cpus(&hdr->checksum); + le32_to_cpus(&hdr->entry_count); +} + +void vhdx_region_header_le_export(VHDXRegionTableHeader *hdr) +{ + assert(hdr != NULL); + + cpu_to_le32s(&hdr->signature); + cpu_to_le32s(&hdr->checksum); + cpu_to_le32s(&hdr->entry_count); +} + +void vhdx_region_entry_le_import(VHDXRegionTableEntry *e) +{ + assert(e != NULL); + + leguid_to_cpus(&e->guid); + le64_to_cpus(&e->file_offset); + le32_to_cpus(&e->length); + le32_to_cpus(&e->data_bits); +} + +void vhdx_region_entry_le_export(VHDXRegionTableEntry *e) +{ + assert(e != NULL); + + cpu_to_leguids(&e->guid); + 
cpu_to_le64s(&e->file_offset); + cpu_to_le32s(&e->length); + cpu_to_le32s(&e->data_bits); +} + + +/* Metadata headers & table */ +void vhdx_metadata_header_le_import(VHDXMetadataTableHeader *hdr) +{ + assert(hdr != NULL); + + le64_to_cpus(&hdr->signature); + le16_to_cpus(&hdr->entry_count); +} + +void vhdx_metadata_header_le_export(VHDXMetadataTableHeader *hdr) +{ + assert(hdr != NULL); + + cpu_to_le64s(&hdr->signature); + cpu_to_le16s(&hdr->entry_count); +} + +void vhdx_metadata_entry_le_import(VHDXMetadataTableEntry *e) +{ + assert(e != NULL); + + leguid_to_cpus(&e->item_id); + le32_to_cpus(&e->offset); + le32_to_cpus(&e->length); + le32_to_cpus(&e->data_bits); +} +void vhdx_metadata_entry_le_export(VHDXMetadataTableEntry *e) +{ + assert(e != NULL); + + cpu_to_leguids(&e->item_id); + cpu_to_le32s(&e->offset); + cpu_to_le32s(&e->length); + cpu_to_le32s(&e->data_bits); +} diff --git a/block/vhdx-log.c b/block/vhdx-log.c new file mode 100644 index 000000000..a77c040ee --- /dev/null +++ b/block/vhdx-log.c @@ -0,0 +1,1021 @@ +/* + * Block driver for Hyper-V VHDX Images + * + * Copyright (c) 2013 Red Hat, Inc., + * + * Authors: + * Jeff Cody <jcody@redhat.com> + * + * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 + * by Microsoft: + * https://www.microsoft.com/en-us/download/details.aspx?id=34750 + * + * This file covers the functionality of the metadata log writing, parsing, and + * replay. + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "block/vhdx.h" + + +typedef struct VHDXLogSequence { + bool valid; + uint32_t count; + VHDXLogEntries log; + VHDXLogEntryHeader hdr; +} VHDXLogSequence; + +typedef struct VHDXLogDescEntries { + VHDXLogEntryHeader hdr; + VHDXLogDescriptor desc[]; +} VHDXLogDescEntries; + +static const MSGUID zero_guid = { 0 }; + +/* The log located on the disk is a circular buffer containing + * sectors of 4096 bytes each. + * + * It is assumed for the read/write functions below that the + * circular buffer scheme uses a 'one sector open' convention to indicate + * the buffer is full. Given the validation methods used for each + * sector, this method should be compatible with other methods that + * do not waste a sector. + */ + + +/* Allow peeking at the hdr entry at the beginning of the current + * read index, without advancing the read index */ +static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log, + VHDXLogEntryHeader *hdr) +{ + int ret = 0; + uint64_t offset; + uint32_t read; + + assert(hdr != NULL); + + /* peek is only supported on sector boundaries */ + if (log->read % VHDX_LOG_SECTOR_SIZE) { + ret = -EFAULT; + goto exit; + } + + read = log->read; + /* we are guaranteed that a) log sectors are 4096 bytes, + * and b) the log length is a multiple of 1MB.
So, there + * is always a round number of sectors in the buffer */ + if ((read + sizeof(VHDXLogEntryHeader)) > log->length) { + read = 0; + } + + if (read == log->write) { + ret = -EINVAL; + goto exit; + } + + offset = log->offset + read; + + ret = bdrv_pread(bs->file, offset, hdr, sizeof(VHDXLogEntryHeader)); + if (ret < 0) { + goto exit; + } + +exit: + return ret; +} + +/* Index increment for log, based on sector boundaries */ +static int vhdx_log_inc_idx(uint32_t idx, uint64_t length) +{ + idx += VHDX_LOG_SECTOR_SIZE; + /* we are guaranteed that a) log sectors are 4096 bytes, + * and b) the log length is a multiple of 1MB. So, there + * is always a round number of sectors in the buffer */ + return idx >= length ? 0 : idx; +} + + +/* Reset the log to empty */ +static void vhdx_log_reset(BlockDriverState *bs, BDRVVHDXState *s) +{ + MSGUID guid = { 0 }; + s->log.read = s->log.write = 0; + /* a log guid of 0 indicates an empty log to any parser of v0 + * VHDX logs */ + vhdx_update_headers(bs, s, false, &guid); +} + +/* Reads num_sectors from the log (all log sectors are 4096 bytes), + * into buffer 'buffer'. Upon return, *sectors_read will contain + * the number of sectors successfully read. + * + * It is assumed that 'buffer' is already allocated, and of sufficient + * size (i.e. >= 4096*num_sectors). + * + * If 'peek' is true, then the tail (read) pointer for the circular buffer is + * not modified. + * + * 0 is returned on success, -errno otherwise. */ +static int vhdx_log_read_sectors(BlockDriverState *bs, VHDXLogEntries *log, + uint32_t *sectors_read, void *buffer, + uint32_t num_sectors, bool peek) +{ + int ret = 0; + uint64_t offset; + uint32_t read; + + read = log->read; + + *sectors_read = 0; + while (num_sectors) { + if (read == log->write) { + /* empty */ + break; + } + offset = log->offset + read; + + ret = bdrv_pread(bs->file, offset, buffer, VHDX_LOG_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + read = vhdx_log_inc_idx(read, log->length); + + *sectors_read = *sectors_read + 1; + num_sectors--; + } + +exit: + if (!peek) { + log->read = read; + } + return ret; +} + +/* Writes num_sectors to the log (all log sectors are 4096 bytes), + * from buffer 'buffer'. Upon return, *sectors_written will contain + * the number of sectors successfully written. + * + * It is assumed that 'buffer' is at least 4096*num_sectors large. 
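As a side note on the 'one sector open' convention described at the top of this file, the empty/full tests reduce to simple comparisons of the read and write indices (illustrative helpers, not part of the patch; they rely on vhdx_log_inc_idx() defined above):

static inline bool example_log_is_empty(const VHDXLogEntries *log)
{
    /* read catching up to write means there is nothing left to consume */
    return log->read == log->write;
}

static inline bool example_log_is_full(const VHDXLogEntries *log)
{
    /* one sector is kept open: full when advancing write would hit read */
    return vhdx_log_inc_idx(log->write, log->length) == log->read;
}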
+ * + * 0 is returned on success, -errno otherwise */ +static int vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log, + uint32_t *sectors_written, void *buffer, + uint32_t num_sectors) +{ + int ret = 0; + uint64_t offset; + uint32_t write; + void *buffer_tmp; + BDRVVHDXState *s = bs->opaque; + + ret = vhdx_user_visible_write(bs, s); + if (ret < 0) { + goto exit; + } + + write = log->write; + + buffer_tmp = buffer; + while (num_sectors) { + + offset = log->offset + write; + write = vhdx_log_inc_idx(write, log->length); + if (write == log->read) { + /* full */ + break; + } + ret = bdrv_pwrite(bs->file, offset, buffer_tmp, VHDX_LOG_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + buffer_tmp += VHDX_LOG_SECTOR_SIZE; + + log->write = write; + *sectors_written = *sectors_written + 1; + num_sectors--; + } + +exit: + return ret; +} + + +/* Validates a log entry header */ +static bool vhdx_log_hdr_is_valid(VHDXLogEntries *log, VHDXLogEntryHeader *hdr, + BDRVVHDXState *s) +{ + bool valid = false; + + if (memcmp(&hdr->signature, "loge", 4)) { + goto exit; + } + + /* if the individual entry length is larger than the whole log + * buffer, that is obviously invalid */ + if (log->length < hdr->entry_length) { + goto exit; + } + + /* length of entire entry must be in units of 4KB (log sector size) */ + if (hdr->entry_length % (VHDX_LOG_SECTOR_SIZE)) { + goto exit; + } + + /* per spec, sequence # must be > 0 */ + if (hdr->sequence_number == 0) { + goto exit; + } + + /* log entries are only valid if they match the file-wide log guid + * found in the active header */ + if (!guid_eq(hdr->log_guid, s->headers[s->curr_header]->log_guid)) { + goto exit; + } + + if (hdr->descriptor_count * sizeof(VHDXLogDescriptor) > hdr->entry_length) { + goto exit; + } + + valid = true; + +exit: + return valid; +} + +/* + * Given a log header, this will validate the descriptors and the + * corresponding data sectors (if applicable) + * + * Validation consists of: + * 1. Making sure the sequence numbers match the entry header + * 2. Verifying a valid signature ('zero' or 'desc' for descriptors) + * 3. File offset field is a multiple of 4KB + * 4. If a data descriptor, the corresponding data sector + * has its signature ('data') and matching sequence number + * + * @desc: the data buffer containing the descriptor + * @hdr: the log entry header + * + * Returns true if valid + */ +static bool vhdx_log_desc_is_valid(VHDXLogDescriptor *desc, + VHDXLogEntryHeader *hdr) +{ + bool ret = false; + + if (desc->sequence_number != hdr->sequence_number) { + goto exit; + } + if (desc->file_offset % VHDX_LOG_SECTOR_SIZE) { + goto exit; + } + + if (!memcmp(&desc->signature, "zero", 4)) { + if (desc->zero_length % VHDX_LOG_SECTOR_SIZE == 0) { + /* valid */ + ret = true; + } + } else if (!memcmp(&desc->signature, "desc", 4)) { + /* valid */ + ret = true; + } + +exit: + return ret; +} + + +/* Prior to sector data for a log entry, there is the header + * and the descriptors referenced in the header: + * + * [] = 4KB sector + * + * [ hdr, desc ][ desc ][ ... ][ data ][ ... ] + * + * The first sector in a log entry has a 64 byte header, and + * up to 126 32-byte descriptors. If more descriptors than + * 126 are required, then subsequent sectors can have up to 128 + * descriptors. Each sector is 4KB. Data follows the descriptor + * sectors. + * + * This will return the number of sectors needed to encompass + * the passed number of descriptors in desc_cnt. + * + * This will never return 0, even if desc_cnt is 0.
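A few worked examples may help here (the '+= 2' in the implementation below reserves two 32-byte descriptor slots for the 64-byte header in the first sector):

    desc_cnt = 0   ->   2 slots -> 1 sector  (never returns 0)
    desc_cnt = 126 -> 128 slots -> 1 sector  (exactly fills the first sector)
    desc_cnt = 127 -> 129 slots -> 2 sectors
    desc_cnt = 254 -> 256 slots -> 2 sectors (126 in the first, 128 in the second)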
+ */ +static int vhdx_compute_desc_sectors(uint32_t desc_cnt) +{ + uint32_t desc_sectors; + + desc_cnt += 2; /* account for header in first sector */ + desc_sectors = desc_cnt / 128; + if (desc_cnt % 128) { + desc_sectors++; + } + + return desc_sectors; +} + + +/* Reads the log header, and subsequent descriptors (if any). This + * will allocate all the space for buffer, which must be NULL when + * passed into this function. Each descriptor will also be validated, + * and an error returned if any are invalid. */ +static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s, + VHDXLogEntries *log, VHDXLogDescEntries **buffer) +{ + int ret = 0; + uint32_t desc_sectors; + uint32_t sectors_read; + VHDXLogEntryHeader hdr; + VHDXLogDescEntries *desc_entries = NULL; + int i; + + assert(*buffer == NULL); + + ret = vhdx_log_peek_hdr(bs, log, &hdr); + if (ret < 0) { + goto exit; + } + vhdx_log_entry_hdr_le_import(&hdr); + if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) { + ret = -EINVAL; + goto exit; + } + + desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count); + desc_entries = qemu_blockalign(bs, desc_sectors * VHDX_LOG_SECTOR_SIZE); + + ret = vhdx_log_read_sectors(bs, log, &sectors_read, desc_entries, + desc_sectors, false); + if (ret < 0) { + goto free_and_exit; + } + if (sectors_read != desc_sectors) { + ret = -EINVAL; + goto free_and_exit; + } + + /* put in proper endianness, and validate each desc */ + for (i = 0; i < hdr.descriptor_count; i++) { + vhdx_log_desc_le_import(&desc_entries->desc[i]); + if (vhdx_log_desc_is_valid(&desc_entries->desc[i], &hdr) == false) { + ret = -EINVAL; + goto free_and_exit; + } + } + + *buffer = desc_entries; + goto exit; + +free_and_exit: + qemu_vfree(desc_entries); +exit: + return ret; +} + + +/* Flushes the descriptor described by desc to the VHDX image file. + * If the descriptor is a data descriptor, then 'data' must be non-NULL, + * and >= 4096 bytes (VHDX_LOG_SECTOR_SIZE), containing the data to be + * written. + * + * Verification is performed to make sure the sequence numbers of a data + * descriptor match the sequence number in the desc. + * + * For a zero descriptor, it may describe multiple sectors to fill with zeroes. + * In this case, it should be noted that zeroes are written to disk, and the + * image file is not extended as a sparse file.
*/ +static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc, + VHDXLogDataSector *data) +{ + int ret = 0; + uint64_t seq, file_offset; + uint32_t offset = 0; + void *buffer = NULL; + uint64_t count = 1; + int i; + + buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + + if (!memcmp(&desc->signature, "desc", 4)) { + /* data sector */ + if (data == NULL) { + ret = -EFAULT; + goto exit; + } + + /* The sequence number of the data sector must match that + * in the descriptor */ + seq = data->sequence_high; + seq <<= 32; + seq |= data->sequence_low & 0xffffffff; + + if (seq != desc->sequence_number) { + ret = -EINVAL; + goto exit; + } + + /* Each data sector is in total 4096 bytes, however the first + * 8 bytes, and last 4 bytes, are located in the descriptor */ + memcpy(buffer, &desc->leading_bytes, 8); + offset += 8; + + memcpy(buffer+offset, data->data, 4084); + offset += 4084; + + memcpy(buffer+offset, &desc->trailing_bytes, 4); + + } else if (!memcmp(&desc->signature, "zero", 4)) { + /* write 'count' sectors of zeroes */ + memset(buffer, 0, VHDX_LOG_SECTOR_SIZE); + count = desc->zero_length / VHDX_LOG_SECTOR_SIZE; + } + + file_offset = desc->file_offset; + + /* count is only > 1 if we are writing zeroes */ + for (i = 0; i < count; i++) { + ret = bdrv_pwrite_sync(bs->file, file_offset, buffer, + VHDX_LOG_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + file_offset += VHDX_LOG_SECTOR_SIZE; + } + +exit: + qemu_vfree(buffer); + return ret; +} + +/* Flush the entire log (as described by 'logs') to the VHDX image + * file, and then set the log to 'empty' status once complete. + * + * The log entries should be validated prior to flushing */ +static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s, + VHDXLogSequence *logs) +{ + int ret = 0; + int i; + uint32_t cnt, sectors_read; + uint64_t new_file_size; + void *data = NULL; + VHDXLogDescEntries *desc_entries = NULL; + VHDXLogEntryHeader hdr_tmp = { 0 }; + + cnt = logs->count; + + data = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + + ret = vhdx_user_visible_write(bs, s); + if (ret < 0) { + goto exit; + } + + /* each iteration represents one log sequence, which may span multiple + * sectors */ + while (cnt--) { + ret = vhdx_log_peek_hdr(bs, &logs->log, &hdr_tmp); + if (ret < 0) { + goto exit; + } + /* if the log shows a FlushedFileOffset larger than our current file + * size, then that means the file has been truncated / corrupted, and + * we must refuse to open it / use it */ + if (hdr_tmp.flushed_file_offset > bdrv_getlength(bs->file)) { + ret = -EINVAL; + goto exit; + } + + ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries); + if (ret < 0) { + goto exit; + } + + for (i = 0; i < desc_entries->hdr.descriptor_count; i++) { + if (!memcmp(&desc_entries->desc[i].signature, "desc", 4)) { + /* data sector, so read a sector to flush */ + ret = vhdx_log_read_sectors(bs, &logs->log, &sectors_read, + data, 1, false); + if (ret < 0) { + goto exit; + } + if (sectors_read != 1) { + ret = -EINVAL; + goto exit; + } + } + + ret = vhdx_log_flush_desc(bs, &desc_entries->desc[i], data); + if (ret < 0) { + goto exit; + } + } + if (bdrv_getlength(bs->file) < desc_entries->hdr.last_file_offset) { + new_file_size = desc_entries->hdr.last_file_offset; + if (new_file_size % (1024*1024)) { + /* round up to nearest 1MB boundary */ + new_file_size = ((new_file_size >> 20) + 1) << 20; + bdrv_truncate(bs->file, new_file_size); + } + } + qemu_vfree(desc_entries); + desc_entries = NULL; + } + + bdrv_flush(bs); + /* once the log is fully
flushed, indicate that we have an empty log + * now. This also sets the log guid to 0, to indicate an empty log */ + vhdx_log_reset(bs, s); + +exit: + qemu_vfree(data); + qemu_vfree(desc_entries); + return ret; +} + +static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s, + VHDXLogEntries *log, uint64_t seq, + bool *valid, VHDXLogEntryHeader *entry) +{ + int ret = 0; + VHDXLogEntryHeader hdr; + void *buffer = NULL; + uint32_t i, desc_sectors, total_sectors, crc; + uint32_t sectors_read = 0; + VHDXLogDescEntries *desc_buffer = NULL; + + *valid = false; + + ret = vhdx_log_peek_hdr(bs, log, &hdr); + if (ret < 0) { + goto inc_and_exit; + } + + vhdx_log_entry_hdr_le_import(&hdr); + + + if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) { + goto inc_and_exit; + } + + if (seq > 0) { + if (hdr.sequence_number != seq + 1) { + goto inc_and_exit; + } + } + + desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count); + + /* Read desc sectors, and calculate log checksum */ + + total_sectors = hdr.entry_length / VHDX_LOG_SECTOR_SIZE; + + + /* read_desc() will increment the read idx */ + ret = vhdx_log_read_desc(bs, s, log, &desc_buffer); + if (ret < 0) { + goto free_and_exit; + } + + crc = vhdx_checksum_calc(0xffffffff, (void *)desc_buffer, + desc_sectors * VHDX_LOG_SECTOR_SIZE, 4); + crc ^= 0xffffffff; + + buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + if (total_sectors > desc_sectors) { + for (i = 0; i < total_sectors - desc_sectors; i++) { + sectors_read = 0; + ret = vhdx_log_read_sectors(bs, log, &sectors_read, buffer, + 1, false); + if (ret < 0 || sectors_read != 1) { + goto free_and_exit; + } + crc = vhdx_checksum_calc(crc, buffer, VHDX_LOG_SECTOR_SIZE, -1); + crc ^= 0xffffffff; + } + } + crc ^= 0xffffffff; + if (crc != desc_buffer->hdr.checksum) { + goto free_and_exit; + } + + *valid = true; + *entry = hdr; + goto free_and_exit; + +inc_and_exit: + log->read = vhdx_log_inc_idx(log->read, log->length); + +free_and_exit: + qemu_vfree(buffer); + qemu_vfree(desc_buffer); + return ret; +} + +/* Search through the log circular buffer, and find the valid, active + * log sequence, if any exists */ +static int vhdx_log_search(BlockDriverState *bs, BDRVVHDXState *s, + VHDXLogSequence *logs) +{ + int ret = 0; + uint32_t tail; + bool seq_valid = false; + VHDXLogSequence candidate = { 0 }; + VHDXLogEntryHeader hdr = { 0 }; + VHDXLogEntries curr_log; + + memcpy(&curr_log, &s->log, sizeof(VHDXLogEntries)); + curr_log.write = curr_log.length; /* assume log is full */ + curr_log.read = 0; + + + /* now we will go through the whole log sector by sector, until + * we find a valid, active log sequence, or reach the end of the + * log buffer */ + for (;;) { + uint64_t curr_seq = 0; + VHDXLogSequence current = { 0 }; + + tail = curr_log.read; + + ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq, + &seq_valid, &hdr); + if (ret < 0) { + goto exit; + } + + if (seq_valid) { + current.valid = true; + current.log = curr_log; + current.log.read = tail; + current.log.write = curr_log.read; + current.count = 1; + current.hdr = hdr; + + + for (;;) { + ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq, + &seq_valid, &hdr); + if (ret < 0) { + goto exit; + } + if (seq_valid == false) { + break; + } + current.log.write = curr_log.read; + current.count++; + + curr_seq = hdr.sequence_number; + } + } + + if (current.valid) { + if (candidate.valid == false || + current.hdr.sequence_number > candidate.hdr.sequence_number) { + candidate = current; + } + } + + if (curr_log.read < tail) { + break;
+ } + } + + *logs = candidate; + + if (candidate.valid) { + /* this is the next sequence number, for writes */ + s->log.sequence = candidate.hdr.sequence_number + 1; + } + + +exit: + return ret; +} + +/* Parse the replay log. Per the VHDX spec, if the log is present + * it must be replayed prior to opening the file, even read-only. + * + * If read-only, we must replay the log in RAM (or refuse to open + * a dirty VHDX file read-only) */ +int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed, + Error **errp) +{ + int ret = 0; + VHDXHeader *hdr; + VHDXLogSequence logs = { 0 }; + + hdr = s->headers[s->curr_header]; + + *flushed = false; + + /* s->log.hdr is freed in vhdx_close() */ + if (s->log.hdr == NULL) { + s->log.hdr = qemu_blockalign(bs, sizeof(VHDXLogEntryHeader)); + } + + s->log.offset = hdr->log_offset; + s->log.length = hdr->log_length; + + if (s->log.offset < VHDX_LOG_MIN_SIZE || + s->log.offset % VHDX_LOG_MIN_SIZE) { + ret = -EINVAL; + goto exit; + } + + /* per spec, only log version of 0 is supported */ + if (hdr->log_version != 0) { + ret = -EINVAL; + goto exit; + } + + /* If either the log guid, or log length is zero, + * then a replay log is not present */ + if (guid_eq(hdr->log_guid, zero_guid)) { + goto exit; + } + + if (hdr->log_length == 0) { + goto exit; + } + + if (hdr->log_length % VHDX_LOG_MIN_SIZE) { + ret = -EINVAL; + goto exit; + } + + + /* The log is present, we need to find if and where there is an active + * sequence of valid entries present in the log. */ + + ret = vhdx_log_search(bs, s, &logs); + if (ret < 0) { + goto exit; + } + + if (logs.valid) { + if (bs->read_only) { + ret = -EPERM; + error_setg_errno(errp, EPERM, + "VHDX image file '%s' opened read-only, but " + "contains a log that needs to be replayed. To " + "replay the log, execute:\n qemu-img check -r " + "all '%s'", + bs->filename, bs->filename); + goto exit; + } + /* now flush the log */ + ret = vhdx_log_flush(bs, s, &logs); + if (ret < 0) { + goto exit; + } + *flushed = true; + } + + +exit: + return ret; +} + + + +static void vhdx_log_raw_to_le_sector(VHDXLogDescriptor *desc, + VHDXLogDataSector *sector, void *data, + uint64_t seq) +{ + /* 8 + 4084 + 4 = 4096, 1 log sector */ + memcpy(&desc->leading_bytes, data, 8); + data += 8; + cpu_to_le64s(&desc->leading_bytes); + memcpy(sector->data, data, 4084); + data += 4084; + memcpy(&desc->trailing_bytes, data, 4); + cpu_to_le32s(&desc->trailing_bytes); + data += 4; + + sector->sequence_high = (uint32_t) (seq >> 32); + sector->sequence_low = (uint32_t) (seq & 0xffffffff); + sector->data_signature = VHDX_LOG_DATA_SIGNATURE; + + vhdx_log_desc_le_export(desc); + vhdx_log_data_le_export(sector); +} + + +static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, + void *data, uint32_t length, uint64_t offset) +{ + int ret = 0; + void *buffer = NULL; + void *merged_sector = NULL; + void *data_tmp, *sector_write; + unsigned int i; + int sector_offset; + uint32_t desc_sectors, sectors, total_length; + uint32_t sectors_written = 0; + uint32_t aligned_length; + uint32_t leading_length = 0; + uint32_t trailing_length = 0; + uint32_t partial_sectors = 0; + uint32_t bytes_written = 0; + uint64_t file_offset; + VHDXHeader *header; + VHDXLogEntryHeader new_hdr; + VHDXLogDescriptor *new_desc = NULL; + VHDXLogDataSector *data_sector = NULL; + MSGUID new_guid = { 0 }; + + header = s->headers[s->curr_header]; + + /* need to have offset read data, and be on 4096 byte boundary */ + + if (length > header->log_length) { + /* no log present. 
we could create a log here instead of failing */ + ret = -EINVAL; + goto exit; + } + + if (guid_eq(header->log_guid, zero_guid)) { + vhdx_guid_generate(&new_guid); + vhdx_update_headers(bs, s, false, &new_guid); + } else { + /* currently, we require that the log be flushed after + * every write. */ + ret = -ENOTSUP; + goto exit; + } + + /* 0 is an invalid sequence number, but may also represent the first + * log write (or a wrapped seq) */ + if (s->log.sequence == 0) { + s->log.sequence = 1; + } + + sector_offset = offset % VHDX_LOG_SECTOR_SIZE; + file_offset = (offset / VHDX_LOG_SECTOR_SIZE) * VHDX_LOG_SECTOR_SIZE; + + aligned_length = length; + + /* add in the unaligned head and tail bytes */ + if (sector_offset) { + leading_length = (VHDX_LOG_SECTOR_SIZE - sector_offset); + leading_length = leading_length > length ? length : leading_length; + aligned_length -= leading_length; + partial_sectors++; + } + + sectors = aligned_length / VHDX_LOG_SECTOR_SIZE; + trailing_length = aligned_length - (sectors * VHDX_LOG_SECTOR_SIZE); + if (trailing_length) { + partial_sectors++; + } + + sectors += partial_sectors; + + /* sectors is now how many sectors the data itself takes, not + * including the header and descriptor metadata */ + + new_hdr = (VHDXLogEntryHeader) { + .signature = VHDX_LOG_SIGNATURE, + .tail = s->log.tail, + .sequence_number = s->log.sequence, + .descriptor_count = sectors, + .reserved = 0, + .flushed_file_offset = bdrv_getlength(bs->file), + .last_file_offset = bdrv_getlength(bs->file), + }; + + new_hdr.log_guid = header->log_guid; + + desc_sectors = vhdx_compute_desc_sectors(new_hdr.descriptor_count); + + total_length = (desc_sectors + sectors) * VHDX_LOG_SECTOR_SIZE; + new_hdr.entry_length = total_length; + + vhdx_log_entry_hdr_le_export(&new_hdr); + + buffer = qemu_blockalign(bs, total_length); + memcpy(buffer, &new_hdr, sizeof(new_hdr)); + + new_desc = (VHDXLogDescriptor *) (buffer + sizeof(new_hdr)); + data_sector = buffer + (desc_sectors * VHDX_LOG_SECTOR_SIZE); + data_tmp = data; + + /* All log sectors are 4KB, so for any partial sectors we must + * merge the data with preexisting data from the final file + * destination */ + merged_sector = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + + for (i = 0; i < sectors; i++) { + new_desc->signature = VHDX_LOG_DESC_SIGNATURE; + new_desc->sequence_number = s->log.sequence; + new_desc->file_offset = file_offset; + + if (i == 0 && leading_length) { + /* partial sector at the front of the buffer */ + ret = bdrv_pread(bs->file, file_offset, merged_sector, + VHDX_LOG_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + memcpy(merged_sector + sector_offset, data_tmp, leading_length); + bytes_written = leading_length; + sector_write = merged_sector; + } else if (i == sectors - 1 && trailing_length) { + /* partial sector at the end of the buffer */ + ret = bdrv_pread(bs->file, + file_offset, + merged_sector + trailing_length, + VHDX_LOG_SECTOR_SIZE - trailing_length); + if (ret < 0) { + goto exit; + } + memcpy(merged_sector, data_tmp, trailing_length); + bytes_written = trailing_length; + sector_write = merged_sector; + } else { + bytes_written = VHDX_LOG_SECTOR_SIZE; + sector_write = data_tmp; + } + + /* populate the raw sector data into the proper structures, + * as well as update the descriptor, and convert to proper + * endianness */ + vhdx_log_raw_to_le_sector(new_desc, data_sector, sector_write, + s->log.sequence); + + data_tmp += bytes_written; + data_sector++; + new_desc++; + file_offset += VHDX_LOG_SECTOR_SIZE; + } + + /* checksum 
covers entire entry, from the log header through the + * last data sector */ + vhdx_update_checksum(buffer, total_length, + offsetof(VHDXLogEntryHeader, checksum)); + cpu_to_le32s((uint32_t *)(buffer + 4)); + + /* now write to the log */ + ret = vhdx_log_write_sectors(bs, &s->log, &sectors_written, buffer, + desc_sectors + sectors); + if (ret < 0) { + goto exit; + } + + if (sectors_written != desc_sectors + sectors) { + /* instead of failing, we could flush the log here */ + ret = -EINVAL; + goto exit; + } + + s->log.sequence++; + /* write new tail */ + s->log.tail = s->log.write; + +exit: + qemu_vfree(buffer); + qemu_vfree(merged_sector); + return ret; +} + +/* Perform a log write, and then immediately flush the entire log */ +int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, + void *data, uint32_t length, uint64_t offset) +{ + int ret = 0; + VHDXLogSequence logs = { .valid = true, + .count = 1, + .hdr = { 0 } }; + + + /* Make sure data written (new and/or changed blocks) is stable + * on disk, before creating log entry */ + bdrv_flush(bs); + ret = vhdx_log_write(bs, s, data, length, offset); + if (ret < 0) { + goto exit; + } + logs.log = s->log; + + /* Make sure log is stable on disk */ + bdrv_flush(bs); + ret = vhdx_log_flush(bs, s, &logs); + if (ret < 0) { + goto exit; + } + + s->log = logs.log; + +exit: + return ret; +} + diff --git a/block/vhdx.c b/block/vhdx.c index e9704b1fd..509baaf48 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -6,9 +6,9 @@ * Authors: * Jeff Cody <jcody@redhat.com> * - * This is based on the "VHDX Format Specification v0.95", published 4/12/2012 + * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 * by Microsoft: - * https://www.microsoft.com/en-us/download/details.aspx?id=29681 + * https://www.microsoft.com/en-us/download/details.aspx?id=34750 * * This work is licensed under the terms of the GNU LGPL, version 2 or later. * See the COPYING.LIB file in the top-level directory. @@ -20,7 +20,22 @@ #include "qemu/module.h" #include "qemu/crc32c.h" #include "block/vhdx.h" +#include "migration/migration.h" +#include <uuid/uuid.h> +#include <glib.h> + +/* Options for VHDX creation */ + +#define VHDX_BLOCK_OPT_LOG_SIZE "log_size" +#define VHDX_BLOCK_OPT_BLOCK_SIZE "block_size" +#define VHDX_BLOCK_OPT_ZERO "block_state_zero" + +typedef enum VHDXImageType { + VHDX_TYPE_DYNAMIC = 0, + VHDX_TYPE_FIXED, + VHDX_TYPE_DIFFERENCING, /* Currently unsupported */ +} VHDXImageType; /* Several metadata and region table data entries are identified by * guids in a MS-specific GUID format. */ @@ -103,16 +118,6 @@ static const MSGUID parent_vhdx_guid = { .data1 = 0xb04aefb7, META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \ META_PHYS_SECTOR_SIZE_PRESENT) -typedef struct VHDXMetadataEntries { - VHDXMetadataTableEntry file_parameters_entry; - VHDXMetadataTableEntry virtual_disk_size_entry; - VHDXMetadataTableEntry page83_data_entry; - VHDXMetadataTableEntry logical_sector_size_entry; - VHDXMetadataTableEntry phys_sector_size_entry; - VHDXMetadataTableEntry parent_locator_entry; - uint16_t present; -} VHDXMetadataEntries; - typedef struct VHDXSectorInfo { uint32_t bat_idx; /* BAT entry index */ @@ -123,43 +128,31 @@ typedef struct VHDXSectorInfo { uint64_t block_offset; /* block offset, in bytes */ } VHDXSectorInfo; +/* Calculates new checksum.
+ * + * Zero is substituted during crc calculation for the original crc field + * crc_offset: byte offset in buf of the buffer crc + * buf: buffer pointer + * size: size of buffer (must be > crc_offset+4) + * + * Note: The resulting checksum is in the CPU endianness, not necessarily + * in the file format endianness (LE). Any header export to disk should + * make sure that vhdx_header_le_export() is used to convert to the + * correct endianness + */ +uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset) +{ + uint32_t crc; + assert(buf != NULL); + assert(size > (crc_offset + sizeof(crc))); -typedef struct BDRVVHDXState { - CoMutex lock; - - int curr_header; - VHDXHeader *headers[2]; - - VHDXRegionTableHeader rt; - VHDXRegionTableEntry bat_rt; /* region table for the BAT */ - VHDXRegionTableEntry metadata_rt; /* region table for the metadata */ - - VHDXMetadataTableHeader metadata_hdr; - VHDXMetadataEntries metadata_entries; - - VHDXFileParameters params; - uint32_t block_size; - uint32_t block_size_bits; - uint32_t sectors_per_block; - uint32_t sectors_per_block_bits; - - uint64_t virtual_disk_size; - uint32_t logical_sector_size; - uint32_t physical_sector_size; - - uint64_t chunk_ratio; - uint32_t chunk_ratio_bits; - uint32_t logical_sector_size_bits; - - uint32_t bat_entries; - VHDXBatEntry *bat; - uint64_t bat_offset; - - VHDXParentLocatorHeader parent_header; - VHDXParentLocatorEntry *parent_entries; + memset(buf + crc_offset, 0, sizeof(crc)); + crc = crc32c(0xffffffff, buf, size); + memcpy(buf + crc_offset, &crc, sizeof(crc)); -} BDRVVHDXState; + return crc; +} uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size, int crc_offset) @@ -212,6 +205,71 @@ bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset) /* + * This generates a UUID that is compliant with the MS GUIDs used + * in the VHDX spec (and elsewhere). 
+ */ +void vhdx_guid_generate(MSGUID *guid) +{ + uuid_t uuid; + assert(guid != NULL); + + uuid_generate(uuid); + memcpy(guid, uuid, sizeof(MSGUID)); +} + +/* Check for region overlaps inside the VHDX image */ +static int vhdx_region_check(BDRVVHDXState *s, uint64_t start, uint64_t length) +{ + int ret = 0; + uint64_t end; + VHDXRegionEntry *r; + + end = start + length; + QLIST_FOREACH(r, &s->regions, entries) { + if (!((start >= r->end) || (end <= r->start))) { + ret = -EINVAL; + goto exit; + } + } + +exit: + return ret; +} + +/* Register a region for future checks */ +static void vhdx_region_register(BDRVVHDXState *s, + uint64_t start, uint64_t length) +{ + VHDXRegionEntry *r; + + r = g_malloc0(sizeof(*r)); + + r->start = start; + r->end = start + length; + + QLIST_INSERT_HEAD(&s->regions, r, entries); +} + +/* Free all registered regions */ +static void vhdx_region_unregister_all(BDRVVHDXState *s) +{ + VHDXRegionEntry *r, *r_next; + + QLIST_FOREACH_SAFE(r, &s->regions, entries, r_next) { + QLIST_REMOVE(r, entries); + g_free(r); + } +} + +static void vhdx_set_shift_bits(BDRVVHDXState *s) +{ + s->logical_sector_size_bits = 31 - clz32(s->logical_sector_size); + s->sectors_per_block_bits = 31 - clz32(s->sectors_per_block); + s->chunk_ratio_bits = 63 - clz64(s->chunk_ratio); + s->block_size_bits = 31 - clz32(s->block_size); +} + +/* * Per the MS VHDX Specification, for every VHDX file: * - The header section is fixed size - 1 MB * - The header section is always the first "object" @@ -230,30 +288,124 @@ static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -/* All VHDX structures on disk are little endian */ -static void vhdx_header_le_import(VHDXHeader *h) +/* + * Writes the header to the specified offset. + * + * This will optionally read in buffer data from disk (otherwise zero-fill), + * and then update the header checksum. Header is converted to proper + * endianness before being written to the specified file offset + */ +static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr, + uint64_t offset, bool read) +{ + uint8_t *buffer = NULL; + int ret; + VHDXHeader header_le; + + assert(bs_file != NULL); + assert(hdr != NULL); + + /* the header checksum is not over just the packed size of VHDXHeader, + * but rather over the entire 'reserved' range for the header, which is + * 4KB (VHDX_HEADER_SIZE). */ + + buffer = qemu_blockalign(bs_file, VHDX_HEADER_SIZE); + if (read) { + /* if true, we can't assume the extra reserved bytes are 0 */ + ret = bdrv_pread(bs_file, offset, buffer, VHDX_HEADER_SIZE); + if (ret < 0) { + goto exit; + } + } else { + memset(buffer, 0, VHDX_HEADER_SIZE); + } + + /* overwrite the actual VHDXHeader portion */ + memcpy(buffer, hdr, sizeof(VHDXHeader)); + hdr->checksum = vhdx_update_checksum(buffer, VHDX_HEADER_SIZE, + offsetof(VHDXHeader, checksum)); + vhdx_header_le_export(hdr, &header_le); + ret = bdrv_pwrite_sync(bs_file, offset, &header_le, sizeof(VHDXHeader)); + +exit: + qemu_vfree(buffer); + return ret; +} + +/* Update the VHDX headers + * + * This follows the VHDX spec procedures for header updates. 
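+ * (sequence numbers decide which header is current, so writing the
+ * inactive copy first means a torn update leaves the previous header
+ * intact, still selectable by its checksum and sequence number)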
+ * + * - non-current header is updated with largest sequence number + */ +static int vhdx_update_header(BlockDriverState *bs, BDRVVHDXState *s, + bool generate_data_write_guid, MSGUID *log_guid) { - assert(h != NULL); + int ret = 0; + int hdr_idx = 0; + uint64_t header_offset = VHDX_HEADER1_OFFSET; + + VHDXHeader *active_header; + VHDXHeader *inactive_header; + + /* operate on the non-current header */ + if (s->curr_header == 0) { + hdr_idx = 1; + header_offset = VHDX_HEADER2_OFFSET; + } + + active_header = s->headers[s->curr_header]; + inactive_header = s->headers[hdr_idx]; - le32_to_cpus(&h->signature); - le32_to_cpus(&h->checksum); - le64_to_cpus(&h->sequence_number); + inactive_header->sequence_number = active_header->sequence_number + 1; - leguid_to_cpus(&h->file_write_guid); - leguid_to_cpus(&h->data_write_guid); - leguid_to_cpus(&h->log_guid); + /* a new file guid must be generated before any file write, including + * headers */ + inactive_header->file_write_guid = s->session_guid; + + /* a new data guid only needs to be generated before any guest-visible + * writes (i.e. something observable via virtual disk read) */ + if (generate_data_write_guid) { + vhdx_guid_generate(&inactive_header->data_write_guid); + } - le16_to_cpus(&h->log_version); - le16_to_cpus(&h->version); - le32_to_cpus(&h->log_length); - le64_to_cpus(&h->log_offset); + /* update the log guid if present */ + if (log_guid) { + inactive_header->log_guid = *log_guid; + } + + ret = vhdx_write_header(bs->file, inactive_header, header_offset, true); + if (ret < 0) { + goto exit; + } + s->curr_header = hdr_idx; + +exit: + return ret; } +/* + * The VHDX spec calls for header updates to be performed twice, so that both + * the current and non-current header have valid info + */ +int vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s, + bool generate_data_write_guid, MSGUID *log_guid) +{ + int ret; + + ret = vhdx_update_header(bs, s, generate_data_write_guid, log_guid); + if (ret < 0) { + return ret; + } + ret = vhdx_update_header(bs, s, generate_data_write_guid, log_guid); + return ret; +} /* opens the specified header block from the VHDX file header section */ -static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s) +static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s, + Error **errp) { - int ret = 0; + int ret; VHDXHeader *header1; VHDXHeader *header2; bool h1_valid = false; @@ -262,6 +414,7 @@ static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s) uint64_t h2_seq = 0; uint8_t *buffer; + /* header1 & header2 are freed in vhdx_close() */ header1 = qemu_blockalign(bs, sizeof(VHDXHeader)); header2 = qemu_blockalign(bs, sizeof(VHDXHeader)); @@ -310,7 +463,6 @@ static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s) } else if (!h1_valid && h2_valid) { s->curr_header = 1; } else if (!h1_valid && !h2_valid) { - ret = -EINVAL; goto fail; } else { /* If both headers are valid, then we choose the active one by the @@ -321,24 +473,22 @@ static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s) } else if (h2_seq > h1_seq) { s->curr_header = 1; } else { - ret = -EINVAL; goto fail; } } - ret = 0; - + vhdx_region_register(s, s->headers[s->curr_header]->log_offset, + s->headers[s->curr_header]->log_length); goto exit; fail: - qerror_report(ERROR_CLASS_GENERIC_ERROR, "No valid VHDX header found"); + error_setg_errno(errp, -ret, "No valid VHDX header found"); qemu_vfree(header1); qemu_vfree(header2); s->headers[0] = NULL; s->headers[1] = NULL; exit: 
qemu_vfree(buffer); - return ret; } @@ -362,10 +512,7 @@ static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s) goto fail; } memcpy(&s->rt, buffer, sizeof(s->rt)); - le32_to_cpus(&s->rt.signature); - le32_to_cpus(&s->rt.checksum); - le32_to_cpus(&s->rt.entry_count); - le32_to_cpus(&s->rt.reserved); + vhdx_region_header_le_import(&s->rt); offset += sizeof(s->rt); if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) || @@ -384,10 +531,16 @@ static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s) memcpy(&rt_entry, buffer + offset, sizeof(rt_entry)); offset += sizeof(rt_entry); - leguid_to_cpus(&rt_entry.guid); - le64_to_cpus(&rt_entry.file_offset); - le32_to_cpus(&rt_entry.length); - le32_to_cpus(&rt_entry.data_bits); + vhdx_region_entry_le_import(&rt_entry); + + /* check for region overlap between these entries, and any + * other memory regions in the file */ + ret = vhdx_region_check(s, rt_entry.file_offset, rt_entry.length); + if (ret < 0) { + goto fail; + } + + vhdx_region_register(s, rt_entry.file_offset, rt_entry.length); /* see if we recognize the entry */ if (guid_eq(rt_entry.guid, bat_guid)) { @@ -419,6 +572,12 @@ static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s) goto fail; } } + + if (!bat_rt_found || !metadata_rt_found) { + ret = -EINVAL; + goto fail; + } + ret = 0; fail: @@ -462,9 +621,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) memcpy(&s->metadata_hdr, buffer, sizeof(s->metadata_hdr)); offset += sizeof(s->metadata_hdr); - le64_to_cpus(&s->metadata_hdr.signature); - le16_to_cpus(&s->metadata_hdr.reserved); - le16_to_cpus(&s->metadata_hdr.entry_count); + vhdx_metadata_header_le_import(&s->metadata_hdr); if (memcmp(&s->metadata_hdr.signature, "metadata", 8)) { ret = -EINVAL; @@ -483,11 +640,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) memcpy(&md_entry, buffer + offset, sizeof(md_entry)); offset += sizeof(md_entry); - leguid_to_cpus(&md_entry.item_id); - le32_to_cpus(&md_entry.offset); - le32_to_cpus(&md_entry.length); - le32_to_cpus(&md_entry.data_bits); - le32_to_cpus(&md_entry.reserved2); + vhdx_metadata_entry_le_import(&md_entry); if (guid_eq(md_entry.item_id, file_param_guid)) { if (s->metadata_entries.present & META_FILE_PARAMETER_PRESENT) { @@ -627,12 +780,20 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) le32_to_cpus(&s->logical_sector_size); le32_to_cpus(&s->physical_sector_size); - if (s->logical_sector_size == 0 || s->params.block_size == 0) { + if (s->params.block_size < VHDX_BLOCK_SIZE_MIN || + s->params.block_size > VHDX_BLOCK_SIZE_MAX) { ret = -EINVAL; goto exit; } - /* both block_size and sector_size are guaranteed powers of 2 */ + /* only 2 supported sector sizes */ + if (s->logical_sector_size != 512 && s->logical_sector_size != 4096) { + ret = -EINVAL; + goto exit; + } + + /* Both block_size and sector_size are guaranteed powers of 2, below. 
+ Due to range checks above, s->sectors_per_block can never be < 256 */ s->sectors_per_block = s->params.block_size / s->logical_sector_size; s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) * (uint64_t)s->logical_sector_size / @@ -660,10 +821,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) goto exit; } - s->logical_sector_size_bits = 31 - clz32(s->logical_sector_size); - s->sectors_per_block_bits = 31 - clz32(s->sectors_per_block); - s->chunk_ratio_bits = 63 - clz64(s->chunk_ratio); - s->block_size_bits = 31 - clz32(s->block_size); + vhdx_set_shift_bits(s); ret = 0; @@ -672,61 +830,64 @@ exit: return ret; } -/* Parse the replay log. Per the VHDX spec, if the log is present - * it must be replayed prior to opening the file, even read-only. - * - * If read-only, we must replay the log in RAM (or refuse to open - * a dirty VHDX file read-only */ -static int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s) +/* + * Calculate the number of BAT entries, including sector + * bitmap entries. + */ +static void vhdx_calc_bat_entries(BDRVVHDXState *s) { - int ret = 0; - int i; - VHDXHeader *hdr; - - hdr = s->headers[s->curr_header]; + uint32_t data_blocks_cnt, bitmap_blocks_cnt; - /* either the log guid, or log length is zero, - * then a replay log is present */ - for (i = 0; i < sizeof(hdr->log_guid.data4); i++) { - ret |= hdr->log_guid.data4[i]; - } - if (hdr->log_guid.data1 == 0 && - hdr->log_guid.data2 == 0 && - hdr->log_guid.data3 == 0 && - ret == 0) { - goto exit; + data_blocks_cnt = s->virtual_disk_size >> s->block_size_bits; + if (s->virtual_disk_size - (data_blocks_cnt << s->block_size_bits)) { + data_blocks_cnt++; } - - /* per spec, only log version of 0 is supported */ - if (hdr->log_version != 0) { - ret = -EINVAL; - goto exit; + bitmap_blocks_cnt = data_blocks_cnt >> s->chunk_ratio_bits; + if (data_blocks_cnt - (bitmap_blocks_cnt << s->chunk_ratio_bits)) { + bitmap_blocks_cnt++; } - if (hdr->log_length == 0) { - goto exit; + if (s->parent_entries) { + s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1); + } else { + s->bat_entries = data_blocks_cnt + + ((data_blocks_cnt - 1) >> s->chunk_ratio_bits); } - /* We currently do not support images with logs to replay */ - ret = -ENOTSUP; - -exit: - return ret; } +static void vhdx_close(BlockDriverState *bs) +{ + BDRVVHDXState *s = bs->opaque; + qemu_vfree(s->headers[0]); + s->headers[0] = NULL; + qemu_vfree(s->headers[1]); + s->headers[1] = NULL; + qemu_vfree(s->bat); + s->bat = NULL; + qemu_vfree(s->parent_entries); + s->parent_entries = NULL; + migrate_del_blocker(s->migration_blocker); + error_free(s->migration_blocker); + qemu_vfree(s->log.hdr); + s->log.hdr = NULL; + vhdx_region_unregister_all(s); +} -static int vhdx_open(BlockDriverState *bs, QDict *options, int flags) +static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVVHDXState *s = bs->opaque; int ret = 0; uint32_t i; uint64_t signature; - uint32_t data_blocks_cnt, bitmap_blocks_cnt; - + Error *local_err = NULL; s->bat = NULL; + s->first_visible_write = true; qemu_co_mutex_init(&s->lock); + QLIST_INIT(&s->regions); /* validate the file signature */ ret = bdrv_pread(bs->file, 0, &signature, sizeof(uint64_t)); @@ -738,46 +899,40 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } - ret = vhdx_parse_header(bs, s); - if (ret) { + /* This is used for any header updates, for the file_write_guid. 
+ * The spec dictates that a new value should be used for the first + * header update */ + vhdx_guid_generate(&s->session_guid); + + vhdx_parse_header(bs, s, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); + ret = -EINVAL; goto fail; } - ret = vhdx_parse_log(bs, s); - if (ret) { + ret = vhdx_parse_log(bs, s, &s->log_replayed_on_open, errp); + if (ret < 0) { goto fail; } ret = vhdx_open_region_tables(bs, s); - if (ret) { + if (ret < 0) { goto fail; } ret = vhdx_parse_metadata(bs, s); - if (ret) { + if (ret < 0) { goto fail; } + s->block_size = s->params.block_size; /* the VHDX spec dictates that virtual_disk_size is always a multiple of * logical_sector_size */ bs->total_sectors = s->virtual_disk_size >> s->logical_sector_size_bits; - data_blocks_cnt = s->virtual_disk_size >> s->block_size_bits; - if (s->virtual_disk_size - (data_blocks_cnt << s->block_size_bits)) { - data_blocks_cnt++; - } - bitmap_blocks_cnt = data_blocks_cnt >> s->chunk_ratio_bits; - if (data_blocks_cnt - (bitmap_blocks_cnt << s->chunk_ratio_bits)) { - bitmap_blocks_cnt++; - } - - if (s->parent_entries) { - s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1); - } else { - s->bat_entries = data_blocks_cnt + - ((data_blocks_cnt - 1) >> s->chunk_ratio_bits); - } + vhdx_calc_bat_entries(s); s->bat_offset = s->bat_rt.file_offset; @@ -787,6 +942,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } + /* s->bat is freed in vhdx_close() */ s->bat = qemu_blockalign(bs, s->bat_rt.length); ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length); @@ -794,23 +950,46 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } + uint64_t payblocks = s->chunk_ratio; + /* endian convert, and verify populated BAT field file offsets against + * region table and log entries */ for (i = 0; i < s->bat_entries; i++) { le64_to_cpus(&s->bat[i]); + if (payblocks--) { + /* payload bat entries */ + if ((s->bat[i] & VHDX_BAT_STATE_BIT_MASK) == + PAYLOAD_BLOCK_FULLY_PRESENT) { + ret = vhdx_region_check(s, s->bat[i] & VHDX_BAT_FILE_OFF_MASK, + s->block_size); + if (ret < 0) { + goto fail; + } + } + } else { + payblocks = s->chunk_ratio; + /* Once differencing files are supported, verify sector bitmap + * blocks here */ + } } if (flags & BDRV_O_RDWR) { - ret = -ENOTSUP; - goto fail; + ret = vhdx_update_headers(bs, s, false, NULL); + if (ret < 0) { + goto fail; + } } - /* TODO: differencing files, write */ + /* TODO: differencing files */ + + /* Disable migration when VHDX images are used */ + error_set(&s->migration_blocker, + QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + "vhdx", bs->device_name, "live migration"); + migrate_add_blocker(s->migration_blocker); return 0; fail: - qemu_vfree(s->headers[0]); - qemu_vfree(s->headers[1]); - qemu_vfree(s->bat); - qemu_vfree(s->parent_entries); + vhdx_close(bs); return ret; } @@ -850,7 +1029,7 @@ static void vhdx_block_translate(BDRVVHDXState *s, int64_t sector_num, sinfo->bytes_avail = sinfo->sectors_avail << s->logical_sector_size_bits; - sinfo->file_offset = s->bat[sinfo->bat_idx] >> VHDX_BAT_FILE_OFF_BITS; + sinfo->file_offset = s->bat[sinfo->bat_idx] & VHDX_BAT_FILE_OFF_MASK; sinfo->block_offset = block_offset << s->logical_sector_size_bits; @@ -864,11 +1043,22 @@ static void vhdx_block_translate(BDRVVHDXState *s, int64_t sector_num, * in the block, and add in the payload data block offset * in the file, in bytes, to get the final read address */ - sinfo->file_offset <<= 20; /* now in bytes, rather than 
1MB units */ sinfo->file_offset += sinfo->block_offset; } +static int vhdx_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVVHDXState *s = bs->opaque; + + bdi->cluster_size = s->block_size; + + bdi->unallocated_blocks_are_zero = + (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) == 0; + + return 0; +} + static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) @@ -905,7 +1095,7 @@ static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num, /* return zero */ qemu_iovec_memset(&hd_qiov, 0, 0, sinfo.bytes_avail); break; - case PAYLOAD_BLOCK_FULL_PRESENT: + case PAYLOAD_BLOCK_FULLY_PRESENT: qemu_co_mutex_unlock(&s->lock); ret = bdrv_co_readv(bs->file, sinfo.file_offset >> BDRV_SECTOR_BITS, @@ -935,24 +1125,792 @@ exit: return ret; } +/* + * Allocate a new payload block at the end of the file. + * + * Allocation will happen at 1MB alignment inside the file + * + * Returns the file offset start of the new payload block + */ +static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s, + uint64_t *new_offset) +{ + *new_offset = bdrv_getlength(bs->file); + + /* per the spec, the address for a block is in units of 1MB */ + *new_offset = ROUND_UP(*new_offset, 1024 * 1024); + + return bdrv_truncate(bs->file, *new_offset + s->block_size); +} + +/* + * Update the BAT table entry with the new file offset, and the new entry + * state */ +static void vhdx_update_bat_table_entry(BlockDriverState *bs, BDRVVHDXState *s, + VHDXSectorInfo *sinfo, + uint64_t *bat_entry_le, + uint64_t *bat_offset, int state) +{ + /* The BAT entry is a uint64, with 44 bits for the file offset in units of + * 1MB, and 3 bits for the block state. */ + s->bat[sinfo->bat_idx] = sinfo->file_offset; + + s->bat[sinfo->bat_idx] |= state & VHDX_BAT_STATE_BIT_MASK; + *bat_entry_le = cpu_to_le64(s->bat[sinfo->bat_idx]); + *bat_offset = s->bat_offset + sinfo->bat_idx * sizeof(VHDXBatEntry); + +} + +/* Per the spec, on the first write of guest-visible data to the file the + * data write guid must be updated in the header */ +int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s) +{ + int ret = 0; + if (s->first_visible_write) { + s->first_visible_write = false; + ret = vhdx_update_headers(bs, s, true, NULL); + } + return ret; +} static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return -ENOTSUP; + int ret = -ENOTSUP; + BDRVVHDXState *s = bs->opaque; + VHDXSectorInfo sinfo; + uint64_t bytes_done = 0; + uint64_t bat_entry = 0; + uint64_t bat_entry_offset = 0; + QEMUIOVector hd_qiov; + struct iovec iov1 = { 0 }; + struct iovec iov2 = { 0 }; + int sectors_to_write; + int bat_state; + uint64_t bat_prior_offset = 0; + bool bat_update = false; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + qemu_co_mutex_lock(&s->lock); + + ret = vhdx_user_visible_write(bs, s); + if (ret < 0) { + goto exit; + } + + while (nb_sectors > 0) { + bool use_zero_buffers = false; + bat_update = false; + if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) { + /* not supported yet */ + ret = -ENOTSUP; + goto exit; + } else { + vhdx_block_translate(s, sector_num, nb_sectors, &sinfo); + sectors_to_write = sinfo.sectors_avail; + + qemu_iovec_reset(&hd_qiov); + /* check the payload block state */ + bat_state = s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK; + switch (bat_state) { + case PAYLOAD_BLOCK_ZERO: + /* in this case, we need to preserve zero writes for + * data that is not part of this write, so 
we must pad + * the rest of the buffer to zeroes */ + + /* if we are on a posix system with ftruncate() that extends + * a file, then it is zero-filled for us. On Win32, the raw + * layer uses SetFilePointer and SetFileEnd, which does not + * zero fill AFAIK */ + + /* Queue another write of zero buffers if the underlying file + * does not zero-fill on file extension */ + + if (bdrv_has_zero_init(bs->file) == 0) { + use_zero_buffers = true; + + /* zero fill the front, if any */ + if (sinfo.block_offset) { + iov1.iov_len = sinfo.block_offset; + iov1.iov_base = qemu_blockalign(bs, iov1.iov_len); + memset(iov1.iov_base, 0, iov1.iov_len); + qemu_iovec_concat_iov(&hd_qiov, &iov1, 1, 0, + sinfo.block_offset); + sectors_to_write += iov1.iov_len >> BDRV_SECTOR_BITS; + } + + /* our actual data */ + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, + sinfo.bytes_avail); + + /* zero fill the back, if any */ + if ((sinfo.bytes_avail - sinfo.block_offset) < + s->block_size) { + iov2.iov_len = s->block_size - + (sinfo.bytes_avail + sinfo.block_offset); + iov2.iov_base = qemu_blockalign(bs, iov2.iov_len); + memset(iov2.iov_base, 0, iov2.iov_len); + qemu_iovec_concat_iov(&hd_qiov, &iov2, 1, 0, + sinfo.block_offset); + sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS; + } + } + + /* fall through */ + case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */ + case PAYLOAD_BLOCK_UNMAPPED: /* fall through */ + case PAYLOAD_BLOCK_UNDEFINED: /* fall through */ + bat_prior_offset = sinfo.file_offset; + ret = vhdx_allocate_block(bs, s, &sinfo.file_offset); + if (ret < 0) { + goto exit; + } + /* once we support differencing files, this may also be + * partially present */ + /* update block state to the newly specified state */ + vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry, + &bat_entry_offset, + PAYLOAD_BLOCK_FULLY_PRESENT); + bat_update = true; + /* since we just allocated a block, file_offset is the + * beginning of the payload block. It needs to be the + * write address, which includes the offset into the block */ + if (!use_zero_buffers) { + sinfo.file_offset += sinfo.block_offset; + } + /* fall through */ + case PAYLOAD_BLOCK_FULLY_PRESENT: + /* if the file offset address is in the header zone, + * there is a problem */ + if (sinfo.file_offset < (1024 * 1024)) { + ret = -EFAULT; + goto error_bat_restore; + } + + if (!use_zero_buffers) { + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, + sinfo.bytes_avail); + } + /* block exists, so we can just overwrite it */ + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_writev(bs->file, + sinfo.file_offset >> BDRV_SECTOR_BITS, + sectors_to_write, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto error_bat_restore; + } + break; + case PAYLOAD_BLOCK_PARTIALLY_PRESENT: + /* we don't yet support difference files, fall through + * to error */ + default: + ret = -EIO; + goto exit; + break; + } + + if (bat_update) { + /* this will update the BAT entry into the log journal, and + * then flush the log journal out to disk */ + ret = vhdx_log_write_and_flush(bs, s, &bat_entry, + sizeof(VHDXBatEntry), + bat_entry_offset); + if (ret < 0) { + goto exit; + } + } + + nb_sectors -= sinfo.sectors_avail; + sector_num += sinfo.sectors_avail; + bytes_done += sinfo.bytes_avail; + + } + } + + goto exit; + +error_bat_restore: + if (bat_update) { + /* keep metadata in sync, and restore the bat entry state + * if error. 
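+         * bat_prior_offset holds the file offset saved before the block
+         * allocation, and bat_state the entry state read at the top of the
+         * loop, so rewriting the entry here undoes the aborted update.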
*/
+        sinfo.file_offset = bat_prior_offset;
+        vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry,
+                                    &bat_entry_offset, bat_state);
+    }
+exit:
+    qemu_vfree(iov1.iov_base);
+    qemu_vfree(iov2.iov_base);
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&hd_qiov);
+    return ret;
 }
 
-static void vhdx_close(BlockDriverState *bs)
+
+/*
+ * Create VHDX Headers
+ *
+ * There are 2 headers, and the highest sequence number will represent
+ * the active header
+ */
+static int vhdx_create_new_headers(BlockDriverState *bs, uint64_t image_size,
+                                   uint32_t log_size)
+{
+    int ret = 0;
+    VHDXHeader *hdr = NULL;
+
+    hdr = g_malloc0(sizeof(VHDXHeader));
+
+    hdr->signature = VHDX_HEADER_SIGNATURE;
+    hdr->sequence_number = g_random_int();
+    hdr->log_version = 0;
+    hdr->version = 1;
+    hdr->log_length = log_size;
+    hdr->log_offset = VHDX_HEADER_SECTION_END;
+    vhdx_guid_generate(&hdr->file_write_guid);
+    vhdx_guid_generate(&hdr->data_write_guid);
+
+    ret = vhdx_write_header(bs, hdr, VHDX_HEADER1_OFFSET, false);
+    if (ret < 0) {
+        goto exit;
+    }
+    hdr->sequence_number++;
+    ret = vhdx_write_header(bs, hdr, VHDX_HEADER2_OFFSET, false);
+    if (ret < 0) {
+        goto exit;
+    }
+
+exit:
+    g_free(hdr);
+    return ret;
+}
+
+
+/*
+ * Create the Metadata entries.
+ *
+ * For more details on the entries, see section 3.5 (pg 29) in the
+ * VHDX 1.00 specification.
+ *
+ * We support 5 metadata entries (all required by spec):
+ *      File Parameters,
+ *      Virtual Disk Size,
+ *      Page 83 Data,
+ *      Logical Sector Size,
+ *      Physical Sector Size
+ *
+ * The first 64KB of the Metadata section is reserved for the metadata
+ * header and entries; beyond that, the metadata items themselves reside.
+ */
+static int vhdx_create_new_metadata(BlockDriverState *bs,
+                                    uint64_t image_size,
+                                    uint32_t block_size,
+                                    uint32_t sector_size,
+                                    uint64_t metadata_offset,
+                                    VHDXImageType type)
+{
+    int ret = 0;
+    uint32_t offset = 0;
+    void *buffer = NULL;
+    void *entry_buffer;
+    VHDXMetadataTableHeader *md_table;
+    VHDXMetadataTableEntry *md_table_entry;
+
+    /* Metadata entries */
+    VHDXFileParameters *mt_file_params;
+    VHDXVirtualDiskSize *mt_virtual_size;
+    VHDXPage83Data *mt_page83;
+    VHDXVirtualDiskLogicalSectorSize *mt_log_sector_size;
+    VHDXVirtualDiskPhysicalSectorSize *mt_phys_sector_size;
+
+    entry_buffer = g_malloc0(sizeof(VHDXFileParameters) +
+                             sizeof(VHDXVirtualDiskSize) +
+                             sizeof(VHDXPage83Data) +
+                             sizeof(VHDXVirtualDiskLogicalSectorSize) +
+                             sizeof(VHDXVirtualDiskPhysicalSectorSize));
+
+    mt_file_params = entry_buffer;
+    offset += sizeof(VHDXFileParameters);
+    mt_virtual_size = entry_buffer + offset;
+    offset += sizeof(VHDXVirtualDiskSize);
+    mt_page83 = entry_buffer + offset;
+    offset += sizeof(VHDXPage83Data);
+    mt_log_sector_size = entry_buffer + offset;
+    offset += sizeof(VHDXVirtualDiskLogicalSectorSize);
+    mt_phys_sector_size = entry_buffer + offset;
+
+    mt_file_params->block_size = cpu_to_le32(block_size);
+    if (type == VHDX_TYPE_FIXED) {
+        mt_file_params->data_bits |= VHDX_PARAMS_LEAVE_BLOCKS_ALLOCED;
+        cpu_to_le32s(&mt_file_params->data_bits);
+    }
+
+    vhdx_guid_generate(&mt_page83->page_83_data);
+    cpu_to_leguids(&mt_page83->page_83_data);
+    mt_virtual_size->virtual_disk_size = cpu_to_le64(image_size);
+    mt_log_sector_size->logical_sector_size = cpu_to_le32(sector_size);
+    mt_phys_sector_size->physical_sector_size = cpu_to_le32(sector_size);
+
+    buffer = g_malloc0(VHDX_HEADER_BLOCK_SIZE);
+    md_table = buffer;
+
+    md_table->signature = VHDX_METADATA_SIGNATURE;
+    md_table->entry_count = 5;
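+    /* For orientation: the layout written out below is, roughly,
+     *   metadata_offset + 0                               : table header
+     *   metadata_offset + sizeof(VHDXMetadataTableHeader) : 5 table entries
+     *   metadata_offset + 64KB                            : item data
+     *       (file parameters, virtual disk size, page 83 data,
+     *        logical and physical sector sizes)
+     */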
+    vhdx_metadata_header_le_export(md_table);
+
+
+    /* This will reference beyond the reserved table portion */
+    offset = 64 * KiB;
+
+    md_table_entry = buffer + sizeof(VHDXMetadataTableHeader);
+
+    md_table_entry[0].item_id = file_param_guid;
+    md_table_entry[0].offset = offset;
+    md_table_entry[0].length = sizeof(VHDXFileParameters);
+    md_table_entry[0].data_bits |= VHDX_META_FLAGS_IS_REQUIRED;
+    offset += md_table_entry[0].length;
+    vhdx_metadata_entry_le_export(&md_table_entry[0]);
+
+    md_table_entry[1].item_id = virtual_size_guid;
+    md_table_entry[1].offset = offset;
+    md_table_entry[1].length = sizeof(VHDXVirtualDiskSize);
+    md_table_entry[1].data_bits |= VHDX_META_FLAGS_IS_REQUIRED |
+                                   VHDX_META_FLAGS_IS_VIRTUAL_DISK;
+    offset += md_table_entry[1].length;
+    vhdx_metadata_entry_le_export(&md_table_entry[1]);
+
+    md_table_entry[2].item_id = page83_guid;
+    md_table_entry[2].offset = offset;
+    md_table_entry[2].length = sizeof(VHDXPage83Data);
+    md_table_entry[2].data_bits |= VHDX_META_FLAGS_IS_REQUIRED |
+                                   VHDX_META_FLAGS_IS_VIRTUAL_DISK;
+    offset += md_table_entry[2].length;
+    vhdx_metadata_entry_le_export(&md_table_entry[2]);
+
+    md_table_entry[3].item_id = logical_sector_guid;
+    md_table_entry[3].offset = offset;
+    md_table_entry[3].length = sizeof(VHDXVirtualDiskLogicalSectorSize);
+    md_table_entry[3].data_bits |= VHDX_META_FLAGS_IS_REQUIRED |
+                                   VHDX_META_FLAGS_IS_VIRTUAL_DISK;
+    offset += md_table_entry[3].length;
+    vhdx_metadata_entry_le_export(&md_table_entry[3]);
+
+    md_table_entry[4].item_id = phys_sector_guid;
+    md_table_entry[4].offset = offset;
+    md_table_entry[4].length = sizeof(VHDXVirtualDiskPhysicalSectorSize);
+    md_table_entry[4].data_bits |= VHDX_META_FLAGS_IS_REQUIRED |
+                                   VHDX_META_FLAGS_IS_VIRTUAL_DISK;
+    vhdx_metadata_entry_le_export(&md_table_entry[4]);
+
+    ret = bdrv_pwrite(bs, metadata_offset, buffer, VHDX_HEADER_BLOCK_SIZE);
+    if (ret < 0) {
+        goto exit;
+    }
+
+    ret = bdrv_pwrite(bs, metadata_offset + (64 * KiB), entry_buffer,
+                      VHDX_HEADER_BLOCK_SIZE);
+    if (ret < 0) {
+        goto exit;
+    }
+
+
+exit:
+    g_free(buffer);
+    g_free(entry_buffer);
+    return ret;
+}
+
+/* This creates the actual BAT itself.  We currently only support
+ * 'Dynamic' and 'Fixed' image types.
+ *
+ *  Dynamic images: default state of the BAT is all zeroes.
+ *
+ *  Fixed images: default state of the BAT is fully populated, with
+ *                file offsets and state PAYLOAD_BLOCK_FULLY_PRESENT.
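+ *
+ * (in the fixed case the data area is preallocated up front via
+ * bdrv_truncate(), so every BAT entry can immediately point at its
+ * 1 MB aligned payload block)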
+ */
+static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s,
+                           uint64_t image_size, VHDXImageType type,
+                           bool use_zero_blocks, VHDXRegionTableEntry *rt_bat)
+{
+    int ret = 0;
+    uint64_t data_file_offset;
+    uint64_t total_sectors = 0;
+    uint64_t sector_num = 0;
+    uint64_t unused;
+    int block_state;
+    VHDXSectorInfo sinfo;
+
+    assert(s->bat == NULL);
+
+    /* this gives a data start after BAT/bitmap entries, and well
+     * past any metadata entries (with a 4 MB buffer for future
+     * expansion) */
+    data_file_offset = rt_bat->file_offset + rt_bat->length + 5 * MiB;
+    total_sectors = image_size >> s->logical_sector_size_bits;
+
+    if (type == VHDX_TYPE_DYNAMIC) {
+        /* All zeroes, so we can just extend the file - the end of the BAT
+         * is the furthest thing we have written yet */
+        ret = bdrv_truncate(bs, data_file_offset);
+        if (ret < 0) {
+            goto exit;
+        }
+    } else if (type == VHDX_TYPE_FIXED) {
+        ret = bdrv_truncate(bs, data_file_offset + image_size);
+        if (ret < 0) {
+            goto exit;
+        }
+    } else {
+        ret = -ENOTSUP;
+        goto exit;
+    }
+
+    if (type == VHDX_TYPE_FIXED ||
+                use_zero_blocks ||
+                bdrv_has_zero_init(bs) == 0) {
+        /* for a fixed file, the default BAT entry is not zero */
+        s->bat = g_malloc0(rt_bat->length);
+        block_state = type == VHDX_TYPE_FIXED ? PAYLOAD_BLOCK_FULLY_PRESENT :
+                                                PAYLOAD_BLOCK_NOT_PRESENT;
+        block_state = use_zero_blocks ? PAYLOAD_BLOCK_ZERO : block_state;
+        /* fill the BAT by emulating sector writes of sectors_per_block size */
+        while (sector_num < total_sectors) {
+            vhdx_block_translate(s, sector_num, s->sectors_per_block, &sinfo);
+            sinfo.file_offset = data_file_offset +
+                                (sector_num << s->logical_sector_size_bits);
+            sinfo.file_offset = ROUND_UP(sinfo.file_offset, MiB);
+            vhdx_update_bat_table_entry(bs, s, &sinfo, &unused, &unused,
+                                        block_state);
+            cpu_to_le64s(&s->bat[sinfo.bat_idx]);
+            sector_num += s->sectors_per_block;
+        }
+        ret = bdrv_pwrite(bs, rt_bat->file_offset, s->bat, rt_bat->length);
+        if (ret < 0) {
+            goto exit;
+        }
+    }
+
+
+
+exit:
+    g_free(s->bat);
+    return ret;
+}
+
+/* Creates the region table header, and region table entries.
+ * There are 2 supported region table entries: BAT, and Metadata.
+ *
+ * As the calculations for the BAT region table are also needed
+ * to create the BAT itself, we will also cause the BAT to be
+ * created.
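+ *
+ * (rt_bat->length is the BAT entry count rounded up to a 1 MB multiple,
+ * and the metadata region starts at the next 1 MB boundary past the BAT)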
+ */
+static int vhdx_create_new_region_table(BlockDriverState *bs,
+                                        uint64_t image_size,
+                                        uint32_t block_size,
+                                        uint32_t sector_size,
+                                        uint32_t log_size,
+                                        bool use_zero_blocks,
+                                        VHDXImageType type,
+                                        uint64_t *metadata_offset)
+{
+    int ret = 0;
+    uint32_t offset = 0;
+    void *buffer = NULL;
+    BDRVVHDXState *s = NULL;
+    VHDXRegionTableHeader *region_table;
+    VHDXRegionTableEntry *rt_bat;
+    VHDXRegionTableEntry *rt_metadata;
+
+    assert(metadata_offset != NULL);
+
+    /* Populate enough of the BDRVVHDXState to be able to use the
+     * pre-existing BAT calculation, translation, and update functions */
+    s = g_malloc0(sizeof(BDRVVHDXState));
+
+    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
+                     (uint64_t) sector_size / (uint64_t) block_size;
+
+    s->sectors_per_block = block_size / sector_size;
+    s->virtual_disk_size = image_size;
+    s->block_size = block_size;
+    s->logical_sector_size = sector_size;
+
+    vhdx_set_shift_bits(s);
+
+    vhdx_calc_bat_entries(s);
+
+    /* At this point the VHDX state is populated enough for creation */
+
+    /* a single buffer is used so we can calculate the checksum over the
+     * entire 64KB block */
+    buffer = g_malloc0(VHDX_HEADER_BLOCK_SIZE);
+    region_table = buffer;
+    offset += sizeof(VHDXRegionTableHeader);
+    rt_bat = buffer + offset;
+    offset += sizeof(VHDXRegionTableEntry);
+    rt_metadata = buffer + offset;
+
+    region_table->signature = VHDX_REGION_SIGNATURE;
+    region_table->entry_count = 2;   /* BAT and Metadata */
+
+    rt_bat->guid = bat_guid;
+    rt_bat->length = ROUND_UP(s->bat_entries * sizeof(VHDXBatEntry), MiB);
+    rt_bat->file_offset = ROUND_UP(VHDX_HEADER_SECTION_END + log_size, MiB);
+    s->bat_offset = rt_bat->file_offset;
+
+    rt_metadata->guid = metadata_guid;
+    rt_metadata->file_offset = ROUND_UP(rt_bat->file_offset + rt_bat->length,
+                                        MiB);
+    rt_metadata->length = 1 * MiB;   /* min size, and more than enough */
+    *metadata_offset = rt_metadata->file_offset;
+
+    vhdx_update_checksum(buffer, VHDX_HEADER_BLOCK_SIZE,
+                         offsetof(VHDXRegionTableHeader, checksum));
+
+
+    /* The region table gives us the data we need to create the BAT,
+     * so do that now */
+    ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks, rt_bat);
+    if (ret < 0) {
+        goto exit;
+    }
+
+    /* Now write out the region headers to disk */
+    vhdx_region_header_le_export(region_table);
+    vhdx_region_entry_le_export(rt_bat);
+    vhdx_region_entry_le_export(rt_metadata);
+
+    ret = bdrv_pwrite(bs, VHDX_REGION_TABLE_OFFSET, buffer,
+                      VHDX_HEADER_BLOCK_SIZE);
+    if (ret < 0) {
+        goto exit;
+    }
+
+    ret = bdrv_pwrite(bs, VHDX_REGION_TABLE2_OFFSET, buffer,
+                      VHDX_HEADER_BLOCK_SIZE);
+    if (ret < 0) {
+        goto exit;
+    }
+
+
+exit:
+    g_free(s);
+    g_free(buffer);
+    return ret;
+}
+
+/* We need to create the following elements:
+ *
+ *    .-----------------------------------------------------------------.
+ *    |   (A)    |   (B)    |    (C)    |     (D)      |     (E)        |
+ *    |  File ID |  Header1 |  Header 2 | Region Tbl 1 | Region Tbl 2   |
+ *    |          |          |           |              |                |
+ *    .-----------------------------------------------------------------.
+ *    0         64KB      128KB       192KB          256KB            320KB
+ *
+ *
+ *    .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------.
+ *    |     (F)     |      (G)      |    (H)    |                        |
+ *    | Journal Log |  BAT / Bitmap |  Metadata |  .... data ......      |
+ *    |             |               |           |                        |
+ *    .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------.
+ * 1MB + */ +static int vhdx_create(const char *filename, QEMUOptionParameter *options, + Error **errp) +{ + int ret = 0; + uint64_t image_size = (uint64_t) 2 * GiB; + uint32_t log_size = 1 * MiB; + uint32_t block_size = 0; + uint64_t signature; + uint64_t metadata_offset; + bool use_zero_blocks = false; + + gunichar2 *creator = NULL; + glong creator_items; + BlockDriverState *bs; + const char *type = NULL; + VHDXImageType image_type; + Error *local_err = NULL; + + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + image_size = options->value.n; + } else if (!strcmp(options->name, VHDX_BLOCK_OPT_LOG_SIZE)) { + log_size = options->value.n; + } else if (!strcmp(options->name, VHDX_BLOCK_OPT_BLOCK_SIZE)) { + block_size = options->value.n; + } else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) { + type = options->value.s; + } else if (!strcmp(options->name, VHDX_BLOCK_OPT_ZERO)) { + use_zero_blocks = options->value.n != 0; + } + options++; + } + + if (image_size > VHDX_MAX_IMAGE_SIZE) { + error_setg_errno(errp, EINVAL, "Image size too large; max of 64TB"); + ret = -EINVAL; + goto exit; + } + + if (type == NULL) { + type = "dynamic"; + } + + if (!strcmp(type, "dynamic")) { + image_type = VHDX_TYPE_DYNAMIC; + } else if (!strcmp(type, "fixed")) { + image_type = VHDX_TYPE_FIXED; + } else if (!strcmp(type, "differencing")) { + error_setg_errno(errp, ENOTSUP, + "Differencing files not yet supported"); + ret = -ENOTSUP; + goto exit; + } else { + ret = -EINVAL; + goto exit; + } + + /* These are pretty arbitrary, and mainly designed to keep the BAT + * size reasonable to load into RAM */ + if (block_size == 0) { + if (image_size > 32 * TiB) { + block_size = 64 * MiB; + } else if (image_size > (uint64_t) 100 * GiB) { + block_size = 32 * MiB; + } else if (image_size > 1 * GiB) { + block_size = 16 * MiB; + } else { + block_size = 8 * MiB; + } + } + + + /* make the log size close to what was specified, but must be + * min 1MB, and multiple of 1MB */ + log_size = ROUND_UP(log_size, MiB); + + block_size = ROUND_UP(block_size, MiB); + block_size = block_size > VHDX_BLOCK_SIZE_MAX ? VHDX_BLOCK_SIZE_MAX : + block_size; + + ret = bdrv_create_file(filename, options, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; + } + + bs = NULL; + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + NULL, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; + } + + /* Create (A) */ + + /* The creator field is optional, but may be useful for + * debugging / diagnostics */ + creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL, + &creator_items, NULL); + signature = cpu_to_le64(VHDX_FILE_SIGNATURE); + ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature)); + if (ret < 0) { + goto delete_and_exit; + } + if (creator) { + ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET + sizeof(signature), + creator, creator_items * sizeof(gunichar2)); + if (ret < 0) { + goto delete_and_exit; + } + } + + + /* Creates (B),(C) */ + ret = vhdx_create_new_headers(bs, image_size, log_size); + if (ret < 0) { + goto delete_and_exit; + } + + /* Creates (D),(E),(G) explicitly. 
(F) created as by-product */ + ret = vhdx_create_new_region_table(bs, image_size, block_size, 512, + log_size, use_zero_blocks, image_type, + &metadata_offset); + if (ret < 0) { + goto delete_and_exit; + } + + /* Creates (H) */ + ret = vhdx_create_new_metadata(bs, image_size, block_size, 512, + metadata_offset, image_type); + if (ret < 0) { + goto delete_and_exit; + } + + + +delete_and_exit: + bdrv_unref(bs); +exit: + g_free(creator); + return ret; +} + +/* If opened r/w, the VHDX driver will automatically replay the log, + * if one is present, inside the vhdx_open() call. + * + * If qemu-img check -r all is called, the image is automatically opened + * r/w and any log has already been replayed, so there is nothing (currently) + * for us to do here + */ +static int vhdx_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) { BDRVVHDXState *s = bs->opaque; - qemu_vfree(s->headers[0]); - qemu_vfree(s->headers[1]); - qemu_vfree(s->bat); - qemu_vfree(s->parent_entries); + + if (s->log_replayed_on_open) { + result->corruptions_fixed++; + } + return 0; } +static QEMUOptionParameter vhdx_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size; max of 64TB." + }, + { + .name = VHDX_BLOCK_OPT_LOG_SIZE, + .type = OPT_SIZE, + .value.n = 1 * MiB, + .help = "Log size; min 1MB." + }, + { + .name = VHDX_BLOCK_OPT_BLOCK_SIZE, + .type = OPT_SIZE, + .value.n = 0, + .help = "Block Size; min 1MB, max 256MB. " \ + "0 means auto-calculate based on image size." + }, + { + .name = BLOCK_OPT_SUBFMT, + .type = OPT_STRING, + .help = "VHDX format type, can be either 'dynamic' or 'fixed'. "\ + "Default is 'dynamic'." + }, + { + .name = VHDX_BLOCK_OPT_ZERO, + .type = OPT_FLAG, + .help = "Force use of payload blocks of type 'ZERO'. Non-standard." + }, + { NULL } +}; + static BlockDriver bdrv_vhdx = { .format_name = "vhdx", .instance_size = sizeof(BDRVVHDXState), @@ -962,6 +1920,11 @@ static BlockDriver bdrv_vhdx = { .bdrv_reopen_prepare = vhdx_reopen_prepare, .bdrv_co_readv = vhdx_co_readv, .bdrv_co_writev = vhdx_co_writev, + .bdrv_create = vhdx_create, + .bdrv_get_info = vhdx_get_info, + .bdrv_check = vhdx_check, + + .create_options = vhdx_create_options, }; static void bdrv_vhdx_init(void) diff --git a/block/vhdx.h b/block/vhdx.h index fb687ed2d..8103d4c44 100644 --- a/block/vhdx.h +++ b/block/vhdx.h @@ -6,9 +6,9 @@ * Authors: * Jeff Cody <jcody@redhat.com> * - * This is based on the "VHDX Format Specification v0.95", published 4/12/2012 + * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 * by Microsoft: - * https://www.microsoft.com/en-us/download/details.aspx?id=29681 + * https://www.microsoft.com/en-us/download/details.aspx?id=34750 * * This work is licensed under the terms of the GNU LGPL, version 2 or later. * See the COPYING.LIB file in the top-level directory. 
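The creation options listed in vhdx_create_options above surface through the
standard qemu-img interface. As a usage sketch (image name and sizes are
illustrative), a fixed-subformat image with a 16 MB block size and a 2 MB log
could be created with:

    qemu-img create -f vhdx -o subformat=fixed,block_size=16M,log_size=2M disk.vhdx 10G

Leaving block_size at its default of 0 lets vhdx_create() derive a block size
from the image size, per the ladder in that function.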
@@ -18,6 +18,11 @@ #ifndef BLOCK_VHDX_H #define BLOCK_VHDX_H +#define KiB (1 * 1024) +#define MiB (KiB * 1024) +#define GiB (MiB * 1024) +#define TiB ((uint64_t) GiB * 1024) + /* Structures and fields present in the VHDX file */ /* The header section has the following blocks, @@ -30,14 +35,15 @@ * 0.........64KB...........128KB........192KB..........256KB................1MB */ -#define VHDX_HEADER_BLOCK_SIZE (64*1024) +#define VHDX_HEADER_BLOCK_SIZE (64 * 1024) #define VHDX_FILE_ID_OFFSET 0 -#define VHDX_HEADER1_OFFSET (VHDX_HEADER_BLOCK_SIZE*1) -#define VHDX_HEADER2_OFFSET (VHDX_HEADER_BLOCK_SIZE*2) -#define VHDX_REGION_TABLE_OFFSET (VHDX_HEADER_BLOCK_SIZE*3) - +#define VHDX_HEADER1_OFFSET (VHDX_HEADER_BLOCK_SIZE * 1) +#define VHDX_HEADER2_OFFSET (VHDX_HEADER_BLOCK_SIZE * 2) +#define VHDX_REGION_TABLE_OFFSET (VHDX_HEADER_BLOCK_SIZE * 3) +#define VHDX_REGION_TABLE2_OFFSET (VHDX_HEADER_BLOCK_SIZE * 4) +#define VHDX_HEADER_SECTION_END (1 * MiB) /* * A note on the use of MS-GUID fields. For more details on the GUID, * please see: https://en.wikipedia.org/wiki/Globally_unique_identifier. @@ -55,10 +61,11 @@ /* These structures are ones that are defined in the VHDX specification * document */ +#define VHDX_FILE_SIGNATURE 0x656C696678646876ULL /* "vhdxfile" in ASCII */ typedef struct VHDXFileIdentifier { uint64_t signature; /* "vhdxfile" in ASCII */ uint16_t creator[256]; /* optional; utf-16 string to identify - the vhdx file creator. Diagnotistic + the vhdx file creator. Diagnostic only */ } VHDXFileIdentifier; @@ -67,7 +74,7 @@ typedef struct VHDXFileIdentifier { * Microsoft is not just 16 bytes though - it is a structure that is defined, * so we need to follow it here so that endianness does not trip us up */ -typedef struct MSGUID { +typedef struct QEMU_PACKED MSGUID { uint32_t data1; uint16_t data2; uint16_t data3; @@ -77,14 +84,15 @@ typedef struct MSGUID { #define guid_eq(a, b) \ (memcmp(&(a), &(b), sizeof(MSGUID)) == 0) -#define VHDX_HEADER_SIZE (4*1024) /* although the vhdx_header struct in disk - is only 582 bytes, for purposes of crc - the header is the first 4KB of the 64KB - block */ +#define VHDX_HEADER_SIZE (4 * 1024) /* although the vhdx_header struct in disk + is only 582 bytes, for purposes of crc + the header is the first 4KB of the 64KB + block */ /* The full header is 4KB, although the actual header data is much smaller. * But for the checksum calculation, it is over the entire 4KB structure, * not just the defined portion of it */ +#define VHDX_HEADER_SIGNATURE 0x64616568 typedef struct QEMU_PACKED VHDXHeader { uint32_t signature; /* "head" in ASCII */ uint32_t checksum; /* CRC-32C hash of the whole header */ @@ -92,7 +100,7 @@ typedef struct QEMU_PACKED VHDXHeader { VHDX file has 2 of these headers, and only the header with the highest sequence number is valid */ - MSGUID file_write_guid; /* 128 bit unique identifier. Must be + MSGUID file_write_guid; /* 128 bit unique identifier. Must be updated to new, unique value before the first modification is made to file */ @@ -114,9 +122,9 @@ typedef struct QEMU_PACKED VHDXHeader { there is no valid log. If non-zero, log entries with this guid are valid. */ - uint16_t log_version; /* version of the log format. Mustn't be - zero, unless log_guid is also zero */ - uint16_t version; /* version of th evhdx file. Currently, + uint16_t log_version; /* version of the log format. Must be + set to zero */ + uint16_t version; /* version of the vhdx file. Currently, only supported version is "1" */ uint32_t log_length; /* length of the log. 
Must be multiple of 1MB */ @@ -125,6 +133,7 @@ typedef struct QEMU_PACKED VHDXHeader { } VHDXHeader; /* Header for the region table block */ +#define VHDX_REGION_SIGNATURE 0x69676572 /* "regi" in ASCII */ typedef struct QEMU_PACKED VHDXRegionTableHeader { uint32_t signature; /* "regi" in ASCII */ uint32_t checksum; /* CRC-32C hash of the 64KB table */ @@ -151,7 +160,10 @@ typedef struct QEMU_PACKED VHDXRegionTableEntry { /* ---- LOG ENTRY STRUCTURES ---- */ +#define VHDX_LOG_MIN_SIZE (1024 * 1024) +#define VHDX_LOG_SECTOR_SIZE 4096 #define VHDX_LOG_HDR_SIZE 64 +#define VHDX_LOG_SIGNATURE 0x65676f6c typedef struct QEMU_PACKED VHDXLogEntryHeader { uint32_t signature; /* "loge" in ASCII */ uint32_t checksum; /* CRC-32C hash of the 64KB table */ @@ -174,7 +186,8 @@ typedef struct QEMU_PACKED VHDXLogEntryHeader { } VHDXLogEntryHeader; #define VHDX_LOG_DESC_SIZE 32 - +#define VHDX_LOG_DESC_SIGNATURE 0x63736564 +#define VHDX_LOG_ZERO_SIGNATURE 0x6f72657a typedef struct QEMU_PACKED VHDXLogDescriptor { uint32_t signature; /* "zero" or "desc" in ASCII */ union { @@ -194,6 +207,7 @@ typedef struct QEMU_PACKED VHDXLogDescriptor { vhdx_log_entry_header */ } VHDXLogDescriptor; +#define VHDX_LOG_DATA_SIGNATURE 0x61746164 typedef struct QEMU_PACKED VHDXLogDataSector { uint32_t data_signature; /* "data" in ASCII */ uint32_t sequence_high; /* 4 MSB of 8 byte sequence_number */ @@ -212,19 +226,19 @@ typedef struct QEMU_PACKED VHDXLogDataSector { #define PAYLOAD_BLOCK_UNDEFINED 1 #define PAYLOAD_BLOCK_ZERO 2 #define PAYLOAD_BLOCK_UNMAPPED 5 -#define PAYLOAD_BLOCK_FULL_PRESENT 6 +#define PAYLOAD_BLOCK_FULLY_PRESENT 6 #define PAYLOAD_BLOCK_PARTIALLY_PRESENT 7 #define SB_BLOCK_NOT_PRESENT 0 #define SB_BLOCK_PRESENT 6 /* per the spec */ -#define VHDX_MAX_SECTORS_PER_BLOCK (1<<23) +#define VHDX_MAX_SECTORS_PER_BLOCK (1 << 23) /* upper 44 bits are the file offset in 1MB units lower 3 bits are the state other bits are reserved */ #define VHDX_BAT_STATE_BIT_MASK 0x07 -#define VHDX_BAT_FILE_OFF_BITS (64-44) +#define VHDX_BAT_FILE_OFF_MASK 0xFFFFFFFFFFF00000ULL /* upper 44 bits */ typedef uint64_t VHDXBatEntry; /* ---- METADATA REGION STRUCTURES ---- */ @@ -233,6 +247,7 @@ typedef uint64_t VHDXBatEntry; #define VHDX_METADATA_MAX_ENTRIES 2047 /* not including the header */ #define VHDX_METADATA_TABLE_MAX_SIZE \ (VHDX_METADATA_ENTRY_SIZE * (VHDX_METADATA_MAX_ENTRIES+1)) +#define VHDX_METADATA_SIGNATURE 0x617461646174656DULL /* "metadata" in ASCII */ typedef struct QEMU_PACKED VHDXMetadataTableHeader { uint64_t signature; /* "metadata" in ASCII */ uint16_t reserved; @@ -252,8 +267,8 @@ typedef struct QEMU_PACKED VHDXMetadataTableEntry { metadata region */ /* note: if length = 0, so is offset */ uint32_t length; /* length of metadata. <= 1MB. */ - uint32_t data_bits; /* least-significant 3 bits are flags, the - rest are reserved (see above) */ + uint32_t data_bits; /* least-significant 3 bits are flags, + the rest are reserved (see above) */ uint32_t reserved2; } VHDXMetadataTableEntry; @@ -262,13 +277,16 @@ typedef struct QEMU_PACKED VHDXMetadataTableEntry { If set indicates a fixed size VHDX file */ #define VHDX_PARAMS_HAS_PARENT 0x02 /* has parent / backing file */ +#define VHDX_BLOCK_SIZE_MIN (1 * MiB) +#define VHDX_BLOCK_SIZE_MAX (256 * MiB) typedef struct QEMU_PACKED VHDXFileParameters { uint32_t block_size; /* size of each payload block, always power of 2, <= 256MB and >= 1MB. 
*/ - uint32_t data_bits; /* least-significant 2 bits are flags, the rest - are reserved (see above) */ + uint32_t data_bits; /* least-significant 2 bits are flags, + the rest are reserved (see above) */ } VHDXFileParameters; +#define VHDX_MAX_IMAGE_SIZE ((uint64_t) 64 * TiB) typedef struct QEMU_PACKED VHDXVirtualDiskSize { uint64_t virtual_disk_size; /* Size of the virtual disk, in bytes. Must be multiple of the sector size, @@ -276,7 +294,7 @@ typedef struct QEMU_PACKED VHDXVirtualDiskSize { } VHDXVirtualDiskSize; typedef struct QEMU_PACKED VHDXPage83Data { - MSGUID page_83_data[16]; /* unique id for scsi devices that + MSGUID page_83_data; /* unique id for scsi devices that support page 0x83 */ } VHDXPage83Data; @@ -291,7 +309,7 @@ typedef struct QEMU_PACKED VHDXVirtualDiskPhysicalSectorSize { } VHDXVirtualDiskPhysicalSectorSize; typedef struct QEMU_PACKED VHDXParentLocatorHeader { - MSGUID locator_type[16]; /* type of the parent virtual disk. */ + MSGUID locator_type; /* type of the parent virtual disk. */ uint16_t reserved; uint16_t key_value_count; /* number of key/value pairs for this locator */ @@ -308,18 +326,125 @@ typedef struct QEMU_PACKED VHDXParentLocatorEntry { /* ----- END VHDX SPECIFICATION STRUCTURES ---- */ +typedef struct VHDXMetadataEntries { + VHDXMetadataTableEntry file_parameters_entry; + VHDXMetadataTableEntry virtual_disk_size_entry; + VHDXMetadataTableEntry page83_data_entry; + VHDXMetadataTableEntry logical_sector_size_entry; + VHDXMetadataTableEntry phys_sector_size_entry; + VHDXMetadataTableEntry parent_locator_entry; + uint16_t present; +} VHDXMetadataEntries; + +typedef struct VHDXLogEntries { + uint64_t offset; + uint64_t length; + uint32_t write; + uint32_t read; + VHDXLogEntryHeader *hdr; + void *desc_buffer; + uint64_t sequence; + uint32_t tail; +} VHDXLogEntries; + +typedef struct VHDXRegionEntry { + uint64_t start; + uint64_t end; + QLIST_ENTRY(VHDXRegionEntry) entries; +} VHDXRegionEntry; + +typedef struct BDRVVHDXState { + CoMutex lock; + + int curr_header; + VHDXHeader *headers[2]; + + VHDXRegionTableHeader rt; + VHDXRegionTableEntry bat_rt; /* region table for the BAT */ + VHDXRegionTableEntry metadata_rt; /* region table for the metadata */ + + VHDXMetadataTableHeader metadata_hdr; + VHDXMetadataEntries metadata_entries; + + VHDXFileParameters params; + uint32_t block_size; + uint32_t block_size_bits; + uint32_t sectors_per_block; + uint32_t sectors_per_block_bits; + + uint64_t virtual_disk_size; + uint32_t logical_sector_size; + uint32_t physical_sector_size; + + uint64_t chunk_ratio; + uint32_t chunk_ratio_bits; + uint32_t logical_sector_size_bits; + + uint32_t bat_entries; + VHDXBatEntry *bat; + uint64_t bat_offset; + bool first_visible_write; + MSGUID session_guid; + + VHDXLogEntries log; + + VHDXParentLocatorHeader parent_header; + VHDXParentLocatorEntry *parent_entries; + + Error *migration_blocker; + + bool log_replayed_on_open; + + QLIST_HEAD(VHDXRegionHead, VHDXRegionEntry) regions; +} BDRVVHDXState; + +void vhdx_guid_generate(MSGUID *guid); + +int vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s, bool rw, + MSGUID *log_guid); + +uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset); uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size, int crc_offset); bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset); +int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed, + Error **errp); + +int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, + void 
*data, uint32_t length, uint64_t offset); -static void leguid_to_cpus(MSGUID *guid) +static inline void leguid_to_cpus(MSGUID *guid) { le32_to_cpus(&guid->data1); le16_to_cpus(&guid->data2); le16_to_cpus(&guid->data3); } +static inline void cpu_to_leguids(MSGUID *guid) +{ + cpu_to_le32s(&guid->data1); + cpu_to_le16s(&guid->data2); + cpu_to_le16s(&guid->data3); +} + +void vhdx_header_le_import(VHDXHeader *h); +void vhdx_header_le_export(VHDXHeader *orig_h, VHDXHeader *new_h); +void vhdx_log_desc_le_import(VHDXLogDescriptor *d); +void vhdx_log_desc_le_export(VHDXLogDescriptor *d); +void vhdx_log_data_le_export(VHDXLogDataSector *d); +void vhdx_log_entry_hdr_le_import(VHDXLogEntryHeader *hdr); +void vhdx_log_entry_hdr_le_export(VHDXLogEntryHeader *hdr); +void vhdx_region_header_le_import(VHDXRegionTableHeader *hdr); +void vhdx_region_header_le_export(VHDXRegionTableHeader *hdr); +void vhdx_region_entry_le_import(VHDXRegionTableEntry *e); +void vhdx_region_entry_le_export(VHDXRegionTableEntry *e); +void vhdx_metadata_header_le_import(VHDXMetadataTableHeader *hdr); +void vhdx_metadata_header_le_export(VHDXMetadataTableHeader *hdr); +void vhdx_metadata_entry_le_import(VHDXMetadataTableEntry *e); +void vhdx_metadata_entry_le_export(VHDXMetadataTableEntry *e); +int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s); + #endif diff --git a/block/vmdk.c b/block/vmdk.c index 346bb5cad..b69988d16 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -105,18 +105,22 @@ typedef struct VmdkExtent { uint32_t l2_cache_offsets[L2_CACHE_SIZE]; uint32_t l2_cache_counts[L2_CACHE_SIZE]; - unsigned int cluster_sectors; + int64_t cluster_sectors; + char *type; } VmdkExtent; typedef struct BDRVVmdkState { CoMutex lock; uint64_t desc_offset; bool cid_updated; + bool cid_checked; + uint32_t cid; uint32_t parent_cid; int num_extents; /* Extent array with num_extents entries, ascend ordered by address */ VmdkExtent *extents; Error *migration_blocker; + char *create_type; } BDRVVmdkState; typedef struct VmdkMetaData { @@ -197,8 +201,6 @@ static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename) } } -#define CHECK_CID 1 - #define SECTOR_SIZE 512 #define DESC_SIZE (20 * SECTOR_SIZE) /* 20 sectors of 512 bytes each */ #define BUF_SIZE 4096 @@ -215,8 +217,9 @@ static void vmdk_free_extents(BlockDriverState *bs) g_free(e->l1_table); g_free(e->l2_cache); g_free(e->l1_backup_table); + g_free(e->type); if (e->file != bs->file) { - bdrv_delete(e->file); + bdrv_unref(e->file); } } g_free(s->extents); @@ -301,19 +304,18 @@ static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) static int vmdk_is_cid_valid(BlockDriverState *bs) { -#ifdef CHECK_CID BDRVVmdkState *s = bs->opaque; BlockDriverState *p_bs = bs->backing_hd; uint32_t cur_pcid; - if (p_bs) { + if (!s->cid_checked && p_bs) { cur_pcid = vmdk_read_cid(p_bs, 0); if (s->parent_cid != cur_pcid) { /* CID not valid */ return 0; } } -#endif + s->cid_checked = true; /* CID valid */ return 1; } @@ -331,8 +333,7 @@ static int vmdk_reopen_prepare(BDRVReopenState *state, assert(state->bs != NULL); if (queue == NULL) { - error_set(errp, ERROR_CLASS_GENERIC_ERROR, - "No reopen queue for VMDK extents"); + error_setg(errp, "No reopen queue for VMDK extents"); goto exit; } @@ -391,15 +392,24 @@ static int vmdk_add_extent(BlockDriverState *bs, int64_t l1_offset, int64_t l1_backup_offset, uint32_t l1_size, int l2_size, uint64_t cluster_sectors, - VmdkExtent **new_extent) + VmdkExtent **new_extent, + Error **errp) { VmdkExtent *extent; BDRVVmdkState *s = 
bs->opaque; if (cluster_sectors > 0x200000) { /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */ - error_report("invalid granularity, image may be corrupt"); - return -EINVAL; + error_setg(errp, "Invalid granularity, image may be corrupt"); + return -EFBIG; + } + if (l1_size > 512 * 1024 * 1024) { + /* Although with big capacity and small l1_entry_sectors, we can get a + * big l1_size, we don't want unbounded value to allocate the table. + * Limit it to 512M, which is 16PB for default cluster and L2 table + * size */ + error_setg(errp, "L1 size too big"); + return -EFBIG; } s->extents = g_realloc(s->extents, @@ -416,7 +426,7 @@ static int vmdk_add_extent(BlockDriverState *bs, extent->l1_size = l1_size; extent->l1_entry_sectors = l2_size * cluster_sectors; extent->l2_size = l2_size; - extent->cluster_sectors = cluster_sectors; + extent->cluster_sectors = flat ? sectors : cluster_sectors; if (s->num_extents > 1) { extent->end_sector = (*(extent - 1)).end_sector + extent->sectors; @@ -430,7 +440,8 @@ static int vmdk_add_extent(BlockDriverState *bs, return 0; } -static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent) +static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, + Error **errp) { int ret; int l1_size, i; @@ -439,10 +450,13 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent) l1_size = extent->l1_size * sizeof(uint32_t); extent->l1_table = g_malloc(l1_size); ret = bdrv_pread(extent->file, - extent->l1_table_offset, - extent->l1_table, - l1_size); + extent->l1_table_offset, + extent->l1_table, + l1_size); if (ret < 0) { + error_setg_errno(errp, -ret, + "Could not read l1 table from extent '%s'", + extent->file->filename); goto fail_l1; } for (i = 0; i < extent->l1_size; i++) { @@ -452,10 +466,13 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent) if (extent->l1_backup_table_offset) { extent->l1_backup_table = g_malloc(l1_size); ret = bdrv_pread(extent->file, - extent->l1_backup_table_offset, - extent->l1_backup_table, - l1_size); + extent->l1_backup_table_offset, + extent->l1_backup_table, + l1_size); if (ret < 0) { + error_setg_errno(errp, -ret, + "Could not read l1 backup table from extent '%s'", + extent->file->filename); goto fail_l1b; } for (i = 0; i < extent->l1_size; i++) { @@ -473,9 +490,9 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent) return ret; } -static int vmdk_open_vmdk3(BlockDriverState *bs, - BlockDriverState *file, - int flags) +static int vmdk_open_vmfs_sparse(BlockDriverState *bs, + BlockDriverState *file, + int flags, Error **errp) { int ret; uint32_t magic; @@ -484,20 +501,24 @@ static int vmdk_open_vmdk3(BlockDriverState *bs, ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header)); if (ret < 0) { + error_setg_errno(errp, -ret, + "Could not read header from file '%s'", + file->filename); return ret; } - - ret = vmdk_add_extent(bs, - bs->file, false, - le32_to_cpu(header.disk_sectors), - le32_to_cpu(header.l1dir_offset) << 9, - 0, 1 << 6, 1 << 9, - le32_to_cpu(header.granularity), - &extent); + ret = vmdk_add_extent(bs, file, false, + le32_to_cpu(header.disk_sectors), + le32_to_cpu(header.l1dir_offset) << 9, + 0, + le32_to_cpu(header.l1dir_size), + 4096, + le32_to_cpu(header.granularity), + &extent, + errp); if (ret < 0) { return ret; } - ret = vmdk_init_tables(bs, extent); + ret = vmdk_init_tables(bs, extent, errp); if (ret) { /* free extent allocated by vmdk_add_extent */ vmdk_free_last_extent(bs); @@ -505,31 +526,71 @@ static int 
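The 512M-entry bound added to vmdk_add_extent here (and dropped from vmdk_open_vmdk4 further down) is easy to sanity-check: with the defaults used elsewhere in this file (granularity of 128 sectors, 512 grain-table entries), each L1 entry maps 512 x 128 sectors = 32 MiB, so 512Mi entries cover the 16 PB the comment mentions while the in-memory table stays at 2 GiB. A throwaway verification:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t granularity = 128;        /* default: sectors per grain */
        uint64_t num_gtes_per_gt = 512;    /* default L2 table entries */
        uint64_t l1_entry_sectors = num_gtes_per_gt * granularity;
        uint64_t max_entries = 512ULL * 1024 * 1024;   /* the cap above */

        uint64_t covered = max_entries * l1_entry_sectors * 512;
        uint64_t table_bytes = max_entries * sizeof(uint32_t);

        printf("covered: %" PRIu64 " PiB, L1 table: %" PRIu64 " GiB\n",
               covered >> 50, table_bytes >> 30);      /* 16 PiB, 2 GiB */
        return 0;
    }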
vmdk_open_vmdk3(BlockDriverState *bs, return ret; } -static int vmdk_open_desc_file(BlockDriverState *bs, int flags, - uint64_t desc_offset); +static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf, + Error **errp); + +static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset, + Error **errp) +{ + int64_t size; + char *buf; + int ret; + + size = bdrv_getlength(file); + if (size < 0) { + error_setg_errno(errp, -size, "Could not access file"); + return NULL; + } + + size = MIN(size, 1 << 20); /* avoid unbounded allocation */ + buf = g_malloc0(size + 1); + + ret = bdrv_pread(file, desc_offset, buf, size); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read from file"); + g_free(buf); + return NULL; + } + + return buf; +} static int vmdk_open_vmdk4(BlockDriverState *bs, BlockDriverState *file, - int flags) + int flags, Error **errp) { int ret; uint32_t magic; uint32_t l1_size, l1_entry_sectors; VMDK4Header header; VmdkExtent *extent; + BDRVVmdkState *s = bs->opaque; int64_t l1_backup_offset = 0; ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header)); if (ret < 0) { - return ret; + error_setg_errno(errp, -ret, + "Could not read header from file '%s'", + file->filename); + return -EINVAL; } if (header.capacity == 0) { uint64_t desc_offset = le64_to_cpu(header.desc_offset); if (desc_offset) { - return vmdk_open_desc_file(bs, flags, desc_offset << 9); + char *buf = vmdk_read_desc(file, desc_offset << 9, errp); + if (!buf) { + return -EINVAL; + } + ret = vmdk_open_desc_file(bs, flags, buf, errp); + g_free(buf); + return ret; } } + if (!s->create_type) { + s->create_type = g_strdup("monolithicSparse"); + } + if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) { /* * The footer takes precedence over the header, so read it in. The @@ -577,17 +638,24 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, header = footer.header; } - if (le32_to_cpu(header.version) >= 3) { + if (le32_to_cpu(header.version) > 3) { char buf[64]; snprintf(buf, sizeof(buf), "VMDK version %d", le32_to_cpu(header.version)); - qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bs->device_name, "vmdk", buf); + error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bs->device_name, "vmdk", buf); return -ENOTSUP; + } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) { + /* VMware KB 2064959 explains that version 3 added support for + * persistent changed block tracking (CBT), and backup software can + * read it as version=1 if it doesn't care about the changed area + * information. So we are safe to enable read only. */ + error_setg(errp, "VMDK version 3 must be read only"); + return -EINVAL; } if (le32_to_cpu(header.num_gtes_per_gt) > 512) { - error_report("L2 table size too big"); + error_setg(errp, "L2 table size too big"); return -EINVAL; } @@ -598,17 +666,16 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, } l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1) / l1_entry_sectors; - if (l1_size > 512 * 1024 * 1024) { - /* although with big capacity and small l1_entry_sectors, we can get a - * big l1_size, we don't want unbounded value to allocate the table. 
- * Limit it to 512M, which is 16PB for default cluster and L2 table - * size */ - error_report("L1 size too big"); - return -EFBIG; - } if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) { l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9; } + if (bdrv_getlength(file) < + le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE) { + error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes", + (int64_t)(le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE)); + return -EINVAL; + } + ret = vmdk_add_extent(bs, file, false, le64_to_cpu(header.capacity), le64_to_cpu(header.gd_offset) << 9, @@ -616,16 +683,21 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, l1_size, le32_to_cpu(header.num_gtes_per_gt), le64_to_cpu(header.granularity), - &extent); + &extent, + errp); if (ret < 0) { return ret; } extent->compressed = le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE; + if (extent->compressed) { + g_free(s->create_type); + s->create_type = g_strdup("streamOptimized"); + } extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER; extent->version = le32_to_cpu(header.version); extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN; - ret = vmdk_init_tables(bs, extent); + ret = vmdk_init_tables(bs, extent, errp); if (ret) { /* free extent allocated by vmdk_add_extent */ vmdk_free_last_extent(bs); @@ -662,31 +734,28 @@ static int vmdk_parse_description(const char *desc, const char *opt_name, /* Open an extent file and append to bs array */ static int vmdk_open_sparse(BlockDriverState *bs, - BlockDriverState *file, - int flags) + BlockDriverState *file, int flags, + char *buf, Error **errp) { uint32_t magic; - if (bdrv_pread(file, 0, &magic, sizeof(magic)) != sizeof(magic)) { - return -EIO; - } - - magic = be32_to_cpu(magic); + magic = ldl_be_p(buf); switch (magic) { case VMDK3_MAGIC: - return vmdk_open_vmdk3(bs, file, flags); + return vmdk_open_vmfs_sparse(bs, file, flags, errp); break; case VMDK4_MAGIC: - return vmdk_open_vmdk4(bs, file, flags); + return vmdk_open_vmdk4(bs, file, flags, errp); break; default: - return -EMEDIUMTYPE; + error_setg(errp, "Image not in VMDK format"); + return -EINVAL; break; } } static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, - const char *desc_file_path) + const char *desc_file_path, Error **errp) { int ret; char access[11]; @@ -697,6 +766,8 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, int64_t flat_offset; char extent_path[PATH_MAX]; BlockDriverState *extent_file; + BDRVVmdkState *s = bs->opaque; + VmdkExtent *extent; while (*p) { /* parse extent line: @@ -711,116 +782,141 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, goto next_line; } else if (!strcmp(type, "FLAT")) { if (ret != 5 || flat_offset < 0) { + error_setg(errp, "Invalid extent lines:\n%s", p); + return -EINVAL; + } + } else if (!strcmp(type, "VMFS")) { + if (ret == 4) { + flat_offset = 0; + } else { + error_setg(errp, "Invalid extent lines:\n%s", p); return -EINVAL; } } else if (ret != 4) { + error_setg(errp, "Invalid extent lines:\n%s", p); return -EINVAL; } if (sectors <= 0 || - (strcmp(type, "FLAT") && strcmp(type, "SPARSE")) || + (strcmp(type, "FLAT") && strcmp(type, "SPARSE") && + strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE")) || (strcmp(access, "RW"))) { goto next_line; } path_combine(extent_path, sizeof(extent_path), desc_file_path, fname); - ret = bdrv_file_open(&extent_file, extent_path, NULL, bs->open_flags); + extent_file = NULL; + ret = bdrv_open(&extent_file, extent_path, 
NULL, NULL, + bs->open_flags | BDRV_O_PROTOCOL, NULL, errp); if (ret) { return ret; } /* save to extents array */ - if (!strcmp(type, "FLAT")) { + if (!strcmp(type, "FLAT") || !strcmp(type, "VMFS")) { /* FLAT extent */ - VmdkExtent *extent; ret = vmdk_add_extent(bs, extent_file, true, sectors, - 0, 0, 0, 0, sectors, &extent); + 0, 0, 0, 0, 0, &extent, errp); if (ret < 0) { return ret; } extent->flat_start_offset = flat_offset << 9; - } else if (!strcmp(type, "SPARSE")) { - /* SPARSE extent */ - ret = vmdk_open_sparse(bs, extent_file, bs->open_flags); + } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) { + /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/ + char *buf = vmdk_read_desc(extent_file, 0, errp); + if (!buf) { + ret = -EINVAL; + } else { + ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf, errp); + } if (ret) { - bdrv_delete(extent_file); + g_free(buf); + bdrv_unref(extent_file); return ret; } + extent = &s->extents[s->num_extents - 1]; } else { - fprintf(stderr, - "VMDK: Not supported extent type \"%s\""".\n", type); + error_setg(errp, "Unsupported extent type '%s'", type); return -ENOTSUP; } + extent->type = g_strdup(type); next_line: /* move to next line */ - while (*p && *p != '\n') { + while (*p) { + if (*p == '\n') { + p++; + break; + } p++; } - p++; } return 0; } -static int vmdk_open_desc_file(BlockDriverState *bs, int flags, - uint64_t desc_offset) +static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf, + Error **errp) { int ret; - char *buf = NULL; char ct[128]; BDRVVmdkState *s = bs->opaque; - int64_t size; - - size = bdrv_getlength(bs->file); - if (size < 0) { - return -EINVAL; - } - size = MIN(size, 1 << 20); /* avoid unbounded allocation */ - buf = g_malloc0(size + 1); - - ret = bdrv_pread(bs->file, desc_offset, buf, size); - if (ret < 0) { - goto exit; - } if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) { - ret = -EMEDIUMTYPE; + error_setg(errp, "invalid VMDK image descriptor"); + ret = -EINVAL; goto exit; } if (strcmp(ct, "monolithicFlat") && + strcmp(ct, "vmfs") && + strcmp(ct, "vmfsSparse") && strcmp(ct, "twoGbMaxExtentSparse") && strcmp(ct, "twoGbMaxExtentFlat")) { - fprintf(stderr, - "VMDK: Not supported image type \"%s\""".\n", ct); + error_setg(errp, "Unsupported image type '%s'", ct); ret = -ENOTSUP; goto exit; } + s->create_type = g_strdup(ct); s->desc_offset = 0; - ret = vmdk_parse_extents(buf, bs, bs->file->filename); + ret = vmdk_parse_extents(buf, bs, bs->file->filename, errp); exit: - g_free(buf); return ret; } -static int vmdk_open(BlockDriverState *bs, QDict *options, int flags) +static int vmdk_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { + char *buf = NULL; int ret; BDRVVmdkState *s = bs->opaque; + uint32_t magic; - if (vmdk_open_sparse(bs, bs->file, flags) == 0) { - s->desc_offset = 0x200; - } else { - ret = vmdk_open_desc_file(bs, flags, 0); - if (ret) { - goto fail; - } + buf = vmdk_read_desc(bs->file, 0, errp); + if (!buf) { + return -EINVAL; + } + + magic = ldl_be_p(buf); + switch (magic) { + case VMDK3_MAGIC: + case VMDK4_MAGIC: + ret = vmdk_open_sparse(bs, bs->file, flags, buf, errp); + s->desc_offset = 0x200; + break; + default: + ret = vmdk_open_desc_file(bs, flags, buf, errp); + break; } + if (ret) { + goto fail; + } + /* try to open parent images, if exist */ ret = vmdk_parent_open(bs); if (ret) { goto fail; } + s->cid = vmdk_read_cid(bs, 0); s->parent_cid = vmdk_read_cid(bs, 1); qemu_co_mutex_init(&s->lock); @@ -829,14 +925,34 @@ 
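vmdk_open now reads the descriptor buffer once and dispatches on its first four bytes, and ldl_be_p() is simply an alignment- and endian-safe big-endian 32-bit load. A sketch of the dispatch, with the magic values written out as the big-endian loads of their on-disk signatures ("COWD" for the ESX/VMDK3 sparse header, "KDMV" for VMDK4):

    #include <stdint.h>

    #define SK_VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
    #define SK_VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')

    /* What ldl_be_p() amounts to: load 4 bytes big-endian, regardless of
     * host byte order or buffer alignment. */
    static uint32_t load_be32(const void *p)
    {
        const uint8_t *b = p;
        return ((uint32_t)b[0] << 24) | ((uint32_t)b[1] << 16) |
               ((uint32_t)b[2] << 8) | (uint32_t)b[3];
    }

    static const char *classify(const uint8_t *buf)
    {
        switch (load_be32(buf)) {
        case SK_VMDK3_MAGIC: return "COWD sparse extent";
        case SK_VMDK4_MAGIC: return "VMDK4 sparse extent";
        default:             return "descriptor file (or not VMDK at all)";
        }
    }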
static int vmdk_open(BlockDriverState *bs, QDict *options, int flags) QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, "vmdk", bs->device_name, "live migration"); migrate_add_blocker(s->migration_blocker); - + g_free(buf); return 0; fail: + g_free(buf); + g_free(s->create_type); + s->create_type = NULL; vmdk_free_extents(bs); return ret; } + +static int vmdk_refresh_limits(BlockDriverState *bs) +{ + BDRVVmdkState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_extents; i++) { + if (!s->extents[i].flat) { + bs->bl.write_zeroes_alignment = + MAX(bs->bl.write_zeroes_alignment, + s->extents[i].cluster_sectors); + } + } + + return 0; +} + static int get_whole_cluster(BlockDriverState *bs, VmdkExtent *extent, uint64_t cluster_offset, @@ -1039,7 +1155,7 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { BDRVVmdkState *s = bs->opaque; @@ -1056,7 +1172,24 @@ static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs, sector_num * 512, 0, &offset); qemu_co_mutex_unlock(&s->lock); - ret = (ret == VMDK_OK || ret == VMDK_ZEROED); + switch (ret) { + case VMDK_ERROR: + ret = -EIO; + break; + case VMDK_UNALLOC: + ret = 0; + break; + case VMDK_ZEROED: + ret = BDRV_BLOCK_ZERO; + break; + case VMDK_OK: + ret = BDRV_BLOCK_DATA; + if (extent->file == bs->file && !extent->compressed) { + ret |= BDRV_BLOCK_OFFSET_VALID | offset; + } + + break; + } index_in_cluster = sector_num % extent->cluster_sectors; n = extent->cluster_sectors - index_in_cluster; @@ -1254,15 +1387,14 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, { BDRVVmdkState *s = bs->opaque; VmdkExtent *extent = NULL; - int n, ret; - int64_t index_in_cluster; + int ret; + int64_t index_in_cluster, n; uint64_t extent_begin_sector, extent_relative_sector_num; uint64_t cluster_offset; VmdkMetaData m_data; if (sector_num > bs->total_sectors) { - fprintf(stderr, - "(VMDK) Wrong offset: sector_num=0x%" PRIx64 + error_report("Wrong offset: sector_num=0x%" PRIx64 " total_sectors=0x%" PRIx64 "\n", sector_num, bs->total_sectors); return -EIO; @@ -1282,9 +1414,8 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ - fprintf(stderr, - "VMDK: can't write to allocated cluster" - " for streamOptimized\n"); + error_report("Could not write to allocated cluster" + " for streamOptimized"); return -EIO; } else { /* allocate */ @@ -1366,7 +1497,8 @@ static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num, static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) + int nb_sectors, + BdrvRequestFlags flags) { int ret; BDRVVmdkState *s = bs->opaque; @@ -1381,25 +1513,36 @@ static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, return ret; } - static int vmdk_create_extent(const char *filename, int64_t filesize, - bool flat, bool compress, bool zeroed_grain) + bool flat, bool compress, bool zeroed_grain, + Error **errp) { int ret, i; - int fd = 0; + BlockDriverState *bs = NULL; VMDK4Header header; - uint32_t tmp, magic, grains, gd_size, gt_size, gt_count; + Error *local_err; + uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count; + uint32_t *gd_buf = NULL; + int gd_buf_size; + + ret = bdrv_create_file(filename, NULL, &local_err); + if (ret < 0) 
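The switch above maps the driver's internal states onto the bdrv_co_get_block_status() return value, which packs flag bits into the low bits of an int64_t and, when the offset-valid flag is set, the 512-byte-aligned host offset into the rest; that is why `ret |= BDRV_BLOCK_OFFSET_VALID | offset` is safe only for sector-aligned offsets. The exact flag values belong to QEMU's block.h; the ones below are illustrative, but the packing trick is the point:

    #include <assert.h>
    #include <stdint.h>

    enum {
        SK_BLOCK_DATA         = 0x01,   /* illustrative values */
        SK_BLOCK_ZERO         = 0x02,
        SK_BLOCK_OFFSET_VALID = 0x04,
    };

    /* Flags fit below bit 9 because a valid host offset is always a
     * multiple of the 512-byte sector size. */
    static int64_t pack_status(int64_t host_offset, int flags)
    {
        assert((host_offset & 511) == 0);
        return host_offset | flags;
    }

    static int64_t status_offset(int64_t status)
    {
        return status & ~(int64_t)511;  /* strip the flag bits */
    }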
{ + error_propagate(errp, local_err); + goto exit; + } - fd = qemu_open(filename, - O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, - 0644); - if (fd < 0) { - return -errno; + assert(bs == NULL); + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + NULL, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; } + if (flat) { - ret = ftruncate(fd, filesize); + ret = bdrv_truncate(bs, filesize); if (ret < 0) { - ret = -errno; + error_setg_errno(errp, -ret, "Could not truncate file"); } goto exit; } @@ -1410,24 +1553,23 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0) | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0); header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0; - header.capacity = filesize / 512; + header.capacity = filesize / BDRV_SECTOR_SIZE; header.granularity = 128; - header.num_gtes_per_gt = 512; + header.num_gtes_per_gt = BDRV_SECTOR_SIZE; - grains = (filesize / 512 + header.granularity - 1) / header.granularity; - gt_size = ((header.num_gtes_per_gt * sizeof(uint32_t)) + 511) >> 9; - gt_count = - (grains + header.num_gtes_per_gt - 1) / header.num_gtes_per_gt; - gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9; + grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity); + gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t), + BDRV_SECTOR_SIZE); + gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt); + gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE); header.desc_offset = 1; header.desc_size = 20; header.rgd_offset = header.desc_offset + header.desc_size; - header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count); + header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count); header.grain_offset = - ((header.gd_offset + gd_size + (gt_size * gt_count) + - header.granularity - 1) / header.granularity) * - header.granularity; + ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count), + header.granularity); /* swap endianness for all header fields */ header.version = cpu_to_le32(header.version); header.flags = cpu_to_le32(header.flags); @@ -1447,58 +1589,65 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, header.check_bytes[3] = 0xa; /* write all the data */ - ret = qemu_write_full(fd, &magic, sizeof(magic)); - if (ret != sizeof(magic)) { - ret = -errno; + ret = bdrv_pwrite(bs, 0, &magic, sizeof(magic)); + if (ret < 0) { + error_set(errp, QERR_IO_ERROR); goto exit; } - ret = qemu_write_full(fd, &header, sizeof(header)); - if (ret != sizeof(header)) { - ret = -errno; + ret = bdrv_pwrite(bs, sizeof(magic), &header, sizeof(header)); + if (ret < 0) { + error_set(errp, QERR_IO_ERROR); goto exit; } - ret = ftruncate(fd, le64_to_cpu(header.grain_offset) << 9); + ret = bdrv_truncate(bs, le64_to_cpu(header.grain_offset) << 9); if (ret < 0) { - ret = -errno; + error_setg_errno(errp, -ret, "Could not truncate file"); goto exit; } /* write grain directory */ - lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET); - for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_size; + gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE; + gd_buf = g_malloc0(gd_buf_size); + for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors; i < gt_count; i++, tmp += gt_size) { - ret = qemu_write_full(fd, &tmp, sizeof(tmp)); - if (ret != sizeof(tmp)) { - ret = -errno; - goto exit; - } + gd_buf[i] = cpu_to_le32(tmp); + } + ret = bdrv_pwrite(bs, 
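The DIV_ROUND_UP/ROUND_UP rewrite above keeps the original layout arithmetic but makes it readable, and plugging in a 1 GiB image gives concrete numbers: 16384 grains, 32 grain tables of 4 sectors each, a 1-sector grain directory at sector 150, and the first data grain rounded up to sector 384. A throwaway check of that layout (macros inlined so it compiles stand-alone):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
    #define ROUND_UP(n, d)     (DIV_ROUND_UP(n, d) * (d))

    int main(void)
    {
        uint64_t filesize = 1ULL << 30;            /* 1 GiB example image */
        uint64_t granularity = 128;                /* sectors per grain */
        uint64_t num_gtes_per_gt = 512;

        uint64_t grains     = DIV_ROUND_UP(filesize / 512, granularity);
        uint64_t gt_size    = DIV_ROUND_UP(num_gtes_per_gt * 4, 512);
        uint64_t gt_count   = DIV_ROUND_UP(grains, num_gtes_per_gt);
        uint64_t gd_sectors = DIV_ROUND_UP(gt_count * 4, 512);

        uint64_t rgd_offset = 1 + 20;              /* desc_offset + desc_size */
        uint64_t gd_offset  = rgd_offset + gd_sectors + gt_size * gt_count;
        uint64_t grain_offset =
            ROUND_UP(gd_offset + gd_sectors + gt_size * gt_count, granularity);

        printf("grains=%" PRIu64 " gt_count=%" PRIu64
               " gd@%" PRIu64 " data@%" PRIu64 "\n",
               grains, gt_count, gd_offset, grain_offset);
        return 0;
    }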
le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, + gd_buf, gd_buf_size); + if (ret < 0) { + error_set(errp, QERR_IO_ERROR); + goto exit; } /* write backup grain directory */ - lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET); - for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_size; + for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors; i < gt_count; i++, tmp += gt_size) { - ret = qemu_write_full(fd, &tmp, sizeof(tmp)); - if (ret != sizeof(tmp)) { - ret = -errno; - goto exit; - } + gd_buf[i] = cpu_to_le32(tmp); + } + ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, + gd_buf, gd_buf_size); + if (ret < 0) { + error_set(errp, QERR_IO_ERROR); + goto exit; } ret = 0; - exit: - qemu_close(fd); +exit: + if (bs) { + bdrv_unref(bs); + } + g_free(gd_buf); return ret; } static int filename_decompose(const char *filename, char *path, char *prefix, - char *postfix, size_t buf_len) + char *postfix, size_t buf_len, Error **errp) { const char *p, *q; if (filename == NULL || !strlen(filename)) { - fprintf(stderr, "Vmdk: no filename provided.\n"); + error_setg(errp, "No filename provided"); return VMDK_ERROR; } p = strrchr(filename, '/'); @@ -1532,10 +1681,13 @@ static int filename_decompose(const char *filename, char *path, char *prefix, return VMDK_OK; } -static int vmdk_create(const char *filename, QEMUOptionParameter *options) +static int vmdk_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { - int fd, idx = 0; - char desc[BUF_SIZE]; + int idx = 0; + BlockDriverState *new_bs = NULL; + Error *local_err; + char *desc = NULL; int64_t total_size = 0, filesize; const char *adapter_type = NULL; const char *backing_file = NULL; @@ -1543,7 +1695,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) int flags = 0; int ret = 0; bool flat, split, compress; - char ext_desc_lines[BUF_SIZE] = ""; + GString *ext_desc_lines; char path[PATH_MAX], prefix[PATH_MAX], postfix[PATH_MAX]; const int64_t split_size = 0x80000000; /* VMDK has constant split size */ const char *desc_extent_line; @@ -1551,6 +1703,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) uint32_t parent_cid = 0xffffffff; uint32_t number_heads = 16; bool zeroed_grain = false; + uint32_t desc_offset = 0, desc_len; const char desc_template[] = "# Disk DescriptorFile\n" "version=1\n" @@ -1571,8 +1724,11 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) "ddb.geometry.sectors = \"63\"\n" "ddb.adapterType = \"%s\"\n"; - if (filename_decompose(filename, path, prefix, postfix, PATH_MAX)) { - return -EINVAL; + ext_desc_lines = g_string_new(NULL); + + if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) { + ret = -EINVAL; + goto exit; } /* Read out options */ while (options && options->name) { @@ -1597,8 +1753,9 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) strcmp(adapter_type, "buslogic") && strcmp(adapter_type, "lsilogic") && strcmp(adapter_type, "legacyESX")) { - fprintf(stderr, "VMDK: Unknown adapter type: '%s'.\n", adapter_type); - return -EINVAL; + error_setg(errp, "Unknown adapter type: '%s'", adapter_type); + ret = -EINVAL; + goto exit; } if (strcmp(adapter_type, "ide") != 0) { /* that's the number of heads with which vmware operates when @@ -1613,8 +1770,9 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) strcmp(fmt, "twoGbMaxExtentSparse") && strcmp(fmt, "twoGbMaxExtentFlat") && strcmp(fmt, "streamOptimized")) { - 
fprintf(stderr, "VMDK: Unknown subformat: %s\n", fmt); - return -EINVAL; + error_setg(errp, "Unknown subformat: '%s'", fmt); + ret = -EINVAL; + goto exit; } split = !(strcmp(fmt, "twoGbMaxExtentFlat") && strcmp(fmt, "twoGbMaxExtentSparse")); @@ -1627,22 +1785,29 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) desc_extent_line = "RW %lld SPARSE \"%s\"\n"; } if (flat && backing_file) { - /* not supporting backing file for flat image */ - return -ENOTSUP; + error_setg(errp, "Flat image can't have backing file"); + ret = -ENOTSUP; + goto exit; + } + if (flat && zeroed_grain) { + error_setg(errp, "Flat image can't enable zeroed grain"); + ret = -ENOTSUP; + goto exit; } if (backing_file) { - BlockDriverState *bs = bdrv_new(""); - ret = bdrv_open(bs, backing_file, NULL, 0, NULL); + BlockDriverState *bs = NULL; + ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_NO_BACKING, NULL, + errp); if (ret != 0) { - bdrv_delete(bs); - return ret; + goto exit; } if (strcmp(bs->drv->format_name, "vmdk")) { - bdrv_delete(bs); - return -EINVAL; + bdrv_unref(bs); + ret = -EINVAL; + goto exit; } parent_cid = vmdk_read_cid(bs, 0); - bdrv_delete(bs); + bdrv_unref(bs); snprintf(parent_desc_line, sizeof(parent_desc_line), "parentFileNameHint=\"%s\"", backing_file); } @@ -1672,51 +1837,66 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) path, desc_filename); if (vmdk_create_extent(ext_filename, size, - flat, compress, zeroed_grain)) { - return -EINVAL; + flat, compress, zeroed_grain, errp)) { + ret = -EINVAL; + goto exit; } filesize -= size; /* Format description line */ snprintf(desc_line, sizeof(desc_line), - desc_extent_line, size / 512, desc_filename); - pstrcat(ext_desc_lines, sizeof(ext_desc_lines), desc_line); + desc_extent_line, size / BDRV_SECTOR_SIZE, desc_filename); + g_string_append(ext_desc_lines, desc_line); } /* generate descriptor file */ - snprintf(desc, sizeof(desc), desc_template, - (unsigned int)time(NULL), - parent_cid, - fmt, - parent_desc_line, - ext_desc_lines, - (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4), - total_size / (int64_t)(63 * number_heads * 512), number_heads, - adapter_type); - if (split || flat) { - fd = qemu_open(filename, - O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, - 0644); + desc = g_strdup_printf(desc_template, + (unsigned int)time(NULL), + parent_cid, + fmt, + parent_desc_line, + ext_desc_lines->str, + (flags & BLOCK_FLAG_COMPAT6 ? 
6 : 4), + total_size / + (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE), + number_heads, + adapter_type); + desc_len = strlen(desc); + /* the descriptor offset = 0x200 */ + if (!split && !flat) { + desc_offset = 0x200; } else { - fd = qemu_open(filename, - O_WRONLY | O_BINARY | O_LARGEFILE, - 0644); - } - if (fd < 0) { - return -errno; + ret = bdrv_create_file(filename, options, &local_err); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not create image file"); + goto exit; + } } - /* the descriptor offset = 0x200 */ - if (!split && !flat && 0x200 != lseek(fd, 0x200, SEEK_SET)) { - ret = -errno; + assert(new_bs == NULL); + ret = bdrv_open(&new_bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write description"); goto exit; } - ret = qemu_write_full(fd, desc, strlen(desc)); - if (ret != strlen(desc)) { - ret = -errno; + ret = bdrv_pwrite(new_bs, desc_offset, desc, desc_len); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write description"); goto exit; } - ret = 0; + /* bdrv_pwrite write padding zeros to align to sector, we don't need that + * for description file */ + if (desc_offset == 0) { + ret = bdrv_truncate(new_bs, desc_len); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not truncate file"); + } + } exit: - qemu_close(fd); + if (new_bs) { + bdrv_unref(new_bs); + } + g_free(desc); + g_string_free(ext_desc_lines, true); return ret; } @@ -1725,6 +1905,7 @@ static void vmdk_close(BlockDriverState *bs) BDRVVmdkState *s = bs->opaque; vmdk_free_extents(bs); + g_free(s->create_type); migrate_del_blocker(s->migration_blocker); error_free(s->migration_blocker); @@ -1786,6 +1967,101 @@ static int vmdk_has_zero_init(BlockDriverState *bs) return 1; } +static ImageInfo *vmdk_get_extent_info(VmdkExtent *extent) +{ + ImageInfo *info = g_new0(ImageInfo, 1); + + *info = (ImageInfo){ + .filename = g_strdup(extent->file->filename), + .format = g_strdup(extent->type), + .virtual_size = extent->sectors * BDRV_SECTOR_SIZE, + .compressed = extent->compressed, + .has_compressed = extent->compressed, + .cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE, + .has_cluster_size = !extent->flat, + }; + + return info; +} + +static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) +{ + BDRVVmdkState *s = bs->opaque; + VmdkExtent *extent = NULL; + int64_t sector_num = 0; + int64_t total_sectors = bdrv_getlength(bs) / BDRV_SECTOR_SIZE; + int ret; + uint64_t cluster_offset; + + if (fix) { + return -ENOTSUP; + } + + for (;;) { + if (sector_num >= total_sectors) { + return 0; + } + extent = find_extent(s, sector_num, extent); + if (!extent) { + fprintf(stderr, + "ERROR: could not find extent for sector %" PRId64 "\n", + sector_num); + break; + } + ret = get_cluster_offset(bs, extent, NULL, + sector_num << BDRV_SECTOR_BITS, + 0, &cluster_offset); + if (ret == VMDK_ERROR) { + fprintf(stderr, + "ERROR: could not get cluster_offset for sector %" + PRId64 "\n", sector_num); + break; + } + if (ret == VMDK_OK && cluster_offset >= bdrv_getlength(extent->file)) { + fprintf(stderr, + "ERROR: cluster offset for sector %" + PRId64 " points after EOF\n", sector_num); + break; + } + sector_num += extent->cluster_sectors; + } + + result->corruptions++; + return 0; +} + +static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs) +{ + int i; + BDRVVmdkState *s = bs->opaque; + ImageInfoSpecific *spec_info = g_new0(ImageInfoSpecific, 1); + ImageInfoList **next; + + 
*spec_info = (ImageInfoSpecific){ + .kind = IMAGE_INFO_SPECIFIC_KIND_VMDK, + { + .vmdk = g_new0(ImageInfoSpecificVmdk, 1), + }, + }; + + *spec_info->vmdk = (ImageInfoSpecificVmdk) { + .create_type = g_strdup(s->create_type), + .cid = s->cid, + .parent_cid = s->parent_cid, + }; + + next = &spec_info->vmdk->extents; + for (i = 0; i < s->num_extents; i++) { + *next = g_new0(ImageInfoList, 1); + (*next)->value = vmdk_get_extent_info(&s->extents[i]); + (*next)->next = NULL; + next = &(*next)->next; + } + + return spec_info; +} + static QEMUOptionParameter vmdk_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -1828,6 +2104,7 @@ static BlockDriver bdrv_vmdk = { .instance_size = sizeof(BDRVVmdkState), .bdrv_probe = vmdk_probe, .bdrv_open = vmdk_open, + .bdrv_check = vmdk_check, .bdrv_reopen_prepare = vmdk_reopen_prepare, .bdrv_read = vmdk_co_read, .bdrv_write = vmdk_co_write, @@ -1835,9 +2112,11 @@ static BlockDriver bdrv_vmdk = { .bdrv_close = vmdk_close, .bdrv_create = vmdk_create, .bdrv_co_flush_to_disk = vmdk_co_flush, - .bdrv_co_is_allocated = vmdk_co_is_allocated, + .bdrv_co_get_block_status = vmdk_co_get_block_status, .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size, .bdrv_has_zero_init = vmdk_has_zero_init, + .bdrv_get_specific_info = vmdk_get_specific_info, + .bdrv_refresh_limits = vmdk_refresh_limits, .create_options = vmdk_create_options, }; diff --git a/block/vpc.c b/block/vpc.c index fe4f311d5..2e25f5723 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -45,8 +45,10 @@ enum vhd_type { // Seconds since Jan 1, 2000 0:00:00 (UTC) #define VHD_TIMESTAMP_BASE 946684800 +#define VHD_MAX_SECTORS (65535LL * 255 * 255) + // always big-endian -struct vhd_footer { +typedef struct vhd_footer { char creator[8]; // "conectix" uint32_t features; uint32_t version; @@ -79,9 +81,9 @@ struct vhd_footer { uint8_t uuid[16]; uint8_t in_saved_state; -}; +} QEMU_PACKED VHDFooter; -struct vhd_dyndisk_header { +typedef struct vhd_dyndisk_header { char magic[8]; // "cxsparse" // Offset of next header structure, 0xFFFFFFFF if none @@ -111,7 +113,7 @@ struct vhd_dyndisk_header { uint32_t reserved; uint64_t data_offset; } parent_locator[8]; -}; +} QEMU_PACKED VHDDynDiskHeader; typedef struct BDRVVPCState { CoMutex lock; @@ -155,14 +157,16 @@ static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int vpc_open(BlockDriverState *bs, QDict *options, int flags) +static int vpc_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVVPCState *s = bs->opaque; int i; - struct vhd_footer* footer; - struct vhd_dyndisk_header* dyndisk_header; + VHDFooter *footer; + VHDDynDiskHeader *dyndisk_header; uint8_t buf[HEADER_SIZE]; uint32_t checksum; + uint64_t computed_size; int disk_type = VHD_DYNAMIC; int ret; @@ -171,7 +175,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } - footer = (struct vhd_footer*) s->footer_buf; + footer = (VHDFooter *) s->footer_buf; if (strncmp(footer->creator, "conectix", 8)) { int64_t offset = bdrv_getlength(bs->file); if (offset < 0) { @@ -189,7 +193,8 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } if (strncmp(footer->creator, "conectix", 8)) { - ret = -EMEDIUMTYPE; + error_setg(errp, "invalid VPC image"); + ret = -EINVAL; goto fail; } disk_type = VHD_FIXED; @@ -210,8 +215,17 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) bs->total_sectors = (int64_t) be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl; + /* images 
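The extents loop just above uses the pointer-to-pointer idiom for building a singly linked list in order: `next` always names the link field to patch, so the empty-list case needs no special handling and no reversal pass is required afterwards. Reduced to its essentials:

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct Node {
        int value;
        struct Node *next;
    } Node;

    int main(void)
    {
        Node *head = NULL;
        Node **tail = &head;        /* points at the link to patch next */

        for (int i = 0; i < 3; i++) {
            *tail = calloc(1, sizeof(Node));
            (*tail)->value = i;
            tail = &(*tail)->next;  /* advance to the new tail link */
        }
        for (Node *n = head; n; n = n->next) {
            printf("%d\n", n->value);   /* 0 1 2, in insertion order */
        }
        return 0;                   /* cleanup omitted in this sketch */
    }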
created with disk2vhd report a far higher virtual size + * than expected with the cyls * heads * sectors_per_cyl formula. + * use the footer->size instead if the image was created with + * disk2vhd. + */ + if (!strncmp(footer->creator_app, "d2v", 4)) { + bs->total_sectors = be64_to_cpu(footer->size) / BDRV_SECTOR_SIZE; + } + /* Allow a maximum disk size of approximately 2 TB */ - if (bs->total_sectors >= 65535LL * 255 * 255) { + if (bs->total_sectors >= VHD_MAX_SECTORS) { ret = -EFBIG; goto fail; } @@ -223,7 +237,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } - dyndisk_header = (struct vhd_dyndisk_header *) buf; + dyndisk_header = (VHDDynDiskHeader *) buf; if (strncmp(dyndisk_header->magic, "cxsparse", 8)) { ret = -EINVAL; @@ -231,10 +245,31 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) } s->block_size = be32_to_cpu(dyndisk_header->block_size); + if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) { + error_setg(errp, "Invalid block size %" PRIu32, s->block_size); + ret = -EINVAL; + goto fail; + } s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511; s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries); - s->pagetable = g_malloc(s->max_table_entries * 4); + + if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) { + ret = -EINVAL; + goto fail; + } + if (s->max_table_entries > (VHD_MAX_SECTORS * 512) / s->block_size) { + ret = -EINVAL; + goto fail; + } + + computed_size = (uint64_t) s->max_table_entries * s->block_size; + if (computed_size < bs->total_sectors * 512) { + ret = -EINVAL; + goto fail; + } + + s->pagetable = qemu_blockalign(bs, s->max_table_entries * 4); s->bat_offset = be64_to_cpu(dyndisk_header->table_offset); @@ -259,6 +294,13 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) } } + if (s->free_data_block_offset > bdrv_getlength(bs->file)) { + error_setg(errp, "block-vpc: free_data_block_offset points after " + "the end of file. 
The image has been truncated."); + ret = -EINVAL; + goto fail; + } + s->last_bitmap_offset = (int64_t) -1; #ifdef CACHE @@ -280,7 +322,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) return 0; fail: - g_free(s->pagetable); + qemu_vfree(s->pagetable); #ifdef CACHE g_free(s->pageentry_u8); #endif @@ -438,6 +480,19 @@ fail: return -1; } +static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVVPCState *s = (BDRVVPCState *)bs->opaque; + VHDFooter *footer = (VHDFooter *) s->footer_buf; + + if (cpu_to_be32(footer->type) != VHD_FIXED) { + bdi->cluster_size = s->block_size; + } + + bdi->unallocated_blocks_are_zero = true; + return 0; +} + static int vpc_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors) { @@ -445,7 +500,7 @@ static int vpc_read(BlockDriverState *bs, int64_t sector_num, int ret; int64_t offset; int64_t sectors, sectors_per_block; - struct vhd_footer *footer = (struct vhd_footer *) s->footer_buf; + VHDFooter *footer = (VHDFooter *) s->footer_buf; if (cpu_to_be32(footer->type) == VHD_FIXED) { return bdrv_read(bs->file, sector_num, buf, nb_sectors); @@ -494,7 +549,7 @@ static int vpc_write(BlockDriverState *bs, int64_t sector_num, int64_t offset; int64_t sectors, sectors_per_block; int ret; - struct vhd_footer *footer = (struct vhd_footer *) s->footer_buf; + VHDFooter *footer = (VHDFooter *) s->footer_buf; if (cpu_to_be32(footer->type) == VHD_FIXED) { return bdrv_write(bs->file, sector_num, buf, nb_sectors); @@ -596,8 +651,8 @@ static int calculate_geometry(int64_t total_sectors, uint16_t* cyls, static int create_dynamic_disk(int fd, uint8_t *buf, int64_t total_sectors) { - struct vhd_dyndisk_header* dyndisk_header = - (struct vhd_dyndisk_header*) buf; + VHDDynDiskHeader *dyndisk_header = + (VHDDynDiskHeader *) buf; size_t block_size, num_bat_entries; int i; int ret = -EIO; @@ -683,10 +738,11 @@ static int create_fixed_disk(int fd, uint8_t *buf, int64_t total_size) return ret; } -static int vpc_create(const char *filename, QEMUOptionParameter *options) +static int vpc_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { uint8_t buf[1024]; - struct vhd_footer *footer = (struct vhd_footer *) buf; + VHDFooter *footer = (VHDFooter *) buf; QEMUOptionParameter *disk_type_param; int fd, i; uint16_t cyls = 0; @@ -789,7 +845,7 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options) static int vpc_has_zero_init(BlockDriverState *bs) { BDRVVPCState *s = bs->opaque; - struct vhd_footer *footer = (struct vhd_footer *) s->footer_buf; + VHDFooter *footer = (VHDFooter *) s->footer_buf; if (cpu_to_be32(footer->type) == VHD_FIXED) { return bdrv_has_zero_init(bs->file); @@ -801,7 +857,7 @@ static int vpc_has_zero_init(BlockDriverState *bs) static void vpc_close(BlockDriverState *bs) { BDRVVPCState *s = bs->opaque; - g_free(s->pagetable); + qemu_vfree(s->pagetable); #ifdef CACHE g_free(s->pageentry_u8); #endif @@ -839,6 +895,8 @@ static BlockDriver bdrv_vpc = { .bdrv_read = vpc_co_read, .bdrv_write = vpc_co_write, + .bdrv_get_info = vpc_get_info, + .create_options = vpc_create_options, .bdrv_has_zero_init = vpc_has_zero_init, }; diff --git a/block/vvfat.c b/block/vvfat.c index cd3b8edd9..1978c9ed6 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -266,8 +266,7 @@ typedef struct mbr_t { } QEMU_PACKED mbr_t; typedef struct direntry_t { - uint8_t name[8]; - uint8_t extension[3]; + uint8_t name[8 + 3]; uint8_t attributes; uint8_t reserved[2]; uint16_t ctime; @@ -518,11 +517,9 @@ static inline 
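The vpc.c bounds above are all small arithmetic facts: the CHS footer fields cap a VHD at 65535 x 255 x 255 sectors (the "approximately 2 TB" in the comment), and each dynamic-disk block carries a sector bitmap padded to a 512-byte boundary, which for a 2 MiB block works out to exactly one sector. A quick verification (2 MiB is the common default block size, an assumption here rather than something this hunk states):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t max_sectors = 65535ULL * 255 * 255;   /* VHD_MAX_SECTORS */
        printf("cap: %" PRIu64 " sectors = %.2f TiB\n",
               max_sectors, max_sectors * 512.0 / (1ULL << 40));

        uint32_t block_size = 2 * 1024 * 1024;         /* assumed default */
        uint32_t bitmap_size = ((block_size / (8 * 512)) + 511) & ~511u;
        printf("bitmap per block: %" PRIu32 " bytes\n", bitmap_size);
        return 0;
    }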
uint8_t fat_chksum(const direntry_t* entry) uint8_t chksum=0; int i; - for(i=0;i<11;i++) { - unsigned char c; - - c = (i < 8) ? entry->name[i] : entry->extension[i-8]; - chksum=(((chksum&0xfe)>>1)|((chksum&0x01)?0x80:0)) + c; + for (i = 0; i < ARRAY_SIZE(entry->name); i++) { + chksum = (((chksum & 0xfe) >> 1) | + ((chksum & 0x01) ? 0x80 : 0)) + entry->name[i]; } return chksum; @@ -617,7 +614,7 @@ static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s, if(is_dot) { entry=array_get_next(&(s->directory)); - memset(entry->name,0x20,11); + memset(entry->name, 0x20, sizeof(entry->name)); memcpy(entry->name,filename,strlen(filename)); return entry; } @@ -632,12 +629,14 @@ static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s, i = 8; entry=array_get_next(&(s->directory)); - memset(entry->name,0x20,11); + memset(entry->name, 0x20, sizeof(entry->name)); memcpy(entry->name, filename, i); - if(j > 0) - for (i = 0; i < 3 && filename[j+1+i]; i++) - entry->extension[i] = filename[j+1+i]; + if (j > 0) { + for (i = 0; i < 3 && filename[j + 1 + i]; i++) { + entry->name[8 + i] = filename[j + 1 + i]; + } + } /* upcase & remove unwanted characters */ for(i=10;i>=0;i--) { @@ -861,8 +860,7 @@ static int init_directories(BDRVVVFATState* s, { direntry_t* entry=array_get_next(&(s->directory)); entry->attributes=0x28; /* archive | volume label */ - memcpy(entry->name,"QEMU VVF",8); - memcpy(entry->extension,"AT ",3); + memcpy(entry->name, "QEMU VVFAT ", sizeof(entry->name)); } /* Now build FAT, and write back information into directory */ @@ -1065,7 +1063,8 @@ static void vvfat_parse_filename(const char *filename, QDict *options, qdict_put(options, "rw", qbool_from_int(rw)); } -static int vvfat_open(BlockDriverState *bs, QDict *options, int flags) +static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVVVFATState *s = bs->opaque; int cyls, heads, secs; @@ -1084,19 +1083,17 @@ DLOG(if (stderr == NULL) { setbuf(stderr, NULL); }) - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } dirname = qemu_opt_get(opts, "dir"); if (!dirname) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, "vvfat block driver requires " - "a 'dir' option"); + error_setg(errp, "vvfat block driver requires a 'dir' option"); ret = -EINVAL; goto fail; } @@ -1122,6 +1119,7 @@ DLOG(if (stderr == NULL) { if (!s->fat_type) { s->fat_type = 16; } + s->first_sectors_number = 0x40; cyls = s->fat_type == 12 ? 
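Merging name[8] and extension[3] into a single name[8 + 3] field is what lets the checksum loop above run over ARRAY_SIZE(entry->name); the algorithm itself is the standard VFAT short-name checksum, a rotate-right-by-one followed by adding the next of the 11 bytes. Stand-alone:

    #include <stdint.h>
    #include <stdio.h>

    static uint8_t fat_chksum_sketch(const uint8_t name[11])
    {
        uint8_t sum = 0;

        for (int i = 0; i < 11; i++) {
            /* truncation to uint8_t turns this into an 8-bit rotate */
            sum = ((sum >> 1) | (sum << 7)) + name[i];
        }
        return sum;
    }

    int main(void)
    {
        /* 8.3 form of "README.TXT": name padded to 8, extension to 3 */
        const uint8_t name[11 + 1] = "README  TXT";

        printf("checksum: 0x%02x\n", fat_chksum_sketch(name));
        return 0;
    }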
64 : 1024; heads = 16; secs = 63; @@ -1136,8 +1134,7 @@ DLOG(if (stderr == NULL) { case 12: break; default: - qerror_report(ERROR_CLASS_GENERIC_ERROR, "Valid FAT types are only " - "12, 16 and 32"); + error_setg(errp, "Valid FAT types are only 12, 16 and 32"); ret = -EINVAL; goto fail; } @@ -1150,7 +1147,6 @@ DLOG(if (stderr == NULL) { s->current_cluster=0xffffffff; - s->first_sectors_number=0x40; /* read only is the default for safety */ bs->read_only = 1; s->qcow = s->write_target = NULL; @@ -1590,17 +1586,20 @@ static int parse_short_name(BDRVVVFATState* s, lfn->name[i] = direntry->name[i]; } - for (j = 2; j >= 0 && direntry->extension[j] == ' '; j--); + for (j = 2; j >= 0 && direntry->name[8 + j] == ' '; j--) { + } if (j >= 0) { lfn->name[i++] = '.'; lfn->name[i + j + 1] = '\0'; for (;j >= 0; j--) { - if (direntry->extension[j] <= ' ' || direntry->extension[j] > 0x7f) - return -2; - else if (s->downcase_short_names) - lfn->name[i + j] = qemu_tolower(direntry->extension[j]); - else - lfn->name[i + j] = direntry->extension[j]; + uint8_t c = direntry->name[8 + j]; + if (c <= ' ' || c > 0x7f) { + return -2; + } else if (s->downcase_short_names) { + lfn->name[i + j] = qemu_tolower(c); + } else { + lfn->name[i + j] = c; + } } } else lfn->name[i + j + 1] = '\0'; @@ -2874,16 +2873,17 @@ static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num, return ret; } -static int coroutine_fn vvfat_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn vvfat_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int* n) { BDRVVVFATState* s = bs->opaque; *n = s->sector_count - sector_num; - if (*n > nb_sectors) - *n = nb_sectors; - else if (*n < 0) - return 0; - return 1; + if (*n > nb_sectors) { + *n = nb_sectors; + } else if (*n < 0) { + return 0; + } + return BDRV_BLOCK_DATA; } static int write_target_commit(BlockDriverState *bs, int64_t sector_num, @@ -2894,7 +2894,7 @@ static int write_target_commit(BlockDriverState *bs, int64_t sector_num, static void write_target_close(BlockDriverState *bs) { BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); - bdrv_delete(s->qcow); + bdrv_unref(s->qcow); g_free(s->qcow_filename); } @@ -2908,6 +2908,7 @@ static int enable_write_target(BDRVVVFATState *s) { BlockDriver *bdrv_qcow; QEMUOptionParameter *options; + Error *local_err = NULL; int ret; int size = sector2cluster(s, s->sector_count); s->used_clusters = calloc(size, 1); @@ -2925,17 +2926,20 @@ static int enable_write_target(BDRVVVFATState *s) set_option_parameter_int(options, BLOCK_OPT_SIZE, s->sector_count * 512); set_option_parameter(options, BLOCK_OPT_BACKING_FILE, "fat:"); - ret = bdrv_create(bdrv_qcow, s->qcow_filename, options); + ret = bdrv_create(bdrv_qcow, s->qcow_filename, options, &local_err); if (ret < 0) { + qerror_report_err(local_err); + error_free(local_err); goto err; } - s->qcow = bdrv_new(""); - - ret = bdrv_open(s->qcow, s->qcow_filename, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow); + s->qcow = NULL; + ret = bdrv_open(&s->qcow, s->qcow_filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow, + &local_err); if (ret < 0) { - bdrv_delete(s->qcow); + qerror_report_err(local_err); + error_free(local_err); goto err; } @@ -2943,7 +2947,7 @@ static int enable_write_target(BDRVVVFATState *s) unlink(s->qcow_filename); #endif - s->bs->backing_hd = calloc(sizeof(BlockDriverState), 1); + s->bs->backing_hd = bdrv_new(""); s->bs->backing_hd->drv = &vvfat_write_target; s->bs->backing_hd->opaque = 
g_malloc(sizeof(void*)); *(void**)s->bs->backing_hd->opaque = s; @@ -2984,7 +2988,7 @@ static BlockDriver bdrv_vvfat = { .bdrv_read = vvfat_co_read, .bdrv_write = vvfat_co_write, - .bdrv_co_is_allocated = vvfat_co_is_allocated, + .bdrv_co_get_block_status = vvfat_co_get_block_status, }; static void bdrv_vvfat_init(void) diff --git a/block/win32-aio.c b/block/win32-aio.c index fcb7c754d..5d1d199b6 100644 --- a/block/win32-aio.c +++ b/block/win32-aio.c @@ -105,13 +105,6 @@ static void win32_aio_completion_cb(EventNotifier *e) } } -static int win32_aio_flush_cb(EventNotifier *e) -{ - QEMUWin32AIOState *s = container_of(e, QEMUWin32AIOState, e); - - return (s->count > 0) ? 1 : 0; -} - static void win32_aio_cancel(BlockDriverAIOCB *blockacb) { QEMUWin32AIOCB *waiocb = (QEMUWin32AIOCB *)blockacb; @@ -201,8 +194,7 @@ QEMUWin32AIOState *win32_aio_init(void) goto out_close_efd; } - qemu_aio_set_event_notifier(&s->e, win32_aio_completion_cb, - win32_aio_flush_cb); + qemu_aio_set_event_notifier(&s->e, win32_aio_completion_cb); return s;
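The deleted flush callback illustrates the pattern the remaining completion callback still relies on: the EventNotifier is embedded in the AIO state, and container_of() walks back from the member pointer to the enclosing structure. In generic terms (QEMU defines its own container_of; the expansion below is the textbook one):

    #include <stddef.h>
    #include <stdio.h>

    #define container_of_sketch(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    typedef struct Notifier { int dummy; } Notifier;

    typedef struct AioState {
        int count;       /* in-flight requests, as in QEMUWin32AIOState */
        Notifier e;      /* embedded notifier handed to the event loop */
    } AioState;

    /* The callback receives only &state->e; recover the state from it. */
    static int pending(Notifier *n)
    {
        AioState *s = container_of_sketch(n, AioState, e);
        return s->count > 0;
    }

    int main(void)
    {
        AioState st = { .count = 2 };
        printf("%d\n", pending(&st.e));   /* prints 1 */
        return 0;
    }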